diff --git a/sft_pretrain/Full_smoe_share/added_tokens.json b/sft_pretrain/Full_smoe_share/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_share/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/added_tokens.json b/sft_pretrain/Full_smoe_share/checkpoint-1040/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/config.json b/sft_pretrain/Full_smoe_share/checkpoint-1040/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bb9b0c4407eef6bd7d8c22453f95c43fd6ef0981 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_share", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/generation_config.json b/sft_pretrain/Full_smoe_share/checkpoint-1040/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ee7e6577f42f62f80d11074f5a55933840cacd1 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4caede9ae5733c32ecd24135e781992cfcb868fb447f14920739cf350fb05147 +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..988243ebc894646d630103ab18d673acc0dccce4 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e84304984523fa01a179fe1c55db88283a373701b062011c86cc4f7e282e7225 +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..828c2a7dccb3cd5e11130d37f9cdbf1ed4a3169f --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cdfa0254f318f437fbb3c7a5550d36658c5004502be0e474b562acb5c0e321b +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3292b5bf6456b16930f15bd2952547294d01f34c --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:758969d1aca7709b6a9a1291e0f135c2ea2b2eb99f583fc17b50c27d301fdd8b +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ae4093fc2eecc3f338358334b179aba19228e8a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59b56214bfda3eea6cae756c9a5b2d634dde1241d5d74aa58e5d5eba03771090 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..72986bfd5f3445dd4b82d128644c1866dd2d2036 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5a6c57ca3e62ad32d871a090476e7c71186005d82772dcff3eaf268757ecd72 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..19e13b9c89ca5673cc9e2a20d57f3d8709838265 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71d7adc0bd3a72af0f6d694658be8a354d35702d19ae7800d02055fc810e067a +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f953af8e2a4c854bc694023b8eae903d1d24095 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/global_step1040/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ec532c783e35b553a9381599a075e232975372e107376e74f0ad1c882c68006 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/latest b/sft_pretrain/Full_smoe_share/checkpoint-1040/latest new file mode 100644 index 0000000000000000000000000000000000000000..f37da78e3c7eee26ebe5f06b54d6621716edb6b9 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/latest @@ -0,0 +1 @@ +global_step1040 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_share/checkpoint-1040/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_share/checkpoint-1040/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c7ce05e35e85a0e091f631d38d49bef6b863e512 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34dba22b69a56aa97c7de1723fd45754c18f9e12b4af1883d23298c24103d157 +size 3759020544 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/model.safetensors.index.json b/sft_pretrain/Full_smoe_share/checkpoint-1040/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..90a56cf0153ed9062ff35ee2b372f7ab9e796c6a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731420128 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/rng_state_0.pth b/sft_pretrain/Full_smoe_share/checkpoint-1040/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/rng_state_1.pth b/sft_pretrain/Full_smoe_share/checkpoint-1040/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/rng_state_2.pth b/sft_pretrain/Full_smoe_share/checkpoint-1040/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/rng_state_3.pth b/sft_pretrain/Full_smoe_share/checkpoint-1040/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/special_tokens_map.json b/sft_pretrain/Full_smoe_share/checkpoint-1040/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/tokenizer.model b/sft_pretrain/Full_smoe_share/checkpoint-1040/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/tokenizer_config.json b/sft_pretrain/Full_smoe_share/checkpoint-1040/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/trainer_state.json b/sft_pretrain/Full_smoe_share/checkpoint-1040/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..403cbe891b93ea0330c9e6ebbf0233f619506d87 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/trainer_state.json @@ -0,0 +1,15633 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2000769526741054, + "eval_steps": 500, + "global_step": 1040, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03958175, + "balance_loss_mlp": 3.00755191, + "epoch": 0.00019238168526356292, + "flos": 470464353792.0, + "grad_norm": 28.914608756113072, + "language_loss": 3.87018156, + "learning_rate": 0.0, + "loss": 2.58113432, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 9.5, + "step": 1, + "time_per_iteration": 23.802019834518433 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01915335, + "balance_loss_mlp": 1.25005209, + "epoch": 0.00038476337052712584, + "flos": 504311436288.0, + "grad_norm": 4.8593923560988435, + "language_loss": 2.35405588, + "learning_rate": 0.00013726078121135892, + "loss": 2.37320924, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.65625, + "step": 2, + "time_per_iteration": 2.8532886505126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01920846, + "balance_loss_mlp": 1.25708926, + "epoch": 0.0005771450557906887, + "flos": 598869282816.0, + "grad_norm": 3.0028031994213777, + "language_loss": 1.96315837, + "learning_rate": 0.00021755319103969496, + "loss": 1.9823668, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.640625, + "step": 3, + "time_per_iteration": 2.841437339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01900548, + "balance_loss_mlp": 1.26196778, + "epoch": 0.0007695267410542517, + "flos": 580133491200.0, + "grad_norm": 1.731178632358193, + "language_loss": 1.51703906, + "learning_rate": 0.00027452156242271784, + "loss": 1.53604448, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.3828125, + "step": 4, + "time_per_iteration": 2.7456114292144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01846218, + "balance_loss_mlp": 1.25188851, + "epoch": 0.0009619084263178145, + "flos": 485857861632.0, + "grad_norm": 2.5417144067747603, + "language_loss": 1.52625787, + "learning_rate": 0.0003187096642208417, + "loss": 1.54472005, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 5.93359375, + "step": 5, + "time_per_iteration": 2.6199026107788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0183984, + "balance_loss_mlp": 1.27068734, + "epoch": 0.0011542901115813775, + "flos": 559744791552.0, + "grad_norm": 1.334824335042464, + "language_loss": 1.40782702, + "learning_rate": 0.0003548139722510539, + "loss": 1.42622542, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 5.69921875, + "step": 6, + "time_per_iteration": 2.747270107269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0199186, + "balance_loss_mlp": 1.44254375, + "epoch": 0.0013466717968449403, + "flos": 533721014784.0, + "grad_norm": 1.092177996343933, + "language_loss": 1.36706996, + "learning_rate": 0.00038533972973918044, + "loss": 1.38698864, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.5, + "step": 7, + "time_per_iteration": 2.6748878955841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02260733, + "balance_loss_mlp": 1.72209811, + "epoch": 0.0015390534821085034, + "flos": 492037175808.0, + "grad_norm": 0.8384078813871362, + "language_loss": 1.30779457, + "learning_rate": 0.0004117823436340768, + "loss": 1.3304019, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.390625, + "step": 8, + "time_per_iteration": 2.6130077838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02549259, + "balance_loss_mlp": 2.01024222, + "epoch": 0.0017314351673720662, + "flos": 564402207744.0, + "grad_norm": 0.9225645938984937, + "language_loss": 1.40127456, + "learning_rate": 0.00043510638207938993, + "loss": 1.42676711, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.39453125, + "step": 9, + "time_per_iteration": 2.8516194820404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02769124, + "balance_loss_mlp": 2.22057033, + "epoch": 0.001923816852635629, + "flos": 593132308992.0, + "grad_norm": 2.3673640139094667, + "language_loss": 1.25222194, + "learning_rate": 0.00045597044543220066, + "loss": 1.27991319, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.4921875, + "step": 10, + "time_per_iteration": 2.6775431632995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02889683, + "balance_loss_mlp": 2.31366348, + "epoch": 0.002116198537899192, + "flos": 609308752896.0, + "grad_norm": 3.9279002976271125, + "language_loss": 1.24874163, + "learning_rate": 0.00047484428652143135, + "loss": 1.27763844, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 5.765625, + "step": 11, + "time_per_iteration": 2.978304386138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0309849, + "balance_loss_mlp": 2.49538684, + "epoch": 0.002308580223162755, + "flos": 544869075456.0, + "grad_norm": 1.4997276509751025, + "language_loss": 1.30425894, + "learning_rate": 0.0004920747534624128, + "loss": 1.33524382, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 6.01953125, + "step": 12, + "time_per_iteration": 2.660757064819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0325611, + "balance_loss_mlp": 2.63698483, + "epoch": 0.002500961908426318, + "flos": 644458277376.0, + "grad_norm": 0.27573519674031227, + "language_loss": 1.29333067, + "learning_rate": 0.0005079252465375872, + "loss": 1.32589173, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 6.1875, + "step": 13, + "time_per_iteration": 2.905634880065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03517619, + "balance_loss_mlp": 2.87789392, + "epoch": 0.0026933435936898806, + "flos": 487605312000.0, + "grad_norm": 0.5949349515444387, + "language_loss": 1.16881835, + "learning_rate": 0.0005226005109505393, + "loss": 1.20399451, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 6.39453125, + "step": 14, + "time_per_iteration": 2.6116466522216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03647219, + "balance_loss_mlp": 2.99872088, + "epoch": 0.0028857252789534437, + "flos": 434368949760.0, + "grad_norm": 0.7718254129229014, + "language_loss": 1.22867727, + "learning_rate": 0.0005362628552605367, + "loss": 1.26514947, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 6.484375, + "step": 15, + "time_per_iteration": 2.80147123336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03485084, + "balance_loss_mlp": 2.81407928, + "epoch": 0.0030781069642170067, + "flos": 596465676288.0, + "grad_norm": 0.7401604798059911, + "language_loss": 1.27103257, + "learning_rate": 0.0005490431248454357, + "loss": 1.30588341, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 6.71484375, + "step": 16, + "time_per_iteration": 2.72638201713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03091961, + "balance_loss_mlp": 2.46329856, + "epoch": 0.0032704886494805694, + "flos": 1537360432128.0, + "grad_norm": 0.30683115050750837, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78797078, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 6.28125, + "step": 17, + "time_per_iteration": 6.094223260879517 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03189654, + "balance_loss_mlp": 2.50453377, + "epoch": 0.0034628703347441324, + "flos": 473720403456.0, + "grad_norm": 0.3045463524910074, + "language_loss": 1.13145232, + "learning_rate": 0.0005723671632907488, + "loss": 1.16334891, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 6.859375, + "step": 18, + "time_per_iteration": 2.6759910583496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03092663, + "balance_loss_mlp": 2.39648056, + "epoch": 0.0036552520200076955, + "flos": 448303320576.0, + "grad_norm": 0.23602477180386344, + "language_loss": 1.18155861, + "learning_rate": 0.0005830738490244919, + "loss": 1.21248519, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 6.97265625, + "step": 19, + "time_per_iteration": 2.505410671234131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03039888, + "balance_loss_mlp": 2.32653999, + "epoch": 0.003847633705271258, + "flos": 635881148928.0, + "grad_norm": 0.24009706761990102, + "language_loss": 1.19359791, + "learning_rate": 0.0005932312266435596, + "loss": 1.22399676, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 7.12890625, + "step": 20, + "time_per_iteration": 2.78657603263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03040938, + "balance_loss_mlp": 2.32339382, + "epoch": 0.004040015390534821, + "flos": 589222771200.0, + "grad_norm": 0.17079239690828452, + "language_loss": 1.14516783, + "learning_rate": 0.0006028929207788754, + "loss": 1.17557728, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 7.171875, + "step": 21, + "time_per_iteration": 2.7249202728271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03095818, + "balance_loss_mlp": 2.35843754, + "epoch": 0.004232397075798384, + "flos": 756253338624.0, + "grad_norm": 0.14242736472953105, + "language_loss": 1.17636526, + "learning_rate": 0.0006121050677327902, + "loss": 1.20732355, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 7.3671875, + "step": 22, + "time_per_iteration": 2.8816165924072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03158898, + "balance_loss_mlp": 2.41388798, + "epoch": 0.004424778761061947, + "flos": 526434439680.0, + "grad_norm": 0.2087285570273359, + "language_loss": 1.07450879, + "learning_rate": 0.0006209076479463684, + "loss": 1.10609782, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 7.44140625, + "step": 23, + "time_per_iteration": 2.6234865188598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03183939, + "balance_loss_mlp": 2.43282533, + "epoch": 0.00461716044632551, + "flos": 547907079168.0, + "grad_norm": 0.1648031444861348, + "language_loss": 1.17208815, + "learning_rate": 0.0006293355346737718, + "loss": 1.20392752, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 7.50390625, + "step": 24, + "time_per_iteration": 2.6747982501983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03230874, + "balance_loss_mlp": 2.47976065, + "epoch": 0.004809542131589073, + "flos": 567293234688.0, + "grad_norm": 0.19727819873357916, + "language_loss": 1.13454294, + "learning_rate": 0.0006374193284416834, + "loss": 1.16685176, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 7.5078125, + "step": 25, + "time_per_iteration": 2.778822898864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0329228, + "balance_loss_mlp": 2.5568068, + "epoch": 0.005001923816852636, + "flos": 470391418368.0, + "grad_norm": 0.1350276315355779, + "language_loss": 1.11706781, + "learning_rate": 0.0006451860277489461, + "loss": 1.14999056, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 7.34765625, + "step": 26, + "time_per_iteration": 2.595344305038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03271905, + "balance_loss_mlp": 2.55016398, + "epoch": 0.005194305502116198, + "flos": 415283950080.0, + "grad_norm": 0.16347516382600882, + "language_loss": 1.19968891, + "learning_rate": 0.0006526595731190848, + "loss": 1.23240781, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 7.21484375, + "step": 27, + "time_per_iteration": 2.4664127826690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03288089, + "balance_loss_mlp": 2.59610367, + "epoch": 0.005386687187379761, + "flos": 628466535936.0, + "grad_norm": 0.1428829159478278, + "language_loss": 1.13108253, + "learning_rate": 0.0006598612921618983, + "loss": 1.16396332, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 6.92578125, + "step": 28, + "time_per_iteration": 2.804295778274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03294075, + "balance_loss_mlp": 2.62612176, + "epoch": 0.005579068872643324, + "flos": 886100332032.0, + "grad_norm": 0.20851883498814452, + "language_loss": 1.0600431, + "learning_rate": 0.0006668102665011454, + "loss": 1.09298372, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 6.68359375, + "step": 29, + "time_per_iteration": 3.255702495574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03199031, + "balance_loss_mlp": 2.59096837, + "epoch": 0.005771450557906887, + "flos": 547287238656.0, + "grad_norm": 0.2979528071454863, + "language_loss": 1.15479767, + "learning_rate": 0.0006735236364718957, + "loss": 1.18678796, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 6.0703125, + "step": 30, + "time_per_iteration": 2.7074596881866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03173184, + "balance_loss_mlp": 2.61356831, + "epoch": 0.00596383224317045, + "flos": 531766950912.0, + "grad_norm": 0.19339065750569648, + "language_loss": 1.13838637, + "learning_rate": 0.0006800168558381346, + "loss": 1.17011821, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 5.60546875, + "step": 31, + "time_per_iteration": 2.6867663860321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03044372, + "balance_loss_mlp": 2.54197669, + "epoch": 0.0061562139284340135, + "flos": 588813926400.0, + "grad_norm": 0.19192711986346297, + "language_loss": 1.17224455, + "learning_rate": 0.0006863039060567947, + "loss": 1.20268822, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 5.01953125, + "step": 32, + "time_per_iteration": 2.7029900550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02954172, + "balance_loss_mlp": 2.48954153, + "epoch": 0.006348595613697576, + "flos": 617929551360.0, + "grad_norm": 0.18120318877382763, + "language_loss": 1.09236336, + "learning_rate": 0.0006923974775611263, + "loss": 1.12190521, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 4.640625, + "step": 33, + "time_per_iteration": 2.7966651916503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02822322, + "balance_loss_mlp": 2.40728283, + "epoch": 0.006540977298961139, + "flos": 777564444672.0, + "grad_norm": 0.145871801521796, + "language_loss": 1.05915022, + "learning_rate": 0.0006983091239737814, + "loss": 1.0873735, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 4.15625, + "step": 34, + "time_per_iteration": 2.9987330436706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02690136, + "balance_loss_mlp": 2.31496024, + "epoch": 0.006733358984224702, + "flos": 666837356544.0, + "grad_norm": 0.3134152992972928, + "language_loss": 1.04935622, + "learning_rate": 0.0007040493939600222, + "loss": 1.07625759, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 3.75, + "step": 35, + "time_per_iteration": 2.8552193641662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02568493, + "balance_loss_mlp": 2.22154617, + "epoch": 0.006925740669488265, + "flos": 564092287488.0, + "grad_norm": 0.17701612022333574, + "language_loss": 1.05792356, + "learning_rate": 0.0007096279445021078, + "loss": 1.08360851, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 3.47070312, + "step": 36, + "time_per_iteration": 2.7224435806274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02489254, + "balance_loss_mlp": 2.16557646, + "epoch": 0.007118122354751828, + "flos": 549583156224.0, + "grad_norm": 0.13856321956275922, + "language_loss": 1.12953377, + "learning_rate": 0.0007150536386503726, + "loss": 1.15442634, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 3.23632812, + "step": 37, + "time_per_iteration": 2.868824005126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02371099, + "balance_loss_mlp": 2.08385229, + "epoch": 0.007310504040015391, + "flos": 702161409024.0, + "grad_norm": 0.1045684718913455, + "language_loss": 1.04885924, + "learning_rate": 0.0007203346302358509, + "loss": 1.0725702, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 2.87304688, + "step": 38, + "time_per_iteration": 2.9964613914489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.022844, + "balance_loss_mlp": 2.01431966, + "epoch": 0.007502885725278953, + "flos": 599022051840.0, + "grad_norm": 0.11457879899925279, + "language_loss": 1.09371829, + "learning_rate": 0.000725478437577282, + "loss": 1.11656225, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 2.703125, + "step": 39, + "time_per_iteration": 2.7697911262512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02209938, + "balance_loss_mlp": 1.9577868, + "epoch": 0.007695267410542516, + "flos": 560000867328.0, + "grad_norm": 0.09741634912607965, + "language_loss": 1.05106318, + "learning_rate": 0.0007304920078549186, + "loss": 1.07316256, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 2.51953125, + "step": 40, + "time_per_iteration": 2.6858811378479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02127988, + "balance_loss_mlp": 1.89738917, + "epoch": 0.007887649095806078, + "flos": 507906671616.0, + "grad_norm": 0.1027173821952558, + "language_loss": 1.0668, + "learning_rate": 0.0007353817735343603, + "loss": 1.08807993, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 2.30273438, + "step": 41, + "time_per_iteration": 2.7466464042663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0203117, + "balance_loss_mlp": 1.82136178, + "epoch": 0.008080030781069641, + "flos": 503642133504.0, + "grad_norm": 0.13433083641106106, + "language_loss": 1.02085233, + "learning_rate": 0.0007401537019902344, + "loss": 1.04116416, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 2.10058594, + "step": 42, + "time_per_iteration": 2.6472368240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01955875, + "balance_loss_mlp": 1.77000403, + "epoch": 0.008272412466333205, + "flos": 517764178944.0, + "grad_norm": 0.1211736659455407, + "language_loss": 1.05737603, + "learning_rate": 0.0007448133392900729, + "loss": 1.07693481, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 1.85742188, + "step": 43, + "time_per_iteration": 2.716550588607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01955604, + "balance_loss_mlp": 1.78737581, + "epoch": 0.008464794151596768, + "flos": 607673373696.0, + "grad_norm": 0.16872872054008078, + "language_loss": 1.01187599, + "learning_rate": 0.0007493658489441491, + "loss": 1.03143215, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 1.68261719, + "step": 44, + "time_per_iteration": 2.875014066696167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01891991, + "balance_loss_mlp": 1.7426461, + "epoch": 0.00865717583686033, + "flos": 537661075968.0, + "grad_norm": 0.13908928982797317, + "language_loss": 1.04866791, + "learning_rate": 0.0007538160463002316, + "loss": 1.06758785, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 1.4921875, + "step": 45, + "time_per_iteration": 2.6579010486602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01770341, + "balance_loss_mlp": 1.64674437, + "epoch": 0.008849557522123894, + "flos": 507758284800.0, + "grad_norm": 0.10189568444589565, + "language_loss": 1.07831812, + "learning_rate": 0.0007581684291577274, + "loss": 1.09602141, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 1.234375, + "step": 46, + "time_per_iteration": 2.640967845916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01721967, + "balance_loss_mlp": 1.61086416, + "epoch": 0.009041939207387457, + "flos": 625048800768.0, + "grad_norm": 0.13316435244960997, + "language_loss": 1.10805786, + "learning_rate": 0.0007624272050891776, + "loss": 1.12527752, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 1.11230469, + "step": 47, + "time_per_iteration": 2.8335459232330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01578117, + "balance_loss_mlp": 1.4876132, + "epoch": 0.00923432089265102, + "flos": 549124849152.0, + "grad_norm": 0.11283146306838601, + "language_loss": 1.0112282, + "learning_rate": 0.0007665963158851307, + "loss": 1.02700949, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.90478516, + "step": 48, + "time_per_iteration": 2.8267853260040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01494271, + "balance_loss_mlp": 1.41659403, + "epoch": 0.009426702577914583, + "flos": 562202242560.0, + "grad_norm": 0.11438710989386189, + "language_loss": 1.09804726, + "learning_rate": 0.0007706794594783609, + "loss": 1.11299002, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.77587891, + "step": 49, + "time_per_iteration": 2.767359495162964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450716, + "balance_loss_mlp": 1.37876153, + "epoch": 0.009619084263178146, + "flos": 616486228992.0, + "grad_norm": 0.12814906604020712, + "language_loss": 1.08643568, + "learning_rate": 0.0007746801096530423, + "loss": 1.10094285, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.71972656, + "step": 50, + "time_per_iteration": 2.8213155269622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0143922, + "balance_loss_mlp": 1.37599134, + "epoch": 0.009811465948441709, + "flos": 541176325632.0, + "grad_norm": 0.19317362931311696, + "language_loss": 1.13336241, + "learning_rate": 0.0007786015338021173, + "loss": 1.14775467, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.63183594, + "step": 51, + "time_per_iteration": 2.670414924621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01421394, + "balance_loss_mlp": 1.36116982, + "epoch": 0.010003847633705272, + "flos": 535608087552.0, + "grad_norm": 0.10636608126159033, + "language_loss": 1.06046486, + "learning_rate": 0.0007824468089603051, + "loss": 1.0746789, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.60205078, + "step": 52, + "time_per_iteration": 2.650749683380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01398771, + "balance_loss_mlp": 1.34627175, + "epoch": 0.010196229318968833, + "flos": 908867907072.0, + "grad_norm": 0.08734537144859746, + "language_loss": 1.05057502, + "learning_rate": 0.0007862188363098669, + "loss": 1.0645628, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.52587891, + "step": 53, + "time_per_iteration": 3.1914114952087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339892, + "balance_loss_mlp": 1.29123116, + "epoch": 0.010388611004232396, + "flos": 585594040320.0, + "grad_norm": 0.12892942806844523, + "language_loss": 1.05977488, + "learning_rate": 0.0007899203543304438, + "loss": 1.07317376, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.48608398, + "step": 54, + "time_per_iteration": 2.7370150089263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129116, + "balance_loss_mlp": 1.24609876, + "epoch": 0.01058099268949596, + "flos": 502233716736.0, + "grad_norm": 0.10351520483586135, + "language_loss": 1.19524932, + "learning_rate": 0.0007935539507422731, + "loss": 1.20816088, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.45068359, + "step": 55, + "time_per_iteration": 2.5938258171081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241218, + "balance_loss_mlp": 1.19842196, + "epoch": 0.010773374374759523, + "flos": 544170659328.0, + "grad_norm": 0.14579553174668378, + "language_loss": 1.11398613, + "learning_rate": 0.0007971220733732573, + "loss": 1.12639832, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.42822266, + "step": 56, + "time_per_iteration": 2.69441556930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214647, + "balance_loss_mlp": 1.1754272, + "epoch": 0.010965756060023086, + "flos": 525874235904.0, + "grad_norm": 0.08690334212617827, + "language_loss": 1.05753016, + "learning_rate": 0.0008006270400641869, + "loss": 1.06967664, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.39208984, + "step": 57, + "time_per_iteration": 2.72200345993042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172174, + "balance_loss_mlp": 1.13638771, + "epoch": 0.011158137745286649, + "flos": 576653147136.0, + "grad_norm": 0.1589230608581115, + "language_loss": 1.07195449, + "learning_rate": 0.0008040710477125043, + "loss": 1.08367622, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.35791016, + "step": 58, + "time_per_iteration": 2.7268636226654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116856, + "balance_loss_mlp": 1.13193893, + "epoch": 0.011350519430550212, + "flos": 529024310784.0, + "grad_norm": 0.10215076611006164, + "language_loss": 1.07557666, + "learning_rate": 0.0008074561805429771, + "loss": 1.08726227, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.36645508, + "step": 59, + "time_per_iteration": 2.6336522102355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116508, + "balance_loss_mlp": 1.13067603, + "epoch": 0.011542901115813775, + "flos": 555608291328.0, + "grad_norm": 0.1141641229712409, + "language_loss": 1.06040812, + "learning_rate": 0.0008107844176832545, + "loss": 1.07205892, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.34399414, + "step": 60, + "time_per_iteration": 2.6922121047973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181661, + "balance_loss_mlp": 1.14883125, + "epoch": 0.011735282801077338, + "flos": 571826995200.0, + "grad_norm": 0.13546354224487772, + "language_loss": 1.07509732, + "learning_rate": 0.0008140576401132568, + "loss": 1.08691382, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.32836914, + "step": 61, + "time_per_iteration": 2.632707357406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183098, + "balance_loss_mlp": 1.15415382, + "epoch": 0.0119276644863409, + "flos": 615309156864.0, + "grad_norm": 0.21921646489667587, + "language_loss": 1.08552384, + "learning_rate": 0.0008172776370494935, + "loss": 1.09735489, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.28955078, + "step": 62, + "time_per_iteration": 2.736295700073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169478, + "balance_loss_mlp": 1.14103436, + "epoch": 0.012120046171604464, + "flos": 500835474432.0, + "grad_norm": 0.08851801033761798, + "language_loss": 1.15278125, + "learning_rate": 0.0008204461118185703, + "loss": 1.16447616, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.28417969, + "step": 63, + "time_per_iteration": 2.6189370155334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164028, + "balance_loss_mlp": 1.13801682, + "epoch": 0.012312427856868027, + "flos": 473109327360.0, + "grad_norm": 0.09949063345381139, + "language_loss": 1.0443747, + "learning_rate": 0.0008235646872681536, + "loss": 1.05601501, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.26025391, + "step": 64, + "time_per_iteration": 2.5901291370391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163729, + "balance_loss_mlp": 1.13857555, + "epoch": 0.012504809542131588, + "flos": 538094651904.0, + "grad_norm": 0.13431360680602436, + "language_loss": 1.04092753, + "learning_rate": 0.0008266349107584288, + "loss": 1.05256474, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.25146484, + "step": 65, + "time_per_iteration": 2.6860554218292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162931, + "balance_loss_mlp": 1.13891053, + "epoch": 0.012697191227395151, + "flos": 608450365440.0, + "grad_norm": 0.1102068865315058, + "language_loss": 1.07257366, + "learning_rate": 0.0008296582587724851, + "loss": 1.08420289, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.24023438, + "step": 66, + "time_per_iteration": 2.7269198894500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160507, + "balance_loss_mlp": 1.1370945, + "epoch": 0.012889572912658714, + "flos": 767750607360.0, + "grad_norm": 0.08100484164865049, + "language_loss": 1.05156851, + "learning_rate": 0.0008326361411800136, + "loss": 1.06317365, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.23400879, + "step": 67, + "time_per_iteration": 2.984511613845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164469, + "balance_loss_mlp": 1.14209354, + "epoch": 0.013081954597922277, + "flos": 533604561408.0, + "grad_norm": 0.7331609098323609, + "language_loss": 1.05716372, + "learning_rate": 0.0008355699051851403, + "loss": 1.06880832, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.22363281, + "step": 68, + "time_per_iteration": 2.7606749534606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236513, + "balance_loss_mlp": 1.21256447, + "epoch": 0.01327433628318584, + "flos": 572826567168.0, + "grad_norm": 0.09768789722348739, + "language_loss": 1.12206995, + "learning_rate": 0.0008384608389860635, + "loss": 1.13443518, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.23950195, + "step": 69, + "time_per_iteration": 2.687361001968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308666, + "balance_loss_mlp": 1.28513408, + "epoch": 0.013466717968449404, + "flos": 497029243392.0, + "grad_norm": 0.20600635395561306, + "language_loss": 1.02831006, + "learning_rate": 0.000841310175171381, + "loss": 1.04139686, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.23510742, + "step": 70, + "time_per_iteration": 2.5935816764831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01326501, + "balance_loss_mlp": 1.30259991, + "epoch": 0.013659099653712967, + "flos": 565234454016.0, + "grad_norm": 0.21749814226597305, + "language_loss": 1.00826097, + "learning_rate": 0.000844119093875517, + "loss": 1.0215261, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.2388916, + "step": 71, + "time_per_iteration": 2.706749439239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327473, + "balance_loss_mlp": 1.30280876, + "epoch": 0.01385148133897653, + "flos": 573540950016.0, + "grad_norm": 0.15663283615990556, + "language_loss": 1.06174731, + "learning_rate": 0.0008468887257134666, + "loss": 1.0750221, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.24682617, + "step": 72, + "time_per_iteration": 2.6893503665924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01307936, + "balance_loss_mlp": 1.28290248, + "epoch": 0.014043863024240093, + "flos": 576539665920.0, + "grad_norm": 0.165113983041647, + "language_loss": 1.08480573, + "learning_rate": 0.0008496201545131264, + "loss": 1.09788513, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.25012207, + "step": 73, + "time_per_iteration": 2.722555637359619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228575, + "balance_loss_mlp": 1.20456624, + "epoch": 0.014236244709503656, + "flos": 938287660032.0, + "grad_norm": 0.08819174949442792, + "language_loss": 1.05711758, + "learning_rate": 0.0008523144198617317, + "loss": 1.06940317, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.2401123, + "step": 74, + "time_per_iteration": 3.1970512866973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197377, + "balance_loss_mlp": 1.17341638, + "epoch": 0.014428626394767219, + "flos": 528231352320.0, + "grad_norm": 0.4509181854760719, + "language_loss": 1.05384588, + "learning_rate": 0.0008549725194813783, + "loss": 1.06581974, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.23962402, + "step": 75, + "time_per_iteration": 2.6595916748046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153799, + "balance_loss_mlp": 1.13126826, + "epoch": 0.014621008080030782, + "flos": 803371433472.0, + "grad_norm": 0.13717241934186405, + "language_loss": 1.0561651, + "learning_rate": 0.0008575954114472099, + "loss": 1.06770301, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.2253418, + "step": 76, + "time_per_iteration": 3.126678943634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146131, + "balance_loss_mlp": 1.12299228, + "epoch": 0.014813389765294343, + "flos": 696588788736.0, + "grad_norm": 0.24880809118993477, + "language_loss": 1.04725742, + "learning_rate": 0.0008601840162606118, + "loss": 1.05871868, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.23132324, + "step": 77, + "time_per_iteration": 3.0479044914245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125538, + "balance_loss_mlp": 1.10244715, + "epoch": 0.015005771450557906, + "flos": 596702813184.0, + "grad_norm": 0.18599993070264256, + "language_loss": 1.10793126, + "learning_rate": 0.000862739218788641, + "loss": 1.11918664, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.23083496, + "step": 78, + "time_per_iteration": 2.8093104362487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206073, + "balance_loss_mlp": 1.18093228, + "epoch": 0.01519815313582147, + "flos": 549148170240.0, + "grad_norm": 0.1007392116308827, + "language_loss": 1.07089067, + "learning_rate": 0.0008652618700799138, + "loss": 1.08295143, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.25146484, + "step": 79, + "time_per_iteration": 2.657278060913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312607, + "balance_loss_mlp": 1.28511751, + "epoch": 0.015390534821085032, + "flos": 430306642944.0, + "grad_norm": 0.10464806869950885, + "language_loss": 1.06340718, + "learning_rate": 0.0008677527890662774, + "loss": 1.07653332, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.27514648, + "step": 80, + "time_per_iteration": 2.541733741760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01403725, + "balance_loss_mlp": 1.37456632, + "epoch": 0.015582916506348595, + "flos": 523854743040.0, + "grad_norm": 0.15378710965831335, + "language_loss": 1.0758636, + "learning_rate": 0.0008702127641587799, + "loss": 1.08990085, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.29125977, + "step": 81, + "time_per_iteration": 2.6628620624542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01387899, + "balance_loss_mlp": 1.36045754, + "epoch": 0.015775298191612157, + "flos": 575151598080.0, + "grad_norm": 0.16587297874586884, + "language_loss": 1.02605438, + "learning_rate": 0.0008726425547457192, + "loss": 1.03993344, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.27490234, + "step": 82, + "time_per_iteration": 2.774146795272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365036, + "balance_loss_mlp": 1.34021688, + "epoch": 0.01596767987687572, + "flos": 610040664576.0, + "grad_norm": 0.16158882984955267, + "language_loss": 1.02648211, + "learning_rate": 0.0008750428925998964, + "loss": 1.04013252, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.24829102, + "step": 83, + "time_per_iteration": 2.745786190032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01321379, + "balance_loss_mlp": 1.29746556, + "epoch": 0.016160061562139283, + "flos": 566864040960.0, + "grad_norm": 0.12210664974135504, + "language_loss": 1.08113122, + "learning_rate": 0.0008774144832015932, + "loss": 1.09434509, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.23937988, + "step": 84, + "time_per_iteration": 2.695239543914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01701738, + "balance_loss_mlp": 1.6791358, + "epoch": 0.016352443247402846, + "flos": 1410557234688.0, + "grad_norm": 0.2213803749296612, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76476049, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.22558594, + "step": 85, + "time_per_iteration": 4.597177982330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228049, + "balance_loss_mlp": 1.20597172, + "epoch": 0.01654482493266641, + "flos": 730177127424.0, + "grad_norm": 0.08119704963525505, + "language_loss": 1.03748381, + "learning_rate": 0.0008820741205014318, + "loss": 1.04976428, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.22070312, + "step": 86, + "time_per_iteration": 2.881804943084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193732, + "balance_loss_mlp": 1.17282319, + "epoch": 0.016737206617929972, + "flos": 536016932352.0, + "grad_norm": 0.06752942516789381, + "language_loss": 1.04735541, + "learning_rate": 0.0008843634575408404, + "loss": 1.05929279, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.20922852, + "step": 87, + "time_per_iteration": 2.681497812271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197059, + "balance_loss_mlp": 1.17523217, + "epoch": 0.016929588303193535, + "flos": 536706584064.0, + "grad_norm": 0.068849585693396, + "language_loss": 1.06270838, + "learning_rate": 0.0008866266301555082, + "loss": 1.0746789, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.21826172, + "step": 88, + "time_per_iteration": 2.7393336296081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188909, + "balance_loss_mlp": 1.16813099, + "epoch": 0.017121969988457098, + "flos": 526498458624.0, + "grad_norm": 0.11163273932728453, + "language_loss": 1.06937528, + "learning_rate": 0.0008888642296509615, + "loss": 1.08126438, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.20776367, + "step": 89, + "time_per_iteration": 2.5859603881835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190142, + "balance_loss_mlp": 1.16919696, + "epoch": 0.01731435167372066, + "flos": 625304876544.0, + "grad_norm": 0.08151329596812326, + "language_loss": 1.11272717, + "learning_rate": 0.0008910768275115906, + "loss": 1.12462866, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.20947266, + "step": 90, + "time_per_iteration": 2.7672746181488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188461, + "balance_loss_mlp": 1.16750431, + "epoch": 0.017506733358984224, + "flos": 496157709312.0, + "grad_norm": 0.10059554630111206, + "language_loss": 1.06862557, + "learning_rate": 0.0008932649762767675, + "loss": 1.08051026, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.20947266, + "step": 91, + "time_per_iteration": 2.5685906410217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164871, + "balance_loss_mlp": 1.14348471, + "epoch": 0.017699115044247787, + "flos": 745613047296.0, + "grad_norm": 0.10996439779682221, + "language_loss": 1.10012543, + "learning_rate": 0.0008954292103690864, + "loss": 1.11177421, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.21398926, + "step": 92, + "time_per_iteration": 2.974438428878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164995, + "balance_loss_mlp": 1.14351392, + "epoch": 0.01789149672951135, + "flos": 515257265664.0, + "grad_norm": 0.07660536936337886, + "language_loss": 1.12072349, + "learning_rate": 0.0008975700468778296, + "loss": 1.13237333, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.21496582, + "step": 93, + "time_per_iteration": 2.5806186199188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162029, + "balance_loss_mlp": 1.14116728, + "epoch": 0.018083878414774913, + "flos": 585850116096.0, + "grad_norm": 0.0766138268717318, + "language_loss": 1.04864383, + "learning_rate": 0.0008996879863005366, + "loss": 1.06026423, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.20874023, + "step": 94, + "time_per_iteration": 2.6688339710235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153951, + "balance_loss_mlp": 1.13311303, + "epoch": 0.018276260100038477, + "flos": 497103436800.0, + "grad_norm": 0.05852633811132637, + "language_loss": 1.05006421, + "learning_rate": 0.0009017835132453337, + "loss": 1.06160367, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.20849609, + "step": 95, + "time_per_iteration": 2.5905888080596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168717, + "balance_loss_mlp": 1.14677107, + "epoch": 0.01846864178530204, + "flos": 639765955584.0, + "grad_norm": 0.10434292302548942, + "language_loss": 1.05011988, + "learning_rate": 0.0009038570970964896, + "loss": 1.06180692, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.21960449, + "step": 96, + "time_per_iteration": 2.819176197052002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143672, + "balance_loss_mlp": 1.12233388, + "epoch": 0.018661023470565603, + "flos": 511411746816.0, + "grad_norm": 0.06578690538752763, + "language_loss": 1.02219808, + "learning_rate": 0.0009059091926454854, + "loss": 1.0336349, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.21362305, + "step": 97, + "time_per_iteration": 2.6332285404205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128399, + "balance_loss_mlp": 1.10685802, + "epoch": 0.018853405155829166, + "flos": 930710103552.0, + "grad_norm": 0.06319745463615938, + "language_loss": 1.01510525, + "learning_rate": 0.0009079402406897198, + "loss": 1.02638912, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.2154541, + "step": 98, + "time_per_iteration": 3.231128454208374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115682, + "balance_loss_mlp": 1.09365261, + "epoch": 0.01904578684109273, + "flos": 576209396736.0, + "grad_norm": 0.08014689887623593, + "language_loss": 1.0309999, + "learning_rate": 0.0009099506686008212, + "loss": 1.0421567, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.22045898, + "step": 99, + "time_per_iteration": 2.7899162769317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108676, + "balance_loss_mlp": 1.08750439, + "epoch": 0.019238168526356292, + "flos": 558173431296.0, + "grad_norm": 0.07479046847477189, + "language_loss": 1.06245041, + "learning_rate": 0.0009119408908644013, + "loss": 1.07353711, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.21179199, + "step": 100, + "time_per_iteration": 2.76654314994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111244, + "balance_loss_mlp": 1.09094632, + "epoch": 0.019430550211619855, + "flos": 723539506176.0, + "grad_norm": 0.1293510891653682, + "language_loss": 1.11089611, + "learning_rate": 0.0009139113095929519, + "loss": 1.12202048, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.21496582, + "step": 101, + "time_per_iteration": 2.9448165893554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113187, + "balance_loss_mlp": 1.09134769, + "epoch": 0.019622931896883418, + "flos": 499235000832.0, + "grad_norm": 0.0662757157914564, + "language_loss": 1.05513644, + "learning_rate": 0.0009158623150134762, + "loss": 1.06626844, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.21838379, + "step": 102, + "time_per_iteration": 2.561089277267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132871, + "balance_loss_mlp": 1.11103153, + "epoch": 0.01981531358214698, + "flos": 508916418048.0, + "grad_norm": 0.12924626158025887, + "language_loss": 1.05462444, + "learning_rate": 0.000917794285931332, + "loss": 1.06595314, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.21850586, + "step": 103, + "time_per_iteration": 2.6556921005249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150426, + "balance_loss_mlp": 1.12918282, + "epoch": 0.020007695267410544, + "flos": 521087371776.0, + "grad_norm": 0.12259017558591545, + "language_loss": 0.9774698, + "learning_rate": 0.0009197075901716639, + "loss": 0.98897398, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.21264648, + "step": 104, + "time_per_iteration": 2.721444845199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141997, + "balance_loss_mlp": 1.12036085, + "epoch": 0.020200076952674107, + "flos": 533013834240.0, + "grad_norm": 0.06848283791602199, + "language_loss": 1.07568073, + "learning_rate": 0.0009216025849997171, + "loss": 1.08710074, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.21655273, + "step": 105, + "time_per_iteration": 2.785515785217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138939, + "balance_loss_mlp": 1.11743319, + "epoch": 0.020392458637937667, + "flos": 684430981632.0, + "grad_norm": 0.05548353541402364, + "language_loss": 1.02272427, + "learning_rate": 0.0009234796175212258, + "loss": 1.03411365, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.21520996, + "step": 106, + "time_per_iteration": 2.917363166809082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131099, + "balance_loss_mlp": 1.10940301, + "epoch": 0.02058484032320123, + "flos": 701791852032.0, + "grad_norm": 0.08012311925806644, + "language_loss": 1.06108189, + "learning_rate": 0.000925339025064007, + "loss": 1.07239294, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.21691895, + "step": 107, + "time_per_iteration": 2.9934780597686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137249, + "balance_loss_mlp": 1.11515951, + "epoch": 0.020777222008464793, + "flos": 638772175872.0, + "grad_norm": 0.050481524705402105, + "language_loss": 0.98984301, + "learning_rate": 0.0009271811355418027, + "loss": 1.00121546, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.2208252, + "step": 108, + "time_per_iteration": 2.8833272457122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119308, + "balance_loss_mlp": 1.09725404, + "epoch": 0.020969603693728356, + "flos": 681785856000.0, + "grad_norm": 0.04498034405706927, + "language_loss": 1.05478954, + "learning_rate": 0.0009290062678013548, + "loss": 1.06598258, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.22058105, + "step": 109, + "time_per_iteration": 2.839287042617798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126832, + "balance_loss_mlp": 1.1043849, + "epoch": 0.02116198537899192, + "flos": 533140462080.0, + "grad_norm": 0.08965534617549129, + "language_loss": 1.03900754, + "learning_rate": 0.0009308147319536321, + "loss": 1.0502758, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.2244873, + "step": 110, + "time_per_iteration": 2.664785385131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127322, + "balance_loss_mlp": 1.10527992, + "epoch": 0.021354367064255482, + "flos": 717168135168.0, + "grad_norm": 0.07991094573250712, + "language_loss": 1.10446882, + "learning_rate": 0.0009326068296900676, + "loss": 1.11574197, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.22045898, + "step": 111, + "time_per_iteration": 2.826704502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118777, + "balance_loss_mlp": 1.09644949, + "epoch": 0.021546748749519045, + "flos": 519290459136.0, + "grad_norm": 0.05764113319631223, + "language_loss": 1.01306438, + "learning_rate": 0.0009343828545846161, + "loss": 1.02425218, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.2232666, + "step": 112, + "time_per_iteration": 2.774557113647461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130616, + "balance_loss_mlp": 1.10844338, + "epoch": 0.021739130434782608, + "flos": 504912337920.0, + "grad_norm": 0.11711254624088742, + "language_loss": 1.04517794, + "learning_rate": 0.0009361430923823841, + "loss": 1.0564841, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.22192383, + "step": 113, + "time_per_iteration": 2.5728189945220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143235, + "balance_loss_mlp": 1.12140775, + "epoch": 0.02193151212004617, + "flos": 463251820032.0, + "grad_norm": 0.09177669908726471, + "language_loss": 1.08950138, + "learning_rate": 0.0009378878212755459, + "loss": 1.10093367, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.21826172, + "step": 114, + "time_per_iteration": 2.4959068298339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118923, + "balance_loss_mlp": 1.09746575, + "epoch": 0.022123893805309734, + "flos": 552008673792.0, + "grad_norm": 0.05600308486582556, + "language_loss": 0.98889154, + "learning_rate": 0.0009396173121672103, + "loss": 1.00008082, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.21472168, + "step": 115, + "time_per_iteration": 2.687619924545288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131221, + "balance_loss_mlp": 1.11031187, + "epoch": 0.022316275490573297, + "flos": 635920436736.0, + "grad_norm": 0.06813536890625224, + "language_loss": 1.0438683, + "learning_rate": 0.0009413318289238633, + "loss": 1.05518055, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.20922852, + "step": 116, + "time_per_iteration": 2.7658987045288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115598, + "balance_loss_mlp": 1.09498656, + "epoch": 0.02250865717583686, + "flos": 798535107072.0, + "grad_norm": 0.10996119273554948, + "language_loss": 0.97187698, + "learning_rate": 0.0009430316286169771, + "loss": 0.98303294, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.20617676, + "step": 117, + "time_per_iteration": 3.027139186859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120875, + "balance_loss_mlp": 1.10050249, + "epoch": 0.022701038861100423, + "flos": 455851763712.0, + "grad_norm": 0.06369887166042827, + "language_loss": 1.02379179, + "learning_rate": 0.0009447169617543361, + "loss": 1.03500056, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.20373535, + "step": 118, + "time_per_iteration": 2.619460344314575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114928, + "balance_loss_mlp": 1.09472179, + "epoch": 0.022893420546363986, + "flos": 582812112384.0, + "grad_norm": 0.07832492020107534, + "language_loss": 1.08849907, + "learning_rate": 0.0009463880725016029, + "loss": 1.09964836, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.20214844, + "step": 119, + "time_per_iteration": 2.689627170562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108375, + "balance_loss_mlp": 1.08852673, + "epoch": 0.02308580223162755, + "flos": 561010613760.0, + "grad_norm": 0.05815728344132157, + "language_loss": 1.03645778, + "learning_rate": 0.0009480451988946134, + "loss": 1.0475415, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.19848633, + "step": 120, + "time_per_iteration": 2.8202247619628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111521, + "balance_loss_mlp": 1.09197092, + "epoch": 0.023278183916891113, + "flos": 770966111232.0, + "grad_norm": 0.09156908943756899, + "language_loss": 1.05033565, + "learning_rate": 0.0009496885730428627, + "loss": 1.06145096, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.1953125, + "step": 121, + "time_per_iteration": 3.060826539993286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111873, + "balance_loss_mlp": 1.09195304, + "epoch": 0.023470565602154676, + "flos": 553111552512.0, + "grad_norm": 0.07227042142752892, + "language_loss": 1.03125668, + "learning_rate": 0.0009513184213246156, + "loss": 1.04237533, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.19909668, + "step": 122, + "time_per_iteration": 2.693777322769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116574, + "balance_loss_mlp": 1.09648705, + "epoch": 0.02366294728741824, + "flos": 559744791552.0, + "grad_norm": 0.10676768106860933, + "language_loss": 1.06918037, + "learning_rate": 0.0009529349645740552, + "loss": 1.08034611, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.20080566, + "step": 123, + "time_per_iteration": 2.7788801193237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108243, + "balance_loss_mlp": 1.0888958, + "epoch": 0.0238553289726818, + "flos": 468313698816.0, + "grad_norm": 0.06448608913203197, + "language_loss": 1.05440235, + "learning_rate": 0.0009545384182608524, + "loss": 1.06548476, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.19335938, + "step": 124, + "time_per_iteration": 2.542592763900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125435, + "balance_loss_mlp": 1.10582459, + "epoch": 0.024047710657945365, + "flos": 559763730432.0, + "grad_norm": 0.07866021425619718, + "language_loss": 1.03027701, + "learning_rate": 0.0009561289926625252, + "loss": 1.04153132, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.19604492, + "step": 125, + "time_per_iteration": 2.790811538696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114447, + "balance_loss_mlp": 1.09582675, + "epoch": 0.024240092343208928, + "flos": 504528224256.0, + "grad_norm": 0.05023162105608455, + "language_loss": 1.0775013, + "learning_rate": 0.0009577068930299292, + "loss": 1.08864582, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.1862793, + "step": 126, + "time_per_iteration": 2.634744882583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131099, + "balance_loss_mlp": 1.11309838, + "epoch": 0.02443247402847249, + "flos": 435516908544.0, + "grad_norm": 0.11313548721486262, + "language_loss": 1.02903807, + "learning_rate": 0.0009592723197462087, + "loss": 1.04034901, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.18017578, + "step": 127, + "time_per_iteration": 2.673091411590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135261, + "balance_loss_mlp": 1.11693859, + "epoch": 0.024624855713736054, + "flos": 683445966336.0, + "grad_norm": 0.09449576280815732, + "language_loss": 0.99720573, + "learning_rate": 0.0009608254684795125, + "loss": 1.00855827, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.18334961, + "step": 128, + "time_per_iteration": 2.9315080642700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125442, + "balance_loss_mlp": 1.10695267, + "epoch": 0.024817237398999614, + "flos": 524721894912.0, + "grad_norm": 0.06510984253988934, + "language_loss": 1.02999425, + "learning_rate": 0.0009623665303297678, + "loss": 1.04124868, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.18493652, + "step": 129, + "time_per_iteration": 2.7419071197509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109401, + "balance_loss_mlp": 1.09171033, + "epoch": 0.025009619084263177, + "flos": 655350262272.0, + "grad_norm": 0.11817944884573778, + "language_loss": 1.06827164, + "learning_rate": 0.0009638956919697878, + "loss": 1.07936561, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.17712402, + "step": 130, + "time_per_iteration": 2.898789405822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109182, + "balance_loss_mlp": 1.09152734, + "epoch": 0.02520200076952674, + "flos": 454187271168.0, + "grad_norm": 0.08339763042198223, + "language_loss": 0.98782563, + "learning_rate": 0.0009654131357809714, + "loss": 0.99891746, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.17663574, + "step": 131, + "time_per_iteration": 2.5997226238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110117, + "balance_loss_mlp": 1.09165168, + "epoch": 0.025394382454790303, + "flos": 839427397632.0, + "grad_norm": 0.07600036723868295, + "language_loss": 1.07807457, + "learning_rate": 0.0009669190399838441, + "loss": 1.08917582, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.18469238, + "step": 132, + "time_per_iteration": 3.099355459213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123354, + "balance_loss_mlp": 1.10540128, + "epoch": 0.025586764140053866, + "flos": 580725628416.0, + "grad_norm": 0.1018451896089413, + "language_loss": 1.01215065, + "learning_rate": 0.0009684135787636724, + "loss": 1.02338421, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.17956543, + "step": 133, + "time_per_iteration": 2.8484303951263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110859, + "balance_loss_mlp": 1.09306097, + "epoch": 0.02577914582531743, + "flos": 789893959680.0, + "grad_norm": 0.0768854449505878, + "language_loss": 1.05274129, + "learning_rate": 0.0009698969223913726, + "loss": 1.06384993, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.17822266, + "step": 134, + "time_per_iteration": 3.0583713054656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099954, + "balance_loss_mlp": 1.08200145, + "epoch": 0.025971527510580992, + "flos": 594683320320.0, + "grad_norm": 0.06563028697143787, + "language_loss": 1.07862437, + "learning_rate": 0.0009713692373399265, + "loss": 1.08962393, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.1796875, + "step": 135, + "time_per_iteration": 2.6854658126831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01638015, + "balance_loss_mlp": 1.62485397, + "epoch": 0.026163909195844555, + "flos": 1576771522560.0, + "grad_norm": 0.19726256755033653, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81094241, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.13183594, + "step": 136, + "time_per_iteration": 5.296766042709351 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01420299, + "balance_loss_mlp": 1.40761507, + "epoch": 0.026356290881108118, + "flos": 1501306030080.0, + "grad_norm": 0.11305854818728235, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.7923134, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.12695312, + "step": 137, + "time_per_iteration": 4.982319355010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113827, + "balance_loss_mlp": 1.12156892, + "epoch": 0.02654867256637168, + "flos": 596841025536.0, + "grad_norm": 0.17869099152539902, + "language_loss": 1.01327038, + "learning_rate": 0.0009757216201974225, + "loss": 1.02465308, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.16699219, + "step": 138, + "time_per_iteration": 2.8622727394104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186505, + "balance_loss_mlp": 1.16889763, + "epoch": 0.026741054251635244, + "flos": 544761386496.0, + "grad_norm": 0.08591345057859309, + "language_loss": 1.05914044, + "learning_rate": 0.0009771514130396581, + "loss": 1.07100558, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.17614746, + "step": 139, + "time_per_iteration": 2.67812442779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120454, + "balance_loss_mlp": 1.18700433, + "epoch": 0.026933435936898807, + "flos": 506591387136.0, + "grad_norm": 0.10724594122721719, + "language_loss": 1.05634308, + "learning_rate": 0.00097857095638274, + "loss": 1.06838858, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.17541504, + "step": 140, + "time_per_iteration": 2.597321033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120509, + "balance_loss_mlp": 1.1880548, + "epoch": 0.02712581762216237, + "flos": 740513290752.0, + "grad_norm": 0.08882077115516282, + "language_loss": 0.97595245, + "learning_rate": 0.0009799803961288726, + "loss": 0.98800337, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.17053223, + "step": 141, + "time_per_iteration": 3.017937421798706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177855, + "balance_loss_mlp": 1.16135645, + "epoch": 0.027318199307425933, + "flos": 848023464960.0, + "grad_norm": 0.07711499257167788, + "language_loss": 1.03052521, + "learning_rate": 0.000981379875086876, + "loss": 1.04230392, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.16491699, + "step": 142, + "time_per_iteration": 3.0336825847625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154055, + "balance_loss_mlp": 1.13728189, + "epoch": 0.027510580992689496, + "flos": 575288400384.0, + "grad_norm": 0.06449204224600169, + "language_loss": 0.98759103, + "learning_rate": 0.0009827695330590185, + "loss": 0.99913156, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.16784668, + "step": 143, + "time_per_iteration": 2.635596990585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131533, + "balance_loss_mlp": 1.11474872, + "epoch": 0.02770296267795306, + "flos": 772079164416.0, + "grad_norm": 0.07528415949234718, + "language_loss": 0.98083055, + "learning_rate": 0.0009841495069248256, + "loss": 0.9921459, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.16796875, + "step": 144, + "time_per_iteration": 2.9648232460021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123928, + "balance_loss_mlp": 1.10686922, + "epoch": 0.027895344363216622, + "flos": 569123642880.0, + "grad_norm": 0.10995634154815045, + "language_loss": 0.97452384, + "learning_rate": 0.0009855199307219871, + "loss": 0.98576319, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.1706543, + "step": 145, + "time_per_iteration": 2.6601850986480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113445, + "balance_loss_mlp": 1.09632671, + "epoch": 0.028087726048480186, + "flos": 547099564032.0, + "grad_norm": 0.09468853295775125, + "language_loss": 0.98972148, + "learning_rate": 0.0009868809357244854, + "loss": 1.00085592, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.17138672, + "step": 146, + "time_per_iteration": 2.7714684009552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109349, + "balance_loss_mlp": 1.09192085, + "epoch": 0.02828010773374375, + "flos": 524519663616.0, + "grad_norm": 0.08177620360389791, + "language_loss": 1.02921426, + "learning_rate": 0.0009882326505180556, + "loss": 1.04030776, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.17443848, + "step": 147, + "time_per_iteration": 2.6678037643432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121816, + "balance_loss_mlp": 1.10459065, + "epoch": 0.02847248941900731, + "flos": 772108277760.0, + "grad_norm": 0.15200564524835, + "language_loss": 1.01768231, + "learning_rate": 0.0009895752010730906, + "loss": 1.02890062, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.17236328, + "step": 148, + "time_per_iteration": 2.944622755050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139991, + "balance_loss_mlp": 1.12333786, + "epoch": 0.028664871104270875, + "flos": 534150208512.0, + "grad_norm": 0.10043611919636293, + "language_loss": 1.0762012, + "learning_rate": 0.0009909087108150867, + "loss": 1.08760118, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.16662598, + "step": 149, + "time_per_iteration": 2.730631113052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123808, + "balance_loss_mlp": 1.10708272, + "epoch": 0.028857252789534438, + "flos": 367557599232.0, + "grad_norm": 0.08772923811196923, + "language_loss": 1.08558857, + "learning_rate": 0.0009922333006927371, + "loss": 1.09682679, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.1673584, + "step": 150, + "time_per_iteration": 2.5662901401519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108027, + "balance_loss_mlp": 1.09107542, + "epoch": 0.029049634474798, + "flos": 515232534528.0, + "grad_norm": 0.10678098958344774, + "language_loss": 1.02281368, + "learning_rate": 0.0009935490892437632, + "loss": 1.03389382, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.16967773, + "step": 151, + "time_per_iteration": 2.573842763900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110516, + "balance_loss_mlp": 1.0892458, + "epoch": 0.029242016160061564, + "flos": 587840495616.0, + "grad_norm": 0.07022496172976629, + "language_loss": 1.00216019, + "learning_rate": 0.0009948561926585687, + "loss": 1.01321173, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.15905762, + "step": 152, + "time_per_iteration": 2.762035608291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101658, + "balance_loss_mlp": 1.08582664, + "epoch": 0.029434397845325123, + "flos": 551816616960.0, + "grad_norm": 0.08132441134663608, + "language_loss": 1.04400539, + "learning_rate": 0.0009961547248418122, + "loss": 1.05502188, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.15820312, + "step": 153, + "time_per_iteration": 2.6525814533233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092159, + "balance_loss_mlp": 1.07619703, + "epoch": 0.029626779530588686, + "flos": 603221160960.0, + "grad_norm": 0.064379562707883, + "language_loss": 1.01020789, + "learning_rate": 0.0009974447974719707, + "loss": 1.02112949, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.1595459, + "step": 154, + "time_per_iteration": 2.814805746078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011076, + "balance_loss_mlp": 1.09136379, + "epoch": 0.02981916121585225, + "flos": 620808993792.0, + "grad_norm": 0.09363682514066085, + "language_loss": 1.02673674, + "learning_rate": 0.0009987265200589763, + "loss": 1.03781271, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.16235352, + "step": 155, + "time_per_iteration": 2.7394251823425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083912, + "balance_loss_mlp": 1.06821227, + "epoch": 0.030011542901115813, + "flos": 661322962944.0, + "grad_norm": 0.05837038305695058, + "language_loss": 1.02287054, + "learning_rate": 0.001, + "loss": 1.03370976, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.15686035, + "step": 156, + "time_per_iteration": 2.875622272491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091789, + "balance_loss_mlp": 1.07507551, + "epoch": 0.030203924586379376, + "flos": 651258842112.0, + "grad_norm": 0.08525763952586639, + "language_loss": 1.00171304, + "learning_rate": 0.0009999999029413921, + "loss": 1.01263094, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.16723633, + "step": 157, + "time_per_iteration": 2.8360915184020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110493, + "balance_loss_mlp": 1.09382772, + "epoch": 0.03039630627164294, + "flos": 531083091456.0, + "grad_norm": 0.08254544257661527, + "language_loss": 1.01840436, + "learning_rate": 0.0009999996117656068, + "loss": 1.02950931, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.16674805, + "step": 158, + "time_per_iteration": 2.801180124282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096617, + "balance_loss_mlp": 1.08086896, + "epoch": 0.030588687956906502, + "flos": 585914135040.0, + "grad_norm": 0.070993780506174, + "language_loss": 0.95558536, + "learning_rate": 0.0009999991264727564, + "loss": 0.96655154, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.15734863, + "step": 159, + "time_per_iteration": 2.818821668624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096163, + "balance_loss_mlp": 1.08046305, + "epoch": 0.030781069642170065, + "flos": 513026777088.0, + "grad_norm": 0.07077353312716703, + "language_loss": 1.06054807, + "learning_rate": 0.0009999984470630296, + "loss": 1.0715096, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.15686035, + "step": 160, + "time_per_iteration": 2.6040687561035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109657, + "balance_loss_mlp": 1.08113289, + "epoch": 0.030973451327433628, + "flos": 717766064640.0, + "grad_norm": 0.055279151578571405, + "language_loss": 0.94481659, + "learning_rate": 0.0009999975735366902, + "loss": 0.95578229, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.1541748, + "step": 161, + "time_per_iteration": 3.1012368202209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096261, + "balance_loss_mlp": 1.08034658, + "epoch": 0.03116583301269719, + "flos": 1109312133120.0, + "grad_norm": 0.0762466753512266, + "language_loss": 0.96279925, + "learning_rate": 0.0009999965058940775, + "loss": 0.97376186, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.15905762, + "step": 162, + "time_per_iteration": 3.5481724739074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092073, + "balance_loss_mlp": 1.07657552, + "epoch": 0.031358214697960754, + "flos": 450676403712.0, + "grad_norm": 0.0783935068916601, + "language_loss": 1.02822053, + "learning_rate": 0.0009999952441356057, + "loss": 1.03914118, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.15490723, + "step": 163, + "time_per_iteration": 2.513836145401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104121, + "balance_loss_mlp": 1.08844459, + "epoch": 0.031550596383224314, + "flos": 1254701325312.0, + "grad_norm": 0.06003254057509557, + "language_loss": 1.03039443, + "learning_rate": 0.000999993788261765, + "loss": 1.04143572, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.15661621, + "step": 164, + "time_per_iteration": 3.625434398651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097956, + "balance_loss_mlp": 1.08191097, + "epoch": 0.03174297806848788, + "flos": 667841310720.0, + "grad_norm": 0.071706058438464, + "language_loss": 1.04424524, + "learning_rate": 0.00099999213827312, + "loss": 1.0552249, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.16040039, + "step": 165, + "time_per_iteration": 2.7834768295288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111157, + "balance_loss_mlp": 1.09566009, + "epoch": 0.03193535975375144, + "flos": 551033832960.0, + "grad_norm": 0.12829100736108065, + "language_loss": 0.99657446, + "learning_rate": 0.000999990294170312, + "loss": 1.00768602, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.15478516, + "step": 166, + "time_per_iteration": 2.637387752532959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101169, + "balance_loss_mlp": 1.08545709, + "epoch": 0.032127741439015006, + "flos": 543377700864.0, + "grad_norm": 0.06852414366650764, + "language_loss": 1.03638864, + "learning_rate": 0.0009999882559540566, + "loss": 1.04740036, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.15698242, + "step": 167, + "time_per_iteration": 2.6875667572021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098336, + "balance_loss_mlp": 1.0833509, + "epoch": 0.032320123124278566, + "flos": 548104928256.0, + "grad_norm": 0.05076681603646914, + "language_loss": 1.00191641, + "learning_rate": 0.000999986023625145, + "loss": 1.01289976, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.14953613, + "step": 168, + "time_per_iteration": 2.7518744468688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03792956, + "balance_loss_mlp": 3.75500011, + "epoch": 0.03251250480954213, + "flos": 1305156865536.0, + "grad_norm": 0.6529032341502935, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.82717371, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 0.37890625, + "step": 169, + "time_per_iteration": 4.917760133743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167126, + "balance_loss_mlp": 1.15130675, + "epoch": 0.03270488649480569, + "flos": 560866609152.0, + "grad_norm": 0.09865002272530259, + "language_loss": 1.00644767, + "learning_rate": 0.0009999809766328958, + "loss": 1.01811886, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.15808105, + "step": 170, + "time_per_iteration": 2.65771746635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120248, + "balance_loss_mlp": 1.18527782, + "epoch": 0.03289726818006926, + "flos": 482120031744.0, + "grad_norm": 0.08799874436989415, + "language_loss": 1.02774751, + "learning_rate": 0.0009999781619715177, + "loss": 1.03977239, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.17211914, + "step": 171, + "time_per_iteration": 2.562926769256592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122766, + "balance_loss_mlp": 1.21033943, + "epoch": 0.03308964986533282, + "flos": 674355276288.0, + "grad_norm": 0.08542539222295185, + "language_loss": 1.02671802, + "learning_rate": 0.000999975153201402, + "loss": 1.03899455, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.17321777, + "step": 172, + "time_per_iteration": 2.8269002437591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267174, + "balance_loss_mlp": 1.24883962, + "epoch": 0.033282031550596385, + "flos": 608937785856.0, + "grad_norm": 0.120181629337785, + "language_loss": 1.00698161, + "learning_rate": 0.0009999719503237174, + "loss": 1.01965332, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.18347168, + "step": 173, + "time_per_iteration": 2.758136749267578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254087, + "balance_loss_mlp": 1.23402381, + "epoch": 0.033474413235859944, + "flos": 467801547264.0, + "grad_norm": 0.13932237496235436, + "language_loss": 1.08850026, + "learning_rate": 0.0009999685533397073, + "loss": 1.10104108, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.20056152, + "step": 174, + "time_per_iteration": 2.6060163974761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268538, + "balance_loss_mlp": 1.24870133, + "epoch": 0.03366679492112351, + "flos": 579365263872.0, + "grad_norm": 0.0855521850526334, + "language_loss": 1.01282525, + "learning_rate": 0.00099996496225069, + "loss": 1.02551055, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.19824219, + "step": 175, + "time_per_iteration": 2.6688973903656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312845, + "balance_loss_mlp": 1.29124486, + "epoch": 0.03385917660638707, + "flos": 637378315776.0, + "grad_norm": 0.0738431594221532, + "language_loss": 1.03378773, + "learning_rate": 0.0009999611770580604, + "loss": 1.04691625, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.21606445, + "step": 176, + "time_per_iteration": 2.8642566204071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01345291, + "balance_loss_mlp": 1.32329679, + "epoch": 0.03405155829165064, + "flos": 441587123712.0, + "grad_norm": 0.09985791713424727, + "language_loss": 1.02061462, + "learning_rate": 0.0009999571977632876, + "loss": 1.03406763, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.21984863, + "step": 177, + "time_per_iteration": 2.620537757873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0133899, + "balance_loss_mlp": 1.31619775, + "epoch": 0.034243939976914196, + "flos": 466097766912.0, + "grad_norm": 0.09257746092300488, + "language_loss": 1.05255055, + "learning_rate": 0.0009999530243679166, + "loss": 1.06594038, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.2277832, + "step": 178, + "time_per_iteration": 2.5526390075683594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01321119, + "balance_loss_mlp": 1.29928029, + "epoch": 0.03443632166217776, + "flos": 778919016960.0, + "grad_norm": 0.07612740556433409, + "language_loss": 1.00229979, + "learning_rate": 0.0009999486568735675, + "loss": 1.0155108, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.21850586, + "step": 179, + "time_per_iteration": 3.084320068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01314096, + "balance_loss_mlp": 1.29238796, + "epoch": 0.03462870334744132, + "flos": 1263284246016.0, + "grad_norm": 0.08380095909791664, + "language_loss": 1.00181103, + "learning_rate": 0.0009999440952819362, + "loss": 1.01495194, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.21716309, + "step": 180, + "time_per_iteration": 3.6467599868774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288371, + "balance_loss_mlp": 1.26746202, + "epoch": 0.03482108503270489, + "flos": 606899354112.0, + "grad_norm": 0.10452638314540276, + "language_loss": 1.00434995, + "learning_rate": 0.0009999393395947935, + "loss": 1.01723361, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.20935059, + "step": 181, + "time_per_iteration": 2.8092122077941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271899, + "balance_loss_mlp": 1.25226557, + "epoch": 0.03501346671796845, + "flos": 538010284032.0, + "grad_norm": 0.1078936362641923, + "language_loss": 1.03725255, + "learning_rate": 0.0009999343898139858, + "loss": 1.04997146, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.19616699, + "step": 182, + "time_per_iteration": 2.6274633407592773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260451, + "balance_loss_mlp": 1.23960137, + "epoch": 0.035205848403232015, + "flos": 518231250432.0, + "grad_norm": 0.13163794074334914, + "language_loss": 1.02352095, + "learning_rate": 0.0009999292459414348, + "loss": 1.03612542, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.20849609, + "step": 183, + "time_per_iteration": 2.5587446689605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241243, + "balance_loss_mlp": 1.22079897, + "epoch": 0.035398230088495575, + "flos": 472134486528.0, + "grad_norm": 0.11087783412260319, + "language_loss": 1.06915629, + "learning_rate": 0.0009999239079791374, + "loss": 1.08156872, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.2043457, + "step": 184, + "time_per_iteration": 2.5720386505126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264671, + "balance_loss_mlp": 1.24370217, + "epoch": 0.03559061177375914, + "flos": 511820591616.0, + "grad_norm": 0.08935796417892215, + "language_loss": 0.99749458, + "learning_rate": 0.0009999183759291659, + "loss": 1.01014113, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.2097168, + "step": 185, + "time_per_iteration": 2.7049641609191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283684, + "balance_loss_mlp": 1.26222682, + "epoch": 0.0357829934590227, + "flos": 477146903040.0, + "grad_norm": 0.1506087846083958, + "language_loss": 1.02522779, + "learning_rate": 0.0009999126497936682, + "loss": 1.03806448, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.21459961, + "step": 186, + "time_per_iteration": 2.5040838718414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266329, + "balance_loss_mlp": 1.24443007, + "epoch": 0.03597537514428627, + "flos": 644350588416.0, + "grad_norm": 0.07597181242921475, + "language_loss": 1.04941225, + "learning_rate": 0.0009999067295748676, + "loss": 1.0620755, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.21899414, + "step": 187, + "time_per_iteration": 2.8635194301605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01276828, + "balance_loss_mlp": 1.25491714, + "epoch": 0.03616775682954983, + "flos": 580916275200.0, + "grad_norm": 0.10348177684206804, + "language_loss": 1.02588224, + "learning_rate": 0.000999900615275062, + "loss": 1.03865051, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.21911621, + "step": 188, + "time_per_iteration": 2.6797780990600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272596, + "balance_loss_mlp": 1.25078082, + "epoch": 0.03636013851481339, + "flos": 382210735104.0, + "grad_norm": 0.11548780673963775, + "language_loss": 1.08482468, + "learning_rate": 0.0009998943068966256, + "loss": 1.09755063, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.21826172, + "step": 189, + "time_per_iteration": 2.446465253829956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282253, + "balance_loss_mlp": 1.25919747, + "epoch": 0.03655252020007695, + "flos": 582954706944.0, + "grad_norm": 0.10548213053156746, + "language_loss": 1.03159523, + "learning_rate": 0.0009998878044420072, + "loss": 1.04441762, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.23071289, + "step": 190, + "time_per_iteration": 2.715607166290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282371, + "balance_loss_mlp": 1.2598052, + "epoch": 0.03674490188534051, + "flos": 471376433664.0, + "grad_norm": 0.11932481378659279, + "language_loss": 0.98991239, + "learning_rate": 0.0009998811079137318, + "loss": 1.00273609, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.22558594, + "step": 191, + "time_per_iteration": 2.6401267051696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260121, + "balance_loss_mlp": 1.2387228, + "epoch": 0.03693728357060408, + "flos": 528113488896.0, + "grad_norm": 0.10247339740719702, + "language_loss": 1.0056088, + "learning_rate": 0.0009998742173143987, + "loss": 1.01821005, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.21411133, + "step": 192, + "time_per_iteration": 2.6355819702148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261897, + "balance_loss_mlp": 1.24107122, + "epoch": 0.03712966525586764, + "flos": 798657352704.0, + "grad_norm": 0.19022984523402262, + "language_loss": 1.00051641, + "learning_rate": 0.0009998671326466833, + "loss": 1.01313543, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.20837402, + "step": 193, + "time_per_iteration": 3.009938955307007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264185, + "balance_loss_mlp": 1.24278712, + "epoch": 0.037322046941131205, + "flos": 829628116992.0, + "grad_norm": 0.16347382701944235, + "language_loss": 1.01202989, + "learning_rate": 0.0009998598539133362, + "loss": 1.02467179, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.21386719, + "step": 194, + "time_per_iteration": 3.032041311264038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320641, + "balance_loss_mlp": 1.29752648, + "epoch": 0.037514428626394765, + "flos": 437460797952.0, + "grad_norm": 0.09447382654807665, + "language_loss": 1.02349281, + "learning_rate": 0.0009998523811171828, + "loss": 1.0366993, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.23132324, + "step": 195, + "time_per_iteration": 2.5140883922576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01385941, + "balance_loss_mlp": 1.36191988, + "epoch": 0.03770681031165833, + "flos": 511372459008.0, + "grad_norm": 0.174477259749112, + "language_loss": 1.02751505, + "learning_rate": 0.0009998447142611248, + "loss": 1.04137444, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.24047852, + "step": 196, + "time_per_iteration": 2.6540584564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01374932, + "balance_loss_mlp": 1.3512454, + "epoch": 0.03789919199692189, + "flos": 807102061056.0, + "grad_norm": 0.19785353386832685, + "language_loss": 0.95925725, + "learning_rate": 0.0009998368533481387, + "loss": 0.97300661, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.23657227, + "step": 197, + "time_per_iteration": 3.0361931324005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132819, + "balance_loss_mlp": 1.30602896, + "epoch": 0.03809157368218546, + "flos": 690274234368.0, + "grad_norm": 0.07201942870831356, + "language_loss": 0.98943031, + "learning_rate": 0.0009998287983812762, + "loss": 1.00271225, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.22155762, + "step": 198, + "time_per_iteration": 2.8737523555755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316145, + "balance_loss_mlp": 1.2943778, + "epoch": 0.03828395536744902, + "flos": 517675428864.0, + "grad_norm": 0.07974969111573339, + "language_loss": 1.04380584, + "learning_rate": 0.0009998205493636646, + "loss": 1.05696738, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.21789551, + "step": 199, + "time_per_iteration": 2.6439247131347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323551, + "balance_loss_mlp": 1.30098474, + "epoch": 0.038476337052712584, + "flos": 581389138944.0, + "grad_norm": 0.08769997267084173, + "language_loss": 0.97346306, + "learning_rate": 0.0009998121062985063, + "loss": 0.98669851, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.22583008, + "step": 200, + "time_per_iteration": 2.738266944885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01342622, + "balance_loss_mlp": 1.3199718, + "epoch": 0.03866871873797614, + "flos": 576791359488.0, + "grad_norm": 0.1288031319123161, + "language_loss": 0.99576765, + "learning_rate": 0.0009998034691890794, + "loss": 1.0091939, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.2265625, + "step": 201, + "time_per_iteration": 2.815068244934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01322045, + "balance_loss_mlp": 1.29940701, + "epoch": 0.03886110042323971, + "flos": 540472117248.0, + "grad_norm": 0.1480539814519598, + "language_loss": 1.04135096, + "learning_rate": 0.0009997946380387369, + "loss": 1.05457139, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.22619629, + "step": 202, + "time_per_iteration": 2.6735482215881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272248, + "balance_loss_mlp": 1.24913371, + "epoch": 0.03905348210850327, + "flos": 717694843392.0, + "grad_norm": 0.10058314649993264, + "language_loss": 1.06271195, + "learning_rate": 0.0009997856128509076, + "loss": 1.07543445, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.23132324, + "step": 203, + "time_per_iteration": 2.858497142791748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238452, + "balance_loss_mlp": 1.21574211, + "epoch": 0.039245863793766836, + "flos": 427268639232.0, + "grad_norm": 0.07713628959924962, + "language_loss": 1.01241136, + "learning_rate": 0.0009997763936290952, + "loss": 1.02479577, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.22705078, + "step": 204, + "time_per_iteration": 2.5389275550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254542, + "balance_loss_mlp": 1.22998452, + "epoch": 0.039438245479030395, + "flos": 662804163072.0, + "grad_norm": 0.10588145989282294, + "language_loss": 1.06408, + "learning_rate": 0.0009997669803768789, + "loss": 1.07662535, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.24560547, + "step": 205, + "time_per_iteration": 2.8234684467315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249653, + "balance_loss_mlp": 1.2262044, + "epoch": 0.03963062716429396, + "flos": 635063459328.0, + "grad_norm": 0.1260931618436919, + "language_loss": 1.01299226, + "learning_rate": 0.0009997573730979134, + "loss": 1.02548885, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.23461914, + "step": 206, + "time_per_iteration": 2.7586512565612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03194186, + "balance_loss_mlp": 2.85391545, + "epoch": 0.03982300884955752, + "flos": 1417813286400.0, + "grad_norm": 0.3208039945146043, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.82387388, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 3.40625, + "step": 207, + "time_per_iteration": 4.668841123580933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287285, + "balance_loss_mlp": 1.26177394, + "epoch": 0.04001539053482109, + "flos": 688769713152.0, + "grad_norm": 0.15196225676568717, + "language_loss": 1.00590456, + "learning_rate": 0.0009997375764747294, + "loss": 1.01877737, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.25512695, + "step": 208, + "time_per_iteration": 3.0460121631622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275833, + "balance_loss_mlp": 1.25076318, + "epoch": 0.04020777222008465, + "flos": 533363042304.0, + "grad_norm": 0.09666220749273949, + "language_loss": 0.97800297, + "learning_rate": 0.0009997273871381967, + "loss": 0.99076128, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.25085449, + "step": 209, + "time_per_iteration": 2.7027134895324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126115, + "balance_loss_mlp": 1.23683095, + "epoch": 0.040400153905348214, + "flos": 567661381632.0, + "grad_norm": 0.09901686865787228, + "language_loss": 1.02878523, + "learning_rate": 0.0009997170037902862, + "loss": 1.04139662, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.2434082, + "step": 210, + "time_per_iteration": 2.7203080654144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228259, + "balance_loss_mlp": 1.20371389, + "epoch": 0.040592535590611774, + "flos": 713130559488.0, + "grad_norm": 0.11653422944125434, + "language_loss": 1.0505805, + "learning_rate": 0.0009997064264350292, + "loss": 1.06286311, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.24536133, + "step": 211, + "time_per_iteration": 2.8774335384368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239303, + "balance_loss_mlp": 1.21149194, + "epoch": 0.04078491727587533, + "flos": 577824427008.0, + "grad_norm": 0.06455145782580095, + "language_loss": 0.99545413, + "learning_rate": 0.0009996956550765317, + "loss": 1.00784707, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.27770996, + "step": 212, + "time_per_iteration": 2.6957452297210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222017, + "balance_loss_mlp": 1.19556475, + "epoch": 0.0409772989611389, + "flos": 552033404928.0, + "grad_norm": 0.1270361519775568, + "language_loss": 0.94278163, + "learning_rate": 0.0009996846897189762, + "loss": 0.95500183, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.26452637, + "step": 213, + "time_per_iteration": 2.6380836963653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223712, + "balance_loss_mlp": 1.19798708, + "epoch": 0.04116968064640246, + "flos": 555347833344.0, + "grad_norm": 0.1000627367739684, + "language_loss": 1.00583601, + "learning_rate": 0.0009996735303666193, + "loss": 1.01807308, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.25720215, + "step": 214, + "time_per_iteration": 2.7703840732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205703, + "balance_loss_mlp": 1.18167019, + "epoch": 0.041362062331666026, + "flos": 578204158464.0, + "grad_norm": 0.10044224354438386, + "language_loss": 1.02544665, + "learning_rate": 0.0009996621770237937, + "loss": 1.0375036, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.24035645, + "step": 215, + "time_per_iteration": 2.747954845428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194773, + "balance_loss_mlp": 1.17049026, + "epoch": 0.041554444016929586, + "flos": 611130396672.0, + "grad_norm": 0.07439915791739656, + "language_loss": 0.98184484, + "learning_rate": 0.0009996506296949073, + "loss": 0.99379259, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.24267578, + "step": 216, + "time_per_iteration": 2.957000732421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178169, + "balance_loss_mlp": 1.15088165, + "epoch": 0.04174682570219315, + "flos": 527857413120.0, + "grad_norm": 0.07228572223559625, + "language_loss": 0.98363817, + "learning_rate": 0.0009996388883844428, + "loss": 0.99541986, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.27294922, + "step": 217, + "time_per_iteration": 2.625004529953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163207, + "balance_loss_mlp": 1.13855505, + "epoch": 0.04193920738745671, + "flos": 511258977792.0, + "grad_norm": 0.0709878545566638, + "language_loss": 1.02471972, + "learning_rate": 0.0009996269530969588, + "loss": 1.0363518, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.24645996, + "step": 218, + "time_per_iteration": 2.577202796936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153651, + "balance_loss_mlp": 1.13201451, + "epoch": 0.04213158907272028, + "flos": 571226093568.0, + "grad_norm": 0.081462998095588, + "language_loss": 1.00934064, + "learning_rate": 0.0009996148238370888, + "loss": 1.02087712, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.21655273, + "step": 219, + "time_per_iteration": 2.75849986076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128401, + "balance_loss_mlp": 1.10447621, + "epoch": 0.04232397075798384, + "flos": 963803667456.0, + "grad_norm": 0.08476688765369866, + "language_loss": 0.96862441, + "learning_rate": 0.0009996025006095421, + "loss": 0.97990847, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.23962402, + "step": 220, + "time_per_iteration": 3.316199541091919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03366003, + "balance_loss_mlp": 3.11881113, + "epoch": 0.042516352443247404, + "flos": 1468814777856.0, + "grad_norm": 0.3512460928075295, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.81149149, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 2.46875, + "step": 221, + "time_per_iteration": 5.585368633270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113685, + "balance_loss_mlp": 1.11290038, + "epoch": 0.042708734128510964, + "flos": 654419091456.0, + "grad_norm": 0.07993960649684186, + "language_loss": 0.97486591, + "learning_rate": 0.0009995772722706307, + "loss": 0.98623443, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.23950195, + "step": 222, + "time_per_iteration": 2.8408098220825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182736, + "balance_loss_mlp": 1.15682042, + "epoch": 0.04290111581377453, + "flos": 431601578496.0, + "grad_norm": 0.11511868264512252, + "language_loss": 1.11370254, + "learning_rate": 0.0009995643671690604, + "loss": 1.12553, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.25927734, + "step": 223, + "time_per_iteration": 2.4770917892456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194194, + "balance_loss_mlp": 1.16939855, + "epoch": 0.04309349749903809, + "flos": 644379701760.0, + "grad_norm": 0.13725027562770867, + "language_loss": 0.98326594, + "learning_rate": 0.0009995512681194023, + "loss": 0.99520785, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.24804688, + "step": 224, + "time_per_iteration": 2.901346445083618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011754, + "balance_loss_mlp": 1.14950812, + "epoch": 0.04328587918430166, + "flos": 830861853696.0, + "grad_norm": 0.06929706927237234, + "language_loss": 0.96731412, + "learning_rate": 0.0009995379751267417, + "loss": 0.97906816, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.25891113, + "step": 225, + "time_per_iteration": 3.238084316253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170568, + "balance_loss_mlp": 1.14375746, + "epoch": 0.043478260869565216, + "flos": 524804852736.0, + "grad_norm": 0.07435013646684872, + "language_loss": 0.98210657, + "learning_rate": 0.0009995244881962398, + "loss": 0.99381226, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.26843262, + "step": 226, + "time_per_iteration": 2.6193530559539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162667, + "balance_loss_mlp": 1.1352731, + "epoch": 0.04367064255482878, + "flos": 439253328384.0, + "grad_norm": 0.08505882003862496, + "language_loss": 0.98532695, + "learning_rate": 0.0009995108073331323, + "loss": 0.99695361, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.27416992, + "step": 227, + "time_per_iteration": 2.621875524520874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167442, + "balance_loss_mlp": 1.13921285, + "epoch": 0.04386302424009234, + "flos": 507109330944.0, + "grad_norm": 0.06754882710561792, + "language_loss": 1.01820612, + "learning_rate": 0.0009994969325427309, + "loss": 1.02988064, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.28222656, + "step": 228, + "time_per_iteration": 2.6876742839813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182366, + "balance_loss_mlp": 1.1523968, + "epoch": 0.04405540592535591, + "flos": 540432829440.0, + "grad_norm": 0.06680156886068128, + "language_loss": 0.97377843, + "learning_rate": 0.0009994828638304218, + "loss": 0.98560202, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.29980469, + "step": 229, + "time_per_iteration": 2.6631240844726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198543, + "balance_loss_mlp": 1.16969442, + "epoch": 0.04424778761061947, + "flos": 446136850944.0, + "grad_norm": 0.08411507650901279, + "language_loss": 1.03665459, + "learning_rate": 0.0009994686012016675, + "loss": 1.04864001, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.28833008, + "step": 230, + "time_per_iteration": 2.499721050262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122651, + "balance_loss_mlp": 1.19675517, + "epoch": 0.044440169295883035, + "flos": 700383435264.0, + "grad_norm": 0.09876086989002084, + "language_loss": 1.02814984, + "learning_rate": 0.000999454144662005, + "loss": 1.04041505, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.29711914, + "step": 231, + "time_per_iteration": 2.911175489425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224486, + "balance_loss_mlp": 1.19466019, + "epoch": 0.044632550981146595, + "flos": 588055873536.0, + "grad_norm": 0.10057378611284366, + "language_loss": 0.96611959, + "learning_rate": 0.0009994394942170468, + "loss": 0.97836453, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.2980957, + "step": 232, + "time_per_iteration": 2.7470107078552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012208, + "balance_loss_mlp": 1.19083118, + "epoch": 0.04482493266641016, + "flos": 554534525952.0, + "grad_norm": 0.06893435559553937, + "language_loss": 0.94648588, + "learning_rate": 0.0009994246498724808, + "loss": 0.95869386, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.29956055, + "step": 233, + "time_per_iteration": 2.7436845302581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206879, + "balance_loss_mlp": 1.17860246, + "epoch": 0.04501731435167372, + "flos": 722500646400.0, + "grad_norm": 0.08371813790363081, + "language_loss": 0.97381985, + "learning_rate": 0.00099940961163407, + "loss": 0.9858886, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.28295898, + "step": 234, + "time_per_iteration": 2.876405715942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119556, + "balance_loss_mlp": 1.16654444, + "epoch": 0.04520969603693728, + "flos": 511539784704.0, + "grad_norm": 0.08201306351282911, + "language_loss": 1.00061524, + "learning_rate": 0.0009993943795076528, + "loss": 1.01257086, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.2902832, + "step": 235, + "time_per_iteration": 2.6432723999023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168701, + "balance_loss_mlp": 1.13873136, + "epoch": 0.04540207772220085, + "flos": 364854246912.0, + "grad_norm": 0.12052684551098608, + "language_loss": 1.01575673, + "learning_rate": 0.0009993789534991427, + "loss": 1.02744377, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.29907227, + "step": 236, + "time_per_iteration": 2.4240100383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136514, + "balance_loss_mlp": 1.10954857, + "epoch": 0.045594459407464406, + "flos": 522407038464.0, + "grad_norm": 0.0561052231541492, + "language_loss": 0.96778214, + "learning_rate": 0.0009993633336145287, + "loss": 0.97914726, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.26977539, + "step": 237, + "time_per_iteration": 2.638850688934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130626, + "balance_loss_mlp": 1.10363674, + "epoch": 0.04578684109272797, + "flos": 671442338304.0, + "grad_norm": 0.06334524880145487, + "language_loss": 1.0125159, + "learning_rate": 0.0009993475198598752, + "loss": 1.02382219, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.26989746, + "step": 238, + "time_per_iteration": 3.0008814334869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109235, + "balance_loss_mlp": 1.08395052, + "epoch": 0.04597922277799153, + "flos": 541387321344.0, + "grad_norm": 0.08922144233736891, + "language_loss": 0.97379184, + "learning_rate": 0.0009993315122413212, + "loss": 0.98488414, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.25305176, + "step": 239, + "time_per_iteration": 2.620474100112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121059, + "balance_loss_mlp": 1.09551263, + "epoch": 0.0461716044632551, + "flos": 458732616192.0, + "grad_norm": 0.09980166654849132, + "language_loss": 0.97848725, + "learning_rate": 0.0009993153107650818, + "loss": 0.98969781, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.25537109, + "step": 240, + "time_per_iteration": 2.5547702312469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111352, + "balance_loss_mlp": 1.08719897, + "epoch": 0.04636398614851866, + "flos": 455009342976.0, + "grad_norm": 0.09180653876933564, + "language_loss": 0.96700346, + "learning_rate": 0.0009992989154374468, + "loss": 0.97813869, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.2635498, + "step": 241, + "time_per_iteration": 2.5366051197052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105107, + "balance_loss_mlp": 1.07833242, + "epoch": 0.046556367833782225, + "flos": 556558401024.0, + "grad_norm": 0.07962621760937992, + "language_loss": 1.03585958, + "learning_rate": 0.0009992823262647817, + "loss": 1.04691052, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26782227, + "step": 242, + "time_per_iteration": 2.726482391357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100596, + "balance_loss_mlp": 1.07384586, + "epoch": 0.046748749519045785, + "flos": 592625949696.0, + "grad_norm": 0.0814561151731407, + "language_loss": 0.97787237, + "learning_rate": 0.0009992655432535264, + "loss": 0.98887837, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.26782227, + "step": 243, + "time_per_iteration": 2.765273332595825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098204, + "balance_loss_mlp": 1.07214487, + "epoch": 0.04694113120430935, + "flos": 569596506624.0, + "grad_norm": 0.0750228199707575, + "language_loss": 0.98452473, + "learning_rate": 0.0009992485664101973, + "loss": 0.99550676, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.26037598, + "step": 244, + "time_per_iteration": 2.696781635284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_mlp": 1.08732188, + "epoch": 0.04713351288957291, + "flos": 863401158144.0, + "grad_norm": 0.08629455000399752, + "language_loss": 1.00806224, + "learning_rate": 0.000999231395741385, + "loss": 1.01922584, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.2902832, + "step": 245, + "time_per_iteration": 3.1403207778930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117221, + "balance_loss_mlp": 1.08958876, + "epoch": 0.04732589457483648, + "flos": 536961249792.0, + "grad_norm": 0.07729478564770192, + "language_loss": 0.986202, + "learning_rate": 0.0009992140312537557, + "loss": 0.99737418, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.27661133, + "step": 246, + "time_per_iteration": 2.7038867473602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111137, + "balance_loss_mlp": 1.08410013, + "epoch": 0.04751827626010004, + "flos": 761566910976.0, + "grad_norm": 0.08592122791377885, + "language_loss": 0.93525487, + "learning_rate": 0.000999196472954051, + "loss": 0.94636625, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.27050781, + "step": 247, + "time_per_iteration": 2.9575722217559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0471772, + "balance_loss_mlp": 4.51020002, + "epoch": 0.0477106579453636, + "flos": 1578961313280.0, + "grad_norm": 0.4683520251238934, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.84142572, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 2.078125, + "step": 248, + "time_per_iteration": 5.452638387680054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200681, + "balance_loss_mlp": 1.17252362, + "epoch": 0.04790303963062716, + "flos": 457535195136.0, + "grad_norm": 0.13106789232715058, + "language_loss": 1.01118052, + "learning_rate": 0.0009991607749457578, + "loss": 1.02318728, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.28173828, + "step": 249, + "time_per_iteration": 2.5066423416137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256525, + "balance_loss_mlp": 1.22541094, + "epoch": 0.04809542131589073, + "flos": 782079266304.0, + "grad_norm": 0.1327983626735717, + "language_loss": 0.98959935, + "learning_rate": 0.0009991426352510286, + "loss": 1.0021646, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.31103516, + "step": 250, + "time_per_iteration": 3.0130999088287354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250303, + "balance_loss_mlp": 1.22023845, + "epoch": 0.04828780300115429, + "flos": 558995503104.0, + "grad_norm": 0.11435576550904086, + "language_loss": 1.00191545, + "learning_rate": 0.0009991243017719422, + "loss": 1.01441836, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.30053711, + "step": 251, + "time_per_iteration": 2.6584134101867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190864, + "balance_loss_mlp": 1.16108572, + "epoch": 0.048480184686417856, + "flos": 501682277376.0, + "grad_norm": 0.08343855539664048, + "language_loss": 0.94829702, + "learning_rate": 0.0009991057745156165, + "loss": 0.96020567, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.29760742, + "step": 252, + "time_per_iteration": 2.6125926971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03043524, + "balance_loss_mlp": 2.97905564, + "epoch": 0.048672566371681415, + "flos": 1535585430528.0, + "grad_norm": 0.48807257564671885, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.84954512, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 0.64453125, + "step": 253, + "time_per_iteration": 5.0318169593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205448, + "balance_loss_mlp": 1.17426276, + "epoch": 0.04886494805694498, + "flos": 537665458176.0, + "grad_norm": 0.15081419889398517, + "language_loss": 1.02692831, + "learning_rate": 0.0009990681387000943, + "loss": 1.03898275, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.31152344, + "step": 254, + "time_per_iteration": 2.7569847106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231579, + "balance_loss_mlp": 1.20053661, + "epoch": 0.04905732974220854, + "flos": 679841966592.0, + "grad_norm": 0.10308088004196624, + "language_loss": 0.98562324, + "learning_rate": 0.0009990490301555093, + "loss": 0.99793905, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.31054688, + "step": 255, + "time_per_iteration": 2.9826152324676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01973911, + "balance_loss_mlp": 1.89609146, + "epoch": 0.04924971142747211, + "flos": 1420408949760.0, + "grad_norm": 0.14603633134579833, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.8118906, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.77734375, + "step": 256, + "time_per_iteration": 4.873262643814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01994546, + "balance_loss_mlp": 1.91596293, + "epoch": 0.04944209311273567, + "flos": 1557202074624.0, + "grad_norm": 0.1290240934598827, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.81237286, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.78515625, + "step": 257, + "time_per_iteration": 4.981585502624512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01945028, + "balance_loss_mlp": 1.87979627, + "epoch": 0.04963447479799923, + "flos": 1569985514496.0, + "grad_norm": 0.10634084131038181, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71920907, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.65234375, + "step": 258, + "time_per_iteration": 4.869063138961792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231874, + "balance_loss_mlp": 1.20192897, + "epoch": 0.049826856483262794, + "flos": 625063357440.0, + "grad_norm": 0.1721871775998346, + "language_loss": 0.93400717, + "learning_rate": 0.0009989706585723202, + "loss": 0.9463259, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.29956055, + "step": 259, + "time_per_iteration": 2.828618049621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226271, + "balance_loss_mlp": 1.1963017, + "epoch": 0.05001923816852635, + "flos": 503912765952.0, + "grad_norm": 0.13941406884376095, + "language_loss": 0.9926306, + "learning_rate": 0.0009989505813633442, + "loss": 1.0048933, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.29931641, + "step": 260, + "time_per_iteration": 2.7033097743988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167993, + "balance_loss_mlp": 1.13833416, + "epoch": 0.05021161985378992, + "flos": 587066476032.0, + "grad_norm": 0.078052738900574, + "language_loss": 0.99695522, + "learning_rate": 0.000998930310444573, + "loss": 1.00863528, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.29663086, + "step": 261, + "time_per_iteration": 2.739182949066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120344, + "balance_loss_mlp": 1.09104276, + "epoch": 0.05040400153905348, + "flos": 633029409792.0, + "grad_norm": 0.10502347912179442, + "language_loss": 0.97120214, + "learning_rate": 0.0009989098458238765, + "loss": 0.98240554, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.29296875, + "step": 262, + "time_per_iteration": 2.81984806060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109888, + "balance_loss_mlp": 1.07910872, + "epoch": 0.050596383224317046, + "flos": 553344307200.0, + "grad_norm": 0.1022419163820973, + "language_loss": 0.96531391, + "learning_rate": 0.0009988891875091998, + "loss": 0.97641277, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.30761719, + "step": 263, + "time_per_iteration": 2.816471576690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119537, + "balance_loss_mlp": 1.08949661, + "epoch": 0.050788764909580605, + "flos": 549389689344.0, + "grad_norm": 0.07930699495869925, + "language_loss": 0.91512978, + "learning_rate": 0.0009988683355085636, + "loss": 0.92632508, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.30004883, + "step": 264, + "time_per_iteration": 2.7963876724243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116935, + "balance_loss_mlp": 1.1386174, + "epoch": 0.05098114659484417, + "flos": 604812870144.0, + "grad_norm": 0.1164382368145933, + "language_loss": 1.00062299, + "learning_rate": 0.000998847289830063, + "loss": 1.01231647, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.30688477, + "step": 265, + "time_per_iteration": 2.8219666481018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180582, + "balance_loss_mlp": 1.14922965, + "epoch": 0.05117352828010773, + "flos": 438317775360.0, + "grad_norm": 0.14769195776656788, + "language_loss": 0.92838919, + "learning_rate": 0.0009988260504818682, + "loss": 0.94019508, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.31323242, + "step": 266, + "time_per_iteration": 2.527987480163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159175, + "balance_loss_mlp": 1.12753642, + "epoch": 0.0513659099653713, + "flos": 504784300032.0, + "grad_norm": 0.1223822648996979, + "language_loss": 0.99088645, + "learning_rate": 0.000998804617472226, + "loss": 1.00247824, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.31616211, + "step": 267, + "time_per_iteration": 2.6469640731811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129085, + "balance_loss_mlp": 1.09735131, + "epoch": 0.05155829165063486, + "flos": 695183344128.0, + "grad_norm": 0.09065118463065669, + "language_loss": 0.94319087, + "learning_rate": 0.0009987829908094568, + "loss": 0.95448172, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.31713867, + "step": 268, + "time_per_iteration": 2.821777105331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131208, + "balance_loss_mlp": 1.10014248, + "epoch": 0.051750673335898424, + "flos": 1347751830528.0, + "grad_norm": 0.11182301329739544, + "language_loss": 1.00247467, + "learning_rate": 0.0009987611705019569, + "loss": 1.01378679, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.31030273, + "step": 269, + "time_per_iteration": 4.288902521133423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117374, + "balance_loss_mlp": 1.08509207, + "epoch": 0.051943055021161984, + "flos": 489362936832.0, + "grad_norm": 0.06856601771993416, + "language_loss": 0.99786204, + "learning_rate": 0.0009987391565581978, + "loss": 1.00903583, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.32275391, + "step": 270, + "time_per_iteration": 2.634683132171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119573, + "balance_loss_mlp": 1.08681393, + "epoch": 0.05213543670642555, + "flos": 545504882688.0, + "grad_norm": 0.08930504281721281, + "language_loss": 0.92515171, + "learning_rate": 0.000998716948986726, + "loss": 0.93634748, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.32763672, + "step": 271, + "time_per_iteration": 2.7899389266967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120606, + "balance_loss_mlp": 1.08970654, + "epoch": 0.05232781839168911, + "flos": 603285179904.0, + "grad_norm": 0.10701715244821809, + "language_loss": 0.94677854, + "learning_rate": 0.0009986945477961633, + "loss": 0.95798463, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.30859375, + "step": 272, + "time_per_iteration": 2.703118324279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108581, + "balance_loss_mlp": 1.07789683, + "epoch": 0.052520200076952676, + "flos": 538218307584.0, + "grad_norm": 0.050944004487463904, + "language_loss": 1.00078344, + "learning_rate": 0.0009986719529952066, + "loss": 1.01186931, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.30639648, + "step": 273, + "time_per_iteration": 2.85548734664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097085, + "balance_loss_mlp": 1.06668699, + "epoch": 0.052712581762216236, + "flos": 463148513280.0, + "grad_norm": 0.06235958359183371, + "language_loss": 0.99016273, + "learning_rate": 0.000998649164592628, + "loss": 1.00113368, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.3034668, + "step": 274, + "time_per_iteration": 2.5841050148010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104755, + "balance_loss_mlp": 1.07507145, + "epoch": 0.0529049634474798, + "flos": 547749927936.0, + "grad_norm": 0.10062534885586208, + "language_loss": 0.96764064, + "learning_rate": 0.0009986261825972748, + "loss": 0.97868812, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.29663086, + "step": 275, + "time_per_iteration": 2.6752514839172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107504, + "balance_loss_mlp": 1.07798743, + "epoch": 0.05309734513274336, + "flos": 617727320064.0, + "grad_norm": 0.08071716286169645, + "language_loss": 0.98941195, + "learning_rate": 0.000998603007018069, + "loss": 1.00048697, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.29541016, + "step": 276, + "time_per_iteration": 2.8236005306243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118213, + "balance_loss_mlp": 1.08767152, + "epoch": 0.05328972681800693, + "flos": 605220304896.0, + "grad_norm": 0.07622563991542974, + "language_loss": 0.96909779, + "learning_rate": 0.0009985796378640089, + "loss": 0.98027998, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.30517578, + "step": 277, + "time_per_iteration": 2.7089598178863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110773, + "balance_loss_mlp": 1.07940567, + "epoch": 0.05348210850327049, + "flos": 604197411840.0, + "grad_norm": 0.07841820465234402, + "language_loss": 0.95740211, + "learning_rate": 0.0009985560751441665, + "loss": 0.96847939, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.28320312, + "step": 278, + "time_per_iteration": 2.834015369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108783, + "balance_loss_mlp": 1.07831299, + "epoch": 0.053674490188534055, + "flos": 630480236544.0, + "grad_norm": 0.07361828218816212, + "language_loss": 0.9799974, + "learning_rate": 0.00099853231886769, + "loss": 0.99108523, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.30444336, + "step": 279, + "time_per_iteration": 2.8200764656066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108628, + "balance_loss_mlp": 1.07937431, + "epoch": 0.053866871873797614, + "flos": 478939433472.0, + "grad_norm": 0.07512382427920342, + "language_loss": 0.98746061, + "learning_rate": 0.0009985083690438024, + "loss": 0.99854696, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.29223633, + "step": 280, + "time_per_iteration": 2.75639271736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113716, + "balance_loss_mlp": 1.08310306, + "epoch": 0.054059253559061174, + "flos": 787673645568.0, + "grad_norm": 0.09326847112688041, + "language_loss": 0.89231437, + "learning_rate": 0.0009984842256818016, + "loss": 0.90345156, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.3059082, + "step": 281, + "time_per_iteration": 3.0839526653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121888, + "balance_loss_mlp": 1.09182298, + "epoch": 0.05425163524432474, + "flos": 628076630016.0, + "grad_norm": 0.062071298051891176, + "language_loss": 0.99695373, + "learning_rate": 0.0009984598887910613, + "loss": 1.00817263, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.30029297, + "step": 282, + "time_per_iteration": 2.7197024822235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123523, + "balance_loss_mlp": 1.09283888, + "epoch": 0.0544440169295883, + "flos": 615453161472.0, + "grad_norm": 0.08448232068887077, + "language_loss": 0.95169044, + "learning_rate": 0.0009984353583810297, + "loss": 0.96292561, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.30664062, + "step": 283, + "time_per_iteration": 2.8440537452697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127605, + "balance_loss_mlp": 1.09811282, + "epoch": 0.05463639861485187, + "flos": 647471549952.0, + "grad_norm": 0.07597313108733957, + "language_loss": 0.97190034, + "learning_rate": 0.0009984106344612302, + "loss": 0.98317641, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.29492188, + "step": 284, + "time_per_iteration": 2.7592926025390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139329, + "balance_loss_mlp": 1.10843039, + "epoch": 0.054828780300115426, + "flos": 796845883392.0, + "grad_norm": 0.08116128158624439, + "language_loss": 0.93187618, + "learning_rate": 0.0009983857170412615, + "loss": 0.94326949, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.30859375, + "step": 285, + "time_per_iteration": 2.99845027923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151704, + "balance_loss_mlp": 1.12080526, + "epoch": 0.05502116198537899, + "flos": 549414420480.0, + "grad_norm": 0.07339397608587311, + "language_loss": 0.92728812, + "learning_rate": 0.000998360606130798, + "loss": 0.93880516, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.30859375, + "step": 286, + "time_per_iteration": 2.835510492324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.020519, + "balance_loss_mlp": 2.03492451, + "epoch": 0.05521354367064255, + "flos": 1406967791616.0, + "grad_norm": 0.132236598943482, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71125019, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.16992188, + "step": 287, + "time_per_iteration": 4.860529184341431 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144586, + "balance_loss_mlp": 1.11304367, + "epoch": 0.05540592535590612, + "flos": 645123197952.0, + "grad_norm": 0.09086643312306038, + "language_loss": 0.98494267, + "learning_rate": 0.0009983098038774552, + "loss": 0.99638855, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.31518555, + "step": 288, + "time_per_iteration": 2.7743642330169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0188948, + "balance_loss_mlp": 1.87336278, + "epoch": 0.05559830704116968, + "flos": 1510293413376.0, + "grad_norm": 0.09551417356683237, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.80059707, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.16113281, + "step": 289, + "time_per_iteration": 4.792251348495483 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132847, + "balance_loss_mlp": 1.10242462, + "epoch": 0.055790688726433245, + "flos": 508078379520.0, + "grad_norm": 0.0647793178171594, + "language_loss": 0.95675349, + "learning_rate": 0.0009982582277800948, + "loss": 0.96808195, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.30371094, + "step": 290, + "time_per_iteration": 2.6280908584594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129981, + "balance_loss_mlp": 1.09931993, + "epoch": 0.055983070411696804, + "flos": 657570576384.0, + "grad_norm": 0.06216394577533418, + "language_loss": 1.02967191, + "learning_rate": 0.0009982321495648908, + "loss": 1.04097176, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.30639648, + "step": 291, + "time_per_iteration": 2.823817491531372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152465, + "balance_loss_mlp": 1.11880052, + "epoch": 0.05617545209696037, + "flos": 587051919360.0, + "grad_norm": 0.0720353654192766, + "language_loss": 0.94905466, + "learning_rate": 0.0009982058779188115, + "loss": 0.96057928, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.33666992, + "step": 292, + "time_per_iteration": 2.716226577758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143466, + "balance_loss_mlp": 1.11175609, + "epoch": 0.05636783378222393, + "flos": 611331217920.0, + "grad_norm": 0.0752196942414692, + "language_loss": 1.02053797, + "learning_rate": 0.0009981794128520567, + "loss": 1.03197265, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.31689453, + "step": 293, + "time_per_iteration": 2.80366587638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140969, + "balance_loss_mlp": 1.10878265, + "epoch": 0.0565602154674875, + "flos": 667847102976.0, + "grad_norm": 0.08694547176554791, + "language_loss": 0.9927811, + "learning_rate": 0.000998152754374901, + "loss": 1.0041908, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.32202148, + "step": 294, + "time_per_iteration": 2.8787214756011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125522, + "balance_loss_mlp": 1.09493268, + "epoch": 0.05675259715275106, + "flos": 616963474944.0, + "grad_norm": 0.06320951422559969, + "language_loss": 0.95261526, + "learning_rate": 0.0009981259024976943, + "loss": 0.96387053, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.30566406, + "step": 295, + "time_per_iteration": 2.709763765335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130922, + "balance_loss_mlp": 1.1013341, + "epoch": 0.05694497883801462, + "flos": 751424214528.0, + "grad_norm": 0.09363516749561916, + "language_loss": 0.92460728, + "learning_rate": 0.0009980988572308612, + "loss": 0.93591654, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.2956543, + "step": 296, + "time_per_iteration": 2.975036859512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106731, + "balance_loss_mlp": 1.07781124, + "epoch": 0.05713736052327818, + "flos": 711669708288.0, + "grad_norm": 0.09684297288520326, + "language_loss": 0.95852935, + "learning_rate": 0.0009980716185849015, + "loss": 0.96959662, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.28881836, + "step": 297, + "time_per_iteration": 2.9913201332092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121697, + "balance_loss_mlp": 1.09196591, + "epoch": 0.05732974220854175, + "flos": 468737100288.0, + "grad_norm": 0.06404931541311756, + "language_loss": 0.92133576, + "learning_rate": 0.0009980441865703904, + "loss": 0.9325527, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.29711914, + "step": 298, + "time_per_iteration": 2.660911798477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118174, + "balance_loss_mlp": 1.08896804, + "epoch": 0.05752212389380531, + "flos": 601143441408.0, + "grad_norm": 0.07725734784298466, + "language_loss": 1.00405884, + "learning_rate": 0.000998016561197978, + "loss": 1.01524067, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.29150391, + "step": 299, + "time_per_iteration": 2.7028987407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115684, + "balance_loss_mlp": 1.0875026, + "epoch": 0.057714505579068875, + "flos": 678344799744.0, + "grad_norm": 0.0924919324941274, + "language_loss": 0.92369866, + "learning_rate": 0.0009979887424783895, + "loss": 0.93485552, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.28173828, + "step": 300, + "time_per_iteration": 2.920323610305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121145, + "balance_loss_mlp": 1.09222448, + "epoch": 0.057906887264332435, + "flos": 595604316672.0, + "grad_norm": 0.08285851214595771, + "language_loss": 0.91748977, + "learning_rate": 0.0009979607304224248, + "loss": 0.92870122, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.2890625, + "step": 301, + "time_per_iteration": 2.725109815597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124265, + "balance_loss_mlp": 1.09512997, + "epoch": 0.058099268949596, + "flos": 551855904768.0, + "grad_norm": 0.08389393001078431, + "language_loss": 0.98122084, + "learning_rate": 0.000997932525040959, + "loss": 0.99246347, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.29101562, + "step": 302, + "time_per_iteration": 2.6472513675689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102086, + "balance_loss_mlp": 1.07419097, + "epoch": 0.05829165063485956, + "flos": 507906671616.0, + "grad_norm": 0.09664842170862178, + "language_loss": 1.00482607, + "learning_rate": 0.000997904126344943, + "loss": 1.01584697, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.27880859, + "step": 303, + "time_per_iteration": 2.6413466930389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108073, + "balance_loss_mlp": 1.07920086, + "epoch": 0.05848403232012313, + "flos": 614949774336.0, + "grad_norm": 0.07742483031734765, + "language_loss": 0.96304786, + "learning_rate": 0.0009978755343454018, + "loss": 0.9741286, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.28881836, + "step": 304, + "time_per_iteration": 2.7825212478637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108843, + "balance_loss_mlp": 1.0789448, + "epoch": 0.05867641400538669, + "flos": 499835902464.0, + "grad_norm": 0.09214287188489759, + "language_loss": 0.97051907, + "learning_rate": 0.0009978467490534355, + "loss": 0.98160744, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.29858398, + "step": 305, + "time_per_iteration": 2.625734806060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105646, + "balance_loss_mlp": 1.0759151, + "epoch": 0.05886879569065025, + "flos": 531019072512.0, + "grad_norm": 0.07804737007565601, + "language_loss": 0.94819117, + "learning_rate": 0.00099781777048022, + "loss": 0.95924759, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.296875, + "step": 306, + "time_per_iteration": 2.71183180809021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095659, + "balance_loss_mlp": 1.06554723, + "epoch": 0.05906117737591381, + "flos": 488811497472.0, + "grad_norm": 0.08882969665455022, + "language_loss": 0.96051329, + "learning_rate": 0.0009977885986370057, + "loss": 0.97146988, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.30126953, + "step": 307, + "time_per_iteration": 2.551680088043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101228, + "balance_loss_mlp": 1.0711869, + "epoch": 0.05925355906117737, + "flos": 591213150720.0, + "grad_norm": 0.07969081592203556, + "language_loss": 0.92546368, + "learning_rate": 0.000997759233535118, + "loss": 0.93647587, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.30029297, + "step": 308, + "time_per_iteration": 2.81258487701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118732, + "balance_loss_mlp": 1.08861959, + "epoch": 0.05944594074644094, + "flos": 563373522432.0, + "grad_norm": 0.08786467203130244, + "language_loss": 0.97749913, + "learning_rate": 0.0009977296751859576, + "loss": 0.98868644, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.30102539, + "step": 309, + "time_per_iteration": 2.7263362407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105319, + "balance_loss_mlp": 1.07611227, + "epoch": 0.0596383224317045, + "flos": 538483147776.0, + "grad_norm": 0.06446924521708428, + "language_loss": 1.00202072, + "learning_rate": 0.0009976999236009998, + "loss": 1.01307392, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.29174805, + "step": 310, + "time_per_iteration": 2.762798309326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104, + "balance_loss_mlp": 1.0751754, + "epoch": 0.059830704116968066, + "flos": 560684726784.0, + "grad_norm": 0.07707725190270151, + "language_loss": 1.00980616, + "learning_rate": 0.0009976699787917955, + "loss": 1.02084613, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.2878418, + "step": 311, + "time_per_iteration": 2.681075096130371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02018517, + "balance_loss_mlp": 1.99772644, + "epoch": 0.060023085802231625, + "flos": 1569759962112.0, + "grad_norm": 0.13809188064678232, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.75461507, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.20800781, + "step": 312, + "time_per_iteration": 4.931787014007568 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115143, + "balance_loss_mlp": 1.08445871, + "epoch": 0.06021546748749519, + "flos": 482415395328.0, + "grad_norm": 0.08749443672960691, + "language_loss": 0.93570709, + "learning_rate": 0.0009976095095472243, + "loss": 0.94685858, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.30688477, + "step": 313, + "time_per_iteration": 2.5869529247283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101349, + "balance_loss_mlp": 1.07152247, + "epoch": 0.06040784917275875, + "flos": 619889407488.0, + "grad_norm": 0.1052711311589574, + "language_loss": 0.94373065, + "learning_rate": 0.0009975789851353334, + "loss": 0.95474416, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.29785156, + "step": 314, + "time_per_iteration": 2.825021505355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091881, + "balance_loss_mlp": 1.06434321, + "epoch": 0.06060023085802232, + "flos": 483292721664.0, + "grad_norm": 0.0790023799752532, + "language_loss": 0.96930784, + "learning_rate": 0.0009975482675461487, + "loss": 0.98022664, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.27563477, + "step": 315, + "time_per_iteration": 2.657176971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092449, + "balance_loss_mlp": 1.06493592, + "epoch": 0.06079261254328588, + "flos": 581620483584.0, + "grad_norm": 0.08103250083402935, + "language_loss": 0.94523442, + "learning_rate": 0.0009975173567915952, + "loss": 0.95615894, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.27502441, + "step": 316, + "time_per_iteration": 2.7485179901123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087945, + "balance_loss_mlp": 1.06031179, + "epoch": 0.060984994228549444, + "flos": 687492306432.0, + "grad_norm": 0.09749512289660646, + "language_loss": 0.88217789, + "learning_rate": 0.000997486252883674, + "loss": 0.89305735, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.27685547, + "step": 317, + "time_per_iteration": 2.848203420639038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083831, + "balance_loss_mlp": 1.05665123, + "epoch": 0.061177375913813004, + "flos": 1314284327424.0, + "grad_norm": 0.0666962391969605, + "language_loss": 0.94262481, + "learning_rate": 0.0009974549558344602, + "loss": 0.95346314, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.27197266, + "step": 318, + "time_per_iteration": 3.6451311111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095985, + "balance_loss_mlp": 1.06921029, + "epoch": 0.06136975759907657, + "flos": 574072040448.0, + "grad_norm": 0.08376464388690433, + "language_loss": 1.02536392, + "learning_rate": 0.000997423465656105, + "loss": 1.03632367, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.26831055, + "step": 319, + "time_per_iteration": 2.7345075607299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091659, + "balance_loss_mlp": 1.06395483, + "epoch": 0.06156213928434013, + "flos": 527281242624.0, + "grad_norm": 0.0893807265100656, + "language_loss": 1.00347686, + "learning_rate": 0.0009973917823608335, + "loss": 1.01439345, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.27734375, + "step": 320, + "time_per_iteration": 2.59576153755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092889, + "balance_loss_mlp": 1.0656141, + "epoch": 0.061754520969603696, + "flos": 495238123008.0, + "grad_norm": 0.0805868867251315, + "language_loss": 0.95831037, + "learning_rate": 0.0009973599059609462, + "loss": 0.96923929, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.27294922, + "step": 321, + "time_per_iteration": 2.7188515663146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098538, + "balance_loss_mlp": 1.07090497, + "epoch": 0.061946902654867256, + "flos": 439839673344.0, + "grad_norm": 0.07327098118113982, + "language_loss": 0.93067813, + "learning_rate": 0.000997327836468819, + "loss": 0.94166344, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.27685547, + "step": 322, + "time_per_iteration": 2.6020476818084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112868, + "balance_loss_mlp": 1.08469939, + "epoch": 0.06213928434013082, + "flos": 598490961408.0, + "grad_norm": 0.08699924077148347, + "language_loss": 0.95677376, + "learning_rate": 0.000997295573896902, + "loss": 0.96790254, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.28137207, + "step": 323, + "time_per_iteration": 2.829726457595825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01600081, + "balance_loss_mlp": 1.58253336, + "epoch": 0.06233166602539438, + "flos": 1449393716736.0, + "grad_norm": 0.0733345350087818, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.82796121, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.17578125, + "step": 324, + "time_per_iteration": 4.711047410964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01522296, + "balance_loss_mlp": 1.50503409, + "epoch": 0.06252404771065795, + "flos": 1462504453632.0, + "grad_norm": 0.05691363452686859, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80094236, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.17285156, + "step": 325, + "time_per_iteration": 4.9186623096466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221563, + "balance_loss_mlp": 1.19023478, + "epoch": 0.06271642939592151, + "flos": 464059335168.0, + "grad_norm": 0.14041524981394118, + "language_loss": 0.90815508, + "learning_rate": 0.000997197627828043, + "loss": 0.9203707, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.31323242, + "step": 326, + "time_per_iteration": 2.5453081130981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200774, + "balance_loss_mlp": 1.17032802, + "epoch": 0.06290881108118507, + "flos": 532111776768.0, + "grad_norm": 0.12119005069833769, + "language_loss": 0.85965139, + "learning_rate": 0.0009971645930629716, + "loss": 0.87165916, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.30419922, + "step": 327, + "time_per_iteration": 2.7031009197235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169691, + "balance_loss_mlp": 1.13969803, + "epoch": 0.06310119276644863, + "flos": 673262572032.0, + "grad_norm": 0.07816671551275867, + "language_loss": 0.99088198, + "learning_rate": 0.0009971313652814872, + "loss": 1.00257885, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.29956055, + "step": 328, + "time_per_iteration": 2.8222203254699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157161, + "balance_loss_mlp": 1.12542796, + "epoch": 0.0632935744517122, + "flos": 770404497408.0, + "grad_norm": 0.09350719298211221, + "language_loss": 0.96469927, + "learning_rate": 0.0009970979444964903, + "loss": 0.97627091, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.31713867, + "step": 329, + "time_per_iteration": 2.965010643005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142135, + "balance_loss_mlp": 1.11214232, + "epoch": 0.06348595613697576, + "flos": 561649393152.0, + "grad_norm": 0.10929900711039164, + "language_loss": 0.9773742, + "learning_rate": 0.0009970643307209556, + "loss": 0.98879552, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.29980469, + "step": 330, + "time_per_iteration": 2.816967248916626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122149, + "balance_loss_mlp": 1.09279943, + "epoch": 0.06367833782223932, + "flos": 675891730944.0, + "grad_norm": 0.09151857562667157, + "language_loss": 0.94555062, + "learning_rate": 0.0009970305239679334, + "loss": 0.95677209, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.29321289, + "step": 331, + "time_per_iteration": 2.8171606063842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103955, + "balance_loss_mlp": 1.07594109, + "epoch": 0.06387071950750288, + "flos": 495035891712.0, + "grad_norm": 0.0852127129346853, + "language_loss": 0.98894572, + "learning_rate": 0.0009969965242505483, + "loss": 0.99998534, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.28027344, + "step": 332, + "time_per_iteration": 2.663892984390259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110112, + "balance_loss_mlp": 1.08111989, + "epoch": 0.06406310119276645, + "flos": 533170985472.0, + "grad_norm": 0.06505292490812643, + "language_loss": 0.94837928, + "learning_rate": 0.0009969623315820007, + "loss": 0.9594804, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.28979492, + "step": 333, + "time_per_iteration": 2.7053513526916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100319, + "balance_loss_mlp": 1.07256722, + "epoch": 0.06425548287803001, + "flos": 455940513792.0, + "grad_norm": 0.09842187194277592, + "language_loss": 0.95016736, + "learning_rate": 0.000996927945975565, + "loss": 0.96117055, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.27758789, + "step": 334, + "time_per_iteration": 2.599308490753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113066, + "balance_loss_mlp": 1.08405077, + "epoch": 0.06444786456329357, + "flos": 559817574912.0, + "grad_norm": 0.0758688902805758, + "language_loss": 0.9173829, + "learning_rate": 0.0009968933674445906, + "loss": 0.92851353, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.29003906, + "step": 335, + "time_per_iteration": 2.6885735988616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117272, + "balance_loss_mlp": 1.08863783, + "epoch": 0.06464024624855713, + "flos": 665769383424.0, + "grad_norm": 0.08483114639707492, + "language_loss": 0.94787967, + "learning_rate": 0.0009968585960025028, + "loss": 0.95905232, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.28613281, + "step": 336, + "time_per_iteration": 3.0145304203033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01664619, + "balance_loss_mlp": 1.64468718, + "epoch": 0.0648326279338207, + "flos": 1520578704384.0, + "grad_norm": 0.07989076612991787, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.79317814, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.19921875, + "step": 337, + "time_per_iteration": 4.812415361404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113857, + "balance_loss_mlp": 1.08729684, + "epoch": 0.06502500961908426, + "flos": 1142872768512.0, + "grad_norm": 0.10710041073234706, + "language_loss": 0.93311036, + "learning_rate": 0.0009967884744390583, + "loss": 0.94424891, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.26611328, + "step": 338, + "time_per_iteration": 3.551198959350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010994, + "balance_loss_mlp": 1.07226825, + "epoch": 0.06521739130434782, + "flos": 582339248640.0, + "grad_norm": 0.09192445713744875, + "language_loss": 0.93620086, + "learning_rate": 0.0009967531243449256, + "loss": 0.94719481, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.27148438, + "step": 339, + "time_per_iteration": 2.659802198410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093825, + "balance_loss_mlp": 1.06592965, + "epoch": 0.06540977298961138, + "flos": 497398800384.0, + "grad_norm": 0.08159898153834201, + "language_loss": 1.01212323, + "learning_rate": 0.000996717581394126, + "loss": 1.02306151, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.27905273, + "step": 340, + "time_per_iteration": 2.570789337158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085985, + "balance_loss_mlp": 1.05887651, + "epoch": 0.06560215467487496, + "flos": 542613855744.0, + "grad_norm": 0.08632134404445381, + "language_loss": 1.01338696, + "learning_rate": 0.000996681845600459, + "loss": 1.02424693, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.27124023, + "step": 341, + "time_per_iteration": 2.676576852798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092801, + "balance_loss_mlp": 1.06526327, + "epoch": 0.06579453636013852, + "flos": 413230961664.0, + "grad_norm": 0.09337377055156564, + "language_loss": 0.93410671, + "learning_rate": 0.0009966459169777982, + "loss": 0.94503474, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.27563477, + "step": 342, + "time_per_iteration": 2.5015692710876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093959, + "balance_loss_mlp": 1.06565928, + "epoch": 0.06598691804540208, + "flos": 560354457600.0, + "grad_norm": 0.06741983677161045, + "language_loss": 1.02151966, + "learning_rate": 0.0009966097955400924, + "loss": 1.03245926, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.28320312, + "step": 343, + "time_per_iteration": 2.679197311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108203, + "balance_loss_mlp": 1.054111, + "epoch": 0.06617929973066564, + "flos": 571789117440.0, + "grad_norm": 0.10243167176705169, + "language_loss": 0.95901835, + "learning_rate": 0.0009965734813013652, + "loss": 0.96983862, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.27954102, + "step": 344, + "time_per_iteration": 2.7926278114318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094237, + "balance_loss_mlp": 1.06638968, + "epoch": 0.06637168141592921, + "flos": 490234470912.0, + "grad_norm": 0.07573309355987462, + "language_loss": 0.97904384, + "learning_rate": 0.0009965369742757151, + "loss": 0.98998624, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.27856445, + "step": 345, + "time_per_iteration": 2.5709216594696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094661, + "balance_loss_mlp": 1.06564522, + "epoch": 0.06656406310119277, + "flos": 1078735656960.0, + "grad_norm": 0.07452264052062355, + "language_loss": 0.94766545, + "learning_rate": 0.0009965002744773152, + "loss": 0.95861208, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.28979492, + "step": 346, + "time_per_iteration": 3.500114679336548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110333, + "balance_loss_mlp": 1.0740993, + "epoch": 0.06675644478645633, + "flos": 513421065216.0, + "grad_norm": 0.06770544307121987, + "language_loss": 0.92343372, + "learning_rate": 0.0009964633819204139, + "loss": 0.93446708, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.29223633, + "step": 347, + "time_per_iteration": 2.660534143447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01495519, + "balance_loss_mlp": 1.47739971, + "epoch": 0.06694882647171989, + "flos": 1446359943168.0, + "grad_norm": 0.07316018638585145, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83296633, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.18164062, + "step": 348, + "time_per_iteration": 4.936125040054321 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01453408, + "balance_loss_mlp": 1.43557465, + "epoch": 0.06714120815698346, + "flos": 1551230784000.0, + "grad_norm": 0.05966333264944154, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76607287, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.17871094, + "step": 349, + "time_per_iteration": 4.916368722915649 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121413, + "balance_loss_mlp": 1.09161115, + "epoch": 0.06733358984224702, + "flos": 879689673216.0, + "grad_norm": 0.09818918049538049, + "language_loss": 0.91932184, + "learning_rate": 0.000996351547842304, + "loss": 0.93053597, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.29760742, + "step": 350, + "time_per_iteration": 3.1482698917388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116118, + "balance_loss_mlp": 1.08686399, + "epoch": 0.06752597152751058, + "flos": 518654651904.0, + "grad_norm": 0.08574695638310478, + "language_loss": 0.9006294, + "learning_rate": 0.0009963138843953744, + "loss": 0.91179061, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.29223633, + "step": 351, + "time_per_iteration": 2.6286985874176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112662, + "balance_loss_mlp": 1.09572136, + "epoch": 0.06771835321277414, + "flos": 539366266368.0, + "grad_norm": 0.062103550545623463, + "language_loss": 0.94588864, + "learning_rate": 0.000996276028262306, + "loss": 0.95715487, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.30859375, + "step": 352, + "time_per_iteration": 2.8076047897338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118319, + "balance_loss_mlp": 1.08899331, + "epoch": 0.0679107348980377, + "flos": 460430604288.0, + "grad_norm": 0.08848881047736162, + "language_loss": 1.00543904, + "learning_rate": 0.0009962379794577964, + "loss": 1.01662219, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.29296875, + "step": 353, + "time_per_iteration": 2.6257643699645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126251, + "balance_loss_mlp": 1.09525669, + "epoch": 0.06810311658330127, + "flos": 635601752064.0, + "grad_norm": 0.07023516682391727, + "language_loss": 0.91387081, + "learning_rate": 0.000996199737996617, + "loss": 0.92513329, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.31005859, + "step": 354, + "time_per_iteration": 2.9115777015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108023, + "balance_loss_mlp": 1.07862616, + "epoch": 0.06829549826856483, + "flos": 464443448832.0, + "grad_norm": 0.10590106261560671, + "language_loss": 0.99111325, + "learning_rate": 0.0009961613038936149, + "loss": 1.00219345, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.29345703, + "step": 355, + "time_per_iteration": 2.632269859313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106903, + "balance_loss_mlp": 1.07848334, + "epoch": 0.06848787995382839, + "flos": 634335929856.0, + "grad_norm": 0.06351615461114794, + "language_loss": 0.92452097, + "learning_rate": 0.000996122677163711, + "loss": 0.93559003, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.28417969, + "step": 356, + "time_per_iteration": 2.8401455879211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116364, + "balance_loss_mlp": 1.08880246, + "epoch": 0.06868026163909195, + "flos": 806023913472.0, + "grad_norm": 0.08494375059258584, + "language_loss": 0.98204505, + "learning_rate": 0.000996083857821902, + "loss": 0.99320877, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.27612305, + "step": 357, + "time_per_iteration": 3.0309877395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123871, + "balance_loss_mlp": 1.09387815, + "epoch": 0.06887264332435553, + "flos": 438997252608.0, + "grad_norm": 0.09643576242322613, + "language_loss": 0.95811963, + "learning_rate": 0.0009960448458832588, + "loss": 0.96935833, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.30004883, + "step": 358, + "time_per_iteration": 2.6974990367889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119311, + "balance_loss_mlp": 1.09053433, + "epoch": 0.06906502500961909, + "flos": 484513463808.0, + "grad_norm": 0.08018524599206517, + "language_loss": 0.95721531, + "learning_rate": 0.000996005641362927, + "loss": 0.96840835, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.28735352, + "step": 359, + "time_per_iteration": 2.589519739151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125244, + "balance_loss_mlp": 1.09663391, + "epoch": 0.06925740669488265, + "flos": 733293706752.0, + "grad_norm": 0.08939873306910956, + "language_loss": 0.98375708, + "learning_rate": 0.0009959662442761274, + "loss": 0.99500948, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.28613281, + "step": 360, + "time_per_iteration": 2.9202845096588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121734, + "balance_loss_mlp": 1.09360027, + "epoch": 0.0694497883801462, + "flos": 552127947264.0, + "grad_norm": 0.08129648248307358, + "language_loss": 0.92418718, + "learning_rate": 0.000995926654638155, + "loss": 0.93540448, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.28149414, + "step": 361, + "time_per_iteration": 2.807333469390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125246, + "balance_loss_mlp": 1.09706521, + "epoch": 0.06964217006540978, + "flos": 677708992512.0, + "grad_norm": 0.09207283388165423, + "language_loss": 0.94086993, + "learning_rate": 0.00099588687246438, + "loss": 0.95212233, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.28222656, + "step": 362, + "time_per_iteration": 2.841938018798828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144139, + "balance_loss_mlp": 1.1155293, + "epoch": 0.06983455175067334, + "flos": 523987163136.0, + "grad_norm": 0.09456174795196681, + "language_loss": 1.01274741, + "learning_rate": 0.0009958468977702471, + "loss": 1.02418876, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.28588867, + "step": 363, + "time_per_iteration": 2.633852958679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01648964, + "balance_loss_mlp": 1.62617075, + "epoch": 0.0700269334359369, + "flos": 1575943658496.0, + "grad_norm": 0.13616610145697036, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81383669, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.22753906, + "step": 364, + "time_per_iteration": 4.863068580627441 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011272, + "balance_loss_mlp": 1.09961534, + "epoch": 0.07021931512120046, + "flos": 1012848274944.0, + "grad_norm": 0.09005148424800312, + "language_loss": 0.90165555, + "learning_rate": 0.0009957663708830612, + "loss": 0.91292757, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.27612305, + "step": 365, + "time_per_iteration": 3.281414031982422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123586, + "balance_loss_mlp": 1.09442711, + "epoch": 0.07041169680646403, + "flos": 822622348800.0, + "grad_norm": 0.09334468540758137, + "language_loss": 0.91653895, + "learning_rate": 0.0009957258187212714, + "loss": 0.92777479, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.29174805, + "step": 366, + "time_per_iteration": 3.038696050643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01445219, + "balance_loss_mlp": 1.42652738, + "epoch": 0.07060407849172759, + "flos": 1413670993920.0, + "grad_norm": 0.06427367616648676, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80640084, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.18652344, + "step": 367, + "time_per_iteration": 4.7983925342559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116044, + "balance_loss_mlp": 1.08788657, + "epoch": 0.07079646017699115, + "flos": 512652837888.0, + "grad_norm": 0.13146714334583684, + "language_loss": 0.89768213, + "learning_rate": 0.0009956441370400167, + "loss": 0.90884256, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.28173828, + "step": 368, + "time_per_iteration": 2.6321308612823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119626, + "balance_loss_mlp": 1.09201741, + "epoch": 0.07098884186225471, + "flos": 540240772608.0, + "grad_norm": 0.12272393932614807, + "language_loss": 0.9541142, + "learning_rate": 0.0009956030075522636, + "loss": 0.96531045, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.27636719, + "step": 369, + "time_per_iteration": 2.772404909133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114007, + "balance_loss_mlp": 1.08685124, + "epoch": 0.07118122354751828, + "flos": 548419230720.0, + "grad_norm": 0.09366652552108264, + "language_loss": 0.95805156, + "learning_rate": 0.0009955616856543587, + "loss": 0.96919167, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.2722168, + "step": 370, + "time_per_iteration": 2.628877878189087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113669, + "balance_loss_mlp": 1.08608413, + "epoch": 0.07137360523278184, + "flos": 620612554752.0, + "grad_norm": 0.08609469252939483, + "language_loss": 0.88399851, + "learning_rate": 0.0009955201713623448, + "loss": 0.89513522, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.27612305, + "step": 371, + "time_per_iteration": 2.7591450214385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328242, + "balance_loss_mlp": 1.31155288, + "epoch": 0.0715659869180454, + "flos": 1501850115072.0, + "grad_norm": 0.05190160953718325, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78000963, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.16699219, + "step": 372, + "time_per_iteration": 4.995140552520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101786, + "balance_loss_mlp": 1.07563186, + "epoch": 0.07175836860330896, + "flos": 495246887424.0, + "grad_norm": 0.13457072532657127, + "language_loss": 1.02136469, + "learning_rate": 0.0009954365656605333, + "loss": 1.03238261, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.26184082, + "step": 373, + "time_per_iteration": 2.56646990776062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106565, + "balance_loss_mlp": 1.07979035, + "epoch": 0.07195075028857253, + "flos": 785387902464.0, + "grad_norm": 0.08663326270818063, + "language_loss": 0.94899744, + "learning_rate": 0.0009953944742831947, + "loss": 0.96006304, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.26831055, + "step": 374, + "time_per_iteration": 2.9695053100585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102573, + "balance_loss_mlp": 1.07596529, + "epoch": 0.0721431319738361, + "flos": 592799067648.0, + "grad_norm": 0.09289035836035217, + "language_loss": 0.97933537, + "learning_rate": 0.0009953521905766642, + "loss": 0.99036103, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.26647949, + "step": 375, + "time_per_iteration": 2.942178249359131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113342, + "balance_loss_mlp": 1.08630502, + "epoch": 0.07233551365909965, + "flos": 547981272576.0, + "grad_norm": 0.10463311528366259, + "language_loss": 0.97135454, + "learning_rate": 0.0009953097145573577, + "loss": 0.98248798, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.27075195, + "step": 376, + "time_per_iteration": 2.6447842121124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115303, + "balance_loss_mlp": 1.08645439, + "epoch": 0.07252789534436321, + "flos": 957170428416.0, + "grad_norm": 0.10778381820568583, + "language_loss": 0.93408906, + "learning_rate": 0.000995267046241766, + "loss": 0.94524205, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.28808594, + "step": 377, + "time_per_iteration": 3.281200647354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106472, + "balance_loss_mlp": 1.07807684, + "epoch": 0.07272027702962677, + "flos": 507398902272.0, + "grad_norm": 0.08395054735439604, + "language_loss": 0.93929148, + "learning_rate": 0.0009952241856464547, + "loss": 0.95035625, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.28393555, + "step": 378, + "time_per_iteration": 2.6047444343566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011332, + "balance_loss_mlp": 1.10265875, + "epoch": 0.07291265871489035, + "flos": 612128558592.0, + "grad_norm": 0.10390894184481733, + "language_loss": 0.9941417, + "learning_rate": 0.0009951811327880632, + "loss": 1.00547373, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.30541992, + "step": 379, + "time_per_iteration": 2.726473331451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142545, + "balance_loss_mlp": 1.11162257, + "epoch": 0.0731050404001539, + "flos": 495502963200.0, + "grad_norm": 0.10097597522795056, + "language_loss": 0.93640876, + "learning_rate": 0.0009951378876833063, + "loss": 0.94783425, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.30908203, + "step": 380, + "time_per_iteration": 2.5623717308044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113686, + "balance_loss_mlp": 1.10598469, + "epoch": 0.07329742208541747, + "flos": 639677205504.0, + "grad_norm": 0.09709945532148136, + "language_loss": 1.0008266, + "learning_rate": 0.0009950944503489736, + "loss": 1.01219511, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.30834961, + "step": 381, + "time_per_iteration": 2.7424862384796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125905, + "balance_loss_mlp": 1.0951966, + "epoch": 0.07348980377068103, + "flos": 815999284224.0, + "grad_norm": 0.08729931882910318, + "language_loss": 0.94688666, + "learning_rate": 0.0009950508208019285, + "loss": 0.95814574, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.30664062, + "step": 382, + "time_per_iteration": 3.011807441711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115566, + "balance_loss_mlp": 1.08612156, + "epoch": 0.0736821854559446, + "flos": 508383917568.0, + "grad_norm": 0.09192641530722392, + "language_loss": 0.98937929, + "learning_rate": 0.0009950069990591096, + "loss": 1.00053501, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.29418945, + "step": 383, + "time_per_iteration": 2.634744644165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266456, + "balance_loss_mlp": 1.25081599, + "epoch": 0.07387456714120816, + "flos": 1553801716224.0, + "grad_norm": 0.07157218635827683, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77667826, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.15625, + "step": 384, + "time_per_iteration": 4.909826993942261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123479, + "balance_loss_mlp": 1.093009, + "epoch": 0.07406694882647172, + "flos": 525219489792.0, + "grad_norm": 0.09152581134979716, + "language_loss": 0.9216727, + "learning_rate": 0.0009949187790542777, + "loss": 0.93290746, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.30419922, + "step": 385, + "time_per_iteration": 2.7053511142730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126669, + "balance_loss_mlp": 1.09605598, + "epoch": 0.07425933051173528, + "flos": 497468611584.0, + "grad_norm": 0.0847962235917395, + "language_loss": 0.87653643, + "learning_rate": 0.0009948743808265148, + "loss": 0.88780314, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.30566406, + "step": 386, + "time_per_iteration": 2.678089141845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138501, + "balance_loss_mlp": 1.10865068, + "epoch": 0.07445171219699885, + "flos": 504740630016.0, + "grad_norm": 0.08492617281736899, + "language_loss": 0.97336739, + "learning_rate": 0.0009948297904714782, + "loss": 0.98475236, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.29833984, + "step": 387, + "time_per_iteration": 2.7185778617858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146366, + "balance_loss_mlp": 1.11620593, + "epoch": 0.07464409388226241, + "flos": 553693515264.0, + "grad_norm": 0.07151378861674496, + "language_loss": 0.90523744, + "learning_rate": 0.0009947850080064796, + "loss": 0.91670114, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.30151367, + "step": 388, + "time_per_iteration": 2.799166202545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158036, + "balance_loss_mlp": 1.12689841, + "epoch": 0.07483647556752597, + "flos": 776511028224.0, + "grad_norm": 0.11664332596196766, + "language_loss": 0.94951898, + "learning_rate": 0.0009947400334489047, + "loss": 0.96109939, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.31103516, + "step": 389, + "time_per_iteration": 3.0231211185455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146122, + "balance_loss_mlp": 1.11646235, + "epoch": 0.07502885725278953, + "flos": 612256596480.0, + "grad_norm": 0.09913116245985863, + "language_loss": 0.85822582, + "learning_rate": 0.0009946948668162145, + "loss": 0.86968708, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.29638672, + "step": 390, + "time_per_iteration": 2.8080904483795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129912, + "balance_loss_mlp": 1.09910846, + "epoch": 0.0752212389380531, + "flos": 688324552704.0, + "grad_norm": 0.1060751216039937, + "language_loss": 0.91006148, + "learning_rate": 0.0009946495081259441, + "loss": 0.92136061, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.30786133, + "step": 391, + "time_per_iteration": 2.853335380554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125709, + "balance_loss_mlp": 1.09528649, + "epoch": 0.07541362062331666, + "flos": 765362967552.0, + "grad_norm": 0.10996734320487103, + "language_loss": 0.93701887, + "learning_rate": 0.0009946039573957035, + "loss": 0.94827592, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.30371094, + "step": 392, + "time_per_iteration": 2.926420211791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107845, + "balance_loss_mlp": 1.07887673, + "epoch": 0.07560600230858022, + "flos": 588460336128.0, + "grad_norm": 0.10253812696642157, + "language_loss": 0.91059798, + "learning_rate": 0.000994558214643177, + "loss": 0.92167646, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.28979492, + "step": 393, + "time_per_iteration": 2.783536434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103583, + "balance_loss_mlp": 1.07344699, + "epoch": 0.07579838399384378, + "flos": 749508028416.0, + "grad_norm": 0.08274248346409746, + "language_loss": 0.91916323, + "learning_rate": 0.000994512279886123, + "loss": 0.93019903, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.30078125, + "step": 394, + "time_per_iteration": 3.0799474716186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099135, + "balance_loss_mlp": 1.06902301, + "epoch": 0.07599076567910736, + "flos": 523185440256.0, + "grad_norm": 0.06927054930208885, + "language_loss": 0.93251747, + "learning_rate": 0.0009944661531423758, + "loss": 0.9435088, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.30078125, + "step": 395, + "time_per_iteration": 2.6641883850097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103492, + "balance_loss_mlp": 1.07383251, + "epoch": 0.07618314736437092, + "flos": 550812662784.0, + "grad_norm": 0.09904896099194287, + "language_loss": 0.91404933, + "learning_rate": 0.000994419834429843, + "loss": 0.92508423, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.29638672, + "step": 396, + "time_per_iteration": 2.661850690841675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114049, + "balance_loss_mlp": 1.08257747, + "epoch": 0.07637552904963447, + "flos": 697901253120.0, + "grad_norm": 0.10979610845710805, + "language_loss": 0.93416023, + "learning_rate": 0.0009943733237665069, + "loss": 0.94530076, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.31445312, + "step": 397, + "time_per_iteration": 2.854339361190796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111663, + "balance_loss_mlp": 1.08561158, + "epoch": 0.07656791073489803, + "flos": 579066928128.0, + "grad_norm": 0.07380051857889673, + "language_loss": 0.9521122, + "learning_rate": 0.0009943266211704248, + "loss": 0.96327847, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.30981445, + "step": 398, + "time_per_iteration": 2.958059787750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110413, + "balance_loss_mlp": 1.0786798, + "epoch": 0.0767602924201616, + "flos": 416923711488.0, + "grad_norm": 0.09100164928673704, + "language_loss": 0.97291386, + "learning_rate": 0.000994279726659728, + "loss": 0.98401797, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.31713867, + "step": 399, + "time_per_iteration": 2.5242953300476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128703, + "balance_loss_mlp": 1.09413218, + "epoch": 0.07695267410542517, + "flos": 482671471104.0, + "grad_norm": 0.09258616119375639, + "language_loss": 0.92782032, + "learning_rate": 0.0009942326402526231, + "loss": 0.93910736, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.34594727, + "step": 400, + "time_per_iteration": 2.5207695960998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143337, + "balance_loss_mlp": 1.10955346, + "epoch": 0.07714505579068873, + "flos": 530742647808.0, + "grad_norm": 0.07710774358121592, + "language_loss": 0.92332727, + "learning_rate": 0.0009941853619673902, + "loss": 0.93476063, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.33789062, + "step": 401, + "time_per_iteration": 2.6304752826690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142379, + "balance_loss_mlp": 1.10947704, + "epoch": 0.07733743747595229, + "flos": 804635845632.0, + "grad_norm": 0.09709488616354546, + "language_loss": 0.95104444, + "learning_rate": 0.0009941378918223844, + "loss": 0.96246827, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.32885742, + "step": 402, + "time_per_iteration": 3.0903730392456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136269, + "balance_loss_mlp": 1.10186553, + "epoch": 0.07752981916121585, + "flos": 622192679424.0, + "grad_norm": 0.09176808059924663, + "language_loss": 0.88839906, + "learning_rate": 0.0009940902298360354, + "loss": 0.89976174, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.34423828, + "step": 403, + "time_per_iteration": 2.7252347469329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129357, + "balance_loss_mlp": 1.09478593, + "epoch": 0.07772220084647942, + "flos": 727961195520.0, + "grad_norm": 0.08094022735558755, + "language_loss": 0.96807957, + "learning_rate": 0.0009940423760268473, + "loss": 0.9793731, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.34619141, + "step": 404, + "time_per_iteration": 2.912560224533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136255, + "balance_loss_mlp": 1.0998956, + "epoch": 0.07791458253174298, + "flos": 555149984256.0, + "grad_norm": 0.1131644160055788, + "language_loss": 0.90535253, + "learning_rate": 0.0009939943304133982, + "loss": 0.91671515, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.36352539, + "step": 405, + "time_per_iteration": 2.691524028778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128492, + "balance_loss_mlp": 1.09301567, + "epoch": 0.07810696421700654, + "flos": 552919495680.0, + "grad_norm": 0.0877419108538044, + "language_loss": 0.97356665, + "learning_rate": 0.0009939460930143416, + "loss": 0.9848516, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.35522461, + "step": 406, + "time_per_iteration": 2.624150276184082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130223, + "balance_loss_mlp": 1.09484172, + "epoch": 0.0782993459022701, + "flos": 650323289088.0, + "grad_norm": 0.0945833964014614, + "language_loss": 0.92588282, + "learning_rate": 0.0009938976638484043, + "loss": 0.93718511, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.35400391, + "step": 407, + "time_per_iteration": 2.943443775177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132372, + "balance_loss_mlp": 1.09625125, + "epoch": 0.07849172758753367, + "flos": 495926364672.0, + "grad_norm": 0.11302097827133319, + "language_loss": 0.90334702, + "learning_rate": 0.0009938490429343887, + "loss": 0.91467071, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.36157227, + "step": 408, + "time_per_iteration": 2.5614538192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156856, + "balance_loss_mlp": 1.11870956, + "epoch": 0.07868410927279723, + "flos": 577696389120.0, + "grad_norm": 0.08706398753077066, + "language_loss": 0.9151262, + "learning_rate": 0.0009938002302911709, + "loss": 0.92669487, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.38134766, + "step": 409, + "time_per_iteration": 2.7606911659240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185423, + "balance_loss_mlp": 1.14758611, + "epoch": 0.07887649095806079, + "flos": 522698019840.0, + "grad_norm": 0.11763043112663725, + "language_loss": 0.93195748, + "learning_rate": 0.0009937512259377015, + "loss": 0.94381177, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.37841797, + "step": 410, + "time_per_iteration": 2.664318323135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187972, + "balance_loss_mlp": 1.15199518, + "epoch": 0.07906887264332435, + "flos": 556958481408.0, + "grad_norm": 0.10450629225071802, + "language_loss": 0.93972069, + "learning_rate": 0.000993702029893006, + "loss": 0.95160043, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.359375, + "step": 411, + "time_per_iteration": 2.78944730758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182604, + "balance_loss_mlp": 1.14679348, + "epoch": 0.07926125432858792, + "flos": 821641715712.0, + "grad_norm": 0.0999267349206771, + "language_loss": 0.93036819, + "learning_rate": 0.0009936526421761838, + "loss": 0.94219422, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.3581543, + "step": 412, + "time_per_iteration": 3.070317268371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138713, + "balance_loss_mlp": 1.1031884, + "epoch": 0.07945363601385148, + "flos": 562072794624.0, + "grad_norm": 0.103699157973277, + "language_loss": 0.95454085, + "learning_rate": 0.000993603062806409, + "loss": 0.96592796, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.35546875, + "step": 413, + "time_per_iteration": 2.6778509616851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111475, + "balance_loss_mlp": 1.080966, + "epoch": 0.07964601769911504, + "flos": 517615792128.0, + "grad_norm": 0.1031900517026183, + "language_loss": 0.96687901, + "learning_rate": 0.0009935532918029298, + "loss": 0.97802651, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.33813477, + "step": 414, + "time_per_iteration": 2.598691701889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115143, + "balance_loss_mlp": 1.08016729, + "epoch": 0.0798383993843786, + "flos": 538956011520.0, + "grad_norm": 0.10374121868926973, + "language_loss": 0.91896659, + "learning_rate": 0.0009935033291850694, + "loss": 0.93011802, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.35009766, + "step": 415, + "time_per_iteration": 2.6626100540161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136571, + "balance_loss_mlp": 1.10121322, + "epoch": 0.08003078106964218, + "flos": 484901959680.0, + "grad_norm": 0.1007950470797911, + "language_loss": 0.94399852, + "learning_rate": 0.0009934531749722247, + "loss": 0.95536423, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.35351562, + "step": 416, + "time_per_iteration": 2.6062543392181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161455, + "balance_loss_mlp": 1.12671685, + "epoch": 0.08022316275490574, + "flos": 517999905792.0, + "grad_norm": 0.14193661609984684, + "language_loss": 0.91743952, + "learning_rate": 0.0009934028291838672, + "loss": 0.92905408, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.34790039, + "step": 417, + "time_per_iteration": 2.7159759998321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170119, + "balance_loss_mlp": 1.134166, + "epoch": 0.0804155444401693, + "flos": 493755512832.0, + "grad_norm": 0.12060272101738621, + "language_loss": 0.87969685, + "learning_rate": 0.0009933522918395433, + "loss": 0.89139807, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.35961914, + "step": 418, + "time_per_iteration": 2.6525259017944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288605, + "balance_loss_mlp": 1.26361907, + "epoch": 0.08060792612543285, + "flos": 1580567579136.0, + "grad_norm": 0.05680606480361405, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79539704, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.24902344, + "step": 419, + "time_per_iteration": 4.8565216064453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147061, + "balance_loss_mlp": 1.11074984, + "epoch": 0.08080030781069643, + "flos": 525090041856.0, + "grad_norm": 0.12828879348175987, + "language_loss": 1.03302395, + "learning_rate": 0.000993250642561551, + "loss": 1.04449451, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.36279297, + "step": 420, + "time_per_iteration": 2.6118712425231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139372, + "balance_loss_mlp": 1.10284615, + "epoch": 0.08099268949595999, + "flos": 546459374592.0, + "grad_norm": 0.09279765906948532, + "language_loss": 0.90646845, + "learning_rate": 0.0009931995306673466, + "loss": 0.91786218, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.36499023, + "step": 421, + "time_per_iteration": 2.7097063064575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137947, + "balance_loss_mlp": 1.10170722, + "epoch": 0.08118507118122355, + "flos": 510116811264.0, + "grad_norm": 0.12264346802799699, + "language_loss": 0.9584164, + "learning_rate": 0.000993148227296103, + "loss": 0.96979594, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.36254883, + "step": 422, + "time_per_iteration": 2.6224865913391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112855, + "balance_loss_mlp": 1.093431, + "epoch": 0.08137745286648711, + "flos": 720339969024.0, + "grad_norm": 0.09272021371299098, + "language_loss": 0.85445499, + "learning_rate": 0.000993096732467738, + "loss": 0.86574042, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.3515625, + "step": 423, + "time_per_iteration": 2.9733965396881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140481, + "balance_loss_mlp": 1.10407472, + "epoch": 0.08156983455175067, + "flos": 679313848320.0, + "grad_norm": 0.12206645659912072, + "language_loss": 0.90398526, + "learning_rate": 0.0009930450462022435, + "loss": 0.91539013, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.36376953, + "step": 424, + "time_per_iteration": 2.8079323768615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300116, + "balance_loss_mlp": 1.2751298, + "epoch": 0.08176221623701424, + "flos": 1452577135104.0, + "grad_norm": 0.07506497844528874, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80489922, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.24902344, + "step": 425, + "time_per_iteration": 4.905512809753418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121946, + "balance_loss_mlp": 1.08668423, + "epoch": 0.0819545979222778, + "flos": 1556034071040.0, + "grad_norm": 0.10499242287280508, + "language_loss": 0.89529157, + "learning_rate": 0.0009929410994402065, + "loss": 0.90651101, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.35327148, + "step": 426, + "time_per_iteration": 3.7398970127105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141941, + "balance_loss_mlp": 1.1045804, + "epoch": 0.08214697960754136, + "flos": 512456398848.0, + "grad_norm": 0.10023640482449404, + "language_loss": 0.93921095, + "learning_rate": 0.0009928888389840196, + "loss": 0.95063031, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.3737793, + "step": 427, + "time_per_iteration": 2.71114182472229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119992, + "balance_loss_mlp": 1.08430111, + "epoch": 0.08233936129280492, + "flos": 594850646016.0, + "grad_norm": 0.11276239209208863, + "language_loss": 0.96473306, + "learning_rate": 0.0009928363871714147, + "loss": 0.97593296, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.35742188, + "step": 428, + "time_per_iteration": 2.719052314758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118917, + "balance_loss_mlp": 1.0826056, + "epoch": 0.08253174297806849, + "flos": 571758594048.0, + "grad_norm": 0.08720961611908505, + "language_loss": 0.91275012, + "learning_rate": 0.0009927837440227556, + "loss": 0.92393929, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.36303711, + "step": 429, + "time_per_iteration": 2.854044198989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098875, + "balance_loss_mlp": 1.06418514, + "epoch": 0.08272412466333205, + "flos": 623065623552.0, + "grad_norm": 0.07075242488451733, + "language_loss": 0.87952864, + "learning_rate": 0.0009927309095584798, + "loss": 0.89051735, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.34692383, + "step": 430, + "time_per_iteration": 2.9898674488067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102101, + "balance_loss_mlp": 1.06748247, + "epoch": 0.08291650634859561, + "flos": 513745542144.0, + "grad_norm": 0.11797379038125863, + "language_loss": 0.97102249, + "learning_rate": 0.0009926778837991, + "loss": 0.9820435, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.34643555, + "step": 431, + "time_per_iteration": 2.577531099319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111279, + "balance_loss_mlp": 1.07582581, + "epoch": 0.08310888803385917, + "flos": 667073083392.0, + "grad_norm": 0.09137951270996447, + "language_loss": 0.95161557, + "learning_rate": 0.000992624666765202, + "loss": 0.96272832, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.35498047, + "step": 432, + "time_per_iteration": 2.841384172439575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141665, + "balance_loss_mlp": 1.10540199, + "epoch": 0.08330126971912274, + "flos": 582995404800.0, + "grad_norm": 0.1226792169188856, + "language_loss": 0.92907685, + "learning_rate": 0.000992571258477447, + "loss": 0.94049346, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.36279297, + "step": 433, + "time_per_iteration": 2.810683250427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131869, + "balance_loss_mlp": 1.0957005, + "epoch": 0.0834936514043863, + "flos": 561064458240.0, + "grad_norm": 0.09107414958413955, + "language_loss": 0.88094407, + "learning_rate": 0.0009925176589565695, + "loss": 0.8922627, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.36206055, + "step": 434, + "time_per_iteration": 2.7925446033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112767, + "balance_loss_mlp": 1.09081006, + "epoch": 0.08368603308964986, + "flos": 494272046592.0, + "grad_norm": 0.12869710653201102, + "language_loss": 0.96048987, + "learning_rate": 0.0009924638682233791, + "loss": 0.97176659, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.36865234, + "step": 435, + "time_per_iteration": 2.578301191329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293384, + "balance_loss_mlp": 1.26963747, + "epoch": 0.08387841477491342, + "flos": 1388322312192.0, + "grad_norm": 0.05787730041443156, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80857974, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.23730469, + "step": 436, + "time_per_iteration": 4.577009201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105613, + "balance_loss_mlp": 1.07092249, + "epoch": 0.084070796460177, + "flos": 798642796032.0, + "grad_norm": 0.09893423016048233, + "language_loss": 0.86262441, + "learning_rate": 0.0009923557132036668, + "loss": 0.87368047, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.34716797, + "step": 437, + "time_per_iteration": 3.0512332916259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111937, + "balance_loss_mlp": 1.07641208, + "epoch": 0.08426317814544056, + "flos": 558681200640.0, + "grad_norm": 0.08022134137003532, + "language_loss": 0.92201281, + "learning_rate": 0.0009923013489591345, + "loss": 0.93313217, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.35571289, + "step": 438, + "time_per_iteration": 2.74950909614563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101532, + "balance_loss_mlp": 1.06724763, + "epoch": 0.08445555983070412, + "flos": 810057106944.0, + "grad_norm": 0.100162941065544, + "language_loss": 0.90520388, + "learning_rate": 0.0009922467935862681, + "loss": 0.91621923, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.34326172, + "step": 439, + "time_per_iteration": 3.0904464721679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117546, + "balance_loss_mlp": 1.08307123, + "epoch": 0.08464794151596768, + "flos": 509939311104.0, + "grad_norm": 0.0868598025723284, + "language_loss": 0.93269211, + "learning_rate": 0.0009921920471062478, + "loss": 0.94386756, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.34521484, + "step": 440, + "time_per_iteration": 2.5794718265533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129423, + "balance_loss_mlp": 1.09458995, + "epoch": 0.08484032320123125, + "flos": 556149556224.0, + "grad_norm": 0.08760481485615552, + "language_loss": 0.90004873, + "learning_rate": 0.0009921371095403281, + "loss": 0.91134298, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.34863281, + "step": 441, + "time_per_iteration": 2.6602251529693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146723, + "balance_loss_mlp": 1.11010158, + "epoch": 0.08503270488649481, + "flos": 527103742464.0, + "grad_norm": 0.0774335957746243, + "language_loss": 0.93349928, + "learning_rate": 0.0009920819809098379, + "loss": 0.9449665, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.3659668, + "step": 442, + "time_per_iteration": 2.601776123046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154219, + "balance_loss_mlp": 1.11693072, + "epoch": 0.08522508657175837, + "flos": 613989490176.0, + "grad_norm": 0.07362842569129122, + "language_loss": 0.88841242, + "learning_rate": 0.0009920266612361798, + "loss": 0.89995468, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.37255859, + "step": 443, + "time_per_iteration": 2.730400800704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134029, + "balance_loss_mlp": 1.09712195, + "epoch": 0.08541746825702193, + "flos": 619495119360.0, + "grad_norm": 0.07691784169579122, + "language_loss": 0.90311241, + "learning_rate": 0.0009919711505408308, + "loss": 0.91445279, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.36889648, + "step": 444, + "time_per_iteration": 2.784175395965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136596, + "balance_loss_mlp": 1.0992831, + "epoch": 0.08560984994228549, + "flos": 482671471104.0, + "grad_norm": 0.10632405925705127, + "language_loss": 0.87768185, + "learning_rate": 0.000991915448845342, + "loss": 0.8890478, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.37329102, + "step": 445, + "time_per_iteration": 2.5208120346069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131693, + "balance_loss_mlp": 1.09581065, + "epoch": 0.08580223162754906, + "flos": 516897027072.0, + "grad_norm": 0.08773057765175464, + "language_loss": 0.96764338, + "learning_rate": 0.000991859556171339, + "loss": 0.97896028, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.35888672, + "step": 446, + "time_per_iteration": 2.62111759185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121751, + "balance_loss_mlp": 1.08582091, + "epoch": 0.08599461331281262, + "flos": 531215511552.0, + "grad_norm": 0.09700121256693707, + "language_loss": 0.97393352, + "learning_rate": 0.000991803472540521, + "loss": 0.98515099, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.359375, + "step": 447, + "time_per_iteration": 2.6023707389831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106051, + "balance_loss_mlp": 1.07155204, + "epoch": 0.08618699499807618, + "flos": 789966743040.0, + "grad_norm": 0.08203891217845936, + "language_loss": 0.9339667, + "learning_rate": 0.0009917471979746615, + "loss": 0.94502723, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.34521484, + "step": 448, + "time_per_iteration": 3.032045841217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108032, + "balance_loss_mlp": 1.07288861, + "epoch": 0.08637937668333974, + "flos": 565707317760.0, + "grad_norm": 0.07141468257554369, + "language_loss": 0.93266523, + "learning_rate": 0.0009916907324956086, + "loss": 0.94374555, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.35180664, + "step": 449, + "time_per_iteration": 2.7145769596099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124616, + "balance_loss_mlp": 1.08820987, + "epoch": 0.08657175836860331, + "flos": 444930665472.0, + "grad_norm": 0.07969277456361384, + "language_loss": 0.88546509, + "learning_rate": 0.0009916340761252837, + "loss": 0.89671123, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.36376953, + "step": 450, + "time_per_iteration": 2.623152017593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137489, + "balance_loss_mlp": 1.10108209, + "epoch": 0.08676414005386687, + "flos": 843789450240.0, + "grad_norm": 0.11402885145068274, + "language_loss": 0.86408567, + "learning_rate": 0.0009915772288856832, + "loss": 0.87546057, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.36474609, + "step": 451, + "time_per_iteration": 3.069053888320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137973, + "balance_loss_mlp": 1.10178065, + "epoch": 0.08695652173913043, + "flos": 602995608576.0, + "grad_norm": 0.09443027615205003, + "language_loss": 0.88496101, + "learning_rate": 0.000991520190798877, + "loss": 0.89634073, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.36206055, + "step": 452, + "time_per_iteration": 2.8196520805358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146235, + "balance_loss_mlp": 1.10906577, + "epoch": 0.08714890342439399, + "flos": 730423028736.0, + "grad_norm": 0.10286670415776202, + "language_loss": 0.95532084, + "learning_rate": 0.0009914629618870089, + "loss": 0.96678317, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.37158203, + "step": 453, + "time_per_iteration": 2.8787243366241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247018, + "balance_loss_mlp": 1.22422564, + "epoch": 0.08734128510965757, + "flos": 1481518232064.0, + "grad_norm": 0.049899161357568285, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79922891, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.22753906, + "step": 454, + "time_per_iteration": 4.787290811538696 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212398, + "balance_loss_mlp": 1.19036818, + "epoch": 0.08753366679492113, + "flos": 1522214083584.0, + "grad_norm": 0.0324381166824538, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82640362, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.22070312, + "step": 455, + "time_per_iteration": 4.818731784820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120368, + "balance_loss_mlp": 1.08324623, + "epoch": 0.08772604848018468, + "flos": 720935078400.0, + "grad_norm": 0.09487211541236003, + "language_loss": 0.89355373, + "learning_rate": 0.0009912901304235883, + "loss": 0.90475744, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.37133789, + "step": 456, + "time_per_iteration": 2.8851993083953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011174, + "balance_loss_mlp": 1.08108902, + "epoch": 0.08791843016544824, + "flos": 707926086144.0, + "grad_norm": 0.09303414624011808, + "language_loss": 0.85744059, + "learning_rate": 0.000991232138434397, + "loss": 0.86861455, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.36352539, + "step": 457, + "time_per_iteration": 2.8450586795806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118359, + "balance_loss_mlp": 1.08126163, + "epoch": 0.08811081185071182, + "flos": 472799407104.0, + "grad_norm": 0.11356405017629323, + "language_loss": 0.91543031, + "learning_rate": 0.000991173955731976, + "loss": 0.92661393, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.37084961, + "step": 458, + "time_per_iteration": 2.6324169635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117714, + "balance_loss_mlp": 1.08190393, + "epoch": 0.08830319353597538, + "flos": 684647769600.0, + "grad_norm": 0.08091220448679284, + "language_loss": 0.98039645, + "learning_rate": 0.0009911155823389137, + "loss": 0.99157357, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.3581543, + "step": 459, + "time_per_iteration": 2.9783670902252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121069, + "balance_loss_mlp": 1.08451915, + "epoch": 0.08849557522123894, + "flos": 573235411968.0, + "grad_norm": 0.0940583187075056, + "language_loss": 0.93095994, + "learning_rate": 0.000991057018277873, + "loss": 0.94217062, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.36499023, + "step": 460, + "time_per_iteration": 2.742830276489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112002, + "balance_loss_mlp": 1.08380461, + "epoch": 0.0886879569065025, + "flos": 564303283200.0, + "grad_norm": 0.10556048763009983, + "language_loss": 0.92411214, + "learning_rate": 0.0009909982635715898, + "loss": 0.93531239, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.36279297, + "step": 461, + "time_per_iteration": 2.613490581512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111287, + "balance_loss_mlp": 1.07595301, + "epoch": 0.08888033859176607, + "flos": 563609249280.0, + "grad_norm": 0.07908948831956038, + "language_loss": 0.92236221, + "learning_rate": 0.0009909393182428751, + "loss": 0.93347514, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.35351562, + "step": 462, + "time_per_iteration": 2.654144048690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109293, + "balance_loss_mlp": 1.07331538, + "epoch": 0.08907272027702963, + "flos": 465517214208.0, + "grad_norm": 0.06646518051532449, + "language_loss": 0.87202108, + "learning_rate": 0.000990880182314614, + "loss": 0.88311398, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.359375, + "step": 463, + "time_per_iteration": 2.705138921737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108897, + "balance_loss_mlp": 1.07473207, + "epoch": 0.08926510196229319, + "flos": 681200921088.0, + "grad_norm": 0.06803924695737752, + "language_loss": 0.88676465, + "learning_rate": 0.0009908208558097643, + "loss": 0.89785367, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.34204102, + "step": 464, + "time_per_iteration": 2.971322536468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120032, + "balance_loss_mlp": 1.08412576, + "epoch": 0.08945748364755675, + "flos": 596411831808.0, + "grad_norm": 0.15708102336048957, + "language_loss": 0.90012753, + "learning_rate": 0.000990761338751359, + "loss": 0.91132784, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.35913086, + "step": 465, + "time_per_iteration": 2.7719008922576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01301625, + "balance_loss_mlp": 1.28073931, + "epoch": 0.08964986533282032, + "flos": 1585082400768.0, + "grad_norm": 0.06799997970585842, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74961245, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.20898438, + "step": 466, + "time_per_iteration": 4.991540193557739 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131475, + "balance_loss_mlp": 1.09637952, + "epoch": 0.08984224701808388, + "flos": 533268499968.0, + "grad_norm": 0.10779867371948758, + "language_loss": 0.9214865, + "learning_rate": 0.0009906417330663815, + "loss": 0.93280125, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.35131836, + "step": 467, + "time_per_iteration": 2.7089412212371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124394, + "balance_loss_mlp": 1.08917928, + "epoch": 0.09003462870334744, + "flos": 478702296576.0, + "grad_norm": 0.08471126953208015, + "language_loss": 0.88495421, + "learning_rate": 0.0009905816444862442, + "loss": 0.89619815, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.35253906, + "step": 468, + "time_per_iteration": 2.616262435913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129147, + "balance_loss_mlp": 1.09371758, + "epoch": 0.090227010388611, + "flos": 653307448320.0, + "grad_norm": 0.07702844129808738, + "language_loss": 0.87126988, + "learning_rate": 0.0009905213654454216, + "loss": 0.88256133, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.35473633, + "step": 469, + "time_per_iteration": 2.9097750186920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143119, + "balance_loss_mlp": 1.10678387, + "epoch": 0.09041939207387456, + "flos": 617894645760.0, + "grad_norm": 0.09194049655048094, + "language_loss": 0.92914081, + "learning_rate": 0.0009904608959673158, + "loss": 0.9405719, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.36328125, + "step": 470, + "time_per_iteration": 2.8030929565429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141965, + "balance_loss_mlp": 1.10491443, + "epoch": 0.09061177375913813, + "flos": 454137808896.0, + "grad_norm": 0.10933441897375067, + "language_loss": 0.92262268, + "learning_rate": 0.000990400236075403, + "loss": 0.93404239, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.37036133, + "step": 471, + "time_per_iteration": 2.4859976768493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117092, + "balance_loss_mlp": 1.08183014, + "epoch": 0.0908041554444017, + "flos": 543982984704.0, + "grad_norm": 0.08808088949589198, + "language_loss": 0.90884256, + "learning_rate": 0.0009903393857932338, + "loss": 0.92001355, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.35302734, + "step": 472, + "time_per_iteration": 2.6540582180023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115458, + "balance_loss_mlp": 1.07933736, + "epoch": 0.09099653712966525, + "flos": 564052999680.0, + "grad_norm": 0.08261940405294126, + "language_loss": 0.88272375, + "learning_rate": 0.0009902783451444317, + "loss": 0.89387828, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.36108398, + "step": 473, + "time_per_iteration": 2.7061197757720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116197, + "balance_loss_mlp": 1.0812211, + "epoch": 0.09118891881492881, + "flos": 474300956160.0, + "grad_norm": 0.11656166861680099, + "language_loss": 0.93563545, + "learning_rate": 0.0009902171141526956, + "loss": 0.94679749, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.34960938, + "step": 474, + "time_per_iteration": 2.524653911590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111201, + "balance_loss_mlp": 1.0760566, + "epoch": 0.09138130050019239, + "flos": 545579076096.0, + "grad_norm": 0.07692578036886621, + "language_loss": 0.81933677, + "learning_rate": 0.000990155692841797, + "loss": 0.83045685, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.35961914, + "step": 475, + "time_per_iteration": 2.9645543098449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106767, + "balance_loss_mlp": 1.07281613, + "epoch": 0.09157368218545595, + "flos": 732397441536.0, + "grad_norm": 0.08052092373184025, + "language_loss": 0.93009984, + "learning_rate": 0.0009900940812355818, + "loss": 0.94116753, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.33959961, + "step": 476, + "time_per_iteration": 2.8816893100738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107557, + "balance_loss_mlp": 1.07289076, + "epoch": 0.0917660638707195, + "flos": 610709967360.0, + "grad_norm": 0.14442514829584613, + "language_loss": 0.87309504, + "learning_rate": 0.00099003227935797, + "loss": 0.88417065, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.34716797, + "step": 477, + "time_per_iteration": 2.760615825653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122629, + "balance_loss_mlp": 1.08827257, + "epoch": 0.09195844555598306, + "flos": 655561257984.0, + "grad_norm": 0.12539398809889843, + "language_loss": 0.9113583, + "learning_rate": 0.000989970287232955, + "loss": 0.92258459, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.34399414, + "step": 478, + "time_per_iteration": 2.826150894165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119533, + "balance_loss_mlp": 1.08720374, + "epoch": 0.09215082724124664, + "flos": 476339387904.0, + "grad_norm": 0.06731886459053077, + "language_loss": 0.89701962, + "learning_rate": 0.0009899081048846043, + "loss": 0.90821493, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.32324219, + "step": 479, + "time_per_iteration": 2.580028772354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143549, + "balance_loss_mlp": 1.1092639, + "epoch": 0.0923432089265102, + "flos": 524051182080.0, + "grad_norm": 0.1155425244176876, + "language_loss": 0.9372611, + "learning_rate": 0.0009898457323370593, + "loss": 0.94869661, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.34301758, + "step": 480, + "time_per_iteration": 2.6090288162231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135277, + "balance_loss_mlp": 1.10132647, + "epoch": 0.09253559061177376, + "flos": 545302651392.0, + "grad_norm": 0.08946460297910715, + "language_loss": 0.92488086, + "learning_rate": 0.000989783169614535, + "loss": 0.93623364, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.33984375, + "step": 481, + "time_per_iteration": 2.6434848308563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130787, + "balance_loss_mlp": 1.28212094, + "epoch": 0.09272797229703732, + "flos": 1537222219776.0, + "grad_norm": 0.06384431456169105, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80060625, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.2578125, + "step": 482, + "time_per_iteration": 4.903714656829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120119, + "balance_loss_mlp": 1.08695483, + "epoch": 0.09292035398230089, + "flos": 689501624832.0, + "grad_norm": 0.0974321715773629, + "language_loss": 0.90389109, + "learning_rate": 0.000989657473741779, + "loss": 0.91509223, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.33178711, + "step": 483, + "time_per_iteration": 2.841749668121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132007, + "balance_loss_mlp": 1.09858036, + "epoch": 0.09311273566756445, + "flos": 509482414080.0, + "grad_norm": 0.07196755449742197, + "language_loss": 0.91361248, + "learning_rate": 0.0009895943406403465, + "loss": 0.9249326, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.33447266, + "step": 484, + "time_per_iteration": 2.728733539581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146209, + "balance_loss_mlp": 1.11137581, + "epoch": 0.09330511735282801, + "flos": 659111413248.0, + "grad_norm": 0.10097789553078372, + "language_loss": 0.84299308, + "learning_rate": 0.0009895310174615338, + "loss": 0.85445517, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.34863281, + "step": 485, + "time_per_iteration": 2.74460506439209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214233, + "balance_loss_mlp": 1.19239426, + "epoch": 0.09349749903809157, + "flos": 1452054809088.0, + "grad_norm": 0.04007792490845654, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76932752, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.21875, + "step": 486, + "time_per_iteration": 4.653090715408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135045, + "balance_loss_mlp": 1.10161829, + "epoch": 0.09368988072335514, + "flos": 520614508032.0, + "grad_norm": 0.07938978312310574, + "language_loss": 0.89514428, + "learning_rate": 0.0009894038009701782, + "loss": 0.90649474, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.33447266, + "step": 487, + "time_per_iteration": 2.6534616947174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145632, + "balance_loss_mlp": 1.1106087, + "epoch": 0.0938822624086187, + "flos": 497502107136.0, + "grad_norm": 0.09344776572677456, + "language_loss": 0.87733328, + "learning_rate": 0.0009893399077070253, + "loss": 0.88878953, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.35083008, + "step": 488, + "time_per_iteration": 2.5616586208343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130599, + "balance_loss_mlp": 1.09702933, + "epoch": 0.09407464409388226, + "flos": 532948405248.0, + "grad_norm": 0.08887912188605798, + "language_loss": 0.87485397, + "learning_rate": 0.0009892758244652718, + "loss": 0.8861599, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.3359375, + "step": 489, + "time_per_iteration": 2.6878652572631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114876, + "balance_loss_mlp": 1.08078194, + "epoch": 0.09426702577914582, + "flos": 585736634880.0, + "grad_norm": 0.08770205653150476, + "language_loss": 0.91117108, + "learning_rate": 0.0009892115512697968, + "loss": 0.92231989, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.34130859, + "step": 490, + "time_per_iteration": 2.67647123336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113824, + "balance_loss_mlp": 1.0808506, + "epoch": 0.0944594074644094, + "flos": 503081929728.0, + "grad_norm": 0.06826247830552083, + "language_loss": 0.94586283, + "learning_rate": 0.0009891470881455537, + "loss": 0.95700109, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.32983398, + "step": 491, + "time_per_iteration": 2.7388105392456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109252, + "balance_loss_mlp": 1.07627821, + "epoch": 0.09465178914967295, + "flos": 570748847616.0, + "grad_norm": 0.08083030362482532, + "language_loss": 0.90903842, + "learning_rate": 0.0009890824351175692, + "loss": 0.92013097, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.32983398, + "step": 492, + "time_per_iteration": 2.710557222366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108934, + "balance_loss_mlp": 1.07586551, + "epoch": 0.09484417083493651, + "flos": 549098707968.0, + "grad_norm": 0.07986708443523517, + "language_loss": 0.96040058, + "learning_rate": 0.0009890175922109435, + "loss": 0.97148991, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.33081055, + "step": 493, + "time_per_iteration": 2.748145341873169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119937, + "balance_loss_mlp": 1.08686852, + "epoch": 0.09503655252020007, + "flos": 823552109568.0, + "grad_norm": 0.1003982234968368, + "language_loss": 0.93827844, + "learning_rate": 0.0009889525594508513, + "loss": 0.94947779, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.33081055, + "step": 494, + "time_per_iteration": 2.9940547943115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113416, + "balance_loss_mlp": 1.08037138, + "epoch": 0.09522893420546363, + "flos": 404397757440.0, + "grad_norm": 0.06206488721584602, + "language_loss": 0.88783181, + "learning_rate": 0.0009888873368625404, + "loss": 0.89896601, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.33056641, + "step": 495, + "time_per_iteration": 2.5122387409210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129, + "balance_loss_mlp": 1.09557426, + "epoch": 0.0954213158907272, + "flos": 690707810304.0, + "grad_norm": 0.08099902604416225, + "language_loss": 0.9180485, + "learning_rate": 0.0009888219244713326, + "loss": 0.92933846, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.33447266, + "step": 496, + "time_per_iteration": 2.8516368865966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145932, + "balance_loss_mlp": 1.11152768, + "epoch": 0.09561369757599077, + "flos": 518739019776.0, + "grad_norm": 0.09295440988952328, + "language_loss": 0.91113585, + "learning_rate": 0.0009887563223026229, + "loss": 0.92259514, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.34423828, + "step": 497, + "time_per_iteration": 2.7165610790252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226975, + "balance_loss_mlp": 1.20780587, + "epoch": 0.09580607926125433, + "flos": 1384825849344.0, + "grad_norm": 0.04473280554485948, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80295134, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.19140625, + "step": 498, + "time_per_iteration": 4.9133830070495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158093, + "balance_loss_mlp": 1.12261629, + "epoch": 0.09599846094651789, + "flos": 717090969600.0, + "grad_norm": 0.0716278208231272, + "language_loss": 0.91129965, + "learning_rate": 0.0009886245487346482, + "loss": 0.92288053, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.35522461, + "step": 499, + "time_per_iteration": 3.074453353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151408, + "balance_loss_mlp": 1.1164794, + "epoch": 0.09619084263178146, + "flos": 385824909312.0, + "grad_norm": 0.09258819117654143, + "language_loss": 0.93041325, + "learning_rate": 0.0009885583773865422, + "loss": 0.94192737, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.34912109, + "step": 500, + "time_per_iteration": 2.4206974506378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129637, + "balance_loss_mlp": 1.09482849, + "epoch": 0.09638322431704502, + "flos": 533869401600.0, + "grad_norm": 0.08421486249996342, + "language_loss": 0.90840685, + "learning_rate": 0.0009884920163632524, + "loss": 0.9197033, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.34814453, + "step": 501, + "time_per_iteration": 2.653083324432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133899, + "balance_loss_mlp": 1.09875655, + "epoch": 0.09657560600230858, + "flos": 500426629632.0, + "grad_norm": 0.08831216016047307, + "language_loss": 0.92406952, + "learning_rate": 0.000988425465690543, + "loss": 0.93540847, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.35180664, + "step": 502, + "time_per_iteration": 2.5902318954467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129227, + "balance_loss_mlp": 1.09363079, + "epoch": 0.09676798768757214, + "flos": 528995197440.0, + "grad_norm": 0.08884204924947281, + "language_loss": 0.89819443, + "learning_rate": 0.0009883587253942505, + "loss": 0.90948665, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.35595703, + "step": 503, + "time_per_iteration": 2.7927231788635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134871, + "balance_loss_mlp": 1.09956098, + "epoch": 0.09696036937283571, + "flos": 463379857920.0, + "grad_norm": 0.08422879575374595, + "language_loss": 0.96091402, + "learning_rate": 0.0009882917955002862, + "loss": 0.97226262, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.35302734, + "step": 504, + "time_per_iteration": 2.538280963897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117737, + "balance_loss_mlp": 1.08297515, + "epoch": 0.09715275105809927, + "flos": 534716204544.0, + "grad_norm": 0.07639016770494517, + "language_loss": 0.89420688, + "learning_rate": 0.0009882246760346343, + "loss": 0.9053843, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.34790039, + "step": 505, + "time_per_iteration": 2.6242942810058594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124058, + "balance_loss_mlp": 1.08834267, + "epoch": 0.09734513274336283, + "flos": 454713979392.0, + "grad_norm": 0.11518068103281653, + "language_loss": 0.92468822, + "learning_rate": 0.0009881573670233533, + "loss": 0.93592882, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.35742188, + "step": 506, + "time_per_iteration": 2.516587734222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114247, + "balance_loss_mlp": 1.08074903, + "epoch": 0.09753751442862639, + "flos": 508551243264.0, + "grad_norm": 0.07574597822432369, + "language_loss": 0.8811729, + "learning_rate": 0.0009880898684925747, + "loss": 0.89231527, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.33520508, + "step": 507, + "time_per_iteration": 2.693880081176758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107378, + "balance_loss_mlp": 1.07402313, + "epoch": 0.09772989611388996, + "flos": 484030425600.0, + "grad_norm": 0.07603441014422499, + "language_loss": 0.86951101, + "learning_rate": 0.0009880221804685037, + "loss": 0.88058472, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.33374023, + "step": 508, + "time_per_iteration": 2.5847270488739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01468428, + "balance_loss_mlp": 1.44983101, + "epoch": 0.09792227779915352, + "flos": 1565306339328.0, + "grad_norm": 0.12348847609036423, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80812848, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.18554688, + "step": 509, + "time_per_iteration": 4.756307601928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123963, + "balance_loss_mlp": 1.09103727, + "epoch": 0.09811465948441708, + "flos": 587529165312.0, + "grad_norm": 0.08757433726580034, + "language_loss": 0.93106389, + "learning_rate": 0.0009878862360456733, + "loss": 0.9423036, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.32910156, + "step": 510, + "time_per_iteration": 2.6813509464263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110714, + "balance_loss_mlp": 1.07759809, + "epoch": 0.09830704116968064, + "flos": 612719285760.0, + "grad_norm": 0.08240718915912659, + "language_loss": 0.86918676, + "learning_rate": 0.0009878179796996922, + "loss": 0.88029397, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.33129883, + "step": 511, + "time_per_iteration": 2.7128310203552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113196, + "balance_loss_mlp": 1.08112836, + "epoch": 0.09849942285494422, + "flos": 538528227840.0, + "grad_norm": 0.07802243599022093, + "language_loss": 0.90101254, + "learning_rate": 0.0009877495339659754, + "loss": 0.91214454, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.32055664, + "step": 512, + "time_per_iteration": 2.8097684383392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102205, + "balance_loss_mlp": 1.07035255, + "epoch": 0.09869180454020778, + "flos": 620193535488.0, + "grad_norm": 0.09144065810451378, + "language_loss": 0.850245, + "learning_rate": 0.000987680898871096, + "loss": 0.86126709, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.31835938, + "step": 513, + "time_per_iteration": 2.7469441890716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108259, + "balance_loss_mlp": 1.07502341, + "epoch": 0.09888418622547133, + "flos": 811375363584.0, + "grad_norm": 0.10540688433367246, + "language_loss": 0.85520494, + "learning_rate": 0.0009876120744417, + "loss": 0.86628759, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.33251953, + "step": 514, + "time_per_iteration": 2.9515652656555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101037, + "balance_loss_mlp": 1.06818295, + "epoch": 0.0990765679107349, + "flos": 535548450816.0, + "grad_norm": 0.09508855922632749, + "language_loss": 0.93521011, + "learning_rate": 0.0009875430607045078, + "loss": 0.94622052, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.32861328, + "step": 515, + "time_per_iteration": 2.7193381786346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109471, + "balance_loss_mlp": 1.06164145, + "epoch": 0.09926894959599845, + "flos": 587607740928.0, + "grad_norm": 0.07449645219133615, + "language_loss": 0.90591514, + "learning_rate": 0.000987473857686313, + "loss": 0.91686225, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.33081055, + "step": 516, + "time_per_iteration": 2.7179975509643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115382, + "balance_loss_mlp": 1.08100188, + "epoch": 0.09946133128126203, + "flos": 640947409920.0, + "grad_norm": 0.10856360121839106, + "language_loss": 0.92182052, + "learning_rate": 0.0009874044654139824, + "loss": 0.9329744, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.34423828, + "step": 517, + "time_per_iteration": 2.7596991062164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135092, + "balance_loss_mlp": 1.10104585, + "epoch": 0.09965371296652559, + "flos": 465546327552.0, + "grad_norm": 0.10414801938878855, + "language_loss": 0.9130857, + "learning_rate": 0.0009873348839144563, + "loss": 0.92443669, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.34082031, + "step": 518, + "time_per_iteration": 2.5228705406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117267, + "balance_loss_mlp": 1.1381228, + "epoch": 0.09984609465178915, + "flos": 483365505024.0, + "grad_norm": 0.09626367264756285, + "language_loss": 0.94683075, + "learning_rate": 0.000987265113214749, + "loss": 0.95855749, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.34545898, + "step": 519, + "time_per_iteration": 2.5458812713623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118966, + "balance_loss_mlp": 1.15339625, + "epoch": 0.1000384763370527, + "flos": 568764260352.0, + "grad_norm": 0.12320854939875277, + "language_loss": 0.94298297, + "learning_rate": 0.0009871951533419476, + "loss": 0.95487958, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.36279297, + "step": 520, + "time_per_iteration": 2.663461208343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156318, + "balance_loss_mlp": 1.12010193, + "epoch": 0.10023085802231628, + "flos": 545515057152.0, + "grad_norm": 0.08720896475780489, + "language_loss": 0.86881042, + "learning_rate": 0.0009871250043232132, + "loss": 0.8803736, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.36206055, + "step": 521, + "time_per_iteration": 2.7820796966552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140376, + "balance_loss_mlp": 1.1049943, + "epoch": 0.10042323970757984, + "flos": 503208557568.0, + "grad_norm": 0.08876661910472074, + "language_loss": 0.85204661, + "learning_rate": 0.0009870546661857797, + "loss": 0.86345041, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.35375977, + "step": 522, + "time_per_iteration": 2.634274482727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152268, + "balance_loss_mlp": 1.11583781, + "epoch": 0.1006156213928434, + "flos": 770084402688.0, + "grad_norm": 0.08623162465623763, + "language_loss": 0.92886114, + "learning_rate": 0.0009869841389569553, + "loss": 0.94038385, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.36401367, + "step": 523, + "time_per_iteration": 3.0027353763580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151068, + "balance_loss_mlp": 1.11571026, + "epoch": 0.10080800307810696, + "flos": 489786338304.0, + "grad_norm": 0.07820731611640971, + "language_loss": 0.86882633, + "learning_rate": 0.0009869134226641206, + "loss": 0.880337, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.35424805, + "step": 524, + "time_per_iteration": 2.5850446224212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159546, + "balance_loss_mlp": 1.12330627, + "epoch": 0.10100038476337053, + "flos": 454478252544.0, + "grad_norm": 0.07931950894681525, + "language_loss": 0.86448371, + "learning_rate": 0.0009868425173347303, + "loss": 0.8760792, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.36254883, + "step": 525, + "time_per_iteration": 2.6873726844787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171885, + "balance_loss_mlp": 1.13617015, + "epoch": 0.10119276644863409, + "flos": 556155348480.0, + "grad_norm": 0.09671662269899156, + "language_loss": 0.94872439, + "learning_rate": 0.0009867714229963125, + "loss": 0.96044326, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.35717773, + "step": 526, + "time_per_iteration": 2.697547197341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155786, + "balance_loss_mlp": 1.12083411, + "epoch": 0.10138514813389765, + "flos": 515990587392.0, + "grad_norm": 0.10324452979849556, + "language_loss": 0.9236598, + "learning_rate": 0.000986700139676468, + "loss": 0.93521762, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.34960938, + "step": 527, + "time_per_iteration": 2.5702626705169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171342, + "balance_loss_mlp": 1.1346494, + "epoch": 0.10157752981916121, + "flos": 500323322880.0, + "grad_norm": 0.08227699709590157, + "language_loss": 0.89510548, + "learning_rate": 0.0009866286674028717, + "loss": 0.90681893, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.36694336, + "step": 528, + "time_per_iteration": 2.699542284011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141961, + "balance_loss_mlp": 1.1081537, + "epoch": 0.10176991150442478, + "flos": 656444376576.0, + "grad_norm": 0.0843490367773928, + "language_loss": 0.8638742, + "learning_rate": 0.0009865570062032717, + "loss": 0.87529385, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.33837891, + "step": 529, + "time_per_iteration": 2.941728353500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114882, + "balance_loss_mlp": 1.11420166, + "epoch": 0.10196229318968834, + "flos": 572974953984.0, + "grad_norm": 0.07671472850746988, + "language_loss": 0.9148134, + "learning_rate": 0.0009864851561054893, + "loss": 0.9263016, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.34643555, + "step": 530, + "time_per_iteration": 2.7894959449768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147452, + "balance_loss_mlp": 1.1134541, + "epoch": 0.1021546748749519, + "flos": 517946061312.0, + "grad_norm": 0.08702044825545475, + "language_loss": 0.90471494, + "learning_rate": 0.0009864131171374191, + "loss": 0.91618943, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.34033203, + "step": 531, + "time_per_iteration": 2.6681158542633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144724, + "balance_loss_mlp": 1.11139297, + "epoch": 0.10234705656021546, + "flos": 609470286336.0, + "grad_norm": 0.0664826941787488, + "language_loss": 0.89538574, + "learning_rate": 0.0009863408893270292, + "loss": 0.90683293, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.33349609, + "step": 532, + "time_per_iteration": 2.7965428829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129812, + "balance_loss_mlp": 1.09576535, + "epoch": 0.10253943824547904, + "flos": 601473710592.0, + "grad_norm": 0.08878024025613328, + "language_loss": 0.84706688, + "learning_rate": 0.0009862684727023605, + "loss": 0.858365, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.34082031, + "step": 533, + "time_per_iteration": 2.7238268852233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116987, + "balance_loss_mlp": 1.08453798, + "epoch": 0.1027318199307426, + "flos": 662647011840.0, + "grad_norm": 0.1682383439962665, + "language_loss": 0.87668955, + "learning_rate": 0.0009861958672915283, + "loss": 0.8878594, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.32446289, + "step": 534, + "time_per_iteration": 2.7945988178253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096267, + "balance_loss_mlp": 1.06415248, + "epoch": 0.10292420161600616, + "flos": 682962928128.0, + "grad_norm": 0.0654465541126679, + "language_loss": 0.88598454, + "learning_rate": 0.0009861230731227201, + "loss": 0.89694726, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.32104492, + "step": 535, + "time_per_iteration": 2.8504462242126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094849, + "balance_loss_mlp": 1.06180418, + "epoch": 0.10311658330126972, + "flos": 490042414080.0, + "grad_norm": 0.09703481929017231, + "language_loss": 0.90092826, + "learning_rate": 0.0009860500902241973, + "loss": 0.91187674, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.33056641, + "step": 536, + "time_per_iteration": 2.6230618953704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093921, + "balance_loss_mlp": 1.06028032, + "epoch": 0.10330896498653329, + "flos": 431508446208.0, + "grad_norm": 0.07541190921269121, + "language_loss": 0.94890571, + "learning_rate": 0.0009859769186242942, + "loss": 0.95984495, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.33642578, + "step": 537, + "time_per_iteration": 2.5023155212402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090493, + "balance_loss_mlp": 1.05802083, + "epoch": 0.10350134667179685, + "flos": 549330052608.0, + "grad_norm": 0.08038513642950565, + "language_loss": 0.87629044, + "learning_rate": 0.0009859035583514187, + "loss": 0.88719535, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.32470703, + "step": 538, + "time_per_iteration": 2.617408514022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102021, + "balance_loss_mlp": 1.06885695, + "epoch": 0.10369372835706041, + "flos": 640327569408.0, + "grad_norm": 0.08463096218018039, + "language_loss": 0.88947332, + "learning_rate": 0.0009858300094340517, + "loss": 0.9004935, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.33178711, + "step": 539, + "time_per_iteration": 2.7788918018341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102321, + "balance_loss_mlp": 1.06989646, + "epoch": 0.10388611004232397, + "flos": 521500598784.0, + "grad_norm": 0.08363201697238119, + "language_loss": 0.84166092, + "learning_rate": 0.0009857562719007473, + "loss": 0.85268414, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.32421875, + "step": 540, + "time_per_iteration": 2.6021273136138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106344, + "balance_loss_mlp": 1.07349014, + "epoch": 0.10407849172758753, + "flos": 702111946752.0, + "grad_norm": 0.07699058030721453, + "language_loss": 0.86313522, + "learning_rate": 0.0009856823457801331, + "loss": 0.87419868, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.32861328, + "step": 541, + "time_per_iteration": 2.898247003555298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121037, + "balance_loss_mlp": 1.0881114, + "epoch": 0.1042708734128511, + "flos": 502652736000.0, + "grad_norm": 0.09427475874312204, + "language_loss": 0.92884254, + "learning_rate": 0.00098560823110091, + "loss": 0.94005299, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.3293457, + "step": 542, + "time_per_iteration": 2.628246784210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117484, + "balance_loss_mlp": 1.08441556, + "epoch": 0.10446325509811466, + "flos": 485331153408.0, + "grad_norm": 0.09038961872332987, + "language_loss": 0.93836176, + "learning_rate": 0.000985533927891851, + "loss": 0.94953668, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.33081055, + "step": 543, + "time_per_iteration": 2.6802377700805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104521, + "balance_loss_mlp": 1.07114232, + "epoch": 0.10465563678337822, + "flos": 568365590016.0, + "grad_norm": 0.07979198382497373, + "language_loss": 0.91847962, + "learning_rate": 0.0009854594361818044, + "loss": 0.9295249, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.33398438, + "step": 544, + "time_per_iteration": 2.6934244632720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097218, + "balance_loss_mlp": 1.06372046, + "epoch": 0.10484801846864178, + "flos": 625806853632.0, + "grad_norm": 0.070981397623147, + "language_loss": 0.91175914, + "learning_rate": 0.0009853847559996897, + "loss": 0.92273128, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.33520508, + "step": 545, + "time_per_iteration": 2.7615010738372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121637, + "balance_loss_mlp": 1.08713746, + "epoch": 0.10504040015390535, + "flos": 743063874048.0, + "grad_norm": 0.07225830349373973, + "language_loss": 0.90024251, + "learning_rate": 0.0009853098873745, + "loss": 0.91145885, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.34545898, + "step": 546, + "time_per_iteration": 2.995853900909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128427, + "balance_loss_mlp": 1.09407067, + "epoch": 0.10523278183916891, + "flos": 586382616576.0, + "grad_norm": 0.08430865527250554, + "language_loss": 0.89361405, + "learning_rate": 0.0009852348303353027, + "loss": 0.90489835, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.34399414, + "step": 547, + "time_per_iteration": 2.7888100147247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141582, + "balance_loss_mlp": 1.106511, + "epoch": 0.10542516352443247, + "flos": 869270552064.0, + "grad_norm": 0.07123259169118071, + "language_loss": 0.82929194, + "learning_rate": 0.000985159584911237, + "loss": 0.84070778, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.35107422, + "step": 548, + "time_per_iteration": 3.11181902885437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141867, + "balance_loss_mlp": 1.10658062, + "epoch": 0.10561754520969603, + "flos": 505182970368.0, + "grad_norm": 0.1040806422735416, + "language_loss": 0.89825702, + "learning_rate": 0.0009850841511315162, + "loss": 0.90967572, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.35327148, + "step": 549, + "time_per_iteration": 2.638000726699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130362, + "balance_loss_mlp": 1.09493339, + "epoch": 0.1058099268949596, + "flos": 559690947072.0, + "grad_norm": 0.07056487851665215, + "language_loss": 0.9078036, + "learning_rate": 0.0009850085290254256, + "loss": 0.9191072, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.35424805, + "step": 550, + "time_per_iteration": 2.774028778076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117666, + "balance_loss_mlp": 1.08273757, + "epoch": 0.10600230858022316, + "flos": 561773048832.0, + "grad_norm": 0.06745406591759516, + "language_loss": 0.87385082, + "learning_rate": 0.0009849327186223246, + "loss": 0.88502753, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.34936523, + "step": 551, + "time_per_iteration": 2.7669272422790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110145, + "balance_loss_mlp": 1.06845236, + "epoch": 0.10619469026548672, + "flos": 494079989760.0, + "grad_norm": 0.0691737715515626, + "language_loss": 0.94504517, + "learning_rate": 0.000984856719951646, + "loss": 0.95605963, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.33007812, + "step": 552, + "time_per_iteration": 2.5428550243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111854, + "balance_loss_mlp": 1.07747412, + "epoch": 0.10638707195075028, + "flos": 675843678720.0, + "grad_norm": 0.09712099675981889, + "language_loss": 0.91101605, + "learning_rate": 0.0009847805330428943, + "loss": 0.92213452, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.34399414, + "step": 553, + "time_per_iteration": 2.9055614471435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122894, + "balance_loss_mlp": 1.08846664, + "epoch": 0.10657945363601386, + "flos": 487811925504.0, + "grad_norm": 0.09294887941398464, + "language_loss": 0.92195344, + "learning_rate": 0.0009847041579256481, + "loss": 0.93318236, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.34448242, + "step": 554, + "time_per_iteration": 2.5995588302612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124167, + "balance_loss_mlp": 1.08859539, + "epoch": 0.10677183532127742, + "flos": 482706376704.0, + "grad_norm": 0.08058010800108027, + "language_loss": 0.94049567, + "learning_rate": 0.0009846275946295592, + "loss": 0.9517374, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.35595703, + "step": 555, + "time_per_iteration": 2.6564345359802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114817, + "balance_loss_mlp": 1.07919669, + "epoch": 0.10696421700654098, + "flos": 655917668352.0, + "grad_norm": 0.06398894491712905, + "language_loss": 0.86843902, + "learning_rate": 0.0009845508431843518, + "loss": 0.87958717, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.35620117, + "step": 556, + "time_per_iteration": 3.0014877319335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112252, + "balance_loss_mlp": 1.07675159, + "epoch": 0.10715659869180454, + "flos": 567483881472.0, + "grad_norm": 0.06905237280169106, + "language_loss": 0.87712479, + "learning_rate": 0.0009844739036198233, + "loss": 0.88824731, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.35522461, + "step": 557, + "time_per_iteration": 2.6663765907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126883, + "balance_loss_mlp": 1.09026217, + "epoch": 0.10734898037706811, + "flos": 540432829440.0, + "grad_norm": 0.08117667522677224, + "language_loss": 0.94649851, + "learning_rate": 0.0009843967759658448, + "loss": 0.95776731, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.36621094, + "step": 558, + "time_per_iteration": 2.6776351928710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01325803, + "balance_loss_mlp": 1.29795551, + "epoch": 0.10754136206233167, + "flos": 1475870008320.0, + "grad_norm": 0.07702272040631068, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74093556, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.27929688, + "step": 559, + "time_per_iteration": 4.862372398376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112585, + "balance_loss_mlp": 1.08906162, + "epoch": 0.10773374374759523, + "flos": 512155243008.0, + "grad_norm": 0.07411063690195181, + "language_loss": 0.94592023, + "learning_rate": 0.000984241956509384, + "loss": 0.95717871, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.36767578, + "step": 560, + "time_per_iteration": 2.6602537631988525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152944, + "balance_loss_mlp": 1.11455846, + "epoch": 0.10792612543285879, + "flos": 496261016064.0, + "grad_norm": 0.08630165838839422, + "language_loss": 0.89956963, + "learning_rate": 0.0009841642647670078, + "loss": 0.91109908, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.38378906, + "step": 561, + "time_per_iteration": 2.5539767742156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153249, + "balance_loss_mlp": 1.11433935, + "epoch": 0.10811850711812235, + "flos": 735131317248.0, + "grad_norm": 0.09499730641116207, + "language_loss": 0.84606594, + "learning_rate": 0.0009840863850553944, + "loss": 0.85759842, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.38867188, + "step": 562, + "time_per_iteration": 2.972862720489502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139333, + "balance_loss_mlp": 1.10261655, + "epoch": 0.10831088880338592, + "flos": 611257024512.0, + "grad_norm": 0.08740431235801023, + "language_loss": 0.90812922, + "learning_rate": 0.0009840083174047782, + "loss": 0.91952258, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.3671875, + "step": 563, + "time_per_iteration": 2.728081464767456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133161, + "balance_loss_mlp": 1.09739876, + "epoch": 0.10850327048864948, + "flos": 556022928384.0, + "grad_norm": 0.09202985623691126, + "language_loss": 0.85552108, + "learning_rate": 0.0009839300618454685, + "loss": 0.8668527, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.35791016, + "step": 564, + "time_per_iteration": 2.833817958831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130452, + "balance_loss_mlp": 1.09538078, + "epoch": 0.10869565217391304, + "flos": 602902476288.0, + "grad_norm": 0.06834466327041812, + "language_loss": 0.90596354, + "learning_rate": 0.0009838516184078466, + "loss": 0.91726804, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.35131836, + "step": 565, + "time_per_iteration": 2.8160781860351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154605, + "balance_loss_mlp": 1.1185081, + "epoch": 0.1088880338591766, + "flos": 525922288128.0, + "grad_norm": 0.07188227567019471, + "language_loss": 0.87634718, + "learning_rate": 0.0009837729871223669, + "loss": 0.88789332, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.36083984, + "step": 566, + "time_per_iteration": 2.62117600440979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177195, + "balance_loss_mlp": 1.1406219, + "epoch": 0.10908041554444017, + "flos": 619986921984.0, + "grad_norm": 0.08533641778088655, + "language_loss": 0.88115579, + "learning_rate": 0.0009836941680195568, + "loss": 0.89292771, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.36547852, + "step": 567, + "time_per_iteration": 2.828911542892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165998, + "balance_loss_mlp": 1.12994933, + "epoch": 0.10927279722970373, + "flos": 897740195328.0, + "grad_norm": 0.08003102464580239, + "language_loss": 0.83622086, + "learning_rate": 0.0009836151611300166, + "loss": 0.84788084, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.3605957, + "step": 568, + "time_per_iteration": 3.2273471355438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114699, + "balance_loss_mlp": 1.11177564, + "epoch": 0.10946517891496729, + "flos": 528408852480.0, + "grad_norm": 0.13762061821089808, + "language_loss": 0.94344527, + "learning_rate": 0.0009835359664844194, + "loss": 0.95491517, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.35253906, + "step": 569, + "time_per_iteration": 2.61690616607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424326, + "balance_loss_mlp": 1.39514339, + "epoch": 0.10965756060023085, + "flos": 1559944714752.0, + "grad_norm": 0.09677893451051751, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82461131, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.29101562, + "step": 570, + "time_per_iteration": 4.929012298583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129446, + "balance_loss_mlp": 1.09449339, + "epoch": 0.10984994228549443, + "flos": 512820163584.0, + "grad_norm": 0.10645850756285262, + "language_loss": 0.9142105, + "learning_rate": 0.0009833770140481118, + "loss": 0.92550498, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.34985352, + "step": 571, + "time_per_iteration": 2.6662757396698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122373, + "balance_loss_mlp": 1.08689654, + "epoch": 0.11004232397075799, + "flos": 954314307072.0, + "grad_norm": 0.12031633973381815, + "language_loss": 0.82440388, + "learning_rate": 0.000983297256319112, + "loss": 0.83562756, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.35522461, + "step": 572, + "time_per_iteration": 3.218076467514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134932, + "balance_loss_mlp": 1.09850204, + "epoch": 0.11023470565602154, + "flos": 487921024512.0, + "grad_norm": 0.08427819288291502, + "language_loss": 0.86899912, + "learning_rate": 0.000983217310957477, + "loss": 0.88034844, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.36425781, + "step": 573, + "time_per_iteration": 2.778571128845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144342, + "balance_loss_mlp": 1.10803151, + "epoch": 0.1104270873412851, + "flos": 655521970176.0, + "grad_norm": 0.06509507329480971, + "language_loss": 0.90168923, + "learning_rate": 0.000983137177994244, + "loss": 0.91313267, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.36352539, + "step": 574, + "time_per_iteration": 2.872412919998169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137496, + "balance_loss_mlp": 1.10221016, + "epoch": 0.11061946902654868, + "flos": 723097165824.0, + "grad_norm": 0.06653120926816534, + "language_loss": 0.85785711, + "learning_rate": 0.0009830568574605235, + "loss": 0.86923206, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.35302734, + "step": 575, + "time_per_iteration": 2.923383951187134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145913, + "balance_loss_mlp": 1.10984039, + "epoch": 0.11081185071181224, + "flos": 835113397248.0, + "grad_norm": 0.0865486301410286, + "language_loss": 0.87525302, + "learning_rate": 0.0009829763493874992, + "loss": 0.88671219, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.36083984, + "step": 576, + "time_per_iteration": 3.032942056655884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134136, + "balance_loss_mlp": 1.09753847, + "epoch": 0.1110042323970758, + "flos": 608776252416.0, + "grad_norm": 0.08630194081372794, + "language_loss": 0.93183506, + "learning_rate": 0.0009828956538064264, + "loss": 0.94317639, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.36621094, + "step": 577, + "time_per_iteration": 2.8152406215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125176, + "balance_loss_mlp": 1.0888648, + "epoch": 0.11119661408233936, + "flos": 595643604480.0, + "grad_norm": 0.07101537919866721, + "language_loss": 0.90824157, + "learning_rate": 0.0009828147707486344, + "loss": 0.91949332, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.36328125, + "step": 578, + "time_per_iteration": 2.724550485610962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118046, + "balance_loss_mlp": 1.08209252, + "epoch": 0.11138899576760293, + "flos": 555573385728.0, + "grad_norm": 0.08130034202286071, + "language_loss": 0.86348194, + "learning_rate": 0.0009827337002455245, + "loss": 0.8746624, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.35961914, + "step": 579, + "time_per_iteration": 2.652369976043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111199, + "balance_loss_mlp": 1.07579851, + "epoch": 0.11158137745286649, + "flos": 689418667008.0, + "grad_norm": 0.06366605788409145, + "language_loss": 0.88115346, + "learning_rate": 0.0009826524423285712, + "loss": 0.89227337, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.36181641, + "step": 580, + "time_per_iteration": 2.947925567626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108192, + "balance_loss_mlp": 1.07192874, + "epoch": 0.11177375913813005, + "flos": 762688728576.0, + "grad_norm": 0.08930617061108917, + "language_loss": 0.88938302, + "learning_rate": 0.0009825709970293218, + "loss": 0.90046495, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.36303711, + "step": 581, + "time_per_iteration": 2.8744056224823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103655, + "balance_loss_mlp": 1.06731987, + "epoch": 0.11196614082339361, + "flos": 806211588096.0, + "grad_norm": 0.07222891797599594, + "language_loss": 0.95056951, + "learning_rate": 0.0009824893643793956, + "loss": 0.96160614, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.36328125, + "step": 582, + "time_per_iteration": 3.051945209503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104834, + "balance_loss_mlp": 1.06811786, + "epoch": 0.11215852250865718, + "flos": 558350931456.0, + "grad_norm": 0.0803498647914251, + "language_loss": 0.88078201, + "learning_rate": 0.0009824075444104857, + "loss": 0.89183033, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.3671875, + "step": 583, + "time_per_iteration": 2.6833813190460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111764, + "balance_loss_mlp": 1.07507193, + "epoch": 0.11235090419392074, + "flos": 513322140672.0, + "grad_norm": 0.08148632832875594, + "language_loss": 0.93207705, + "learning_rate": 0.000982325537154357, + "loss": 0.94319463, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.36694336, + "step": 584, + "time_per_iteration": 2.582225799560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113364, + "balance_loss_mlp": 1.07574129, + "epoch": 0.1125432858791843, + "flos": 491209311744.0, + "grad_norm": 0.08313203670373176, + "language_loss": 0.93823397, + "learning_rate": 0.0009822433426428484, + "loss": 0.94936764, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.37597656, + "step": 585, + "time_per_iteration": 2.568070888519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113305, + "balance_loss_mlp": 1.07594514, + "epoch": 0.11273566756444786, + "flos": 510476193792.0, + "grad_norm": 0.07694998173228458, + "language_loss": 0.86627567, + "learning_rate": 0.0009821609609078697, + "loss": 0.87740874, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.37304688, + "step": 586, + "time_per_iteration": 2.658702850341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103349, + "balance_loss_mlp": 1.06775331, + "epoch": 0.11292804924971142, + "flos": 622149009408.0, + "grad_norm": 0.10421690738013599, + "language_loss": 0.89634144, + "learning_rate": 0.0009820783919814045, + "loss": 0.90737498, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.35620117, + "step": 587, + "time_per_iteration": 2.803866386413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109643, + "balance_loss_mlp": 1.07295036, + "epoch": 0.113120430934975, + "flos": 477811823616.0, + "grad_norm": 0.07979925286699333, + "language_loss": 0.82699567, + "learning_rate": 0.0009819956358955095, + "loss": 0.83809209, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.36669922, + "step": 588, + "time_per_iteration": 2.5929653644561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110362, + "balance_loss_mlp": 1.07433677, + "epoch": 0.11331281262023855, + "flos": 466801975296.0, + "grad_norm": 0.07216149622243874, + "language_loss": 0.83354205, + "learning_rate": 0.0009819126926823127, + "loss": 0.84464574, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.36035156, + "step": 589, + "time_per_iteration": 2.5229685306549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122798, + "balance_loss_mlp": 1.08658195, + "epoch": 0.11350519430550211, + "flos": 650164727808.0, + "grad_norm": 0.08255396626581768, + "language_loss": 0.86631322, + "learning_rate": 0.000981829562374016, + "loss": 0.87754118, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.36279297, + "step": 590, + "time_per_iteration": 2.7939858436584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124668, + "balance_loss_mlp": 1.08804727, + "epoch": 0.11369757599076567, + "flos": 557547798528.0, + "grad_norm": 0.07763031144810686, + "language_loss": 0.97565413, + "learning_rate": 0.0009817462450028933, + "loss": 0.98690081, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.3659668, + "step": 591, + "time_per_iteration": 2.651886224746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115476, + "balance_loss_mlp": 1.07918823, + "epoch": 0.11388995767602925, + "flos": 570774988800.0, + "grad_norm": 0.0679599519530346, + "language_loss": 0.85396111, + "learning_rate": 0.0009816627406012916, + "loss": 0.86511576, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.36303711, + "step": 592, + "time_per_iteration": 2.8203041553497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117009, + "balance_loss_mlp": 1.08079314, + "epoch": 0.1140823393612928, + "flos": 740069540352.0, + "grad_norm": 0.07941270182617734, + "language_loss": 0.84330916, + "learning_rate": 0.0009815790492016295, + "loss": 0.85447925, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.36254883, + "step": 593, + "time_per_iteration": 2.952115058898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111293, + "balance_loss_mlp": 1.07529223, + "epoch": 0.11427472104655637, + "flos": 698694211584.0, + "grad_norm": 0.08575724683449225, + "language_loss": 0.86948562, + "learning_rate": 0.0009814951708363993, + "loss": 0.88059855, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.35986328, + "step": 594, + "time_per_iteration": 2.851818084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259601, + "balance_loss_mlp": 1.23633182, + "epoch": 0.11446710273181993, + "flos": 1476387952128.0, + "grad_norm": 0.04120161092279284, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79250586, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.23242188, + "step": 595, + "time_per_iteration": 4.775157928466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107308, + "balance_loss_mlp": 1.07159305, + "epoch": 0.1146594844170835, + "flos": 494641603584.0, + "grad_norm": 0.06441778711855077, + "language_loss": 0.87857854, + "learning_rate": 0.0009813268533395648, + "loss": 0.8896516, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.35717773, + "step": 596, + "time_per_iteration": 2.5812032222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117343, + "balance_loss_mlp": 1.08096087, + "epoch": 0.11485186610234706, + "flos": 474596319744.0, + "grad_norm": 0.07680000680618568, + "language_loss": 0.87010378, + "learning_rate": 0.0009812424142733073, + "loss": 0.8812772, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.36401367, + "step": 597, + "time_per_iteration": 2.5546822547912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108076, + "balance_loss_mlp": 1.07212269, + "epoch": 0.11504424778761062, + "flos": 730858014720.0, + "grad_norm": 0.05681390422854521, + "language_loss": 0.8607024, + "learning_rate": 0.000981157788372175, + "loss": 0.87178314, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.35961914, + "step": 598, + "time_per_iteration": 3.0337140560150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111428, + "balance_loss_mlp": 1.07851696, + "epoch": 0.11523662947287418, + "flos": 545539788288.0, + "grad_norm": 0.06941688855783729, + "language_loss": 0.89018178, + "learning_rate": 0.0009810729756690223, + "loss": 0.90132457, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.35791016, + "step": 599, + "time_per_iteration": 2.7217423915863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105745, + "balance_loss_mlp": 1.06981504, + "epoch": 0.11542901115813775, + "flos": 774737436672.0, + "grad_norm": 0.06146114558588388, + "language_loss": 0.91738331, + "learning_rate": 0.0009809879761967766, + "loss": 0.92844075, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.35961914, + "step": 600, + "time_per_iteration": 2.9604732990264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111848, + "balance_loss_mlp": 1.08178735, + "epoch": 0.11562139284340131, + "flos": 730585972224.0, + "grad_norm": 0.09570347165582511, + "language_loss": 0.86368775, + "learning_rate": 0.0009809027899884378, + "loss": 0.87487245, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.36669922, + "step": 601, + "time_per_iteration": 2.9237759113311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114897, + "balance_loss_mlp": 1.07787061, + "epoch": 0.11581377452866487, + "flos": 535589148672.0, + "grad_norm": 0.05752007897304988, + "language_loss": 0.88791043, + "learning_rate": 0.0009808174170770779, + "loss": 0.89905941, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.37036133, + "step": 602, + "time_per_iteration": 2.8171939849853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192093, + "balance_loss_mlp": 1.1680603, + "epoch": 0.11600615621392843, + "flos": 1554968613888.0, + "grad_norm": 0.017614530082332158, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86090338, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.24023438, + "step": 603, + "time_per_iteration": 4.935450077056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109032, + "balance_loss_mlp": 1.07360268, + "epoch": 0.116198537899192, + "flos": 537178037760.0, + "grad_norm": 0.08737735767926022, + "language_loss": 0.93595141, + "learning_rate": 0.0009806461112779462, + "loss": 0.94704169, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.35449219, + "step": 604, + "time_per_iteration": 2.644521951675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110331, + "balance_loss_mlp": 1.07454431, + "epoch": 0.11639091958445556, + "flos": 453970483200.0, + "grad_norm": 0.09922875403821595, + "language_loss": 0.8811909, + "learning_rate": 0.0009805601784566814, + "loss": 0.89229423, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.3581543, + "step": 605, + "time_per_iteration": 2.4736506938934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107193, + "balance_loss_mlp": 1.07209802, + "epoch": 0.11658330126971912, + "flos": 554815332864.0, + "grad_norm": 0.08013857685507157, + "language_loss": 0.95075512, + "learning_rate": 0.0009804740590654089, + "loss": 0.9618271, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.35131836, + "step": 606, + "time_per_iteration": 2.665424346923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121356, + "balance_loss_mlp": 1.08540201, + "epoch": 0.11677568295498268, + "flos": 716025968640.0, + "grad_norm": 0.09308217257663119, + "language_loss": 0.89792109, + "learning_rate": 0.0009803877531375635, + "loss": 0.90913463, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.359375, + "step": 607, + "time_per_iteration": 2.854362964630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123257, + "balance_loss_mlp": 1.08725595, + "epoch": 0.11696806464024626, + "flos": 609474668544.0, + "grad_norm": 0.12019278373574431, + "language_loss": 0.90837669, + "learning_rate": 0.0009803012607066523, + "loss": 0.91960925, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.36035156, + "step": 608, + "time_per_iteration": 2.7351131439208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132428, + "balance_loss_mlp": 1.0963558, + "epoch": 0.11716044632550981, + "flos": 520127087616.0, + "grad_norm": 0.06325710240785508, + "language_loss": 0.89651906, + "learning_rate": 0.0009802145818062543, + "loss": 0.90784335, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.36083984, + "step": 609, + "time_per_iteration": 2.706399440765381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126801, + "balance_loss_mlp": 1.09060943, + "epoch": 0.11735282801077337, + "flos": 507246133248.0, + "grad_norm": 0.08665503616765245, + "language_loss": 0.91646838, + "learning_rate": 0.0009801277164700212, + "loss": 0.9277364, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.36230469, + "step": 610, + "time_per_iteration": 2.591233730316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116515, + "balance_loss_mlp": 1.08137226, + "epoch": 0.11754520969603693, + "flos": 686339965440.0, + "grad_norm": 0.07536960859650275, + "language_loss": 0.8969053, + "learning_rate": 0.0009800406647316776, + "loss": 0.90807045, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.35180664, + "step": 611, + "time_per_iteration": 2.8590939044952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199931, + "balance_loss_mlp": 1.17360973, + "epoch": 0.1177375913813005, + "flos": 1541673022464.0, + "grad_norm": 0.02828241364524735, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.7811439, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.26367188, + "step": 612, + "time_per_iteration": 4.794836759567261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126093, + "balance_loss_mlp": 1.08999705, + "epoch": 0.11792997306656407, + "flos": 520269682176.0, + "grad_norm": 0.07086643363198573, + "language_loss": 0.88838685, + "learning_rate": 0.000979866002183916, + "loss": 0.89964771, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.36132812, + "step": 613, + "time_per_iteration": 2.6570141315460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113543, + "balance_loss_mlp": 1.07711244, + "epoch": 0.11812235475182763, + "flos": 665980379136.0, + "grad_norm": 0.0718552990374983, + "language_loss": 0.89756042, + "learning_rate": 0.0009797783914423082, + "loss": 0.90869588, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.36425781, + "step": 614, + "time_per_iteration": 2.8077588081359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104148, + "balance_loss_mlp": 1.06867135, + "epoch": 0.11831473643709119, + "flos": 621021399552.0, + "grad_norm": 0.06673690234795807, + "language_loss": 0.84267712, + "learning_rate": 0.0009796905944342094, + "loss": 0.85371858, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.35498047, + "step": 615, + "time_per_iteration": 2.848975896835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109381, + "balance_loss_mlp": 1.07271254, + "epoch": 0.11850711812235475, + "flos": 456438108672.0, + "grad_norm": 0.05638104592328917, + "language_loss": 0.88746947, + "learning_rate": 0.0009796026111937057, + "loss": 0.89856327, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.36645508, + "step": 616, + "time_per_iteration": 2.6446924209594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099952, + "balance_loss_mlp": 1.06347418, + "epoch": 0.11869949980761832, + "flos": 513598565376.0, + "grad_norm": 0.0626967176734064, + "language_loss": 0.88544255, + "learning_rate": 0.0009795144417549552, + "loss": 0.89644206, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.36474609, + "step": 617, + "time_per_iteration": 2.69419527053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103508, + "balance_loss_mlp": 1.0669111, + "epoch": 0.11889188149288188, + "flos": 534732171264.0, + "grad_norm": 0.05994069078035177, + "language_loss": 0.89591199, + "learning_rate": 0.0009794260861521883, + "loss": 0.90694714, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.36621094, + "step": 618, + "time_per_iteration": 2.771303653717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098344, + "balance_loss_mlp": 1.06262898, + "epoch": 0.11908426317814544, + "flos": 498344527872.0, + "grad_norm": 0.09079788596459537, + "language_loss": 0.86586368, + "learning_rate": 0.0009793375444197075, + "loss": 0.87684715, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.35742188, + "step": 619, + "time_per_iteration": 2.6239778995513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103508, + "balance_loss_mlp": 1.06724489, + "epoch": 0.119276644863409, + "flos": 659598833664.0, + "grad_norm": 0.07776663130635876, + "language_loss": 0.84681749, + "learning_rate": 0.000979248816591888, + "loss": 0.85785258, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.36254883, + "step": 620, + "time_per_iteration": 2.7932288646698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106745, + "balance_loss_mlp": 1.07043433, + "epoch": 0.11946902654867257, + "flos": 758396487168.0, + "grad_norm": 0.06665125523581683, + "language_loss": 0.85644066, + "learning_rate": 0.0009791599027031766, + "loss": 0.86750811, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.36303711, + "step": 621, + "time_per_iteration": 3.0138871669769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108028, + "balance_loss_mlp": 1.0721699, + "epoch": 0.11966140823393613, + "flos": 680697533952.0, + "grad_norm": 0.06722173914854768, + "language_loss": 0.85452718, + "learning_rate": 0.0009790708027880932, + "loss": 0.86560744, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.359375, + "step": 622, + "time_per_iteration": 2.8520967960357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217123, + "balance_loss_mlp": 1.192518, + "epoch": 0.11985378991919969, + "flos": 1450268070912.0, + "grad_norm": 0.04692620020290901, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78644413, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.24511719, + "step": 623, + "time_per_iteration": 4.820342302322388 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117536, + "balance_loss_mlp": 1.08251202, + "epoch": 0.12004617160446325, + "flos": 527586780672.0, + "grad_norm": 0.0795104629545964, + "language_loss": 0.93134129, + "learning_rate": 0.0009788920450172487, + "loss": 0.94251657, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.35058594, + "step": 624, + "time_per_iteration": 2.617030143737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112825, + "balance_loss_mlp": 1.09265435, + "epoch": 0.12023855328972682, + "flos": 473980861440.0, + "grad_norm": 0.07884849751459712, + "language_loss": 0.90174961, + "learning_rate": 0.0009788023872308875, + "loss": 0.91303217, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.35620117, + "step": 625, + "time_per_iteration": 2.5254392623901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218941, + "balance_loss_mlp": 1.19519401, + "epoch": 0.12043093497499038, + "flos": 1530954155520.0, + "grad_norm": 0.02704118444179952, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76647937, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.23730469, + "step": 626, + "time_per_iteration": 4.7286646366119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114575, + "balance_loss_mlp": 1.07936025, + "epoch": 0.12062331666025394, + "flos": 539571469824.0, + "grad_norm": 0.06954804859514781, + "language_loss": 0.9379338, + "learning_rate": 0.0009786225140303285, + "loss": 0.94907951, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.35253906, + "step": 627, + "time_per_iteration": 2.648557424545288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117384, + "balance_loss_mlp": 1.08155024, + "epoch": 0.1208156983455175, + "flos": 511634327040.0, + "grad_norm": 0.07877419782543724, + "language_loss": 0.91490531, + "learning_rate": 0.0009785322986859634, + "loss": 0.92607915, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.35864258, + "step": 628, + "time_per_iteration": 2.7282159328460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125787, + "balance_loss_mlp": 1.09014332, + "epoch": 0.12100808003078108, + "flos": 596195043840.0, + "grad_norm": 0.07794762914430453, + "language_loss": 0.92512405, + "learning_rate": 0.0009784418975588838, + "loss": 0.936382, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.35668945, + "step": 629, + "time_per_iteration": 2.709716320037842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117597, + "balance_loss_mlp": 1.08099949, + "epoch": 0.12120046171604464, + "flos": 522698019840.0, + "grad_norm": 0.06704717834334661, + "language_loss": 0.92910212, + "learning_rate": 0.0009783513106841862, + "loss": 0.94027811, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.3659668, + "step": 630, + "time_per_iteration": 2.7247745990753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268595, + "balance_loss_mlp": 1.24303675, + "epoch": 0.1213928434013082, + "flos": 1553605277184.0, + "grad_norm": 0.050831706918094084, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.78001297, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.25585938, + "step": 631, + "time_per_iteration": 4.973435163497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108872, + "balance_loss_mlp": 1.07263255, + "epoch": 0.12158522508657175, + "flos": 495143580672.0, + "grad_norm": 0.05936012058015608, + "language_loss": 0.87115383, + "learning_rate": 0.0009781695798326854, + "loss": 0.88224256, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.36303711, + "step": 632, + "time_per_iteration": 2.6066951751708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107124, + "balance_loss_mlp": 1.07109857, + "epoch": 0.12177760677183531, + "flos": 475335433728.0, + "grad_norm": 0.07579280109985519, + "language_loss": 0.87447512, + "learning_rate": 0.0009780784359264365, + "loss": 0.88554639, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.3605957, + "step": 633, + "time_per_iteration": 2.6627957820892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232879, + "balance_loss_mlp": 1.20541322, + "epoch": 0.12196998845709889, + "flos": 1467630351360.0, + "grad_norm": 0.035928730821781295, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75421578, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.27539062, + "step": 634, + "time_per_iteration": 4.774393796920776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097947, + "balance_loss_mlp": 1.06185055, + "epoch": 0.12216237014236245, + "flos": 586279309824.0, + "grad_norm": 0.06269897945868624, + "language_loss": 0.87202692, + "learning_rate": 0.000977895591329867, + "loss": 0.88300645, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.36108398, + "step": 635, + "time_per_iteration": 2.805889129638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108158, + "balance_loss_mlp": 1.0710839, + "epoch": 0.12235475182762601, + "flos": 597721324032.0, + "grad_norm": 0.0813284132777598, + "language_loss": 0.86332333, + "learning_rate": 0.000977803890710533, + "loss": 0.87440491, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.37060547, + "step": 636, + "time_per_iteration": 2.740208864212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104442, + "balance_loss_mlp": 1.06927526, + "epoch": 0.12254713351288957, + "flos": 497487550464.0, + "grad_norm": 0.05990721463683031, + "language_loss": 0.92840338, + "learning_rate": 0.0009777120045912774, + "loss": 0.93944776, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.35205078, + "step": 637, + "time_per_iteration": 2.599487543106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099731, + "balance_loss_mlp": 1.06246591, + "epoch": 0.12273951519815314, + "flos": 605565130752.0, + "grad_norm": 0.06926890859373311, + "language_loss": 0.89462954, + "learning_rate": 0.0009776199330077736, + "loss": 0.90562689, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.37231445, + "step": 638, + "time_per_iteration": 2.7127702236175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109285, + "balance_loss_mlp": 1.07263994, + "epoch": 0.1229318968834167, + "flos": 597578729472.0, + "grad_norm": 0.06829584029278382, + "language_loss": 0.91875821, + "learning_rate": 0.0009775276759957667, + "loss": 0.92985106, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.36645508, + "step": 639, + "time_per_iteration": 2.7092959880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109534, + "balance_loss_mlp": 1.07269859, + "epoch": 0.12312427856868026, + "flos": 678082931712.0, + "grad_norm": 0.08396579350539743, + "language_loss": 0.8972953, + "learning_rate": 0.0009774352335910745, + "loss": 0.90839064, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.36816406, + "step": 640, + "time_per_iteration": 2.810391664505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104708, + "balance_loss_mlp": 1.067729, + "epoch": 0.12331666025394382, + "flos": 608656978944.0, + "grad_norm": 0.07323302973942612, + "language_loss": 0.94222069, + "learning_rate": 0.000977342605829586, + "loss": 0.95326775, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.36962891, + "step": 641, + "time_per_iteration": 2.7107834815979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112179, + "balance_loss_mlp": 1.07624888, + "epoch": 0.12350904193920739, + "flos": 762172194816.0, + "grad_norm": 0.07665420533577341, + "language_loss": 0.85291827, + "learning_rate": 0.0009772497927472623, + "loss": 0.86404008, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.359375, + "step": 642, + "time_per_iteration": 3.0403058528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116924, + "balance_loss_mlp": 1.08006442, + "epoch": 0.12370142362447095, + "flos": 540699079680.0, + "grad_norm": 0.07222690714452404, + "language_loss": 0.84284675, + "learning_rate": 0.0009771567943801368, + "loss": 0.85401607, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.3684082, + "step": 643, + "time_per_iteration": 2.684351682662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112565, + "balance_loss_mlp": 1.07615817, + "epoch": 0.12389380530973451, + "flos": 547848852480.0, + "grad_norm": 0.07333206449495522, + "language_loss": 0.88927472, + "learning_rate": 0.0009770636107643152, + "loss": 0.9004004, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.36450195, + "step": 644, + "time_per_iteration": 2.697791337966919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124284, + "balance_loss_mlp": 1.0884738, + "epoch": 0.12408618699499807, + "flos": 540048715776.0, + "grad_norm": 0.07501614361753556, + "language_loss": 0.87213039, + "learning_rate": 0.0009769702419359738, + "loss": 0.88337326, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.35864258, + "step": 645, + "time_per_iteration": 2.614753246307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132185, + "balance_loss_mlp": 1.09604049, + "epoch": 0.12427856868026164, + "flos": 745451513856.0, + "grad_norm": 0.08258832766371556, + "language_loss": 0.88905025, + "learning_rate": 0.000976876687931362, + "loss": 0.90037215, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.36181641, + "step": 646, + "time_per_iteration": 2.9785215854644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125302, + "balance_loss_mlp": 1.08853781, + "epoch": 0.1244709503655252, + "flos": 533460556800.0, + "grad_norm": 0.0911173559535341, + "language_loss": 0.84276652, + "learning_rate": 0.0009767829487868005, + "loss": 0.85401952, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.36767578, + "step": 647, + "time_per_iteration": 2.578190326690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115883, + "balance_loss_mlp": 1.07911873, + "epoch": 0.12466333205078876, + "flos": 507847034880.0, + "grad_norm": 0.07020857762254842, + "language_loss": 0.88315135, + "learning_rate": 0.000976689024538682, + "loss": 0.89431018, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.36743164, + "step": 648, + "time_per_iteration": 2.6223652362823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111573, + "balance_loss_mlp": 1.07841754, + "epoch": 0.12485571373605232, + "flos": 681023420928.0, + "grad_norm": 0.08555408637061691, + "language_loss": 0.86419356, + "learning_rate": 0.0009765949152234716, + "loss": 0.87535083, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.37280273, + "step": 649, + "time_per_iteration": 2.882483959197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304556, + "balance_loss_mlp": 1.27480125, + "epoch": 0.1250480954213159, + "flos": 1329402668544.0, + "grad_norm": 0.07016402939707722, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79990637, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.296875, + "step": 650, + "time_per_iteration": 4.66938042640686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109485, + "balance_loss_mlp": 1.05882525, + "epoch": 0.12524047710657946, + "flos": 938140683264.0, + "grad_norm": 0.06927891842453628, + "language_loss": 0.81679136, + "learning_rate": 0.0009764061415379919, + "loss": 0.82773983, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.36035156, + "step": 651, + "time_per_iteration": 3.2698771953582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096413, + "balance_loss_mlp": 1.05874252, + "epoch": 0.12543285879184302, + "flos": 513642235392.0, + "grad_norm": 0.07412805631018828, + "language_loss": 0.88318801, + "learning_rate": 0.0009763114772410109, + "loss": 0.89415216, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.37646484, + "step": 652, + "time_per_iteration": 2.5928001403808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115398, + "balance_loss_mlp": 1.0775615, + "epoch": 0.12562524047710658, + "flos": 717991617024.0, + "grad_norm": 0.06901346528680578, + "language_loss": 0.85726613, + "learning_rate": 0.0009762166280235146, + "loss": 0.86842012, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.37817383, + "step": 653, + "time_per_iteration": 2.954763412475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135328, + "balance_loss_mlp": 1.0974437, + "epoch": 0.12581762216237014, + "flos": 563441923584.0, + "grad_norm": 0.10573688852470094, + "language_loss": 0.86465615, + "learning_rate": 0.0009761215939223267, + "loss": 0.87600946, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.37866211, + "step": 654, + "time_per_iteration": 2.715884208679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132858, + "balance_loss_mlp": 1.09599805, + "epoch": 0.1260100038476337, + "flos": 481642785792.0, + "grad_norm": 0.09937756240260763, + "language_loss": 0.85917866, + "learning_rate": 0.0009760263749743428, + "loss": 0.87050724, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.3684082, + "step": 655, + "time_per_iteration": 2.565927505493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115454, + "balance_loss_mlp": 1.07847536, + "epoch": 0.12620238553289725, + "flos": 575269461504.0, + "grad_norm": 0.07472608136964497, + "language_loss": 0.89487195, + "learning_rate": 0.0009759309712165299, + "loss": 0.90602648, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.36962891, + "step": 656, + "time_per_iteration": 2.721547842025757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095241, + "balance_loss_mlp": 1.06002665, + "epoch": 0.12639476721816084, + "flos": 530909973504.0, + "grad_norm": 0.06565081457641837, + "language_loss": 0.92494375, + "learning_rate": 0.0009758353826859272, + "loss": 0.9358961, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.3527832, + "step": 657, + "time_per_iteration": 2.6744871139526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095397, + "balance_loss_mlp": 1.05891895, + "epoch": 0.1265871489034244, + "flos": 689654393856.0, + "grad_norm": 0.09523432489761414, + "language_loss": 0.88095021, + "learning_rate": 0.0009757396094196456, + "loss": 0.89190418, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36499023, + "step": 658, + "time_per_iteration": 2.909353256225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103392, + "balance_loss_mlp": 1.06801057, + "epoch": 0.12677953058868796, + "flos": 536863735296.0, + "grad_norm": 0.06690202483268812, + "language_loss": 0.8320483, + "learning_rate": 0.0009756436514548673, + "loss": 0.84308219, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.35449219, + "step": 659, + "time_per_iteration": 2.865816831588745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096361, + "balance_loss_mlp": 1.06143236, + "epoch": 0.12697191227395152, + "flos": 518749194240.0, + "grad_norm": 0.06842887259152383, + "language_loss": 0.87790155, + "learning_rate": 0.0009755475088288466, + "loss": 0.88886517, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.34985352, + "step": 660, + "time_per_iteration": 2.727024793624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095382, + "balance_loss_mlp": 1.06145549, + "epoch": 0.12716429395921508, + "flos": 566341714944.0, + "grad_norm": 0.09688683984474739, + "language_loss": 0.89628965, + "learning_rate": 0.0009754511815789095, + "loss": 0.90724349, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.33959961, + "step": 661, + "time_per_iteration": 2.857279062271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099651, + "balance_loss_mlp": 1.06441295, + "epoch": 0.12735667564447864, + "flos": 513844466688.0, + "grad_norm": 0.0675215866547423, + "language_loss": 0.85062414, + "learning_rate": 0.0009753546697424533, + "loss": 0.86162066, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.3527832, + "step": 662, + "time_per_iteration": 2.670924425125122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112335, + "balance_loss_mlp": 1.07750201, + "epoch": 0.1275490573297422, + "flos": 541023556608.0, + "grad_norm": 0.0877117205425541, + "language_loss": 0.89430654, + "learning_rate": 0.0009752579733569475, + "loss": 0.90542984, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.34887695, + "step": 663, + "time_per_iteration": 2.708876609802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270721, + "balance_loss_mlp": 1.24678338, + "epoch": 0.12774143901500576, + "flos": 1557872787456.0, + "grad_norm": 0.04579657173262409, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7615211, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.23925781, + "step": 664, + "time_per_iteration": 4.956411123275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112296, + "balance_loss_mlp": 1.07724893, + "epoch": 0.12793382070026935, + "flos": 613462781952.0, + "grad_norm": 0.07589772420679435, + "language_loss": 0.88920283, + "learning_rate": 0.0009750640270890217, + "loss": 0.90032578, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.35083008, + "step": 665, + "time_per_iteration": 2.7128844261169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111829, + "balance_loss_mlp": 1.08357668, + "epoch": 0.1281262023855329, + "flos": 707386231296.0, + "grad_norm": 0.09170618066625874, + "language_loss": 0.9529534, + "learning_rate": 0.0009749667772818983, + "loss": 0.9641363, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.34765625, + "step": 666, + "time_per_iteration": 3.001779794692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119074, + "balance_loss_mlp": 1.16718388, + "epoch": 0.12831858407079647, + "flos": 1424250086400.0, + "grad_norm": 0.026171542208985103, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78126681, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.23535156, + "step": 667, + "time_per_iteration": 4.816860914230347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097707, + "balance_loss_mlp": 1.06239688, + "epoch": 0.12851096575606002, + "flos": 448869316608.0, + "grad_norm": 0.08174433959814813, + "language_loss": 0.94348264, + "learning_rate": 0.0009747717245101093, + "loss": 0.95445979, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.35351562, + "step": 668, + "time_per_iteration": 2.5237252712249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092064, + "balance_loss_mlp": 1.05851901, + "epoch": 0.12870334744132358, + "flos": 479697486336.0, + "grad_norm": 0.09843416488997592, + "language_loss": 0.84683162, + "learning_rate": 0.00097467392162117, + "loss": 0.85775226, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.33544922, + "step": 669, + "time_per_iteration": 2.6030120849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103326, + "balance_loss_mlp": 1.06987596, + "epoch": 0.12889572912658714, + "flos": 638633963520.0, + "grad_norm": 0.06975318327908253, + "language_loss": 0.90683615, + "learning_rate": 0.0009745759344474708, + "loss": 0.91786939, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.3347168, + "step": 670, + "time_per_iteration": 2.81622576713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112132, + "balance_loss_mlp": 1.08779824, + "epoch": 0.1290881108118507, + "flos": 509693409792.0, + "grad_norm": 0.09191121702256037, + "language_loss": 0.88668084, + "learning_rate": 0.0009744777630270536, + "loss": 0.89789402, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.33544922, + "step": 671, + "time_per_iteration": 2.573746681213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131545, + "balance_loss_mlp": 1.09673548, + "epoch": 0.12928049249711426, + "flos": 670746894336.0, + "grad_norm": 0.0798229463492689, + "language_loss": 0.92632008, + "learning_rate": 0.000974379407398032, + "loss": 0.93763554, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.34863281, + "step": 672, + "time_per_iteration": 2.8804330825805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128596, + "balance_loss_mlp": 1.09471667, + "epoch": 0.12947287418237785, + "flos": 793158925824.0, + "grad_norm": 0.060594592327224854, + "language_loss": 0.81539643, + "learning_rate": 0.0009742808675985913, + "loss": 0.82668233, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.33911133, + "step": 673, + "time_per_iteration": 3.093003273010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144697, + "balance_loss_mlp": 1.11019778, + "epoch": 0.1296652558676414, + "flos": 485222054400.0, + "grad_norm": 0.09187527541403225, + "language_loss": 0.90132761, + "learning_rate": 0.0009741821436669876, + "loss": 0.91277468, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.34521484, + "step": 674, + "time_per_iteration": 2.585315227508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122846, + "balance_loss_mlp": 1.08925223, + "epoch": 0.12985763755290497, + "flos": 453226987008.0, + "grad_norm": 0.08498532425721701, + "language_loss": 0.91794449, + "learning_rate": 0.0009740832356415492, + "loss": 0.92917299, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.3359375, + "step": 675, + "time_per_iteration": 2.4971120357513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112081, + "balance_loss_mlp": 1.08714533, + "epoch": 0.13005001923816853, + "flos": 824719007232.0, + "grad_norm": 0.07677288344190451, + "language_loss": 0.87289226, + "learning_rate": 0.0009739841435606756, + "loss": 0.88410038, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.33691406, + "step": 676, + "time_per_iteration": 3.04789137840271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110492, + "balance_loss_mlp": 1.07670832, + "epoch": 0.1302424009234321, + "flos": 531107822592.0, + "grad_norm": 0.05631932912809994, + "language_loss": 0.89408028, + "learning_rate": 0.0009738848674628377, + "loss": 0.90518522, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.33789062, + "step": 677, + "time_per_iteration": 2.7033560276031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115712, + "balance_loss_mlp": 1.08161807, + "epoch": 0.13043478260869565, + "flos": 525626924544.0, + "grad_norm": 0.06061927769746001, + "language_loss": 0.88112855, + "learning_rate": 0.000973785407386578, + "loss": 0.8922857, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.34130859, + "step": 678, + "time_per_iteration": 2.7593955993652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111886, + "balance_loss_mlp": 1.07671893, + "epoch": 0.1306271642939592, + "flos": 625862108160.0, + "grad_norm": 0.0561156652888081, + "language_loss": 0.86748564, + "learning_rate": 0.0009736857633705103, + "loss": 0.87860453, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.35180664, + "step": 679, + "time_per_iteration": 2.859600067138672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104313, + "balance_loss_mlp": 1.07002795, + "epoch": 0.13081954597922277, + "flos": 550438723584.0, + "grad_norm": 0.058910355701146846, + "language_loss": 0.92178285, + "learning_rate": 0.0009735859354533196, + "loss": 0.93282604, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.34301758, + "step": 680, + "time_per_iteration": 2.7124130725860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097833, + "balance_loss_mlp": 1.06321418, + "epoch": 0.13101192766448633, + "flos": 536651329536.0, + "grad_norm": 0.0839399897160516, + "language_loss": 0.91048056, + "learning_rate": 0.0009734859236737628, + "loss": 0.92145896, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.34643555, + "step": 681, + "time_per_iteration": 2.627246856689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097126, + "balance_loss_mlp": 1.06102967, + "epoch": 0.13120430934974991, + "flos": 503258019840.0, + "grad_norm": 0.07457249787820815, + "language_loss": 0.92922121, + "learning_rate": 0.0009733857280706678, + "loss": 0.94019246, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.36108398, + "step": 682, + "time_per_iteration": 2.656088352203369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011015, + "balance_loss_mlp": 1.06669104, + "epoch": 0.13139669103501347, + "flos": 614014221312.0, + "grad_norm": 0.08799075641073119, + "language_loss": 0.83452725, + "learning_rate": 0.000973285348682934, + "loss": 0.84554225, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.34838867, + "step": 683, + "time_per_iteration": 2.714932441711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250838, + "balance_loss_mlp": 1.22547078, + "epoch": 0.13158907272027703, + "flos": 1484163357696.0, + "grad_norm": 0.05910904833943088, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.7914921, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.25390625, + "step": 684, + "time_per_iteration": 4.823149681091309 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102069, + "balance_loss_mlp": 1.06754637, + "epoch": 0.1317814544055406, + "flos": 985049344512.0, + "grad_norm": 0.06093749611395137, + "language_loss": 0.84928876, + "learning_rate": 0.0009730840387095046, + "loss": 0.86030942, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.34570312, + "step": 685, + "time_per_iteration": 3.2810635566711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112887, + "balance_loss_mlp": 1.07876921, + "epoch": 0.13197383609080415, + "flos": 611163892224.0, + "grad_norm": 0.0719979787644836, + "language_loss": 0.90753949, + "learning_rate": 0.0009729831082019642, + "loss": 0.91866839, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.34155273, + "step": 686, + "time_per_iteration": 2.8016421794891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121765, + "balance_loss_mlp": 1.08740878, + "epoch": 0.1321662177760677, + "flos": 494116305408.0, + "grad_norm": 0.06743381273529321, + "language_loss": 0.88199198, + "learning_rate": 0.0009728819940660958, + "loss": 0.89320958, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.34375, + "step": 687, + "time_per_iteration": 2.753110885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123642, + "balance_loss_mlp": 1.08966768, + "epoch": 0.13235859946133127, + "flos": 495591713280.0, + "grad_norm": 0.07411002639607889, + "language_loss": 0.84702134, + "learning_rate": 0.0009727806963411557, + "loss": 0.85825777, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.34008789, + "step": 688, + "time_per_iteration": 2.638277292251587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118088, + "balance_loss_mlp": 1.08342147, + "epoch": 0.13255098114659483, + "flos": 511417539072.0, + "grad_norm": 0.07589947069642403, + "language_loss": 0.86972356, + "learning_rate": 0.000972679215066471, + "loss": 0.88090444, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.34692383, + "step": 689, + "time_per_iteration": 2.6977994441986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102626, + "balance_loss_mlp": 1.06865191, + "epoch": 0.13274336283185842, + "flos": 547114120704.0, + "grad_norm": 0.07819243817703804, + "language_loss": 0.98617494, + "learning_rate": 0.0009725775502814401, + "loss": 0.99720132, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.33984375, + "step": 690, + "time_per_iteration": 2.648946523666382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094739, + "balance_loss_mlp": 1.05864239, + "epoch": 0.13293574451712198, + "flos": 640465781760.0, + "grad_norm": 0.059114915842817355, + "language_loss": 0.84878647, + "learning_rate": 0.0009724757020255327, + "loss": 0.85973388, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.36108398, + "step": 691, + "time_per_iteration": 2.8732690811157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082897, + "balance_loss_mlp": 1.04782593, + "epoch": 0.13312812620238554, + "flos": 491234042880.0, + "grad_norm": 0.07438205452368939, + "language_loss": 0.87005877, + "learning_rate": 0.0009723736703382902, + "loss": 0.88088775, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.35107422, + "step": 692, + "time_per_iteration": 2.554645299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107941, + "balance_loss_mlp": 1.04352796, + "epoch": 0.1333205078876491, + "flos": 508693837824.0, + "grad_norm": 0.08618570028449021, + "language_loss": 0.82726276, + "learning_rate": 0.0009722714552593244, + "loss": 0.8380568, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.35888672, + "step": 693, + "time_per_iteration": 2.6300699710845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108349, + "balance_loss_mlp": 1.04763222, + "epoch": 0.13351288957291266, + "flos": 418474722816.0, + "grad_norm": 0.09336455895373029, + "language_loss": 0.93701726, + "learning_rate": 0.000972169056828319, + "loss": 0.94785213, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.35864258, + "step": 694, + "time_per_iteration": 2.4744653701782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089355, + "balance_loss_mlp": 1.05309105, + "epoch": 0.13370527125817622, + "flos": 615614694912.0, + "grad_norm": 0.09775538219544704, + "language_loss": 0.87267971, + "learning_rate": 0.0009720664750850283, + "loss": 0.88357329, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.36279297, + "step": 695, + "time_per_iteration": 2.819199562072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087558, + "balance_loss_mlp": 1.05196249, + "epoch": 0.13389765294343978, + "flos": 625757391360.0, + "grad_norm": 0.08995446617022443, + "language_loss": 0.92670894, + "learning_rate": 0.0009719637100692784, + "loss": 0.93758452, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.35644531, + "step": 696, + "time_per_iteration": 2.710566997528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089346, + "balance_loss_mlp": 1.05460882, + "epoch": 0.13409003462870334, + "flos": 609391710720.0, + "grad_norm": 0.07471473065547057, + "language_loss": 0.82606006, + "learning_rate": 0.0009718607618209661, + "loss": 0.83695352, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.34765625, + "step": 697, + "time_per_iteration": 2.860895872116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100417, + "balance_loss_mlp": 1.06677604, + "epoch": 0.13428241631396692, + "flos": 683499810816.0, + "grad_norm": 0.06757273414028586, + "language_loss": 0.87573737, + "learning_rate": 0.0009717576303800595, + "loss": 0.88674152, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.33666992, + "step": 698, + "time_per_iteration": 3.044128894805908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105218, + "balance_loss_mlp": 1.07102871, + "epoch": 0.13447479799923048, + "flos": 508565799936.0, + "grad_norm": 0.06392403589518669, + "language_loss": 0.85563833, + "learning_rate": 0.0009716543157865975, + "loss": 0.86669052, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.34228516, + "step": 699, + "time_per_iteration": 2.6879220008850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124277, + "balance_loss_mlp": 1.08968258, + "epoch": 0.13466717968449404, + "flos": 897124737024.0, + "grad_norm": 0.10281325358067626, + "language_loss": 0.83577156, + "learning_rate": 0.0009715508180806907, + "loss": 0.84701437, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.34643555, + "step": 700, + "time_per_iteration": 3.1908302307128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132528, + "balance_loss_mlp": 1.09848189, + "epoch": 0.1348595613697576, + "flos": 989501557248.0, + "grad_norm": 0.07337445630948206, + "language_loss": 0.89328271, + "learning_rate": 0.0009714471373025202, + "loss": 0.90460801, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.34082031, + "step": 701, + "time_per_iteration": 3.438918113708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121396, + "balance_loss_mlp": 1.08704007, + "epoch": 0.13505194305502116, + "flos": 487580580864.0, + "grad_norm": 0.06971370423164719, + "language_loss": 0.88653499, + "learning_rate": 0.0009713432734923386, + "loss": 0.89774895, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.34399414, + "step": 702, + "time_per_iteration": 2.640204668045044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118965, + "balance_loss_mlp": 1.08372688, + "epoch": 0.13524432474028472, + "flos": 613103399424.0, + "grad_norm": 0.06937758634579687, + "language_loss": 0.8635335, + "learning_rate": 0.0009712392266904696, + "loss": 0.87472308, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.3527832, + "step": 703, + "time_per_iteration": 2.7081639766693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107456, + "balance_loss_mlp": 1.07381546, + "epoch": 0.13543670642554828, + "flos": 904425868800.0, + "grad_norm": 0.059624368341773884, + "language_loss": 0.8470363, + "learning_rate": 0.0009711349969373076, + "loss": 0.8581109, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.33666992, + "step": 704, + "time_per_iteration": 3.185788154602051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120367, + "balance_loss_mlp": 1.08629751, + "epoch": 0.13562908811081184, + "flos": 550335416832.0, + "grad_norm": 0.06837289886431508, + "language_loss": 0.80139232, + "learning_rate": 0.0009710305842733178, + "loss": 0.81259602, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.34106445, + "step": 705, + "time_per_iteration": 2.7622249126434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119894, + "balance_loss_mlp": 1.08534753, + "epoch": 0.1358214697960754, + "flos": 507797572608.0, + "grad_norm": 0.07938339172549091, + "language_loss": 0.89516854, + "learning_rate": 0.0009709259887390373, + "loss": 0.90636754, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.34570312, + "step": 706, + "time_per_iteration": 2.5919415950775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112775, + "balance_loss_mlp": 1.09141469, + "epoch": 0.136013851481339, + "flos": 528640197120.0, + "grad_norm": 0.10398540964391637, + "language_loss": 0.90775406, + "learning_rate": 0.0009708212103750737, + "loss": 0.9190315, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.36328125, + "step": 707, + "time_per_iteration": 2.601414680480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118489, + "balance_loss_mlp": 1.0827502, + "epoch": 0.13620623316660255, + "flos": 658772379648.0, + "grad_norm": 0.10289617102375577, + "language_loss": 0.87215245, + "learning_rate": 0.0009707162492221051, + "loss": 0.88333738, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.35766602, + "step": 708, + "time_per_iteration": 2.9150781631469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107244, + "balance_loss_mlp": 1.07193458, + "epoch": 0.1363986148518661, + "flos": 671583522816.0, + "grad_norm": 0.07053364895365258, + "language_loss": 0.88057113, + "learning_rate": 0.0009706111053208815, + "loss": 0.89164358, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.35375977, + "step": 709, + "time_per_iteration": 2.8282904624938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104491, + "balance_loss_mlp": 1.06801295, + "epoch": 0.13659099653712967, + "flos": 472828520448.0, + "grad_norm": 0.06130049777218646, + "language_loss": 0.85717642, + "learning_rate": 0.0009705057787122232, + "loss": 0.86822134, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.36499023, + "step": 710, + "time_per_iteration": 2.577875852584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115861, + "balance_loss_mlp": 1.07890666, + "epoch": 0.13678337822239323, + "flos": 452483490816.0, + "grad_norm": 0.06671527486676954, + "language_loss": 0.91032815, + "learning_rate": 0.0009704002694370216, + "loss": 0.92148674, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.36962891, + "step": 711, + "time_per_iteration": 2.5226385593414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113129, + "balance_loss_mlp": 1.09509826, + "epoch": 0.13697575990765679, + "flos": 519373416960.0, + "grad_norm": 0.06767720569390717, + "language_loss": 0.8601349, + "learning_rate": 0.0009702945775362388, + "loss": 0.8714478, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.36206055, + "step": 712, + "time_per_iteration": 2.6134419441223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129624, + "balance_loss_mlp": 1.09214449, + "epoch": 0.13716814159292035, + "flos": 480145618944.0, + "grad_norm": 0.06923332159298135, + "language_loss": 0.86543357, + "learning_rate": 0.0009701887030509086, + "loss": 0.87672985, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.37426758, + "step": 713, + "time_per_iteration": 2.6801493167877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123971, + "balance_loss_mlp": 1.08735013, + "epoch": 0.1373605232781839, + "flos": 545376844800.0, + "grad_norm": 0.08447530320779993, + "language_loss": 0.90941691, + "learning_rate": 0.0009700826460221346, + "loss": 0.92065662, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.36645508, + "step": 714, + "time_per_iteration": 2.6499831676483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124692, + "balance_loss_mlp": 1.0878799, + "epoch": 0.1375529049634475, + "flos": 708473143296.0, + "grad_norm": 0.08158263793675288, + "language_loss": 0.92094153, + "learning_rate": 0.0009699764064910921, + "loss": 0.93218845, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.36816406, + "step": 715, + "time_per_iteration": 2.8663330078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100593, + "balance_loss_mlp": 1.0652591, + "epoch": 0.13774528664871105, + "flos": 486452971008.0, + "grad_norm": 0.0638700652453299, + "language_loss": 0.86489999, + "learning_rate": 0.0009698699844990268, + "loss": 0.87590599, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.35351562, + "step": 716, + "time_per_iteration": 2.680769443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097936, + "balance_loss_mlp": 1.06236374, + "epoch": 0.1379376683339746, + "flos": 679885636608.0, + "grad_norm": 0.06268585455781102, + "language_loss": 0.87917447, + "learning_rate": 0.0009697633800872555, + "loss": 0.89015377, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.35595703, + "step": 717, + "time_per_iteration": 2.965280532836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095044, + "balance_loss_mlp": 1.05956769, + "epoch": 0.13813005001923817, + "flos": 610628419584.0, + "grad_norm": 0.06824665625382514, + "language_loss": 0.9079777, + "learning_rate": 0.0009696565932971655, + "loss": 0.91892809, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.35498047, + "step": 718, + "time_per_iteration": 2.896911144256592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089845, + "balance_loss_mlp": 1.05451119, + "epoch": 0.13832243170450173, + "flos": 588431222784.0, + "grad_norm": 0.09498294885790176, + "language_loss": 0.89284754, + "learning_rate": 0.0009695496241702153, + "loss": 0.90374601, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.35375977, + "step": 719, + "time_per_iteration": 2.7762036323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100053, + "balance_loss_mlp": 1.0647912, + "epoch": 0.1385148133897653, + "flos": 699674844672.0, + "grad_norm": 0.06645840883514359, + "language_loss": 0.85660797, + "learning_rate": 0.0009694424727479339, + "loss": 0.86760849, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.3527832, + "step": 720, + "time_per_iteration": 2.899481773376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105877, + "balance_loss_mlp": 1.06997156, + "epoch": 0.13870719507502885, + "flos": 597977399808.0, + "grad_norm": 0.0836580120862117, + "language_loss": 0.88687581, + "learning_rate": 0.0009693351390719213, + "loss": 0.89793456, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.35913086, + "step": 721, + "time_per_iteration": 2.7242469787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116308, + "balance_loss_mlp": 1.08071184, + "epoch": 0.1388995767602924, + "flos": 586279309824.0, + "grad_norm": 0.0677561083547336, + "language_loss": 0.90886325, + "learning_rate": 0.000969227623183848, + "loss": 0.9200263, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.35595703, + "step": 722, + "time_per_iteration": 2.819762706756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122367, + "balance_loss_mlp": 1.08719993, + "epoch": 0.139091958445556, + "flos": 650810709504.0, + "grad_norm": 0.06096675577850975, + "language_loss": 0.9079504, + "learning_rate": 0.0009691199251254554, + "loss": 0.91917408, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.35180664, + "step": 723, + "time_per_iteration": 2.9057154655456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111876, + "balance_loss_mlp": 1.08368921, + "epoch": 0.13928434013081956, + "flos": 575446961664.0, + "grad_norm": 0.07869545166834224, + "language_loss": 0.86502081, + "learning_rate": 0.0009690120449385555, + "loss": 0.87620842, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.35107422, + "step": 724, + "time_per_iteration": 2.753779411315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117404, + "balance_loss_mlp": 1.08164096, + "epoch": 0.13947672181608312, + "flos": 562954503168.0, + "grad_norm": 0.05745765153927115, + "language_loss": 0.92949581, + "learning_rate": 0.0009689039826650312, + "loss": 0.94066983, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.35791016, + "step": 725, + "time_per_iteration": 2.7707176208496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01358579, + "balance_loss_mlp": 1.33788455, + "epoch": 0.13966910350134668, + "flos": 1520699387904.0, + "grad_norm": 0.08980106345901108, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77881646, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.20703125, + "step": 726, + "time_per_iteration": 4.990100383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122131, + "balance_loss_mlp": 1.08632064, + "epoch": 0.13986148518661023, + "flos": 499604557824.0, + "grad_norm": 0.08882129772973828, + "language_loss": 0.8687858, + "learning_rate": 0.0009686873120259941, + "loss": 0.88000709, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.35839844, + "step": 727, + "time_per_iteration": 2.598994255065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124101, + "balance_loss_mlp": 1.08914924, + "epoch": 0.1400538668718738, + "flos": 598381862400.0, + "grad_norm": 0.060515823337661194, + "language_loss": 0.86860693, + "learning_rate": 0.0009685787037446004, + "loss": 0.879848, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.34985352, + "step": 728, + "time_per_iteration": 2.818753957748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117164, + "balance_loss_mlp": 1.08252215, + "epoch": 0.14024624855713735, + "flos": 593757941760.0, + "grad_norm": 0.07103959200550099, + "language_loss": 0.86954272, + "learning_rate": 0.0009684699135448201, + "loss": 0.88071442, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.34667969, + "step": 729, + "time_per_iteration": 2.7140605449676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117139, + "balance_loss_mlp": 1.08190084, + "epoch": 0.1404386302424009, + "flos": 506335311360.0, + "grad_norm": 0.05207553557344927, + "language_loss": 0.91554511, + "learning_rate": 0.0009683609414688895, + "loss": 0.92671645, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.3527832, + "step": 730, + "time_per_iteration": 2.700392961502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115792, + "balance_loss_mlp": 1.08076811, + "epoch": 0.14063101192766447, + "flos": 573132105216.0, + "grad_norm": 0.0649489891311747, + "language_loss": 0.85963869, + "learning_rate": 0.0009682517875591154, + "loss": 0.87079668, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.35058594, + "step": 731, + "time_per_iteration": 2.7288033962249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108316, + "balance_loss_mlp": 1.07329249, + "epoch": 0.14082339361292806, + "flos": 564333806592.0, + "grad_norm": 0.08055333626892905, + "language_loss": 0.8568505, + "learning_rate": 0.0009681424518578749, + "loss": 0.86793363, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.35058594, + "step": 732, + "time_per_iteration": 2.7607100009918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098228, + "balance_loss_mlp": 1.06337106, + "epoch": 0.14101577529819162, + "flos": 463336187904.0, + "grad_norm": 0.057006483972196494, + "language_loss": 0.87377727, + "learning_rate": 0.000968032934407616, + "loss": 0.8847596, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.34912109, + "step": 733, + "time_per_iteration": 2.5924746990203857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109135, + "balance_loss_mlp": 1.05708933, + "epoch": 0.14120815698345518, + "flos": 595791991296.0, + "grad_norm": 0.06839942690263572, + "language_loss": 0.81019294, + "learning_rate": 0.0009679232352508571, + "loss": 0.82110655, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.34301758, + "step": 734, + "time_per_iteration": 2.7993721961975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100792, + "balance_loss_mlp": 1.06455231, + "epoch": 0.14140053866871874, + "flos": 534864591360.0, + "grad_norm": 0.05863508932167985, + "language_loss": 0.80278933, + "learning_rate": 0.0009678133544301871, + "loss": 0.8137973, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.36254883, + "step": 735, + "time_per_iteration": 2.673874855041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094272, + "balance_loss_mlp": 1.05881953, + "epoch": 0.1415929203539823, + "flos": 520013606400.0, + "grad_norm": 0.05551108490857041, + "language_loss": 0.91367602, + "learning_rate": 0.0009677032919882658, + "loss": 0.92461878, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.35473633, + "step": 736, + "time_per_iteration": 2.654606580734253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096366, + "balance_loss_mlp": 1.06012654, + "epoch": 0.14178530203924586, + "flos": 482095300608.0, + "grad_norm": 0.07346959128329188, + "language_loss": 0.91181809, + "learning_rate": 0.000967593047967823, + "loss": 0.92278177, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.36230469, + "step": 737, + "time_per_iteration": 2.559713125228882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096247, + "balance_loss_mlp": 1.06096137, + "epoch": 0.14197768372450942, + "flos": 676339863552.0, + "grad_norm": 0.08415375039396082, + "language_loss": 0.86267197, + "learning_rate": 0.0009674826224116593, + "loss": 0.87363446, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.35302734, + "step": 738, + "time_per_iteration": 2.809206485748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097639, + "balance_loss_mlp": 1.06197131, + "epoch": 0.14217006540977298, + "flos": 445802199552.0, + "grad_norm": 0.07057178035488912, + "language_loss": 0.86339009, + "learning_rate": 0.0009673720153626455, + "loss": 0.87436646, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.35668945, + "step": 739, + "time_per_iteration": 2.612968683242798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104371, + "balance_loss_mlp": 1.06848931, + "epoch": 0.14236244709503657, + "flos": 496261016064.0, + "grad_norm": 0.07271668848978735, + "language_loss": 0.87052834, + "learning_rate": 0.0009672612268637235, + "loss": 0.88157207, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.35913086, + "step": 740, + "time_per_iteration": 2.61069393157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110664, + "balance_loss_mlp": 1.0753777, + "epoch": 0.14255482878030012, + "flos": 648022989312.0, + "grad_norm": 0.0891355718419961, + "language_loss": 0.84501529, + "learning_rate": 0.0009671502569579048, + "loss": 0.85612196, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.35302734, + "step": 741, + "time_per_iteration": 2.735647201538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110487, + "balance_loss_mlp": 1.07122874, + "epoch": 0.14274721046556368, + "flos": 535888894464.0, + "grad_norm": 0.08695556970227908, + "language_loss": 0.89623845, + "learning_rate": 0.0009670391056882719, + "loss": 0.90728712, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.33666992, + "step": 742, + "time_per_iteration": 2.7107605934143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112128, + "balance_loss_mlp": 1.07879674, + "epoch": 0.14293959215082724, + "flos": 956677215744.0, + "grad_norm": 0.07027307452403737, + "language_loss": 0.88442421, + "learning_rate": 0.0009669277730979776, + "loss": 0.89554548, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.33349609, + "step": 743, + "time_per_iteration": 3.188511371612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106141, + "balance_loss_mlp": 1.07295275, + "epoch": 0.1431319738360908, + "flos": 692766590976.0, + "grad_norm": 0.060274127994165407, + "language_loss": 0.85487998, + "learning_rate": 0.0009668162592302449, + "loss": 0.86594141, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.33203125, + "step": 744, + "time_per_iteration": 2.912363290786743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111089, + "balance_loss_mlp": 1.07715416, + "epoch": 0.14332435552135436, + "flos": 565174817280.0, + "grad_norm": 0.05989361998422495, + "language_loss": 0.86368543, + "learning_rate": 0.0009667045641283676, + "loss": 0.8747943, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.33764648, + "step": 745, + "time_per_iteration": 2.705873489379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105291, + "balance_loss_mlp": 1.07246089, + "epoch": 0.14351673720661792, + "flos": 738045665280.0, + "grad_norm": 0.07442691981713179, + "language_loss": 0.94493437, + "learning_rate": 0.0009665926878357092, + "loss": 0.95598727, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.32836914, + "step": 746, + "time_per_iteration": 2.941594362258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112013, + "balance_loss_mlp": 1.07865858, + "epoch": 0.14370911889188148, + "flos": 548951731200.0, + "grad_norm": 0.0692560914525881, + "language_loss": 0.91247988, + "learning_rate": 0.0009664806303957043, + "loss": 0.92359996, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.33374023, + "step": 747, + "time_per_iteration": 2.70877742767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112762, + "balance_loss_mlp": 1.0790261, + "epoch": 0.14390150057714507, + "flos": 589973469696.0, + "grad_norm": 0.06347995643195156, + "language_loss": 0.87284487, + "learning_rate": 0.0009663683918518571, + "loss": 0.88397241, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.33764648, + "step": 748, + "time_per_iteration": 2.9173765182495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128804, + "balance_loss_mlp": 1.09583056, + "epoch": 0.14409388226240863, + "flos": 590773782528.0, + "grad_norm": 0.07165520049303264, + "language_loss": 0.85690349, + "learning_rate": 0.0009662559722477428, + "loss": 0.8681916, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.32983398, + "step": 749, + "time_per_iteration": 2.6703925132751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293618, + "balance_loss_mlp": 1.26653337, + "epoch": 0.1442862639476722, + "flos": 1510418479104.0, + "grad_norm": 0.05750783583060037, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77456594, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.27148438, + "step": 750, + "time_per_iteration": 5.001406192779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114826, + "balance_loss_mlp": 1.11492896, + "epoch": 0.14447864563293575, + "flos": 496493770752.0, + "grad_norm": 0.0903406164143912, + "language_loss": 0.88906193, + "learning_rate": 0.0009660305900333632, + "loss": 0.90054452, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.33349609, + "step": 751, + "time_per_iteration": 2.6897666454315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151429, + "balance_loss_mlp": 1.11859906, + "epoch": 0.1446710273181993, + "flos": 589400271360.0, + "grad_norm": 0.07731756572669998, + "language_loss": 0.82109559, + "learning_rate": 0.0009659176275105992, + "loss": 0.83260989, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.32836914, + "step": 752, + "time_per_iteration": 2.7144923210144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156541, + "balance_loss_mlp": 1.12294829, + "epoch": 0.14486340900346287, + "flos": 585521256960.0, + "grad_norm": 0.08104938710710845, + "language_loss": 0.8584373, + "learning_rate": 0.0009658044841025701, + "loss": 0.87000269, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.33618164, + "step": 753, + "time_per_iteration": 2.7651891708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134874, + "balance_loss_mlp": 1.10116172, + "epoch": 0.14505579068872643, + "flos": 504405978624.0, + "grad_norm": 0.06446620792536047, + "language_loss": 0.80912805, + "learning_rate": 0.0009656911598532021, + "loss": 0.82047671, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.33740234, + "step": 754, + "time_per_iteration": 2.6575491428375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113793, + "balance_loss_mlp": 1.10345459, + "epoch": 0.14524817237399, + "flos": 486566452224.0, + "grad_norm": 0.0617560649750725, + "language_loss": 0.89835, + "learning_rate": 0.0009655776548064917, + "loss": 0.90972924, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.3449707, + "step": 755, + "time_per_iteration": 2.684224843978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133332, + "balance_loss_mlp": 1.100263, + "epoch": 0.14544055405925355, + "flos": 727857888768.0, + "grad_norm": 0.0723196770544797, + "language_loss": 0.88265425, + "learning_rate": 0.0009654639690065054, + "loss": 0.89398754, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.33081055, + "step": 756, + "time_per_iteration": 2.8975589275360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133271, + "balance_loss_mlp": 1.10063124, + "epoch": 0.14563293574451713, + "flos": 593359271424.0, + "grad_norm": 0.0666179485403068, + "language_loss": 0.87639153, + "learning_rate": 0.00096535010249738, + "loss": 0.88772416, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.32641602, + "step": 757, + "time_per_iteration": 2.7852935791015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118669, + "balance_loss_mlp": 1.08555305, + "epoch": 0.1458253174297807, + "flos": 560192924160.0, + "grad_norm": 0.06671579144124269, + "language_loss": 0.82458985, + "learning_rate": 0.0009652360553233224, + "loss": 0.83577645, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.33129883, + "step": 758, + "time_per_iteration": 2.790372610092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231318, + "balance_loss_mlp": 1.20690441, + "epoch": 0.14601769911504425, + "flos": 1557025984512.0, + "grad_norm": 0.06334391267713868, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.75005066, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.24414062, + "step": 759, + "time_per_iteration": 4.9441094398498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113265, + "balance_loss_mlp": 1.08062565, + "epoch": 0.1462100808003078, + "flos": 865922628096.0, + "grad_norm": 0.06716213865762054, + "language_loss": 0.81441242, + "learning_rate": 0.0009650074191575883, + "loss": 0.82554507, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.32641602, + "step": 760, + "time_per_iteration": 3.2887775897979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109667, + "balance_loss_mlp": 1.07664585, + "epoch": 0.14640246248557137, + "flos": 522673288704.0, + "grad_norm": 0.06510043774355635, + "language_loss": 0.85560381, + "learning_rate": 0.0009648928302546766, + "loss": 0.86670047, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.33032227, + "step": 761, + "time_per_iteration": 2.6996572017669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095513, + "balance_loss_mlp": 1.06308818, + "epoch": 0.14659484417083493, + "flos": 1030121805312.0, + "grad_norm": 0.06592560206527708, + "language_loss": 0.85148716, + "learning_rate": 0.0009647780608643613, + "loss": 0.86244226, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.32421875, + "step": 762, + "time_per_iteration": 3.3860111236572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101334, + "balance_loss_mlp": 1.06843269, + "epoch": 0.1467872258560985, + "flos": 500426629632.0, + "grad_norm": 0.08422515931666542, + "language_loss": 0.87252343, + "learning_rate": 0.0009646631110312001, + "loss": 0.88353688, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.32910156, + "step": 763, + "time_per_iteration": 2.62996506690979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097579, + "balance_loss_mlp": 1.06455803, + "epoch": 0.14697960754136205, + "flos": 547514201088.0, + "grad_norm": 0.05843071383105212, + "language_loss": 0.88439989, + "learning_rate": 0.0009645479807998203, + "loss": 0.89537567, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.33032227, + "step": 764, + "time_per_iteration": 2.7762649059295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091998, + "balance_loss_mlp": 1.059955, + "epoch": 0.14717198922662564, + "flos": 517586678784.0, + "grad_norm": 0.06085607876830046, + "language_loss": 0.92027354, + "learning_rate": 0.0009644326702149196, + "loss": 0.93119353, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.3203125, + "step": 765, + "time_per_iteration": 2.7927489280700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093686, + "balance_loss_mlp": 1.0607841, + "epoch": 0.1473643709118892, + "flos": 731661147648.0, + "grad_norm": 0.07854715386493856, + "language_loss": 0.84577298, + "learning_rate": 0.0009643171793212653, + "loss": 0.85670984, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.32910156, + "step": 766, + "time_per_iteration": 3.1133480072021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092266, + "balance_loss_mlp": 1.05976951, + "epoch": 0.14755675259715276, + "flos": 620257554432.0, + "grad_norm": 0.102413583922894, + "language_loss": 0.89411926, + "learning_rate": 0.0009642015081636952, + "loss": 0.90504193, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.32495117, + "step": 767, + "time_per_iteration": 2.715090274810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098654, + "balance_loss_mlp": 1.06658697, + "epoch": 0.14774913428241632, + "flos": 451981513728.0, + "grad_norm": 0.07135930824346515, + "language_loss": 0.8782866, + "learning_rate": 0.0009640856567871166, + "loss": 0.88927317, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.32055664, + "step": 768, + "time_per_iteration": 2.550196409225464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105008, + "balance_loss_mlp": 1.07258272, + "epoch": 0.14794151596767988, + "flos": 836881196544.0, + "grad_norm": 0.05799185647214189, + "language_loss": 0.8870768, + "learning_rate": 0.0009639696252365072, + "loss": 0.8981269, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.32421875, + "step": 769, + "time_per_iteration": 3.0786449909210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100829, + "balance_loss_mlp": 1.06869006, + "epoch": 0.14813389765294344, + "flos": 685765204992.0, + "grad_norm": 0.05886019056348146, + "language_loss": 0.81861567, + "learning_rate": 0.0009638534135569144, + "loss": 0.82962394, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.32128906, + "step": 770, + "time_per_iteration": 2.9026055335998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109058, + "balance_loss_mlp": 1.07641852, + "epoch": 0.148326279338207, + "flos": 509625008640.0, + "grad_norm": 0.061687073411883335, + "language_loss": 0.89819336, + "learning_rate": 0.0009637370217934554, + "loss": 0.909284, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.32641602, + "step": 771, + "time_per_iteration": 2.651155471801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102766, + "balance_loss_mlp": 1.07062733, + "epoch": 0.14851866102347056, + "flos": 587869608960.0, + "grad_norm": 0.06890537390791286, + "language_loss": 0.82949096, + "learning_rate": 0.0009636204499913175, + "loss": 0.84051859, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.32128906, + "step": 772, + "time_per_iteration": 2.8484935760498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109887, + "balance_loss_mlp": 1.06713676, + "epoch": 0.14871104270873411, + "flos": 690722366976.0, + "grad_norm": 0.05724303399039588, + "language_loss": 0.88008785, + "learning_rate": 0.0009635036981957581, + "loss": 0.89107656, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.31713867, + "step": 773, + "time_per_iteration": 2.875896453857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098837, + "balance_loss_mlp": 1.06586373, + "epoch": 0.1489034243939977, + "flos": 654803205120.0, + "grad_norm": 0.06792329386178385, + "language_loss": 0.90737289, + "learning_rate": 0.0009633867664521043, + "loss": 0.91836131, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.32983398, + "step": 774, + "time_per_iteration": 2.8590240478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105117, + "balance_loss_mlp": 1.07202482, + "epoch": 0.14909580607926126, + "flos": 475595891712.0, + "grad_norm": 0.07543072164382301, + "language_loss": 0.86562771, + "learning_rate": 0.0009632696548057527, + "loss": 0.87667894, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.33105469, + "step": 775, + "time_per_iteration": 2.598287343978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103717, + "balance_loss_mlp": 1.07136405, + "epoch": 0.14928818776452482, + "flos": 610789953024.0, + "grad_norm": 0.06953515395492163, + "language_loss": 0.8490293, + "learning_rate": 0.0009631523633021704, + "loss": 0.86006653, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.32348633, + "step": 776, + "time_per_iteration": 2.842017650604248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097387, + "balance_loss_mlp": 1.0640794, + "epoch": 0.14948056944978838, + "flos": 561487859712.0, + "grad_norm": 0.0785359858255581, + "language_loss": 0.87875742, + "learning_rate": 0.0009630348919868936, + "loss": 0.88973129, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.33325195, + "step": 777, + "time_per_iteration": 2.693345308303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096515, + "balance_loss_mlp": 1.06244552, + "epoch": 0.14967295113505194, + "flos": 448972623360.0, + "grad_norm": 0.0986803150049228, + "language_loss": 0.81203282, + "learning_rate": 0.0009629172409055293, + "loss": 0.82299805, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.34106445, + "step": 778, + "time_per_iteration": 2.50610613822937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100631, + "balance_loss_mlp": 1.06780052, + "epoch": 0.1498653328203155, + "flos": 571000541184.0, + "grad_norm": 0.06451123510709528, + "language_loss": 0.872877, + "learning_rate": 0.0009627994101037531, + "loss": 0.88388336, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.32836914, + "step": 779, + "time_per_iteration": 2.735919713973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093349, + "balance_loss_mlp": 1.06016171, + "epoch": 0.15005771450557906, + "flos": 630918194688.0, + "grad_norm": 0.06921626087658436, + "language_loss": 0.89007759, + "learning_rate": 0.0009626813996273114, + "loss": 0.90101105, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.33203125, + "step": 780, + "time_per_iteration": 2.8758528232574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089076, + "balance_loss_mlp": 1.05646062, + "epoch": 0.15025009619084262, + "flos": 577633780224.0, + "grad_norm": 0.07846674622794232, + "language_loss": 0.88800216, + "learning_rate": 0.0009625632095220198, + "loss": 0.89889288, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.32617188, + "step": 781, + "time_per_iteration": 2.822981357574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091834, + "balance_loss_mlp": 1.05874181, + "epoch": 0.1504424778761062, + "flos": 483646311936.0, + "grad_norm": 0.06496680151927305, + "language_loss": 0.86870086, + "learning_rate": 0.0009624448398337637, + "loss": 0.87961924, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.33105469, + "step": 782, + "time_per_iteration": 2.5370984077453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093814, + "balance_loss_mlp": 1.06022096, + "epoch": 0.15063485956136977, + "flos": 762167812608.0, + "grad_norm": 0.05765358341264215, + "language_loss": 0.89159006, + "learning_rate": 0.0009623262906084984, + "loss": 0.90252817, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.33618164, + "step": 783, + "time_per_iteration": 3.005157709121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099941, + "balance_loss_mlp": 1.06773031, + "epoch": 0.15082724124663333, + "flos": 497369687040.0, + "grad_norm": 0.06003141928684199, + "language_loss": 0.90186155, + "learning_rate": 0.0009622075618922486, + "loss": 0.91286093, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.32202148, + "step": 784, + "time_per_iteration": 2.660804510116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093154, + "balance_loss_mlp": 1.06142032, + "epoch": 0.15101962293189689, + "flos": 509476621824.0, + "grad_norm": 0.06057287359381707, + "language_loss": 0.86789852, + "learning_rate": 0.0009620886537311091, + "loss": 0.87883008, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.31713867, + "step": 785, + "time_per_iteration": 2.6273694038391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095369, + "balance_loss_mlp": 1.06210947, + "epoch": 0.15121200461716044, + "flos": 457520638464.0, + "grad_norm": 0.08138425523138582, + "language_loss": 0.84774673, + "learning_rate": 0.000961969566171244, + "loss": 0.85870039, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.33276367, + "step": 786, + "time_per_iteration": 2.5942111015319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095064, + "balance_loss_mlp": 1.06223416, + "epoch": 0.151404386302424, + "flos": 537729477120.0, + "grad_norm": 0.07863928657369654, + "language_loss": 0.90186292, + "learning_rate": 0.0009618502992588873, + "loss": 0.9128136, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.32836914, + "step": 787, + "time_per_iteration": 2.619929790496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091478, + "balance_loss_mlp": 1.05955386, + "epoch": 0.15159676798768756, + "flos": 687858891264.0, + "grad_norm": 0.0744293727729202, + "language_loss": 0.88114512, + "learning_rate": 0.0009617308530403424, + "loss": 0.89205992, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.3190918, + "step": 788, + "time_per_iteration": 2.9888041019439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093086, + "balance_loss_mlp": 1.0604943, + "epoch": 0.15178914967295112, + "flos": 545042193408.0, + "grad_norm": 0.06582928588586826, + "language_loss": 0.87262332, + "learning_rate": 0.0009616112275619825, + "loss": 0.8835541, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.32592773, + "step": 789, + "time_per_iteration": 2.7160654067993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099005, + "balance_loss_mlp": 1.0666275, + "epoch": 0.1519815313582147, + "flos": 511510671360.0, + "grad_norm": 0.05890477263154721, + "language_loss": 0.83453441, + "learning_rate": 0.0009614914228702503, + "loss": 0.84552449, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.32373047, + "step": 790, + "time_per_iteration": 2.67269229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106158, + "balance_loss_mlp": 1.07342279, + "epoch": 0.15217391304347827, + "flos": 683747122176.0, + "grad_norm": 0.05177473030839046, + "language_loss": 0.88909948, + "learning_rate": 0.0009613714390116581, + "loss": 0.90016103, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.32739258, + "step": 791, + "time_per_iteration": 2.978431224822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104946, + "balance_loss_mlp": 1.07304585, + "epoch": 0.15236629472874183, + "flos": 643873342464.0, + "grad_norm": 0.07017768347884551, + "language_loss": 0.8558737, + "learning_rate": 0.0009612512760327879, + "loss": 0.86692309, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.31884766, + "step": 792, + "time_per_iteration": 2.854128837585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108697, + "balance_loss_mlp": 1.07562804, + "epoch": 0.1525586764140054, + "flos": 412654791168.0, + "grad_norm": 0.06359759833531073, + "language_loss": 0.84205759, + "learning_rate": 0.0009611309339802909, + "loss": 0.85314453, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.33081055, + "step": 793, + "time_per_iteration": 2.46451997756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108319, + "balance_loss_mlp": 1.07510698, + "epoch": 0.15275105809926895, + "flos": 802444644864.0, + "grad_norm": 0.051071876240168755, + "language_loss": 0.84049302, + "learning_rate": 0.0009610104129008881, + "loss": 0.85157621, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.33227539, + "step": 794, + "time_per_iteration": 3.111494541168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011005, + "balance_loss_mlp": 1.06836164, + "epoch": 0.1529434397845325, + "flos": 612143115264.0, + "grad_norm": 0.06279651541206067, + "language_loss": 0.88408649, + "learning_rate": 0.0009608897128413701, + "loss": 0.89509147, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.32128906, + "step": 795, + "time_per_iteration": 2.7248153686523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103807, + "balance_loss_mlp": 1.07121563, + "epoch": 0.15313582146979607, + "flos": 614941009920.0, + "grad_norm": 0.04889604688954522, + "language_loss": 0.85449052, + "learning_rate": 0.0009607688338485965, + "loss": 0.86552852, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.32592773, + "step": 796, + "time_per_iteration": 2.8646762371063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100645, + "balance_loss_mlp": 1.06731439, + "epoch": 0.15332820315505963, + "flos": 793256440320.0, + "grad_norm": 0.057433682914461805, + "language_loss": 0.90353924, + "learning_rate": 0.0009606477759694969, + "loss": 0.91454566, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.33349609, + "step": 797, + "time_per_iteration": 3.0346486568450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108023, + "balance_loss_mlp": 1.0744772, + "epoch": 0.1535205848403232, + "flos": 549945510912.0, + "grad_norm": 0.08021572729531513, + "language_loss": 0.87206727, + "learning_rate": 0.0009605265392510703, + "loss": 0.88314748, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.33544922, + "step": 798, + "time_per_iteration": 2.6084530353546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097421, + "balance_loss_mlp": 1.065521, + "epoch": 0.15371296652558677, + "flos": 535691045376.0, + "grad_norm": 0.06650858832922667, + "language_loss": 0.91961598, + "learning_rate": 0.0009604051237403846, + "loss": 0.93059021, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.31884766, + "step": 799, + "time_per_iteration": 2.629930019378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111802, + "balance_loss_mlp": 1.07951975, + "epoch": 0.15390534821085033, + "flos": 395002939392.0, + "grad_norm": 0.12724142526344331, + "language_loss": 0.85673767, + "learning_rate": 0.0009602835294845776, + "loss": 0.86785567, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.32275391, + "step": 800, + "time_per_iteration": 2.4388976097106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116786, + "balance_loss_mlp": 1.08374119, + "epoch": 0.1540977298961139, + "flos": 535587738624.0, + "grad_norm": 0.06962057985754792, + "language_loss": 0.9036696, + "learning_rate": 0.0009601617565308565, + "loss": 0.91483742, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.33056641, + "step": 801, + "time_per_iteration": 2.6220815181732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112115, + "balance_loss_mlp": 1.08829629, + "epoch": 0.15429011158137745, + "flos": 723388147200.0, + "grad_norm": 0.07662224573984003, + "language_loss": 0.86584908, + "learning_rate": 0.0009600398049264977, + "loss": 0.87706065, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.32861328, + "step": 802, + "time_per_iteration": 2.9767894744873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122446, + "balance_loss_mlp": 1.08870947, + "epoch": 0.154482493266641, + "flos": 620209502208.0, + "grad_norm": 0.07007784052810237, + "language_loss": 0.91261709, + "learning_rate": 0.0009599176747188469, + "loss": 0.9238416, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.33764648, + "step": 803, + "time_per_iteration": 2.8329989910125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105402, + "balance_loss_mlp": 1.07242846, + "epoch": 0.15467487495190457, + "flos": 525351909888.0, + "grad_norm": 0.06284855896117353, + "language_loss": 0.82565022, + "learning_rate": 0.0009597953659553196, + "loss": 0.83670425, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.32983398, + "step": 804, + "time_per_iteration": 2.6918182373046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101021, + "balance_loss_mlp": 1.06814265, + "epoch": 0.15486725663716813, + "flos": 527473299456.0, + "grad_norm": 0.06479523616705579, + "language_loss": 0.88566583, + "learning_rate": 0.0009596728786833997, + "loss": 0.89667606, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.32885742, + "step": 805, + "time_per_iteration": 2.609287977218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110234, + "balance_loss_mlp": 1.06829393, + "epoch": 0.1550596383224317, + "flos": 1048118482944.0, + "grad_norm": 0.07111390229237131, + "language_loss": 0.89488924, + "learning_rate": 0.0009595502129506415, + "loss": 0.90591264, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.34082031, + "step": 806, + "time_per_iteration": 3.403404951095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096653, + "balance_loss_mlp": 1.0634892, + "epoch": 0.15525202000769528, + "flos": 613438050816.0, + "grad_norm": 0.08216570532607727, + "language_loss": 0.82236785, + "learning_rate": 0.0009594273688046678, + "loss": 0.83333433, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.33178711, + "step": 807, + "time_per_iteration": 2.7215962409973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093065, + "balance_loss_mlp": 1.05968678, + "epoch": 0.15544440169295884, + "flos": 532805810688.0, + "grad_norm": 0.06904253720821768, + "language_loss": 0.85279024, + "learning_rate": 0.000959304346293171, + "loss": 0.86372089, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.33398438, + "step": 808, + "time_per_iteration": 2.6801698207855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098465, + "balance_loss_mlp": 1.06661189, + "epoch": 0.1556367833782224, + "flos": 644433546240.0, + "grad_norm": 0.09111957868284204, + "language_loss": 0.87858826, + "learning_rate": 0.0009591811454639125, + "loss": 0.88957286, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.31835938, + "step": 809, + "time_per_iteration": 2.7565882205963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094758, + "balance_loss_mlp": 1.06187963, + "epoch": 0.15582916506348596, + "flos": 543540644352.0, + "grad_norm": 0.06649225570292959, + "language_loss": 0.87746191, + "learning_rate": 0.0009590577663647234, + "loss": 0.8884095, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.32885742, + "step": 810, + "time_per_iteration": 2.7346410751342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106143, + "balance_loss_mlp": 1.07233548, + "epoch": 0.15602154674874952, + "flos": 579740613120.0, + "grad_norm": 0.0619187082363415, + "language_loss": 0.85968214, + "learning_rate": 0.0009589342090435036, + "loss": 0.87074351, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.33837891, + "step": 811, + "time_per_iteration": 2.771869421005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114592, + "balance_loss_mlp": 1.08226287, + "epoch": 0.15621392843401308, + "flos": 534982454784.0, + "grad_norm": 0.07419416671079432, + "language_loss": 0.87060148, + "learning_rate": 0.0009588104735482223, + "loss": 0.88174742, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.32324219, + "step": 812, + "time_per_iteration": 2.6792666912078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122998, + "balance_loss_mlp": 1.09007227, + "epoch": 0.15640631011927664, + "flos": 550635162624.0, + "grad_norm": 0.08530784328603107, + "language_loss": 0.83981705, + "learning_rate": 0.0009586865599269177, + "loss": 0.85104704, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.3293457, + "step": 813, + "time_per_iteration": 2.6273813247680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122852, + "balance_loss_mlp": 1.09109521, + "epoch": 0.1565986918045402, + "flos": 637190641152.0, + "grad_norm": 0.09596754940168085, + "language_loss": 0.88191104, + "learning_rate": 0.0009585624682276977, + "loss": 0.8931396, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.31738281, + "step": 814, + "time_per_iteration": 2.7389183044433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114804, + "balance_loss_mlp": 1.08361948, + "epoch": 0.15679107348980378, + "flos": 490569122304.0, + "grad_norm": 0.07403121037751308, + "language_loss": 0.87196732, + "learning_rate": 0.0009584381984987386, + "loss": 0.88311541, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.31152344, + "step": 815, + "time_per_iteration": 2.588653802871704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118789, + "balance_loss_mlp": 1.0867933, + "epoch": 0.15698345517506734, + "flos": 529689231360.0, + "grad_norm": 0.05796420471157715, + "language_loss": 0.89563668, + "learning_rate": 0.0009583137507882864, + "loss": 0.90682459, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.31982422, + "step": 816, + "time_per_iteration": 2.6771223545074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120947, + "balance_loss_mlp": 1.08945227, + "epoch": 0.1571758368603309, + "flos": 545779897344.0, + "grad_norm": 0.06695321751464198, + "language_loss": 0.80875123, + "learning_rate": 0.000958189125144656, + "loss": 0.81996059, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.31469727, + "step": 817, + "time_per_iteration": 2.648407220840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142778, + "balance_loss_mlp": 1.11125922, + "epoch": 0.15736821854559446, + "flos": 565377048576.0, + "grad_norm": 0.07474790639920047, + "language_loss": 0.87800574, + "learning_rate": 0.0009580643216162313, + "loss": 0.8894335, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.31494141, + "step": 818, + "time_per_iteration": 2.663799285888672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140784, + "balance_loss_mlp": 1.10940814, + "epoch": 0.15756060023085802, + "flos": 500707436544.0, + "grad_norm": 0.10531827445817923, + "language_loss": 0.79636216, + "learning_rate": 0.0009579393402514652, + "loss": 0.80777001, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.31347656, + "step": 819, + "time_per_iteration": 2.5795977115631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128459, + "balance_loss_mlp": 1.09617746, + "epoch": 0.15775298191612158, + "flos": 519014034432.0, + "grad_norm": 0.06561760213255555, + "language_loss": 0.90222132, + "learning_rate": 0.0009578141810988801, + "loss": 0.91350597, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.32275391, + "step": 820, + "time_per_iteration": 2.6019015312194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120432, + "balance_loss_mlp": 1.08807814, + "epoch": 0.15794536360138514, + "flos": 465891153408.0, + "grad_norm": 0.07003821866302876, + "language_loss": 0.90498698, + "learning_rate": 0.0009576888442070668, + "loss": 0.91619134, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.32348633, + "step": 821, + "time_per_iteration": 2.5933666229248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109361, + "balance_loss_mlp": 1.07614923, + "epoch": 0.1581377452866487, + "flos": 516911583744.0, + "grad_norm": 0.06959801001512317, + "language_loss": 0.92461467, + "learning_rate": 0.0009575633296246854, + "loss": 0.93570817, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.33227539, + "step": 822, + "time_per_iteration": 2.584195375442505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104383, + "balance_loss_mlp": 1.07198191, + "epoch": 0.15833012697191226, + "flos": 549522109440.0, + "grad_norm": 0.0738821286657961, + "language_loss": 0.82797432, + "learning_rate": 0.0009574376374004652, + "loss": 0.83901811, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.32397461, + "step": 823, + "time_per_iteration": 2.6445696353912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099566, + "balance_loss_mlp": 1.0669024, + "epoch": 0.15852250865717585, + "flos": 487206641664.0, + "grad_norm": 0.07930768625104477, + "language_loss": 0.8015238, + "learning_rate": 0.000957311767583204, + "loss": 0.81251943, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.32666016, + "step": 824, + "time_per_iteration": 2.590190887451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284074, + "balance_loss_mlp": 1.26194882, + "epoch": 0.1587148903424394, + "flos": 1309041672192.0, + "grad_norm": 0.06857459467376774, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83355665, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.22167969, + "step": 825, + "time_per_iteration": 4.729644060134888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091191, + "balance_loss_mlp": 1.05766964, + "epoch": 0.15890727202770297, + "flos": 466634649600.0, + "grad_norm": 0.10530356830759573, + "language_loss": 0.91383988, + "learning_rate": 0.0009570594953650961, + "loss": 0.92475176, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.33544922, + "step": 826, + "time_per_iteration": 2.5222439765930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099421, + "balance_loss_mlp": 1.06580353, + "epoch": 0.15909965371296653, + "flos": 776733608448.0, + "grad_norm": 0.07312615216486826, + "language_loss": 0.80215907, + "learning_rate": 0.00095693309306219, + "loss": 0.81315327, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.33642578, + "step": 827, + "time_per_iteration": 3.104602098464966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091547, + "balance_loss_mlp": 1.0577873, + "epoch": 0.1592920353982301, + "flos": 1077852538368.0, + "grad_norm": 0.06629059991756085, + "language_loss": 0.87921345, + "learning_rate": 0.0009568065133621244, + "loss": 0.89012897, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.33789062, + "step": 828, + "time_per_iteration": 3.349937915802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088059, + "balance_loss_mlp": 1.05324984, + "epoch": 0.15948441708349365, + "flos": 725307305472.0, + "grad_norm": 0.06785059542129762, + "language_loss": 0.84638405, + "learning_rate": 0.0009566797563140422, + "loss": 0.85726464, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.34863281, + "step": 829, + "time_per_iteration": 2.883561849594116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096047, + "balance_loss_mlp": 1.06085658, + "epoch": 0.1596767987687572, + "flos": 578447087616.0, + "grad_norm": 0.06369088806732512, + "language_loss": 0.87693489, + "learning_rate": 0.0009565528219671547, + "loss": 0.88789535, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.35229492, + "step": 830, + "time_per_iteration": 2.929800510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098337, + "balance_loss_mlp": 1.06412435, + "epoch": 0.15986918045402077, + "flos": 528728947200.0, + "grad_norm": 0.06081537703934319, + "language_loss": 0.84958434, + "learning_rate": 0.0009564257103707418, + "loss": 0.86056769, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.3425293, + "step": 831, + "time_per_iteration": 2.631542444229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105374, + "balance_loss_mlp": 1.0715903, + "epoch": 0.16006156213928435, + "flos": 574313559552.0, + "grad_norm": 0.06950481232518824, + "language_loss": 0.91362834, + "learning_rate": 0.0009562984215741533, + "loss": 0.92468208, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.33789062, + "step": 832, + "time_per_iteration": 2.669194459915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093997, + "balance_loss_mlp": 1.05973649, + "epoch": 0.1602539438245479, + "flos": 515258675712.0, + "grad_norm": 0.06093058452920847, + "language_loss": 0.82276815, + "learning_rate": 0.0009561709556268065, + "loss": 0.83370817, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.34301758, + "step": 833, + "time_per_iteration": 2.747171401977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096504, + "balance_loss_mlp": 1.06298196, + "epoch": 0.16044632550981147, + "flos": 620730418176.0, + "grad_norm": 0.09598386402958035, + "language_loss": 0.93858409, + "learning_rate": 0.0009560433125781884, + "loss": 0.9495492, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.33544922, + "step": 834, + "time_per_iteration": 2.7381722927093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090718, + "balance_loss_mlp": 1.05645716, + "epoch": 0.16063870719507503, + "flos": 560817146880.0, + "grad_norm": 0.06748577773497036, + "language_loss": 0.92278147, + "learning_rate": 0.0009559154924778544, + "loss": 0.93368864, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.34301758, + "step": 835, + "time_per_iteration": 2.7790255546569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079826, + "balance_loss_mlp": 1.04625726, + "epoch": 0.1608310888803386, + "flos": 804778440192.0, + "grad_norm": 0.07378429569225692, + "language_loss": 0.85029173, + "learning_rate": 0.0009557874953754284, + "loss": 0.86109, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.33569336, + "step": 836, + "time_per_iteration": 3.0223195552825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082807, + "balance_loss_mlp": 1.04883218, + "epoch": 0.16102347056560215, + "flos": 600311195136.0, + "grad_norm": 0.08025480036652383, + "language_loss": 0.83386606, + "learning_rate": 0.0009556593213206038, + "loss": 0.84469414, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.34008789, + "step": 837, + "time_per_iteration": 2.7436904907226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086966, + "balance_loss_mlp": 1.05318165, + "epoch": 0.1612158522508657, + "flos": 553235208192.0, + "grad_norm": 0.0690426934286745, + "language_loss": 0.87355983, + "learning_rate": 0.0009555309703631414, + "loss": 0.88442945, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.33813477, + "step": 838, + "time_per_iteration": 2.6828954219818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097306, + "balance_loss_mlp": 1.06364167, + "epoch": 0.16140823393612927, + "flos": 555701423616.0, + "grad_norm": 0.07092577785176474, + "language_loss": 0.87526888, + "learning_rate": 0.0009554024425528722, + "loss": 0.88624191, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.33691406, + "step": 839, + "time_per_iteration": 2.6739652156829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110874, + "balance_loss_mlp": 1.07797241, + "epoch": 0.16160061562139286, + "flos": 543613427712.0, + "grad_norm": 0.09046955561085915, + "language_loss": 0.88719451, + "learning_rate": 0.0009552737379396948, + "loss": 0.89830327, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.32910156, + "step": 840, + "time_per_iteration": 2.63122820854187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110692, + "balance_loss_mlp": 1.07757533, + "epoch": 0.16179299730665642, + "flos": 603590717952.0, + "grad_norm": 0.06735134703819705, + "language_loss": 0.88063818, + "learning_rate": 0.0009551448565735767, + "loss": 0.89174509, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.33129883, + "step": 841, + "time_per_iteration": 2.741941452026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121097, + "balance_loss_mlp": 1.08790874, + "epoch": 0.16198537899191998, + "flos": 786821050368.0, + "grad_norm": 0.06426805463858033, + "language_loss": 0.84472924, + "learning_rate": 0.0009550157985045543, + "loss": 0.85594022, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.33203125, + "step": 842, + "time_per_iteration": 3.045841693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102839, + "balance_loss_mlp": 1.07041371, + "epoch": 0.16217776067718354, + "flos": 519550917120.0, + "grad_norm": 0.06545460719380305, + "language_loss": 0.89229876, + "learning_rate": 0.0009548865637827321, + "loss": 0.90332717, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.32421875, + "step": 843, + "time_per_iteration": 2.6820054054260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100372, + "balance_loss_mlp": 1.06701708, + "epoch": 0.1623701423624471, + "flos": 505015644672.0, + "grad_norm": 0.09211303705947127, + "language_loss": 0.89927554, + "learning_rate": 0.0009547571524582838, + "loss": 0.91027921, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.33374023, + "step": 844, + "time_per_iteration": 2.592280149459839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097994, + "balance_loss_mlp": 1.06587958, + "epoch": 0.16256252404771065, + "flos": 496940493312.0, + "grad_norm": 0.07125004392928289, + "language_loss": 0.91891497, + "learning_rate": 0.0009546275645814512, + "loss": 0.92989492, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.32104492, + "step": 845, + "time_per_iteration": 2.6273765563964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097571, + "balance_loss_mlp": 1.06531262, + "epoch": 0.16275490573297421, + "flos": 502110061056.0, + "grad_norm": 0.07293740056217544, + "language_loss": 0.89635444, + "learning_rate": 0.0009544978002025446, + "loss": 0.90733016, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.32250977, + "step": 846, + "time_per_iteration": 2.5906271934509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091646, + "balance_loss_mlp": 1.05821955, + "epoch": 0.16294728741823777, + "flos": 506952179712.0, + "grad_norm": 0.052168896342380144, + "language_loss": 0.86807543, + "learning_rate": 0.0009543678593719434, + "loss": 0.8789919, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.33447266, + "step": 847, + "time_per_iteration": 2.747469186782837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098148, + "balance_loss_mlp": 1.06510353, + "epoch": 0.16313966910350133, + "flos": 509418395136.0, + "grad_norm": 0.05056297173362441, + "language_loss": 0.87167078, + "learning_rate": 0.0009542377421400945, + "loss": 0.88265228, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.33056641, + "step": 848, + "time_per_iteration": 2.7777974605560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102364, + "balance_loss_mlp": 1.06950974, + "epoch": 0.16333205078876492, + "flos": 543712352256.0, + "grad_norm": 0.06627324615029867, + "language_loss": 0.83542728, + "learning_rate": 0.0009541074485575145, + "loss": 0.84645092, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.32861328, + "step": 849, + "time_per_iteration": 2.7575085163116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105099, + "balance_loss_mlp": 1.07288873, + "epoch": 0.16352443247402848, + "flos": 507477477888.0, + "grad_norm": 0.05751037996071174, + "language_loss": 0.9190414, + "learning_rate": 0.0009539769786747874, + "loss": 0.93009233, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.32202148, + "step": 850, + "time_per_iteration": 2.6389074325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109592, + "balance_loss_mlp": 1.06261301, + "epoch": 0.16371681415929204, + "flos": 541851420672.0, + "grad_norm": 0.07235435681682932, + "language_loss": 0.81106341, + "learning_rate": 0.0009538463325425665, + "loss": 0.82202262, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.33325195, + "step": 851, + "time_per_iteration": 2.7013468742370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109998, + "balance_loss_mlp": 1.06695926, + "epoch": 0.1639091958445556, + "flos": 520501026816.0, + "grad_norm": 0.07286475265539226, + "language_loss": 0.86075503, + "learning_rate": 0.0009537155102115728, + "loss": 0.87175477, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.33032227, + "step": 852, + "time_per_iteration": 2.5927765369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089138, + "balance_loss_mlp": 1.05668926, + "epoch": 0.16410157752981916, + "flos": 547149026304.0, + "grad_norm": 0.07079739805294577, + "language_loss": 0.83340597, + "learning_rate": 0.0009535845117325961, + "loss": 0.84429741, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.32446289, + "step": 853, + "time_per_iteration": 2.6400251388549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090734, + "balance_loss_mlp": 1.05780828, + "epoch": 0.16429395921508272, + "flos": 582561828864.0, + "grad_norm": 0.055390341552487656, + "language_loss": 0.93137228, + "learning_rate": 0.0009534533371564946, + "loss": 0.9422797, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.3293457, + "step": 854, + "time_per_iteration": 2.794569492340088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097604, + "balance_loss_mlp": 1.06424975, + "epoch": 0.16448634090034628, + "flos": 530678628864.0, + "grad_norm": 0.07789269087805807, + "language_loss": 0.88390946, + "learning_rate": 0.0009533219865341949, + "loss": 0.89488548, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.33374023, + "step": 855, + "time_per_iteration": 2.5882935523986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110169, + "balance_loss_mlp": 1.07721937, + "epoch": 0.16467872258560984, + "flos": 491623948800.0, + "grad_norm": 0.07176827599451206, + "language_loss": 0.85993397, + "learning_rate": 0.0009531904599166916, + "loss": 0.87103564, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.32958984, + "step": 856, + "time_per_iteration": 2.6384060382843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108589, + "balance_loss_mlp": 1.07585454, + "epoch": 0.16487110427087343, + "flos": 506015216640.0, + "grad_norm": 0.08966352124388614, + "language_loss": 0.84823519, + "learning_rate": 0.0009530587573550478, + "loss": 0.85932112, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.32739258, + "step": 857, + "time_per_iteration": 2.6009740829467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139199, + "balance_loss_mlp": 1.11554801, + "epoch": 0.16506348595613698, + "flos": 1432006553088.0, + "grad_norm": 0.0480168233011906, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75458586, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.23632812, + "step": 858, + "time_per_iteration": 5.006503105163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108022, + "balance_loss_mlp": 1.07712269, + "epoch": 0.16525586764140054, + "flos": 476890827264.0, + "grad_norm": 0.08332018813054971, + "language_loss": 0.89907712, + "learning_rate": 0.0009527948246039337, + "loss": 0.91015732, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.30859375, + "step": 859, + "time_per_iteration": 2.5502097606658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113676, + "balance_loss_mlp": 1.08313441, + "epoch": 0.1654482493266641, + "flos": 880737297408.0, + "grad_norm": 0.06488618871597049, + "language_loss": 0.87213862, + "learning_rate": 0.000952662594516931, + "loss": 0.88327539, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.30493164, + "step": 860, + "time_per_iteration": 3.091632604598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112348, + "balance_loss_mlp": 1.08059049, + "epoch": 0.16564063101192766, + "flos": 626527028736.0, + "grad_norm": 0.18119016536128274, + "language_loss": 0.86193782, + "learning_rate": 0.0009525301886907234, + "loss": 0.8730613, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.31738281, + "step": 861, + "time_per_iteration": 2.8586955070495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115106, + "balance_loss_mlp": 1.08372974, + "epoch": 0.16583301269719122, + "flos": 561250722816.0, + "grad_norm": 0.06494583254435107, + "language_loss": 0.87565315, + "learning_rate": 0.0009523976071767155, + "loss": 0.88680422, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.31347656, + "step": 862, + "time_per_iteration": 2.6474006175994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113562, + "balance_loss_mlp": 1.08228135, + "epoch": 0.16602539438245478, + "flos": 567510022656.0, + "grad_norm": 0.05844730537287504, + "language_loss": 0.87850058, + "learning_rate": 0.00095226485002638, + "loss": 0.88963622, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.3125, + "step": 863, + "time_per_iteration": 2.7738211154937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101227, + "balance_loss_mlp": 1.06894565, + "epoch": 0.16621777606771834, + "flos": 574589984256.0, + "grad_norm": 0.05720313452307963, + "language_loss": 0.88969022, + "learning_rate": 0.0009521319172912576, + "loss": 0.90070248, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.32275391, + "step": 864, + "time_per_iteration": 2.762932538986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108698, + "balance_loss_mlp": 1.07624936, + "epoch": 0.16641015775298193, + "flos": 514292599296.0, + "grad_norm": 0.0631928299213439, + "language_loss": 0.94547617, + "learning_rate": 0.0009519988090229579, + "loss": 0.95656317, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.32446289, + "step": 865, + "time_per_iteration": 2.672088384628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105777, + "balance_loss_mlp": 1.07332826, + "epoch": 0.1666025394382455, + "flos": 621395338752.0, + "grad_norm": 0.06928181027356142, + "language_loss": 0.87572587, + "learning_rate": 0.0009518655252731576, + "loss": 0.8867836, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.32446289, + "step": 866, + "time_per_iteration": 2.754418134689331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104109, + "balance_loss_mlp": 1.07049167, + "epoch": 0.16679492112350905, + "flos": 548528329728.0, + "grad_norm": 0.059497633162238536, + "language_loss": 0.90014684, + "learning_rate": 0.0009517320660936022, + "loss": 0.91118789, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.33642578, + "step": 867, + "time_per_iteration": 2.73579478263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103888, + "balance_loss_mlp": 1.07117677, + "epoch": 0.1669873028087726, + "flos": 665379477504.0, + "grad_norm": 0.06138762269806642, + "language_loss": 0.82812411, + "learning_rate": 0.0009515984315361051, + "loss": 0.83916301, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.32714844, + "step": 868, + "time_per_iteration": 2.7929019927978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102635, + "balance_loss_mlp": 1.07016206, + "epoch": 0.16717968449403617, + "flos": 538305647616.0, + "grad_norm": 0.07711570113555911, + "language_loss": 0.8657794, + "learning_rate": 0.000951464621652548, + "loss": 0.87680572, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.32470703, + "step": 869, + "time_per_iteration": 2.6135518550872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105976, + "balance_loss_mlp": 1.07381344, + "epoch": 0.16737206617929973, + "flos": 529833235968.0, + "grad_norm": 0.07032317085354448, + "language_loss": 0.78791183, + "learning_rate": 0.0009513306364948804, + "loss": 0.79897159, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.3215332, + "step": 870, + "time_per_iteration": 2.7745420932769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101249, + "balance_loss_mlp": 1.06949186, + "epoch": 0.1675644478645633, + "flos": 480529732608.0, + "grad_norm": 0.0706094790942469, + "language_loss": 0.88557035, + "learning_rate": 0.0009511964761151197, + "loss": 0.89658284, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.31738281, + "step": 871, + "time_per_iteration": 2.570082902908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112728, + "balance_loss_mlp": 1.08147156, + "epoch": 0.16775682954982685, + "flos": 494311334400.0, + "grad_norm": 0.06741449701936619, + "language_loss": 0.90011156, + "learning_rate": 0.0009510621405653521, + "loss": 0.91123885, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.31225586, + "step": 872, + "time_per_iteration": 2.5378525257110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098432, + "balance_loss_mlp": 1.06860542, + "epoch": 0.1679492112350904, + "flos": 751694846976.0, + "grad_norm": 0.07031527693840728, + "language_loss": 0.8401826, + "learning_rate": 0.0009509276298977309, + "loss": 0.85116696, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.29760742, + "step": 873, + "time_per_iteration": 2.9614696502685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102853, + "balance_loss_mlp": 1.07121444, + "epoch": 0.168141592920354, + "flos": 1135413075456.0, + "grad_norm": 0.07037881289732177, + "language_loss": 0.8146044, + "learning_rate": 0.0009507929441644778, + "loss": 0.82563293, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.31616211, + "step": 874, + "time_per_iteration": 3.5029537677764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105025, + "balance_loss_mlp": 1.07403064, + "epoch": 0.16833397460561755, + "flos": 632114205696.0, + "grad_norm": 0.07204378854359271, + "language_loss": 0.8568964, + "learning_rate": 0.0009506580834178826, + "loss": 0.86794662, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.30957031, + "step": 875, + "time_per_iteration": 2.738445281982422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105989, + "balance_loss_mlp": 1.07420754, + "epoch": 0.1685263562908811, + "flos": 541171943424.0, + "grad_norm": 0.06279104396907492, + "language_loss": 0.91300583, + "learning_rate": 0.0009505230477103028, + "loss": 0.92406577, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.31762695, + "step": 876, + "time_per_iteration": 2.7304844856262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121558, + "balance_loss_mlp": 1.0900147, + "epoch": 0.16871873797614467, + "flos": 619036812288.0, + "grad_norm": 0.07749651336428325, + "language_loss": 0.81126654, + "learning_rate": 0.0009503878370941641, + "loss": 0.82248211, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.31518555, + "step": 877, + "time_per_iteration": 2.7332048416137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121651, + "balance_loss_mlp": 1.09063232, + "epoch": 0.16891111966140823, + "flos": 606067107840.0, + "grad_norm": 0.08158970109830238, + "language_loss": 0.88660848, + "learning_rate": 0.0009502524516219595, + "loss": 0.897825, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.30981445, + "step": 878, + "time_per_iteration": 2.810194730758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120277, + "balance_loss_mlp": 1.08942604, + "epoch": 0.1691035013466718, + "flos": 552058136064.0, + "grad_norm": 0.08439254905993104, + "language_loss": 0.89592326, + "learning_rate": 0.0009501168913462506, + "loss": 0.90712607, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.30810547, + "step": 879, + "time_per_iteration": 2.6550278663635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181395, + "balance_loss_mlp": 1.15822113, + "epoch": 0.16929588303193535, + "flos": 1475544121344.0, + "grad_norm": 0.05511344701971209, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80303323, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.23144531, + "step": 880, + "time_per_iteration": 4.798918962478638 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120202, + "balance_loss_mlp": 1.08894515, + "epoch": 0.1694882647171989, + "flos": 925850456064.0, + "grad_norm": 0.05479331137197536, + "language_loss": 0.85038209, + "learning_rate": 0.0009498452465949042, + "loss": 0.86158419, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.31225586, + "step": 881, + "time_per_iteration": 3.2795042991638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114024, + "balance_loss_mlp": 1.08295763, + "epoch": 0.1696806464024625, + "flos": 545829359616.0, + "grad_norm": 0.06005284109203957, + "language_loss": 0.91010857, + "learning_rate": 0.0009497091622247285, + "loss": 0.92124879, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.31030273, + "step": 882, + "time_per_iteration": 2.741497755050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114536, + "balance_loss_mlp": 1.0833751, + "epoch": 0.16987302808772606, + "flos": 528970466304.0, + "grad_norm": 0.08668021784836823, + "language_loss": 0.9325586, + "learning_rate": 0.0009495729032619723, + "loss": 0.94370389, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.3112793, + "step": 883, + "time_per_iteration": 2.6621923446655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102424, + "balance_loss_mlp": 1.07035685, + "epoch": 0.17006540977298962, + "flos": 754855096320.0, + "grad_norm": 0.06301404020698688, + "language_loss": 0.84119958, + "learning_rate": 0.0009494364697595354, + "loss": 0.85222387, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.32055664, + "step": 884, + "time_per_iteration": 2.8904953002929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102364, + "balance_loss_mlp": 1.07022548, + "epoch": 0.17025779145825318, + "flos": 558532813824.0, + "grad_norm": 0.06367673921209963, + "language_loss": 0.89062482, + "learning_rate": 0.0009492998617703867, + "loss": 0.9016484, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.32128906, + "step": 885, + "time_per_iteration": 2.65053129196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088714, + "balance_loss_mlp": 1.05779076, + "epoch": 0.17045017314351674, + "flos": 511963186176.0, + "grad_norm": 0.06771442044112419, + "language_loss": 0.87296236, + "learning_rate": 0.0009491630793475619, + "loss": 0.88384956, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.30908203, + "step": 886, + "time_per_iteration": 2.601238965988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095605, + "balance_loss_mlp": 1.06346607, + "epoch": 0.1706425548287803, + "flos": 508674898944.0, + "grad_norm": 0.064396115452368, + "language_loss": 0.85120332, + "learning_rate": 0.0009490261225441643, + "loss": 0.86215937, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.32128906, + "step": 887, + "time_per_iteration": 2.865694999694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089924, + "balance_loss_mlp": 1.05797613, + "epoch": 0.17083493651404386, + "flos": 717016776192.0, + "grad_norm": 0.06834327453619109, + "language_loss": 0.90091348, + "learning_rate": 0.0009488889914133656, + "loss": 0.91181278, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.31933594, + "step": 888, + "time_per_iteration": 3.0129144191741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092844, + "balance_loss_mlp": 1.06077635, + "epoch": 0.17102731819930742, + "flos": 558852908544.0, + "grad_norm": 0.06591248507341309, + "language_loss": 0.88667148, + "learning_rate": 0.0009487516860084047, + "loss": 0.89759994, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.32055664, + "step": 889, + "time_per_iteration": 2.738736867904663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087561, + "balance_loss_mlp": 1.05644727, + "epoch": 0.17121969988457098, + "flos": 494542679040.0, + "grad_norm": 0.07350534216298948, + "language_loss": 0.88845301, + "learning_rate": 0.0009486142063825884, + "loss": 0.89932865, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.31079102, + "step": 890, + "time_per_iteration": 2.5697011947631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117181, + "balance_loss_mlp": 1.15197396, + "epoch": 0.17141208156983456, + "flos": 1548088063488.0, + "grad_norm": 0.0550236747402086, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73598027, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.19824219, + "step": 891, + "time_per_iteration": 4.955617904663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092709, + "balance_loss_mlp": 1.06119013, + "epoch": 0.17160446325509812, + "flos": 619282713600.0, + "grad_norm": 0.06911805131577382, + "language_loss": 0.9061746, + "learning_rate": 0.0009483387246819542, + "loss": 0.91710162, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.31494141, + "step": 892, + "time_per_iteration": 2.725799798965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121181, + "balance_loss_mlp": 1.10153532, + "epoch": 0.17179684494036168, + "flos": 1381026972672.0, + "grad_norm": 0.032113973586073014, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83406758, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.19628906, + "step": 893, + "time_per_iteration": 4.664165735244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089705, + "balance_loss_mlp": 1.05813849, + "epoch": 0.17198922662562524, + "flos": 492386383872.0, + "grad_norm": 0.0574582553480054, + "language_loss": 0.89272118, + "learning_rate": 0.0009480625467392688, + "loss": 0.90361822, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.31542969, + "step": 894, + "time_per_iteration": 2.637554883956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109509, + "balance_loss_mlp": 1.08910024, + "epoch": 0.1721816083108888, + "flos": 1457529914880.0, + "grad_norm": 0.027611634873128267, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79104185, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.20410156, + "step": 895, + "time_per_iteration": 4.76848030090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088509, + "balance_loss_mlp": 1.05822968, + "epoch": 0.17237398999615236, + "flos": 527853030912.0, + "grad_norm": 0.05350045539937067, + "language_loss": 0.87532026, + "learning_rate": 0.0009477856729834196, + "loss": 0.88620532, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.30249023, + "step": 896, + "time_per_iteration": 2.7219061851501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093646, + "balance_loss_mlp": 1.06267512, + "epoch": 0.17256637168141592, + "flos": 603644562432.0, + "grad_norm": 0.06021872133739316, + "language_loss": 0.89942896, + "learning_rate": 0.0009476469753098809, + "loss": 0.9103654, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.30932617, + "step": 897, + "time_per_iteration": 2.6990017890930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109486, + "balance_loss_mlp": 1.06398487, + "epoch": 0.17275875336667948, + "flos": 509437334016.0, + "grad_norm": 0.072864012804074, + "language_loss": 0.86893761, + "learning_rate": 0.0009475081038443738, + "loss": 0.87988615, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.30834961, + "step": 898, + "time_per_iteration": 2.5972931385040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091914, + "balance_loss_mlp": 1.06030011, + "epoch": 0.17295113505194307, + "flos": 664951693824.0, + "grad_norm": 0.07073516416365672, + "language_loss": 0.85445154, + "learning_rate": 0.0009473690586408124, + "loss": 0.86537069, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.31591797, + "step": 899, + "time_per_iteration": 2.821336507797241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085642, + "balance_loss_mlp": 1.05421829, + "epoch": 0.17314351673720663, + "flos": 555125253120.0, + "grad_norm": 0.061416888012907525, + "language_loss": 0.86083823, + "learning_rate": 0.0009472298397531792, + "loss": 0.87169468, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.31396484, + "step": 900, + "time_per_iteration": 2.7345612049102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090461, + "balance_loss_mlp": 1.058918, + "epoch": 0.17333589842247019, + "flos": 503361326592.0, + "grad_norm": 0.060849230911096945, + "language_loss": 0.86217213, + "learning_rate": 0.0009470904472355235, + "loss": 0.87307668, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.31518555, + "step": 901, + "time_per_iteration": 2.637425661087036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089284, + "balance_loss_mlp": 1.05755067, + "epoch": 0.17352828010773375, + "flos": 555924003840.0, + "grad_norm": 0.07830588235472731, + "language_loss": 0.79847336, + "learning_rate": 0.0009469508811419626, + "loss": 0.80936623, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.31713867, + "step": 902, + "time_per_iteration": 2.70833683013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149006, + "balance_loss_mlp": 1.12678576, + "epoch": 0.1737206617929973, + "flos": 1553711556096.0, + "grad_norm": 0.05917050619752012, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72762835, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.22265625, + "step": 903, + "time_per_iteration": 4.776138782501221 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088102, + "balance_loss_mlp": 1.05562961, + "epoch": 0.17391304347826086, + "flos": 516390667776.0, + "grad_norm": 0.07262085456902109, + "language_loss": 0.83503735, + "learning_rate": 0.0009466712284439292, + "loss": 0.84591836, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.32470703, + "step": 904, + "time_per_iteration": 2.7372140884399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086515, + "balance_loss_mlp": 1.05385172, + "epoch": 0.17410542516352442, + "flos": 540773273088.0, + "grad_norm": 0.09192064511302059, + "language_loss": 0.88356638, + "learning_rate": 0.0009465311419480276, + "loss": 0.89443153, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.32666016, + "step": 905, + "time_per_iteration": 2.677032470703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109277, + "balance_loss_mlp": 1.06036901, + "epoch": 0.17429780684878798, + "flos": 623542869504.0, + "grad_norm": 0.07898220644020008, + "language_loss": 0.88434756, + "learning_rate": 0.0009463908820933622, + "loss": 0.89527524, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.32397461, + "step": 906, + "time_per_iteration": 2.8139841556549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097047, + "balance_loss_mlp": 1.06505144, + "epoch": 0.17449018853405157, + "flos": 575368386048.0, + "grad_norm": 0.0868003192310251, + "language_loss": 0.82122958, + "learning_rate": 0.0009462504489343868, + "loss": 0.83220005, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.31982422, + "step": 907, + "time_per_iteration": 2.8445968627929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103323, + "balance_loss_mlp": 1.07106495, + "epoch": 0.17468257021931513, + "flos": 533499844608.0, + "grad_norm": 0.09920963499058721, + "language_loss": 0.88653374, + "learning_rate": 0.0009461098425256222, + "loss": 0.89756691, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.32250977, + "step": 908, + "time_per_iteration": 2.5947494506835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109512, + "balance_loss_mlp": 1.07784963, + "epoch": 0.1748749519045787, + "flos": 540496848384.0, + "grad_norm": 0.09355765751058653, + "language_loss": 0.86340624, + "learning_rate": 0.0009459690629216567, + "loss": 0.87450135, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.31640625, + "step": 909, + "time_per_iteration": 2.621044874191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112453, + "balance_loss_mlp": 1.08155417, + "epoch": 0.17506733358984225, + "flos": 498373641216.0, + "grad_norm": 0.07034154505215827, + "language_loss": 0.8701601, + "learning_rate": 0.0009458281101771457, + "loss": 0.88128459, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.30859375, + "step": 910, + "time_per_iteration": 2.674091100692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115198, + "balance_loss_mlp": 1.08508539, + "epoch": 0.1752597152751058, + "flos": 622621873152.0, + "grad_norm": 0.09036058743894539, + "language_loss": 0.82642829, + "learning_rate": 0.0009456869843468122, + "loss": 0.83758032, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.30053711, + "step": 911, + "time_per_iteration": 2.830397605895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105257, + "balance_loss_mlp": 1.07378554, + "epoch": 0.17545209696036937, + "flos": 520717814784.0, + "grad_norm": 0.0879185530474863, + "language_loss": 0.78465313, + "learning_rate": 0.0009455456854854459, + "loss": 0.79570568, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.31445312, + "step": 912, + "time_per_iteration": 2.621293067932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102771, + "balance_loss_mlp": 1.07196748, + "epoch": 0.17564447864563293, + "flos": 461750270976.0, + "grad_norm": 0.0647038307980506, + "language_loss": 0.8401655, + "learning_rate": 0.0009454042136479039, + "loss": 0.85119313, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.30786133, + "step": 913, + "time_per_iteration": 2.5675978660583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095649, + "balance_loss_mlp": 1.0655843, + "epoch": 0.1758368603308965, + "flos": 480416251392.0, + "grad_norm": 0.06520052548040499, + "language_loss": 0.82717437, + "learning_rate": 0.0009452625688891103, + "loss": 0.83813089, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.30004883, + "step": 914, + "time_per_iteration": 2.5443828105926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156407, + "balance_loss_mlp": 1.13332844, + "epoch": 0.17602924201616005, + "flos": 1478160133632.0, + "grad_norm": 0.06121421634548094, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79891145, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.23046875, + "step": 915, + "time_per_iteration": 4.5826005935668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117447, + "balance_loss_mlp": 1.08642912, + "epoch": 0.17622162370142364, + "flos": 602010593280.0, + "grad_norm": 0.07309570223890104, + "language_loss": 0.93135887, + "learning_rate": 0.0009449787608278015, + "loss": 0.94253331, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.30981445, + "step": 916, + "time_per_iteration": 2.7787418365478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120355, + "balance_loss_mlp": 1.08926511, + "epoch": 0.1764140053866872, + "flos": 442473214464.0, + "grad_norm": 0.10226900865330964, + "language_loss": 0.92397296, + "learning_rate": 0.0009448365976354704, + "loss": 0.93517655, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.31054688, + "step": 917, + "time_per_iteration": 2.5531399250030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124705, + "balance_loss_mlp": 1.09247112, + "epoch": 0.17660638707195075, + "flos": 500362610688.0, + "grad_norm": 0.07454694115091837, + "language_loss": 0.89785659, + "learning_rate": 0.0009446942617422558, + "loss": 0.90910363, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.32226562, + "step": 918, + "time_per_iteration": 2.583489418029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123972, + "balance_loss_mlp": 1.09250093, + "epoch": 0.17679876875721431, + "flos": 538621360128.0, + "grad_norm": 0.06638545773718021, + "language_loss": 0.85658622, + "learning_rate": 0.0009445517532034176, + "loss": 0.86782598, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.31445312, + "step": 919, + "time_per_iteration": 2.6830263137817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122334, + "balance_loss_mlp": 1.09107733, + "epoch": 0.17699115044247787, + "flos": 497477376000.0, + "grad_norm": 0.09547651267352689, + "language_loss": 0.88907313, + "learning_rate": 0.0009444090720742824, + "loss": 0.90029645, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.31225586, + "step": 920, + "time_per_iteration": 2.5984437465667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123289, + "balance_loss_mlp": 1.09181738, + "epoch": 0.17718353212774143, + "flos": 662444780544.0, + "grad_norm": 0.10483808909193337, + "language_loss": 0.87128365, + "learning_rate": 0.0009442662184102439, + "loss": 0.8825165, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.31445312, + "step": 921, + "time_per_iteration": 2.772568941116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097725, + "balance_loss_mlp": 1.06737399, + "epoch": 0.177375913813005, + "flos": 582340658688.0, + "grad_norm": 0.057071439682559955, + "language_loss": 0.87210095, + "learning_rate": 0.000944123192266763, + "loss": 0.88307822, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.30297852, + "step": 922, + "time_per_iteration": 2.8091742992401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122306, + "balance_loss_mlp": 1.09004784, + "epoch": 0.17756829549826855, + "flos": 552285098496.0, + "grad_norm": 0.07267069192247201, + "language_loss": 0.83557594, + "learning_rate": 0.0009439799936993671, + "loss": 0.84679902, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.32250977, + "step": 923, + "time_per_iteration": 2.7226145267486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147891, + "balance_loss_mlp": 1.11494136, + "epoch": 0.17776067718353214, + "flos": 556060806144.0, + "grad_norm": 0.14883746036090706, + "language_loss": 0.88219315, + "learning_rate": 0.0009438366227636511, + "loss": 0.89367205, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.32958984, + "step": 924, + "time_per_iteration": 2.6409950256347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121758, + "balance_loss_mlp": 1.08950043, + "epoch": 0.1779530588687957, + "flos": 658161303552.0, + "grad_norm": 0.07347120708699749, + "language_loss": 0.85914218, + "learning_rate": 0.0009436930795152763, + "loss": 0.87035978, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.32250977, + "step": 925, + "time_per_iteration": 2.811595916748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107553, + "balance_loss_mlp": 1.07419825, + "epoch": 0.17814544055405926, + "flos": 644187644928.0, + "grad_norm": 0.07224950530739313, + "language_loss": 0.86246336, + "learning_rate": 0.0009435493640099713, + "loss": 0.87353885, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.33374023, + "step": 926, + "time_per_iteration": 2.775090456008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098588, + "balance_loss_mlp": 1.06513751, + "epoch": 0.17833782223932282, + "flos": 460672123392.0, + "grad_norm": 0.06608942550370576, + "language_loss": 0.83981788, + "learning_rate": 0.0009434054763035314, + "loss": 0.85080379, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.3347168, + "step": 927, + "time_per_iteration": 2.621683359146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089168, + "balance_loss_mlp": 1.05559874, + "epoch": 0.17853020392458638, + "flos": 759212766720.0, + "grad_norm": 0.06566794669431841, + "language_loss": 0.85671836, + "learning_rate": 0.0009432614164518185, + "loss": 0.86761004, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.33569336, + "step": 928, + "time_per_iteration": 3.011759042739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108639, + "balance_loss_mlp": 1.05320191, + "epoch": 0.17872258560984994, + "flos": 782320785408.0, + "grad_norm": 0.06622036101375141, + "language_loss": 0.84125841, + "learning_rate": 0.000943117184510762, + "loss": 0.85212231, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.33203125, + "step": 929, + "time_per_iteration": 2.9782960414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166041, + "balance_loss_mlp": 1.14010072, + "epoch": 0.1789149672951135, + "flos": 1459095482880.0, + "grad_norm": 0.044814265222739694, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79956007, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.25976562, + "step": 930, + "time_per_iteration": 5.011061906814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086461, + "balance_loss_mlp": 1.0529635, + "epoch": 0.17910734898037706, + "flos": 503598463488.0, + "grad_norm": 0.09835801245739735, + "language_loss": 0.88482547, + "learning_rate": 0.0009428282045846674, + "loss": 0.89569014, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.33520508, + "step": 931, + "time_per_iteration": 2.700901508331299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084257, + "balance_loss_mlp": 1.04899526, + "epoch": 0.17929973066564064, + "flos": 745895264256.0, + "grad_norm": 0.0790312068568768, + "language_loss": 0.88828444, + "learning_rate": 0.0009426834567118214, + "loss": 0.89912701, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.3527832, + "step": 932, + "time_per_iteration": 3.0847127437591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108852, + "balance_loss_mlp": 1.05557072, + "epoch": 0.1794921123509042, + "flos": 712875893760.0, + "grad_norm": 0.05851377965258845, + "language_loss": 0.80669105, + "learning_rate": 0.0009425385369740155, + "loss": 0.81757629, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.32958984, + "step": 933, + "time_per_iteration": 3.0405056476593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089624, + "balance_loss_mlp": 1.05517268, + "epoch": 0.17968449403616776, + "flos": 632838763008.0, + "grad_norm": 0.08098153489662575, + "language_loss": 0.86808264, + "learning_rate": 0.0009423934454275125, + "loss": 0.87897891, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.34472656, + "step": 934, + "time_per_iteration": 2.832589626312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090759, + "balance_loss_mlp": 1.05683184, + "epoch": 0.17987687572143132, + "flos": 536060602368.0, + "grad_norm": 0.0889712704970151, + "language_loss": 0.91607213, + "learning_rate": 0.0009422481821286418, + "loss": 0.92697972, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.33935547, + "step": 935, + "time_per_iteration": 2.739004611968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099753, + "balance_loss_mlp": 1.06589735, + "epoch": 0.18006925740669488, + "flos": 537818227200.0, + "grad_norm": 0.11621731552094582, + "language_loss": 0.87764728, + "learning_rate": 0.0009421027471337998, + "loss": 0.88864481, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.33886719, + "step": 936, + "time_per_iteration": 2.663978099822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093554, + "balance_loss_mlp": 1.06100953, + "epoch": 0.18026163909195844, + "flos": 539255757312.0, + "grad_norm": 0.08193839025260119, + "language_loss": 0.8197844, + "learning_rate": 0.0009419571404994493, + "loss": 0.83071995, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.32543945, + "step": 937, + "time_per_iteration": 2.680880308151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087323, + "balance_loss_mlp": 1.05427766, + "epoch": 0.180454020777222, + "flos": 500382959616.0, + "grad_norm": 0.08083617156557357, + "language_loss": 0.90250957, + "learning_rate": 0.00094181136228212, + "loss": 0.91338283, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.33056641, + "step": 938, + "time_per_iteration": 2.635734796524048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083703, + "balance_loss_mlp": 1.05134988, + "epoch": 0.18064640246248556, + "flos": 498689353728.0, + "grad_norm": 0.0738614516115471, + "language_loss": 0.85650909, + "learning_rate": 0.0009416654125384077, + "loss": 0.86734617, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.32348633, + "step": 939, + "time_per_iteration": 2.713120460510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092435, + "balance_loss_mlp": 1.06744874, + "epoch": 0.18083878414774912, + "flos": 1518572358144.0, + "grad_norm": 0.04310930319536216, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80864811, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.25, + "step": 940, + "time_per_iteration": 4.928712606430054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084719, + "balance_loss_mlp": 1.05372477, + "epoch": 0.1810311658330127, + "flos": 727006703616.0, + "grad_norm": 0.06379600043785322, + "language_loss": 0.83724225, + "learning_rate": 0.000941372998698552, + "loss": 0.84808946, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.30957031, + "step": 941, + "time_per_iteration": 2.9594616889953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092193, + "balance_loss_mlp": 1.0600785, + "epoch": 0.18122354751827627, + "flos": 564643726848.0, + "grad_norm": 0.07993905082854055, + "language_loss": 0.81844771, + "learning_rate": 0.0009412265347159336, + "loss": 0.82936954, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.32104492, + "step": 942, + "time_per_iteration": 2.705883741378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089966, + "balance_loss_mlp": 1.05847049, + "epoch": 0.18141592920353983, + "flos": 519024208896.0, + "grad_norm": 0.08204750484488939, + "language_loss": 0.84816301, + "learning_rate": 0.0009410798994339829, + "loss": 0.85906267, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.31469727, + "step": 943, + "time_per_iteration": 2.606898546218872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086355, + "balance_loss_mlp": 1.0538584, + "epoch": 0.1816083108888034, + "flos": 512219261952.0, + "grad_norm": 0.06564936273566103, + "language_loss": 0.88176167, + "learning_rate": 0.000940933092909628, + "loss": 0.89262521, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.32495117, + "step": 944, + "time_per_iteration": 2.568862199783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089679, + "balance_loss_mlp": 1.058375, + "epoch": 0.18180069257406695, + "flos": 492144864768.0, + "grad_norm": 0.06967818448900699, + "language_loss": 0.83546078, + "learning_rate": 0.0009407861151998649, + "loss": 0.84635758, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.31274414, + "step": 945, + "time_per_iteration": 2.5752463340759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108574, + "balance_loss_mlp": 1.05338621, + "epoch": 0.1819930742593305, + "flos": 569891870208.0, + "grad_norm": 0.07045774982796042, + "language_loss": 0.86168265, + "learning_rate": 0.0009406389663617552, + "loss": 0.87254012, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.32348633, + "step": 946, + "time_per_iteration": 2.6582529544830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084969, + "balance_loss_mlp": 1.05299747, + "epoch": 0.18218545594459407, + "flos": 605693168640.0, + "grad_norm": 0.08074656744529311, + "language_loss": 0.8540619, + "learning_rate": 0.000940491646452427, + "loss": 0.86491156, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.31958008, + "step": 947, + "time_per_iteration": 2.7117488384246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080922, + "balance_loss_mlp": 1.04914129, + "epoch": 0.18237783762985763, + "flos": 548419230720.0, + "grad_norm": 0.0614528539730692, + "language_loss": 0.90478814, + "learning_rate": 0.000940344155529075, + "loss": 0.91559744, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.31762695, + "step": 948, + "time_per_iteration": 2.675457239151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086239, + "balance_loss_mlp": 1.05472016, + "epoch": 0.1825702193151212, + "flos": 450509078016.0, + "grad_norm": 0.06480396750006864, + "language_loss": 0.8689037, + "learning_rate": 0.0009401964936489605, + "loss": 0.87976611, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.31494141, + "step": 949, + "time_per_iteration": 2.5517518520355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086362, + "balance_loss_mlp": 1.05446136, + "epoch": 0.18276260100038477, + "flos": 588962313216.0, + "grad_norm": 0.07386346522147075, + "language_loss": 0.84915626, + "learning_rate": 0.0009400486608694108, + "loss": 0.86001992, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.31884766, + "step": 950, + "time_per_iteration": 2.744371175765991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089338, + "balance_loss_mlp": 1.05769992, + "epoch": 0.18295498268564833, + "flos": 786988376064.0, + "grad_norm": 0.07193745080732644, + "language_loss": 0.86961377, + "learning_rate": 0.0009399006572478195, + "loss": 0.88050711, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.31616211, + "step": 951, + "time_per_iteration": 3.0956904888153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108625, + "balance_loss_mlp": 1.05427814, + "epoch": 0.1831473643709119, + "flos": 577878271488.0, + "grad_norm": 0.06892976413128309, + "language_loss": 0.90901303, + "learning_rate": 0.0009397524828416468, + "loss": 0.9198755, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.31958008, + "step": 952, + "time_per_iteration": 2.7130446434020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093503, + "balance_loss_mlp": 1.06179333, + "epoch": 0.18333974605617545, + "flos": 566622521856.0, + "grad_norm": 0.06752223069443862, + "language_loss": 0.96249408, + "learning_rate": 0.0009396041377084192, + "loss": 0.97342908, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.31689453, + "step": 953, + "time_per_iteration": 2.66972279548645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101927, + "balance_loss_mlp": 1.07043195, + "epoch": 0.183532127741439, + "flos": 526725421056.0, + "grad_norm": 0.07502219242723109, + "language_loss": 0.87290752, + "learning_rate": 0.0009394556219057295, + "loss": 0.88392681, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.31469727, + "step": 954, + "time_per_iteration": 2.659264326095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109814, + "balance_loss_mlp": 1.07810426, + "epoch": 0.18372450942670257, + "flos": 594259918848.0, + "grad_norm": 0.08651848853121004, + "language_loss": 0.8329587, + "learning_rate": 0.0009393069354912362, + "loss": 0.84405684, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.31689453, + "step": 955, + "time_per_iteration": 2.77437686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111165, + "balance_loss_mlp": 1.080266, + "epoch": 0.18391689111196613, + "flos": 644720145408.0, + "grad_norm": 0.07817657388257933, + "language_loss": 0.82119787, + "learning_rate": 0.0009391580785226649, + "loss": 0.83230954, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.30859375, + "step": 956, + "time_per_iteration": 2.867492437362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094598, + "balance_loss_mlp": 1.06903911, + "epoch": 0.18410927279722972, + "flos": 1456246563840.0, + "grad_norm": 0.05003344342080426, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.8043505, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.25585938, + "step": 957, + "time_per_iteration": 4.762399196624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108233, + "balance_loss_mlp": 1.07757246, + "epoch": 0.18430165448249328, + "flos": 658437728256.0, + "grad_norm": 0.06311489935861506, + "language_loss": 0.86409998, + "learning_rate": 0.0009388598531545196, + "loss": 0.87518233, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.30615234, + "step": 958, + "time_per_iteration": 2.8768551349639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102291, + "balance_loss_mlp": 1.07160664, + "epoch": 0.18449403616775684, + "flos": 517679811072.0, + "grad_norm": 0.07254101069499316, + "language_loss": 0.85046387, + "learning_rate": 0.000938710484870727, + "loss": 0.86148679, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.30639648, + "step": 959, + "time_per_iteration": 2.569592237472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123147, + "balance_loss_mlp": 1.09262919, + "epoch": 0.1846864178530204, + "flos": 552481537536.0, + "grad_norm": 0.07612110690317586, + "language_loss": 0.85695219, + "learning_rate": 0.0009385609462644189, + "loss": 0.86818361, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.3046875, + "step": 960, + "time_per_iteration": 2.6880924701690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127108, + "balance_loss_mlp": 1.09682918, + "epoch": 0.18487879953828396, + "flos": 465930441216.0, + "grad_norm": 0.08874671943740564, + "language_loss": 0.85532272, + "learning_rate": 0.0009384112373936514, + "loss": 0.86659384, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.30249023, + "step": 961, + "time_per_iteration": 2.6328110694885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117939, + "balance_loss_mlp": 1.08651531, + "epoch": 0.18507118122354752, + "flos": 648200489472.0, + "grad_norm": 0.0643111022382676, + "language_loss": 0.91187119, + "learning_rate": 0.0009382613583165467, + "loss": 0.92305064, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.31396484, + "step": 962, + "time_per_iteration": 2.7885348796844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116435, + "balance_loss_mlp": 1.08522642, + "epoch": 0.18526356290881107, + "flos": 626486330880.0, + "grad_norm": 0.08357757161984174, + "language_loss": 0.89136612, + "learning_rate": 0.0009381113090912928, + "loss": 0.90253055, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.31176758, + "step": 963, + "time_per_iteration": 2.7291858196258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109665, + "balance_loss_mlp": 1.07812214, + "epoch": 0.18545594459407463, + "flos": 432497843712.0, + "grad_norm": 0.08435952646587867, + "language_loss": 0.89444733, + "learning_rate": 0.000937961089776144, + "loss": 0.90554392, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.31518555, + "step": 964, + "time_per_iteration": 2.5736470222473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102378, + "balance_loss_mlp": 1.07090628, + "epoch": 0.1856483262793382, + "flos": 748720862208.0, + "grad_norm": 0.0989838613647617, + "language_loss": 0.82349026, + "learning_rate": 0.0009378107004294208, + "loss": 0.83451402, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.31445312, + "step": 965, + "time_per_iteration": 2.980569362640381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112627, + "balance_loss_mlp": 1.07982063, + "epoch": 0.18584070796460178, + "flos": 530058788352.0, + "grad_norm": 0.07592153009574268, + "language_loss": 0.91147316, + "learning_rate": 0.0009376601411095096, + "loss": 0.92259943, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.328125, + "step": 966, + "time_per_iteration": 2.6635591983795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136435, + "balance_loss_mlp": 1.10331881, + "epoch": 0.18603308964986534, + "flos": 482863527936.0, + "grad_norm": 0.16243248674453353, + "language_loss": 0.86357069, + "learning_rate": 0.0009375094118748622, + "loss": 0.87493503, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.33129883, + "step": 967, + "time_per_iteration": 2.522481679916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157231, + "balance_loss_mlp": 1.12368488, + "epoch": 0.1862254713351289, + "flos": 800976591360.0, + "grad_norm": 0.09362045292578998, + "language_loss": 0.90268016, + "learning_rate": 0.0009373585127839976, + "loss": 0.9142524, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.33544922, + "step": 968, + "time_per_iteration": 2.97210693359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152937, + "balance_loss_mlp": 1.1203692, + "epoch": 0.18641785302039246, + "flos": 478082456064.0, + "grad_norm": 0.0858654394488603, + "language_loss": 0.90605009, + "learning_rate": 0.0009372074438954994, + "loss": 0.91757941, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.32568359, + "step": 969, + "time_per_iteration": 2.541006088256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143226, + "balance_loss_mlp": 1.11030006, + "epoch": 0.18661023470565602, + "flos": 388695587328.0, + "grad_norm": 0.08996217866854661, + "language_loss": 0.91142356, + "learning_rate": 0.0009370562052680181, + "loss": 0.92285585, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.3293457, + "step": 970, + "time_per_iteration": 2.4985642433166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113885, + "balance_loss_mlp": 1.0805068, + "epoch": 0.18680261639091958, + "flos": 564402207744.0, + "grad_norm": 0.07707645065684006, + "language_loss": 0.88999593, + "learning_rate": 0.0009369047969602695, + "loss": 0.90113479, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.33398438, + "step": 971, + "time_per_iteration": 2.7079591751098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109357, + "balance_loss_mlp": 1.05985761, + "epoch": 0.18699499807618314, + "flos": 479018009088.0, + "grad_norm": 0.28998936625974164, + "language_loss": 0.86178541, + "learning_rate": 0.0009367532190310357, + "loss": 0.87272114, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.33740234, + "step": 972, + "time_per_iteration": 2.5647881031036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090596, + "balance_loss_mlp": 1.05535769, + "epoch": 0.1871873797614467, + "flos": 553022802432.0, + "grad_norm": 0.12045660132436305, + "language_loss": 0.89086068, + "learning_rate": 0.0009366014715391644, + "loss": 0.90176666, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.3527832, + "step": 973, + "time_per_iteration": 2.670271396636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098789, + "balance_loss_mlp": 1.06316936, + "epoch": 0.18737976144671029, + "flos": 552526617600.0, + "grad_norm": 0.06161121065256625, + "language_loss": 0.83607596, + "learning_rate": 0.0009364495545435693, + "loss": 0.84706378, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.35644531, + "step": 974, + "time_per_iteration": 2.7562968730926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115355, + "balance_loss_mlp": 1.08068919, + "epoch": 0.18757214313197385, + "flos": 502002372096.0, + "grad_norm": 0.0775906753320085, + "language_loss": 0.88572645, + "learning_rate": 0.0009362974681032297, + "loss": 0.89688003, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.34692383, + "step": 975, + "time_per_iteration": 2.618015766143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115715, + "balance_loss_mlp": 1.08102489, + "epoch": 0.1877645248172374, + "flos": 674691337728.0, + "grad_norm": 0.0743374582836454, + "language_loss": 0.87880743, + "learning_rate": 0.0009361452122771907, + "loss": 0.88996458, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.34716797, + "step": 976, + "time_per_iteration": 2.8973281383514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111997, + "balance_loss_mlp": 1.07754576, + "epoch": 0.18795690650250096, + "flos": 404771696640.0, + "grad_norm": 0.09294234225416288, + "language_loss": 0.83035111, + "learning_rate": 0.0009359927871245635, + "loss": 0.84147108, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.34472656, + "step": 977, + "time_per_iteration": 2.4868054389953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113361, + "balance_loss_mlp": 1.079983, + "epoch": 0.18814928818776452, + "flos": 637599485952.0, + "grad_norm": 0.08516170058225998, + "language_loss": 0.86584175, + "learning_rate": 0.0009358401927045246, + "loss": 0.87697542, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.33398438, + "step": 978, + "time_per_iteration": 2.8482747077941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110398, + "balance_loss_mlp": 1.07160234, + "epoch": 0.18834166987302808, + "flos": 1137825446400.0, + "grad_norm": 0.09204359799181126, + "language_loss": 0.88258326, + "learning_rate": 0.0009356874290763166, + "loss": 0.89362299, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.32373047, + "step": 979, + "time_per_iteration": 3.4733643531799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097908, + "balance_loss_mlp": 1.06529236, + "epoch": 0.18853405155829164, + "flos": 504538398720.0, + "grad_norm": 0.0915662715535259, + "language_loss": 0.88419032, + "learning_rate": 0.0009355344962992474, + "loss": 0.89516938, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.32617188, + "step": 980, + "time_per_iteration": 2.650907039642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098514, + "balance_loss_mlp": 1.06606519, + "epoch": 0.1887264332435552, + "flos": 607879987200.0, + "grad_norm": 0.13079327807375027, + "language_loss": 0.87520993, + "learning_rate": 0.0009353813944326908, + "loss": 0.88619506, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.32446289, + "step": 981, + "time_per_iteration": 2.937286138534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090956, + "balance_loss_mlp": 1.05845952, + "epoch": 0.1889188149288188, + "flos": 552264749568.0, + "grad_norm": 0.0755425770798311, + "language_loss": 0.82502437, + "learning_rate": 0.0009352281235360863, + "loss": 0.83593392, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.32495117, + "step": 982, + "time_per_iteration": 2.6979949474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096297, + "balance_loss_mlp": 1.06380093, + "epoch": 0.18911119661408235, + "flos": 418332128256.0, + "grad_norm": 0.0751009418062393, + "language_loss": 0.8470037, + "learning_rate": 0.0009350746836689389, + "loss": 0.85796672, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.32495117, + "step": 983, + "time_per_iteration": 2.538175582885742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131236, + "balance_loss_mlp": 1.10624993, + "epoch": 0.1893035782993459, + "flos": 1481141320704.0, + "grad_norm": 0.036870034223354546, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82570457, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.24902344, + "step": 984, + "time_per_iteration": 4.979044198989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097045, + "balance_loss_mlp": 1.0640955, + "epoch": 0.18949595998460947, + "flos": 508220974080.0, + "grad_norm": 0.0642225711410905, + "language_loss": 0.82250404, + "learning_rate": 0.0009347672972613634, + "loss": 0.83347452, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.3293457, + "step": 985, + "time_per_iteration": 2.593069553375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086897, + "balance_loss_mlp": 1.05339909, + "epoch": 0.18968834166987303, + "flos": 530812459008.0, + "grad_norm": 0.0802805585104316, + "language_loss": 0.85205728, + "learning_rate": 0.0009346133508402735, + "loss": 0.86292624, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.33520508, + "step": 986, + "time_per_iteration": 2.68485426902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095707, + "balance_loss_mlp": 1.06216192, + "epoch": 0.1898807233551366, + "flos": 499515807744.0, + "grad_norm": 0.09481546728284458, + "language_loss": 0.84014487, + "learning_rate": 0.0009344592356873166, + "loss": 0.85110188, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.33544922, + "step": 987, + "time_per_iteration": 2.6432511806488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105999, + "balance_loss_mlp": 1.07068968, + "epoch": 0.19007310504040015, + "flos": 601936399872.0, + "grad_norm": 0.06245857415063817, + "language_loss": 0.78166318, + "learning_rate": 0.0009343049518623255, + "loss": 0.79272318, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.35327148, + "step": 988, + "time_per_iteration": 2.7121620178222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120949, + "balance_loss_mlp": 1.085639, + "epoch": 0.1902654867256637, + "flos": 601374786048.0, + "grad_norm": 0.05952536728335112, + "language_loss": 0.83312774, + "learning_rate": 0.0009341504994251985, + "loss": 0.84433722, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.35327148, + "step": 989, + "time_per_iteration": 2.852208375930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107971, + "balance_loss_mlp": 1.05224383, + "epoch": 0.19045786841092727, + "flos": 1574925147648.0, + "grad_norm": 0.03692041129742979, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74600208, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.27539062, + "step": 990, + "time_per_iteration": 4.994582414627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137443, + "balance_loss_mlp": 1.09991539, + "epoch": 0.19065025009619085, + "flos": 681280906752.0, + "grad_norm": 0.056855766240422066, + "language_loss": 0.81516898, + "learning_rate": 0.0009338410889544574, + "loss": 0.82654339, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.37524414, + "step": 991, + "time_per_iteration": 3.017310380935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011468, + "balance_loss_mlp": 1.10831964, + "epoch": 0.1908426317814544, + "flos": 601971305472.0, + "grad_norm": 0.07195285392178245, + "language_loss": 0.87761319, + "learning_rate": 0.000933686131040967, + "loss": 0.88908118, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.38427734, + "step": 992, + "time_per_iteration": 2.8278160095214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144526, + "balance_loss_mlp": 1.10726154, + "epoch": 0.19103501346671797, + "flos": 586027616256.0, + "grad_norm": 0.07034922378143431, + "language_loss": 0.90235877, + "learning_rate": 0.0009335310047555883, + "loss": 0.91380405, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.37255859, + "step": 993, + "time_per_iteration": 2.8100597858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114172, + "balance_loss_mlp": 1.1050992, + "epoch": 0.19122739515198153, + "flos": 545494708224.0, + "grad_norm": 0.06860817272021875, + "language_loss": 0.88542485, + "learning_rate": 0.0009333757101585467, + "loss": 0.896842, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.36621094, + "step": 994, + "time_per_iteration": 2.6616106033325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131089, + "balance_loss_mlp": 1.0961132, + "epoch": 0.1914197768372451, + "flos": 521171739648.0, + "grad_norm": 0.0687364291234037, + "language_loss": 0.9324351, + "learning_rate": 0.0009332202473101329, + "loss": 0.94374597, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.3503418, + "step": 995, + "time_per_iteration": 2.7158730030059814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128597, + "balance_loss_mlp": 1.09400272, + "epoch": 0.19161215852250865, + "flos": 610961660928.0, + "grad_norm": 0.07471533178048465, + "language_loss": 0.82843316, + "learning_rate": 0.0009330646162707028, + "loss": 0.83971918, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.34619141, + "step": 996, + "time_per_iteration": 2.7293272018432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111164, + "balance_loss_mlp": 1.07730889, + "epoch": 0.1918045402077722, + "flos": 846281806848.0, + "grad_norm": 0.05994533952598048, + "language_loss": 0.84315574, + "learning_rate": 0.0009329088171006779, + "loss": 0.85426736, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.33886719, + "step": 997, + "time_per_iteration": 3.140655517578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110059, + "balance_loss_mlp": 1.07672858, + "epoch": 0.19199692189303577, + "flos": 465699096576.0, + "grad_norm": 0.06034276327327584, + "language_loss": 0.85438752, + "learning_rate": 0.0009327528498605446, + "loss": 0.86548805, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.33349609, + "step": 998, + "time_per_iteration": 2.5440673828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111119, + "balance_loss_mlp": 1.0778836, + "epoch": 0.19218930357829936, + "flos": 531318818304.0, + "grad_norm": 0.07596013514481052, + "language_loss": 0.89179873, + "learning_rate": 0.0009325967146108548, + "loss": 0.90290987, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.33251953, + "step": 999, + "time_per_iteration": 2.658561944961548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110413, + "balance_loss_mlp": 1.07856011, + "epoch": 0.19238168526356292, + "flos": 601350054912.0, + "grad_norm": 0.07750808981236326, + "language_loss": 0.8717553, + "learning_rate": 0.0009324404114122258, + "loss": 0.88285947, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.31835938, + "step": 1000, + "time_per_iteration": 2.7275264263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107767, + "balance_loss_mlp": 1.07667685, + "epoch": 0.19257406694882648, + "flos": 571690192896.0, + "grad_norm": 0.11937061799335263, + "language_loss": 0.86227536, + "learning_rate": 0.0009322839403253397, + "loss": 0.873353, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.31054688, + "step": 1001, + "time_per_iteration": 2.788405656814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110663, + "balance_loss_mlp": 1.0798831, + "epoch": 0.19276644863409004, + "flos": 801478568448.0, + "grad_norm": 0.07054171225662055, + "language_loss": 0.84055525, + "learning_rate": 0.0009321273014109439, + "loss": 0.85166192, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.30737305, + "step": 1002, + "time_per_iteration": 2.942535877227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110952, + "balance_loss_mlp": 1.0799818, + "epoch": 0.1929588303193536, + "flos": 563024314368.0, + "grad_norm": 0.057550289991663166, + "language_loss": 0.84200853, + "learning_rate": 0.0009319704947298513, + "loss": 0.85311806, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.30932617, + "step": 1003, + "time_per_iteration": 2.919499158859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110832, + "balance_loss_mlp": 1.07933664, + "epoch": 0.19315121200461716, + "flos": 626550349824.0, + "grad_norm": 0.07245253176429253, + "language_loss": 0.88662004, + "learning_rate": 0.0009318135203429393, + "loss": 0.89772838, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.31469727, + "step": 1004, + "time_per_iteration": 2.7168095111846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118727, + "balance_loss_mlp": 1.08770871, + "epoch": 0.19334359368988072, + "flos": 517169069568.0, + "grad_norm": 0.17670411464250102, + "language_loss": 0.8771624, + "learning_rate": 0.0009316563783111511, + "loss": 0.88834965, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.30981445, + "step": 1005, + "time_per_iteration": 2.7140395641326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116793, + "balance_loss_mlp": 1.08501196, + "epoch": 0.19353597537514428, + "flos": 693751606272.0, + "grad_norm": 0.08689807004334223, + "language_loss": 0.81857723, + "learning_rate": 0.0009314990686954943, + "loss": 0.82974517, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.31762695, + "step": 1006, + "time_per_iteration": 2.904555320739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106968, + "balance_loss_mlp": 1.07482958, + "epoch": 0.19372835706040784, + "flos": 1209665180160.0, + "grad_norm": 0.05703714693088015, + "language_loss": 0.80953801, + "learning_rate": 0.000931341591557042, + "loss": 0.82060766, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.32128906, + "step": 1007, + "time_per_iteration": 3.6937167644500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092779, + "balance_loss_mlp": 1.06078339, + "epoch": 0.19392073874567142, + "flos": 520368606720.0, + "grad_norm": 0.08309123344760973, + "language_loss": 0.87180555, + "learning_rate": 0.0009311839469569325, + "loss": 0.88273335, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.31982422, + "step": 1008, + "time_per_iteration": 2.6189959049224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099869, + "balance_loss_mlp": 1.06746829, + "epoch": 0.19411312043093498, + "flos": 588543293952.0, + "grad_norm": 0.10100018073420348, + "language_loss": 0.8730033, + "learning_rate": 0.0009310261349563687, + "loss": 0.88400197, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.32397461, + "step": 1009, + "time_per_iteration": 2.6890206336975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108302, + "balance_loss_mlp": 1.07516217, + "epoch": 0.19430550211619854, + "flos": 579085867008.0, + "grad_norm": 0.08933629042911205, + "language_loss": 0.85340321, + "learning_rate": 0.0009308681556166186, + "loss": 0.86448622, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.33154297, + "step": 1010, + "time_per_iteration": 2.824448585510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098531, + "balance_loss_mlp": 1.06546259, + "epoch": 0.1944978838014621, + "flos": 620848281600.0, + "grad_norm": 0.16096270434238172, + "language_loss": 0.87149101, + "learning_rate": 0.0009307100089990152, + "loss": 0.88247633, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.33081055, + "step": 1011, + "time_per_iteration": 2.74092173576355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105966, + "balance_loss_mlp": 1.07070398, + "epoch": 0.19469026548672566, + "flos": 598440089088.0, + "grad_norm": 0.08074644620093238, + "language_loss": 0.83646113, + "learning_rate": 0.0009305516951649568, + "loss": 0.84752083, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.3527832, + "step": 1012, + "time_per_iteration": 2.7069194316864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110286, + "balance_loss_mlp": 1.06726432, + "epoch": 0.19488264717198922, + "flos": 551890810368.0, + "grad_norm": 0.06954368088501534, + "language_loss": 0.86469871, + "learning_rate": 0.0009303932141759057, + "loss": 0.8757273, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.35595703, + "step": 1013, + "time_per_iteration": 2.7547597885131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109212, + "balance_loss_mlp": 1.07352042, + "epoch": 0.19507502885725278, + "flos": 665842166784.0, + "grad_norm": 0.08663105683367789, + "language_loss": 0.83731425, + "learning_rate": 0.0009302345660933902, + "loss": 0.84840637, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.35742188, + "step": 1014, + "time_per_iteration": 2.789421319961548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120247, + "balance_loss_mlp": 1.0850327, + "epoch": 0.19526741054251634, + "flos": 670771625472.0, + "grad_norm": 0.07248055996229082, + "language_loss": 0.85224003, + "learning_rate": 0.0009300757509790026, + "loss": 0.86344242, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.35229492, + "step": 1015, + "time_per_iteration": 2.8293235301971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138983, + "balance_loss_mlp": 1.10412574, + "epoch": 0.19545979222777993, + "flos": 446983653888.0, + "grad_norm": 0.08486300836715333, + "language_loss": 0.90133542, + "learning_rate": 0.0009299167688944005, + "loss": 0.91272521, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.34912109, + "step": 1016, + "time_per_iteration": 2.5042884349823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130604, + "balance_loss_mlp": 1.09453082, + "epoch": 0.1956521739130435, + "flos": 568813722624.0, + "grad_norm": 0.08182270058547457, + "language_loss": 0.86074531, + "learning_rate": 0.0009297576199013063, + "loss": 0.87205136, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.36108398, + "step": 1017, + "time_per_iteration": 2.678986072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01400492, + "balance_loss_mlp": 1.36921108, + "epoch": 0.19584455559830705, + "flos": 1454969157120.0, + "grad_norm": 0.11724614930420041, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74402618, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.3125, + "step": 1018, + "time_per_iteration": 4.915104627609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214569, + "balance_loss_mlp": 1.18538666, + "epoch": 0.1960369372835706, + "flos": 1590320369664.0, + "grad_norm": 0.08011150215373515, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.8064087, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.29101562, + "step": 1019, + "time_per_iteration": 5.440853834152222 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100562, + "balance_loss_mlp": 1.06565762, + "epoch": 0.19622931896883417, + "flos": 615709237248.0, + "grad_norm": 0.05949147024105531, + "language_loss": 0.86637676, + "learning_rate": 0.0009292791720892659, + "loss": 0.8773824, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.34960938, + "step": 1020, + "time_per_iteration": 2.8909873962402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110075, + "balance_loss_mlp": 1.06534433, + "epoch": 0.19642170065409773, + "flos": 465950790144.0, + "grad_norm": 0.08017401986968183, + "language_loss": 0.8851831, + "learning_rate": 0.0009291193560807218, + "loss": 0.89619064, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.35424805, + "step": 1021, + "time_per_iteration": 2.5876846313476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108676, + "balance_loss_mlp": 1.07329464, + "epoch": 0.19661408233936128, + "flos": 515040477696.0, + "grad_norm": 0.061421548763730266, + "language_loss": 0.86832839, + "learning_rate": 0.0009289593734732688, + "loss": 0.87941515, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.35400391, + "step": 1022, + "time_per_iteration": 2.5790083408355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116018, + "balance_loss_mlp": 1.08097017, + "epoch": 0.19680646402462484, + "flos": 392427624960.0, + "grad_norm": 0.06446420344630455, + "language_loss": 0.93862659, + "learning_rate": 0.0009287992243290175, + "loss": 0.94978678, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.35083008, + "step": 1023, + "time_per_iteration": 2.474393844604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126064, + "balance_loss_mlp": 1.09120703, + "epoch": 0.19699884570988843, + "flos": 626122566144.0, + "grad_norm": 0.06850198630338038, + "language_loss": 0.90312016, + "learning_rate": 0.0009286389087101435, + "loss": 0.91438079, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.34887695, + "step": 1024, + "time_per_iteration": 2.835756540298462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143856, + "balance_loss_mlp": 1.10885596, + "epoch": 0.197191227395152, + "flos": 557710742016.0, + "grad_norm": 0.06824019021489727, + "language_loss": 0.88388735, + "learning_rate": 0.0009284784266788864, + "loss": 0.8953259, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.3503418, + "step": 1025, + "time_per_iteration": 2.702479839324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144786, + "balance_loss_mlp": 1.11033428, + "epoch": 0.19738360908041555, + "flos": 664681061376.0, + "grad_norm": 0.08832519553576638, + "language_loss": 0.92221844, + "learning_rate": 0.0009283177782975512, + "loss": 0.93366635, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.34472656, + "step": 1026, + "time_per_iteration": 2.9851789474487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132889, + "balance_loss_mlp": 1.09850955, + "epoch": 0.1975759907656791, + "flos": 522244094976.0, + "grad_norm": 0.07134152927872167, + "language_loss": 0.87642545, + "learning_rate": 0.000928156963628507, + "loss": 0.88775432, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.34423828, + "step": 1027, + "time_per_iteration": 2.61114239692688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131535, + "balance_loss_mlp": 1.09686899, + "epoch": 0.19776837245094267, + "flos": 462233309184.0, + "grad_norm": 0.0723355054215018, + "language_loss": 0.88370252, + "learning_rate": 0.0009279959827341877, + "loss": 0.8950178, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.34692383, + "step": 1028, + "time_per_iteration": 2.7794618606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118297, + "balance_loss_mlp": 1.08248627, + "epoch": 0.19796075413620623, + "flos": 502809887232.0, + "grad_norm": 0.08314527790784168, + "language_loss": 0.87832725, + "learning_rate": 0.0009278348356770915, + "loss": 0.88951027, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.3581543, + "step": 1029, + "time_per_iteration": 2.5507349967956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111156, + "balance_loss_mlp": 1.07565451, + "epoch": 0.1981531358214698, + "flos": 507281038848.0, + "grad_norm": 0.08630189211983, + "language_loss": 0.85379845, + "learning_rate": 0.0009276735225197814, + "loss": 0.864914, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.359375, + "step": 1030, + "time_per_iteration": 2.597379207611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102081, + "balance_loss_mlp": 1.06650949, + "epoch": 0.19834551750673335, + "flos": 531275148288.0, + "grad_norm": 0.0907652175310469, + "language_loss": 0.85545719, + "learning_rate": 0.0009275120433248847, + "loss": 0.86647797, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.35571289, + "step": 1031, + "time_per_iteration": 2.687185287475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110602, + "balance_loss_mlp": 1.07545948, + "epoch": 0.1985378991919969, + "flos": 775147691520.0, + "grad_norm": 0.07461022440082729, + "language_loss": 0.85621846, + "learning_rate": 0.0009273503981550931, + "loss": 0.86732447, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.35205078, + "step": 1032, + "time_per_iteration": 3.0693371295928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101575, + "balance_loss_mlp": 1.06586027, + "epoch": 0.1987302808772605, + "flos": 434063411712.0, + "grad_norm": 0.15106160662845974, + "language_loss": 0.86904788, + "learning_rate": 0.0009271885870731626, + "loss": 0.88006359, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.35717773, + "step": 1033, + "time_per_iteration": 2.506413459777832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111392, + "balance_loss_mlp": 1.07536733, + "epoch": 0.19892266256252406, + "flos": 553342897152.0, + "grad_norm": 0.08761306204685197, + "language_loss": 0.88616383, + "learning_rate": 0.0009270266101419143, + "loss": 0.89727777, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.36035156, + "step": 1034, + "time_per_iteration": 2.6036899089813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098535, + "balance_loss_mlp": 1.06425047, + "epoch": 0.19911504424778761, + "flos": 549596302848.0, + "grad_norm": 0.06384965023316368, + "language_loss": 0.84987146, + "learning_rate": 0.0009268644674242328, + "loss": 0.86085683, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.34301758, + "step": 1035, + "time_per_iteration": 2.7015764713287354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113397, + "balance_loss_mlp": 1.07806361, + "epoch": 0.19930742593305117, + "flos": 518024636928.0, + "grad_norm": 0.07882877348480413, + "language_loss": 0.80515361, + "learning_rate": 0.0009267021589830678, + "loss": 0.81628758, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.35327148, + "step": 1036, + "time_per_iteration": 2.643951892852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01709033, + "balance_loss_mlp": 1.66611803, + "epoch": 0.19949980761831473, + "flos": 1508516849664.0, + "grad_norm": 0.11391778300632174, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.79336113, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.4296875, + "step": 1037, + "time_per_iteration": 4.949443101882935 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103323, + "balance_loss_mlp": 1.0683465, + "epoch": 0.1996921893035783, + "flos": 697803738624.0, + "grad_norm": 0.08774205983796875, + "language_loss": 0.92838657, + "learning_rate": 0.000926377045182406, + "loss": 0.93941981, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.35009766, + "step": 1038, + "time_per_iteration": 2.9512856006622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112905, + "balance_loss_mlp": 1.07821524, + "epoch": 0.19988457098884185, + "flos": 726682226688.0, + "grad_norm": 0.06255968137292814, + "language_loss": 0.87761998, + "learning_rate": 0.0009262142399491296, + "loss": 0.888749, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.34716797, + "step": 1039, + "time_per_iteration": 3.0552709102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112668, + "balance_loss_mlp": 1.09187126, + "epoch": 0.2000769526741054, + "flos": 560275881984.0, + "grad_norm": 0.06862779420362043, + "language_loss": 0.87532222, + "learning_rate": 0.0009260512692448105, + "loss": 0.88658899, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.34863281, + "step": 1040, + "time_per_iteration": 2.6962392330169678 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 86214480, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2342240041697280.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/training_args.bin b/sft_pretrain/Full_smoe_share/checkpoint-1040/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2c6286920da78be894d16b2c1ec77f899cd590e0 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b25bd416aaf59aaeb5c9268446dadaf85f4d00dfc3ac3dfec454141b47f814d1 +size 7992 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-1040/zero_to_fp32.py b/sft_pretrain/Full_smoe_share/checkpoint-1040/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-1040/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/added_tokens.json b/sft_pretrain/Full_smoe_share/checkpoint-2080/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/config.json b/sft_pretrain/Full_smoe_share/checkpoint-2080/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bb9b0c4407eef6bd7d8c22453f95c43fd6ef0981 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_share", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/generation_config.json b/sft_pretrain/Full_smoe_share/checkpoint-2080/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cf924738f959fec8bbafd99a7b43bee8dadaef6 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82c7e24e353fb4797548c2f38667552325d19ba20a12e64d9b03a8260d3a632d +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6f6d4496411b65bfaa9e1436d89af64b41236ad --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8bae89699b0d69ae3810dcc5c5bd3ad4192d44d39ffe8c47de18f71842a480d +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..3af8323e2b89144e3ee52f6961dde8cef8d545ff --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e3143bdfeca8c4e78b44f4385ad8417eb15cf7665c49671d472967eb0ef5310 +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a4b0b3abd81cf1f88946ff5d64b6caa6caba8c6 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f53c9ac846fe4e814120a3caa4a976b2f4cf700826e67414d3cff6f84b74bd77 +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..da6484c5a14088cbc1644f958880c678891a426b --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be544caad4f7d502ba56d2238246b839d184c91a92818fcfd4071d17abd38f26 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8240c57edba033c8d96a84e80a9e75330dfaa2a4 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ca9fa4b23f64c1a7a0cd31a9fb3be7ba760fbdf95afbe68ae87fa5d04b6ce1d +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfe2b67cfb0eba6d8b256dc452def88966aae00b --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d445413db3703f2e7f442094f09165e8e623f40059a8ba003a884d09a7b193d +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e5a8cb5d08a4ee00c27a3c42c0b46c2690a349a3 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/global_step2080/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b3797f5d7708193efa50ce2434c0d8658eabd07e9919b2304807ec6fb4dc989 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/latest b/sft_pretrain/Full_smoe_share/checkpoint-2080/latest new file mode 100644 index 0000000000000000000000000000000000000000..306b989cc55bbad3d1661dff0bcd6923a752cb0a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/latest @@ -0,0 +1 @@ +global_step2080 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_share/checkpoint-2080/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_share/checkpoint-2080/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6707641c16e283be38cf53b346d6226f13685845 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8321fa56670b18a6cab168f9f7e9c0f10a250188adc1d8128583ea8d60e47e8e +size 3759020544 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/model.safetensors.index.json b/sft_pretrain/Full_smoe_share/checkpoint-2080/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..90a56cf0153ed9062ff35ee2b372f7ab9e796c6a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731420128 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/rng_state_0.pth b/sft_pretrain/Full_smoe_share/checkpoint-2080/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/rng_state_1.pth b/sft_pretrain/Full_smoe_share/checkpoint-2080/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/rng_state_2.pth b/sft_pretrain/Full_smoe_share/checkpoint-2080/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/rng_state_3.pth b/sft_pretrain/Full_smoe_share/checkpoint-2080/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/special_tokens_map.json b/sft_pretrain/Full_smoe_share/checkpoint-2080/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/tokenizer.model b/sft_pretrain/Full_smoe_share/checkpoint-2080/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/tokenizer_config.json b/sft_pretrain/Full_smoe_share/checkpoint-2080/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/trainer_state.json b/sft_pretrain/Full_smoe_share/checkpoint-2080/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2e778605a424507abde751c9691a562b6e8f722e --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/trainer_state.json @@ -0,0 +1,31233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4001539053482108, + "eval_steps": 500, + "global_step": 2080, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03958175, + "balance_loss_mlp": 3.00755191, + "epoch": 0.00019238168526356292, + "flos": 470464353792.0, + "grad_norm": 28.914608756113072, + "language_loss": 3.87018156, + "learning_rate": 0.0, + "loss": 2.58113432, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 9.5, + "step": 1, + "time_per_iteration": 23.802019834518433 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01915335, + "balance_loss_mlp": 1.25005209, + "epoch": 0.00038476337052712584, + "flos": 504311436288.0, + "grad_norm": 4.8593923560988435, + "language_loss": 2.35405588, + "learning_rate": 0.00013726078121135892, + "loss": 2.37320924, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.65625, + "step": 2, + "time_per_iteration": 2.8532886505126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01920846, + "balance_loss_mlp": 1.25708926, + "epoch": 0.0005771450557906887, + "flos": 598869282816.0, + "grad_norm": 3.0028031994213777, + "language_loss": 1.96315837, + "learning_rate": 0.00021755319103969496, + "loss": 1.9823668, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.640625, + "step": 3, + "time_per_iteration": 2.841437339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01900548, + "balance_loss_mlp": 1.26196778, + "epoch": 0.0007695267410542517, + "flos": 580133491200.0, + "grad_norm": 1.731178632358193, + "language_loss": 1.51703906, + "learning_rate": 0.00027452156242271784, + "loss": 1.53604448, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.3828125, + "step": 4, + "time_per_iteration": 2.7456114292144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01846218, + "balance_loss_mlp": 1.25188851, + "epoch": 0.0009619084263178145, + "flos": 485857861632.0, + "grad_norm": 2.5417144067747603, + "language_loss": 1.52625787, + "learning_rate": 0.0003187096642208417, + "loss": 1.54472005, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 5.93359375, + "step": 5, + "time_per_iteration": 2.6199026107788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0183984, + "balance_loss_mlp": 1.27068734, + "epoch": 0.0011542901115813775, + "flos": 559744791552.0, + "grad_norm": 1.334824335042464, + "language_loss": 1.40782702, + "learning_rate": 0.0003548139722510539, + "loss": 1.42622542, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 5.69921875, + "step": 6, + "time_per_iteration": 2.747270107269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0199186, + "balance_loss_mlp": 1.44254375, + "epoch": 0.0013466717968449403, + "flos": 533721014784.0, + "grad_norm": 1.092177996343933, + "language_loss": 1.36706996, + "learning_rate": 0.00038533972973918044, + "loss": 1.38698864, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.5, + "step": 7, + "time_per_iteration": 2.6748878955841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02260733, + "balance_loss_mlp": 1.72209811, + "epoch": 0.0015390534821085034, + "flos": 492037175808.0, + "grad_norm": 0.8384078813871362, + "language_loss": 1.30779457, + "learning_rate": 0.0004117823436340768, + "loss": 1.3304019, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.390625, + "step": 8, + "time_per_iteration": 2.6130077838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02549259, + "balance_loss_mlp": 2.01024222, + "epoch": 0.0017314351673720662, + "flos": 564402207744.0, + "grad_norm": 0.9225645938984937, + "language_loss": 1.40127456, + "learning_rate": 0.00043510638207938993, + "loss": 1.42676711, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.39453125, + "step": 9, + "time_per_iteration": 2.8516194820404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02769124, + "balance_loss_mlp": 2.22057033, + "epoch": 0.001923816852635629, + "flos": 593132308992.0, + "grad_norm": 2.3673640139094667, + "language_loss": 1.25222194, + "learning_rate": 0.00045597044543220066, + "loss": 1.27991319, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.4921875, + "step": 10, + "time_per_iteration": 2.6775431632995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02889683, + "balance_loss_mlp": 2.31366348, + "epoch": 0.002116198537899192, + "flos": 609308752896.0, + "grad_norm": 3.9279002976271125, + "language_loss": 1.24874163, + "learning_rate": 0.00047484428652143135, + "loss": 1.27763844, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 5.765625, + "step": 11, + "time_per_iteration": 2.978304386138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0309849, + "balance_loss_mlp": 2.49538684, + "epoch": 0.002308580223162755, + "flos": 544869075456.0, + "grad_norm": 1.4997276509751025, + "language_loss": 1.30425894, + "learning_rate": 0.0004920747534624128, + "loss": 1.33524382, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 6.01953125, + "step": 12, + "time_per_iteration": 2.660757064819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0325611, + "balance_loss_mlp": 2.63698483, + "epoch": 0.002500961908426318, + "flos": 644458277376.0, + "grad_norm": 0.27573519674031227, + "language_loss": 1.29333067, + "learning_rate": 0.0005079252465375872, + "loss": 1.32589173, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 6.1875, + "step": 13, + "time_per_iteration": 2.905634880065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03517619, + "balance_loss_mlp": 2.87789392, + "epoch": 0.0026933435936898806, + "flos": 487605312000.0, + "grad_norm": 0.5949349515444387, + "language_loss": 1.16881835, + "learning_rate": 0.0005226005109505393, + "loss": 1.20399451, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 6.39453125, + "step": 14, + "time_per_iteration": 2.6116466522216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03647219, + "balance_loss_mlp": 2.99872088, + "epoch": 0.0028857252789534437, + "flos": 434368949760.0, + "grad_norm": 0.7718254129229014, + "language_loss": 1.22867727, + "learning_rate": 0.0005362628552605367, + "loss": 1.26514947, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 6.484375, + "step": 15, + "time_per_iteration": 2.80147123336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03485084, + "balance_loss_mlp": 2.81407928, + "epoch": 0.0030781069642170067, + "flos": 596465676288.0, + "grad_norm": 0.7401604798059911, + "language_loss": 1.27103257, + "learning_rate": 0.0005490431248454357, + "loss": 1.30588341, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 6.71484375, + "step": 16, + "time_per_iteration": 2.72638201713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03091961, + "balance_loss_mlp": 2.46329856, + "epoch": 0.0032704886494805694, + "flos": 1537360432128.0, + "grad_norm": 0.30683115050750837, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78797078, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 6.28125, + "step": 17, + "time_per_iteration": 6.094223260879517 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03189654, + "balance_loss_mlp": 2.50453377, + "epoch": 0.0034628703347441324, + "flos": 473720403456.0, + "grad_norm": 0.3045463524910074, + "language_loss": 1.13145232, + "learning_rate": 0.0005723671632907488, + "loss": 1.16334891, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 6.859375, + "step": 18, + "time_per_iteration": 2.6759910583496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03092663, + "balance_loss_mlp": 2.39648056, + "epoch": 0.0036552520200076955, + "flos": 448303320576.0, + "grad_norm": 0.23602477180386344, + "language_loss": 1.18155861, + "learning_rate": 0.0005830738490244919, + "loss": 1.21248519, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 6.97265625, + "step": 19, + "time_per_iteration": 2.505410671234131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03039888, + "balance_loss_mlp": 2.32653999, + "epoch": 0.003847633705271258, + "flos": 635881148928.0, + "grad_norm": 0.24009706761990102, + "language_loss": 1.19359791, + "learning_rate": 0.0005932312266435596, + "loss": 1.22399676, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 7.12890625, + "step": 20, + "time_per_iteration": 2.78657603263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03040938, + "balance_loss_mlp": 2.32339382, + "epoch": 0.004040015390534821, + "flos": 589222771200.0, + "grad_norm": 0.17079239690828452, + "language_loss": 1.14516783, + "learning_rate": 0.0006028929207788754, + "loss": 1.17557728, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 7.171875, + "step": 21, + "time_per_iteration": 2.7249202728271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03095818, + "balance_loss_mlp": 2.35843754, + "epoch": 0.004232397075798384, + "flos": 756253338624.0, + "grad_norm": 0.14242736472953105, + "language_loss": 1.17636526, + "learning_rate": 0.0006121050677327902, + "loss": 1.20732355, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 7.3671875, + "step": 22, + "time_per_iteration": 2.8816165924072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03158898, + "balance_loss_mlp": 2.41388798, + "epoch": 0.004424778761061947, + "flos": 526434439680.0, + "grad_norm": 0.2087285570273359, + "language_loss": 1.07450879, + "learning_rate": 0.0006209076479463684, + "loss": 1.10609782, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 7.44140625, + "step": 23, + "time_per_iteration": 2.6234865188598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03183939, + "balance_loss_mlp": 2.43282533, + "epoch": 0.00461716044632551, + "flos": 547907079168.0, + "grad_norm": 0.1648031444861348, + "language_loss": 1.17208815, + "learning_rate": 0.0006293355346737718, + "loss": 1.20392752, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 7.50390625, + "step": 24, + "time_per_iteration": 2.6747982501983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03230874, + "balance_loss_mlp": 2.47976065, + "epoch": 0.004809542131589073, + "flos": 567293234688.0, + "grad_norm": 0.19727819873357916, + "language_loss": 1.13454294, + "learning_rate": 0.0006374193284416834, + "loss": 1.16685176, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 7.5078125, + "step": 25, + "time_per_iteration": 2.778822898864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0329228, + "balance_loss_mlp": 2.5568068, + "epoch": 0.005001923816852636, + "flos": 470391418368.0, + "grad_norm": 0.1350276315355779, + "language_loss": 1.11706781, + "learning_rate": 0.0006451860277489461, + "loss": 1.14999056, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 7.34765625, + "step": 26, + "time_per_iteration": 2.595344305038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03271905, + "balance_loss_mlp": 2.55016398, + "epoch": 0.005194305502116198, + "flos": 415283950080.0, + "grad_norm": 0.16347516382600882, + "language_loss": 1.19968891, + "learning_rate": 0.0006526595731190848, + "loss": 1.23240781, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 7.21484375, + "step": 27, + "time_per_iteration": 2.4664127826690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03288089, + "balance_loss_mlp": 2.59610367, + "epoch": 0.005386687187379761, + "flos": 628466535936.0, + "grad_norm": 0.1428829159478278, + "language_loss": 1.13108253, + "learning_rate": 0.0006598612921618983, + "loss": 1.16396332, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 6.92578125, + "step": 28, + "time_per_iteration": 2.804295778274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03294075, + "balance_loss_mlp": 2.62612176, + "epoch": 0.005579068872643324, + "flos": 886100332032.0, + "grad_norm": 0.20851883498814452, + "language_loss": 1.0600431, + "learning_rate": 0.0006668102665011454, + "loss": 1.09298372, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 6.68359375, + "step": 29, + "time_per_iteration": 3.255702495574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03199031, + "balance_loss_mlp": 2.59096837, + "epoch": 0.005771450557906887, + "flos": 547287238656.0, + "grad_norm": 0.2979528071454863, + "language_loss": 1.15479767, + "learning_rate": 0.0006735236364718957, + "loss": 1.18678796, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 6.0703125, + "step": 30, + "time_per_iteration": 2.7074596881866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03173184, + "balance_loss_mlp": 2.61356831, + "epoch": 0.00596383224317045, + "flos": 531766950912.0, + "grad_norm": 0.19339065750569648, + "language_loss": 1.13838637, + "learning_rate": 0.0006800168558381346, + "loss": 1.17011821, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 5.60546875, + "step": 31, + "time_per_iteration": 2.6867663860321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03044372, + "balance_loss_mlp": 2.54197669, + "epoch": 0.0061562139284340135, + "flos": 588813926400.0, + "grad_norm": 0.19192711986346297, + "language_loss": 1.17224455, + "learning_rate": 0.0006863039060567947, + "loss": 1.20268822, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 5.01953125, + "step": 32, + "time_per_iteration": 2.7029900550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02954172, + "balance_loss_mlp": 2.48954153, + "epoch": 0.006348595613697576, + "flos": 617929551360.0, + "grad_norm": 0.18120318877382763, + "language_loss": 1.09236336, + "learning_rate": 0.0006923974775611263, + "loss": 1.12190521, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 4.640625, + "step": 33, + "time_per_iteration": 2.7966651916503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02822322, + "balance_loss_mlp": 2.40728283, + "epoch": 0.006540977298961139, + "flos": 777564444672.0, + "grad_norm": 0.145871801521796, + "language_loss": 1.05915022, + "learning_rate": 0.0006983091239737814, + "loss": 1.0873735, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 4.15625, + "step": 34, + "time_per_iteration": 2.9987330436706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02690136, + "balance_loss_mlp": 2.31496024, + "epoch": 0.006733358984224702, + "flos": 666837356544.0, + "grad_norm": 0.3134152992972928, + "language_loss": 1.04935622, + "learning_rate": 0.0007040493939600222, + "loss": 1.07625759, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 3.75, + "step": 35, + "time_per_iteration": 2.8552193641662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02568493, + "balance_loss_mlp": 2.22154617, + "epoch": 0.006925740669488265, + "flos": 564092287488.0, + "grad_norm": 0.17701612022333574, + "language_loss": 1.05792356, + "learning_rate": 0.0007096279445021078, + "loss": 1.08360851, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 3.47070312, + "step": 36, + "time_per_iteration": 2.7224435806274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02489254, + "balance_loss_mlp": 2.16557646, + "epoch": 0.007118122354751828, + "flos": 549583156224.0, + "grad_norm": 0.13856321956275922, + "language_loss": 1.12953377, + "learning_rate": 0.0007150536386503726, + "loss": 1.15442634, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 3.23632812, + "step": 37, + "time_per_iteration": 2.868824005126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02371099, + "balance_loss_mlp": 2.08385229, + "epoch": 0.007310504040015391, + "flos": 702161409024.0, + "grad_norm": 0.1045684718913455, + "language_loss": 1.04885924, + "learning_rate": 0.0007203346302358509, + "loss": 1.0725702, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 2.87304688, + "step": 38, + "time_per_iteration": 2.9964613914489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.022844, + "balance_loss_mlp": 2.01431966, + "epoch": 0.007502885725278953, + "flos": 599022051840.0, + "grad_norm": 0.11457879899925279, + "language_loss": 1.09371829, + "learning_rate": 0.000725478437577282, + "loss": 1.11656225, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 2.703125, + "step": 39, + "time_per_iteration": 2.7697911262512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02209938, + "balance_loss_mlp": 1.9577868, + "epoch": 0.007695267410542516, + "flos": 560000867328.0, + "grad_norm": 0.09741634912607965, + "language_loss": 1.05106318, + "learning_rate": 0.0007304920078549186, + "loss": 1.07316256, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 2.51953125, + "step": 40, + "time_per_iteration": 2.6858811378479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02127988, + "balance_loss_mlp": 1.89738917, + "epoch": 0.007887649095806078, + "flos": 507906671616.0, + "grad_norm": 0.1027173821952558, + "language_loss": 1.0668, + "learning_rate": 0.0007353817735343603, + "loss": 1.08807993, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 2.30273438, + "step": 41, + "time_per_iteration": 2.7466464042663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0203117, + "balance_loss_mlp": 1.82136178, + "epoch": 0.008080030781069641, + "flos": 503642133504.0, + "grad_norm": 0.13433083641106106, + "language_loss": 1.02085233, + "learning_rate": 0.0007401537019902344, + "loss": 1.04116416, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 2.10058594, + "step": 42, + "time_per_iteration": 2.6472368240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01955875, + "balance_loss_mlp": 1.77000403, + "epoch": 0.008272412466333205, + "flos": 517764178944.0, + "grad_norm": 0.1211736659455407, + "language_loss": 1.05737603, + "learning_rate": 0.0007448133392900729, + "loss": 1.07693481, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 1.85742188, + "step": 43, + "time_per_iteration": 2.716550588607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01955604, + "balance_loss_mlp": 1.78737581, + "epoch": 0.008464794151596768, + "flos": 607673373696.0, + "grad_norm": 0.16872872054008078, + "language_loss": 1.01187599, + "learning_rate": 0.0007493658489441491, + "loss": 1.03143215, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 1.68261719, + "step": 44, + "time_per_iteration": 2.875014066696167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01891991, + "balance_loss_mlp": 1.7426461, + "epoch": 0.00865717583686033, + "flos": 537661075968.0, + "grad_norm": 0.13908928982797317, + "language_loss": 1.04866791, + "learning_rate": 0.0007538160463002316, + "loss": 1.06758785, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 1.4921875, + "step": 45, + "time_per_iteration": 2.6579010486602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01770341, + "balance_loss_mlp": 1.64674437, + "epoch": 0.008849557522123894, + "flos": 507758284800.0, + "grad_norm": 0.10189568444589565, + "language_loss": 1.07831812, + "learning_rate": 0.0007581684291577274, + "loss": 1.09602141, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 1.234375, + "step": 46, + "time_per_iteration": 2.640967845916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01721967, + "balance_loss_mlp": 1.61086416, + "epoch": 0.009041939207387457, + "flos": 625048800768.0, + "grad_norm": 0.13316435244960997, + "language_loss": 1.10805786, + "learning_rate": 0.0007624272050891776, + "loss": 1.12527752, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 1.11230469, + "step": 47, + "time_per_iteration": 2.8335459232330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01578117, + "balance_loss_mlp": 1.4876132, + "epoch": 0.00923432089265102, + "flos": 549124849152.0, + "grad_norm": 0.11283146306838601, + "language_loss": 1.0112282, + "learning_rate": 0.0007665963158851307, + "loss": 1.02700949, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.90478516, + "step": 48, + "time_per_iteration": 2.8267853260040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01494271, + "balance_loss_mlp": 1.41659403, + "epoch": 0.009426702577914583, + "flos": 562202242560.0, + "grad_norm": 0.11438710989386189, + "language_loss": 1.09804726, + "learning_rate": 0.0007706794594783609, + "loss": 1.11299002, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.77587891, + "step": 49, + "time_per_iteration": 2.767359495162964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450716, + "balance_loss_mlp": 1.37876153, + "epoch": 0.009619084263178146, + "flos": 616486228992.0, + "grad_norm": 0.12814906604020712, + "language_loss": 1.08643568, + "learning_rate": 0.0007746801096530423, + "loss": 1.10094285, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.71972656, + "step": 50, + "time_per_iteration": 2.8213155269622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0143922, + "balance_loss_mlp": 1.37599134, + "epoch": 0.009811465948441709, + "flos": 541176325632.0, + "grad_norm": 0.19317362931311696, + "language_loss": 1.13336241, + "learning_rate": 0.0007786015338021173, + "loss": 1.14775467, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.63183594, + "step": 51, + "time_per_iteration": 2.670414924621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01421394, + "balance_loss_mlp": 1.36116982, + "epoch": 0.010003847633705272, + "flos": 535608087552.0, + "grad_norm": 0.10636608126159033, + "language_loss": 1.06046486, + "learning_rate": 0.0007824468089603051, + "loss": 1.0746789, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.60205078, + "step": 52, + "time_per_iteration": 2.650749683380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01398771, + "balance_loss_mlp": 1.34627175, + "epoch": 0.010196229318968833, + "flos": 908867907072.0, + "grad_norm": 0.08734537144859746, + "language_loss": 1.05057502, + "learning_rate": 0.0007862188363098669, + "loss": 1.0645628, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.52587891, + "step": 53, + "time_per_iteration": 3.1914114952087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339892, + "balance_loss_mlp": 1.29123116, + "epoch": 0.010388611004232396, + "flos": 585594040320.0, + "grad_norm": 0.12892942806844523, + "language_loss": 1.05977488, + "learning_rate": 0.0007899203543304438, + "loss": 1.07317376, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.48608398, + "step": 54, + "time_per_iteration": 2.7370150089263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129116, + "balance_loss_mlp": 1.24609876, + "epoch": 0.01058099268949596, + "flos": 502233716736.0, + "grad_norm": 0.10351520483586135, + "language_loss": 1.19524932, + "learning_rate": 0.0007935539507422731, + "loss": 1.20816088, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.45068359, + "step": 55, + "time_per_iteration": 2.5938258171081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241218, + "balance_loss_mlp": 1.19842196, + "epoch": 0.010773374374759523, + "flos": 544170659328.0, + "grad_norm": 0.14579553174668378, + "language_loss": 1.11398613, + "learning_rate": 0.0007971220733732573, + "loss": 1.12639832, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.42822266, + "step": 56, + "time_per_iteration": 2.69441556930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214647, + "balance_loss_mlp": 1.1754272, + "epoch": 0.010965756060023086, + "flos": 525874235904.0, + "grad_norm": 0.08690334212617827, + "language_loss": 1.05753016, + "learning_rate": 0.0008006270400641869, + "loss": 1.06967664, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.39208984, + "step": 57, + "time_per_iteration": 2.72200345993042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172174, + "balance_loss_mlp": 1.13638771, + "epoch": 0.011158137745286649, + "flos": 576653147136.0, + "grad_norm": 0.1589230608581115, + "language_loss": 1.07195449, + "learning_rate": 0.0008040710477125043, + "loss": 1.08367622, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.35791016, + "step": 58, + "time_per_iteration": 2.7268636226654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116856, + "balance_loss_mlp": 1.13193893, + "epoch": 0.011350519430550212, + "flos": 529024310784.0, + "grad_norm": 0.10215076611006164, + "language_loss": 1.07557666, + "learning_rate": 0.0008074561805429771, + "loss": 1.08726227, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.36645508, + "step": 59, + "time_per_iteration": 2.6336522102355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116508, + "balance_loss_mlp": 1.13067603, + "epoch": 0.011542901115813775, + "flos": 555608291328.0, + "grad_norm": 0.1141641229712409, + "language_loss": 1.06040812, + "learning_rate": 0.0008107844176832545, + "loss": 1.07205892, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.34399414, + "step": 60, + "time_per_iteration": 2.6922121047973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181661, + "balance_loss_mlp": 1.14883125, + "epoch": 0.011735282801077338, + "flos": 571826995200.0, + "grad_norm": 0.13546354224487772, + "language_loss": 1.07509732, + "learning_rate": 0.0008140576401132568, + "loss": 1.08691382, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.32836914, + "step": 61, + "time_per_iteration": 2.632707357406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183098, + "balance_loss_mlp": 1.15415382, + "epoch": 0.0119276644863409, + "flos": 615309156864.0, + "grad_norm": 0.21921646489667587, + "language_loss": 1.08552384, + "learning_rate": 0.0008172776370494935, + "loss": 1.09735489, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.28955078, + "step": 62, + "time_per_iteration": 2.736295700073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169478, + "balance_loss_mlp": 1.14103436, + "epoch": 0.012120046171604464, + "flos": 500835474432.0, + "grad_norm": 0.08851801033761798, + "language_loss": 1.15278125, + "learning_rate": 0.0008204461118185703, + "loss": 1.16447616, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.28417969, + "step": 63, + "time_per_iteration": 2.6189370155334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164028, + "balance_loss_mlp": 1.13801682, + "epoch": 0.012312427856868027, + "flos": 473109327360.0, + "grad_norm": 0.09949063345381139, + "language_loss": 1.0443747, + "learning_rate": 0.0008235646872681536, + "loss": 1.05601501, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.26025391, + "step": 64, + "time_per_iteration": 2.5901291370391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163729, + "balance_loss_mlp": 1.13857555, + "epoch": 0.012504809542131588, + "flos": 538094651904.0, + "grad_norm": 0.13431360680602436, + "language_loss": 1.04092753, + "learning_rate": 0.0008266349107584288, + "loss": 1.05256474, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.25146484, + "step": 65, + "time_per_iteration": 2.6860554218292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162931, + "balance_loss_mlp": 1.13891053, + "epoch": 0.012697191227395151, + "flos": 608450365440.0, + "grad_norm": 0.1102068865315058, + "language_loss": 1.07257366, + "learning_rate": 0.0008296582587724851, + "loss": 1.08420289, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.24023438, + "step": 66, + "time_per_iteration": 2.7269198894500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160507, + "balance_loss_mlp": 1.1370945, + "epoch": 0.012889572912658714, + "flos": 767750607360.0, + "grad_norm": 0.08100484164865049, + "language_loss": 1.05156851, + "learning_rate": 0.0008326361411800136, + "loss": 1.06317365, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.23400879, + "step": 67, + "time_per_iteration": 2.984511613845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164469, + "balance_loss_mlp": 1.14209354, + "epoch": 0.013081954597922277, + "flos": 533604561408.0, + "grad_norm": 0.7331609098323609, + "language_loss": 1.05716372, + "learning_rate": 0.0008355699051851403, + "loss": 1.06880832, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.22363281, + "step": 68, + "time_per_iteration": 2.7606749534606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236513, + "balance_loss_mlp": 1.21256447, + "epoch": 0.01327433628318584, + "flos": 572826567168.0, + "grad_norm": 0.09768789722348739, + "language_loss": 1.12206995, + "learning_rate": 0.0008384608389860635, + "loss": 1.13443518, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.23950195, + "step": 69, + "time_per_iteration": 2.687361001968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308666, + "balance_loss_mlp": 1.28513408, + "epoch": 0.013466717968449404, + "flos": 497029243392.0, + "grad_norm": 0.20600635395561306, + "language_loss": 1.02831006, + "learning_rate": 0.000841310175171381, + "loss": 1.04139686, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.23510742, + "step": 70, + "time_per_iteration": 2.5935816764831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01326501, + "balance_loss_mlp": 1.30259991, + "epoch": 0.013659099653712967, + "flos": 565234454016.0, + "grad_norm": 0.21749814226597305, + "language_loss": 1.00826097, + "learning_rate": 0.000844119093875517, + "loss": 1.0215261, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.2388916, + "step": 71, + "time_per_iteration": 2.706749439239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327473, + "balance_loss_mlp": 1.30280876, + "epoch": 0.01385148133897653, + "flos": 573540950016.0, + "grad_norm": 0.15663283615990556, + "language_loss": 1.06174731, + "learning_rate": 0.0008468887257134666, + "loss": 1.0750221, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.24682617, + "step": 72, + "time_per_iteration": 2.6893503665924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01307936, + "balance_loss_mlp": 1.28290248, + "epoch": 0.014043863024240093, + "flos": 576539665920.0, + "grad_norm": 0.165113983041647, + "language_loss": 1.08480573, + "learning_rate": 0.0008496201545131264, + "loss": 1.09788513, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.25012207, + "step": 73, + "time_per_iteration": 2.722555637359619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228575, + "balance_loss_mlp": 1.20456624, + "epoch": 0.014236244709503656, + "flos": 938287660032.0, + "grad_norm": 0.08819174949442792, + "language_loss": 1.05711758, + "learning_rate": 0.0008523144198617317, + "loss": 1.06940317, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.2401123, + "step": 74, + "time_per_iteration": 3.1970512866973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197377, + "balance_loss_mlp": 1.17341638, + "epoch": 0.014428626394767219, + "flos": 528231352320.0, + "grad_norm": 0.4509181854760719, + "language_loss": 1.05384588, + "learning_rate": 0.0008549725194813783, + "loss": 1.06581974, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.23962402, + "step": 75, + "time_per_iteration": 2.6595916748046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153799, + "balance_loss_mlp": 1.13126826, + "epoch": 0.014621008080030782, + "flos": 803371433472.0, + "grad_norm": 0.13717241934186405, + "language_loss": 1.0561651, + "learning_rate": 0.0008575954114472099, + "loss": 1.06770301, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.2253418, + "step": 76, + "time_per_iteration": 3.126678943634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146131, + "balance_loss_mlp": 1.12299228, + "epoch": 0.014813389765294343, + "flos": 696588788736.0, + "grad_norm": 0.24880809118993477, + "language_loss": 1.04725742, + "learning_rate": 0.0008601840162606118, + "loss": 1.05871868, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.23132324, + "step": 77, + "time_per_iteration": 3.0479044914245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125538, + "balance_loss_mlp": 1.10244715, + "epoch": 0.015005771450557906, + "flos": 596702813184.0, + "grad_norm": 0.18599993070264256, + "language_loss": 1.10793126, + "learning_rate": 0.000862739218788641, + "loss": 1.11918664, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.23083496, + "step": 78, + "time_per_iteration": 2.8093104362487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206073, + "balance_loss_mlp": 1.18093228, + "epoch": 0.01519815313582147, + "flos": 549148170240.0, + "grad_norm": 0.1007392116308827, + "language_loss": 1.07089067, + "learning_rate": 0.0008652618700799138, + "loss": 1.08295143, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.25146484, + "step": 79, + "time_per_iteration": 2.657278060913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312607, + "balance_loss_mlp": 1.28511751, + "epoch": 0.015390534821085032, + "flos": 430306642944.0, + "grad_norm": 0.10464806869950885, + "language_loss": 1.06340718, + "learning_rate": 0.0008677527890662774, + "loss": 1.07653332, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.27514648, + "step": 80, + "time_per_iteration": 2.541733741760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01403725, + "balance_loss_mlp": 1.37456632, + "epoch": 0.015582916506348595, + "flos": 523854743040.0, + "grad_norm": 0.15378710965831335, + "language_loss": 1.0758636, + "learning_rate": 0.0008702127641587799, + "loss": 1.08990085, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.29125977, + "step": 81, + "time_per_iteration": 2.6628620624542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01387899, + "balance_loss_mlp": 1.36045754, + "epoch": 0.015775298191612157, + "flos": 575151598080.0, + "grad_norm": 0.16587297874586884, + "language_loss": 1.02605438, + "learning_rate": 0.0008726425547457192, + "loss": 1.03993344, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.27490234, + "step": 82, + "time_per_iteration": 2.774146795272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365036, + "balance_loss_mlp": 1.34021688, + "epoch": 0.01596767987687572, + "flos": 610040664576.0, + "grad_norm": 0.16158882984955267, + "language_loss": 1.02648211, + "learning_rate": 0.0008750428925998964, + "loss": 1.04013252, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.24829102, + "step": 83, + "time_per_iteration": 2.745786190032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01321379, + "balance_loss_mlp": 1.29746556, + "epoch": 0.016160061562139283, + "flos": 566864040960.0, + "grad_norm": 0.12210664974135504, + "language_loss": 1.08113122, + "learning_rate": 0.0008774144832015932, + "loss": 1.09434509, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.23937988, + "step": 84, + "time_per_iteration": 2.695239543914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01701738, + "balance_loss_mlp": 1.6791358, + "epoch": 0.016352443247402846, + "flos": 1410557234688.0, + "grad_norm": 0.2213803749296612, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76476049, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.22558594, + "step": 85, + "time_per_iteration": 4.597177982330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228049, + "balance_loss_mlp": 1.20597172, + "epoch": 0.01654482493266641, + "flos": 730177127424.0, + "grad_norm": 0.08119704963525505, + "language_loss": 1.03748381, + "learning_rate": 0.0008820741205014318, + "loss": 1.04976428, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.22070312, + "step": 86, + "time_per_iteration": 2.881804943084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193732, + "balance_loss_mlp": 1.17282319, + "epoch": 0.016737206617929972, + "flos": 536016932352.0, + "grad_norm": 0.06752942516789381, + "language_loss": 1.04735541, + "learning_rate": 0.0008843634575408404, + "loss": 1.05929279, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.20922852, + "step": 87, + "time_per_iteration": 2.681497812271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197059, + "balance_loss_mlp": 1.17523217, + "epoch": 0.016929588303193535, + "flos": 536706584064.0, + "grad_norm": 0.068849585693396, + "language_loss": 1.06270838, + "learning_rate": 0.0008866266301555082, + "loss": 1.0746789, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.21826172, + "step": 88, + "time_per_iteration": 2.7393336296081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188909, + "balance_loss_mlp": 1.16813099, + "epoch": 0.017121969988457098, + "flos": 526498458624.0, + "grad_norm": 0.11163273932728453, + "language_loss": 1.06937528, + "learning_rate": 0.0008888642296509615, + "loss": 1.08126438, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.20776367, + "step": 89, + "time_per_iteration": 2.5859603881835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190142, + "balance_loss_mlp": 1.16919696, + "epoch": 0.01731435167372066, + "flos": 625304876544.0, + "grad_norm": 0.08151329596812326, + "language_loss": 1.11272717, + "learning_rate": 0.0008910768275115906, + "loss": 1.12462866, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.20947266, + "step": 90, + "time_per_iteration": 2.7672746181488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188461, + "balance_loss_mlp": 1.16750431, + "epoch": 0.017506733358984224, + "flos": 496157709312.0, + "grad_norm": 0.10059554630111206, + "language_loss": 1.06862557, + "learning_rate": 0.0008932649762767675, + "loss": 1.08051026, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.20947266, + "step": 91, + "time_per_iteration": 2.5685906410217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164871, + "balance_loss_mlp": 1.14348471, + "epoch": 0.017699115044247787, + "flos": 745613047296.0, + "grad_norm": 0.10996439779682221, + "language_loss": 1.10012543, + "learning_rate": 0.0008954292103690864, + "loss": 1.11177421, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.21398926, + "step": 92, + "time_per_iteration": 2.974438428878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164995, + "balance_loss_mlp": 1.14351392, + "epoch": 0.01789149672951135, + "flos": 515257265664.0, + "grad_norm": 0.07660536936337886, + "language_loss": 1.12072349, + "learning_rate": 0.0008975700468778296, + "loss": 1.13237333, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.21496582, + "step": 93, + "time_per_iteration": 2.5806186199188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162029, + "balance_loss_mlp": 1.14116728, + "epoch": 0.018083878414774913, + "flos": 585850116096.0, + "grad_norm": 0.0766138268717318, + "language_loss": 1.04864383, + "learning_rate": 0.0008996879863005366, + "loss": 1.06026423, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.20874023, + "step": 94, + "time_per_iteration": 2.6688339710235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153951, + "balance_loss_mlp": 1.13311303, + "epoch": 0.018276260100038477, + "flos": 497103436800.0, + "grad_norm": 0.05852633811132637, + "language_loss": 1.05006421, + "learning_rate": 0.0009017835132453337, + "loss": 1.06160367, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.20849609, + "step": 95, + "time_per_iteration": 2.5905888080596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168717, + "balance_loss_mlp": 1.14677107, + "epoch": 0.01846864178530204, + "flos": 639765955584.0, + "grad_norm": 0.10434292302548942, + "language_loss": 1.05011988, + "learning_rate": 0.0009038570970964896, + "loss": 1.06180692, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.21960449, + "step": 96, + "time_per_iteration": 2.819176197052002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143672, + "balance_loss_mlp": 1.12233388, + "epoch": 0.018661023470565603, + "flos": 511411746816.0, + "grad_norm": 0.06578690538752763, + "language_loss": 1.02219808, + "learning_rate": 0.0009059091926454854, + "loss": 1.0336349, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.21362305, + "step": 97, + "time_per_iteration": 2.6332285404205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128399, + "balance_loss_mlp": 1.10685802, + "epoch": 0.018853405155829166, + "flos": 930710103552.0, + "grad_norm": 0.06319745463615938, + "language_loss": 1.01510525, + "learning_rate": 0.0009079402406897198, + "loss": 1.02638912, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.2154541, + "step": 98, + "time_per_iteration": 3.231128454208374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115682, + "balance_loss_mlp": 1.09365261, + "epoch": 0.01904578684109273, + "flos": 576209396736.0, + "grad_norm": 0.08014689887623593, + "language_loss": 1.0309999, + "learning_rate": 0.0009099506686008212, + "loss": 1.0421567, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.22045898, + "step": 99, + "time_per_iteration": 2.7899162769317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108676, + "balance_loss_mlp": 1.08750439, + "epoch": 0.019238168526356292, + "flos": 558173431296.0, + "grad_norm": 0.07479046847477189, + "language_loss": 1.06245041, + "learning_rate": 0.0009119408908644013, + "loss": 1.07353711, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.21179199, + "step": 100, + "time_per_iteration": 2.76654314994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111244, + "balance_loss_mlp": 1.09094632, + "epoch": 0.019430550211619855, + "flos": 723539506176.0, + "grad_norm": 0.1293510891653682, + "language_loss": 1.11089611, + "learning_rate": 0.0009139113095929519, + "loss": 1.12202048, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.21496582, + "step": 101, + "time_per_iteration": 2.9448165893554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113187, + "balance_loss_mlp": 1.09134769, + "epoch": 0.019622931896883418, + "flos": 499235000832.0, + "grad_norm": 0.0662757157914564, + "language_loss": 1.05513644, + "learning_rate": 0.0009158623150134762, + "loss": 1.06626844, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.21838379, + "step": 102, + "time_per_iteration": 2.561089277267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132871, + "balance_loss_mlp": 1.11103153, + "epoch": 0.01981531358214698, + "flos": 508916418048.0, + "grad_norm": 0.12924626158025887, + "language_loss": 1.05462444, + "learning_rate": 0.000917794285931332, + "loss": 1.06595314, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.21850586, + "step": 103, + "time_per_iteration": 2.6556921005249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150426, + "balance_loss_mlp": 1.12918282, + "epoch": 0.020007695267410544, + "flos": 521087371776.0, + "grad_norm": 0.12259017558591545, + "language_loss": 0.9774698, + "learning_rate": 0.0009197075901716639, + "loss": 0.98897398, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.21264648, + "step": 104, + "time_per_iteration": 2.721444845199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141997, + "balance_loss_mlp": 1.12036085, + "epoch": 0.020200076952674107, + "flos": 533013834240.0, + "grad_norm": 0.06848283791602199, + "language_loss": 1.07568073, + "learning_rate": 0.0009216025849997171, + "loss": 1.08710074, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.21655273, + "step": 105, + "time_per_iteration": 2.785515785217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138939, + "balance_loss_mlp": 1.11743319, + "epoch": 0.020392458637937667, + "flos": 684430981632.0, + "grad_norm": 0.05548353541402364, + "language_loss": 1.02272427, + "learning_rate": 0.0009234796175212258, + "loss": 1.03411365, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.21520996, + "step": 106, + "time_per_iteration": 2.917363166809082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131099, + "balance_loss_mlp": 1.10940301, + "epoch": 0.02058484032320123, + "flos": 701791852032.0, + "grad_norm": 0.08012311925806644, + "language_loss": 1.06108189, + "learning_rate": 0.000925339025064007, + "loss": 1.07239294, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.21691895, + "step": 107, + "time_per_iteration": 2.9934780597686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137249, + "balance_loss_mlp": 1.11515951, + "epoch": 0.020777222008464793, + "flos": 638772175872.0, + "grad_norm": 0.050481524705402105, + "language_loss": 0.98984301, + "learning_rate": 0.0009271811355418027, + "loss": 1.00121546, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.2208252, + "step": 108, + "time_per_iteration": 2.8833272457122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119308, + "balance_loss_mlp": 1.09725404, + "epoch": 0.020969603693728356, + "flos": 681785856000.0, + "grad_norm": 0.04498034405706927, + "language_loss": 1.05478954, + "learning_rate": 0.0009290062678013548, + "loss": 1.06598258, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.22058105, + "step": 109, + "time_per_iteration": 2.839287042617798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126832, + "balance_loss_mlp": 1.1043849, + "epoch": 0.02116198537899192, + "flos": 533140462080.0, + "grad_norm": 0.08965534617549129, + "language_loss": 1.03900754, + "learning_rate": 0.0009308147319536321, + "loss": 1.0502758, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.2244873, + "step": 110, + "time_per_iteration": 2.664785385131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127322, + "balance_loss_mlp": 1.10527992, + "epoch": 0.021354367064255482, + "flos": 717168135168.0, + "grad_norm": 0.07991094573250712, + "language_loss": 1.10446882, + "learning_rate": 0.0009326068296900676, + "loss": 1.11574197, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.22045898, + "step": 111, + "time_per_iteration": 2.826704502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118777, + "balance_loss_mlp": 1.09644949, + "epoch": 0.021546748749519045, + "flos": 519290459136.0, + "grad_norm": 0.05764113319631223, + "language_loss": 1.01306438, + "learning_rate": 0.0009343828545846161, + "loss": 1.02425218, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.2232666, + "step": 112, + "time_per_iteration": 2.774557113647461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130616, + "balance_loss_mlp": 1.10844338, + "epoch": 0.021739130434782608, + "flos": 504912337920.0, + "grad_norm": 0.11711254624088742, + "language_loss": 1.04517794, + "learning_rate": 0.0009361430923823841, + "loss": 1.0564841, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.22192383, + "step": 113, + "time_per_iteration": 2.5728189945220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143235, + "balance_loss_mlp": 1.12140775, + "epoch": 0.02193151212004617, + "flos": 463251820032.0, + "grad_norm": 0.09177669908726471, + "language_loss": 1.08950138, + "learning_rate": 0.0009378878212755459, + "loss": 1.10093367, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.21826172, + "step": 114, + "time_per_iteration": 2.4959068298339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118923, + "balance_loss_mlp": 1.09746575, + "epoch": 0.022123893805309734, + "flos": 552008673792.0, + "grad_norm": 0.05600308486582556, + "language_loss": 0.98889154, + "learning_rate": 0.0009396173121672103, + "loss": 1.00008082, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.21472168, + "step": 115, + "time_per_iteration": 2.687619924545288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131221, + "balance_loss_mlp": 1.11031187, + "epoch": 0.022316275490573297, + "flos": 635920436736.0, + "grad_norm": 0.06813536890625224, + "language_loss": 1.0438683, + "learning_rate": 0.0009413318289238633, + "loss": 1.05518055, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.20922852, + "step": 116, + "time_per_iteration": 2.7658987045288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115598, + "balance_loss_mlp": 1.09498656, + "epoch": 0.02250865717583686, + "flos": 798535107072.0, + "grad_norm": 0.10996119273554948, + "language_loss": 0.97187698, + "learning_rate": 0.0009430316286169771, + "loss": 0.98303294, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.20617676, + "step": 117, + "time_per_iteration": 3.027139186859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120875, + "balance_loss_mlp": 1.10050249, + "epoch": 0.022701038861100423, + "flos": 455851763712.0, + "grad_norm": 0.06369887166042827, + "language_loss": 1.02379179, + "learning_rate": 0.0009447169617543361, + "loss": 1.03500056, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.20373535, + "step": 118, + "time_per_iteration": 2.619460344314575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114928, + "balance_loss_mlp": 1.09472179, + "epoch": 0.022893420546363986, + "flos": 582812112384.0, + "grad_norm": 0.07832492020107534, + "language_loss": 1.08849907, + "learning_rate": 0.0009463880725016029, + "loss": 1.09964836, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.20214844, + "step": 119, + "time_per_iteration": 2.689627170562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108375, + "balance_loss_mlp": 1.08852673, + "epoch": 0.02308580223162755, + "flos": 561010613760.0, + "grad_norm": 0.05815728344132157, + "language_loss": 1.03645778, + "learning_rate": 0.0009480451988946134, + "loss": 1.0475415, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.19848633, + "step": 120, + "time_per_iteration": 2.8202247619628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111521, + "balance_loss_mlp": 1.09197092, + "epoch": 0.023278183916891113, + "flos": 770966111232.0, + "grad_norm": 0.09156908943756899, + "language_loss": 1.05033565, + "learning_rate": 0.0009496885730428627, + "loss": 1.06145096, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.1953125, + "step": 121, + "time_per_iteration": 3.060826539993286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111873, + "balance_loss_mlp": 1.09195304, + "epoch": 0.023470565602154676, + "flos": 553111552512.0, + "grad_norm": 0.07227042142752892, + "language_loss": 1.03125668, + "learning_rate": 0.0009513184213246156, + "loss": 1.04237533, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.19909668, + "step": 122, + "time_per_iteration": 2.693777322769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116574, + "balance_loss_mlp": 1.09648705, + "epoch": 0.02366294728741824, + "flos": 559744791552.0, + "grad_norm": 0.10676768106860933, + "language_loss": 1.06918037, + "learning_rate": 0.0009529349645740552, + "loss": 1.08034611, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.20080566, + "step": 123, + "time_per_iteration": 2.7788801193237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108243, + "balance_loss_mlp": 1.0888958, + "epoch": 0.0238553289726818, + "flos": 468313698816.0, + "grad_norm": 0.06448608913203197, + "language_loss": 1.05440235, + "learning_rate": 0.0009545384182608524, + "loss": 1.06548476, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.19335938, + "step": 124, + "time_per_iteration": 2.542592763900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125435, + "balance_loss_mlp": 1.10582459, + "epoch": 0.024047710657945365, + "flos": 559763730432.0, + "grad_norm": 0.07866021425619718, + "language_loss": 1.03027701, + "learning_rate": 0.0009561289926625252, + "loss": 1.04153132, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.19604492, + "step": 125, + "time_per_iteration": 2.790811538696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114447, + "balance_loss_mlp": 1.09582675, + "epoch": 0.024240092343208928, + "flos": 504528224256.0, + "grad_norm": 0.05023162105608455, + "language_loss": 1.0775013, + "learning_rate": 0.0009577068930299292, + "loss": 1.08864582, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.1862793, + "step": 126, + "time_per_iteration": 2.634744882583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131099, + "balance_loss_mlp": 1.11309838, + "epoch": 0.02443247402847249, + "flos": 435516908544.0, + "grad_norm": 0.11313548721486262, + "language_loss": 1.02903807, + "learning_rate": 0.0009592723197462087, + "loss": 1.04034901, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.18017578, + "step": 127, + "time_per_iteration": 2.673091411590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135261, + "balance_loss_mlp": 1.11693859, + "epoch": 0.024624855713736054, + "flos": 683445966336.0, + "grad_norm": 0.09449576280815732, + "language_loss": 0.99720573, + "learning_rate": 0.0009608254684795125, + "loss": 1.00855827, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.18334961, + "step": 128, + "time_per_iteration": 2.9315080642700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125442, + "balance_loss_mlp": 1.10695267, + "epoch": 0.024817237398999614, + "flos": 524721894912.0, + "grad_norm": 0.06510984253988934, + "language_loss": 1.02999425, + "learning_rate": 0.0009623665303297678, + "loss": 1.04124868, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.18493652, + "step": 129, + "time_per_iteration": 2.7419071197509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109401, + "balance_loss_mlp": 1.09171033, + "epoch": 0.025009619084263177, + "flos": 655350262272.0, + "grad_norm": 0.11817944884573778, + "language_loss": 1.06827164, + "learning_rate": 0.0009638956919697878, + "loss": 1.07936561, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.17712402, + "step": 130, + "time_per_iteration": 2.898789405822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109182, + "balance_loss_mlp": 1.09152734, + "epoch": 0.02520200076952674, + "flos": 454187271168.0, + "grad_norm": 0.08339763042198223, + "language_loss": 0.98782563, + "learning_rate": 0.0009654131357809714, + "loss": 0.99891746, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.17663574, + "step": 131, + "time_per_iteration": 2.5997226238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110117, + "balance_loss_mlp": 1.09165168, + "epoch": 0.025394382454790303, + "flos": 839427397632.0, + "grad_norm": 0.07600036723868295, + "language_loss": 1.07807457, + "learning_rate": 0.0009669190399838441, + "loss": 1.08917582, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.18469238, + "step": 132, + "time_per_iteration": 3.099355459213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123354, + "balance_loss_mlp": 1.10540128, + "epoch": 0.025586764140053866, + "flos": 580725628416.0, + "grad_norm": 0.1018451896089413, + "language_loss": 1.01215065, + "learning_rate": 0.0009684135787636724, + "loss": 1.02338421, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.17956543, + "step": 133, + "time_per_iteration": 2.8484303951263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110859, + "balance_loss_mlp": 1.09306097, + "epoch": 0.02577914582531743, + "flos": 789893959680.0, + "grad_norm": 0.0768854449505878, + "language_loss": 1.05274129, + "learning_rate": 0.0009698969223913726, + "loss": 1.06384993, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.17822266, + "step": 134, + "time_per_iteration": 3.0583713054656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099954, + "balance_loss_mlp": 1.08200145, + "epoch": 0.025971527510580992, + "flos": 594683320320.0, + "grad_norm": 0.06563028697143787, + "language_loss": 1.07862437, + "learning_rate": 0.0009713692373399265, + "loss": 1.08962393, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.1796875, + "step": 135, + "time_per_iteration": 2.6854658126831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01638015, + "balance_loss_mlp": 1.62485397, + "epoch": 0.026163909195844555, + "flos": 1576771522560.0, + "grad_norm": 0.19726256755033653, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81094241, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.13183594, + "step": 136, + "time_per_iteration": 5.296766042709351 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01420299, + "balance_loss_mlp": 1.40761507, + "epoch": 0.026356290881108118, + "flos": 1501306030080.0, + "grad_norm": 0.11305854818728235, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.7923134, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.12695312, + "step": 137, + "time_per_iteration": 4.982319355010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113827, + "balance_loss_mlp": 1.12156892, + "epoch": 0.02654867256637168, + "flos": 596841025536.0, + "grad_norm": 0.17869099152539902, + "language_loss": 1.01327038, + "learning_rate": 0.0009757216201974225, + "loss": 1.02465308, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.16699219, + "step": 138, + "time_per_iteration": 2.8622727394104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186505, + "balance_loss_mlp": 1.16889763, + "epoch": 0.026741054251635244, + "flos": 544761386496.0, + "grad_norm": 0.08591345057859309, + "language_loss": 1.05914044, + "learning_rate": 0.0009771514130396581, + "loss": 1.07100558, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.17614746, + "step": 139, + "time_per_iteration": 2.67812442779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120454, + "balance_loss_mlp": 1.18700433, + "epoch": 0.026933435936898807, + "flos": 506591387136.0, + "grad_norm": 0.10724594122721719, + "language_loss": 1.05634308, + "learning_rate": 0.00097857095638274, + "loss": 1.06838858, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.17541504, + "step": 140, + "time_per_iteration": 2.597321033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120509, + "balance_loss_mlp": 1.1880548, + "epoch": 0.02712581762216237, + "flos": 740513290752.0, + "grad_norm": 0.08882077115516282, + "language_loss": 0.97595245, + "learning_rate": 0.0009799803961288726, + "loss": 0.98800337, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.17053223, + "step": 141, + "time_per_iteration": 3.017937421798706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177855, + "balance_loss_mlp": 1.16135645, + "epoch": 0.027318199307425933, + "flos": 848023464960.0, + "grad_norm": 0.07711499257167788, + "language_loss": 1.03052521, + "learning_rate": 0.000981379875086876, + "loss": 1.04230392, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.16491699, + "step": 142, + "time_per_iteration": 3.0336825847625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154055, + "balance_loss_mlp": 1.13728189, + "epoch": 0.027510580992689496, + "flos": 575288400384.0, + "grad_norm": 0.06449204224600169, + "language_loss": 0.98759103, + "learning_rate": 0.0009827695330590185, + "loss": 0.99913156, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.16784668, + "step": 143, + "time_per_iteration": 2.635596990585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131533, + "balance_loss_mlp": 1.11474872, + "epoch": 0.02770296267795306, + "flos": 772079164416.0, + "grad_norm": 0.07528415949234718, + "language_loss": 0.98083055, + "learning_rate": 0.0009841495069248256, + "loss": 0.9921459, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.16796875, + "step": 144, + "time_per_iteration": 2.9648232460021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123928, + "balance_loss_mlp": 1.10686922, + "epoch": 0.027895344363216622, + "flos": 569123642880.0, + "grad_norm": 0.10995634154815045, + "language_loss": 0.97452384, + "learning_rate": 0.0009855199307219871, + "loss": 0.98576319, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.1706543, + "step": 145, + "time_per_iteration": 2.6601850986480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113445, + "balance_loss_mlp": 1.09632671, + "epoch": 0.028087726048480186, + "flos": 547099564032.0, + "grad_norm": 0.09468853295775125, + "language_loss": 0.98972148, + "learning_rate": 0.0009868809357244854, + "loss": 1.00085592, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.17138672, + "step": 146, + "time_per_iteration": 2.7714684009552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109349, + "balance_loss_mlp": 1.09192085, + "epoch": 0.02828010773374375, + "flos": 524519663616.0, + "grad_norm": 0.08177620360389791, + "language_loss": 1.02921426, + "learning_rate": 0.0009882326505180556, + "loss": 1.04030776, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.17443848, + "step": 147, + "time_per_iteration": 2.6678037643432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121816, + "balance_loss_mlp": 1.10459065, + "epoch": 0.02847248941900731, + "flos": 772108277760.0, + "grad_norm": 0.15200564524835, + "language_loss": 1.01768231, + "learning_rate": 0.0009895752010730906, + "loss": 1.02890062, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.17236328, + "step": 148, + "time_per_iteration": 2.944622755050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139991, + "balance_loss_mlp": 1.12333786, + "epoch": 0.028664871104270875, + "flos": 534150208512.0, + "grad_norm": 0.10043611919636293, + "language_loss": 1.0762012, + "learning_rate": 0.0009909087108150867, + "loss": 1.08760118, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.16662598, + "step": 149, + "time_per_iteration": 2.730631113052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123808, + "balance_loss_mlp": 1.10708272, + "epoch": 0.028857252789534438, + "flos": 367557599232.0, + "grad_norm": 0.08772923811196923, + "language_loss": 1.08558857, + "learning_rate": 0.0009922333006927371, + "loss": 1.09682679, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.1673584, + "step": 150, + "time_per_iteration": 2.5662901401519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108027, + "balance_loss_mlp": 1.09107542, + "epoch": 0.029049634474798, + "flos": 515232534528.0, + "grad_norm": 0.10678098958344774, + "language_loss": 1.02281368, + "learning_rate": 0.0009935490892437632, + "loss": 1.03389382, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.16967773, + "step": 151, + "time_per_iteration": 2.573842763900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110516, + "balance_loss_mlp": 1.0892458, + "epoch": 0.029242016160061564, + "flos": 587840495616.0, + "grad_norm": 0.07022496172976629, + "language_loss": 1.00216019, + "learning_rate": 0.0009948561926585687, + "loss": 1.01321173, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.15905762, + "step": 152, + "time_per_iteration": 2.762035608291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101658, + "balance_loss_mlp": 1.08582664, + "epoch": 0.029434397845325123, + "flos": 551816616960.0, + "grad_norm": 0.08132441134663608, + "language_loss": 1.04400539, + "learning_rate": 0.0009961547248418122, + "loss": 1.05502188, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.15820312, + "step": 153, + "time_per_iteration": 2.6525814533233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092159, + "balance_loss_mlp": 1.07619703, + "epoch": 0.029626779530588686, + "flos": 603221160960.0, + "grad_norm": 0.064379562707883, + "language_loss": 1.01020789, + "learning_rate": 0.0009974447974719707, + "loss": 1.02112949, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.1595459, + "step": 154, + "time_per_iteration": 2.814805746078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011076, + "balance_loss_mlp": 1.09136379, + "epoch": 0.02981916121585225, + "flos": 620808993792.0, + "grad_norm": 0.09363682514066085, + "language_loss": 1.02673674, + "learning_rate": 0.0009987265200589763, + "loss": 1.03781271, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.16235352, + "step": 155, + "time_per_iteration": 2.7394251823425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083912, + "balance_loss_mlp": 1.06821227, + "epoch": 0.030011542901115813, + "flos": 661322962944.0, + "grad_norm": 0.05837038305695058, + "language_loss": 1.02287054, + "learning_rate": 0.001, + "loss": 1.03370976, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.15686035, + "step": 156, + "time_per_iteration": 2.875622272491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091789, + "balance_loss_mlp": 1.07507551, + "epoch": 0.030203924586379376, + "flos": 651258842112.0, + "grad_norm": 0.08525763952586639, + "language_loss": 1.00171304, + "learning_rate": 0.0009999999029413921, + "loss": 1.01263094, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.16723633, + "step": 157, + "time_per_iteration": 2.8360915184020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110493, + "balance_loss_mlp": 1.09382772, + "epoch": 0.03039630627164294, + "flos": 531083091456.0, + "grad_norm": 0.08254544257661527, + "language_loss": 1.01840436, + "learning_rate": 0.0009999996117656068, + "loss": 1.02950931, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.16674805, + "step": 158, + "time_per_iteration": 2.801180124282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096617, + "balance_loss_mlp": 1.08086896, + "epoch": 0.030588687956906502, + "flos": 585914135040.0, + "grad_norm": 0.070993780506174, + "language_loss": 0.95558536, + "learning_rate": 0.0009999991264727564, + "loss": 0.96655154, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.15734863, + "step": 159, + "time_per_iteration": 2.818821668624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096163, + "balance_loss_mlp": 1.08046305, + "epoch": 0.030781069642170065, + "flos": 513026777088.0, + "grad_norm": 0.07077353312716703, + "language_loss": 1.06054807, + "learning_rate": 0.0009999984470630296, + "loss": 1.0715096, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.15686035, + "step": 160, + "time_per_iteration": 2.6040687561035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109657, + "balance_loss_mlp": 1.08113289, + "epoch": 0.030973451327433628, + "flos": 717766064640.0, + "grad_norm": 0.055279151578571405, + "language_loss": 0.94481659, + "learning_rate": 0.0009999975735366902, + "loss": 0.95578229, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.1541748, + "step": 161, + "time_per_iteration": 3.1012368202209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096261, + "balance_loss_mlp": 1.08034658, + "epoch": 0.03116583301269719, + "flos": 1109312133120.0, + "grad_norm": 0.0762466753512266, + "language_loss": 0.96279925, + "learning_rate": 0.0009999965058940775, + "loss": 0.97376186, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.15905762, + "step": 162, + "time_per_iteration": 3.5481724739074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092073, + "balance_loss_mlp": 1.07657552, + "epoch": 0.031358214697960754, + "flos": 450676403712.0, + "grad_norm": 0.0783935068916601, + "language_loss": 1.02822053, + "learning_rate": 0.0009999952441356057, + "loss": 1.03914118, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.15490723, + "step": 163, + "time_per_iteration": 2.513836145401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104121, + "balance_loss_mlp": 1.08844459, + "epoch": 0.031550596383224314, + "flos": 1254701325312.0, + "grad_norm": 0.06003254057509557, + "language_loss": 1.03039443, + "learning_rate": 0.000999993788261765, + "loss": 1.04143572, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.15661621, + "step": 164, + "time_per_iteration": 3.625434398651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097956, + "balance_loss_mlp": 1.08191097, + "epoch": 0.03174297806848788, + "flos": 667841310720.0, + "grad_norm": 0.071706058438464, + "language_loss": 1.04424524, + "learning_rate": 0.00099999213827312, + "loss": 1.0552249, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.16040039, + "step": 165, + "time_per_iteration": 2.7834768295288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111157, + "balance_loss_mlp": 1.09566009, + "epoch": 0.03193535975375144, + "flos": 551033832960.0, + "grad_norm": 0.12829100736108065, + "language_loss": 0.99657446, + "learning_rate": 0.000999990294170312, + "loss": 1.00768602, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.15478516, + "step": 166, + "time_per_iteration": 2.637387752532959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101169, + "balance_loss_mlp": 1.08545709, + "epoch": 0.032127741439015006, + "flos": 543377700864.0, + "grad_norm": 0.06852414366650764, + "language_loss": 1.03638864, + "learning_rate": 0.0009999882559540566, + "loss": 1.04740036, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.15698242, + "step": 167, + "time_per_iteration": 2.6875667572021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098336, + "balance_loss_mlp": 1.0833509, + "epoch": 0.032320123124278566, + "flos": 548104928256.0, + "grad_norm": 0.05076681603646914, + "language_loss": 1.00191641, + "learning_rate": 0.000999986023625145, + "loss": 1.01289976, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.14953613, + "step": 168, + "time_per_iteration": 2.7518744468688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03792956, + "balance_loss_mlp": 3.75500011, + "epoch": 0.03251250480954213, + "flos": 1305156865536.0, + "grad_norm": 0.6529032341502935, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.82717371, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 0.37890625, + "step": 169, + "time_per_iteration": 4.917760133743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167126, + "balance_loss_mlp": 1.15130675, + "epoch": 0.03270488649480569, + "flos": 560866609152.0, + "grad_norm": 0.09865002272530259, + "language_loss": 1.00644767, + "learning_rate": 0.0009999809766328958, + "loss": 1.01811886, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.15808105, + "step": 170, + "time_per_iteration": 2.65771746635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120248, + "balance_loss_mlp": 1.18527782, + "epoch": 0.03289726818006926, + "flos": 482120031744.0, + "grad_norm": 0.08799874436989415, + "language_loss": 1.02774751, + "learning_rate": 0.0009999781619715177, + "loss": 1.03977239, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.17211914, + "step": 171, + "time_per_iteration": 2.562926769256592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122766, + "balance_loss_mlp": 1.21033943, + "epoch": 0.03308964986533282, + "flos": 674355276288.0, + "grad_norm": 0.08542539222295185, + "language_loss": 1.02671802, + "learning_rate": 0.000999975153201402, + "loss": 1.03899455, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.17321777, + "step": 172, + "time_per_iteration": 2.8269002437591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267174, + "balance_loss_mlp": 1.24883962, + "epoch": 0.033282031550596385, + "flos": 608937785856.0, + "grad_norm": 0.120181629337785, + "language_loss": 1.00698161, + "learning_rate": 0.0009999719503237174, + "loss": 1.01965332, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.18347168, + "step": 173, + "time_per_iteration": 2.758136749267578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254087, + "balance_loss_mlp": 1.23402381, + "epoch": 0.033474413235859944, + "flos": 467801547264.0, + "grad_norm": 0.13932237496235436, + "language_loss": 1.08850026, + "learning_rate": 0.0009999685533397073, + "loss": 1.10104108, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.20056152, + "step": 174, + "time_per_iteration": 2.6060163974761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268538, + "balance_loss_mlp": 1.24870133, + "epoch": 0.03366679492112351, + "flos": 579365263872.0, + "grad_norm": 0.0855521850526334, + "language_loss": 1.01282525, + "learning_rate": 0.00099996496225069, + "loss": 1.02551055, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.19824219, + "step": 175, + "time_per_iteration": 2.6688973903656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312845, + "balance_loss_mlp": 1.29124486, + "epoch": 0.03385917660638707, + "flos": 637378315776.0, + "grad_norm": 0.0738431594221532, + "language_loss": 1.03378773, + "learning_rate": 0.0009999611770580604, + "loss": 1.04691625, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.21606445, + "step": 176, + "time_per_iteration": 2.8642566204071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01345291, + "balance_loss_mlp": 1.32329679, + "epoch": 0.03405155829165064, + "flos": 441587123712.0, + "grad_norm": 0.09985791713424727, + "language_loss": 1.02061462, + "learning_rate": 0.0009999571977632876, + "loss": 1.03406763, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.21984863, + "step": 177, + "time_per_iteration": 2.620537757873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0133899, + "balance_loss_mlp": 1.31619775, + "epoch": 0.034243939976914196, + "flos": 466097766912.0, + "grad_norm": 0.09257746092300488, + "language_loss": 1.05255055, + "learning_rate": 0.0009999530243679166, + "loss": 1.06594038, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.2277832, + "step": 178, + "time_per_iteration": 2.5526390075683594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01321119, + "balance_loss_mlp": 1.29928029, + "epoch": 0.03443632166217776, + "flos": 778919016960.0, + "grad_norm": 0.07612740556433409, + "language_loss": 1.00229979, + "learning_rate": 0.0009999486568735675, + "loss": 1.0155108, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.21850586, + "step": 179, + "time_per_iteration": 3.084320068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01314096, + "balance_loss_mlp": 1.29238796, + "epoch": 0.03462870334744132, + "flos": 1263284246016.0, + "grad_norm": 0.08380095909791664, + "language_loss": 1.00181103, + "learning_rate": 0.0009999440952819362, + "loss": 1.01495194, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.21716309, + "step": 180, + "time_per_iteration": 3.6467599868774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288371, + "balance_loss_mlp": 1.26746202, + "epoch": 0.03482108503270489, + "flos": 606899354112.0, + "grad_norm": 0.10452638314540276, + "language_loss": 1.00434995, + "learning_rate": 0.0009999393395947935, + "loss": 1.01723361, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.20935059, + "step": 181, + "time_per_iteration": 2.8092122077941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271899, + "balance_loss_mlp": 1.25226557, + "epoch": 0.03501346671796845, + "flos": 538010284032.0, + "grad_norm": 0.1078936362641923, + "language_loss": 1.03725255, + "learning_rate": 0.0009999343898139858, + "loss": 1.04997146, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.19616699, + "step": 182, + "time_per_iteration": 2.6274633407592773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260451, + "balance_loss_mlp": 1.23960137, + "epoch": 0.035205848403232015, + "flos": 518231250432.0, + "grad_norm": 0.13163794074334914, + "language_loss": 1.02352095, + "learning_rate": 0.0009999292459414348, + "loss": 1.03612542, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.20849609, + "step": 183, + "time_per_iteration": 2.5587446689605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241243, + "balance_loss_mlp": 1.22079897, + "epoch": 0.035398230088495575, + "flos": 472134486528.0, + "grad_norm": 0.11087783412260319, + "language_loss": 1.06915629, + "learning_rate": 0.0009999239079791374, + "loss": 1.08156872, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.2043457, + "step": 184, + "time_per_iteration": 2.5720386505126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264671, + "balance_loss_mlp": 1.24370217, + "epoch": 0.03559061177375914, + "flos": 511820591616.0, + "grad_norm": 0.08935796417892215, + "language_loss": 0.99749458, + "learning_rate": 0.0009999183759291659, + "loss": 1.01014113, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.2097168, + "step": 185, + "time_per_iteration": 2.7049641609191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283684, + "balance_loss_mlp": 1.26222682, + "epoch": 0.0357829934590227, + "flos": 477146903040.0, + "grad_norm": 0.1506087846083958, + "language_loss": 1.02522779, + "learning_rate": 0.0009999126497936682, + "loss": 1.03806448, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.21459961, + "step": 186, + "time_per_iteration": 2.5040838718414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266329, + "balance_loss_mlp": 1.24443007, + "epoch": 0.03597537514428627, + "flos": 644350588416.0, + "grad_norm": 0.07597181242921475, + "language_loss": 1.04941225, + "learning_rate": 0.0009999067295748676, + "loss": 1.0620755, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.21899414, + "step": 187, + "time_per_iteration": 2.8635194301605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01276828, + "balance_loss_mlp": 1.25491714, + "epoch": 0.03616775682954983, + "flos": 580916275200.0, + "grad_norm": 0.10348177684206804, + "language_loss": 1.02588224, + "learning_rate": 0.000999900615275062, + "loss": 1.03865051, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.21911621, + "step": 188, + "time_per_iteration": 2.6797780990600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272596, + "balance_loss_mlp": 1.25078082, + "epoch": 0.03636013851481339, + "flos": 382210735104.0, + "grad_norm": 0.11548780673963775, + "language_loss": 1.08482468, + "learning_rate": 0.0009998943068966256, + "loss": 1.09755063, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.21826172, + "step": 189, + "time_per_iteration": 2.446465253829956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282253, + "balance_loss_mlp": 1.25919747, + "epoch": 0.03655252020007695, + "flos": 582954706944.0, + "grad_norm": 0.10548213053156746, + "language_loss": 1.03159523, + "learning_rate": 0.0009998878044420072, + "loss": 1.04441762, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.23071289, + "step": 190, + "time_per_iteration": 2.715607166290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282371, + "balance_loss_mlp": 1.2598052, + "epoch": 0.03674490188534051, + "flos": 471376433664.0, + "grad_norm": 0.11932481378659279, + "language_loss": 0.98991239, + "learning_rate": 0.0009998811079137318, + "loss": 1.00273609, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.22558594, + "step": 191, + "time_per_iteration": 2.6401267051696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260121, + "balance_loss_mlp": 1.2387228, + "epoch": 0.03693728357060408, + "flos": 528113488896.0, + "grad_norm": 0.10247339740719702, + "language_loss": 1.0056088, + "learning_rate": 0.0009998742173143987, + "loss": 1.01821005, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.21411133, + "step": 192, + "time_per_iteration": 2.6355819702148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261897, + "balance_loss_mlp": 1.24107122, + "epoch": 0.03712966525586764, + "flos": 798657352704.0, + "grad_norm": 0.19022984523402262, + "language_loss": 1.00051641, + "learning_rate": 0.0009998671326466833, + "loss": 1.01313543, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.20837402, + "step": 193, + "time_per_iteration": 3.009938955307007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264185, + "balance_loss_mlp": 1.24278712, + "epoch": 0.037322046941131205, + "flos": 829628116992.0, + "grad_norm": 0.16347382701944235, + "language_loss": 1.01202989, + "learning_rate": 0.0009998598539133362, + "loss": 1.02467179, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.21386719, + "step": 194, + "time_per_iteration": 3.032041311264038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320641, + "balance_loss_mlp": 1.29752648, + "epoch": 0.037514428626394765, + "flos": 437460797952.0, + "grad_norm": 0.09447382654807665, + "language_loss": 1.02349281, + "learning_rate": 0.0009998523811171828, + "loss": 1.0366993, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.23132324, + "step": 195, + "time_per_iteration": 2.5140883922576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01385941, + "balance_loss_mlp": 1.36191988, + "epoch": 0.03770681031165833, + "flos": 511372459008.0, + "grad_norm": 0.174477259749112, + "language_loss": 1.02751505, + "learning_rate": 0.0009998447142611248, + "loss": 1.04137444, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.24047852, + "step": 196, + "time_per_iteration": 2.6540584564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01374932, + "balance_loss_mlp": 1.3512454, + "epoch": 0.03789919199692189, + "flos": 807102061056.0, + "grad_norm": 0.19785353386832685, + "language_loss": 0.95925725, + "learning_rate": 0.0009998368533481387, + "loss": 0.97300661, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.23657227, + "step": 197, + "time_per_iteration": 3.0361931324005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132819, + "balance_loss_mlp": 1.30602896, + "epoch": 0.03809157368218546, + "flos": 690274234368.0, + "grad_norm": 0.07201942870831356, + "language_loss": 0.98943031, + "learning_rate": 0.0009998287983812762, + "loss": 1.00271225, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.22155762, + "step": 198, + "time_per_iteration": 2.8737523555755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316145, + "balance_loss_mlp": 1.2943778, + "epoch": 0.03828395536744902, + "flos": 517675428864.0, + "grad_norm": 0.07974969111573339, + "language_loss": 1.04380584, + "learning_rate": 0.0009998205493636646, + "loss": 1.05696738, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.21789551, + "step": 199, + "time_per_iteration": 2.6439247131347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323551, + "balance_loss_mlp": 1.30098474, + "epoch": 0.038476337052712584, + "flos": 581389138944.0, + "grad_norm": 0.08769997267084173, + "language_loss": 0.97346306, + "learning_rate": 0.0009998121062985063, + "loss": 0.98669851, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.22583008, + "step": 200, + "time_per_iteration": 2.738266944885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01342622, + "balance_loss_mlp": 1.3199718, + "epoch": 0.03866871873797614, + "flos": 576791359488.0, + "grad_norm": 0.1288031319123161, + "language_loss": 0.99576765, + "learning_rate": 0.0009998034691890794, + "loss": 1.0091939, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.2265625, + "step": 201, + "time_per_iteration": 2.815068244934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01322045, + "balance_loss_mlp": 1.29940701, + "epoch": 0.03886110042323971, + "flos": 540472117248.0, + "grad_norm": 0.1480539814519598, + "language_loss": 1.04135096, + "learning_rate": 0.0009997946380387369, + "loss": 1.05457139, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.22619629, + "step": 202, + "time_per_iteration": 2.6735482215881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272248, + "balance_loss_mlp": 1.24913371, + "epoch": 0.03905348210850327, + "flos": 717694843392.0, + "grad_norm": 0.10058314649993264, + "language_loss": 1.06271195, + "learning_rate": 0.0009997856128509076, + "loss": 1.07543445, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.23132324, + "step": 203, + "time_per_iteration": 2.858497142791748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238452, + "balance_loss_mlp": 1.21574211, + "epoch": 0.039245863793766836, + "flos": 427268639232.0, + "grad_norm": 0.07713628959924962, + "language_loss": 1.01241136, + "learning_rate": 0.0009997763936290952, + "loss": 1.02479577, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.22705078, + "step": 204, + "time_per_iteration": 2.5389275550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254542, + "balance_loss_mlp": 1.22998452, + "epoch": 0.039438245479030395, + "flos": 662804163072.0, + "grad_norm": 0.10588145989282294, + "language_loss": 1.06408, + "learning_rate": 0.0009997669803768789, + "loss": 1.07662535, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.24560547, + "step": 205, + "time_per_iteration": 2.8234684467315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249653, + "balance_loss_mlp": 1.2262044, + "epoch": 0.03963062716429396, + "flos": 635063459328.0, + "grad_norm": 0.1260931618436919, + "language_loss": 1.01299226, + "learning_rate": 0.0009997573730979134, + "loss": 1.02548885, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.23461914, + "step": 206, + "time_per_iteration": 2.7586512565612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03194186, + "balance_loss_mlp": 2.85391545, + "epoch": 0.03982300884955752, + "flos": 1417813286400.0, + "grad_norm": 0.3208039945146043, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.82387388, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 3.40625, + "step": 207, + "time_per_iteration": 4.668841123580933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287285, + "balance_loss_mlp": 1.26177394, + "epoch": 0.04001539053482109, + "flos": 688769713152.0, + "grad_norm": 0.15196225676568717, + "language_loss": 1.00590456, + "learning_rate": 0.0009997375764747294, + "loss": 1.01877737, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.25512695, + "step": 208, + "time_per_iteration": 3.0460121631622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275833, + "balance_loss_mlp": 1.25076318, + "epoch": 0.04020777222008465, + "flos": 533363042304.0, + "grad_norm": 0.09666220749273949, + "language_loss": 0.97800297, + "learning_rate": 0.0009997273871381967, + "loss": 0.99076128, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.25085449, + "step": 209, + "time_per_iteration": 2.7027134895324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126115, + "balance_loss_mlp": 1.23683095, + "epoch": 0.040400153905348214, + "flos": 567661381632.0, + "grad_norm": 0.09901686865787228, + "language_loss": 1.02878523, + "learning_rate": 0.0009997170037902862, + "loss": 1.04139662, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.2434082, + "step": 210, + "time_per_iteration": 2.7203080654144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228259, + "balance_loss_mlp": 1.20371389, + "epoch": 0.040592535590611774, + "flos": 713130559488.0, + "grad_norm": 0.11653422944125434, + "language_loss": 1.0505805, + "learning_rate": 0.0009997064264350292, + "loss": 1.06286311, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.24536133, + "step": 211, + "time_per_iteration": 2.8774335384368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239303, + "balance_loss_mlp": 1.21149194, + "epoch": 0.04078491727587533, + "flos": 577824427008.0, + "grad_norm": 0.06455145782580095, + "language_loss": 0.99545413, + "learning_rate": 0.0009996956550765317, + "loss": 1.00784707, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.27770996, + "step": 212, + "time_per_iteration": 2.6957452297210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222017, + "balance_loss_mlp": 1.19556475, + "epoch": 0.0409772989611389, + "flos": 552033404928.0, + "grad_norm": 0.1270361519775568, + "language_loss": 0.94278163, + "learning_rate": 0.0009996846897189762, + "loss": 0.95500183, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.26452637, + "step": 213, + "time_per_iteration": 2.6380836963653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223712, + "balance_loss_mlp": 1.19798708, + "epoch": 0.04116968064640246, + "flos": 555347833344.0, + "grad_norm": 0.1000627367739684, + "language_loss": 1.00583601, + "learning_rate": 0.0009996735303666193, + "loss": 1.01807308, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.25720215, + "step": 214, + "time_per_iteration": 2.7703840732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205703, + "balance_loss_mlp": 1.18167019, + "epoch": 0.041362062331666026, + "flos": 578204158464.0, + "grad_norm": 0.10044224354438386, + "language_loss": 1.02544665, + "learning_rate": 0.0009996621770237937, + "loss": 1.0375036, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.24035645, + "step": 215, + "time_per_iteration": 2.747954845428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194773, + "balance_loss_mlp": 1.17049026, + "epoch": 0.041554444016929586, + "flos": 611130396672.0, + "grad_norm": 0.07439915791739656, + "language_loss": 0.98184484, + "learning_rate": 0.0009996506296949073, + "loss": 0.99379259, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.24267578, + "step": 216, + "time_per_iteration": 2.957000732421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178169, + "balance_loss_mlp": 1.15088165, + "epoch": 0.04174682570219315, + "flos": 527857413120.0, + "grad_norm": 0.07228572223559625, + "language_loss": 0.98363817, + "learning_rate": 0.0009996388883844428, + "loss": 0.99541986, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.27294922, + "step": 217, + "time_per_iteration": 2.625004529953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163207, + "balance_loss_mlp": 1.13855505, + "epoch": 0.04193920738745671, + "flos": 511258977792.0, + "grad_norm": 0.0709878545566638, + "language_loss": 1.02471972, + "learning_rate": 0.0009996269530969588, + "loss": 1.0363518, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.24645996, + "step": 218, + "time_per_iteration": 2.577202796936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153651, + "balance_loss_mlp": 1.13201451, + "epoch": 0.04213158907272028, + "flos": 571226093568.0, + "grad_norm": 0.081462998095588, + "language_loss": 1.00934064, + "learning_rate": 0.0009996148238370888, + "loss": 1.02087712, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.21655273, + "step": 219, + "time_per_iteration": 2.75849986076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128401, + "balance_loss_mlp": 1.10447621, + "epoch": 0.04232397075798384, + "flos": 963803667456.0, + "grad_norm": 0.08476688765369866, + "language_loss": 0.96862441, + "learning_rate": 0.0009996025006095421, + "loss": 0.97990847, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.23962402, + "step": 220, + "time_per_iteration": 3.316199541091919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03366003, + "balance_loss_mlp": 3.11881113, + "epoch": 0.042516352443247404, + "flos": 1468814777856.0, + "grad_norm": 0.3512460928075295, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.81149149, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 2.46875, + "step": 221, + "time_per_iteration": 5.585368633270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113685, + "balance_loss_mlp": 1.11290038, + "epoch": 0.042708734128510964, + "flos": 654419091456.0, + "grad_norm": 0.07993960649684186, + "language_loss": 0.97486591, + "learning_rate": 0.0009995772722706307, + "loss": 0.98623443, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.23950195, + "step": 222, + "time_per_iteration": 2.8408098220825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182736, + "balance_loss_mlp": 1.15682042, + "epoch": 0.04290111581377453, + "flos": 431601578496.0, + "grad_norm": 0.11511868264512252, + "language_loss": 1.11370254, + "learning_rate": 0.0009995643671690604, + "loss": 1.12553, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.25927734, + "step": 223, + "time_per_iteration": 2.4770917892456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194194, + "balance_loss_mlp": 1.16939855, + "epoch": 0.04309349749903809, + "flos": 644379701760.0, + "grad_norm": 0.13725027562770867, + "language_loss": 0.98326594, + "learning_rate": 0.0009995512681194023, + "loss": 0.99520785, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.24804688, + "step": 224, + "time_per_iteration": 2.901346445083618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011754, + "balance_loss_mlp": 1.14950812, + "epoch": 0.04328587918430166, + "flos": 830861853696.0, + "grad_norm": 0.06929706927237234, + "language_loss": 0.96731412, + "learning_rate": 0.0009995379751267417, + "loss": 0.97906816, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.25891113, + "step": 225, + "time_per_iteration": 3.238084316253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170568, + "balance_loss_mlp": 1.14375746, + "epoch": 0.043478260869565216, + "flos": 524804852736.0, + "grad_norm": 0.07435013646684872, + "language_loss": 0.98210657, + "learning_rate": 0.0009995244881962398, + "loss": 0.99381226, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.26843262, + "step": 226, + "time_per_iteration": 2.6193530559539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162667, + "balance_loss_mlp": 1.1352731, + "epoch": 0.04367064255482878, + "flos": 439253328384.0, + "grad_norm": 0.08505882003862496, + "language_loss": 0.98532695, + "learning_rate": 0.0009995108073331323, + "loss": 0.99695361, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.27416992, + "step": 227, + "time_per_iteration": 2.621875524520874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167442, + "balance_loss_mlp": 1.13921285, + "epoch": 0.04386302424009234, + "flos": 507109330944.0, + "grad_norm": 0.06754882710561792, + "language_loss": 1.01820612, + "learning_rate": 0.0009994969325427309, + "loss": 1.02988064, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.28222656, + "step": 228, + "time_per_iteration": 2.6876742839813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182366, + "balance_loss_mlp": 1.1523968, + "epoch": 0.04405540592535591, + "flos": 540432829440.0, + "grad_norm": 0.06680156886068128, + "language_loss": 0.97377843, + "learning_rate": 0.0009994828638304218, + "loss": 0.98560202, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.29980469, + "step": 229, + "time_per_iteration": 2.6631240844726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198543, + "balance_loss_mlp": 1.16969442, + "epoch": 0.04424778761061947, + "flos": 446136850944.0, + "grad_norm": 0.08411507650901279, + "language_loss": 1.03665459, + "learning_rate": 0.0009994686012016675, + "loss": 1.04864001, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.28833008, + "step": 230, + "time_per_iteration": 2.499721050262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122651, + "balance_loss_mlp": 1.19675517, + "epoch": 0.044440169295883035, + "flos": 700383435264.0, + "grad_norm": 0.09876086989002084, + "language_loss": 1.02814984, + "learning_rate": 0.000999454144662005, + "loss": 1.04041505, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.29711914, + "step": 231, + "time_per_iteration": 2.911175489425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224486, + "balance_loss_mlp": 1.19466019, + "epoch": 0.044632550981146595, + "flos": 588055873536.0, + "grad_norm": 0.10057378611284366, + "language_loss": 0.96611959, + "learning_rate": 0.0009994394942170468, + "loss": 0.97836453, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.2980957, + "step": 232, + "time_per_iteration": 2.7470107078552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012208, + "balance_loss_mlp": 1.19083118, + "epoch": 0.04482493266641016, + "flos": 554534525952.0, + "grad_norm": 0.06893435559553937, + "language_loss": 0.94648588, + "learning_rate": 0.0009994246498724808, + "loss": 0.95869386, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.29956055, + "step": 233, + "time_per_iteration": 2.7436845302581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206879, + "balance_loss_mlp": 1.17860246, + "epoch": 0.04501731435167372, + "flos": 722500646400.0, + "grad_norm": 0.08371813790363081, + "language_loss": 0.97381985, + "learning_rate": 0.00099940961163407, + "loss": 0.9858886, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.28295898, + "step": 234, + "time_per_iteration": 2.876405715942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119556, + "balance_loss_mlp": 1.16654444, + "epoch": 0.04520969603693728, + "flos": 511539784704.0, + "grad_norm": 0.08201306351282911, + "language_loss": 1.00061524, + "learning_rate": 0.0009993943795076528, + "loss": 1.01257086, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.2902832, + "step": 235, + "time_per_iteration": 2.6432723999023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168701, + "balance_loss_mlp": 1.13873136, + "epoch": 0.04540207772220085, + "flos": 364854246912.0, + "grad_norm": 0.12052684551098608, + "language_loss": 1.01575673, + "learning_rate": 0.0009993789534991427, + "loss": 1.02744377, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.29907227, + "step": 236, + "time_per_iteration": 2.4240100383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136514, + "balance_loss_mlp": 1.10954857, + "epoch": 0.045594459407464406, + "flos": 522407038464.0, + "grad_norm": 0.0561052231541492, + "language_loss": 0.96778214, + "learning_rate": 0.0009993633336145287, + "loss": 0.97914726, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.26977539, + "step": 237, + "time_per_iteration": 2.638850688934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130626, + "balance_loss_mlp": 1.10363674, + "epoch": 0.04578684109272797, + "flos": 671442338304.0, + "grad_norm": 0.06334524880145487, + "language_loss": 1.0125159, + "learning_rate": 0.0009993475198598752, + "loss": 1.02382219, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.26989746, + "step": 238, + "time_per_iteration": 3.0008814334869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109235, + "balance_loss_mlp": 1.08395052, + "epoch": 0.04597922277799153, + "flos": 541387321344.0, + "grad_norm": 0.08922144233736891, + "language_loss": 0.97379184, + "learning_rate": 0.0009993315122413212, + "loss": 0.98488414, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.25305176, + "step": 239, + "time_per_iteration": 2.620474100112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121059, + "balance_loss_mlp": 1.09551263, + "epoch": 0.0461716044632551, + "flos": 458732616192.0, + "grad_norm": 0.09980166654849132, + "language_loss": 0.97848725, + "learning_rate": 0.0009993153107650818, + "loss": 0.98969781, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.25537109, + "step": 240, + "time_per_iteration": 2.5547702312469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111352, + "balance_loss_mlp": 1.08719897, + "epoch": 0.04636398614851866, + "flos": 455009342976.0, + "grad_norm": 0.09180653876933564, + "language_loss": 0.96700346, + "learning_rate": 0.0009992989154374468, + "loss": 0.97813869, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.2635498, + "step": 241, + "time_per_iteration": 2.5366051197052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105107, + "balance_loss_mlp": 1.07833242, + "epoch": 0.046556367833782225, + "flos": 556558401024.0, + "grad_norm": 0.07962621760937992, + "language_loss": 1.03585958, + "learning_rate": 0.0009992823262647817, + "loss": 1.04691052, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26782227, + "step": 242, + "time_per_iteration": 2.726482391357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100596, + "balance_loss_mlp": 1.07384586, + "epoch": 0.046748749519045785, + "flos": 592625949696.0, + "grad_norm": 0.0814561151731407, + "language_loss": 0.97787237, + "learning_rate": 0.0009992655432535264, + "loss": 0.98887837, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.26782227, + "step": 243, + "time_per_iteration": 2.765273332595825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098204, + "balance_loss_mlp": 1.07214487, + "epoch": 0.04694113120430935, + "flos": 569596506624.0, + "grad_norm": 0.0750228199707575, + "language_loss": 0.98452473, + "learning_rate": 0.0009992485664101973, + "loss": 0.99550676, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.26037598, + "step": 244, + "time_per_iteration": 2.696781635284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_mlp": 1.08732188, + "epoch": 0.04713351288957291, + "flos": 863401158144.0, + "grad_norm": 0.08629455000399752, + "language_loss": 1.00806224, + "learning_rate": 0.000999231395741385, + "loss": 1.01922584, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.2902832, + "step": 245, + "time_per_iteration": 3.1403207778930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117221, + "balance_loss_mlp": 1.08958876, + "epoch": 0.04732589457483648, + "flos": 536961249792.0, + "grad_norm": 0.07729478564770192, + "language_loss": 0.986202, + "learning_rate": 0.0009992140312537557, + "loss": 0.99737418, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.27661133, + "step": 246, + "time_per_iteration": 2.7038867473602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111137, + "balance_loss_mlp": 1.08410013, + "epoch": 0.04751827626010004, + "flos": 761566910976.0, + "grad_norm": 0.08592122791377885, + "language_loss": 0.93525487, + "learning_rate": 0.000999196472954051, + "loss": 0.94636625, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.27050781, + "step": 247, + "time_per_iteration": 2.9575722217559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0471772, + "balance_loss_mlp": 4.51020002, + "epoch": 0.0477106579453636, + "flos": 1578961313280.0, + "grad_norm": 0.4683520251238934, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.84142572, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 2.078125, + "step": 248, + "time_per_iteration": 5.452638387680054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200681, + "balance_loss_mlp": 1.17252362, + "epoch": 0.04790303963062716, + "flos": 457535195136.0, + "grad_norm": 0.13106789232715058, + "language_loss": 1.01118052, + "learning_rate": 0.0009991607749457578, + "loss": 1.02318728, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.28173828, + "step": 249, + "time_per_iteration": 2.5066423416137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256525, + "balance_loss_mlp": 1.22541094, + "epoch": 0.04809542131589073, + "flos": 782079266304.0, + "grad_norm": 0.1327983626735717, + "language_loss": 0.98959935, + "learning_rate": 0.0009991426352510286, + "loss": 1.0021646, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.31103516, + "step": 250, + "time_per_iteration": 3.0130999088287354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250303, + "balance_loss_mlp": 1.22023845, + "epoch": 0.04828780300115429, + "flos": 558995503104.0, + "grad_norm": 0.11435576550904086, + "language_loss": 1.00191545, + "learning_rate": 0.0009991243017719422, + "loss": 1.01441836, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.30053711, + "step": 251, + "time_per_iteration": 2.6584134101867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190864, + "balance_loss_mlp": 1.16108572, + "epoch": 0.048480184686417856, + "flos": 501682277376.0, + "grad_norm": 0.08343855539664048, + "language_loss": 0.94829702, + "learning_rate": 0.0009991057745156165, + "loss": 0.96020567, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.29760742, + "step": 252, + "time_per_iteration": 2.6125926971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03043524, + "balance_loss_mlp": 2.97905564, + "epoch": 0.048672566371681415, + "flos": 1535585430528.0, + "grad_norm": 0.48807257564671885, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.84954512, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 0.64453125, + "step": 253, + "time_per_iteration": 5.0318169593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205448, + "balance_loss_mlp": 1.17426276, + "epoch": 0.04886494805694498, + "flos": 537665458176.0, + "grad_norm": 0.15081419889398517, + "language_loss": 1.02692831, + "learning_rate": 0.0009990681387000943, + "loss": 1.03898275, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.31152344, + "step": 254, + "time_per_iteration": 2.7569847106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231579, + "balance_loss_mlp": 1.20053661, + "epoch": 0.04905732974220854, + "flos": 679841966592.0, + "grad_norm": 0.10308088004196624, + "language_loss": 0.98562324, + "learning_rate": 0.0009990490301555093, + "loss": 0.99793905, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.31054688, + "step": 255, + "time_per_iteration": 2.9826152324676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01973911, + "balance_loss_mlp": 1.89609146, + "epoch": 0.04924971142747211, + "flos": 1420408949760.0, + "grad_norm": 0.14603633134579833, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.8118906, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.77734375, + "step": 256, + "time_per_iteration": 4.873262643814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01994546, + "balance_loss_mlp": 1.91596293, + "epoch": 0.04944209311273567, + "flos": 1557202074624.0, + "grad_norm": 0.1290240934598827, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.81237286, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.78515625, + "step": 257, + "time_per_iteration": 4.981585502624512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01945028, + "balance_loss_mlp": 1.87979627, + "epoch": 0.04963447479799923, + "flos": 1569985514496.0, + "grad_norm": 0.10634084131038181, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71920907, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.65234375, + "step": 258, + "time_per_iteration": 4.869063138961792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231874, + "balance_loss_mlp": 1.20192897, + "epoch": 0.049826856483262794, + "flos": 625063357440.0, + "grad_norm": 0.1721871775998346, + "language_loss": 0.93400717, + "learning_rate": 0.0009989706585723202, + "loss": 0.9463259, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.29956055, + "step": 259, + "time_per_iteration": 2.828618049621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226271, + "balance_loss_mlp": 1.1963017, + "epoch": 0.05001923816852635, + "flos": 503912765952.0, + "grad_norm": 0.13941406884376095, + "language_loss": 0.9926306, + "learning_rate": 0.0009989505813633442, + "loss": 1.0048933, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.29931641, + "step": 260, + "time_per_iteration": 2.7033097743988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167993, + "balance_loss_mlp": 1.13833416, + "epoch": 0.05021161985378992, + "flos": 587066476032.0, + "grad_norm": 0.078052738900574, + "language_loss": 0.99695522, + "learning_rate": 0.000998930310444573, + "loss": 1.00863528, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.29663086, + "step": 261, + "time_per_iteration": 2.739182949066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120344, + "balance_loss_mlp": 1.09104276, + "epoch": 0.05040400153905348, + "flos": 633029409792.0, + "grad_norm": 0.10502347912179442, + "language_loss": 0.97120214, + "learning_rate": 0.0009989098458238765, + "loss": 0.98240554, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.29296875, + "step": 262, + "time_per_iteration": 2.81984806060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109888, + "balance_loss_mlp": 1.07910872, + "epoch": 0.050596383224317046, + "flos": 553344307200.0, + "grad_norm": 0.1022419163820973, + "language_loss": 0.96531391, + "learning_rate": 0.0009988891875091998, + "loss": 0.97641277, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.30761719, + "step": 263, + "time_per_iteration": 2.816471576690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119537, + "balance_loss_mlp": 1.08949661, + "epoch": 0.050788764909580605, + "flos": 549389689344.0, + "grad_norm": 0.07930699495869925, + "language_loss": 0.91512978, + "learning_rate": 0.0009988683355085636, + "loss": 0.92632508, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.30004883, + "step": 264, + "time_per_iteration": 2.7963876724243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116935, + "balance_loss_mlp": 1.1386174, + "epoch": 0.05098114659484417, + "flos": 604812870144.0, + "grad_norm": 0.1164382368145933, + "language_loss": 1.00062299, + "learning_rate": 0.000998847289830063, + "loss": 1.01231647, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.30688477, + "step": 265, + "time_per_iteration": 2.8219666481018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180582, + "balance_loss_mlp": 1.14922965, + "epoch": 0.05117352828010773, + "flos": 438317775360.0, + "grad_norm": 0.14769195776656788, + "language_loss": 0.92838919, + "learning_rate": 0.0009988260504818682, + "loss": 0.94019508, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.31323242, + "step": 266, + "time_per_iteration": 2.527987480163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159175, + "balance_loss_mlp": 1.12753642, + "epoch": 0.0513659099653713, + "flos": 504784300032.0, + "grad_norm": 0.1223822648996979, + "language_loss": 0.99088645, + "learning_rate": 0.000998804617472226, + "loss": 1.00247824, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.31616211, + "step": 267, + "time_per_iteration": 2.6469640731811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129085, + "balance_loss_mlp": 1.09735131, + "epoch": 0.05155829165063486, + "flos": 695183344128.0, + "grad_norm": 0.09065118463065669, + "language_loss": 0.94319087, + "learning_rate": 0.0009987829908094568, + "loss": 0.95448172, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.31713867, + "step": 268, + "time_per_iteration": 2.821777105331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131208, + "balance_loss_mlp": 1.10014248, + "epoch": 0.051750673335898424, + "flos": 1347751830528.0, + "grad_norm": 0.11182301329739544, + "language_loss": 1.00247467, + "learning_rate": 0.0009987611705019569, + "loss": 1.01378679, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.31030273, + "step": 269, + "time_per_iteration": 4.288902521133423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117374, + "balance_loss_mlp": 1.08509207, + "epoch": 0.051943055021161984, + "flos": 489362936832.0, + "grad_norm": 0.06856601771993416, + "language_loss": 0.99786204, + "learning_rate": 0.0009987391565581978, + "loss": 1.00903583, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.32275391, + "step": 270, + "time_per_iteration": 2.634683132171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119573, + "balance_loss_mlp": 1.08681393, + "epoch": 0.05213543670642555, + "flos": 545504882688.0, + "grad_norm": 0.08930504281721281, + "language_loss": 0.92515171, + "learning_rate": 0.000998716948986726, + "loss": 0.93634748, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.32763672, + "step": 271, + "time_per_iteration": 2.7899389266967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120606, + "balance_loss_mlp": 1.08970654, + "epoch": 0.05232781839168911, + "flos": 603285179904.0, + "grad_norm": 0.10701715244821809, + "language_loss": 0.94677854, + "learning_rate": 0.0009986945477961633, + "loss": 0.95798463, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.30859375, + "step": 272, + "time_per_iteration": 2.703118324279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108581, + "balance_loss_mlp": 1.07789683, + "epoch": 0.052520200076952676, + "flos": 538218307584.0, + "grad_norm": 0.050944004487463904, + "language_loss": 1.00078344, + "learning_rate": 0.0009986719529952066, + "loss": 1.01186931, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.30639648, + "step": 273, + "time_per_iteration": 2.85548734664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097085, + "balance_loss_mlp": 1.06668699, + "epoch": 0.052712581762216236, + "flos": 463148513280.0, + "grad_norm": 0.06235958359183371, + "language_loss": 0.99016273, + "learning_rate": 0.000998649164592628, + "loss": 1.00113368, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.3034668, + "step": 274, + "time_per_iteration": 2.5841050148010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104755, + "balance_loss_mlp": 1.07507145, + "epoch": 0.0529049634474798, + "flos": 547749927936.0, + "grad_norm": 0.10062534885586208, + "language_loss": 0.96764064, + "learning_rate": 0.0009986261825972748, + "loss": 0.97868812, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.29663086, + "step": 275, + "time_per_iteration": 2.6752514839172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107504, + "balance_loss_mlp": 1.07798743, + "epoch": 0.05309734513274336, + "flos": 617727320064.0, + "grad_norm": 0.08071716286169645, + "language_loss": 0.98941195, + "learning_rate": 0.000998603007018069, + "loss": 1.00048697, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.29541016, + "step": 276, + "time_per_iteration": 2.8236005306243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118213, + "balance_loss_mlp": 1.08767152, + "epoch": 0.05328972681800693, + "flos": 605220304896.0, + "grad_norm": 0.07622563991542974, + "language_loss": 0.96909779, + "learning_rate": 0.0009985796378640089, + "loss": 0.98027998, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.30517578, + "step": 277, + "time_per_iteration": 2.7089598178863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110773, + "balance_loss_mlp": 1.07940567, + "epoch": 0.05348210850327049, + "flos": 604197411840.0, + "grad_norm": 0.07841820465234402, + "language_loss": 0.95740211, + "learning_rate": 0.0009985560751441665, + "loss": 0.96847939, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.28320312, + "step": 278, + "time_per_iteration": 2.834015369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108783, + "balance_loss_mlp": 1.07831299, + "epoch": 0.053674490188534055, + "flos": 630480236544.0, + "grad_norm": 0.07361828218816212, + "language_loss": 0.9799974, + "learning_rate": 0.00099853231886769, + "loss": 0.99108523, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.30444336, + "step": 279, + "time_per_iteration": 2.8200764656066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108628, + "balance_loss_mlp": 1.07937431, + "epoch": 0.053866871873797614, + "flos": 478939433472.0, + "grad_norm": 0.07512382427920342, + "language_loss": 0.98746061, + "learning_rate": 0.0009985083690438024, + "loss": 0.99854696, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.29223633, + "step": 280, + "time_per_iteration": 2.75639271736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113716, + "balance_loss_mlp": 1.08310306, + "epoch": 0.054059253559061174, + "flos": 787673645568.0, + "grad_norm": 0.09326847112688041, + "language_loss": 0.89231437, + "learning_rate": 0.0009984842256818016, + "loss": 0.90345156, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.3059082, + "step": 281, + "time_per_iteration": 3.0839526653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121888, + "balance_loss_mlp": 1.09182298, + "epoch": 0.05425163524432474, + "flos": 628076630016.0, + "grad_norm": 0.062071298051891176, + "language_loss": 0.99695373, + "learning_rate": 0.0009984598887910613, + "loss": 1.00817263, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.30029297, + "step": 282, + "time_per_iteration": 2.7197024822235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123523, + "balance_loss_mlp": 1.09283888, + "epoch": 0.0544440169295883, + "flos": 615453161472.0, + "grad_norm": 0.08448232068887077, + "language_loss": 0.95169044, + "learning_rate": 0.0009984353583810297, + "loss": 0.96292561, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.30664062, + "step": 283, + "time_per_iteration": 2.8440537452697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127605, + "balance_loss_mlp": 1.09811282, + "epoch": 0.05463639861485187, + "flos": 647471549952.0, + "grad_norm": 0.07597313108733957, + "language_loss": 0.97190034, + "learning_rate": 0.0009984106344612302, + "loss": 0.98317641, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.29492188, + "step": 284, + "time_per_iteration": 2.7592926025390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139329, + "balance_loss_mlp": 1.10843039, + "epoch": 0.054828780300115426, + "flos": 796845883392.0, + "grad_norm": 0.08116128158624439, + "language_loss": 0.93187618, + "learning_rate": 0.0009983857170412615, + "loss": 0.94326949, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.30859375, + "step": 285, + "time_per_iteration": 2.99845027923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151704, + "balance_loss_mlp": 1.12080526, + "epoch": 0.05502116198537899, + "flos": 549414420480.0, + "grad_norm": 0.07339397608587311, + "language_loss": 0.92728812, + "learning_rate": 0.000998360606130798, + "loss": 0.93880516, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.30859375, + "step": 286, + "time_per_iteration": 2.835510492324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.020519, + "balance_loss_mlp": 2.03492451, + "epoch": 0.05521354367064255, + "flos": 1406967791616.0, + "grad_norm": 0.132236598943482, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71125019, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.16992188, + "step": 287, + "time_per_iteration": 4.860529184341431 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144586, + "balance_loss_mlp": 1.11304367, + "epoch": 0.05540592535590612, + "flos": 645123197952.0, + "grad_norm": 0.09086643312306038, + "language_loss": 0.98494267, + "learning_rate": 0.0009983098038774552, + "loss": 0.99638855, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.31518555, + "step": 288, + "time_per_iteration": 2.7743642330169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0188948, + "balance_loss_mlp": 1.87336278, + "epoch": 0.05559830704116968, + "flos": 1510293413376.0, + "grad_norm": 0.09551417356683237, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.80059707, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.16113281, + "step": 289, + "time_per_iteration": 4.792251348495483 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132847, + "balance_loss_mlp": 1.10242462, + "epoch": 0.055790688726433245, + "flos": 508078379520.0, + "grad_norm": 0.0647793178171594, + "language_loss": 0.95675349, + "learning_rate": 0.0009982582277800948, + "loss": 0.96808195, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.30371094, + "step": 290, + "time_per_iteration": 2.6280908584594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129981, + "balance_loss_mlp": 1.09931993, + "epoch": 0.055983070411696804, + "flos": 657570576384.0, + "grad_norm": 0.06216394577533418, + "language_loss": 1.02967191, + "learning_rate": 0.0009982321495648908, + "loss": 1.04097176, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.30639648, + "step": 291, + "time_per_iteration": 2.823817491531372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152465, + "balance_loss_mlp": 1.11880052, + "epoch": 0.05617545209696037, + "flos": 587051919360.0, + "grad_norm": 0.0720353654192766, + "language_loss": 0.94905466, + "learning_rate": 0.0009982058779188115, + "loss": 0.96057928, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.33666992, + "step": 292, + "time_per_iteration": 2.716226577758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143466, + "balance_loss_mlp": 1.11175609, + "epoch": 0.05636783378222393, + "flos": 611331217920.0, + "grad_norm": 0.0752196942414692, + "language_loss": 1.02053797, + "learning_rate": 0.0009981794128520567, + "loss": 1.03197265, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.31689453, + "step": 293, + "time_per_iteration": 2.80366587638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140969, + "balance_loss_mlp": 1.10878265, + "epoch": 0.0565602154674875, + "flos": 667847102976.0, + "grad_norm": 0.08694547176554791, + "language_loss": 0.9927811, + "learning_rate": 0.000998152754374901, + "loss": 1.0041908, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.32202148, + "step": 294, + "time_per_iteration": 2.8787214756011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125522, + "balance_loss_mlp": 1.09493268, + "epoch": 0.05675259715275106, + "flos": 616963474944.0, + "grad_norm": 0.06320951422559969, + "language_loss": 0.95261526, + "learning_rate": 0.0009981259024976943, + "loss": 0.96387053, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.30566406, + "step": 295, + "time_per_iteration": 2.709763765335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130922, + "balance_loss_mlp": 1.1013341, + "epoch": 0.05694497883801462, + "flos": 751424214528.0, + "grad_norm": 0.09363516749561916, + "language_loss": 0.92460728, + "learning_rate": 0.0009980988572308612, + "loss": 0.93591654, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.2956543, + "step": 296, + "time_per_iteration": 2.975036859512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106731, + "balance_loss_mlp": 1.07781124, + "epoch": 0.05713736052327818, + "flos": 711669708288.0, + "grad_norm": 0.09684297288520326, + "language_loss": 0.95852935, + "learning_rate": 0.0009980716185849015, + "loss": 0.96959662, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.28881836, + "step": 297, + "time_per_iteration": 2.9913201332092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121697, + "balance_loss_mlp": 1.09196591, + "epoch": 0.05732974220854175, + "flos": 468737100288.0, + "grad_norm": 0.06404931541311756, + "language_loss": 0.92133576, + "learning_rate": 0.0009980441865703904, + "loss": 0.9325527, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.29711914, + "step": 298, + "time_per_iteration": 2.660911798477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118174, + "balance_loss_mlp": 1.08896804, + "epoch": 0.05752212389380531, + "flos": 601143441408.0, + "grad_norm": 0.07725734784298466, + "language_loss": 1.00405884, + "learning_rate": 0.000998016561197978, + "loss": 1.01524067, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.29150391, + "step": 299, + "time_per_iteration": 2.7028987407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115684, + "balance_loss_mlp": 1.0875026, + "epoch": 0.057714505579068875, + "flos": 678344799744.0, + "grad_norm": 0.0924919324941274, + "language_loss": 0.92369866, + "learning_rate": 0.0009979887424783895, + "loss": 0.93485552, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.28173828, + "step": 300, + "time_per_iteration": 2.920323610305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121145, + "balance_loss_mlp": 1.09222448, + "epoch": 0.057906887264332435, + "flos": 595604316672.0, + "grad_norm": 0.08285851214595771, + "language_loss": 0.91748977, + "learning_rate": 0.0009979607304224248, + "loss": 0.92870122, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.2890625, + "step": 301, + "time_per_iteration": 2.725109815597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124265, + "balance_loss_mlp": 1.09512997, + "epoch": 0.058099268949596, + "flos": 551855904768.0, + "grad_norm": 0.08389393001078431, + "language_loss": 0.98122084, + "learning_rate": 0.000997932525040959, + "loss": 0.99246347, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.29101562, + "step": 302, + "time_per_iteration": 2.6472513675689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102086, + "balance_loss_mlp": 1.07419097, + "epoch": 0.05829165063485956, + "flos": 507906671616.0, + "grad_norm": 0.09664842170862178, + "language_loss": 1.00482607, + "learning_rate": 0.000997904126344943, + "loss": 1.01584697, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.27880859, + "step": 303, + "time_per_iteration": 2.6413466930389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108073, + "balance_loss_mlp": 1.07920086, + "epoch": 0.05848403232012313, + "flos": 614949774336.0, + "grad_norm": 0.07742483031734765, + "language_loss": 0.96304786, + "learning_rate": 0.0009978755343454018, + "loss": 0.9741286, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.28881836, + "step": 304, + "time_per_iteration": 2.7825212478637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108843, + "balance_loss_mlp": 1.0789448, + "epoch": 0.05867641400538669, + "flos": 499835902464.0, + "grad_norm": 0.09214287188489759, + "language_loss": 0.97051907, + "learning_rate": 0.0009978467490534355, + "loss": 0.98160744, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.29858398, + "step": 305, + "time_per_iteration": 2.625734806060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105646, + "balance_loss_mlp": 1.0759151, + "epoch": 0.05886879569065025, + "flos": 531019072512.0, + "grad_norm": 0.07804737007565601, + "language_loss": 0.94819117, + "learning_rate": 0.00099781777048022, + "loss": 0.95924759, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.296875, + "step": 306, + "time_per_iteration": 2.71183180809021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095659, + "balance_loss_mlp": 1.06554723, + "epoch": 0.05906117737591381, + "flos": 488811497472.0, + "grad_norm": 0.08882969665455022, + "language_loss": 0.96051329, + "learning_rate": 0.0009977885986370057, + "loss": 0.97146988, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.30126953, + "step": 307, + "time_per_iteration": 2.551680088043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101228, + "balance_loss_mlp": 1.0711869, + "epoch": 0.05925355906117737, + "flos": 591213150720.0, + "grad_norm": 0.07969081592203556, + "language_loss": 0.92546368, + "learning_rate": 0.000997759233535118, + "loss": 0.93647587, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.30029297, + "step": 308, + "time_per_iteration": 2.81258487701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118732, + "balance_loss_mlp": 1.08861959, + "epoch": 0.05944594074644094, + "flos": 563373522432.0, + "grad_norm": 0.08786467203130244, + "language_loss": 0.97749913, + "learning_rate": 0.0009977296751859576, + "loss": 0.98868644, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.30102539, + "step": 309, + "time_per_iteration": 2.7263362407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105319, + "balance_loss_mlp": 1.07611227, + "epoch": 0.0596383224317045, + "flos": 538483147776.0, + "grad_norm": 0.06446924521708428, + "language_loss": 1.00202072, + "learning_rate": 0.0009976999236009998, + "loss": 1.01307392, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.29174805, + "step": 310, + "time_per_iteration": 2.762798309326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104, + "balance_loss_mlp": 1.0751754, + "epoch": 0.059830704116968066, + "flos": 560684726784.0, + "grad_norm": 0.07707725190270151, + "language_loss": 1.00980616, + "learning_rate": 0.0009976699787917955, + "loss": 1.02084613, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.2878418, + "step": 311, + "time_per_iteration": 2.681075096130371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02018517, + "balance_loss_mlp": 1.99772644, + "epoch": 0.060023085802231625, + "flos": 1569759962112.0, + "grad_norm": 0.13809188064678232, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.75461507, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.20800781, + "step": 312, + "time_per_iteration": 4.931787014007568 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115143, + "balance_loss_mlp": 1.08445871, + "epoch": 0.06021546748749519, + "flos": 482415395328.0, + "grad_norm": 0.08749443672960691, + "language_loss": 0.93570709, + "learning_rate": 0.0009976095095472243, + "loss": 0.94685858, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.30688477, + "step": 313, + "time_per_iteration": 2.5869529247283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101349, + "balance_loss_mlp": 1.07152247, + "epoch": 0.06040784917275875, + "flos": 619889407488.0, + "grad_norm": 0.1052711311589574, + "language_loss": 0.94373065, + "learning_rate": 0.0009975789851353334, + "loss": 0.95474416, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.29785156, + "step": 314, + "time_per_iteration": 2.825021505355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091881, + "balance_loss_mlp": 1.06434321, + "epoch": 0.06060023085802232, + "flos": 483292721664.0, + "grad_norm": 0.0790023799752532, + "language_loss": 0.96930784, + "learning_rate": 0.0009975482675461487, + "loss": 0.98022664, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.27563477, + "step": 315, + "time_per_iteration": 2.657176971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092449, + "balance_loss_mlp": 1.06493592, + "epoch": 0.06079261254328588, + "flos": 581620483584.0, + "grad_norm": 0.08103250083402935, + "language_loss": 0.94523442, + "learning_rate": 0.0009975173567915952, + "loss": 0.95615894, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.27502441, + "step": 316, + "time_per_iteration": 2.7485179901123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087945, + "balance_loss_mlp": 1.06031179, + "epoch": 0.060984994228549444, + "flos": 687492306432.0, + "grad_norm": 0.09749512289660646, + "language_loss": 0.88217789, + "learning_rate": 0.000997486252883674, + "loss": 0.89305735, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.27685547, + "step": 317, + "time_per_iteration": 2.848203420639038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083831, + "balance_loss_mlp": 1.05665123, + "epoch": 0.061177375913813004, + "flos": 1314284327424.0, + "grad_norm": 0.0666962391969605, + "language_loss": 0.94262481, + "learning_rate": 0.0009974549558344602, + "loss": 0.95346314, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.27197266, + "step": 318, + "time_per_iteration": 3.6451311111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095985, + "balance_loss_mlp": 1.06921029, + "epoch": 0.06136975759907657, + "flos": 574072040448.0, + "grad_norm": 0.08376464388690433, + "language_loss": 1.02536392, + "learning_rate": 0.000997423465656105, + "loss": 1.03632367, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.26831055, + "step": 319, + "time_per_iteration": 2.7345075607299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091659, + "balance_loss_mlp": 1.06395483, + "epoch": 0.06156213928434013, + "flos": 527281242624.0, + "grad_norm": 0.0893807265100656, + "language_loss": 1.00347686, + "learning_rate": 0.0009973917823608335, + "loss": 1.01439345, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.27734375, + "step": 320, + "time_per_iteration": 2.59576153755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092889, + "balance_loss_mlp": 1.0656141, + "epoch": 0.061754520969603696, + "flos": 495238123008.0, + "grad_norm": 0.0805868867251315, + "language_loss": 0.95831037, + "learning_rate": 0.0009973599059609462, + "loss": 0.96923929, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.27294922, + "step": 321, + "time_per_iteration": 2.7188515663146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098538, + "balance_loss_mlp": 1.07090497, + "epoch": 0.061946902654867256, + "flos": 439839673344.0, + "grad_norm": 0.07327098118113982, + "language_loss": 0.93067813, + "learning_rate": 0.000997327836468819, + "loss": 0.94166344, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.27685547, + "step": 322, + "time_per_iteration": 2.6020476818084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112868, + "balance_loss_mlp": 1.08469939, + "epoch": 0.06213928434013082, + "flos": 598490961408.0, + "grad_norm": 0.08699924077148347, + "language_loss": 0.95677376, + "learning_rate": 0.000997295573896902, + "loss": 0.96790254, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.28137207, + "step": 323, + "time_per_iteration": 2.829726457595825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01600081, + "balance_loss_mlp": 1.58253336, + "epoch": 0.06233166602539438, + "flos": 1449393716736.0, + "grad_norm": 0.0733345350087818, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.82796121, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.17578125, + "step": 324, + "time_per_iteration": 4.711047410964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01522296, + "balance_loss_mlp": 1.50503409, + "epoch": 0.06252404771065795, + "flos": 1462504453632.0, + "grad_norm": 0.05691363452686859, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80094236, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.17285156, + "step": 325, + "time_per_iteration": 4.9186623096466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221563, + "balance_loss_mlp": 1.19023478, + "epoch": 0.06271642939592151, + "flos": 464059335168.0, + "grad_norm": 0.14041524981394118, + "language_loss": 0.90815508, + "learning_rate": 0.000997197627828043, + "loss": 0.9203707, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.31323242, + "step": 326, + "time_per_iteration": 2.5453081130981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200774, + "balance_loss_mlp": 1.17032802, + "epoch": 0.06290881108118507, + "flos": 532111776768.0, + "grad_norm": 0.12119005069833769, + "language_loss": 0.85965139, + "learning_rate": 0.0009971645930629716, + "loss": 0.87165916, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.30419922, + "step": 327, + "time_per_iteration": 2.7031009197235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169691, + "balance_loss_mlp": 1.13969803, + "epoch": 0.06310119276644863, + "flos": 673262572032.0, + "grad_norm": 0.07816671551275867, + "language_loss": 0.99088198, + "learning_rate": 0.0009971313652814872, + "loss": 1.00257885, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.29956055, + "step": 328, + "time_per_iteration": 2.8222203254699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157161, + "balance_loss_mlp": 1.12542796, + "epoch": 0.0632935744517122, + "flos": 770404497408.0, + "grad_norm": 0.09350719298211221, + "language_loss": 0.96469927, + "learning_rate": 0.0009970979444964903, + "loss": 0.97627091, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.31713867, + "step": 329, + "time_per_iteration": 2.965010643005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142135, + "balance_loss_mlp": 1.11214232, + "epoch": 0.06348595613697576, + "flos": 561649393152.0, + "grad_norm": 0.10929900711039164, + "language_loss": 0.9773742, + "learning_rate": 0.0009970643307209556, + "loss": 0.98879552, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.29980469, + "step": 330, + "time_per_iteration": 2.816967248916626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122149, + "balance_loss_mlp": 1.09279943, + "epoch": 0.06367833782223932, + "flos": 675891730944.0, + "grad_norm": 0.09151857562667157, + "language_loss": 0.94555062, + "learning_rate": 0.0009970305239679334, + "loss": 0.95677209, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.29321289, + "step": 331, + "time_per_iteration": 2.8171606063842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103955, + "balance_loss_mlp": 1.07594109, + "epoch": 0.06387071950750288, + "flos": 495035891712.0, + "grad_norm": 0.0852127129346853, + "language_loss": 0.98894572, + "learning_rate": 0.0009969965242505483, + "loss": 0.99998534, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.28027344, + "step": 332, + "time_per_iteration": 2.663892984390259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110112, + "balance_loss_mlp": 1.08111989, + "epoch": 0.06406310119276645, + "flos": 533170985472.0, + "grad_norm": 0.06505292490812643, + "language_loss": 0.94837928, + "learning_rate": 0.0009969623315820007, + "loss": 0.9594804, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.28979492, + "step": 333, + "time_per_iteration": 2.7053513526916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100319, + "balance_loss_mlp": 1.07256722, + "epoch": 0.06425548287803001, + "flos": 455940513792.0, + "grad_norm": 0.09842187194277592, + "language_loss": 0.95016736, + "learning_rate": 0.000996927945975565, + "loss": 0.96117055, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.27758789, + "step": 334, + "time_per_iteration": 2.599308490753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113066, + "balance_loss_mlp": 1.08405077, + "epoch": 0.06444786456329357, + "flos": 559817574912.0, + "grad_norm": 0.0758688902805758, + "language_loss": 0.9173829, + "learning_rate": 0.0009968933674445906, + "loss": 0.92851353, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.29003906, + "step": 335, + "time_per_iteration": 2.6885735988616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117272, + "balance_loss_mlp": 1.08863783, + "epoch": 0.06464024624855713, + "flos": 665769383424.0, + "grad_norm": 0.08483114639707492, + "language_loss": 0.94787967, + "learning_rate": 0.0009968585960025028, + "loss": 0.95905232, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.28613281, + "step": 336, + "time_per_iteration": 3.0145304203033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01664619, + "balance_loss_mlp": 1.64468718, + "epoch": 0.0648326279338207, + "flos": 1520578704384.0, + "grad_norm": 0.07989076612991787, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.79317814, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.19921875, + "step": 337, + "time_per_iteration": 4.812415361404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113857, + "balance_loss_mlp": 1.08729684, + "epoch": 0.06502500961908426, + "flos": 1142872768512.0, + "grad_norm": 0.10710041073234706, + "language_loss": 0.93311036, + "learning_rate": 0.0009967884744390583, + "loss": 0.94424891, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.26611328, + "step": 338, + "time_per_iteration": 3.551198959350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010994, + "balance_loss_mlp": 1.07226825, + "epoch": 0.06521739130434782, + "flos": 582339248640.0, + "grad_norm": 0.09192445713744875, + "language_loss": 0.93620086, + "learning_rate": 0.0009967531243449256, + "loss": 0.94719481, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.27148438, + "step": 339, + "time_per_iteration": 2.659802198410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093825, + "balance_loss_mlp": 1.06592965, + "epoch": 0.06540977298961138, + "flos": 497398800384.0, + "grad_norm": 0.08159898153834201, + "language_loss": 1.01212323, + "learning_rate": 0.000996717581394126, + "loss": 1.02306151, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.27905273, + "step": 340, + "time_per_iteration": 2.570789337158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085985, + "balance_loss_mlp": 1.05887651, + "epoch": 0.06560215467487496, + "flos": 542613855744.0, + "grad_norm": 0.08632134404445381, + "language_loss": 1.01338696, + "learning_rate": 0.000996681845600459, + "loss": 1.02424693, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.27124023, + "step": 341, + "time_per_iteration": 2.676576852798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092801, + "balance_loss_mlp": 1.06526327, + "epoch": 0.06579453636013852, + "flos": 413230961664.0, + "grad_norm": 0.09337377055156564, + "language_loss": 0.93410671, + "learning_rate": 0.0009966459169777982, + "loss": 0.94503474, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.27563477, + "step": 342, + "time_per_iteration": 2.5015692710876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093959, + "balance_loss_mlp": 1.06565928, + "epoch": 0.06598691804540208, + "flos": 560354457600.0, + "grad_norm": 0.06741983677161045, + "language_loss": 1.02151966, + "learning_rate": 0.0009966097955400924, + "loss": 1.03245926, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.28320312, + "step": 343, + "time_per_iteration": 2.679197311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108203, + "balance_loss_mlp": 1.054111, + "epoch": 0.06617929973066564, + "flos": 571789117440.0, + "grad_norm": 0.10243167176705169, + "language_loss": 0.95901835, + "learning_rate": 0.0009965734813013652, + "loss": 0.96983862, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.27954102, + "step": 344, + "time_per_iteration": 2.7926278114318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094237, + "balance_loss_mlp": 1.06638968, + "epoch": 0.06637168141592921, + "flos": 490234470912.0, + "grad_norm": 0.07573309355987462, + "language_loss": 0.97904384, + "learning_rate": 0.0009965369742757151, + "loss": 0.98998624, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.27856445, + "step": 345, + "time_per_iteration": 2.5709216594696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094661, + "balance_loss_mlp": 1.06564522, + "epoch": 0.06656406310119277, + "flos": 1078735656960.0, + "grad_norm": 0.07452264052062355, + "language_loss": 0.94766545, + "learning_rate": 0.0009965002744773152, + "loss": 0.95861208, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.28979492, + "step": 346, + "time_per_iteration": 3.500114679336548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110333, + "balance_loss_mlp": 1.0740993, + "epoch": 0.06675644478645633, + "flos": 513421065216.0, + "grad_norm": 0.06770544307121987, + "language_loss": 0.92343372, + "learning_rate": 0.0009964633819204139, + "loss": 0.93446708, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.29223633, + "step": 347, + "time_per_iteration": 2.660534143447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01495519, + "balance_loss_mlp": 1.47739971, + "epoch": 0.06694882647171989, + "flos": 1446359943168.0, + "grad_norm": 0.07316018638585145, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83296633, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.18164062, + "step": 348, + "time_per_iteration": 4.936125040054321 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01453408, + "balance_loss_mlp": 1.43557465, + "epoch": 0.06714120815698346, + "flos": 1551230784000.0, + "grad_norm": 0.05966333264944154, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76607287, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.17871094, + "step": 349, + "time_per_iteration": 4.916368722915649 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121413, + "balance_loss_mlp": 1.09161115, + "epoch": 0.06733358984224702, + "flos": 879689673216.0, + "grad_norm": 0.09818918049538049, + "language_loss": 0.91932184, + "learning_rate": 0.000996351547842304, + "loss": 0.93053597, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.29760742, + "step": 350, + "time_per_iteration": 3.1482698917388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116118, + "balance_loss_mlp": 1.08686399, + "epoch": 0.06752597152751058, + "flos": 518654651904.0, + "grad_norm": 0.08574695638310478, + "language_loss": 0.9006294, + "learning_rate": 0.0009963138843953744, + "loss": 0.91179061, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.29223633, + "step": 351, + "time_per_iteration": 2.6286985874176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112662, + "balance_loss_mlp": 1.09572136, + "epoch": 0.06771835321277414, + "flos": 539366266368.0, + "grad_norm": 0.062103550545623463, + "language_loss": 0.94588864, + "learning_rate": 0.000996276028262306, + "loss": 0.95715487, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.30859375, + "step": 352, + "time_per_iteration": 2.8076047897338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118319, + "balance_loss_mlp": 1.08899331, + "epoch": 0.0679107348980377, + "flos": 460430604288.0, + "grad_norm": 0.08848881047736162, + "language_loss": 1.00543904, + "learning_rate": 0.0009962379794577964, + "loss": 1.01662219, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.29296875, + "step": 353, + "time_per_iteration": 2.6257643699645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126251, + "balance_loss_mlp": 1.09525669, + "epoch": 0.06810311658330127, + "flos": 635601752064.0, + "grad_norm": 0.07023516682391727, + "language_loss": 0.91387081, + "learning_rate": 0.000996199737996617, + "loss": 0.92513329, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.31005859, + "step": 354, + "time_per_iteration": 2.9115777015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108023, + "balance_loss_mlp": 1.07862616, + "epoch": 0.06829549826856483, + "flos": 464443448832.0, + "grad_norm": 0.10590106261560671, + "language_loss": 0.99111325, + "learning_rate": 0.0009961613038936149, + "loss": 1.00219345, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.29345703, + "step": 355, + "time_per_iteration": 2.632269859313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106903, + "balance_loss_mlp": 1.07848334, + "epoch": 0.06848787995382839, + "flos": 634335929856.0, + "grad_norm": 0.06351615461114794, + "language_loss": 0.92452097, + "learning_rate": 0.000996122677163711, + "loss": 0.93559003, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.28417969, + "step": 356, + "time_per_iteration": 2.8401455879211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116364, + "balance_loss_mlp": 1.08880246, + "epoch": 0.06868026163909195, + "flos": 806023913472.0, + "grad_norm": 0.08494375059258584, + "language_loss": 0.98204505, + "learning_rate": 0.000996083857821902, + "loss": 0.99320877, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.27612305, + "step": 357, + "time_per_iteration": 3.0309877395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123871, + "balance_loss_mlp": 1.09387815, + "epoch": 0.06887264332435553, + "flos": 438997252608.0, + "grad_norm": 0.09643576242322613, + "language_loss": 0.95811963, + "learning_rate": 0.0009960448458832588, + "loss": 0.96935833, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.30004883, + "step": 358, + "time_per_iteration": 2.6974990367889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119311, + "balance_loss_mlp": 1.09053433, + "epoch": 0.06906502500961909, + "flos": 484513463808.0, + "grad_norm": 0.08018524599206517, + "language_loss": 0.95721531, + "learning_rate": 0.000996005641362927, + "loss": 0.96840835, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.28735352, + "step": 359, + "time_per_iteration": 2.589519739151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125244, + "balance_loss_mlp": 1.09663391, + "epoch": 0.06925740669488265, + "flos": 733293706752.0, + "grad_norm": 0.08939873306910956, + "language_loss": 0.98375708, + "learning_rate": 0.0009959662442761274, + "loss": 0.99500948, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.28613281, + "step": 360, + "time_per_iteration": 2.9202845096588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121734, + "balance_loss_mlp": 1.09360027, + "epoch": 0.0694497883801462, + "flos": 552127947264.0, + "grad_norm": 0.08129648248307358, + "language_loss": 0.92418718, + "learning_rate": 0.000995926654638155, + "loss": 0.93540448, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.28149414, + "step": 361, + "time_per_iteration": 2.807333469390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125246, + "balance_loss_mlp": 1.09706521, + "epoch": 0.06964217006540978, + "flos": 677708992512.0, + "grad_norm": 0.09207283388165423, + "language_loss": 0.94086993, + "learning_rate": 0.00099588687246438, + "loss": 0.95212233, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.28222656, + "step": 362, + "time_per_iteration": 2.841938018798828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144139, + "balance_loss_mlp": 1.1155293, + "epoch": 0.06983455175067334, + "flos": 523987163136.0, + "grad_norm": 0.09456174795196681, + "language_loss": 1.01274741, + "learning_rate": 0.0009958468977702471, + "loss": 1.02418876, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.28588867, + "step": 363, + "time_per_iteration": 2.633852958679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01648964, + "balance_loss_mlp": 1.62617075, + "epoch": 0.0700269334359369, + "flos": 1575943658496.0, + "grad_norm": 0.13616610145697036, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81383669, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.22753906, + "step": 364, + "time_per_iteration": 4.863068580627441 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011272, + "balance_loss_mlp": 1.09961534, + "epoch": 0.07021931512120046, + "flos": 1012848274944.0, + "grad_norm": 0.09005148424800312, + "language_loss": 0.90165555, + "learning_rate": 0.0009957663708830612, + "loss": 0.91292757, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.27612305, + "step": 365, + "time_per_iteration": 3.281414031982422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123586, + "balance_loss_mlp": 1.09442711, + "epoch": 0.07041169680646403, + "flos": 822622348800.0, + "grad_norm": 0.09334468540758137, + "language_loss": 0.91653895, + "learning_rate": 0.0009957258187212714, + "loss": 0.92777479, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.29174805, + "step": 366, + "time_per_iteration": 3.038696050643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01445219, + "balance_loss_mlp": 1.42652738, + "epoch": 0.07060407849172759, + "flos": 1413670993920.0, + "grad_norm": 0.06427367616648676, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80640084, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.18652344, + "step": 367, + "time_per_iteration": 4.7983925342559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116044, + "balance_loss_mlp": 1.08788657, + "epoch": 0.07079646017699115, + "flos": 512652837888.0, + "grad_norm": 0.13146714334583684, + "language_loss": 0.89768213, + "learning_rate": 0.0009956441370400167, + "loss": 0.90884256, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.28173828, + "step": 368, + "time_per_iteration": 2.6321308612823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119626, + "balance_loss_mlp": 1.09201741, + "epoch": 0.07098884186225471, + "flos": 540240772608.0, + "grad_norm": 0.12272393932614807, + "language_loss": 0.9541142, + "learning_rate": 0.0009956030075522636, + "loss": 0.96531045, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.27636719, + "step": 369, + "time_per_iteration": 2.772404909133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114007, + "balance_loss_mlp": 1.08685124, + "epoch": 0.07118122354751828, + "flos": 548419230720.0, + "grad_norm": 0.09366652552108264, + "language_loss": 0.95805156, + "learning_rate": 0.0009955616856543587, + "loss": 0.96919167, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.2722168, + "step": 370, + "time_per_iteration": 2.628877878189087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113669, + "balance_loss_mlp": 1.08608413, + "epoch": 0.07137360523278184, + "flos": 620612554752.0, + "grad_norm": 0.08609469252939483, + "language_loss": 0.88399851, + "learning_rate": 0.0009955201713623448, + "loss": 0.89513522, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.27612305, + "step": 371, + "time_per_iteration": 2.7591450214385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328242, + "balance_loss_mlp": 1.31155288, + "epoch": 0.0715659869180454, + "flos": 1501850115072.0, + "grad_norm": 0.05190160953718325, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78000963, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.16699219, + "step": 372, + "time_per_iteration": 4.995140552520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101786, + "balance_loss_mlp": 1.07563186, + "epoch": 0.07175836860330896, + "flos": 495246887424.0, + "grad_norm": 0.13457072532657127, + "language_loss": 1.02136469, + "learning_rate": 0.0009954365656605333, + "loss": 1.03238261, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.26184082, + "step": 373, + "time_per_iteration": 2.56646990776062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106565, + "balance_loss_mlp": 1.07979035, + "epoch": 0.07195075028857253, + "flos": 785387902464.0, + "grad_norm": 0.08663326270818063, + "language_loss": 0.94899744, + "learning_rate": 0.0009953944742831947, + "loss": 0.96006304, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.26831055, + "step": 374, + "time_per_iteration": 2.9695053100585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102573, + "balance_loss_mlp": 1.07596529, + "epoch": 0.0721431319738361, + "flos": 592799067648.0, + "grad_norm": 0.09289035836035217, + "language_loss": 0.97933537, + "learning_rate": 0.0009953521905766642, + "loss": 0.99036103, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.26647949, + "step": 375, + "time_per_iteration": 2.942178249359131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113342, + "balance_loss_mlp": 1.08630502, + "epoch": 0.07233551365909965, + "flos": 547981272576.0, + "grad_norm": 0.10463311528366259, + "language_loss": 0.97135454, + "learning_rate": 0.0009953097145573577, + "loss": 0.98248798, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.27075195, + "step": 376, + "time_per_iteration": 2.6447842121124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115303, + "balance_loss_mlp": 1.08645439, + "epoch": 0.07252789534436321, + "flos": 957170428416.0, + "grad_norm": 0.10778381820568583, + "language_loss": 0.93408906, + "learning_rate": 0.000995267046241766, + "loss": 0.94524205, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.28808594, + "step": 377, + "time_per_iteration": 3.281200647354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106472, + "balance_loss_mlp": 1.07807684, + "epoch": 0.07272027702962677, + "flos": 507398902272.0, + "grad_norm": 0.08395054735439604, + "language_loss": 0.93929148, + "learning_rate": 0.0009952241856464547, + "loss": 0.95035625, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.28393555, + "step": 378, + "time_per_iteration": 2.6047444343566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011332, + "balance_loss_mlp": 1.10265875, + "epoch": 0.07291265871489035, + "flos": 612128558592.0, + "grad_norm": 0.10390894184481733, + "language_loss": 0.9941417, + "learning_rate": 0.0009951811327880632, + "loss": 1.00547373, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.30541992, + "step": 379, + "time_per_iteration": 2.726473331451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142545, + "balance_loss_mlp": 1.11162257, + "epoch": 0.0731050404001539, + "flos": 495502963200.0, + "grad_norm": 0.10097597522795056, + "language_loss": 0.93640876, + "learning_rate": 0.0009951378876833063, + "loss": 0.94783425, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.30908203, + "step": 380, + "time_per_iteration": 2.5623717308044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113686, + "balance_loss_mlp": 1.10598469, + "epoch": 0.07329742208541747, + "flos": 639677205504.0, + "grad_norm": 0.09709945532148136, + "language_loss": 1.0008266, + "learning_rate": 0.0009950944503489736, + "loss": 1.01219511, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.30834961, + "step": 381, + "time_per_iteration": 2.7424862384796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125905, + "balance_loss_mlp": 1.0951966, + "epoch": 0.07348980377068103, + "flos": 815999284224.0, + "grad_norm": 0.08729931882910318, + "language_loss": 0.94688666, + "learning_rate": 0.0009950508208019285, + "loss": 0.95814574, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.30664062, + "step": 382, + "time_per_iteration": 3.011807441711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115566, + "balance_loss_mlp": 1.08612156, + "epoch": 0.0736821854559446, + "flos": 508383917568.0, + "grad_norm": 0.09192641530722392, + "language_loss": 0.98937929, + "learning_rate": 0.0009950069990591096, + "loss": 1.00053501, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.29418945, + "step": 383, + "time_per_iteration": 2.634744644165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266456, + "balance_loss_mlp": 1.25081599, + "epoch": 0.07387456714120816, + "flos": 1553801716224.0, + "grad_norm": 0.07157218635827683, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77667826, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.15625, + "step": 384, + "time_per_iteration": 4.909826993942261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123479, + "balance_loss_mlp": 1.093009, + "epoch": 0.07406694882647172, + "flos": 525219489792.0, + "grad_norm": 0.09152581134979716, + "language_loss": 0.9216727, + "learning_rate": 0.0009949187790542777, + "loss": 0.93290746, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.30419922, + "step": 385, + "time_per_iteration": 2.7053511142730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126669, + "balance_loss_mlp": 1.09605598, + "epoch": 0.07425933051173528, + "flos": 497468611584.0, + "grad_norm": 0.0847962235917395, + "language_loss": 0.87653643, + "learning_rate": 0.0009948743808265148, + "loss": 0.88780314, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.30566406, + "step": 386, + "time_per_iteration": 2.678089141845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138501, + "balance_loss_mlp": 1.10865068, + "epoch": 0.07445171219699885, + "flos": 504740630016.0, + "grad_norm": 0.08492617281736899, + "language_loss": 0.97336739, + "learning_rate": 0.0009948297904714782, + "loss": 0.98475236, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.29833984, + "step": 387, + "time_per_iteration": 2.7185778617858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146366, + "balance_loss_mlp": 1.11620593, + "epoch": 0.07464409388226241, + "flos": 553693515264.0, + "grad_norm": 0.07151378861674496, + "language_loss": 0.90523744, + "learning_rate": 0.0009947850080064796, + "loss": 0.91670114, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.30151367, + "step": 388, + "time_per_iteration": 2.799166202545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158036, + "balance_loss_mlp": 1.12689841, + "epoch": 0.07483647556752597, + "flos": 776511028224.0, + "grad_norm": 0.11664332596196766, + "language_loss": 0.94951898, + "learning_rate": 0.0009947400334489047, + "loss": 0.96109939, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.31103516, + "step": 389, + "time_per_iteration": 3.0231211185455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146122, + "balance_loss_mlp": 1.11646235, + "epoch": 0.07502885725278953, + "flos": 612256596480.0, + "grad_norm": 0.09913116245985863, + "language_loss": 0.85822582, + "learning_rate": 0.0009946948668162145, + "loss": 0.86968708, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.29638672, + "step": 390, + "time_per_iteration": 2.8080904483795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129912, + "balance_loss_mlp": 1.09910846, + "epoch": 0.0752212389380531, + "flos": 688324552704.0, + "grad_norm": 0.1060751216039937, + "language_loss": 0.91006148, + "learning_rate": 0.0009946495081259441, + "loss": 0.92136061, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.30786133, + "step": 391, + "time_per_iteration": 2.853335380554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125709, + "balance_loss_mlp": 1.09528649, + "epoch": 0.07541362062331666, + "flos": 765362967552.0, + "grad_norm": 0.10996734320487103, + "language_loss": 0.93701887, + "learning_rate": 0.0009946039573957035, + "loss": 0.94827592, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.30371094, + "step": 392, + "time_per_iteration": 2.926420211791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107845, + "balance_loss_mlp": 1.07887673, + "epoch": 0.07560600230858022, + "flos": 588460336128.0, + "grad_norm": 0.10253812696642157, + "language_loss": 0.91059798, + "learning_rate": 0.000994558214643177, + "loss": 0.92167646, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.28979492, + "step": 393, + "time_per_iteration": 2.783536434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103583, + "balance_loss_mlp": 1.07344699, + "epoch": 0.07579838399384378, + "flos": 749508028416.0, + "grad_norm": 0.08274248346409746, + "language_loss": 0.91916323, + "learning_rate": 0.000994512279886123, + "loss": 0.93019903, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.30078125, + "step": 394, + "time_per_iteration": 3.0799474716186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099135, + "balance_loss_mlp": 1.06902301, + "epoch": 0.07599076567910736, + "flos": 523185440256.0, + "grad_norm": 0.06927054930208885, + "language_loss": 0.93251747, + "learning_rate": 0.0009944661531423758, + "loss": 0.9435088, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.30078125, + "step": 395, + "time_per_iteration": 2.6641883850097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103492, + "balance_loss_mlp": 1.07383251, + "epoch": 0.07618314736437092, + "flos": 550812662784.0, + "grad_norm": 0.09904896099194287, + "language_loss": 0.91404933, + "learning_rate": 0.000994419834429843, + "loss": 0.92508423, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.29638672, + "step": 396, + "time_per_iteration": 2.661850690841675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114049, + "balance_loss_mlp": 1.08257747, + "epoch": 0.07637552904963447, + "flos": 697901253120.0, + "grad_norm": 0.10979610845710805, + "language_loss": 0.93416023, + "learning_rate": 0.0009943733237665069, + "loss": 0.94530076, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.31445312, + "step": 397, + "time_per_iteration": 2.854339361190796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111663, + "balance_loss_mlp": 1.08561158, + "epoch": 0.07656791073489803, + "flos": 579066928128.0, + "grad_norm": 0.07380051857889673, + "language_loss": 0.9521122, + "learning_rate": 0.0009943266211704248, + "loss": 0.96327847, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.30981445, + "step": 398, + "time_per_iteration": 2.958059787750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110413, + "balance_loss_mlp": 1.0786798, + "epoch": 0.0767602924201616, + "flos": 416923711488.0, + "grad_norm": 0.09100164928673704, + "language_loss": 0.97291386, + "learning_rate": 0.000994279726659728, + "loss": 0.98401797, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.31713867, + "step": 399, + "time_per_iteration": 2.5242953300476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128703, + "balance_loss_mlp": 1.09413218, + "epoch": 0.07695267410542517, + "flos": 482671471104.0, + "grad_norm": 0.09258616119375639, + "language_loss": 0.92782032, + "learning_rate": 0.0009942326402526231, + "loss": 0.93910736, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.34594727, + "step": 400, + "time_per_iteration": 2.5207695960998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143337, + "balance_loss_mlp": 1.10955346, + "epoch": 0.07714505579068873, + "flos": 530742647808.0, + "grad_norm": 0.07710774358121592, + "language_loss": 0.92332727, + "learning_rate": 0.0009941853619673902, + "loss": 0.93476063, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.33789062, + "step": 401, + "time_per_iteration": 2.6304752826690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142379, + "balance_loss_mlp": 1.10947704, + "epoch": 0.07733743747595229, + "flos": 804635845632.0, + "grad_norm": 0.09709488616354546, + "language_loss": 0.95104444, + "learning_rate": 0.0009941378918223844, + "loss": 0.96246827, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.32885742, + "step": 402, + "time_per_iteration": 3.0903730392456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136269, + "balance_loss_mlp": 1.10186553, + "epoch": 0.07752981916121585, + "flos": 622192679424.0, + "grad_norm": 0.09176808059924663, + "language_loss": 0.88839906, + "learning_rate": 0.0009940902298360354, + "loss": 0.89976174, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.34423828, + "step": 403, + "time_per_iteration": 2.7252347469329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129357, + "balance_loss_mlp": 1.09478593, + "epoch": 0.07772220084647942, + "flos": 727961195520.0, + "grad_norm": 0.08094022735558755, + "language_loss": 0.96807957, + "learning_rate": 0.0009940423760268473, + "loss": 0.9793731, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.34619141, + "step": 404, + "time_per_iteration": 2.912560224533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136255, + "balance_loss_mlp": 1.0998956, + "epoch": 0.07791458253174298, + "flos": 555149984256.0, + "grad_norm": 0.1131644160055788, + "language_loss": 0.90535253, + "learning_rate": 0.0009939943304133982, + "loss": 0.91671515, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.36352539, + "step": 405, + "time_per_iteration": 2.691524028778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128492, + "balance_loss_mlp": 1.09301567, + "epoch": 0.07810696421700654, + "flos": 552919495680.0, + "grad_norm": 0.0877419108538044, + "language_loss": 0.97356665, + "learning_rate": 0.0009939460930143416, + "loss": 0.9848516, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.35522461, + "step": 406, + "time_per_iteration": 2.624150276184082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130223, + "balance_loss_mlp": 1.09484172, + "epoch": 0.0782993459022701, + "flos": 650323289088.0, + "grad_norm": 0.0945833964014614, + "language_loss": 0.92588282, + "learning_rate": 0.0009938976638484043, + "loss": 0.93718511, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.35400391, + "step": 407, + "time_per_iteration": 2.943443775177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132372, + "balance_loss_mlp": 1.09625125, + "epoch": 0.07849172758753367, + "flos": 495926364672.0, + "grad_norm": 0.11302097827133319, + "language_loss": 0.90334702, + "learning_rate": 0.0009938490429343887, + "loss": 0.91467071, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.36157227, + "step": 408, + "time_per_iteration": 2.5614538192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156856, + "balance_loss_mlp": 1.11870956, + "epoch": 0.07868410927279723, + "flos": 577696389120.0, + "grad_norm": 0.08706398753077066, + "language_loss": 0.9151262, + "learning_rate": 0.0009938002302911709, + "loss": 0.92669487, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.38134766, + "step": 409, + "time_per_iteration": 2.7606911659240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185423, + "balance_loss_mlp": 1.14758611, + "epoch": 0.07887649095806079, + "flos": 522698019840.0, + "grad_norm": 0.11763043112663725, + "language_loss": 0.93195748, + "learning_rate": 0.0009937512259377015, + "loss": 0.94381177, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.37841797, + "step": 410, + "time_per_iteration": 2.664318323135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187972, + "balance_loss_mlp": 1.15199518, + "epoch": 0.07906887264332435, + "flos": 556958481408.0, + "grad_norm": 0.10450629225071802, + "language_loss": 0.93972069, + "learning_rate": 0.000993702029893006, + "loss": 0.95160043, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.359375, + "step": 411, + "time_per_iteration": 2.78944730758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182604, + "balance_loss_mlp": 1.14679348, + "epoch": 0.07926125432858792, + "flos": 821641715712.0, + "grad_norm": 0.0999267349206771, + "language_loss": 0.93036819, + "learning_rate": 0.0009936526421761838, + "loss": 0.94219422, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.3581543, + "step": 412, + "time_per_iteration": 3.070317268371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138713, + "balance_loss_mlp": 1.1031884, + "epoch": 0.07945363601385148, + "flos": 562072794624.0, + "grad_norm": 0.103699157973277, + "language_loss": 0.95454085, + "learning_rate": 0.000993603062806409, + "loss": 0.96592796, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.35546875, + "step": 413, + "time_per_iteration": 2.6778509616851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111475, + "balance_loss_mlp": 1.080966, + "epoch": 0.07964601769911504, + "flos": 517615792128.0, + "grad_norm": 0.1031900517026183, + "language_loss": 0.96687901, + "learning_rate": 0.0009935532918029298, + "loss": 0.97802651, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.33813477, + "step": 414, + "time_per_iteration": 2.598691701889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115143, + "balance_loss_mlp": 1.08016729, + "epoch": 0.0798383993843786, + "flos": 538956011520.0, + "grad_norm": 0.10374121868926973, + "language_loss": 0.91896659, + "learning_rate": 0.0009935033291850694, + "loss": 0.93011802, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.35009766, + "step": 415, + "time_per_iteration": 2.6626100540161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136571, + "balance_loss_mlp": 1.10121322, + "epoch": 0.08003078106964218, + "flos": 484901959680.0, + "grad_norm": 0.1007950470797911, + "language_loss": 0.94399852, + "learning_rate": 0.0009934531749722247, + "loss": 0.95536423, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.35351562, + "step": 416, + "time_per_iteration": 2.6062543392181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161455, + "balance_loss_mlp": 1.12671685, + "epoch": 0.08022316275490574, + "flos": 517999905792.0, + "grad_norm": 0.14193661609984684, + "language_loss": 0.91743952, + "learning_rate": 0.0009934028291838672, + "loss": 0.92905408, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.34790039, + "step": 417, + "time_per_iteration": 2.7159759998321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170119, + "balance_loss_mlp": 1.134166, + "epoch": 0.0804155444401693, + "flos": 493755512832.0, + "grad_norm": 0.12060272101738621, + "language_loss": 0.87969685, + "learning_rate": 0.0009933522918395433, + "loss": 0.89139807, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.35961914, + "step": 418, + "time_per_iteration": 2.6525259017944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288605, + "balance_loss_mlp": 1.26361907, + "epoch": 0.08060792612543285, + "flos": 1580567579136.0, + "grad_norm": 0.05680606480361405, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79539704, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.24902344, + "step": 419, + "time_per_iteration": 4.8565216064453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147061, + "balance_loss_mlp": 1.11074984, + "epoch": 0.08080030781069643, + "flos": 525090041856.0, + "grad_norm": 0.12828879348175987, + "language_loss": 1.03302395, + "learning_rate": 0.000993250642561551, + "loss": 1.04449451, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.36279297, + "step": 420, + "time_per_iteration": 2.6118712425231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139372, + "balance_loss_mlp": 1.10284615, + "epoch": 0.08099268949595999, + "flos": 546459374592.0, + "grad_norm": 0.09279765906948532, + "language_loss": 0.90646845, + "learning_rate": 0.0009931995306673466, + "loss": 0.91786218, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.36499023, + "step": 421, + "time_per_iteration": 2.7097063064575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137947, + "balance_loss_mlp": 1.10170722, + "epoch": 0.08118507118122355, + "flos": 510116811264.0, + "grad_norm": 0.12264346802799699, + "language_loss": 0.9584164, + "learning_rate": 0.000993148227296103, + "loss": 0.96979594, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.36254883, + "step": 422, + "time_per_iteration": 2.6224865913391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112855, + "balance_loss_mlp": 1.093431, + "epoch": 0.08137745286648711, + "flos": 720339969024.0, + "grad_norm": 0.09272021371299098, + "language_loss": 0.85445499, + "learning_rate": 0.000993096732467738, + "loss": 0.86574042, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.3515625, + "step": 423, + "time_per_iteration": 2.9733965396881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140481, + "balance_loss_mlp": 1.10407472, + "epoch": 0.08156983455175067, + "flos": 679313848320.0, + "grad_norm": 0.12206645659912072, + "language_loss": 0.90398526, + "learning_rate": 0.0009930450462022435, + "loss": 0.91539013, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.36376953, + "step": 424, + "time_per_iteration": 2.8079323768615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300116, + "balance_loss_mlp": 1.2751298, + "epoch": 0.08176221623701424, + "flos": 1452577135104.0, + "grad_norm": 0.07506497844528874, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80489922, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.24902344, + "step": 425, + "time_per_iteration": 4.905512809753418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121946, + "balance_loss_mlp": 1.08668423, + "epoch": 0.0819545979222778, + "flos": 1556034071040.0, + "grad_norm": 0.10499242287280508, + "language_loss": 0.89529157, + "learning_rate": 0.0009929410994402065, + "loss": 0.90651101, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.35327148, + "step": 426, + "time_per_iteration": 3.7398970127105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141941, + "balance_loss_mlp": 1.1045804, + "epoch": 0.08214697960754136, + "flos": 512456398848.0, + "grad_norm": 0.10023640482449404, + "language_loss": 0.93921095, + "learning_rate": 0.0009928888389840196, + "loss": 0.95063031, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.3737793, + "step": 427, + "time_per_iteration": 2.71114182472229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119992, + "balance_loss_mlp": 1.08430111, + "epoch": 0.08233936129280492, + "flos": 594850646016.0, + "grad_norm": 0.11276239209208863, + "language_loss": 0.96473306, + "learning_rate": 0.0009928363871714147, + "loss": 0.97593296, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.35742188, + "step": 428, + "time_per_iteration": 2.719052314758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118917, + "balance_loss_mlp": 1.0826056, + "epoch": 0.08253174297806849, + "flos": 571758594048.0, + "grad_norm": 0.08720961611908505, + "language_loss": 0.91275012, + "learning_rate": 0.0009927837440227556, + "loss": 0.92393929, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.36303711, + "step": 429, + "time_per_iteration": 2.854044198989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098875, + "balance_loss_mlp": 1.06418514, + "epoch": 0.08272412466333205, + "flos": 623065623552.0, + "grad_norm": 0.07075242488451733, + "language_loss": 0.87952864, + "learning_rate": 0.0009927309095584798, + "loss": 0.89051735, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.34692383, + "step": 430, + "time_per_iteration": 2.9898674488067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102101, + "balance_loss_mlp": 1.06748247, + "epoch": 0.08291650634859561, + "flos": 513745542144.0, + "grad_norm": 0.11797379038125863, + "language_loss": 0.97102249, + "learning_rate": 0.0009926778837991, + "loss": 0.9820435, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.34643555, + "step": 431, + "time_per_iteration": 2.577531099319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111279, + "balance_loss_mlp": 1.07582581, + "epoch": 0.08310888803385917, + "flos": 667073083392.0, + "grad_norm": 0.09137951270996447, + "language_loss": 0.95161557, + "learning_rate": 0.000992624666765202, + "loss": 0.96272832, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.35498047, + "step": 432, + "time_per_iteration": 2.841384172439575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141665, + "balance_loss_mlp": 1.10540199, + "epoch": 0.08330126971912274, + "flos": 582995404800.0, + "grad_norm": 0.1226792169188856, + "language_loss": 0.92907685, + "learning_rate": 0.000992571258477447, + "loss": 0.94049346, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.36279297, + "step": 433, + "time_per_iteration": 2.810683250427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131869, + "balance_loss_mlp": 1.0957005, + "epoch": 0.0834936514043863, + "flos": 561064458240.0, + "grad_norm": 0.09107414958413955, + "language_loss": 0.88094407, + "learning_rate": 0.0009925176589565695, + "loss": 0.8922627, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.36206055, + "step": 434, + "time_per_iteration": 2.7925446033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112767, + "balance_loss_mlp": 1.09081006, + "epoch": 0.08368603308964986, + "flos": 494272046592.0, + "grad_norm": 0.12869710653201102, + "language_loss": 0.96048987, + "learning_rate": 0.0009924638682233791, + "loss": 0.97176659, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.36865234, + "step": 435, + "time_per_iteration": 2.578301191329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293384, + "balance_loss_mlp": 1.26963747, + "epoch": 0.08387841477491342, + "flos": 1388322312192.0, + "grad_norm": 0.05787730041443156, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80857974, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.23730469, + "step": 436, + "time_per_iteration": 4.577009201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105613, + "balance_loss_mlp": 1.07092249, + "epoch": 0.084070796460177, + "flos": 798642796032.0, + "grad_norm": 0.09893423016048233, + "language_loss": 0.86262441, + "learning_rate": 0.0009923557132036668, + "loss": 0.87368047, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.34716797, + "step": 437, + "time_per_iteration": 3.0512332916259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111937, + "balance_loss_mlp": 1.07641208, + "epoch": 0.08426317814544056, + "flos": 558681200640.0, + "grad_norm": 0.08022134137003532, + "language_loss": 0.92201281, + "learning_rate": 0.0009923013489591345, + "loss": 0.93313217, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.35571289, + "step": 438, + "time_per_iteration": 2.74950909614563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101532, + "balance_loss_mlp": 1.06724763, + "epoch": 0.08445555983070412, + "flos": 810057106944.0, + "grad_norm": 0.100162941065544, + "language_loss": 0.90520388, + "learning_rate": 0.0009922467935862681, + "loss": 0.91621923, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.34326172, + "step": 439, + "time_per_iteration": 3.0904464721679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117546, + "balance_loss_mlp": 1.08307123, + "epoch": 0.08464794151596768, + "flos": 509939311104.0, + "grad_norm": 0.0868598025723284, + "language_loss": 0.93269211, + "learning_rate": 0.0009921920471062478, + "loss": 0.94386756, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.34521484, + "step": 440, + "time_per_iteration": 2.5794718265533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129423, + "balance_loss_mlp": 1.09458995, + "epoch": 0.08484032320123125, + "flos": 556149556224.0, + "grad_norm": 0.08760481485615552, + "language_loss": 0.90004873, + "learning_rate": 0.0009921371095403281, + "loss": 0.91134298, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.34863281, + "step": 441, + "time_per_iteration": 2.6602251529693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146723, + "balance_loss_mlp": 1.11010158, + "epoch": 0.08503270488649481, + "flos": 527103742464.0, + "grad_norm": 0.0774335957746243, + "language_loss": 0.93349928, + "learning_rate": 0.0009920819809098379, + "loss": 0.9449665, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.3659668, + "step": 442, + "time_per_iteration": 2.601776123046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154219, + "balance_loss_mlp": 1.11693072, + "epoch": 0.08522508657175837, + "flos": 613989490176.0, + "grad_norm": 0.07362842569129122, + "language_loss": 0.88841242, + "learning_rate": 0.0009920266612361798, + "loss": 0.89995468, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.37255859, + "step": 443, + "time_per_iteration": 2.730400800704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134029, + "balance_loss_mlp": 1.09712195, + "epoch": 0.08541746825702193, + "flos": 619495119360.0, + "grad_norm": 0.07691784169579122, + "language_loss": 0.90311241, + "learning_rate": 0.0009919711505408308, + "loss": 0.91445279, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.36889648, + "step": 444, + "time_per_iteration": 2.784175395965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136596, + "balance_loss_mlp": 1.0992831, + "epoch": 0.08560984994228549, + "flos": 482671471104.0, + "grad_norm": 0.10632405925705127, + "language_loss": 0.87768185, + "learning_rate": 0.000991915448845342, + "loss": 0.8890478, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.37329102, + "step": 445, + "time_per_iteration": 2.5208120346069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131693, + "balance_loss_mlp": 1.09581065, + "epoch": 0.08580223162754906, + "flos": 516897027072.0, + "grad_norm": 0.08773057765175464, + "language_loss": 0.96764338, + "learning_rate": 0.000991859556171339, + "loss": 0.97896028, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.35888672, + "step": 446, + "time_per_iteration": 2.62111759185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121751, + "balance_loss_mlp": 1.08582091, + "epoch": 0.08599461331281262, + "flos": 531215511552.0, + "grad_norm": 0.09700121256693707, + "language_loss": 0.97393352, + "learning_rate": 0.000991803472540521, + "loss": 0.98515099, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.359375, + "step": 447, + "time_per_iteration": 2.6023707389831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106051, + "balance_loss_mlp": 1.07155204, + "epoch": 0.08618699499807618, + "flos": 789966743040.0, + "grad_norm": 0.08203891217845936, + "language_loss": 0.9339667, + "learning_rate": 0.0009917471979746615, + "loss": 0.94502723, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.34521484, + "step": 448, + "time_per_iteration": 3.032045841217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108032, + "balance_loss_mlp": 1.07288861, + "epoch": 0.08637937668333974, + "flos": 565707317760.0, + "grad_norm": 0.07141468257554369, + "language_loss": 0.93266523, + "learning_rate": 0.0009916907324956086, + "loss": 0.94374555, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.35180664, + "step": 449, + "time_per_iteration": 2.7145769596099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124616, + "balance_loss_mlp": 1.08820987, + "epoch": 0.08657175836860331, + "flos": 444930665472.0, + "grad_norm": 0.07969277456361384, + "language_loss": 0.88546509, + "learning_rate": 0.0009916340761252837, + "loss": 0.89671123, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.36376953, + "step": 450, + "time_per_iteration": 2.623152017593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137489, + "balance_loss_mlp": 1.10108209, + "epoch": 0.08676414005386687, + "flos": 843789450240.0, + "grad_norm": 0.11402885145068274, + "language_loss": 0.86408567, + "learning_rate": 0.0009915772288856832, + "loss": 0.87546057, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.36474609, + "step": 451, + "time_per_iteration": 3.069053888320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137973, + "balance_loss_mlp": 1.10178065, + "epoch": 0.08695652173913043, + "flos": 602995608576.0, + "grad_norm": 0.09443027615205003, + "language_loss": 0.88496101, + "learning_rate": 0.000991520190798877, + "loss": 0.89634073, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.36206055, + "step": 452, + "time_per_iteration": 2.8196520805358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146235, + "balance_loss_mlp": 1.10906577, + "epoch": 0.08714890342439399, + "flos": 730423028736.0, + "grad_norm": 0.10286670415776202, + "language_loss": 0.95532084, + "learning_rate": 0.0009914629618870089, + "loss": 0.96678317, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.37158203, + "step": 453, + "time_per_iteration": 2.8787243366241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247018, + "balance_loss_mlp": 1.22422564, + "epoch": 0.08734128510965757, + "flos": 1481518232064.0, + "grad_norm": 0.049899161357568285, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79922891, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.22753906, + "step": 454, + "time_per_iteration": 4.787290811538696 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212398, + "balance_loss_mlp": 1.19036818, + "epoch": 0.08753366679492113, + "flos": 1522214083584.0, + "grad_norm": 0.0324381166824538, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82640362, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.22070312, + "step": 455, + "time_per_iteration": 4.818731784820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120368, + "balance_loss_mlp": 1.08324623, + "epoch": 0.08772604848018468, + "flos": 720935078400.0, + "grad_norm": 0.09487211541236003, + "language_loss": 0.89355373, + "learning_rate": 0.0009912901304235883, + "loss": 0.90475744, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.37133789, + "step": 456, + "time_per_iteration": 2.8851993083953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011174, + "balance_loss_mlp": 1.08108902, + "epoch": 0.08791843016544824, + "flos": 707926086144.0, + "grad_norm": 0.09303414624011808, + "language_loss": 0.85744059, + "learning_rate": 0.000991232138434397, + "loss": 0.86861455, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.36352539, + "step": 457, + "time_per_iteration": 2.8450586795806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118359, + "balance_loss_mlp": 1.08126163, + "epoch": 0.08811081185071182, + "flos": 472799407104.0, + "grad_norm": 0.11356405017629323, + "language_loss": 0.91543031, + "learning_rate": 0.000991173955731976, + "loss": 0.92661393, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.37084961, + "step": 458, + "time_per_iteration": 2.6324169635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117714, + "balance_loss_mlp": 1.08190393, + "epoch": 0.08830319353597538, + "flos": 684647769600.0, + "grad_norm": 0.08091220448679284, + "language_loss": 0.98039645, + "learning_rate": 0.0009911155823389137, + "loss": 0.99157357, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.3581543, + "step": 459, + "time_per_iteration": 2.9783670902252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121069, + "balance_loss_mlp": 1.08451915, + "epoch": 0.08849557522123894, + "flos": 573235411968.0, + "grad_norm": 0.0940583187075056, + "language_loss": 0.93095994, + "learning_rate": 0.000991057018277873, + "loss": 0.94217062, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.36499023, + "step": 460, + "time_per_iteration": 2.742830276489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112002, + "balance_loss_mlp": 1.08380461, + "epoch": 0.0886879569065025, + "flos": 564303283200.0, + "grad_norm": 0.10556048763009983, + "language_loss": 0.92411214, + "learning_rate": 0.0009909982635715898, + "loss": 0.93531239, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.36279297, + "step": 461, + "time_per_iteration": 2.613490581512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111287, + "balance_loss_mlp": 1.07595301, + "epoch": 0.08888033859176607, + "flos": 563609249280.0, + "grad_norm": 0.07908948831956038, + "language_loss": 0.92236221, + "learning_rate": 0.0009909393182428751, + "loss": 0.93347514, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.35351562, + "step": 462, + "time_per_iteration": 2.654144048690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109293, + "balance_loss_mlp": 1.07331538, + "epoch": 0.08907272027702963, + "flos": 465517214208.0, + "grad_norm": 0.06646518051532449, + "language_loss": 0.87202108, + "learning_rate": 0.000990880182314614, + "loss": 0.88311398, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.359375, + "step": 463, + "time_per_iteration": 2.705138921737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108897, + "balance_loss_mlp": 1.07473207, + "epoch": 0.08926510196229319, + "flos": 681200921088.0, + "grad_norm": 0.06803924695737752, + "language_loss": 0.88676465, + "learning_rate": 0.0009908208558097643, + "loss": 0.89785367, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.34204102, + "step": 464, + "time_per_iteration": 2.971322536468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120032, + "balance_loss_mlp": 1.08412576, + "epoch": 0.08945748364755675, + "flos": 596411831808.0, + "grad_norm": 0.15708102336048957, + "language_loss": 0.90012753, + "learning_rate": 0.000990761338751359, + "loss": 0.91132784, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.35913086, + "step": 465, + "time_per_iteration": 2.7719008922576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01301625, + "balance_loss_mlp": 1.28073931, + "epoch": 0.08964986533282032, + "flos": 1585082400768.0, + "grad_norm": 0.06799997970585842, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74961245, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.20898438, + "step": 466, + "time_per_iteration": 4.991540193557739 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131475, + "balance_loss_mlp": 1.09637952, + "epoch": 0.08984224701808388, + "flos": 533268499968.0, + "grad_norm": 0.10779867371948758, + "language_loss": 0.9214865, + "learning_rate": 0.0009906417330663815, + "loss": 0.93280125, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.35131836, + "step": 467, + "time_per_iteration": 2.7089412212371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124394, + "balance_loss_mlp": 1.08917928, + "epoch": 0.09003462870334744, + "flos": 478702296576.0, + "grad_norm": 0.08471126953208015, + "language_loss": 0.88495421, + "learning_rate": 0.0009905816444862442, + "loss": 0.89619815, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.35253906, + "step": 468, + "time_per_iteration": 2.616262435913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129147, + "balance_loss_mlp": 1.09371758, + "epoch": 0.090227010388611, + "flos": 653307448320.0, + "grad_norm": 0.07702844129808738, + "language_loss": 0.87126988, + "learning_rate": 0.0009905213654454216, + "loss": 0.88256133, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.35473633, + "step": 469, + "time_per_iteration": 2.9097750186920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143119, + "balance_loss_mlp": 1.10678387, + "epoch": 0.09041939207387456, + "flos": 617894645760.0, + "grad_norm": 0.09194049655048094, + "language_loss": 0.92914081, + "learning_rate": 0.0009904608959673158, + "loss": 0.9405719, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.36328125, + "step": 470, + "time_per_iteration": 2.8030929565429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141965, + "balance_loss_mlp": 1.10491443, + "epoch": 0.09061177375913813, + "flos": 454137808896.0, + "grad_norm": 0.10933441897375067, + "language_loss": 0.92262268, + "learning_rate": 0.000990400236075403, + "loss": 0.93404239, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.37036133, + "step": 471, + "time_per_iteration": 2.4859976768493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117092, + "balance_loss_mlp": 1.08183014, + "epoch": 0.0908041554444017, + "flos": 543982984704.0, + "grad_norm": 0.08808088949589198, + "language_loss": 0.90884256, + "learning_rate": 0.0009903393857932338, + "loss": 0.92001355, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.35302734, + "step": 472, + "time_per_iteration": 2.6540582180023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115458, + "balance_loss_mlp": 1.07933736, + "epoch": 0.09099653712966525, + "flos": 564052999680.0, + "grad_norm": 0.08261940405294126, + "language_loss": 0.88272375, + "learning_rate": 0.0009902783451444317, + "loss": 0.89387828, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.36108398, + "step": 473, + "time_per_iteration": 2.7061197757720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116197, + "balance_loss_mlp": 1.0812211, + "epoch": 0.09118891881492881, + "flos": 474300956160.0, + "grad_norm": 0.11656166861680099, + "language_loss": 0.93563545, + "learning_rate": 0.0009902171141526956, + "loss": 0.94679749, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.34960938, + "step": 474, + "time_per_iteration": 2.524653911590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111201, + "balance_loss_mlp": 1.0760566, + "epoch": 0.09138130050019239, + "flos": 545579076096.0, + "grad_norm": 0.07692578036886621, + "language_loss": 0.81933677, + "learning_rate": 0.000990155692841797, + "loss": 0.83045685, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.35961914, + "step": 475, + "time_per_iteration": 2.9645543098449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106767, + "balance_loss_mlp": 1.07281613, + "epoch": 0.09157368218545595, + "flos": 732397441536.0, + "grad_norm": 0.08052092373184025, + "language_loss": 0.93009984, + "learning_rate": 0.0009900940812355818, + "loss": 0.94116753, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.33959961, + "step": 476, + "time_per_iteration": 2.8816893100738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107557, + "balance_loss_mlp": 1.07289076, + "epoch": 0.0917660638707195, + "flos": 610709967360.0, + "grad_norm": 0.14442514829584613, + "language_loss": 0.87309504, + "learning_rate": 0.00099003227935797, + "loss": 0.88417065, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.34716797, + "step": 477, + "time_per_iteration": 2.760615825653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122629, + "balance_loss_mlp": 1.08827257, + "epoch": 0.09195844555598306, + "flos": 655561257984.0, + "grad_norm": 0.12539398809889843, + "language_loss": 0.9113583, + "learning_rate": 0.000989970287232955, + "loss": 0.92258459, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.34399414, + "step": 478, + "time_per_iteration": 2.826150894165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119533, + "balance_loss_mlp": 1.08720374, + "epoch": 0.09215082724124664, + "flos": 476339387904.0, + "grad_norm": 0.06731886459053077, + "language_loss": 0.89701962, + "learning_rate": 0.0009899081048846043, + "loss": 0.90821493, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.32324219, + "step": 479, + "time_per_iteration": 2.580028772354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143549, + "balance_loss_mlp": 1.1092639, + "epoch": 0.0923432089265102, + "flos": 524051182080.0, + "grad_norm": 0.1155425244176876, + "language_loss": 0.9372611, + "learning_rate": 0.0009898457323370593, + "loss": 0.94869661, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.34301758, + "step": 480, + "time_per_iteration": 2.6090288162231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135277, + "balance_loss_mlp": 1.10132647, + "epoch": 0.09253559061177376, + "flos": 545302651392.0, + "grad_norm": 0.08946460297910715, + "language_loss": 0.92488086, + "learning_rate": 0.000989783169614535, + "loss": 0.93623364, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.33984375, + "step": 481, + "time_per_iteration": 2.6434848308563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130787, + "balance_loss_mlp": 1.28212094, + "epoch": 0.09272797229703732, + "flos": 1537222219776.0, + "grad_norm": 0.06384431456169105, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80060625, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.2578125, + "step": 482, + "time_per_iteration": 4.903714656829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120119, + "balance_loss_mlp": 1.08695483, + "epoch": 0.09292035398230089, + "flos": 689501624832.0, + "grad_norm": 0.0974321715773629, + "language_loss": 0.90389109, + "learning_rate": 0.000989657473741779, + "loss": 0.91509223, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.33178711, + "step": 483, + "time_per_iteration": 2.841749668121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132007, + "balance_loss_mlp": 1.09858036, + "epoch": 0.09311273566756445, + "flos": 509482414080.0, + "grad_norm": 0.07196755449742197, + "language_loss": 0.91361248, + "learning_rate": 0.0009895943406403465, + "loss": 0.9249326, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.33447266, + "step": 484, + "time_per_iteration": 2.728733539581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146209, + "balance_loss_mlp": 1.11137581, + "epoch": 0.09330511735282801, + "flos": 659111413248.0, + "grad_norm": 0.10097789553078372, + "language_loss": 0.84299308, + "learning_rate": 0.0009895310174615338, + "loss": 0.85445517, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.34863281, + "step": 485, + "time_per_iteration": 2.74460506439209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214233, + "balance_loss_mlp": 1.19239426, + "epoch": 0.09349749903809157, + "flos": 1452054809088.0, + "grad_norm": 0.04007792490845654, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76932752, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.21875, + "step": 486, + "time_per_iteration": 4.653090715408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135045, + "balance_loss_mlp": 1.10161829, + "epoch": 0.09368988072335514, + "flos": 520614508032.0, + "grad_norm": 0.07938978312310574, + "language_loss": 0.89514428, + "learning_rate": 0.0009894038009701782, + "loss": 0.90649474, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.33447266, + "step": 487, + "time_per_iteration": 2.6534616947174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145632, + "balance_loss_mlp": 1.1106087, + "epoch": 0.0938822624086187, + "flos": 497502107136.0, + "grad_norm": 0.09344776572677456, + "language_loss": 0.87733328, + "learning_rate": 0.0009893399077070253, + "loss": 0.88878953, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.35083008, + "step": 488, + "time_per_iteration": 2.5616586208343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130599, + "balance_loss_mlp": 1.09702933, + "epoch": 0.09407464409388226, + "flos": 532948405248.0, + "grad_norm": 0.08887912188605798, + "language_loss": 0.87485397, + "learning_rate": 0.0009892758244652718, + "loss": 0.8861599, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.3359375, + "step": 489, + "time_per_iteration": 2.6878652572631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114876, + "balance_loss_mlp": 1.08078194, + "epoch": 0.09426702577914582, + "flos": 585736634880.0, + "grad_norm": 0.08770205653150476, + "language_loss": 0.91117108, + "learning_rate": 0.0009892115512697968, + "loss": 0.92231989, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.34130859, + "step": 490, + "time_per_iteration": 2.67647123336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113824, + "balance_loss_mlp": 1.0808506, + "epoch": 0.0944594074644094, + "flos": 503081929728.0, + "grad_norm": 0.06826247830552083, + "language_loss": 0.94586283, + "learning_rate": 0.0009891470881455537, + "loss": 0.95700109, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.32983398, + "step": 491, + "time_per_iteration": 2.7388105392456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109252, + "balance_loss_mlp": 1.07627821, + "epoch": 0.09465178914967295, + "flos": 570748847616.0, + "grad_norm": 0.08083030362482532, + "language_loss": 0.90903842, + "learning_rate": 0.0009890824351175692, + "loss": 0.92013097, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.32983398, + "step": 492, + "time_per_iteration": 2.710557222366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108934, + "balance_loss_mlp": 1.07586551, + "epoch": 0.09484417083493651, + "flos": 549098707968.0, + "grad_norm": 0.07986708443523517, + "language_loss": 0.96040058, + "learning_rate": 0.0009890175922109435, + "loss": 0.97148991, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.33081055, + "step": 493, + "time_per_iteration": 2.748145341873169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119937, + "balance_loss_mlp": 1.08686852, + "epoch": 0.09503655252020007, + "flos": 823552109568.0, + "grad_norm": 0.1003982234968368, + "language_loss": 0.93827844, + "learning_rate": 0.0009889525594508513, + "loss": 0.94947779, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.33081055, + "step": 494, + "time_per_iteration": 2.9940547943115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113416, + "balance_loss_mlp": 1.08037138, + "epoch": 0.09522893420546363, + "flos": 404397757440.0, + "grad_norm": 0.06206488721584602, + "language_loss": 0.88783181, + "learning_rate": 0.0009888873368625404, + "loss": 0.89896601, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.33056641, + "step": 495, + "time_per_iteration": 2.5122387409210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129, + "balance_loss_mlp": 1.09557426, + "epoch": 0.0954213158907272, + "flos": 690707810304.0, + "grad_norm": 0.08099902604416225, + "language_loss": 0.9180485, + "learning_rate": 0.0009888219244713326, + "loss": 0.92933846, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.33447266, + "step": 496, + "time_per_iteration": 2.8516368865966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145932, + "balance_loss_mlp": 1.11152768, + "epoch": 0.09561369757599077, + "flos": 518739019776.0, + "grad_norm": 0.09295440988952328, + "language_loss": 0.91113585, + "learning_rate": 0.0009887563223026229, + "loss": 0.92259514, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.34423828, + "step": 497, + "time_per_iteration": 2.7165610790252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226975, + "balance_loss_mlp": 1.20780587, + "epoch": 0.09580607926125433, + "flos": 1384825849344.0, + "grad_norm": 0.04473280554485948, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80295134, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.19140625, + "step": 498, + "time_per_iteration": 4.9133830070495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158093, + "balance_loss_mlp": 1.12261629, + "epoch": 0.09599846094651789, + "flos": 717090969600.0, + "grad_norm": 0.0716278208231272, + "language_loss": 0.91129965, + "learning_rate": 0.0009886245487346482, + "loss": 0.92288053, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.35522461, + "step": 499, + "time_per_iteration": 3.074453353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151408, + "balance_loss_mlp": 1.1164794, + "epoch": 0.09619084263178146, + "flos": 385824909312.0, + "grad_norm": 0.09258819117654143, + "language_loss": 0.93041325, + "learning_rate": 0.0009885583773865422, + "loss": 0.94192737, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.34912109, + "step": 500, + "time_per_iteration": 2.4206974506378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129637, + "balance_loss_mlp": 1.09482849, + "epoch": 0.09638322431704502, + "flos": 533869401600.0, + "grad_norm": 0.08421486249996342, + "language_loss": 0.90840685, + "learning_rate": 0.0009884920163632524, + "loss": 0.9197033, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.34814453, + "step": 501, + "time_per_iteration": 2.653083324432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133899, + "balance_loss_mlp": 1.09875655, + "epoch": 0.09657560600230858, + "flos": 500426629632.0, + "grad_norm": 0.08831216016047307, + "language_loss": 0.92406952, + "learning_rate": 0.000988425465690543, + "loss": 0.93540847, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.35180664, + "step": 502, + "time_per_iteration": 2.5902318954467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129227, + "balance_loss_mlp": 1.09363079, + "epoch": 0.09676798768757214, + "flos": 528995197440.0, + "grad_norm": 0.08884204924947281, + "language_loss": 0.89819443, + "learning_rate": 0.0009883587253942505, + "loss": 0.90948665, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.35595703, + "step": 503, + "time_per_iteration": 2.7927231788635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134871, + "balance_loss_mlp": 1.09956098, + "epoch": 0.09696036937283571, + "flos": 463379857920.0, + "grad_norm": 0.08422879575374595, + "language_loss": 0.96091402, + "learning_rate": 0.0009882917955002862, + "loss": 0.97226262, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.35302734, + "step": 504, + "time_per_iteration": 2.538280963897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117737, + "balance_loss_mlp": 1.08297515, + "epoch": 0.09715275105809927, + "flos": 534716204544.0, + "grad_norm": 0.07639016770494517, + "language_loss": 0.89420688, + "learning_rate": 0.0009882246760346343, + "loss": 0.9053843, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.34790039, + "step": 505, + "time_per_iteration": 2.6242942810058594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124058, + "balance_loss_mlp": 1.08834267, + "epoch": 0.09734513274336283, + "flos": 454713979392.0, + "grad_norm": 0.11518068103281653, + "language_loss": 0.92468822, + "learning_rate": 0.0009881573670233533, + "loss": 0.93592882, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.35742188, + "step": 506, + "time_per_iteration": 2.516587734222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114247, + "balance_loss_mlp": 1.08074903, + "epoch": 0.09753751442862639, + "flos": 508551243264.0, + "grad_norm": 0.07574597822432369, + "language_loss": 0.8811729, + "learning_rate": 0.0009880898684925747, + "loss": 0.89231527, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.33520508, + "step": 507, + "time_per_iteration": 2.693880081176758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107378, + "balance_loss_mlp": 1.07402313, + "epoch": 0.09772989611388996, + "flos": 484030425600.0, + "grad_norm": 0.07603441014422499, + "language_loss": 0.86951101, + "learning_rate": 0.0009880221804685037, + "loss": 0.88058472, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.33374023, + "step": 508, + "time_per_iteration": 2.5847270488739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01468428, + "balance_loss_mlp": 1.44983101, + "epoch": 0.09792227779915352, + "flos": 1565306339328.0, + "grad_norm": 0.12348847609036423, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80812848, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.18554688, + "step": 509, + "time_per_iteration": 4.756307601928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123963, + "balance_loss_mlp": 1.09103727, + "epoch": 0.09811465948441708, + "flos": 587529165312.0, + "grad_norm": 0.08757433726580034, + "language_loss": 0.93106389, + "learning_rate": 0.0009878862360456733, + "loss": 0.9423036, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.32910156, + "step": 510, + "time_per_iteration": 2.6813509464263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110714, + "balance_loss_mlp": 1.07759809, + "epoch": 0.09830704116968064, + "flos": 612719285760.0, + "grad_norm": 0.08240718915912659, + "language_loss": 0.86918676, + "learning_rate": 0.0009878179796996922, + "loss": 0.88029397, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.33129883, + "step": 511, + "time_per_iteration": 2.7128310203552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113196, + "balance_loss_mlp": 1.08112836, + "epoch": 0.09849942285494422, + "flos": 538528227840.0, + "grad_norm": 0.07802243599022093, + "language_loss": 0.90101254, + "learning_rate": 0.0009877495339659754, + "loss": 0.91214454, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.32055664, + "step": 512, + "time_per_iteration": 2.8097684383392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102205, + "balance_loss_mlp": 1.07035255, + "epoch": 0.09869180454020778, + "flos": 620193535488.0, + "grad_norm": 0.09144065810451378, + "language_loss": 0.850245, + "learning_rate": 0.000987680898871096, + "loss": 0.86126709, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.31835938, + "step": 513, + "time_per_iteration": 2.7469441890716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108259, + "balance_loss_mlp": 1.07502341, + "epoch": 0.09888418622547133, + "flos": 811375363584.0, + "grad_norm": 0.10540688433367246, + "language_loss": 0.85520494, + "learning_rate": 0.0009876120744417, + "loss": 0.86628759, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.33251953, + "step": 514, + "time_per_iteration": 2.9515652656555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101037, + "balance_loss_mlp": 1.06818295, + "epoch": 0.0990765679107349, + "flos": 535548450816.0, + "grad_norm": 0.09508855922632749, + "language_loss": 0.93521011, + "learning_rate": 0.0009875430607045078, + "loss": 0.94622052, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.32861328, + "step": 515, + "time_per_iteration": 2.7193381786346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109471, + "balance_loss_mlp": 1.06164145, + "epoch": 0.09926894959599845, + "flos": 587607740928.0, + "grad_norm": 0.07449645219133615, + "language_loss": 0.90591514, + "learning_rate": 0.000987473857686313, + "loss": 0.91686225, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.33081055, + "step": 516, + "time_per_iteration": 2.7179975509643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115382, + "balance_loss_mlp": 1.08100188, + "epoch": 0.09946133128126203, + "flos": 640947409920.0, + "grad_norm": 0.10856360121839106, + "language_loss": 0.92182052, + "learning_rate": 0.0009874044654139824, + "loss": 0.9329744, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.34423828, + "step": 517, + "time_per_iteration": 2.7596991062164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135092, + "balance_loss_mlp": 1.10104585, + "epoch": 0.09965371296652559, + "flos": 465546327552.0, + "grad_norm": 0.10414801938878855, + "language_loss": 0.9130857, + "learning_rate": 0.0009873348839144563, + "loss": 0.92443669, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.34082031, + "step": 518, + "time_per_iteration": 2.5228705406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117267, + "balance_loss_mlp": 1.1381228, + "epoch": 0.09984609465178915, + "flos": 483365505024.0, + "grad_norm": 0.09626367264756285, + "language_loss": 0.94683075, + "learning_rate": 0.000987265113214749, + "loss": 0.95855749, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.34545898, + "step": 519, + "time_per_iteration": 2.5458812713623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118966, + "balance_loss_mlp": 1.15339625, + "epoch": 0.1000384763370527, + "flos": 568764260352.0, + "grad_norm": 0.12320854939875277, + "language_loss": 0.94298297, + "learning_rate": 0.0009871951533419476, + "loss": 0.95487958, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.36279297, + "step": 520, + "time_per_iteration": 2.663461208343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156318, + "balance_loss_mlp": 1.12010193, + "epoch": 0.10023085802231628, + "flos": 545515057152.0, + "grad_norm": 0.08720896475780489, + "language_loss": 0.86881042, + "learning_rate": 0.0009871250043232132, + "loss": 0.8803736, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.36206055, + "step": 521, + "time_per_iteration": 2.7820796966552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140376, + "balance_loss_mlp": 1.1049943, + "epoch": 0.10042323970757984, + "flos": 503208557568.0, + "grad_norm": 0.08876661910472074, + "language_loss": 0.85204661, + "learning_rate": 0.0009870546661857797, + "loss": 0.86345041, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.35375977, + "step": 522, + "time_per_iteration": 2.634274482727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152268, + "balance_loss_mlp": 1.11583781, + "epoch": 0.1006156213928434, + "flos": 770084402688.0, + "grad_norm": 0.08623162465623763, + "language_loss": 0.92886114, + "learning_rate": 0.0009869841389569553, + "loss": 0.94038385, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.36401367, + "step": 523, + "time_per_iteration": 3.0027353763580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151068, + "balance_loss_mlp": 1.11571026, + "epoch": 0.10080800307810696, + "flos": 489786338304.0, + "grad_norm": 0.07820731611640971, + "language_loss": 0.86882633, + "learning_rate": 0.0009869134226641206, + "loss": 0.880337, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.35424805, + "step": 524, + "time_per_iteration": 2.5850446224212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159546, + "balance_loss_mlp": 1.12330627, + "epoch": 0.10100038476337053, + "flos": 454478252544.0, + "grad_norm": 0.07931950894681525, + "language_loss": 0.86448371, + "learning_rate": 0.0009868425173347303, + "loss": 0.8760792, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.36254883, + "step": 525, + "time_per_iteration": 2.6873726844787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171885, + "balance_loss_mlp": 1.13617015, + "epoch": 0.10119276644863409, + "flos": 556155348480.0, + "grad_norm": 0.09671662269899156, + "language_loss": 0.94872439, + "learning_rate": 0.0009867714229963125, + "loss": 0.96044326, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.35717773, + "step": 526, + "time_per_iteration": 2.697547197341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155786, + "balance_loss_mlp": 1.12083411, + "epoch": 0.10138514813389765, + "flos": 515990587392.0, + "grad_norm": 0.10324452979849556, + "language_loss": 0.9236598, + "learning_rate": 0.000986700139676468, + "loss": 0.93521762, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.34960938, + "step": 527, + "time_per_iteration": 2.5702626705169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171342, + "balance_loss_mlp": 1.1346494, + "epoch": 0.10157752981916121, + "flos": 500323322880.0, + "grad_norm": 0.08227699709590157, + "language_loss": 0.89510548, + "learning_rate": 0.0009866286674028717, + "loss": 0.90681893, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.36694336, + "step": 528, + "time_per_iteration": 2.699542284011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141961, + "balance_loss_mlp": 1.1081537, + "epoch": 0.10176991150442478, + "flos": 656444376576.0, + "grad_norm": 0.0843490367773928, + "language_loss": 0.8638742, + "learning_rate": 0.0009865570062032717, + "loss": 0.87529385, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.33837891, + "step": 529, + "time_per_iteration": 2.941728353500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114882, + "balance_loss_mlp": 1.11420166, + "epoch": 0.10196229318968834, + "flos": 572974953984.0, + "grad_norm": 0.07671472850746988, + "language_loss": 0.9148134, + "learning_rate": 0.0009864851561054893, + "loss": 0.9263016, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.34643555, + "step": 530, + "time_per_iteration": 2.7894959449768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147452, + "balance_loss_mlp": 1.1134541, + "epoch": 0.1021546748749519, + "flos": 517946061312.0, + "grad_norm": 0.08702044825545475, + "language_loss": 0.90471494, + "learning_rate": 0.0009864131171374191, + "loss": 0.91618943, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.34033203, + "step": 531, + "time_per_iteration": 2.6681158542633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144724, + "balance_loss_mlp": 1.11139297, + "epoch": 0.10234705656021546, + "flos": 609470286336.0, + "grad_norm": 0.0664826941787488, + "language_loss": 0.89538574, + "learning_rate": 0.0009863408893270292, + "loss": 0.90683293, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.33349609, + "step": 532, + "time_per_iteration": 2.7965428829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129812, + "balance_loss_mlp": 1.09576535, + "epoch": 0.10253943824547904, + "flos": 601473710592.0, + "grad_norm": 0.08878024025613328, + "language_loss": 0.84706688, + "learning_rate": 0.0009862684727023605, + "loss": 0.858365, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.34082031, + "step": 533, + "time_per_iteration": 2.7238268852233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116987, + "balance_loss_mlp": 1.08453798, + "epoch": 0.1027318199307426, + "flos": 662647011840.0, + "grad_norm": 0.1682383439962665, + "language_loss": 0.87668955, + "learning_rate": 0.0009861958672915283, + "loss": 0.8878594, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.32446289, + "step": 534, + "time_per_iteration": 2.7945988178253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096267, + "balance_loss_mlp": 1.06415248, + "epoch": 0.10292420161600616, + "flos": 682962928128.0, + "grad_norm": 0.0654465541126679, + "language_loss": 0.88598454, + "learning_rate": 0.0009861230731227201, + "loss": 0.89694726, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.32104492, + "step": 535, + "time_per_iteration": 2.8504462242126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094849, + "balance_loss_mlp": 1.06180418, + "epoch": 0.10311658330126972, + "flos": 490042414080.0, + "grad_norm": 0.09703481929017231, + "language_loss": 0.90092826, + "learning_rate": 0.0009860500902241973, + "loss": 0.91187674, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.33056641, + "step": 536, + "time_per_iteration": 2.6230618953704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093921, + "balance_loss_mlp": 1.06028032, + "epoch": 0.10330896498653329, + "flos": 431508446208.0, + "grad_norm": 0.07541190921269121, + "language_loss": 0.94890571, + "learning_rate": 0.0009859769186242942, + "loss": 0.95984495, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.33642578, + "step": 537, + "time_per_iteration": 2.5023155212402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090493, + "balance_loss_mlp": 1.05802083, + "epoch": 0.10350134667179685, + "flos": 549330052608.0, + "grad_norm": 0.08038513642950565, + "language_loss": 0.87629044, + "learning_rate": 0.0009859035583514187, + "loss": 0.88719535, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.32470703, + "step": 538, + "time_per_iteration": 2.617408514022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102021, + "balance_loss_mlp": 1.06885695, + "epoch": 0.10369372835706041, + "flos": 640327569408.0, + "grad_norm": 0.08463096218018039, + "language_loss": 0.88947332, + "learning_rate": 0.0009858300094340517, + "loss": 0.9004935, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.33178711, + "step": 539, + "time_per_iteration": 2.7788918018341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102321, + "balance_loss_mlp": 1.06989646, + "epoch": 0.10388611004232397, + "flos": 521500598784.0, + "grad_norm": 0.08363201697238119, + "language_loss": 0.84166092, + "learning_rate": 0.0009857562719007473, + "loss": 0.85268414, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.32421875, + "step": 540, + "time_per_iteration": 2.6021273136138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106344, + "balance_loss_mlp": 1.07349014, + "epoch": 0.10407849172758753, + "flos": 702111946752.0, + "grad_norm": 0.07699058030721453, + "language_loss": 0.86313522, + "learning_rate": 0.0009856823457801331, + "loss": 0.87419868, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.32861328, + "step": 541, + "time_per_iteration": 2.898247003555298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121037, + "balance_loss_mlp": 1.0881114, + "epoch": 0.1042708734128511, + "flos": 502652736000.0, + "grad_norm": 0.09427475874312204, + "language_loss": 0.92884254, + "learning_rate": 0.00098560823110091, + "loss": 0.94005299, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.3293457, + "step": 542, + "time_per_iteration": 2.628246784210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117484, + "balance_loss_mlp": 1.08441556, + "epoch": 0.10446325509811466, + "flos": 485331153408.0, + "grad_norm": 0.09038961872332987, + "language_loss": 0.93836176, + "learning_rate": 0.000985533927891851, + "loss": 0.94953668, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.33081055, + "step": 543, + "time_per_iteration": 2.6802377700805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104521, + "balance_loss_mlp": 1.07114232, + "epoch": 0.10465563678337822, + "flos": 568365590016.0, + "grad_norm": 0.07979198382497373, + "language_loss": 0.91847962, + "learning_rate": 0.0009854594361818044, + "loss": 0.9295249, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.33398438, + "step": 544, + "time_per_iteration": 2.6934244632720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097218, + "balance_loss_mlp": 1.06372046, + "epoch": 0.10484801846864178, + "flos": 625806853632.0, + "grad_norm": 0.070981397623147, + "language_loss": 0.91175914, + "learning_rate": 0.0009853847559996897, + "loss": 0.92273128, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.33520508, + "step": 545, + "time_per_iteration": 2.7615010738372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121637, + "balance_loss_mlp": 1.08713746, + "epoch": 0.10504040015390535, + "flos": 743063874048.0, + "grad_norm": 0.07225830349373973, + "language_loss": 0.90024251, + "learning_rate": 0.0009853098873745, + "loss": 0.91145885, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.34545898, + "step": 546, + "time_per_iteration": 2.995853900909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128427, + "balance_loss_mlp": 1.09407067, + "epoch": 0.10523278183916891, + "flos": 586382616576.0, + "grad_norm": 0.08430865527250554, + "language_loss": 0.89361405, + "learning_rate": 0.0009852348303353027, + "loss": 0.90489835, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.34399414, + "step": 547, + "time_per_iteration": 2.7888100147247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141582, + "balance_loss_mlp": 1.106511, + "epoch": 0.10542516352443247, + "flos": 869270552064.0, + "grad_norm": 0.07123259169118071, + "language_loss": 0.82929194, + "learning_rate": 0.000985159584911237, + "loss": 0.84070778, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.35107422, + "step": 548, + "time_per_iteration": 3.11181902885437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141867, + "balance_loss_mlp": 1.10658062, + "epoch": 0.10561754520969603, + "flos": 505182970368.0, + "grad_norm": 0.1040806422735416, + "language_loss": 0.89825702, + "learning_rate": 0.0009850841511315162, + "loss": 0.90967572, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.35327148, + "step": 549, + "time_per_iteration": 2.638000726699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130362, + "balance_loss_mlp": 1.09493339, + "epoch": 0.1058099268949596, + "flos": 559690947072.0, + "grad_norm": 0.07056487851665215, + "language_loss": 0.9078036, + "learning_rate": 0.0009850085290254256, + "loss": 0.9191072, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.35424805, + "step": 550, + "time_per_iteration": 2.774028778076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117666, + "balance_loss_mlp": 1.08273757, + "epoch": 0.10600230858022316, + "flos": 561773048832.0, + "grad_norm": 0.06745406591759516, + "language_loss": 0.87385082, + "learning_rate": 0.0009849327186223246, + "loss": 0.88502753, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.34936523, + "step": 551, + "time_per_iteration": 2.7669272422790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110145, + "balance_loss_mlp": 1.06845236, + "epoch": 0.10619469026548672, + "flos": 494079989760.0, + "grad_norm": 0.0691737715515626, + "language_loss": 0.94504517, + "learning_rate": 0.000984856719951646, + "loss": 0.95605963, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.33007812, + "step": 552, + "time_per_iteration": 2.5428550243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111854, + "balance_loss_mlp": 1.07747412, + "epoch": 0.10638707195075028, + "flos": 675843678720.0, + "grad_norm": 0.09712099675981889, + "language_loss": 0.91101605, + "learning_rate": 0.0009847805330428943, + "loss": 0.92213452, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.34399414, + "step": 553, + "time_per_iteration": 2.9055614471435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122894, + "balance_loss_mlp": 1.08846664, + "epoch": 0.10657945363601386, + "flos": 487811925504.0, + "grad_norm": 0.09294887941398464, + "language_loss": 0.92195344, + "learning_rate": 0.0009847041579256481, + "loss": 0.93318236, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.34448242, + "step": 554, + "time_per_iteration": 2.5995588302612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124167, + "balance_loss_mlp": 1.08859539, + "epoch": 0.10677183532127742, + "flos": 482706376704.0, + "grad_norm": 0.08058010800108027, + "language_loss": 0.94049567, + "learning_rate": 0.0009846275946295592, + "loss": 0.9517374, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.35595703, + "step": 555, + "time_per_iteration": 2.6564345359802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114817, + "balance_loss_mlp": 1.07919669, + "epoch": 0.10696421700654098, + "flos": 655917668352.0, + "grad_norm": 0.06398894491712905, + "language_loss": 0.86843902, + "learning_rate": 0.0009845508431843518, + "loss": 0.87958717, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.35620117, + "step": 556, + "time_per_iteration": 3.0014877319335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112252, + "balance_loss_mlp": 1.07675159, + "epoch": 0.10715659869180454, + "flos": 567483881472.0, + "grad_norm": 0.06905237280169106, + "language_loss": 0.87712479, + "learning_rate": 0.0009844739036198233, + "loss": 0.88824731, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.35522461, + "step": 557, + "time_per_iteration": 2.6663765907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126883, + "balance_loss_mlp": 1.09026217, + "epoch": 0.10734898037706811, + "flos": 540432829440.0, + "grad_norm": 0.08117667522677224, + "language_loss": 0.94649851, + "learning_rate": 0.0009843967759658448, + "loss": 0.95776731, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.36621094, + "step": 558, + "time_per_iteration": 2.6776351928710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01325803, + "balance_loss_mlp": 1.29795551, + "epoch": 0.10754136206233167, + "flos": 1475870008320.0, + "grad_norm": 0.07702272040631068, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74093556, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.27929688, + "step": 559, + "time_per_iteration": 4.862372398376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112585, + "balance_loss_mlp": 1.08906162, + "epoch": 0.10773374374759523, + "flos": 512155243008.0, + "grad_norm": 0.07411063690195181, + "language_loss": 0.94592023, + "learning_rate": 0.000984241956509384, + "loss": 0.95717871, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.36767578, + "step": 560, + "time_per_iteration": 2.6602537631988525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152944, + "balance_loss_mlp": 1.11455846, + "epoch": 0.10792612543285879, + "flos": 496261016064.0, + "grad_norm": 0.08630165838839422, + "language_loss": 0.89956963, + "learning_rate": 0.0009841642647670078, + "loss": 0.91109908, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.38378906, + "step": 561, + "time_per_iteration": 2.5539767742156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153249, + "balance_loss_mlp": 1.11433935, + "epoch": 0.10811850711812235, + "flos": 735131317248.0, + "grad_norm": 0.09499730641116207, + "language_loss": 0.84606594, + "learning_rate": 0.0009840863850553944, + "loss": 0.85759842, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.38867188, + "step": 562, + "time_per_iteration": 2.972862720489502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139333, + "balance_loss_mlp": 1.10261655, + "epoch": 0.10831088880338592, + "flos": 611257024512.0, + "grad_norm": 0.08740431235801023, + "language_loss": 0.90812922, + "learning_rate": 0.0009840083174047782, + "loss": 0.91952258, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.3671875, + "step": 563, + "time_per_iteration": 2.728081464767456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133161, + "balance_loss_mlp": 1.09739876, + "epoch": 0.10850327048864948, + "flos": 556022928384.0, + "grad_norm": 0.09202985623691126, + "language_loss": 0.85552108, + "learning_rate": 0.0009839300618454685, + "loss": 0.8668527, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.35791016, + "step": 564, + "time_per_iteration": 2.833817958831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130452, + "balance_loss_mlp": 1.09538078, + "epoch": 0.10869565217391304, + "flos": 602902476288.0, + "grad_norm": 0.06834466327041812, + "language_loss": 0.90596354, + "learning_rate": 0.0009838516184078466, + "loss": 0.91726804, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.35131836, + "step": 565, + "time_per_iteration": 2.8160781860351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154605, + "balance_loss_mlp": 1.1185081, + "epoch": 0.1088880338591766, + "flos": 525922288128.0, + "grad_norm": 0.07188227567019471, + "language_loss": 0.87634718, + "learning_rate": 0.0009837729871223669, + "loss": 0.88789332, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.36083984, + "step": 566, + "time_per_iteration": 2.62117600440979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177195, + "balance_loss_mlp": 1.1406219, + "epoch": 0.10908041554444017, + "flos": 619986921984.0, + "grad_norm": 0.08533641778088655, + "language_loss": 0.88115579, + "learning_rate": 0.0009836941680195568, + "loss": 0.89292771, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.36547852, + "step": 567, + "time_per_iteration": 2.828911542892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165998, + "balance_loss_mlp": 1.12994933, + "epoch": 0.10927279722970373, + "flos": 897740195328.0, + "grad_norm": 0.08003102464580239, + "language_loss": 0.83622086, + "learning_rate": 0.0009836151611300166, + "loss": 0.84788084, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.3605957, + "step": 568, + "time_per_iteration": 3.2273471355438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114699, + "balance_loss_mlp": 1.11177564, + "epoch": 0.10946517891496729, + "flos": 528408852480.0, + "grad_norm": 0.13762061821089808, + "language_loss": 0.94344527, + "learning_rate": 0.0009835359664844194, + "loss": 0.95491517, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.35253906, + "step": 569, + "time_per_iteration": 2.61690616607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424326, + "balance_loss_mlp": 1.39514339, + "epoch": 0.10965756060023085, + "flos": 1559944714752.0, + "grad_norm": 0.09677893451051751, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82461131, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.29101562, + "step": 570, + "time_per_iteration": 4.929012298583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129446, + "balance_loss_mlp": 1.09449339, + "epoch": 0.10984994228549443, + "flos": 512820163584.0, + "grad_norm": 0.10645850756285262, + "language_loss": 0.9142105, + "learning_rate": 0.0009833770140481118, + "loss": 0.92550498, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.34985352, + "step": 571, + "time_per_iteration": 2.6662757396698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122373, + "balance_loss_mlp": 1.08689654, + "epoch": 0.11004232397075799, + "flos": 954314307072.0, + "grad_norm": 0.12031633973381815, + "language_loss": 0.82440388, + "learning_rate": 0.000983297256319112, + "loss": 0.83562756, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.35522461, + "step": 572, + "time_per_iteration": 3.218076467514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134932, + "balance_loss_mlp": 1.09850204, + "epoch": 0.11023470565602154, + "flos": 487921024512.0, + "grad_norm": 0.08427819288291502, + "language_loss": 0.86899912, + "learning_rate": 0.000983217310957477, + "loss": 0.88034844, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.36425781, + "step": 573, + "time_per_iteration": 2.778571128845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144342, + "balance_loss_mlp": 1.10803151, + "epoch": 0.1104270873412851, + "flos": 655521970176.0, + "grad_norm": 0.06509507329480971, + "language_loss": 0.90168923, + "learning_rate": 0.000983137177994244, + "loss": 0.91313267, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.36352539, + "step": 574, + "time_per_iteration": 2.872412919998169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137496, + "balance_loss_mlp": 1.10221016, + "epoch": 0.11061946902654868, + "flos": 723097165824.0, + "grad_norm": 0.06653120926816534, + "language_loss": 0.85785711, + "learning_rate": 0.0009830568574605235, + "loss": 0.86923206, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.35302734, + "step": 575, + "time_per_iteration": 2.923383951187134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145913, + "balance_loss_mlp": 1.10984039, + "epoch": 0.11081185071181224, + "flos": 835113397248.0, + "grad_norm": 0.0865486301410286, + "language_loss": 0.87525302, + "learning_rate": 0.0009829763493874992, + "loss": 0.88671219, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.36083984, + "step": 576, + "time_per_iteration": 3.032942056655884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134136, + "balance_loss_mlp": 1.09753847, + "epoch": 0.1110042323970758, + "flos": 608776252416.0, + "grad_norm": 0.08630194081372794, + "language_loss": 0.93183506, + "learning_rate": 0.0009828956538064264, + "loss": 0.94317639, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.36621094, + "step": 577, + "time_per_iteration": 2.8152406215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125176, + "balance_loss_mlp": 1.0888648, + "epoch": 0.11119661408233936, + "flos": 595643604480.0, + "grad_norm": 0.07101537919866721, + "language_loss": 0.90824157, + "learning_rate": 0.0009828147707486344, + "loss": 0.91949332, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.36328125, + "step": 578, + "time_per_iteration": 2.724550485610962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118046, + "balance_loss_mlp": 1.08209252, + "epoch": 0.11138899576760293, + "flos": 555573385728.0, + "grad_norm": 0.08130034202286071, + "language_loss": 0.86348194, + "learning_rate": 0.0009827337002455245, + "loss": 0.8746624, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.35961914, + "step": 579, + "time_per_iteration": 2.652369976043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111199, + "balance_loss_mlp": 1.07579851, + "epoch": 0.11158137745286649, + "flos": 689418667008.0, + "grad_norm": 0.06366605788409145, + "language_loss": 0.88115346, + "learning_rate": 0.0009826524423285712, + "loss": 0.89227337, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.36181641, + "step": 580, + "time_per_iteration": 2.947925567626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108192, + "balance_loss_mlp": 1.07192874, + "epoch": 0.11177375913813005, + "flos": 762688728576.0, + "grad_norm": 0.08930617061108917, + "language_loss": 0.88938302, + "learning_rate": 0.0009825709970293218, + "loss": 0.90046495, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.36303711, + "step": 581, + "time_per_iteration": 2.8744056224823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103655, + "balance_loss_mlp": 1.06731987, + "epoch": 0.11196614082339361, + "flos": 806211588096.0, + "grad_norm": 0.07222891797599594, + "language_loss": 0.95056951, + "learning_rate": 0.0009824893643793956, + "loss": 0.96160614, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.36328125, + "step": 582, + "time_per_iteration": 3.051945209503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104834, + "balance_loss_mlp": 1.06811786, + "epoch": 0.11215852250865718, + "flos": 558350931456.0, + "grad_norm": 0.0803498647914251, + "language_loss": 0.88078201, + "learning_rate": 0.0009824075444104857, + "loss": 0.89183033, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.3671875, + "step": 583, + "time_per_iteration": 2.6833813190460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111764, + "balance_loss_mlp": 1.07507193, + "epoch": 0.11235090419392074, + "flos": 513322140672.0, + "grad_norm": 0.08148632832875594, + "language_loss": 0.93207705, + "learning_rate": 0.000982325537154357, + "loss": 0.94319463, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.36694336, + "step": 584, + "time_per_iteration": 2.582225799560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113364, + "balance_loss_mlp": 1.07574129, + "epoch": 0.1125432858791843, + "flos": 491209311744.0, + "grad_norm": 0.08313203670373176, + "language_loss": 0.93823397, + "learning_rate": 0.0009822433426428484, + "loss": 0.94936764, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.37597656, + "step": 585, + "time_per_iteration": 2.568070888519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113305, + "balance_loss_mlp": 1.07594514, + "epoch": 0.11273566756444786, + "flos": 510476193792.0, + "grad_norm": 0.07694998173228458, + "language_loss": 0.86627567, + "learning_rate": 0.0009821609609078697, + "loss": 0.87740874, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.37304688, + "step": 586, + "time_per_iteration": 2.658702850341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103349, + "balance_loss_mlp": 1.06775331, + "epoch": 0.11292804924971142, + "flos": 622149009408.0, + "grad_norm": 0.10421690738013599, + "language_loss": 0.89634144, + "learning_rate": 0.0009820783919814045, + "loss": 0.90737498, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.35620117, + "step": 587, + "time_per_iteration": 2.803866386413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109643, + "balance_loss_mlp": 1.07295036, + "epoch": 0.113120430934975, + "flos": 477811823616.0, + "grad_norm": 0.07979925286699333, + "language_loss": 0.82699567, + "learning_rate": 0.0009819956358955095, + "loss": 0.83809209, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.36669922, + "step": 588, + "time_per_iteration": 2.5929653644561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110362, + "balance_loss_mlp": 1.07433677, + "epoch": 0.11331281262023855, + "flos": 466801975296.0, + "grad_norm": 0.07216149622243874, + "language_loss": 0.83354205, + "learning_rate": 0.0009819126926823127, + "loss": 0.84464574, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.36035156, + "step": 589, + "time_per_iteration": 2.5229685306549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122798, + "balance_loss_mlp": 1.08658195, + "epoch": 0.11350519430550211, + "flos": 650164727808.0, + "grad_norm": 0.08255396626581768, + "language_loss": 0.86631322, + "learning_rate": 0.000981829562374016, + "loss": 0.87754118, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.36279297, + "step": 590, + "time_per_iteration": 2.7939858436584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124668, + "balance_loss_mlp": 1.08804727, + "epoch": 0.11369757599076567, + "flos": 557547798528.0, + "grad_norm": 0.07763031144810686, + "language_loss": 0.97565413, + "learning_rate": 0.0009817462450028933, + "loss": 0.98690081, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.3659668, + "step": 591, + "time_per_iteration": 2.651886224746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115476, + "balance_loss_mlp": 1.07918823, + "epoch": 0.11388995767602925, + "flos": 570774988800.0, + "grad_norm": 0.0679599519530346, + "language_loss": 0.85396111, + "learning_rate": 0.0009816627406012916, + "loss": 0.86511576, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.36303711, + "step": 592, + "time_per_iteration": 2.8203041553497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117009, + "balance_loss_mlp": 1.08079314, + "epoch": 0.1140823393612928, + "flos": 740069540352.0, + "grad_norm": 0.07941270182617734, + "language_loss": 0.84330916, + "learning_rate": 0.0009815790492016295, + "loss": 0.85447925, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.36254883, + "step": 593, + "time_per_iteration": 2.952115058898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111293, + "balance_loss_mlp": 1.07529223, + "epoch": 0.11427472104655637, + "flos": 698694211584.0, + "grad_norm": 0.08575724683449225, + "language_loss": 0.86948562, + "learning_rate": 0.0009814951708363993, + "loss": 0.88059855, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.35986328, + "step": 594, + "time_per_iteration": 2.851818084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259601, + "balance_loss_mlp": 1.23633182, + "epoch": 0.11446710273181993, + "flos": 1476387952128.0, + "grad_norm": 0.04120161092279284, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79250586, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.23242188, + "step": 595, + "time_per_iteration": 4.775157928466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107308, + "balance_loss_mlp": 1.07159305, + "epoch": 0.1146594844170835, + "flos": 494641603584.0, + "grad_norm": 0.06441778711855077, + "language_loss": 0.87857854, + "learning_rate": 0.0009813268533395648, + "loss": 0.8896516, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.35717773, + "step": 596, + "time_per_iteration": 2.5812032222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117343, + "balance_loss_mlp": 1.08096087, + "epoch": 0.11485186610234706, + "flos": 474596319744.0, + "grad_norm": 0.07680000680618568, + "language_loss": 0.87010378, + "learning_rate": 0.0009812424142733073, + "loss": 0.8812772, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.36401367, + "step": 597, + "time_per_iteration": 2.5546822547912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108076, + "balance_loss_mlp": 1.07212269, + "epoch": 0.11504424778761062, + "flos": 730858014720.0, + "grad_norm": 0.05681390422854521, + "language_loss": 0.8607024, + "learning_rate": 0.000981157788372175, + "loss": 0.87178314, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.35961914, + "step": 598, + "time_per_iteration": 3.0337140560150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111428, + "balance_loss_mlp": 1.07851696, + "epoch": 0.11523662947287418, + "flos": 545539788288.0, + "grad_norm": 0.06941688855783729, + "language_loss": 0.89018178, + "learning_rate": 0.0009810729756690223, + "loss": 0.90132457, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.35791016, + "step": 599, + "time_per_iteration": 2.7217423915863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105745, + "balance_loss_mlp": 1.06981504, + "epoch": 0.11542901115813775, + "flos": 774737436672.0, + "grad_norm": 0.06146114558588388, + "language_loss": 0.91738331, + "learning_rate": 0.0009809879761967766, + "loss": 0.92844075, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.35961914, + "step": 600, + "time_per_iteration": 2.9604732990264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111848, + "balance_loss_mlp": 1.08178735, + "epoch": 0.11562139284340131, + "flos": 730585972224.0, + "grad_norm": 0.09570347165582511, + "language_loss": 0.86368775, + "learning_rate": 0.0009809027899884378, + "loss": 0.87487245, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.36669922, + "step": 601, + "time_per_iteration": 2.9237759113311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114897, + "balance_loss_mlp": 1.07787061, + "epoch": 0.11581377452866487, + "flos": 535589148672.0, + "grad_norm": 0.05752007897304988, + "language_loss": 0.88791043, + "learning_rate": 0.0009808174170770779, + "loss": 0.89905941, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.37036133, + "step": 602, + "time_per_iteration": 2.8171939849853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192093, + "balance_loss_mlp": 1.1680603, + "epoch": 0.11600615621392843, + "flos": 1554968613888.0, + "grad_norm": 0.017614530082332158, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86090338, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.24023438, + "step": 603, + "time_per_iteration": 4.935450077056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109032, + "balance_loss_mlp": 1.07360268, + "epoch": 0.116198537899192, + "flos": 537178037760.0, + "grad_norm": 0.08737735767926022, + "language_loss": 0.93595141, + "learning_rate": 0.0009806461112779462, + "loss": 0.94704169, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.35449219, + "step": 604, + "time_per_iteration": 2.644521951675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110331, + "balance_loss_mlp": 1.07454431, + "epoch": 0.11639091958445556, + "flos": 453970483200.0, + "grad_norm": 0.09922875403821595, + "language_loss": 0.8811909, + "learning_rate": 0.0009805601784566814, + "loss": 0.89229423, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.3581543, + "step": 605, + "time_per_iteration": 2.4736506938934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107193, + "balance_loss_mlp": 1.07209802, + "epoch": 0.11658330126971912, + "flos": 554815332864.0, + "grad_norm": 0.08013857685507157, + "language_loss": 0.95075512, + "learning_rate": 0.0009804740590654089, + "loss": 0.9618271, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.35131836, + "step": 606, + "time_per_iteration": 2.665424346923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121356, + "balance_loss_mlp": 1.08540201, + "epoch": 0.11677568295498268, + "flos": 716025968640.0, + "grad_norm": 0.09308217257663119, + "language_loss": 0.89792109, + "learning_rate": 0.0009803877531375635, + "loss": 0.90913463, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.359375, + "step": 607, + "time_per_iteration": 2.854362964630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123257, + "balance_loss_mlp": 1.08725595, + "epoch": 0.11696806464024626, + "flos": 609474668544.0, + "grad_norm": 0.12019278373574431, + "language_loss": 0.90837669, + "learning_rate": 0.0009803012607066523, + "loss": 0.91960925, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.36035156, + "step": 608, + "time_per_iteration": 2.7351131439208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132428, + "balance_loss_mlp": 1.0963558, + "epoch": 0.11716044632550981, + "flos": 520127087616.0, + "grad_norm": 0.06325710240785508, + "language_loss": 0.89651906, + "learning_rate": 0.0009802145818062543, + "loss": 0.90784335, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.36083984, + "step": 609, + "time_per_iteration": 2.706399440765381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126801, + "balance_loss_mlp": 1.09060943, + "epoch": 0.11735282801077337, + "flos": 507246133248.0, + "grad_norm": 0.08665503616765245, + "language_loss": 0.91646838, + "learning_rate": 0.0009801277164700212, + "loss": 0.9277364, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.36230469, + "step": 610, + "time_per_iteration": 2.591233730316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116515, + "balance_loss_mlp": 1.08137226, + "epoch": 0.11754520969603693, + "flos": 686339965440.0, + "grad_norm": 0.07536960859650275, + "language_loss": 0.8969053, + "learning_rate": 0.0009800406647316776, + "loss": 0.90807045, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.35180664, + "step": 611, + "time_per_iteration": 2.8590939044952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199931, + "balance_loss_mlp": 1.17360973, + "epoch": 0.1177375913813005, + "flos": 1541673022464.0, + "grad_norm": 0.02828241364524735, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.7811439, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.26367188, + "step": 612, + "time_per_iteration": 4.794836759567261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126093, + "balance_loss_mlp": 1.08999705, + "epoch": 0.11792997306656407, + "flos": 520269682176.0, + "grad_norm": 0.07086643363198573, + "language_loss": 0.88838685, + "learning_rate": 0.000979866002183916, + "loss": 0.89964771, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.36132812, + "step": 613, + "time_per_iteration": 2.6570141315460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113543, + "balance_loss_mlp": 1.07711244, + "epoch": 0.11812235475182763, + "flos": 665980379136.0, + "grad_norm": 0.0718552990374983, + "language_loss": 0.89756042, + "learning_rate": 0.0009797783914423082, + "loss": 0.90869588, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.36425781, + "step": 614, + "time_per_iteration": 2.8077588081359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104148, + "balance_loss_mlp": 1.06867135, + "epoch": 0.11831473643709119, + "flos": 621021399552.0, + "grad_norm": 0.06673690234795807, + "language_loss": 0.84267712, + "learning_rate": 0.0009796905944342094, + "loss": 0.85371858, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.35498047, + "step": 615, + "time_per_iteration": 2.848975896835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109381, + "balance_loss_mlp": 1.07271254, + "epoch": 0.11850711812235475, + "flos": 456438108672.0, + "grad_norm": 0.05638104592328917, + "language_loss": 0.88746947, + "learning_rate": 0.0009796026111937057, + "loss": 0.89856327, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.36645508, + "step": 616, + "time_per_iteration": 2.6446924209594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099952, + "balance_loss_mlp": 1.06347418, + "epoch": 0.11869949980761832, + "flos": 513598565376.0, + "grad_norm": 0.0626967176734064, + "language_loss": 0.88544255, + "learning_rate": 0.0009795144417549552, + "loss": 0.89644206, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.36474609, + "step": 617, + "time_per_iteration": 2.69419527053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103508, + "balance_loss_mlp": 1.0669111, + "epoch": 0.11889188149288188, + "flos": 534732171264.0, + "grad_norm": 0.05994069078035177, + "language_loss": 0.89591199, + "learning_rate": 0.0009794260861521883, + "loss": 0.90694714, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.36621094, + "step": 618, + "time_per_iteration": 2.771303653717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098344, + "balance_loss_mlp": 1.06262898, + "epoch": 0.11908426317814544, + "flos": 498344527872.0, + "grad_norm": 0.09079788596459537, + "language_loss": 0.86586368, + "learning_rate": 0.0009793375444197075, + "loss": 0.87684715, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.35742188, + "step": 619, + "time_per_iteration": 2.6239778995513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103508, + "balance_loss_mlp": 1.06724489, + "epoch": 0.119276644863409, + "flos": 659598833664.0, + "grad_norm": 0.07776663130635876, + "language_loss": 0.84681749, + "learning_rate": 0.000979248816591888, + "loss": 0.85785258, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.36254883, + "step": 620, + "time_per_iteration": 2.7932288646698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106745, + "balance_loss_mlp": 1.07043433, + "epoch": 0.11946902654867257, + "flos": 758396487168.0, + "grad_norm": 0.06665125523581683, + "language_loss": 0.85644066, + "learning_rate": 0.0009791599027031766, + "loss": 0.86750811, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.36303711, + "step": 621, + "time_per_iteration": 3.0138871669769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108028, + "balance_loss_mlp": 1.0721699, + "epoch": 0.11966140823393613, + "flos": 680697533952.0, + "grad_norm": 0.06722173914854768, + "language_loss": 0.85452718, + "learning_rate": 0.0009790708027880932, + "loss": 0.86560744, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.359375, + "step": 622, + "time_per_iteration": 2.8520967960357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217123, + "balance_loss_mlp": 1.192518, + "epoch": 0.11985378991919969, + "flos": 1450268070912.0, + "grad_norm": 0.04692620020290901, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78644413, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.24511719, + "step": 623, + "time_per_iteration": 4.820342302322388 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117536, + "balance_loss_mlp": 1.08251202, + "epoch": 0.12004617160446325, + "flos": 527586780672.0, + "grad_norm": 0.0795104629545964, + "language_loss": 0.93134129, + "learning_rate": 0.0009788920450172487, + "loss": 0.94251657, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.35058594, + "step": 624, + "time_per_iteration": 2.617030143737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112825, + "balance_loss_mlp": 1.09265435, + "epoch": 0.12023855328972682, + "flos": 473980861440.0, + "grad_norm": 0.07884849751459712, + "language_loss": 0.90174961, + "learning_rate": 0.0009788023872308875, + "loss": 0.91303217, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.35620117, + "step": 625, + "time_per_iteration": 2.5254392623901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218941, + "balance_loss_mlp": 1.19519401, + "epoch": 0.12043093497499038, + "flos": 1530954155520.0, + "grad_norm": 0.02704118444179952, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76647937, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.23730469, + "step": 626, + "time_per_iteration": 4.7286646366119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114575, + "balance_loss_mlp": 1.07936025, + "epoch": 0.12062331666025394, + "flos": 539571469824.0, + "grad_norm": 0.06954804859514781, + "language_loss": 0.9379338, + "learning_rate": 0.0009786225140303285, + "loss": 0.94907951, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.35253906, + "step": 627, + "time_per_iteration": 2.648557424545288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117384, + "balance_loss_mlp": 1.08155024, + "epoch": 0.1208156983455175, + "flos": 511634327040.0, + "grad_norm": 0.07877419782543724, + "language_loss": 0.91490531, + "learning_rate": 0.0009785322986859634, + "loss": 0.92607915, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.35864258, + "step": 628, + "time_per_iteration": 2.7282159328460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125787, + "balance_loss_mlp": 1.09014332, + "epoch": 0.12100808003078108, + "flos": 596195043840.0, + "grad_norm": 0.07794762914430453, + "language_loss": 0.92512405, + "learning_rate": 0.0009784418975588838, + "loss": 0.936382, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.35668945, + "step": 629, + "time_per_iteration": 2.709716320037842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117597, + "balance_loss_mlp": 1.08099949, + "epoch": 0.12120046171604464, + "flos": 522698019840.0, + "grad_norm": 0.06704717834334661, + "language_loss": 0.92910212, + "learning_rate": 0.0009783513106841862, + "loss": 0.94027811, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.3659668, + "step": 630, + "time_per_iteration": 2.7247745990753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268595, + "balance_loss_mlp": 1.24303675, + "epoch": 0.1213928434013082, + "flos": 1553605277184.0, + "grad_norm": 0.050831706918094084, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.78001297, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.25585938, + "step": 631, + "time_per_iteration": 4.973435163497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108872, + "balance_loss_mlp": 1.07263255, + "epoch": 0.12158522508657175, + "flos": 495143580672.0, + "grad_norm": 0.05936012058015608, + "language_loss": 0.87115383, + "learning_rate": 0.0009781695798326854, + "loss": 0.88224256, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.36303711, + "step": 632, + "time_per_iteration": 2.6066951751708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107124, + "balance_loss_mlp": 1.07109857, + "epoch": 0.12177760677183531, + "flos": 475335433728.0, + "grad_norm": 0.07579280109985519, + "language_loss": 0.87447512, + "learning_rate": 0.0009780784359264365, + "loss": 0.88554639, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.3605957, + "step": 633, + "time_per_iteration": 2.6627957820892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232879, + "balance_loss_mlp": 1.20541322, + "epoch": 0.12196998845709889, + "flos": 1467630351360.0, + "grad_norm": 0.035928730821781295, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75421578, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.27539062, + "step": 634, + "time_per_iteration": 4.774393796920776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097947, + "balance_loss_mlp": 1.06185055, + "epoch": 0.12216237014236245, + "flos": 586279309824.0, + "grad_norm": 0.06269897945868624, + "language_loss": 0.87202692, + "learning_rate": 0.000977895591329867, + "loss": 0.88300645, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.36108398, + "step": 635, + "time_per_iteration": 2.805889129638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108158, + "balance_loss_mlp": 1.0710839, + "epoch": 0.12235475182762601, + "flos": 597721324032.0, + "grad_norm": 0.0813284132777598, + "language_loss": 0.86332333, + "learning_rate": 0.000977803890710533, + "loss": 0.87440491, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.37060547, + "step": 636, + "time_per_iteration": 2.740208864212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104442, + "balance_loss_mlp": 1.06927526, + "epoch": 0.12254713351288957, + "flos": 497487550464.0, + "grad_norm": 0.05990721463683031, + "language_loss": 0.92840338, + "learning_rate": 0.0009777120045912774, + "loss": 0.93944776, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.35205078, + "step": 637, + "time_per_iteration": 2.599487543106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099731, + "balance_loss_mlp": 1.06246591, + "epoch": 0.12273951519815314, + "flos": 605565130752.0, + "grad_norm": 0.06926890859373311, + "language_loss": 0.89462954, + "learning_rate": 0.0009776199330077736, + "loss": 0.90562689, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.37231445, + "step": 638, + "time_per_iteration": 2.7127702236175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109285, + "balance_loss_mlp": 1.07263994, + "epoch": 0.1229318968834167, + "flos": 597578729472.0, + "grad_norm": 0.06829584029278382, + "language_loss": 0.91875821, + "learning_rate": 0.0009775276759957667, + "loss": 0.92985106, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.36645508, + "step": 639, + "time_per_iteration": 2.7092959880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109534, + "balance_loss_mlp": 1.07269859, + "epoch": 0.12312427856868026, + "flos": 678082931712.0, + "grad_norm": 0.08396579350539743, + "language_loss": 0.8972953, + "learning_rate": 0.0009774352335910745, + "loss": 0.90839064, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.36816406, + "step": 640, + "time_per_iteration": 2.810391664505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104708, + "balance_loss_mlp": 1.067729, + "epoch": 0.12331666025394382, + "flos": 608656978944.0, + "grad_norm": 0.07323302973942612, + "language_loss": 0.94222069, + "learning_rate": 0.000977342605829586, + "loss": 0.95326775, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.36962891, + "step": 641, + "time_per_iteration": 2.7107834815979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112179, + "balance_loss_mlp": 1.07624888, + "epoch": 0.12350904193920739, + "flos": 762172194816.0, + "grad_norm": 0.07665420533577341, + "language_loss": 0.85291827, + "learning_rate": 0.0009772497927472623, + "loss": 0.86404008, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.359375, + "step": 642, + "time_per_iteration": 3.0403058528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116924, + "balance_loss_mlp": 1.08006442, + "epoch": 0.12370142362447095, + "flos": 540699079680.0, + "grad_norm": 0.07222690714452404, + "language_loss": 0.84284675, + "learning_rate": 0.0009771567943801368, + "loss": 0.85401607, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.3684082, + "step": 643, + "time_per_iteration": 2.684351682662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112565, + "balance_loss_mlp": 1.07615817, + "epoch": 0.12389380530973451, + "flos": 547848852480.0, + "grad_norm": 0.07333206449495522, + "language_loss": 0.88927472, + "learning_rate": 0.0009770636107643152, + "loss": 0.9004004, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.36450195, + "step": 644, + "time_per_iteration": 2.697791337966919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124284, + "balance_loss_mlp": 1.0884738, + "epoch": 0.12408618699499807, + "flos": 540048715776.0, + "grad_norm": 0.07501614361753556, + "language_loss": 0.87213039, + "learning_rate": 0.0009769702419359738, + "loss": 0.88337326, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.35864258, + "step": 645, + "time_per_iteration": 2.614753246307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132185, + "balance_loss_mlp": 1.09604049, + "epoch": 0.12427856868026164, + "flos": 745451513856.0, + "grad_norm": 0.08258832766371556, + "language_loss": 0.88905025, + "learning_rate": 0.000976876687931362, + "loss": 0.90037215, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.36181641, + "step": 646, + "time_per_iteration": 2.9785215854644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125302, + "balance_loss_mlp": 1.08853781, + "epoch": 0.1244709503655252, + "flos": 533460556800.0, + "grad_norm": 0.0911173559535341, + "language_loss": 0.84276652, + "learning_rate": 0.0009767829487868005, + "loss": 0.85401952, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.36767578, + "step": 647, + "time_per_iteration": 2.578190326690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115883, + "balance_loss_mlp": 1.07911873, + "epoch": 0.12466333205078876, + "flos": 507847034880.0, + "grad_norm": 0.07020857762254842, + "language_loss": 0.88315135, + "learning_rate": 0.000976689024538682, + "loss": 0.89431018, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.36743164, + "step": 648, + "time_per_iteration": 2.6223652362823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111573, + "balance_loss_mlp": 1.07841754, + "epoch": 0.12485571373605232, + "flos": 681023420928.0, + "grad_norm": 0.08555408637061691, + "language_loss": 0.86419356, + "learning_rate": 0.0009765949152234716, + "loss": 0.87535083, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.37280273, + "step": 649, + "time_per_iteration": 2.882483959197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304556, + "balance_loss_mlp": 1.27480125, + "epoch": 0.1250480954213159, + "flos": 1329402668544.0, + "grad_norm": 0.07016402939707722, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79990637, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.296875, + "step": 650, + "time_per_iteration": 4.66938042640686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109485, + "balance_loss_mlp": 1.05882525, + "epoch": 0.12524047710657946, + "flos": 938140683264.0, + "grad_norm": 0.06927891842453628, + "language_loss": 0.81679136, + "learning_rate": 0.0009764061415379919, + "loss": 0.82773983, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.36035156, + "step": 651, + "time_per_iteration": 3.2698771953582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096413, + "balance_loss_mlp": 1.05874252, + "epoch": 0.12543285879184302, + "flos": 513642235392.0, + "grad_norm": 0.07412805631018828, + "language_loss": 0.88318801, + "learning_rate": 0.0009763114772410109, + "loss": 0.89415216, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.37646484, + "step": 652, + "time_per_iteration": 2.5928001403808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115398, + "balance_loss_mlp": 1.0775615, + "epoch": 0.12562524047710658, + "flos": 717991617024.0, + "grad_norm": 0.06901346528680578, + "language_loss": 0.85726613, + "learning_rate": 0.0009762166280235146, + "loss": 0.86842012, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.37817383, + "step": 653, + "time_per_iteration": 2.954763412475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135328, + "balance_loss_mlp": 1.0974437, + "epoch": 0.12581762216237014, + "flos": 563441923584.0, + "grad_norm": 0.10573688852470094, + "language_loss": 0.86465615, + "learning_rate": 0.0009761215939223267, + "loss": 0.87600946, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.37866211, + "step": 654, + "time_per_iteration": 2.715884208679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132858, + "balance_loss_mlp": 1.09599805, + "epoch": 0.1260100038476337, + "flos": 481642785792.0, + "grad_norm": 0.09937756240260763, + "language_loss": 0.85917866, + "learning_rate": 0.0009760263749743428, + "loss": 0.87050724, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.3684082, + "step": 655, + "time_per_iteration": 2.565927505493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115454, + "balance_loss_mlp": 1.07847536, + "epoch": 0.12620238553289725, + "flos": 575269461504.0, + "grad_norm": 0.07472608136964497, + "language_loss": 0.89487195, + "learning_rate": 0.0009759309712165299, + "loss": 0.90602648, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.36962891, + "step": 656, + "time_per_iteration": 2.721547842025757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095241, + "balance_loss_mlp": 1.06002665, + "epoch": 0.12639476721816084, + "flos": 530909973504.0, + "grad_norm": 0.06565081457641837, + "language_loss": 0.92494375, + "learning_rate": 0.0009758353826859272, + "loss": 0.9358961, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.3527832, + "step": 657, + "time_per_iteration": 2.6744871139526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095397, + "balance_loss_mlp": 1.05891895, + "epoch": 0.1265871489034244, + "flos": 689654393856.0, + "grad_norm": 0.09523432489761414, + "language_loss": 0.88095021, + "learning_rate": 0.0009757396094196456, + "loss": 0.89190418, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36499023, + "step": 658, + "time_per_iteration": 2.909353256225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103392, + "balance_loss_mlp": 1.06801057, + "epoch": 0.12677953058868796, + "flos": 536863735296.0, + "grad_norm": 0.06690202483268812, + "language_loss": 0.8320483, + "learning_rate": 0.0009756436514548673, + "loss": 0.84308219, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.35449219, + "step": 659, + "time_per_iteration": 2.865816831588745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096361, + "balance_loss_mlp": 1.06143236, + "epoch": 0.12697191227395152, + "flos": 518749194240.0, + "grad_norm": 0.06842887259152383, + "language_loss": 0.87790155, + "learning_rate": 0.0009755475088288466, + "loss": 0.88886517, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.34985352, + "step": 660, + "time_per_iteration": 2.727024793624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095382, + "balance_loss_mlp": 1.06145549, + "epoch": 0.12716429395921508, + "flos": 566341714944.0, + "grad_norm": 0.09688683984474739, + "language_loss": 0.89628965, + "learning_rate": 0.0009754511815789095, + "loss": 0.90724349, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.33959961, + "step": 661, + "time_per_iteration": 2.857279062271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099651, + "balance_loss_mlp": 1.06441295, + "epoch": 0.12735667564447864, + "flos": 513844466688.0, + "grad_norm": 0.0675215866547423, + "language_loss": 0.85062414, + "learning_rate": 0.0009753546697424533, + "loss": 0.86162066, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.3527832, + "step": 662, + "time_per_iteration": 2.670924425125122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112335, + "balance_loss_mlp": 1.07750201, + "epoch": 0.1275490573297422, + "flos": 541023556608.0, + "grad_norm": 0.0877117205425541, + "language_loss": 0.89430654, + "learning_rate": 0.0009752579733569475, + "loss": 0.90542984, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.34887695, + "step": 663, + "time_per_iteration": 2.708876609802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270721, + "balance_loss_mlp": 1.24678338, + "epoch": 0.12774143901500576, + "flos": 1557872787456.0, + "grad_norm": 0.04579657173262409, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7615211, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.23925781, + "step": 664, + "time_per_iteration": 4.956411123275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112296, + "balance_loss_mlp": 1.07724893, + "epoch": 0.12793382070026935, + "flos": 613462781952.0, + "grad_norm": 0.07589772420679435, + "language_loss": 0.88920283, + "learning_rate": 0.0009750640270890217, + "loss": 0.90032578, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.35083008, + "step": 665, + "time_per_iteration": 2.7128844261169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111829, + "balance_loss_mlp": 1.08357668, + "epoch": 0.1281262023855329, + "flos": 707386231296.0, + "grad_norm": 0.09170618066625874, + "language_loss": 0.9529534, + "learning_rate": 0.0009749667772818983, + "loss": 0.9641363, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.34765625, + "step": 666, + "time_per_iteration": 3.001779794692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119074, + "balance_loss_mlp": 1.16718388, + "epoch": 0.12831858407079647, + "flos": 1424250086400.0, + "grad_norm": 0.026171542208985103, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78126681, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.23535156, + "step": 667, + "time_per_iteration": 4.816860914230347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097707, + "balance_loss_mlp": 1.06239688, + "epoch": 0.12851096575606002, + "flos": 448869316608.0, + "grad_norm": 0.08174433959814813, + "language_loss": 0.94348264, + "learning_rate": 0.0009747717245101093, + "loss": 0.95445979, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.35351562, + "step": 668, + "time_per_iteration": 2.5237252712249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092064, + "balance_loss_mlp": 1.05851901, + "epoch": 0.12870334744132358, + "flos": 479697486336.0, + "grad_norm": 0.09843416488997592, + "language_loss": 0.84683162, + "learning_rate": 0.00097467392162117, + "loss": 0.85775226, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.33544922, + "step": 669, + "time_per_iteration": 2.6030120849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103326, + "balance_loss_mlp": 1.06987596, + "epoch": 0.12889572912658714, + "flos": 638633963520.0, + "grad_norm": 0.06975318327908253, + "language_loss": 0.90683615, + "learning_rate": 0.0009745759344474708, + "loss": 0.91786939, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.3347168, + "step": 670, + "time_per_iteration": 2.81622576713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112132, + "balance_loss_mlp": 1.08779824, + "epoch": 0.1290881108118507, + "flos": 509693409792.0, + "grad_norm": 0.09191121702256037, + "language_loss": 0.88668084, + "learning_rate": 0.0009744777630270536, + "loss": 0.89789402, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.33544922, + "step": 671, + "time_per_iteration": 2.573746681213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131545, + "balance_loss_mlp": 1.09673548, + "epoch": 0.12928049249711426, + "flos": 670746894336.0, + "grad_norm": 0.0798229463492689, + "language_loss": 0.92632008, + "learning_rate": 0.000974379407398032, + "loss": 0.93763554, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.34863281, + "step": 672, + "time_per_iteration": 2.8804330825805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128596, + "balance_loss_mlp": 1.09471667, + "epoch": 0.12947287418237785, + "flos": 793158925824.0, + "grad_norm": 0.060594592327224854, + "language_loss": 0.81539643, + "learning_rate": 0.0009742808675985913, + "loss": 0.82668233, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.33911133, + "step": 673, + "time_per_iteration": 3.093003273010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144697, + "balance_loss_mlp": 1.11019778, + "epoch": 0.1296652558676414, + "flos": 485222054400.0, + "grad_norm": 0.09187527541403225, + "language_loss": 0.90132761, + "learning_rate": 0.0009741821436669876, + "loss": 0.91277468, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.34521484, + "step": 674, + "time_per_iteration": 2.585315227508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122846, + "balance_loss_mlp": 1.08925223, + "epoch": 0.12985763755290497, + "flos": 453226987008.0, + "grad_norm": 0.08498532425721701, + "language_loss": 0.91794449, + "learning_rate": 0.0009740832356415492, + "loss": 0.92917299, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.3359375, + "step": 675, + "time_per_iteration": 2.4971120357513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112081, + "balance_loss_mlp": 1.08714533, + "epoch": 0.13005001923816853, + "flos": 824719007232.0, + "grad_norm": 0.07677288344190451, + "language_loss": 0.87289226, + "learning_rate": 0.0009739841435606756, + "loss": 0.88410038, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.33691406, + "step": 676, + "time_per_iteration": 3.04789137840271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110492, + "balance_loss_mlp": 1.07670832, + "epoch": 0.1302424009234321, + "flos": 531107822592.0, + "grad_norm": 0.05631932912809994, + "language_loss": 0.89408028, + "learning_rate": 0.0009738848674628377, + "loss": 0.90518522, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.33789062, + "step": 677, + "time_per_iteration": 2.7033560276031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115712, + "balance_loss_mlp": 1.08161807, + "epoch": 0.13043478260869565, + "flos": 525626924544.0, + "grad_norm": 0.06061927769746001, + "language_loss": 0.88112855, + "learning_rate": 0.000973785407386578, + "loss": 0.8922857, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.34130859, + "step": 678, + "time_per_iteration": 2.7593955993652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111886, + "balance_loss_mlp": 1.07671893, + "epoch": 0.1306271642939592, + "flos": 625862108160.0, + "grad_norm": 0.0561156652888081, + "language_loss": 0.86748564, + "learning_rate": 0.0009736857633705103, + "loss": 0.87860453, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.35180664, + "step": 679, + "time_per_iteration": 2.859600067138672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104313, + "balance_loss_mlp": 1.07002795, + "epoch": 0.13081954597922277, + "flos": 550438723584.0, + "grad_norm": 0.058910355701146846, + "language_loss": 0.92178285, + "learning_rate": 0.0009735859354533196, + "loss": 0.93282604, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.34301758, + "step": 680, + "time_per_iteration": 2.7124130725860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097833, + "balance_loss_mlp": 1.06321418, + "epoch": 0.13101192766448633, + "flos": 536651329536.0, + "grad_norm": 0.0839399897160516, + "language_loss": 0.91048056, + "learning_rate": 0.0009734859236737628, + "loss": 0.92145896, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.34643555, + "step": 681, + "time_per_iteration": 2.627246856689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097126, + "balance_loss_mlp": 1.06102967, + "epoch": 0.13120430934974991, + "flos": 503258019840.0, + "grad_norm": 0.07457249787820815, + "language_loss": 0.92922121, + "learning_rate": 0.0009733857280706678, + "loss": 0.94019246, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.36108398, + "step": 682, + "time_per_iteration": 2.656088352203369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011015, + "balance_loss_mlp": 1.06669104, + "epoch": 0.13139669103501347, + "flos": 614014221312.0, + "grad_norm": 0.08799075641073119, + "language_loss": 0.83452725, + "learning_rate": 0.000973285348682934, + "loss": 0.84554225, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.34838867, + "step": 683, + "time_per_iteration": 2.714932441711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250838, + "balance_loss_mlp": 1.22547078, + "epoch": 0.13158907272027703, + "flos": 1484163357696.0, + "grad_norm": 0.05910904833943088, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.7914921, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.25390625, + "step": 684, + "time_per_iteration": 4.823149681091309 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102069, + "balance_loss_mlp": 1.06754637, + "epoch": 0.1317814544055406, + "flos": 985049344512.0, + "grad_norm": 0.06093749611395137, + "language_loss": 0.84928876, + "learning_rate": 0.0009730840387095046, + "loss": 0.86030942, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.34570312, + "step": 685, + "time_per_iteration": 3.2810635566711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112887, + "balance_loss_mlp": 1.07876921, + "epoch": 0.13197383609080415, + "flos": 611163892224.0, + "grad_norm": 0.0719979787644836, + "language_loss": 0.90753949, + "learning_rate": 0.0009729831082019642, + "loss": 0.91866839, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.34155273, + "step": 686, + "time_per_iteration": 2.8016421794891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121765, + "balance_loss_mlp": 1.08740878, + "epoch": 0.1321662177760677, + "flos": 494116305408.0, + "grad_norm": 0.06743381273529321, + "language_loss": 0.88199198, + "learning_rate": 0.0009728819940660958, + "loss": 0.89320958, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.34375, + "step": 687, + "time_per_iteration": 2.753110885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123642, + "balance_loss_mlp": 1.08966768, + "epoch": 0.13235859946133127, + "flos": 495591713280.0, + "grad_norm": 0.07411002639607889, + "language_loss": 0.84702134, + "learning_rate": 0.0009727806963411557, + "loss": 0.85825777, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.34008789, + "step": 688, + "time_per_iteration": 2.638277292251587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118088, + "balance_loss_mlp": 1.08342147, + "epoch": 0.13255098114659483, + "flos": 511417539072.0, + "grad_norm": 0.07589947069642403, + "language_loss": 0.86972356, + "learning_rate": 0.000972679215066471, + "loss": 0.88090444, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.34692383, + "step": 689, + "time_per_iteration": 2.6977994441986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102626, + "balance_loss_mlp": 1.06865191, + "epoch": 0.13274336283185842, + "flos": 547114120704.0, + "grad_norm": 0.07819243817703804, + "language_loss": 0.98617494, + "learning_rate": 0.0009725775502814401, + "loss": 0.99720132, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.33984375, + "step": 690, + "time_per_iteration": 2.648946523666382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094739, + "balance_loss_mlp": 1.05864239, + "epoch": 0.13293574451712198, + "flos": 640465781760.0, + "grad_norm": 0.059114915842817355, + "language_loss": 0.84878647, + "learning_rate": 0.0009724757020255327, + "loss": 0.85973388, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.36108398, + "step": 691, + "time_per_iteration": 2.8732690811157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082897, + "balance_loss_mlp": 1.04782593, + "epoch": 0.13312812620238554, + "flos": 491234042880.0, + "grad_norm": 0.07438205452368939, + "language_loss": 0.87005877, + "learning_rate": 0.0009723736703382902, + "loss": 0.88088775, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.35107422, + "step": 692, + "time_per_iteration": 2.554645299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107941, + "balance_loss_mlp": 1.04352796, + "epoch": 0.1333205078876491, + "flos": 508693837824.0, + "grad_norm": 0.08618570028449021, + "language_loss": 0.82726276, + "learning_rate": 0.0009722714552593244, + "loss": 0.8380568, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.35888672, + "step": 693, + "time_per_iteration": 2.6300699710845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108349, + "balance_loss_mlp": 1.04763222, + "epoch": 0.13351288957291266, + "flos": 418474722816.0, + "grad_norm": 0.09336455895373029, + "language_loss": 0.93701726, + "learning_rate": 0.000972169056828319, + "loss": 0.94785213, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.35864258, + "step": 694, + "time_per_iteration": 2.4744653701782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089355, + "balance_loss_mlp": 1.05309105, + "epoch": 0.13370527125817622, + "flos": 615614694912.0, + "grad_norm": 0.09775538219544704, + "language_loss": 0.87267971, + "learning_rate": 0.0009720664750850283, + "loss": 0.88357329, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.36279297, + "step": 695, + "time_per_iteration": 2.819199562072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087558, + "balance_loss_mlp": 1.05196249, + "epoch": 0.13389765294343978, + "flos": 625757391360.0, + "grad_norm": 0.08995446617022443, + "language_loss": 0.92670894, + "learning_rate": 0.0009719637100692784, + "loss": 0.93758452, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.35644531, + "step": 696, + "time_per_iteration": 2.710566997528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089346, + "balance_loss_mlp": 1.05460882, + "epoch": 0.13409003462870334, + "flos": 609391710720.0, + "grad_norm": 0.07471473065547057, + "language_loss": 0.82606006, + "learning_rate": 0.0009718607618209661, + "loss": 0.83695352, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.34765625, + "step": 697, + "time_per_iteration": 2.860895872116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100417, + "balance_loss_mlp": 1.06677604, + "epoch": 0.13428241631396692, + "flos": 683499810816.0, + "grad_norm": 0.06757273414028586, + "language_loss": 0.87573737, + "learning_rate": 0.0009717576303800595, + "loss": 0.88674152, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.33666992, + "step": 698, + "time_per_iteration": 3.044128894805908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105218, + "balance_loss_mlp": 1.07102871, + "epoch": 0.13447479799923048, + "flos": 508565799936.0, + "grad_norm": 0.06392403589518669, + "language_loss": 0.85563833, + "learning_rate": 0.0009716543157865975, + "loss": 0.86669052, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.34228516, + "step": 699, + "time_per_iteration": 2.6879220008850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124277, + "balance_loss_mlp": 1.08968258, + "epoch": 0.13466717968449404, + "flos": 897124737024.0, + "grad_norm": 0.10281325358067626, + "language_loss": 0.83577156, + "learning_rate": 0.0009715508180806907, + "loss": 0.84701437, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.34643555, + "step": 700, + "time_per_iteration": 3.1908302307128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132528, + "balance_loss_mlp": 1.09848189, + "epoch": 0.1348595613697576, + "flos": 989501557248.0, + "grad_norm": 0.07337445630948206, + "language_loss": 0.89328271, + "learning_rate": 0.0009714471373025202, + "loss": 0.90460801, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.34082031, + "step": 701, + "time_per_iteration": 3.438918113708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121396, + "balance_loss_mlp": 1.08704007, + "epoch": 0.13505194305502116, + "flos": 487580580864.0, + "grad_norm": 0.06971370423164719, + "language_loss": 0.88653499, + "learning_rate": 0.0009713432734923386, + "loss": 0.89774895, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.34399414, + "step": 702, + "time_per_iteration": 2.640204668045044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118965, + "balance_loss_mlp": 1.08372688, + "epoch": 0.13524432474028472, + "flos": 613103399424.0, + "grad_norm": 0.06937758634579687, + "language_loss": 0.8635335, + "learning_rate": 0.0009712392266904696, + "loss": 0.87472308, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.3527832, + "step": 703, + "time_per_iteration": 2.7081639766693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107456, + "balance_loss_mlp": 1.07381546, + "epoch": 0.13543670642554828, + "flos": 904425868800.0, + "grad_norm": 0.059624368341773884, + "language_loss": 0.8470363, + "learning_rate": 0.0009711349969373076, + "loss": 0.8581109, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.33666992, + "step": 704, + "time_per_iteration": 3.185788154602051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120367, + "balance_loss_mlp": 1.08629751, + "epoch": 0.13562908811081184, + "flos": 550335416832.0, + "grad_norm": 0.06837289886431508, + "language_loss": 0.80139232, + "learning_rate": 0.0009710305842733178, + "loss": 0.81259602, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.34106445, + "step": 705, + "time_per_iteration": 2.7622249126434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119894, + "balance_loss_mlp": 1.08534753, + "epoch": 0.1358214697960754, + "flos": 507797572608.0, + "grad_norm": 0.07938339172549091, + "language_loss": 0.89516854, + "learning_rate": 0.0009709259887390373, + "loss": 0.90636754, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.34570312, + "step": 706, + "time_per_iteration": 2.5919415950775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112775, + "balance_loss_mlp": 1.09141469, + "epoch": 0.136013851481339, + "flos": 528640197120.0, + "grad_norm": 0.10398540964391637, + "language_loss": 0.90775406, + "learning_rate": 0.0009708212103750737, + "loss": 0.9190315, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.36328125, + "step": 707, + "time_per_iteration": 2.601414680480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118489, + "balance_loss_mlp": 1.0827502, + "epoch": 0.13620623316660255, + "flos": 658772379648.0, + "grad_norm": 0.10289617102375577, + "language_loss": 0.87215245, + "learning_rate": 0.0009707162492221051, + "loss": 0.88333738, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.35766602, + "step": 708, + "time_per_iteration": 2.9150781631469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107244, + "balance_loss_mlp": 1.07193458, + "epoch": 0.1363986148518661, + "flos": 671583522816.0, + "grad_norm": 0.07053364895365258, + "language_loss": 0.88057113, + "learning_rate": 0.0009706111053208815, + "loss": 0.89164358, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.35375977, + "step": 709, + "time_per_iteration": 2.8282904624938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104491, + "balance_loss_mlp": 1.06801295, + "epoch": 0.13659099653712967, + "flos": 472828520448.0, + "grad_norm": 0.06130049777218646, + "language_loss": 0.85717642, + "learning_rate": 0.0009705057787122232, + "loss": 0.86822134, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.36499023, + "step": 710, + "time_per_iteration": 2.577875852584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115861, + "balance_loss_mlp": 1.07890666, + "epoch": 0.13678337822239323, + "flos": 452483490816.0, + "grad_norm": 0.06671527486676954, + "language_loss": 0.91032815, + "learning_rate": 0.0009704002694370216, + "loss": 0.92148674, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.36962891, + "step": 711, + "time_per_iteration": 2.5226385593414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113129, + "balance_loss_mlp": 1.09509826, + "epoch": 0.13697575990765679, + "flos": 519373416960.0, + "grad_norm": 0.06767720569390717, + "language_loss": 0.8601349, + "learning_rate": 0.0009702945775362388, + "loss": 0.8714478, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.36206055, + "step": 712, + "time_per_iteration": 2.6134419441223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129624, + "balance_loss_mlp": 1.09214449, + "epoch": 0.13716814159292035, + "flos": 480145618944.0, + "grad_norm": 0.06923332159298135, + "language_loss": 0.86543357, + "learning_rate": 0.0009701887030509086, + "loss": 0.87672985, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.37426758, + "step": 713, + "time_per_iteration": 2.6801493167877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123971, + "balance_loss_mlp": 1.08735013, + "epoch": 0.1373605232781839, + "flos": 545376844800.0, + "grad_norm": 0.08447530320779993, + "language_loss": 0.90941691, + "learning_rate": 0.0009700826460221346, + "loss": 0.92065662, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.36645508, + "step": 714, + "time_per_iteration": 2.6499831676483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124692, + "balance_loss_mlp": 1.0878799, + "epoch": 0.1375529049634475, + "flos": 708473143296.0, + "grad_norm": 0.08158263793675288, + "language_loss": 0.92094153, + "learning_rate": 0.0009699764064910921, + "loss": 0.93218845, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.36816406, + "step": 715, + "time_per_iteration": 2.8663330078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100593, + "balance_loss_mlp": 1.0652591, + "epoch": 0.13774528664871105, + "flos": 486452971008.0, + "grad_norm": 0.0638700652453299, + "language_loss": 0.86489999, + "learning_rate": 0.0009698699844990268, + "loss": 0.87590599, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.35351562, + "step": 716, + "time_per_iteration": 2.680769443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097936, + "balance_loss_mlp": 1.06236374, + "epoch": 0.1379376683339746, + "flos": 679885636608.0, + "grad_norm": 0.06268585455781102, + "language_loss": 0.87917447, + "learning_rate": 0.0009697633800872555, + "loss": 0.89015377, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.35595703, + "step": 717, + "time_per_iteration": 2.965280532836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095044, + "balance_loss_mlp": 1.05956769, + "epoch": 0.13813005001923817, + "flos": 610628419584.0, + "grad_norm": 0.06824665625382514, + "language_loss": 0.9079777, + "learning_rate": 0.0009696565932971655, + "loss": 0.91892809, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.35498047, + "step": 718, + "time_per_iteration": 2.896911144256592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089845, + "balance_loss_mlp": 1.05451119, + "epoch": 0.13832243170450173, + "flos": 588431222784.0, + "grad_norm": 0.09498294885790176, + "language_loss": 0.89284754, + "learning_rate": 0.0009695496241702153, + "loss": 0.90374601, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.35375977, + "step": 719, + "time_per_iteration": 2.7762036323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100053, + "balance_loss_mlp": 1.0647912, + "epoch": 0.1385148133897653, + "flos": 699674844672.0, + "grad_norm": 0.06645840883514359, + "language_loss": 0.85660797, + "learning_rate": 0.0009694424727479339, + "loss": 0.86760849, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.3527832, + "step": 720, + "time_per_iteration": 2.899481773376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105877, + "balance_loss_mlp": 1.06997156, + "epoch": 0.13870719507502885, + "flos": 597977399808.0, + "grad_norm": 0.0836580120862117, + "language_loss": 0.88687581, + "learning_rate": 0.0009693351390719213, + "loss": 0.89793456, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.35913086, + "step": 721, + "time_per_iteration": 2.7242469787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116308, + "balance_loss_mlp": 1.08071184, + "epoch": 0.1388995767602924, + "flos": 586279309824.0, + "grad_norm": 0.0677561083547336, + "language_loss": 0.90886325, + "learning_rate": 0.000969227623183848, + "loss": 0.9200263, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.35595703, + "step": 722, + "time_per_iteration": 2.819762706756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122367, + "balance_loss_mlp": 1.08719993, + "epoch": 0.139091958445556, + "flos": 650810709504.0, + "grad_norm": 0.06096675577850975, + "language_loss": 0.9079504, + "learning_rate": 0.0009691199251254554, + "loss": 0.91917408, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.35180664, + "step": 723, + "time_per_iteration": 2.9057154655456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111876, + "balance_loss_mlp": 1.08368921, + "epoch": 0.13928434013081956, + "flos": 575446961664.0, + "grad_norm": 0.07869545166834224, + "language_loss": 0.86502081, + "learning_rate": 0.0009690120449385555, + "loss": 0.87620842, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.35107422, + "step": 724, + "time_per_iteration": 2.753779411315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117404, + "balance_loss_mlp": 1.08164096, + "epoch": 0.13947672181608312, + "flos": 562954503168.0, + "grad_norm": 0.05745765153927115, + "language_loss": 0.92949581, + "learning_rate": 0.0009689039826650312, + "loss": 0.94066983, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.35791016, + "step": 725, + "time_per_iteration": 2.7707176208496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01358579, + "balance_loss_mlp": 1.33788455, + "epoch": 0.13966910350134668, + "flos": 1520699387904.0, + "grad_norm": 0.08980106345901108, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77881646, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.20703125, + "step": 726, + "time_per_iteration": 4.990100383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122131, + "balance_loss_mlp": 1.08632064, + "epoch": 0.13986148518661023, + "flos": 499604557824.0, + "grad_norm": 0.08882129772973828, + "language_loss": 0.8687858, + "learning_rate": 0.0009686873120259941, + "loss": 0.88000709, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.35839844, + "step": 727, + "time_per_iteration": 2.598994255065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124101, + "balance_loss_mlp": 1.08914924, + "epoch": 0.1400538668718738, + "flos": 598381862400.0, + "grad_norm": 0.060515823337661194, + "language_loss": 0.86860693, + "learning_rate": 0.0009685787037446004, + "loss": 0.879848, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.34985352, + "step": 728, + "time_per_iteration": 2.818753957748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117164, + "balance_loss_mlp": 1.08252215, + "epoch": 0.14024624855713735, + "flos": 593757941760.0, + "grad_norm": 0.07103959200550099, + "language_loss": 0.86954272, + "learning_rate": 0.0009684699135448201, + "loss": 0.88071442, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.34667969, + "step": 729, + "time_per_iteration": 2.7140605449676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117139, + "balance_loss_mlp": 1.08190084, + "epoch": 0.1404386302424009, + "flos": 506335311360.0, + "grad_norm": 0.05207553557344927, + "language_loss": 0.91554511, + "learning_rate": 0.0009683609414688895, + "loss": 0.92671645, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.3527832, + "step": 730, + "time_per_iteration": 2.700392961502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115792, + "balance_loss_mlp": 1.08076811, + "epoch": 0.14063101192766447, + "flos": 573132105216.0, + "grad_norm": 0.0649489891311747, + "language_loss": 0.85963869, + "learning_rate": 0.0009682517875591154, + "loss": 0.87079668, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.35058594, + "step": 731, + "time_per_iteration": 2.7288033962249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108316, + "balance_loss_mlp": 1.07329249, + "epoch": 0.14082339361292806, + "flos": 564333806592.0, + "grad_norm": 0.08055333626892905, + "language_loss": 0.8568505, + "learning_rate": 0.0009681424518578749, + "loss": 0.86793363, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.35058594, + "step": 732, + "time_per_iteration": 2.7607100009918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098228, + "balance_loss_mlp": 1.06337106, + "epoch": 0.14101577529819162, + "flos": 463336187904.0, + "grad_norm": 0.057006483972196494, + "language_loss": 0.87377727, + "learning_rate": 0.000968032934407616, + "loss": 0.8847596, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.34912109, + "step": 733, + "time_per_iteration": 2.5924746990203857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109135, + "balance_loss_mlp": 1.05708933, + "epoch": 0.14120815698345518, + "flos": 595791991296.0, + "grad_norm": 0.06839942690263572, + "language_loss": 0.81019294, + "learning_rate": 0.0009679232352508571, + "loss": 0.82110655, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.34301758, + "step": 734, + "time_per_iteration": 2.7993721961975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100792, + "balance_loss_mlp": 1.06455231, + "epoch": 0.14140053866871874, + "flos": 534864591360.0, + "grad_norm": 0.05863508932167985, + "language_loss": 0.80278933, + "learning_rate": 0.0009678133544301871, + "loss": 0.8137973, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.36254883, + "step": 735, + "time_per_iteration": 2.673874855041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094272, + "balance_loss_mlp": 1.05881953, + "epoch": 0.1415929203539823, + "flos": 520013606400.0, + "grad_norm": 0.05551108490857041, + "language_loss": 0.91367602, + "learning_rate": 0.0009677032919882658, + "loss": 0.92461878, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.35473633, + "step": 736, + "time_per_iteration": 2.654606580734253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096366, + "balance_loss_mlp": 1.06012654, + "epoch": 0.14178530203924586, + "flos": 482095300608.0, + "grad_norm": 0.07346959128329188, + "language_loss": 0.91181809, + "learning_rate": 0.000967593047967823, + "loss": 0.92278177, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.36230469, + "step": 737, + "time_per_iteration": 2.559713125228882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096247, + "balance_loss_mlp": 1.06096137, + "epoch": 0.14197768372450942, + "flos": 676339863552.0, + "grad_norm": 0.08415375039396082, + "language_loss": 0.86267197, + "learning_rate": 0.0009674826224116593, + "loss": 0.87363446, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.35302734, + "step": 738, + "time_per_iteration": 2.809206485748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097639, + "balance_loss_mlp": 1.06197131, + "epoch": 0.14217006540977298, + "flos": 445802199552.0, + "grad_norm": 0.07057178035488912, + "language_loss": 0.86339009, + "learning_rate": 0.0009673720153626455, + "loss": 0.87436646, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.35668945, + "step": 739, + "time_per_iteration": 2.612968683242798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104371, + "balance_loss_mlp": 1.06848931, + "epoch": 0.14236244709503657, + "flos": 496261016064.0, + "grad_norm": 0.07271668848978735, + "language_loss": 0.87052834, + "learning_rate": 0.0009672612268637235, + "loss": 0.88157207, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.35913086, + "step": 740, + "time_per_iteration": 2.61069393157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110664, + "balance_loss_mlp": 1.0753777, + "epoch": 0.14255482878030012, + "flos": 648022989312.0, + "grad_norm": 0.0891355718419961, + "language_loss": 0.84501529, + "learning_rate": 0.0009671502569579048, + "loss": 0.85612196, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.35302734, + "step": 741, + "time_per_iteration": 2.735647201538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110487, + "balance_loss_mlp": 1.07122874, + "epoch": 0.14274721046556368, + "flos": 535888894464.0, + "grad_norm": 0.08695556970227908, + "language_loss": 0.89623845, + "learning_rate": 0.0009670391056882719, + "loss": 0.90728712, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.33666992, + "step": 742, + "time_per_iteration": 2.7107605934143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112128, + "balance_loss_mlp": 1.07879674, + "epoch": 0.14293959215082724, + "flos": 956677215744.0, + "grad_norm": 0.07027307452403737, + "language_loss": 0.88442421, + "learning_rate": 0.0009669277730979776, + "loss": 0.89554548, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.33349609, + "step": 743, + "time_per_iteration": 3.188511371612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106141, + "balance_loss_mlp": 1.07295275, + "epoch": 0.1431319738360908, + "flos": 692766590976.0, + "grad_norm": 0.060274127994165407, + "language_loss": 0.85487998, + "learning_rate": 0.0009668162592302449, + "loss": 0.86594141, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.33203125, + "step": 744, + "time_per_iteration": 2.912363290786743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111089, + "balance_loss_mlp": 1.07715416, + "epoch": 0.14332435552135436, + "flos": 565174817280.0, + "grad_norm": 0.05989361998422495, + "language_loss": 0.86368543, + "learning_rate": 0.0009667045641283676, + "loss": 0.8747943, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.33764648, + "step": 745, + "time_per_iteration": 2.705873489379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105291, + "balance_loss_mlp": 1.07246089, + "epoch": 0.14351673720661792, + "flos": 738045665280.0, + "grad_norm": 0.07442691981713179, + "language_loss": 0.94493437, + "learning_rate": 0.0009665926878357092, + "loss": 0.95598727, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.32836914, + "step": 746, + "time_per_iteration": 2.941594362258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112013, + "balance_loss_mlp": 1.07865858, + "epoch": 0.14370911889188148, + "flos": 548951731200.0, + "grad_norm": 0.0692560914525881, + "language_loss": 0.91247988, + "learning_rate": 0.0009664806303957043, + "loss": 0.92359996, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.33374023, + "step": 747, + "time_per_iteration": 2.70877742767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112762, + "balance_loss_mlp": 1.0790261, + "epoch": 0.14390150057714507, + "flos": 589973469696.0, + "grad_norm": 0.06347995643195156, + "language_loss": 0.87284487, + "learning_rate": 0.0009663683918518571, + "loss": 0.88397241, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.33764648, + "step": 748, + "time_per_iteration": 2.9173765182495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128804, + "balance_loss_mlp": 1.09583056, + "epoch": 0.14409388226240863, + "flos": 590773782528.0, + "grad_norm": 0.07165520049303264, + "language_loss": 0.85690349, + "learning_rate": 0.0009662559722477428, + "loss": 0.8681916, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.32983398, + "step": 749, + "time_per_iteration": 2.6703925132751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293618, + "balance_loss_mlp": 1.26653337, + "epoch": 0.1442862639476722, + "flos": 1510418479104.0, + "grad_norm": 0.05750783583060037, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77456594, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.27148438, + "step": 750, + "time_per_iteration": 5.001406192779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114826, + "balance_loss_mlp": 1.11492896, + "epoch": 0.14447864563293575, + "flos": 496493770752.0, + "grad_norm": 0.0903406164143912, + "language_loss": 0.88906193, + "learning_rate": 0.0009660305900333632, + "loss": 0.90054452, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.33349609, + "step": 751, + "time_per_iteration": 2.6897666454315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151429, + "balance_loss_mlp": 1.11859906, + "epoch": 0.1446710273181993, + "flos": 589400271360.0, + "grad_norm": 0.07731756572669998, + "language_loss": 0.82109559, + "learning_rate": 0.0009659176275105992, + "loss": 0.83260989, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.32836914, + "step": 752, + "time_per_iteration": 2.7144923210144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156541, + "balance_loss_mlp": 1.12294829, + "epoch": 0.14486340900346287, + "flos": 585521256960.0, + "grad_norm": 0.08104938710710845, + "language_loss": 0.8584373, + "learning_rate": 0.0009658044841025701, + "loss": 0.87000269, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.33618164, + "step": 753, + "time_per_iteration": 2.7651891708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134874, + "balance_loss_mlp": 1.10116172, + "epoch": 0.14505579068872643, + "flos": 504405978624.0, + "grad_norm": 0.06446620792536047, + "language_loss": 0.80912805, + "learning_rate": 0.0009656911598532021, + "loss": 0.82047671, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.33740234, + "step": 754, + "time_per_iteration": 2.6575491428375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113793, + "balance_loss_mlp": 1.10345459, + "epoch": 0.14524817237399, + "flos": 486566452224.0, + "grad_norm": 0.0617560649750725, + "language_loss": 0.89835, + "learning_rate": 0.0009655776548064917, + "loss": 0.90972924, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.3449707, + "step": 755, + "time_per_iteration": 2.684224843978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133332, + "balance_loss_mlp": 1.100263, + "epoch": 0.14544055405925355, + "flos": 727857888768.0, + "grad_norm": 0.0723196770544797, + "language_loss": 0.88265425, + "learning_rate": 0.0009654639690065054, + "loss": 0.89398754, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.33081055, + "step": 756, + "time_per_iteration": 2.8975589275360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133271, + "balance_loss_mlp": 1.10063124, + "epoch": 0.14563293574451713, + "flos": 593359271424.0, + "grad_norm": 0.0666179485403068, + "language_loss": 0.87639153, + "learning_rate": 0.00096535010249738, + "loss": 0.88772416, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.32641602, + "step": 757, + "time_per_iteration": 2.7852935791015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118669, + "balance_loss_mlp": 1.08555305, + "epoch": 0.1458253174297807, + "flos": 560192924160.0, + "grad_norm": 0.06671579144124269, + "language_loss": 0.82458985, + "learning_rate": 0.0009652360553233224, + "loss": 0.83577645, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.33129883, + "step": 758, + "time_per_iteration": 2.790372610092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231318, + "balance_loss_mlp": 1.20690441, + "epoch": 0.14601769911504425, + "flos": 1557025984512.0, + "grad_norm": 0.06334391267713868, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.75005066, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.24414062, + "step": 759, + "time_per_iteration": 4.9441094398498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113265, + "balance_loss_mlp": 1.08062565, + "epoch": 0.1462100808003078, + "flos": 865922628096.0, + "grad_norm": 0.06716213865762054, + "language_loss": 0.81441242, + "learning_rate": 0.0009650074191575883, + "loss": 0.82554507, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.32641602, + "step": 760, + "time_per_iteration": 3.2887775897979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109667, + "balance_loss_mlp": 1.07664585, + "epoch": 0.14640246248557137, + "flos": 522673288704.0, + "grad_norm": 0.06510043774355635, + "language_loss": 0.85560381, + "learning_rate": 0.0009648928302546766, + "loss": 0.86670047, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.33032227, + "step": 761, + "time_per_iteration": 2.6996572017669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095513, + "balance_loss_mlp": 1.06308818, + "epoch": 0.14659484417083493, + "flos": 1030121805312.0, + "grad_norm": 0.06592560206527708, + "language_loss": 0.85148716, + "learning_rate": 0.0009647780608643613, + "loss": 0.86244226, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.32421875, + "step": 762, + "time_per_iteration": 3.3860111236572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101334, + "balance_loss_mlp": 1.06843269, + "epoch": 0.1467872258560985, + "flos": 500426629632.0, + "grad_norm": 0.08422515931666542, + "language_loss": 0.87252343, + "learning_rate": 0.0009646631110312001, + "loss": 0.88353688, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.32910156, + "step": 763, + "time_per_iteration": 2.62996506690979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097579, + "balance_loss_mlp": 1.06455803, + "epoch": 0.14697960754136205, + "flos": 547514201088.0, + "grad_norm": 0.05843071383105212, + "language_loss": 0.88439989, + "learning_rate": 0.0009645479807998203, + "loss": 0.89537567, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.33032227, + "step": 764, + "time_per_iteration": 2.7762649059295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091998, + "balance_loss_mlp": 1.059955, + "epoch": 0.14717198922662564, + "flos": 517586678784.0, + "grad_norm": 0.06085607876830046, + "language_loss": 0.92027354, + "learning_rate": 0.0009644326702149196, + "loss": 0.93119353, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.3203125, + "step": 765, + "time_per_iteration": 2.7927489280700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093686, + "balance_loss_mlp": 1.0607841, + "epoch": 0.1473643709118892, + "flos": 731661147648.0, + "grad_norm": 0.07854715386493856, + "language_loss": 0.84577298, + "learning_rate": 0.0009643171793212653, + "loss": 0.85670984, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.32910156, + "step": 766, + "time_per_iteration": 3.1133480072021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092266, + "balance_loss_mlp": 1.05976951, + "epoch": 0.14755675259715276, + "flos": 620257554432.0, + "grad_norm": 0.102413583922894, + "language_loss": 0.89411926, + "learning_rate": 0.0009642015081636952, + "loss": 0.90504193, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.32495117, + "step": 767, + "time_per_iteration": 2.715090274810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098654, + "balance_loss_mlp": 1.06658697, + "epoch": 0.14774913428241632, + "flos": 451981513728.0, + "grad_norm": 0.07135930824346515, + "language_loss": 0.8782866, + "learning_rate": 0.0009640856567871166, + "loss": 0.88927317, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.32055664, + "step": 768, + "time_per_iteration": 2.550196409225464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105008, + "balance_loss_mlp": 1.07258272, + "epoch": 0.14794151596767988, + "flos": 836881196544.0, + "grad_norm": 0.05799185647214189, + "language_loss": 0.8870768, + "learning_rate": 0.0009639696252365072, + "loss": 0.8981269, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.32421875, + "step": 769, + "time_per_iteration": 3.0786449909210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100829, + "balance_loss_mlp": 1.06869006, + "epoch": 0.14813389765294344, + "flos": 685765204992.0, + "grad_norm": 0.05886019056348146, + "language_loss": 0.81861567, + "learning_rate": 0.0009638534135569144, + "loss": 0.82962394, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.32128906, + "step": 770, + "time_per_iteration": 2.9026055335998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109058, + "balance_loss_mlp": 1.07641852, + "epoch": 0.148326279338207, + "flos": 509625008640.0, + "grad_norm": 0.061687073411883335, + "language_loss": 0.89819336, + "learning_rate": 0.0009637370217934554, + "loss": 0.909284, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.32641602, + "step": 771, + "time_per_iteration": 2.651155471801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102766, + "balance_loss_mlp": 1.07062733, + "epoch": 0.14851866102347056, + "flos": 587869608960.0, + "grad_norm": 0.06890537390791286, + "language_loss": 0.82949096, + "learning_rate": 0.0009636204499913175, + "loss": 0.84051859, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.32128906, + "step": 772, + "time_per_iteration": 2.8484935760498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109887, + "balance_loss_mlp": 1.06713676, + "epoch": 0.14871104270873411, + "flos": 690722366976.0, + "grad_norm": 0.05724303399039588, + "language_loss": 0.88008785, + "learning_rate": 0.0009635036981957581, + "loss": 0.89107656, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.31713867, + "step": 773, + "time_per_iteration": 2.875896453857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098837, + "balance_loss_mlp": 1.06586373, + "epoch": 0.1489034243939977, + "flos": 654803205120.0, + "grad_norm": 0.06792329386178385, + "language_loss": 0.90737289, + "learning_rate": 0.0009633867664521043, + "loss": 0.91836131, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.32983398, + "step": 774, + "time_per_iteration": 2.8590240478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105117, + "balance_loss_mlp": 1.07202482, + "epoch": 0.14909580607926126, + "flos": 475595891712.0, + "grad_norm": 0.07543072164382301, + "language_loss": 0.86562771, + "learning_rate": 0.0009632696548057527, + "loss": 0.87667894, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.33105469, + "step": 775, + "time_per_iteration": 2.598287343978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103717, + "balance_loss_mlp": 1.07136405, + "epoch": 0.14928818776452482, + "flos": 610789953024.0, + "grad_norm": 0.06953515395492163, + "language_loss": 0.8490293, + "learning_rate": 0.0009631523633021704, + "loss": 0.86006653, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.32348633, + "step": 776, + "time_per_iteration": 2.842017650604248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097387, + "balance_loss_mlp": 1.0640794, + "epoch": 0.14948056944978838, + "flos": 561487859712.0, + "grad_norm": 0.0785359858255581, + "language_loss": 0.87875742, + "learning_rate": 0.0009630348919868936, + "loss": 0.88973129, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.33325195, + "step": 777, + "time_per_iteration": 2.693345308303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096515, + "balance_loss_mlp": 1.06244552, + "epoch": 0.14967295113505194, + "flos": 448972623360.0, + "grad_norm": 0.0986803150049228, + "language_loss": 0.81203282, + "learning_rate": 0.0009629172409055293, + "loss": 0.82299805, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.34106445, + "step": 778, + "time_per_iteration": 2.50610613822937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100631, + "balance_loss_mlp": 1.06780052, + "epoch": 0.1498653328203155, + "flos": 571000541184.0, + "grad_norm": 0.06451123510709528, + "language_loss": 0.872877, + "learning_rate": 0.0009627994101037531, + "loss": 0.88388336, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.32836914, + "step": 779, + "time_per_iteration": 2.735919713973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093349, + "balance_loss_mlp": 1.06016171, + "epoch": 0.15005771450557906, + "flos": 630918194688.0, + "grad_norm": 0.06921626087658436, + "language_loss": 0.89007759, + "learning_rate": 0.0009626813996273114, + "loss": 0.90101105, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.33203125, + "step": 780, + "time_per_iteration": 2.8758528232574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089076, + "balance_loss_mlp": 1.05646062, + "epoch": 0.15025009619084262, + "flos": 577633780224.0, + "grad_norm": 0.07846674622794232, + "language_loss": 0.88800216, + "learning_rate": 0.0009625632095220198, + "loss": 0.89889288, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.32617188, + "step": 781, + "time_per_iteration": 2.822981357574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091834, + "balance_loss_mlp": 1.05874181, + "epoch": 0.1504424778761062, + "flos": 483646311936.0, + "grad_norm": 0.06496680151927305, + "language_loss": 0.86870086, + "learning_rate": 0.0009624448398337637, + "loss": 0.87961924, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.33105469, + "step": 782, + "time_per_iteration": 2.5370984077453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093814, + "balance_loss_mlp": 1.06022096, + "epoch": 0.15063485956136977, + "flos": 762167812608.0, + "grad_norm": 0.05765358341264215, + "language_loss": 0.89159006, + "learning_rate": 0.0009623262906084984, + "loss": 0.90252817, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.33618164, + "step": 783, + "time_per_iteration": 3.005157709121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099941, + "balance_loss_mlp": 1.06773031, + "epoch": 0.15082724124663333, + "flos": 497369687040.0, + "grad_norm": 0.06003141928684199, + "language_loss": 0.90186155, + "learning_rate": 0.0009622075618922486, + "loss": 0.91286093, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.32202148, + "step": 784, + "time_per_iteration": 2.660804510116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093154, + "balance_loss_mlp": 1.06142032, + "epoch": 0.15101962293189689, + "flos": 509476621824.0, + "grad_norm": 0.06057287359381707, + "language_loss": 0.86789852, + "learning_rate": 0.0009620886537311091, + "loss": 0.87883008, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.31713867, + "step": 785, + "time_per_iteration": 2.6273694038391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095369, + "balance_loss_mlp": 1.06210947, + "epoch": 0.15121200461716044, + "flos": 457520638464.0, + "grad_norm": 0.08138425523138582, + "language_loss": 0.84774673, + "learning_rate": 0.000961969566171244, + "loss": 0.85870039, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.33276367, + "step": 786, + "time_per_iteration": 2.5942111015319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095064, + "balance_loss_mlp": 1.06223416, + "epoch": 0.151404386302424, + "flos": 537729477120.0, + "grad_norm": 0.07863928657369654, + "language_loss": 0.90186292, + "learning_rate": 0.0009618502992588873, + "loss": 0.9128136, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.32836914, + "step": 787, + "time_per_iteration": 2.619929790496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091478, + "balance_loss_mlp": 1.05955386, + "epoch": 0.15159676798768756, + "flos": 687858891264.0, + "grad_norm": 0.0744293727729202, + "language_loss": 0.88114512, + "learning_rate": 0.0009617308530403424, + "loss": 0.89205992, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.3190918, + "step": 788, + "time_per_iteration": 2.9888041019439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093086, + "balance_loss_mlp": 1.0604943, + "epoch": 0.15178914967295112, + "flos": 545042193408.0, + "grad_norm": 0.06582928588586826, + "language_loss": 0.87262332, + "learning_rate": 0.0009616112275619825, + "loss": 0.8835541, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.32592773, + "step": 789, + "time_per_iteration": 2.7160654067993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099005, + "balance_loss_mlp": 1.0666275, + "epoch": 0.1519815313582147, + "flos": 511510671360.0, + "grad_norm": 0.05890477263154721, + "language_loss": 0.83453441, + "learning_rate": 0.0009614914228702503, + "loss": 0.84552449, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.32373047, + "step": 790, + "time_per_iteration": 2.67269229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106158, + "balance_loss_mlp": 1.07342279, + "epoch": 0.15217391304347827, + "flos": 683747122176.0, + "grad_norm": 0.05177473030839046, + "language_loss": 0.88909948, + "learning_rate": 0.0009613714390116581, + "loss": 0.90016103, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.32739258, + "step": 791, + "time_per_iteration": 2.978431224822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104946, + "balance_loss_mlp": 1.07304585, + "epoch": 0.15236629472874183, + "flos": 643873342464.0, + "grad_norm": 0.07017768347884551, + "language_loss": 0.8558737, + "learning_rate": 0.0009612512760327879, + "loss": 0.86692309, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.31884766, + "step": 792, + "time_per_iteration": 2.854128837585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108697, + "balance_loss_mlp": 1.07562804, + "epoch": 0.1525586764140054, + "flos": 412654791168.0, + "grad_norm": 0.06359759833531073, + "language_loss": 0.84205759, + "learning_rate": 0.0009611309339802909, + "loss": 0.85314453, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.33081055, + "step": 793, + "time_per_iteration": 2.46451997756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108319, + "balance_loss_mlp": 1.07510698, + "epoch": 0.15275105809926895, + "flos": 802444644864.0, + "grad_norm": 0.051071876240168755, + "language_loss": 0.84049302, + "learning_rate": 0.0009610104129008881, + "loss": 0.85157621, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.33227539, + "step": 794, + "time_per_iteration": 3.111494541168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011005, + "balance_loss_mlp": 1.06836164, + "epoch": 0.1529434397845325, + "flos": 612143115264.0, + "grad_norm": 0.06279651541206067, + "language_loss": 0.88408649, + "learning_rate": 0.0009608897128413701, + "loss": 0.89509147, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.32128906, + "step": 795, + "time_per_iteration": 2.7248153686523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103807, + "balance_loss_mlp": 1.07121563, + "epoch": 0.15313582146979607, + "flos": 614941009920.0, + "grad_norm": 0.04889604688954522, + "language_loss": 0.85449052, + "learning_rate": 0.0009607688338485965, + "loss": 0.86552852, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.32592773, + "step": 796, + "time_per_iteration": 2.8646762371063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100645, + "balance_loss_mlp": 1.06731439, + "epoch": 0.15332820315505963, + "flos": 793256440320.0, + "grad_norm": 0.057433682914461805, + "language_loss": 0.90353924, + "learning_rate": 0.0009606477759694969, + "loss": 0.91454566, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.33349609, + "step": 797, + "time_per_iteration": 3.0346486568450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108023, + "balance_loss_mlp": 1.0744772, + "epoch": 0.1535205848403232, + "flos": 549945510912.0, + "grad_norm": 0.08021572729531513, + "language_loss": 0.87206727, + "learning_rate": 0.0009605265392510703, + "loss": 0.88314748, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.33544922, + "step": 798, + "time_per_iteration": 2.6084530353546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097421, + "balance_loss_mlp": 1.065521, + "epoch": 0.15371296652558677, + "flos": 535691045376.0, + "grad_norm": 0.06650858832922667, + "language_loss": 0.91961598, + "learning_rate": 0.0009604051237403846, + "loss": 0.93059021, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.31884766, + "step": 799, + "time_per_iteration": 2.629930019378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111802, + "balance_loss_mlp": 1.07951975, + "epoch": 0.15390534821085033, + "flos": 395002939392.0, + "grad_norm": 0.12724142526344331, + "language_loss": 0.85673767, + "learning_rate": 0.0009602835294845776, + "loss": 0.86785567, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.32275391, + "step": 800, + "time_per_iteration": 2.4388976097106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116786, + "balance_loss_mlp": 1.08374119, + "epoch": 0.1540977298961139, + "flos": 535587738624.0, + "grad_norm": 0.06962057985754792, + "language_loss": 0.9036696, + "learning_rate": 0.0009601617565308565, + "loss": 0.91483742, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.33056641, + "step": 801, + "time_per_iteration": 2.6220815181732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112115, + "balance_loss_mlp": 1.08829629, + "epoch": 0.15429011158137745, + "flos": 723388147200.0, + "grad_norm": 0.07662224573984003, + "language_loss": 0.86584908, + "learning_rate": 0.0009600398049264977, + "loss": 0.87706065, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.32861328, + "step": 802, + "time_per_iteration": 2.9767894744873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122446, + "balance_loss_mlp": 1.08870947, + "epoch": 0.154482493266641, + "flos": 620209502208.0, + "grad_norm": 0.07007784052810237, + "language_loss": 0.91261709, + "learning_rate": 0.0009599176747188469, + "loss": 0.9238416, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.33764648, + "step": 803, + "time_per_iteration": 2.8329989910125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105402, + "balance_loss_mlp": 1.07242846, + "epoch": 0.15467487495190457, + "flos": 525351909888.0, + "grad_norm": 0.06284855896117353, + "language_loss": 0.82565022, + "learning_rate": 0.0009597953659553196, + "loss": 0.83670425, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.32983398, + "step": 804, + "time_per_iteration": 2.6918182373046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101021, + "balance_loss_mlp": 1.06814265, + "epoch": 0.15486725663716813, + "flos": 527473299456.0, + "grad_norm": 0.06479523616705579, + "language_loss": 0.88566583, + "learning_rate": 0.0009596728786833997, + "loss": 0.89667606, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.32885742, + "step": 805, + "time_per_iteration": 2.609287977218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110234, + "balance_loss_mlp": 1.06829393, + "epoch": 0.1550596383224317, + "flos": 1048118482944.0, + "grad_norm": 0.07111390229237131, + "language_loss": 0.89488924, + "learning_rate": 0.0009595502129506415, + "loss": 0.90591264, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.34082031, + "step": 806, + "time_per_iteration": 3.403404951095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096653, + "balance_loss_mlp": 1.0634892, + "epoch": 0.15525202000769528, + "flos": 613438050816.0, + "grad_norm": 0.08216570532607727, + "language_loss": 0.82236785, + "learning_rate": 0.0009594273688046678, + "loss": 0.83333433, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.33178711, + "step": 807, + "time_per_iteration": 2.7215962409973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093065, + "balance_loss_mlp": 1.05968678, + "epoch": 0.15544440169295884, + "flos": 532805810688.0, + "grad_norm": 0.06904253720821768, + "language_loss": 0.85279024, + "learning_rate": 0.000959304346293171, + "loss": 0.86372089, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.33398438, + "step": 808, + "time_per_iteration": 2.6801698207855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098465, + "balance_loss_mlp": 1.06661189, + "epoch": 0.1556367833782224, + "flos": 644433546240.0, + "grad_norm": 0.09111957868284204, + "language_loss": 0.87858826, + "learning_rate": 0.0009591811454639125, + "loss": 0.88957286, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.31835938, + "step": 809, + "time_per_iteration": 2.7565882205963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094758, + "balance_loss_mlp": 1.06187963, + "epoch": 0.15582916506348596, + "flos": 543540644352.0, + "grad_norm": 0.06649225570292959, + "language_loss": 0.87746191, + "learning_rate": 0.0009590577663647234, + "loss": 0.8884095, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.32885742, + "step": 810, + "time_per_iteration": 2.7346410751342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106143, + "balance_loss_mlp": 1.07233548, + "epoch": 0.15602154674874952, + "flos": 579740613120.0, + "grad_norm": 0.0619187082363415, + "language_loss": 0.85968214, + "learning_rate": 0.0009589342090435036, + "loss": 0.87074351, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.33837891, + "step": 811, + "time_per_iteration": 2.771869421005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114592, + "balance_loss_mlp": 1.08226287, + "epoch": 0.15621392843401308, + "flos": 534982454784.0, + "grad_norm": 0.07419416671079432, + "language_loss": 0.87060148, + "learning_rate": 0.0009588104735482223, + "loss": 0.88174742, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.32324219, + "step": 812, + "time_per_iteration": 2.6792666912078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122998, + "balance_loss_mlp": 1.09007227, + "epoch": 0.15640631011927664, + "flos": 550635162624.0, + "grad_norm": 0.08530784328603107, + "language_loss": 0.83981705, + "learning_rate": 0.0009586865599269177, + "loss": 0.85104704, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.3293457, + "step": 813, + "time_per_iteration": 2.6273813247680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122852, + "balance_loss_mlp": 1.09109521, + "epoch": 0.1565986918045402, + "flos": 637190641152.0, + "grad_norm": 0.09596754940168085, + "language_loss": 0.88191104, + "learning_rate": 0.0009585624682276977, + "loss": 0.8931396, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.31738281, + "step": 814, + "time_per_iteration": 2.7389183044433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114804, + "balance_loss_mlp": 1.08361948, + "epoch": 0.15679107348980378, + "flos": 490569122304.0, + "grad_norm": 0.07403121037751308, + "language_loss": 0.87196732, + "learning_rate": 0.0009584381984987386, + "loss": 0.88311541, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.31152344, + "step": 815, + "time_per_iteration": 2.588653802871704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118789, + "balance_loss_mlp": 1.0867933, + "epoch": 0.15698345517506734, + "flos": 529689231360.0, + "grad_norm": 0.05796420471157715, + "language_loss": 0.89563668, + "learning_rate": 0.0009583137507882864, + "loss": 0.90682459, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.31982422, + "step": 816, + "time_per_iteration": 2.6771223545074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120947, + "balance_loss_mlp": 1.08945227, + "epoch": 0.1571758368603309, + "flos": 545779897344.0, + "grad_norm": 0.06695321751464198, + "language_loss": 0.80875123, + "learning_rate": 0.000958189125144656, + "loss": 0.81996059, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.31469727, + "step": 817, + "time_per_iteration": 2.648407220840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142778, + "balance_loss_mlp": 1.11125922, + "epoch": 0.15736821854559446, + "flos": 565377048576.0, + "grad_norm": 0.07474790639920047, + "language_loss": 0.87800574, + "learning_rate": 0.0009580643216162313, + "loss": 0.8894335, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.31494141, + "step": 818, + "time_per_iteration": 2.663799285888672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140784, + "balance_loss_mlp": 1.10940814, + "epoch": 0.15756060023085802, + "flos": 500707436544.0, + "grad_norm": 0.10531827445817923, + "language_loss": 0.79636216, + "learning_rate": 0.0009579393402514652, + "loss": 0.80777001, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.31347656, + "step": 819, + "time_per_iteration": 2.5795977115631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128459, + "balance_loss_mlp": 1.09617746, + "epoch": 0.15775298191612158, + "flos": 519014034432.0, + "grad_norm": 0.06561760213255555, + "language_loss": 0.90222132, + "learning_rate": 0.0009578141810988801, + "loss": 0.91350597, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.32275391, + "step": 820, + "time_per_iteration": 2.6019015312194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120432, + "balance_loss_mlp": 1.08807814, + "epoch": 0.15794536360138514, + "flos": 465891153408.0, + "grad_norm": 0.07003821866302876, + "language_loss": 0.90498698, + "learning_rate": 0.0009576888442070668, + "loss": 0.91619134, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.32348633, + "step": 821, + "time_per_iteration": 2.5933666229248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109361, + "balance_loss_mlp": 1.07614923, + "epoch": 0.1581377452866487, + "flos": 516911583744.0, + "grad_norm": 0.06959801001512317, + "language_loss": 0.92461467, + "learning_rate": 0.0009575633296246854, + "loss": 0.93570817, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.33227539, + "step": 822, + "time_per_iteration": 2.584195375442505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104383, + "balance_loss_mlp": 1.07198191, + "epoch": 0.15833012697191226, + "flos": 549522109440.0, + "grad_norm": 0.0738821286657961, + "language_loss": 0.82797432, + "learning_rate": 0.0009574376374004652, + "loss": 0.83901811, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.32397461, + "step": 823, + "time_per_iteration": 2.6445696353912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099566, + "balance_loss_mlp": 1.0669024, + "epoch": 0.15852250865717585, + "flos": 487206641664.0, + "grad_norm": 0.07930768625104477, + "language_loss": 0.8015238, + "learning_rate": 0.000957311767583204, + "loss": 0.81251943, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.32666016, + "step": 824, + "time_per_iteration": 2.590190887451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284074, + "balance_loss_mlp": 1.26194882, + "epoch": 0.1587148903424394, + "flos": 1309041672192.0, + "grad_norm": 0.06857459467376774, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83355665, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.22167969, + "step": 825, + "time_per_iteration": 4.729644060134888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091191, + "balance_loss_mlp": 1.05766964, + "epoch": 0.15890727202770297, + "flos": 466634649600.0, + "grad_norm": 0.10530356830759573, + "language_loss": 0.91383988, + "learning_rate": 0.0009570594953650961, + "loss": 0.92475176, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.33544922, + "step": 826, + "time_per_iteration": 2.5222439765930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099421, + "balance_loss_mlp": 1.06580353, + "epoch": 0.15909965371296653, + "flos": 776733608448.0, + "grad_norm": 0.07312615216486826, + "language_loss": 0.80215907, + "learning_rate": 0.00095693309306219, + "loss": 0.81315327, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.33642578, + "step": 827, + "time_per_iteration": 3.104602098464966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091547, + "balance_loss_mlp": 1.0577873, + "epoch": 0.1592920353982301, + "flos": 1077852538368.0, + "grad_norm": 0.06629059991756085, + "language_loss": 0.87921345, + "learning_rate": 0.0009568065133621244, + "loss": 0.89012897, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.33789062, + "step": 828, + "time_per_iteration": 3.349937915802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088059, + "balance_loss_mlp": 1.05324984, + "epoch": 0.15948441708349365, + "flos": 725307305472.0, + "grad_norm": 0.06785059542129762, + "language_loss": 0.84638405, + "learning_rate": 0.0009566797563140422, + "loss": 0.85726464, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.34863281, + "step": 829, + "time_per_iteration": 2.883561849594116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096047, + "balance_loss_mlp": 1.06085658, + "epoch": 0.1596767987687572, + "flos": 578447087616.0, + "grad_norm": 0.06369088806732512, + "language_loss": 0.87693489, + "learning_rate": 0.0009565528219671547, + "loss": 0.88789535, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.35229492, + "step": 830, + "time_per_iteration": 2.929800510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098337, + "balance_loss_mlp": 1.06412435, + "epoch": 0.15986918045402077, + "flos": 528728947200.0, + "grad_norm": 0.06081537703934319, + "language_loss": 0.84958434, + "learning_rate": 0.0009564257103707418, + "loss": 0.86056769, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.3425293, + "step": 831, + "time_per_iteration": 2.631542444229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105374, + "balance_loss_mlp": 1.0715903, + "epoch": 0.16006156213928435, + "flos": 574313559552.0, + "grad_norm": 0.06950481232518824, + "language_loss": 0.91362834, + "learning_rate": 0.0009562984215741533, + "loss": 0.92468208, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.33789062, + "step": 832, + "time_per_iteration": 2.669194459915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093997, + "balance_loss_mlp": 1.05973649, + "epoch": 0.1602539438245479, + "flos": 515258675712.0, + "grad_norm": 0.06093058452920847, + "language_loss": 0.82276815, + "learning_rate": 0.0009561709556268065, + "loss": 0.83370817, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.34301758, + "step": 833, + "time_per_iteration": 2.747171401977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096504, + "balance_loss_mlp": 1.06298196, + "epoch": 0.16044632550981147, + "flos": 620730418176.0, + "grad_norm": 0.09598386402958035, + "language_loss": 0.93858409, + "learning_rate": 0.0009560433125781884, + "loss": 0.9495492, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.33544922, + "step": 834, + "time_per_iteration": 2.7381722927093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090718, + "balance_loss_mlp": 1.05645716, + "epoch": 0.16063870719507503, + "flos": 560817146880.0, + "grad_norm": 0.06748577773497036, + "language_loss": 0.92278147, + "learning_rate": 0.0009559154924778544, + "loss": 0.93368864, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.34301758, + "step": 835, + "time_per_iteration": 2.7790255546569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079826, + "balance_loss_mlp": 1.04625726, + "epoch": 0.1608310888803386, + "flos": 804778440192.0, + "grad_norm": 0.07378429569225692, + "language_loss": 0.85029173, + "learning_rate": 0.0009557874953754284, + "loss": 0.86109, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.33569336, + "step": 836, + "time_per_iteration": 3.0223195552825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082807, + "balance_loss_mlp": 1.04883218, + "epoch": 0.16102347056560215, + "flos": 600311195136.0, + "grad_norm": 0.08025480036652383, + "language_loss": 0.83386606, + "learning_rate": 0.0009556593213206038, + "loss": 0.84469414, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.34008789, + "step": 837, + "time_per_iteration": 2.7436904907226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086966, + "balance_loss_mlp": 1.05318165, + "epoch": 0.1612158522508657, + "flos": 553235208192.0, + "grad_norm": 0.0690426934286745, + "language_loss": 0.87355983, + "learning_rate": 0.0009555309703631414, + "loss": 0.88442945, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.33813477, + "step": 838, + "time_per_iteration": 2.6828954219818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097306, + "balance_loss_mlp": 1.06364167, + "epoch": 0.16140823393612927, + "flos": 555701423616.0, + "grad_norm": 0.07092577785176474, + "language_loss": 0.87526888, + "learning_rate": 0.0009554024425528722, + "loss": 0.88624191, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.33691406, + "step": 839, + "time_per_iteration": 2.6739652156829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110874, + "balance_loss_mlp": 1.07797241, + "epoch": 0.16160061562139286, + "flos": 543613427712.0, + "grad_norm": 0.09046955561085915, + "language_loss": 0.88719451, + "learning_rate": 0.0009552737379396948, + "loss": 0.89830327, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.32910156, + "step": 840, + "time_per_iteration": 2.63122820854187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110692, + "balance_loss_mlp": 1.07757533, + "epoch": 0.16179299730665642, + "flos": 603590717952.0, + "grad_norm": 0.06735134703819705, + "language_loss": 0.88063818, + "learning_rate": 0.0009551448565735767, + "loss": 0.89174509, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.33129883, + "step": 841, + "time_per_iteration": 2.741941452026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121097, + "balance_loss_mlp": 1.08790874, + "epoch": 0.16198537899191998, + "flos": 786821050368.0, + "grad_norm": 0.06426805463858033, + "language_loss": 0.84472924, + "learning_rate": 0.0009550157985045543, + "loss": 0.85594022, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.33203125, + "step": 842, + "time_per_iteration": 3.045841693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102839, + "balance_loss_mlp": 1.07041371, + "epoch": 0.16217776067718354, + "flos": 519550917120.0, + "grad_norm": 0.06545460719380305, + "language_loss": 0.89229876, + "learning_rate": 0.0009548865637827321, + "loss": 0.90332717, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.32421875, + "step": 843, + "time_per_iteration": 2.6820054054260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100372, + "balance_loss_mlp": 1.06701708, + "epoch": 0.1623701423624471, + "flos": 505015644672.0, + "grad_norm": 0.09211303705947127, + "language_loss": 0.89927554, + "learning_rate": 0.0009547571524582838, + "loss": 0.91027921, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.33374023, + "step": 844, + "time_per_iteration": 2.592280149459839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097994, + "balance_loss_mlp": 1.06587958, + "epoch": 0.16256252404771065, + "flos": 496940493312.0, + "grad_norm": 0.07125004392928289, + "language_loss": 0.91891497, + "learning_rate": 0.0009546275645814512, + "loss": 0.92989492, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.32104492, + "step": 845, + "time_per_iteration": 2.6273765563964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097571, + "balance_loss_mlp": 1.06531262, + "epoch": 0.16275490573297421, + "flos": 502110061056.0, + "grad_norm": 0.07293740056217544, + "language_loss": 0.89635444, + "learning_rate": 0.0009544978002025446, + "loss": 0.90733016, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.32250977, + "step": 846, + "time_per_iteration": 2.5906271934509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091646, + "balance_loss_mlp": 1.05821955, + "epoch": 0.16294728741823777, + "flos": 506952179712.0, + "grad_norm": 0.052168896342380144, + "language_loss": 0.86807543, + "learning_rate": 0.0009543678593719434, + "loss": 0.8789919, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.33447266, + "step": 847, + "time_per_iteration": 2.747469186782837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098148, + "balance_loss_mlp": 1.06510353, + "epoch": 0.16313966910350133, + "flos": 509418395136.0, + "grad_norm": 0.05056297173362441, + "language_loss": 0.87167078, + "learning_rate": 0.0009542377421400945, + "loss": 0.88265228, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.33056641, + "step": 848, + "time_per_iteration": 2.7777974605560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102364, + "balance_loss_mlp": 1.06950974, + "epoch": 0.16333205078876492, + "flos": 543712352256.0, + "grad_norm": 0.06627324615029867, + "language_loss": 0.83542728, + "learning_rate": 0.0009541074485575145, + "loss": 0.84645092, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.32861328, + "step": 849, + "time_per_iteration": 2.7575085163116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105099, + "balance_loss_mlp": 1.07288873, + "epoch": 0.16352443247402848, + "flos": 507477477888.0, + "grad_norm": 0.05751037996071174, + "language_loss": 0.9190414, + "learning_rate": 0.0009539769786747874, + "loss": 0.93009233, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.32202148, + "step": 850, + "time_per_iteration": 2.6389074325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109592, + "balance_loss_mlp": 1.06261301, + "epoch": 0.16371681415929204, + "flos": 541851420672.0, + "grad_norm": 0.07235435681682932, + "language_loss": 0.81106341, + "learning_rate": 0.0009538463325425665, + "loss": 0.82202262, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.33325195, + "step": 851, + "time_per_iteration": 2.7013468742370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109998, + "balance_loss_mlp": 1.06695926, + "epoch": 0.1639091958445556, + "flos": 520501026816.0, + "grad_norm": 0.07286475265539226, + "language_loss": 0.86075503, + "learning_rate": 0.0009537155102115728, + "loss": 0.87175477, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.33032227, + "step": 852, + "time_per_iteration": 2.5927765369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089138, + "balance_loss_mlp": 1.05668926, + "epoch": 0.16410157752981916, + "flos": 547149026304.0, + "grad_norm": 0.07079739805294577, + "language_loss": 0.83340597, + "learning_rate": 0.0009535845117325961, + "loss": 0.84429741, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.32446289, + "step": 853, + "time_per_iteration": 2.6400251388549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090734, + "balance_loss_mlp": 1.05780828, + "epoch": 0.16429395921508272, + "flos": 582561828864.0, + "grad_norm": 0.055390341552487656, + "language_loss": 0.93137228, + "learning_rate": 0.0009534533371564946, + "loss": 0.9422797, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.3293457, + "step": 854, + "time_per_iteration": 2.794569492340088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097604, + "balance_loss_mlp": 1.06424975, + "epoch": 0.16448634090034628, + "flos": 530678628864.0, + "grad_norm": 0.07789269087805807, + "language_loss": 0.88390946, + "learning_rate": 0.0009533219865341949, + "loss": 0.89488548, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.33374023, + "step": 855, + "time_per_iteration": 2.5882935523986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110169, + "balance_loss_mlp": 1.07721937, + "epoch": 0.16467872258560984, + "flos": 491623948800.0, + "grad_norm": 0.07176827599451206, + "language_loss": 0.85993397, + "learning_rate": 0.0009531904599166916, + "loss": 0.87103564, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.32958984, + "step": 856, + "time_per_iteration": 2.6384060382843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108589, + "balance_loss_mlp": 1.07585454, + "epoch": 0.16487110427087343, + "flos": 506015216640.0, + "grad_norm": 0.08966352124388614, + "language_loss": 0.84823519, + "learning_rate": 0.0009530587573550478, + "loss": 0.85932112, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.32739258, + "step": 857, + "time_per_iteration": 2.6009740829467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139199, + "balance_loss_mlp": 1.11554801, + "epoch": 0.16506348595613698, + "flos": 1432006553088.0, + "grad_norm": 0.0480168233011906, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75458586, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.23632812, + "step": 858, + "time_per_iteration": 5.006503105163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108022, + "balance_loss_mlp": 1.07712269, + "epoch": 0.16525586764140054, + "flos": 476890827264.0, + "grad_norm": 0.08332018813054971, + "language_loss": 0.89907712, + "learning_rate": 0.0009527948246039337, + "loss": 0.91015732, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.30859375, + "step": 859, + "time_per_iteration": 2.5502097606658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113676, + "balance_loss_mlp": 1.08313441, + "epoch": 0.1654482493266641, + "flos": 880737297408.0, + "grad_norm": 0.06488618871597049, + "language_loss": 0.87213862, + "learning_rate": 0.000952662594516931, + "loss": 0.88327539, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.30493164, + "step": 860, + "time_per_iteration": 3.091632604598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112348, + "balance_loss_mlp": 1.08059049, + "epoch": 0.16564063101192766, + "flos": 626527028736.0, + "grad_norm": 0.18119016536128274, + "language_loss": 0.86193782, + "learning_rate": 0.0009525301886907234, + "loss": 0.8730613, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.31738281, + "step": 861, + "time_per_iteration": 2.8586955070495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115106, + "balance_loss_mlp": 1.08372974, + "epoch": 0.16583301269719122, + "flos": 561250722816.0, + "grad_norm": 0.06494583254435107, + "language_loss": 0.87565315, + "learning_rate": 0.0009523976071767155, + "loss": 0.88680422, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.31347656, + "step": 862, + "time_per_iteration": 2.6474006175994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113562, + "balance_loss_mlp": 1.08228135, + "epoch": 0.16602539438245478, + "flos": 567510022656.0, + "grad_norm": 0.05844730537287504, + "language_loss": 0.87850058, + "learning_rate": 0.00095226485002638, + "loss": 0.88963622, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.3125, + "step": 863, + "time_per_iteration": 2.7738211154937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101227, + "balance_loss_mlp": 1.06894565, + "epoch": 0.16621777606771834, + "flos": 574589984256.0, + "grad_norm": 0.05720313452307963, + "language_loss": 0.88969022, + "learning_rate": 0.0009521319172912576, + "loss": 0.90070248, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.32275391, + "step": 864, + "time_per_iteration": 2.762932538986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108698, + "balance_loss_mlp": 1.07624936, + "epoch": 0.16641015775298193, + "flos": 514292599296.0, + "grad_norm": 0.0631928299213439, + "language_loss": 0.94547617, + "learning_rate": 0.0009519988090229579, + "loss": 0.95656317, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.32446289, + "step": 865, + "time_per_iteration": 2.672088384628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105777, + "balance_loss_mlp": 1.07332826, + "epoch": 0.1666025394382455, + "flos": 621395338752.0, + "grad_norm": 0.06928181027356142, + "language_loss": 0.87572587, + "learning_rate": 0.0009518655252731576, + "loss": 0.8867836, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.32446289, + "step": 866, + "time_per_iteration": 2.754418134689331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104109, + "balance_loss_mlp": 1.07049167, + "epoch": 0.16679492112350905, + "flos": 548528329728.0, + "grad_norm": 0.059497633162238536, + "language_loss": 0.90014684, + "learning_rate": 0.0009517320660936022, + "loss": 0.91118789, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.33642578, + "step": 867, + "time_per_iteration": 2.73579478263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103888, + "balance_loss_mlp": 1.07117677, + "epoch": 0.1669873028087726, + "flos": 665379477504.0, + "grad_norm": 0.06138762269806642, + "language_loss": 0.82812411, + "learning_rate": 0.0009515984315361051, + "loss": 0.83916301, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.32714844, + "step": 868, + "time_per_iteration": 2.7929019927978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102635, + "balance_loss_mlp": 1.07016206, + "epoch": 0.16717968449403617, + "flos": 538305647616.0, + "grad_norm": 0.07711570113555911, + "language_loss": 0.8657794, + "learning_rate": 0.000951464621652548, + "loss": 0.87680572, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.32470703, + "step": 869, + "time_per_iteration": 2.6135518550872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105976, + "balance_loss_mlp": 1.07381344, + "epoch": 0.16737206617929973, + "flos": 529833235968.0, + "grad_norm": 0.07032317085354448, + "language_loss": 0.78791183, + "learning_rate": 0.0009513306364948804, + "loss": 0.79897159, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.3215332, + "step": 870, + "time_per_iteration": 2.7745420932769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101249, + "balance_loss_mlp": 1.06949186, + "epoch": 0.1675644478645633, + "flos": 480529732608.0, + "grad_norm": 0.0706094790942469, + "language_loss": 0.88557035, + "learning_rate": 0.0009511964761151197, + "loss": 0.89658284, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.31738281, + "step": 871, + "time_per_iteration": 2.570082902908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112728, + "balance_loss_mlp": 1.08147156, + "epoch": 0.16775682954982685, + "flos": 494311334400.0, + "grad_norm": 0.06741449701936619, + "language_loss": 0.90011156, + "learning_rate": 0.0009510621405653521, + "loss": 0.91123885, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.31225586, + "step": 872, + "time_per_iteration": 2.5378525257110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098432, + "balance_loss_mlp": 1.06860542, + "epoch": 0.1679492112350904, + "flos": 751694846976.0, + "grad_norm": 0.07031527693840728, + "language_loss": 0.8401826, + "learning_rate": 0.0009509276298977309, + "loss": 0.85116696, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.29760742, + "step": 873, + "time_per_iteration": 2.9614696502685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102853, + "balance_loss_mlp": 1.07121444, + "epoch": 0.168141592920354, + "flos": 1135413075456.0, + "grad_norm": 0.07037881289732177, + "language_loss": 0.8146044, + "learning_rate": 0.0009507929441644778, + "loss": 0.82563293, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.31616211, + "step": 874, + "time_per_iteration": 3.5029537677764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105025, + "balance_loss_mlp": 1.07403064, + "epoch": 0.16833397460561755, + "flos": 632114205696.0, + "grad_norm": 0.07204378854359271, + "language_loss": 0.8568964, + "learning_rate": 0.0009506580834178826, + "loss": 0.86794662, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.30957031, + "step": 875, + "time_per_iteration": 2.738445281982422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105989, + "balance_loss_mlp": 1.07420754, + "epoch": 0.1685263562908811, + "flos": 541171943424.0, + "grad_norm": 0.06279104396907492, + "language_loss": 0.91300583, + "learning_rate": 0.0009505230477103028, + "loss": 0.92406577, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.31762695, + "step": 876, + "time_per_iteration": 2.7304844856262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121558, + "balance_loss_mlp": 1.0900147, + "epoch": 0.16871873797614467, + "flos": 619036812288.0, + "grad_norm": 0.07749651336428325, + "language_loss": 0.81126654, + "learning_rate": 0.0009503878370941641, + "loss": 0.82248211, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.31518555, + "step": 877, + "time_per_iteration": 2.7332048416137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121651, + "balance_loss_mlp": 1.09063232, + "epoch": 0.16891111966140823, + "flos": 606067107840.0, + "grad_norm": 0.08158970109830238, + "language_loss": 0.88660848, + "learning_rate": 0.0009502524516219595, + "loss": 0.897825, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.30981445, + "step": 878, + "time_per_iteration": 2.810194730758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120277, + "balance_loss_mlp": 1.08942604, + "epoch": 0.1691035013466718, + "flos": 552058136064.0, + "grad_norm": 0.08439254905993104, + "language_loss": 0.89592326, + "learning_rate": 0.0009501168913462506, + "loss": 0.90712607, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.30810547, + "step": 879, + "time_per_iteration": 2.6550278663635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181395, + "balance_loss_mlp": 1.15822113, + "epoch": 0.16929588303193535, + "flos": 1475544121344.0, + "grad_norm": 0.05511344701971209, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80303323, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.23144531, + "step": 880, + "time_per_iteration": 4.798918962478638 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120202, + "balance_loss_mlp": 1.08894515, + "epoch": 0.1694882647171989, + "flos": 925850456064.0, + "grad_norm": 0.05479331137197536, + "language_loss": 0.85038209, + "learning_rate": 0.0009498452465949042, + "loss": 0.86158419, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.31225586, + "step": 881, + "time_per_iteration": 3.2795042991638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114024, + "balance_loss_mlp": 1.08295763, + "epoch": 0.1696806464024625, + "flos": 545829359616.0, + "grad_norm": 0.06005284109203957, + "language_loss": 0.91010857, + "learning_rate": 0.0009497091622247285, + "loss": 0.92124879, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.31030273, + "step": 882, + "time_per_iteration": 2.741497755050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114536, + "balance_loss_mlp": 1.0833751, + "epoch": 0.16987302808772606, + "flos": 528970466304.0, + "grad_norm": 0.08668021784836823, + "language_loss": 0.9325586, + "learning_rate": 0.0009495729032619723, + "loss": 0.94370389, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.3112793, + "step": 883, + "time_per_iteration": 2.6621923446655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102424, + "balance_loss_mlp": 1.07035685, + "epoch": 0.17006540977298962, + "flos": 754855096320.0, + "grad_norm": 0.06301404020698688, + "language_loss": 0.84119958, + "learning_rate": 0.0009494364697595354, + "loss": 0.85222387, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.32055664, + "step": 884, + "time_per_iteration": 2.8904953002929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102364, + "balance_loss_mlp": 1.07022548, + "epoch": 0.17025779145825318, + "flos": 558532813824.0, + "grad_norm": 0.06367673921209963, + "language_loss": 0.89062482, + "learning_rate": 0.0009492998617703867, + "loss": 0.9016484, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.32128906, + "step": 885, + "time_per_iteration": 2.65053129196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088714, + "balance_loss_mlp": 1.05779076, + "epoch": 0.17045017314351674, + "flos": 511963186176.0, + "grad_norm": 0.06771442044112419, + "language_loss": 0.87296236, + "learning_rate": 0.0009491630793475619, + "loss": 0.88384956, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.30908203, + "step": 886, + "time_per_iteration": 2.601238965988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095605, + "balance_loss_mlp": 1.06346607, + "epoch": 0.1706425548287803, + "flos": 508674898944.0, + "grad_norm": 0.064396115452368, + "language_loss": 0.85120332, + "learning_rate": 0.0009490261225441643, + "loss": 0.86215937, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.32128906, + "step": 887, + "time_per_iteration": 2.865694999694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089924, + "balance_loss_mlp": 1.05797613, + "epoch": 0.17083493651404386, + "flos": 717016776192.0, + "grad_norm": 0.06834327453619109, + "language_loss": 0.90091348, + "learning_rate": 0.0009488889914133656, + "loss": 0.91181278, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.31933594, + "step": 888, + "time_per_iteration": 3.0129144191741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092844, + "balance_loss_mlp": 1.06077635, + "epoch": 0.17102731819930742, + "flos": 558852908544.0, + "grad_norm": 0.06591248507341309, + "language_loss": 0.88667148, + "learning_rate": 0.0009487516860084047, + "loss": 0.89759994, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.32055664, + "step": 889, + "time_per_iteration": 2.738736867904663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087561, + "balance_loss_mlp": 1.05644727, + "epoch": 0.17121969988457098, + "flos": 494542679040.0, + "grad_norm": 0.07350534216298948, + "language_loss": 0.88845301, + "learning_rate": 0.0009486142063825884, + "loss": 0.89932865, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.31079102, + "step": 890, + "time_per_iteration": 2.5697011947631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117181, + "balance_loss_mlp": 1.15197396, + "epoch": 0.17141208156983456, + "flos": 1548088063488.0, + "grad_norm": 0.0550236747402086, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73598027, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.19824219, + "step": 891, + "time_per_iteration": 4.955617904663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092709, + "balance_loss_mlp": 1.06119013, + "epoch": 0.17160446325509812, + "flos": 619282713600.0, + "grad_norm": 0.06911805131577382, + "language_loss": 0.9061746, + "learning_rate": 0.0009483387246819542, + "loss": 0.91710162, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.31494141, + "step": 892, + "time_per_iteration": 2.725799798965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121181, + "balance_loss_mlp": 1.10153532, + "epoch": 0.17179684494036168, + "flos": 1381026972672.0, + "grad_norm": 0.032113973586073014, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83406758, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.19628906, + "step": 893, + "time_per_iteration": 4.664165735244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089705, + "balance_loss_mlp": 1.05813849, + "epoch": 0.17198922662562524, + "flos": 492386383872.0, + "grad_norm": 0.0574582553480054, + "language_loss": 0.89272118, + "learning_rate": 0.0009480625467392688, + "loss": 0.90361822, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.31542969, + "step": 894, + "time_per_iteration": 2.637554883956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109509, + "balance_loss_mlp": 1.08910024, + "epoch": 0.1721816083108888, + "flos": 1457529914880.0, + "grad_norm": 0.027611634873128267, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79104185, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.20410156, + "step": 895, + "time_per_iteration": 4.76848030090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088509, + "balance_loss_mlp": 1.05822968, + "epoch": 0.17237398999615236, + "flos": 527853030912.0, + "grad_norm": 0.05350045539937067, + "language_loss": 0.87532026, + "learning_rate": 0.0009477856729834196, + "loss": 0.88620532, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.30249023, + "step": 896, + "time_per_iteration": 2.7219061851501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093646, + "balance_loss_mlp": 1.06267512, + "epoch": 0.17256637168141592, + "flos": 603644562432.0, + "grad_norm": 0.06021872133739316, + "language_loss": 0.89942896, + "learning_rate": 0.0009476469753098809, + "loss": 0.9103654, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.30932617, + "step": 897, + "time_per_iteration": 2.6990017890930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109486, + "balance_loss_mlp": 1.06398487, + "epoch": 0.17275875336667948, + "flos": 509437334016.0, + "grad_norm": 0.072864012804074, + "language_loss": 0.86893761, + "learning_rate": 0.0009475081038443738, + "loss": 0.87988615, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.30834961, + "step": 898, + "time_per_iteration": 2.5972931385040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091914, + "balance_loss_mlp": 1.06030011, + "epoch": 0.17295113505194307, + "flos": 664951693824.0, + "grad_norm": 0.07073516416365672, + "language_loss": 0.85445154, + "learning_rate": 0.0009473690586408124, + "loss": 0.86537069, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.31591797, + "step": 899, + "time_per_iteration": 2.821336507797241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085642, + "balance_loss_mlp": 1.05421829, + "epoch": 0.17314351673720663, + "flos": 555125253120.0, + "grad_norm": 0.061416888012907525, + "language_loss": 0.86083823, + "learning_rate": 0.0009472298397531792, + "loss": 0.87169468, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.31396484, + "step": 900, + "time_per_iteration": 2.7345612049102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090461, + "balance_loss_mlp": 1.058918, + "epoch": 0.17333589842247019, + "flos": 503361326592.0, + "grad_norm": 0.060849230911096945, + "language_loss": 0.86217213, + "learning_rate": 0.0009470904472355235, + "loss": 0.87307668, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.31518555, + "step": 901, + "time_per_iteration": 2.637425661087036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089284, + "balance_loss_mlp": 1.05755067, + "epoch": 0.17352828010773375, + "flos": 555924003840.0, + "grad_norm": 0.07830588235472731, + "language_loss": 0.79847336, + "learning_rate": 0.0009469508811419626, + "loss": 0.80936623, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.31713867, + "step": 902, + "time_per_iteration": 2.70833683013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149006, + "balance_loss_mlp": 1.12678576, + "epoch": 0.1737206617929973, + "flos": 1553711556096.0, + "grad_norm": 0.05917050619752012, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72762835, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.22265625, + "step": 903, + "time_per_iteration": 4.776138782501221 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088102, + "balance_loss_mlp": 1.05562961, + "epoch": 0.17391304347826086, + "flos": 516390667776.0, + "grad_norm": 0.07262085456902109, + "language_loss": 0.83503735, + "learning_rate": 0.0009466712284439292, + "loss": 0.84591836, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.32470703, + "step": 904, + "time_per_iteration": 2.7372140884399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086515, + "balance_loss_mlp": 1.05385172, + "epoch": 0.17410542516352442, + "flos": 540773273088.0, + "grad_norm": 0.09192064511302059, + "language_loss": 0.88356638, + "learning_rate": 0.0009465311419480276, + "loss": 0.89443153, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.32666016, + "step": 905, + "time_per_iteration": 2.677032470703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109277, + "balance_loss_mlp": 1.06036901, + "epoch": 0.17429780684878798, + "flos": 623542869504.0, + "grad_norm": 0.07898220644020008, + "language_loss": 0.88434756, + "learning_rate": 0.0009463908820933622, + "loss": 0.89527524, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.32397461, + "step": 906, + "time_per_iteration": 2.8139841556549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097047, + "balance_loss_mlp": 1.06505144, + "epoch": 0.17449018853405157, + "flos": 575368386048.0, + "grad_norm": 0.0868003192310251, + "language_loss": 0.82122958, + "learning_rate": 0.0009462504489343868, + "loss": 0.83220005, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.31982422, + "step": 907, + "time_per_iteration": 2.8445968627929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103323, + "balance_loss_mlp": 1.07106495, + "epoch": 0.17468257021931513, + "flos": 533499844608.0, + "grad_norm": 0.09920963499058721, + "language_loss": 0.88653374, + "learning_rate": 0.0009461098425256222, + "loss": 0.89756691, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.32250977, + "step": 908, + "time_per_iteration": 2.5947494506835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109512, + "balance_loss_mlp": 1.07784963, + "epoch": 0.1748749519045787, + "flos": 540496848384.0, + "grad_norm": 0.09355765751058653, + "language_loss": 0.86340624, + "learning_rate": 0.0009459690629216567, + "loss": 0.87450135, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.31640625, + "step": 909, + "time_per_iteration": 2.621044874191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112453, + "balance_loss_mlp": 1.08155417, + "epoch": 0.17506733358984225, + "flos": 498373641216.0, + "grad_norm": 0.07034154505215827, + "language_loss": 0.8701601, + "learning_rate": 0.0009458281101771457, + "loss": 0.88128459, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.30859375, + "step": 910, + "time_per_iteration": 2.674091100692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115198, + "balance_loss_mlp": 1.08508539, + "epoch": 0.1752597152751058, + "flos": 622621873152.0, + "grad_norm": 0.09036058743894539, + "language_loss": 0.82642829, + "learning_rate": 0.0009456869843468122, + "loss": 0.83758032, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.30053711, + "step": 911, + "time_per_iteration": 2.830397605895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105257, + "balance_loss_mlp": 1.07378554, + "epoch": 0.17545209696036937, + "flos": 520717814784.0, + "grad_norm": 0.0879185530474863, + "language_loss": 0.78465313, + "learning_rate": 0.0009455456854854459, + "loss": 0.79570568, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.31445312, + "step": 912, + "time_per_iteration": 2.621293067932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102771, + "balance_loss_mlp": 1.07196748, + "epoch": 0.17564447864563293, + "flos": 461750270976.0, + "grad_norm": 0.0647038307980506, + "language_loss": 0.8401655, + "learning_rate": 0.0009454042136479039, + "loss": 0.85119313, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.30786133, + "step": 913, + "time_per_iteration": 2.5675978660583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095649, + "balance_loss_mlp": 1.0655843, + "epoch": 0.1758368603308965, + "flos": 480416251392.0, + "grad_norm": 0.06520052548040499, + "language_loss": 0.82717437, + "learning_rate": 0.0009452625688891103, + "loss": 0.83813089, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.30004883, + "step": 914, + "time_per_iteration": 2.5443828105926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156407, + "balance_loss_mlp": 1.13332844, + "epoch": 0.17602924201616005, + "flos": 1478160133632.0, + "grad_norm": 0.06121421634548094, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79891145, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.23046875, + "step": 915, + "time_per_iteration": 4.5826005935668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117447, + "balance_loss_mlp": 1.08642912, + "epoch": 0.17622162370142364, + "flos": 602010593280.0, + "grad_norm": 0.07309570223890104, + "language_loss": 0.93135887, + "learning_rate": 0.0009449787608278015, + "loss": 0.94253331, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.30981445, + "step": 916, + "time_per_iteration": 2.7787418365478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120355, + "balance_loss_mlp": 1.08926511, + "epoch": 0.1764140053866872, + "flos": 442473214464.0, + "grad_norm": 0.10226900865330964, + "language_loss": 0.92397296, + "learning_rate": 0.0009448365976354704, + "loss": 0.93517655, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.31054688, + "step": 917, + "time_per_iteration": 2.5531399250030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124705, + "balance_loss_mlp": 1.09247112, + "epoch": 0.17660638707195075, + "flos": 500362610688.0, + "grad_norm": 0.07454694115091837, + "language_loss": 0.89785659, + "learning_rate": 0.0009446942617422558, + "loss": 0.90910363, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.32226562, + "step": 918, + "time_per_iteration": 2.583489418029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123972, + "balance_loss_mlp": 1.09250093, + "epoch": 0.17679876875721431, + "flos": 538621360128.0, + "grad_norm": 0.06638545773718021, + "language_loss": 0.85658622, + "learning_rate": 0.0009445517532034176, + "loss": 0.86782598, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.31445312, + "step": 919, + "time_per_iteration": 2.6830263137817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122334, + "balance_loss_mlp": 1.09107733, + "epoch": 0.17699115044247787, + "flos": 497477376000.0, + "grad_norm": 0.09547651267352689, + "language_loss": 0.88907313, + "learning_rate": 0.0009444090720742824, + "loss": 0.90029645, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.31225586, + "step": 920, + "time_per_iteration": 2.5984437465667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123289, + "balance_loss_mlp": 1.09181738, + "epoch": 0.17718353212774143, + "flos": 662444780544.0, + "grad_norm": 0.10483808909193337, + "language_loss": 0.87128365, + "learning_rate": 0.0009442662184102439, + "loss": 0.8825165, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.31445312, + "step": 921, + "time_per_iteration": 2.772568941116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097725, + "balance_loss_mlp": 1.06737399, + "epoch": 0.177375913813005, + "flos": 582340658688.0, + "grad_norm": 0.057071439682559955, + "language_loss": 0.87210095, + "learning_rate": 0.000944123192266763, + "loss": 0.88307822, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.30297852, + "step": 922, + "time_per_iteration": 2.8091742992401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122306, + "balance_loss_mlp": 1.09004784, + "epoch": 0.17756829549826855, + "flos": 552285098496.0, + "grad_norm": 0.07267069192247201, + "language_loss": 0.83557594, + "learning_rate": 0.0009439799936993671, + "loss": 0.84679902, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.32250977, + "step": 923, + "time_per_iteration": 2.7226145267486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147891, + "balance_loss_mlp": 1.11494136, + "epoch": 0.17776067718353214, + "flos": 556060806144.0, + "grad_norm": 0.14883746036090706, + "language_loss": 0.88219315, + "learning_rate": 0.0009438366227636511, + "loss": 0.89367205, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.32958984, + "step": 924, + "time_per_iteration": 2.6409950256347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121758, + "balance_loss_mlp": 1.08950043, + "epoch": 0.1779530588687957, + "flos": 658161303552.0, + "grad_norm": 0.07347120708699749, + "language_loss": 0.85914218, + "learning_rate": 0.0009436930795152763, + "loss": 0.87035978, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.32250977, + "step": 925, + "time_per_iteration": 2.811595916748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107553, + "balance_loss_mlp": 1.07419825, + "epoch": 0.17814544055405926, + "flos": 644187644928.0, + "grad_norm": 0.07224950530739313, + "language_loss": 0.86246336, + "learning_rate": 0.0009435493640099713, + "loss": 0.87353885, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.33374023, + "step": 926, + "time_per_iteration": 2.775090456008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098588, + "balance_loss_mlp": 1.06513751, + "epoch": 0.17833782223932282, + "flos": 460672123392.0, + "grad_norm": 0.06608942550370576, + "language_loss": 0.83981788, + "learning_rate": 0.0009434054763035314, + "loss": 0.85080379, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.3347168, + "step": 927, + "time_per_iteration": 2.621683359146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089168, + "balance_loss_mlp": 1.05559874, + "epoch": 0.17853020392458638, + "flos": 759212766720.0, + "grad_norm": 0.06566794669431841, + "language_loss": 0.85671836, + "learning_rate": 0.0009432614164518185, + "loss": 0.86761004, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.33569336, + "step": 928, + "time_per_iteration": 3.011759042739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108639, + "balance_loss_mlp": 1.05320191, + "epoch": 0.17872258560984994, + "flos": 782320785408.0, + "grad_norm": 0.06622036101375141, + "language_loss": 0.84125841, + "learning_rate": 0.000943117184510762, + "loss": 0.85212231, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.33203125, + "step": 929, + "time_per_iteration": 2.9782960414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166041, + "balance_loss_mlp": 1.14010072, + "epoch": 0.1789149672951135, + "flos": 1459095482880.0, + "grad_norm": 0.044814265222739694, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79956007, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.25976562, + "step": 930, + "time_per_iteration": 5.011061906814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086461, + "balance_loss_mlp": 1.0529635, + "epoch": 0.17910734898037706, + "flos": 503598463488.0, + "grad_norm": 0.09835801245739735, + "language_loss": 0.88482547, + "learning_rate": 0.0009428282045846674, + "loss": 0.89569014, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.33520508, + "step": 931, + "time_per_iteration": 2.700901508331299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084257, + "balance_loss_mlp": 1.04899526, + "epoch": 0.17929973066564064, + "flos": 745895264256.0, + "grad_norm": 0.0790312068568768, + "language_loss": 0.88828444, + "learning_rate": 0.0009426834567118214, + "loss": 0.89912701, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.3527832, + "step": 932, + "time_per_iteration": 3.0847127437591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108852, + "balance_loss_mlp": 1.05557072, + "epoch": 0.1794921123509042, + "flos": 712875893760.0, + "grad_norm": 0.05851377965258845, + "language_loss": 0.80669105, + "learning_rate": 0.0009425385369740155, + "loss": 0.81757629, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.32958984, + "step": 933, + "time_per_iteration": 3.0405056476593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089624, + "balance_loss_mlp": 1.05517268, + "epoch": 0.17968449403616776, + "flos": 632838763008.0, + "grad_norm": 0.08098153489662575, + "language_loss": 0.86808264, + "learning_rate": 0.0009423934454275125, + "loss": 0.87897891, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.34472656, + "step": 934, + "time_per_iteration": 2.832589626312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090759, + "balance_loss_mlp": 1.05683184, + "epoch": 0.17987687572143132, + "flos": 536060602368.0, + "grad_norm": 0.0889712704970151, + "language_loss": 0.91607213, + "learning_rate": 0.0009422481821286418, + "loss": 0.92697972, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.33935547, + "step": 935, + "time_per_iteration": 2.739004611968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099753, + "balance_loss_mlp": 1.06589735, + "epoch": 0.18006925740669488, + "flos": 537818227200.0, + "grad_norm": 0.11621731552094582, + "language_loss": 0.87764728, + "learning_rate": 0.0009421027471337998, + "loss": 0.88864481, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.33886719, + "step": 936, + "time_per_iteration": 2.663978099822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093554, + "balance_loss_mlp": 1.06100953, + "epoch": 0.18026163909195844, + "flos": 539255757312.0, + "grad_norm": 0.08193839025260119, + "language_loss": 0.8197844, + "learning_rate": 0.0009419571404994493, + "loss": 0.83071995, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.32543945, + "step": 937, + "time_per_iteration": 2.680880308151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087323, + "balance_loss_mlp": 1.05427766, + "epoch": 0.180454020777222, + "flos": 500382959616.0, + "grad_norm": 0.08083617156557357, + "language_loss": 0.90250957, + "learning_rate": 0.00094181136228212, + "loss": 0.91338283, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.33056641, + "step": 938, + "time_per_iteration": 2.635734796524048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083703, + "balance_loss_mlp": 1.05134988, + "epoch": 0.18064640246248556, + "flos": 498689353728.0, + "grad_norm": 0.0738614516115471, + "language_loss": 0.85650909, + "learning_rate": 0.0009416654125384077, + "loss": 0.86734617, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.32348633, + "step": 939, + "time_per_iteration": 2.713120460510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092435, + "balance_loss_mlp": 1.06744874, + "epoch": 0.18083878414774912, + "flos": 1518572358144.0, + "grad_norm": 0.04310930319536216, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80864811, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.25, + "step": 940, + "time_per_iteration": 4.928712606430054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084719, + "balance_loss_mlp": 1.05372477, + "epoch": 0.1810311658330127, + "flos": 727006703616.0, + "grad_norm": 0.06379600043785322, + "language_loss": 0.83724225, + "learning_rate": 0.000941372998698552, + "loss": 0.84808946, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.30957031, + "step": 941, + "time_per_iteration": 2.9594616889953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092193, + "balance_loss_mlp": 1.0600785, + "epoch": 0.18122354751827627, + "flos": 564643726848.0, + "grad_norm": 0.07993905082854055, + "language_loss": 0.81844771, + "learning_rate": 0.0009412265347159336, + "loss": 0.82936954, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.32104492, + "step": 942, + "time_per_iteration": 2.705883741378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089966, + "balance_loss_mlp": 1.05847049, + "epoch": 0.18141592920353983, + "flos": 519024208896.0, + "grad_norm": 0.08204750484488939, + "language_loss": 0.84816301, + "learning_rate": 0.0009410798994339829, + "loss": 0.85906267, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.31469727, + "step": 943, + "time_per_iteration": 2.606898546218872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086355, + "balance_loss_mlp": 1.0538584, + "epoch": 0.1816083108888034, + "flos": 512219261952.0, + "grad_norm": 0.06564936273566103, + "language_loss": 0.88176167, + "learning_rate": 0.000940933092909628, + "loss": 0.89262521, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.32495117, + "step": 944, + "time_per_iteration": 2.568862199783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089679, + "balance_loss_mlp": 1.058375, + "epoch": 0.18180069257406695, + "flos": 492144864768.0, + "grad_norm": 0.06967818448900699, + "language_loss": 0.83546078, + "learning_rate": 0.0009407861151998649, + "loss": 0.84635758, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.31274414, + "step": 945, + "time_per_iteration": 2.5752463340759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108574, + "balance_loss_mlp": 1.05338621, + "epoch": 0.1819930742593305, + "flos": 569891870208.0, + "grad_norm": 0.07045774982796042, + "language_loss": 0.86168265, + "learning_rate": 0.0009406389663617552, + "loss": 0.87254012, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.32348633, + "step": 946, + "time_per_iteration": 2.6582529544830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084969, + "balance_loss_mlp": 1.05299747, + "epoch": 0.18218545594459407, + "flos": 605693168640.0, + "grad_norm": 0.08074656744529311, + "language_loss": 0.8540619, + "learning_rate": 0.000940491646452427, + "loss": 0.86491156, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.31958008, + "step": 947, + "time_per_iteration": 2.7117488384246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080922, + "balance_loss_mlp": 1.04914129, + "epoch": 0.18237783762985763, + "flos": 548419230720.0, + "grad_norm": 0.0614528539730692, + "language_loss": 0.90478814, + "learning_rate": 0.000940344155529075, + "loss": 0.91559744, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.31762695, + "step": 948, + "time_per_iteration": 2.675457239151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086239, + "balance_loss_mlp": 1.05472016, + "epoch": 0.1825702193151212, + "flos": 450509078016.0, + "grad_norm": 0.06480396750006864, + "language_loss": 0.8689037, + "learning_rate": 0.0009401964936489605, + "loss": 0.87976611, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.31494141, + "step": 949, + "time_per_iteration": 2.5517518520355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086362, + "balance_loss_mlp": 1.05446136, + "epoch": 0.18276260100038477, + "flos": 588962313216.0, + "grad_norm": 0.07386346522147075, + "language_loss": 0.84915626, + "learning_rate": 0.0009400486608694108, + "loss": 0.86001992, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.31884766, + "step": 950, + "time_per_iteration": 2.744371175765991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089338, + "balance_loss_mlp": 1.05769992, + "epoch": 0.18295498268564833, + "flos": 786988376064.0, + "grad_norm": 0.07193745080732644, + "language_loss": 0.86961377, + "learning_rate": 0.0009399006572478195, + "loss": 0.88050711, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.31616211, + "step": 951, + "time_per_iteration": 3.0956904888153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108625, + "balance_loss_mlp": 1.05427814, + "epoch": 0.1831473643709119, + "flos": 577878271488.0, + "grad_norm": 0.06892976413128309, + "language_loss": 0.90901303, + "learning_rate": 0.0009397524828416468, + "loss": 0.9198755, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.31958008, + "step": 952, + "time_per_iteration": 2.7130446434020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093503, + "balance_loss_mlp": 1.06179333, + "epoch": 0.18333974605617545, + "flos": 566622521856.0, + "grad_norm": 0.06752223069443862, + "language_loss": 0.96249408, + "learning_rate": 0.0009396041377084192, + "loss": 0.97342908, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.31689453, + "step": 953, + "time_per_iteration": 2.66972279548645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101927, + "balance_loss_mlp": 1.07043195, + "epoch": 0.183532127741439, + "flos": 526725421056.0, + "grad_norm": 0.07502219242723109, + "language_loss": 0.87290752, + "learning_rate": 0.0009394556219057295, + "loss": 0.88392681, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.31469727, + "step": 954, + "time_per_iteration": 2.659264326095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109814, + "balance_loss_mlp": 1.07810426, + "epoch": 0.18372450942670257, + "flos": 594259918848.0, + "grad_norm": 0.08651848853121004, + "language_loss": 0.8329587, + "learning_rate": 0.0009393069354912362, + "loss": 0.84405684, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.31689453, + "step": 955, + "time_per_iteration": 2.77437686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111165, + "balance_loss_mlp": 1.080266, + "epoch": 0.18391689111196613, + "flos": 644720145408.0, + "grad_norm": 0.07817657388257933, + "language_loss": 0.82119787, + "learning_rate": 0.0009391580785226649, + "loss": 0.83230954, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.30859375, + "step": 956, + "time_per_iteration": 2.867492437362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094598, + "balance_loss_mlp": 1.06903911, + "epoch": 0.18410927279722972, + "flos": 1456246563840.0, + "grad_norm": 0.05003344342080426, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.8043505, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.25585938, + "step": 957, + "time_per_iteration": 4.762399196624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108233, + "balance_loss_mlp": 1.07757246, + "epoch": 0.18430165448249328, + "flos": 658437728256.0, + "grad_norm": 0.06311489935861506, + "language_loss": 0.86409998, + "learning_rate": 0.0009388598531545196, + "loss": 0.87518233, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.30615234, + "step": 958, + "time_per_iteration": 2.8768551349639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102291, + "balance_loss_mlp": 1.07160664, + "epoch": 0.18449403616775684, + "flos": 517679811072.0, + "grad_norm": 0.07254101069499316, + "language_loss": 0.85046387, + "learning_rate": 0.000938710484870727, + "loss": 0.86148679, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.30639648, + "step": 959, + "time_per_iteration": 2.569592237472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123147, + "balance_loss_mlp": 1.09262919, + "epoch": 0.1846864178530204, + "flos": 552481537536.0, + "grad_norm": 0.07612110690317586, + "language_loss": 0.85695219, + "learning_rate": 0.0009385609462644189, + "loss": 0.86818361, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.3046875, + "step": 960, + "time_per_iteration": 2.6880924701690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127108, + "balance_loss_mlp": 1.09682918, + "epoch": 0.18487879953828396, + "flos": 465930441216.0, + "grad_norm": 0.08874671943740564, + "language_loss": 0.85532272, + "learning_rate": 0.0009384112373936514, + "loss": 0.86659384, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.30249023, + "step": 961, + "time_per_iteration": 2.6328110694885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117939, + "balance_loss_mlp": 1.08651531, + "epoch": 0.18507118122354752, + "flos": 648200489472.0, + "grad_norm": 0.0643111022382676, + "language_loss": 0.91187119, + "learning_rate": 0.0009382613583165467, + "loss": 0.92305064, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.31396484, + "step": 962, + "time_per_iteration": 2.7885348796844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116435, + "balance_loss_mlp": 1.08522642, + "epoch": 0.18526356290881107, + "flos": 626486330880.0, + "grad_norm": 0.08357757161984174, + "language_loss": 0.89136612, + "learning_rate": 0.0009381113090912928, + "loss": 0.90253055, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.31176758, + "step": 963, + "time_per_iteration": 2.7291858196258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109665, + "balance_loss_mlp": 1.07812214, + "epoch": 0.18545594459407463, + "flos": 432497843712.0, + "grad_norm": 0.08435952646587867, + "language_loss": 0.89444733, + "learning_rate": 0.000937961089776144, + "loss": 0.90554392, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.31518555, + "step": 964, + "time_per_iteration": 2.5736470222473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102378, + "balance_loss_mlp": 1.07090628, + "epoch": 0.1856483262793382, + "flos": 748720862208.0, + "grad_norm": 0.0989838613647617, + "language_loss": 0.82349026, + "learning_rate": 0.0009378107004294208, + "loss": 0.83451402, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.31445312, + "step": 965, + "time_per_iteration": 2.980569362640381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112627, + "balance_loss_mlp": 1.07982063, + "epoch": 0.18584070796460178, + "flos": 530058788352.0, + "grad_norm": 0.07592153009574268, + "language_loss": 0.91147316, + "learning_rate": 0.0009376601411095096, + "loss": 0.92259943, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.328125, + "step": 966, + "time_per_iteration": 2.6635591983795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136435, + "balance_loss_mlp": 1.10331881, + "epoch": 0.18603308964986534, + "flos": 482863527936.0, + "grad_norm": 0.16243248674453353, + "language_loss": 0.86357069, + "learning_rate": 0.0009375094118748622, + "loss": 0.87493503, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.33129883, + "step": 967, + "time_per_iteration": 2.522481679916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157231, + "balance_loss_mlp": 1.12368488, + "epoch": 0.1862254713351289, + "flos": 800976591360.0, + "grad_norm": 0.09362045292578998, + "language_loss": 0.90268016, + "learning_rate": 0.0009373585127839976, + "loss": 0.9142524, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.33544922, + "step": 968, + "time_per_iteration": 2.97210693359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152937, + "balance_loss_mlp": 1.1203692, + "epoch": 0.18641785302039246, + "flos": 478082456064.0, + "grad_norm": 0.0858654394488603, + "language_loss": 0.90605009, + "learning_rate": 0.0009372074438954994, + "loss": 0.91757941, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.32568359, + "step": 969, + "time_per_iteration": 2.541006088256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143226, + "balance_loss_mlp": 1.11030006, + "epoch": 0.18661023470565602, + "flos": 388695587328.0, + "grad_norm": 0.08996217866854661, + "language_loss": 0.91142356, + "learning_rate": 0.0009370562052680181, + "loss": 0.92285585, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.3293457, + "step": 970, + "time_per_iteration": 2.4985642433166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113885, + "balance_loss_mlp": 1.0805068, + "epoch": 0.18680261639091958, + "flos": 564402207744.0, + "grad_norm": 0.07707645065684006, + "language_loss": 0.88999593, + "learning_rate": 0.0009369047969602695, + "loss": 0.90113479, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.33398438, + "step": 971, + "time_per_iteration": 2.7079591751098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109357, + "balance_loss_mlp": 1.05985761, + "epoch": 0.18699499807618314, + "flos": 479018009088.0, + "grad_norm": 0.28998936625974164, + "language_loss": 0.86178541, + "learning_rate": 0.0009367532190310357, + "loss": 0.87272114, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.33740234, + "step": 972, + "time_per_iteration": 2.5647881031036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090596, + "balance_loss_mlp": 1.05535769, + "epoch": 0.1871873797614467, + "flos": 553022802432.0, + "grad_norm": 0.12045660132436305, + "language_loss": 0.89086068, + "learning_rate": 0.0009366014715391644, + "loss": 0.90176666, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.3527832, + "step": 973, + "time_per_iteration": 2.670271396636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098789, + "balance_loss_mlp": 1.06316936, + "epoch": 0.18737976144671029, + "flos": 552526617600.0, + "grad_norm": 0.06161121065256625, + "language_loss": 0.83607596, + "learning_rate": 0.0009364495545435693, + "loss": 0.84706378, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.35644531, + "step": 974, + "time_per_iteration": 2.7562968730926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115355, + "balance_loss_mlp": 1.08068919, + "epoch": 0.18757214313197385, + "flos": 502002372096.0, + "grad_norm": 0.0775906753320085, + "language_loss": 0.88572645, + "learning_rate": 0.0009362974681032297, + "loss": 0.89688003, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.34692383, + "step": 975, + "time_per_iteration": 2.618015766143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115715, + "balance_loss_mlp": 1.08102489, + "epoch": 0.1877645248172374, + "flos": 674691337728.0, + "grad_norm": 0.0743374582836454, + "language_loss": 0.87880743, + "learning_rate": 0.0009361452122771907, + "loss": 0.88996458, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.34716797, + "step": 976, + "time_per_iteration": 2.8973281383514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111997, + "balance_loss_mlp": 1.07754576, + "epoch": 0.18795690650250096, + "flos": 404771696640.0, + "grad_norm": 0.09294234225416288, + "language_loss": 0.83035111, + "learning_rate": 0.0009359927871245635, + "loss": 0.84147108, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.34472656, + "step": 977, + "time_per_iteration": 2.4868054389953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113361, + "balance_loss_mlp": 1.079983, + "epoch": 0.18814928818776452, + "flos": 637599485952.0, + "grad_norm": 0.08516170058225998, + "language_loss": 0.86584175, + "learning_rate": 0.0009358401927045246, + "loss": 0.87697542, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.33398438, + "step": 978, + "time_per_iteration": 2.8482747077941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110398, + "balance_loss_mlp": 1.07160234, + "epoch": 0.18834166987302808, + "flos": 1137825446400.0, + "grad_norm": 0.09204359799181126, + "language_loss": 0.88258326, + "learning_rate": 0.0009356874290763166, + "loss": 0.89362299, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.32373047, + "step": 979, + "time_per_iteration": 3.4733643531799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097908, + "balance_loss_mlp": 1.06529236, + "epoch": 0.18853405155829164, + "flos": 504538398720.0, + "grad_norm": 0.0915662715535259, + "language_loss": 0.88419032, + "learning_rate": 0.0009355344962992474, + "loss": 0.89516938, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.32617188, + "step": 980, + "time_per_iteration": 2.650907039642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098514, + "balance_loss_mlp": 1.06606519, + "epoch": 0.1887264332435552, + "flos": 607879987200.0, + "grad_norm": 0.13079327807375027, + "language_loss": 0.87520993, + "learning_rate": 0.0009353813944326908, + "loss": 0.88619506, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.32446289, + "step": 981, + "time_per_iteration": 2.937286138534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090956, + "balance_loss_mlp": 1.05845952, + "epoch": 0.1889188149288188, + "flos": 552264749568.0, + "grad_norm": 0.0755425770798311, + "language_loss": 0.82502437, + "learning_rate": 0.0009352281235360863, + "loss": 0.83593392, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.32495117, + "step": 982, + "time_per_iteration": 2.6979949474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096297, + "balance_loss_mlp": 1.06380093, + "epoch": 0.18911119661408235, + "flos": 418332128256.0, + "grad_norm": 0.0751009418062393, + "language_loss": 0.8470037, + "learning_rate": 0.0009350746836689389, + "loss": 0.85796672, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.32495117, + "step": 983, + "time_per_iteration": 2.538175582885742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131236, + "balance_loss_mlp": 1.10624993, + "epoch": 0.1893035782993459, + "flos": 1481141320704.0, + "grad_norm": 0.036870034223354546, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82570457, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.24902344, + "step": 984, + "time_per_iteration": 4.979044198989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097045, + "balance_loss_mlp": 1.0640955, + "epoch": 0.18949595998460947, + "flos": 508220974080.0, + "grad_norm": 0.0642225711410905, + "language_loss": 0.82250404, + "learning_rate": 0.0009347672972613634, + "loss": 0.83347452, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.3293457, + "step": 985, + "time_per_iteration": 2.593069553375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086897, + "balance_loss_mlp": 1.05339909, + "epoch": 0.18968834166987303, + "flos": 530812459008.0, + "grad_norm": 0.0802805585104316, + "language_loss": 0.85205728, + "learning_rate": 0.0009346133508402735, + "loss": 0.86292624, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.33520508, + "step": 986, + "time_per_iteration": 2.68485426902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095707, + "balance_loss_mlp": 1.06216192, + "epoch": 0.1898807233551366, + "flos": 499515807744.0, + "grad_norm": 0.09481546728284458, + "language_loss": 0.84014487, + "learning_rate": 0.0009344592356873166, + "loss": 0.85110188, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.33544922, + "step": 987, + "time_per_iteration": 2.6432511806488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105999, + "balance_loss_mlp": 1.07068968, + "epoch": 0.19007310504040015, + "flos": 601936399872.0, + "grad_norm": 0.06245857415063817, + "language_loss": 0.78166318, + "learning_rate": 0.0009343049518623255, + "loss": 0.79272318, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.35327148, + "step": 988, + "time_per_iteration": 2.7121620178222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120949, + "balance_loss_mlp": 1.085639, + "epoch": 0.1902654867256637, + "flos": 601374786048.0, + "grad_norm": 0.05952536728335112, + "language_loss": 0.83312774, + "learning_rate": 0.0009341504994251985, + "loss": 0.84433722, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.35327148, + "step": 989, + "time_per_iteration": 2.852208375930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107971, + "balance_loss_mlp": 1.05224383, + "epoch": 0.19045786841092727, + "flos": 1574925147648.0, + "grad_norm": 0.03692041129742979, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74600208, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.27539062, + "step": 990, + "time_per_iteration": 4.994582414627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137443, + "balance_loss_mlp": 1.09991539, + "epoch": 0.19065025009619085, + "flos": 681280906752.0, + "grad_norm": 0.056855766240422066, + "language_loss": 0.81516898, + "learning_rate": 0.0009338410889544574, + "loss": 0.82654339, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.37524414, + "step": 991, + "time_per_iteration": 3.017310380935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011468, + "balance_loss_mlp": 1.10831964, + "epoch": 0.1908426317814544, + "flos": 601971305472.0, + "grad_norm": 0.07195285392178245, + "language_loss": 0.87761319, + "learning_rate": 0.000933686131040967, + "loss": 0.88908118, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.38427734, + "step": 992, + "time_per_iteration": 2.8278160095214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144526, + "balance_loss_mlp": 1.10726154, + "epoch": 0.19103501346671797, + "flos": 586027616256.0, + "grad_norm": 0.07034922378143431, + "language_loss": 0.90235877, + "learning_rate": 0.0009335310047555883, + "loss": 0.91380405, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.37255859, + "step": 993, + "time_per_iteration": 2.8100597858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114172, + "balance_loss_mlp": 1.1050992, + "epoch": 0.19122739515198153, + "flos": 545494708224.0, + "grad_norm": 0.06860817272021875, + "language_loss": 0.88542485, + "learning_rate": 0.0009333757101585467, + "loss": 0.896842, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.36621094, + "step": 994, + "time_per_iteration": 2.6616106033325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131089, + "balance_loss_mlp": 1.0961132, + "epoch": 0.1914197768372451, + "flos": 521171739648.0, + "grad_norm": 0.0687364291234037, + "language_loss": 0.9324351, + "learning_rate": 0.0009332202473101329, + "loss": 0.94374597, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.3503418, + "step": 995, + "time_per_iteration": 2.7158730030059814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128597, + "balance_loss_mlp": 1.09400272, + "epoch": 0.19161215852250865, + "flos": 610961660928.0, + "grad_norm": 0.07471533178048465, + "language_loss": 0.82843316, + "learning_rate": 0.0009330646162707028, + "loss": 0.83971918, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.34619141, + "step": 996, + "time_per_iteration": 2.7293272018432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111164, + "balance_loss_mlp": 1.07730889, + "epoch": 0.1918045402077722, + "flos": 846281806848.0, + "grad_norm": 0.05994533952598048, + "language_loss": 0.84315574, + "learning_rate": 0.0009329088171006779, + "loss": 0.85426736, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.33886719, + "step": 997, + "time_per_iteration": 3.140655517578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110059, + "balance_loss_mlp": 1.07672858, + "epoch": 0.19199692189303577, + "flos": 465699096576.0, + "grad_norm": 0.06034276327327584, + "language_loss": 0.85438752, + "learning_rate": 0.0009327528498605446, + "loss": 0.86548805, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.33349609, + "step": 998, + "time_per_iteration": 2.5440673828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111119, + "balance_loss_mlp": 1.0778836, + "epoch": 0.19218930357829936, + "flos": 531318818304.0, + "grad_norm": 0.07596013514481052, + "language_loss": 0.89179873, + "learning_rate": 0.0009325967146108548, + "loss": 0.90290987, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.33251953, + "step": 999, + "time_per_iteration": 2.658561944961548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110413, + "balance_loss_mlp": 1.07856011, + "epoch": 0.19238168526356292, + "flos": 601350054912.0, + "grad_norm": 0.07750808981236326, + "language_loss": 0.8717553, + "learning_rate": 0.0009324404114122258, + "loss": 0.88285947, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.31835938, + "step": 1000, + "time_per_iteration": 2.7275264263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107767, + "balance_loss_mlp": 1.07667685, + "epoch": 0.19257406694882648, + "flos": 571690192896.0, + "grad_norm": 0.11937061799335263, + "language_loss": 0.86227536, + "learning_rate": 0.0009322839403253397, + "loss": 0.873353, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.31054688, + "step": 1001, + "time_per_iteration": 2.788405656814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110663, + "balance_loss_mlp": 1.0798831, + "epoch": 0.19276644863409004, + "flos": 801478568448.0, + "grad_norm": 0.07054171225662055, + "language_loss": 0.84055525, + "learning_rate": 0.0009321273014109439, + "loss": 0.85166192, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.30737305, + "step": 1002, + "time_per_iteration": 2.942535877227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110952, + "balance_loss_mlp": 1.0799818, + "epoch": 0.1929588303193536, + "flos": 563024314368.0, + "grad_norm": 0.057550289991663166, + "language_loss": 0.84200853, + "learning_rate": 0.0009319704947298513, + "loss": 0.85311806, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.30932617, + "step": 1003, + "time_per_iteration": 2.919499158859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110832, + "balance_loss_mlp": 1.07933664, + "epoch": 0.19315121200461716, + "flos": 626550349824.0, + "grad_norm": 0.07245253176429253, + "language_loss": 0.88662004, + "learning_rate": 0.0009318135203429393, + "loss": 0.89772838, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.31469727, + "step": 1004, + "time_per_iteration": 2.7168095111846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118727, + "balance_loss_mlp": 1.08770871, + "epoch": 0.19334359368988072, + "flos": 517169069568.0, + "grad_norm": 0.17670411464250102, + "language_loss": 0.8771624, + "learning_rate": 0.0009316563783111511, + "loss": 0.88834965, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.30981445, + "step": 1005, + "time_per_iteration": 2.7140395641326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116793, + "balance_loss_mlp": 1.08501196, + "epoch": 0.19353597537514428, + "flos": 693751606272.0, + "grad_norm": 0.08689807004334223, + "language_loss": 0.81857723, + "learning_rate": 0.0009314990686954943, + "loss": 0.82974517, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.31762695, + "step": 1006, + "time_per_iteration": 2.904555320739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106968, + "balance_loss_mlp": 1.07482958, + "epoch": 0.19372835706040784, + "flos": 1209665180160.0, + "grad_norm": 0.05703714693088015, + "language_loss": 0.80953801, + "learning_rate": 0.000931341591557042, + "loss": 0.82060766, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.32128906, + "step": 1007, + "time_per_iteration": 3.6937167644500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092779, + "balance_loss_mlp": 1.06078339, + "epoch": 0.19392073874567142, + "flos": 520368606720.0, + "grad_norm": 0.08309123344760973, + "language_loss": 0.87180555, + "learning_rate": 0.0009311839469569325, + "loss": 0.88273335, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.31982422, + "step": 1008, + "time_per_iteration": 2.6189959049224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099869, + "balance_loss_mlp": 1.06746829, + "epoch": 0.19411312043093498, + "flos": 588543293952.0, + "grad_norm": 0.10100018073420348, + "language_loss": 0.8730033, + "learning_rate": 0.0009310261349563687, + "loss": 0.88400197, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.32397461, + "step": 1009, + "time_per_iteration": 2.6890206336975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108302, + "balance_loss_mlp": 1.07516217, + "epoch": 0.19430550211619854, + "flos": 579085867008.0, + "grad_norm": 0.08933629042911205, + "language_loss": 0.85340321, + "learning_rate": 0.0009308681556166186, + "loss": 0.86448622, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.33154297, + "step": 1010, + "time_per_iteration": 2.824448585510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098531, + "balance_loss_mlp": 1.06546259, + "epoch": 0.1944978838014621, + "flos": 620848281600.0, + "grad_norm": 0.16096270434238172, + "language_loss": 0.87149101, + "learning_rate": 0.0009307100089990152, + "loss": 0.88247633, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.33081055, + "step": 1011, + "time_per_iteration": 2.74092173576355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105966, + "balance_loss_mlp": 1.07070398, + "epoch": 0.19469026548672566, + "flos": 598440089088.0, + "grad_norm": 0.08074644620093238, + "language_loss": 0.83646113, + "learning_rate": 0.0009305516951649568, + "loss": 0.84752083, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.3527832, + "step": 1012, + "time_per_iteration": 2.7069194316864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110286, + "balance_loss_mlp": 1.06726432, + "epoch": 0.19488264717198922, + "flos": 551890810368.0, + "grad_norm": 0.06954368088501534, + "language_loss": 0.86469871, + "learning_rate": 0.0009303932141759057, + "loss": 0.8757273, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.35595703, + "step": 1013, + "time_per_iteration": 2.7547597885131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109212, + "balance_loss_mlp": 1.07352042, + "epoch": 0.19507502885725278, + "flos": 665842166784.0, + "grad_norm": 0.08663105683367789, + "language_loss": 0.83731425, + "learning_rate": 0.0009302345660933902, + "loss": 0.84840637, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.35742188, + "step": 1014, + "time_per_iteration": 2.789421319961548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120247, + "balance_loss_mlp": 1.0850327, + "epoch": 0.19526741054251634, + "flos": 670771625472.0, + "grad_norm": 0.07248055996229082, + "language_loss": 0.85224003, + "learning_rate": 0.0009300757509790026, + "loss": 0.86344242, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.35229492, + "step": 1015, + "time_per_iteration": 2.8293235301971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138983, + "balance_loss_mlp": 1.10412574, + "epoch": 0.19545979222777993, + "flos": 446983653888.0, + "grad_norm": 0.08486300836715333, + "language_loss": 0.90133542, + "learning_rate": 0.0009299167688944005, + "loss": 0.91272521, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.34912109, + "step": 1016, + "time_per_iteration": 2.5042884349823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130604, + "balance_loss_mlp": 1.09453082, + "epoch": 0.1956521739130435, + "flos": 568813722624.0, + "grad_norm": 0.08182270058547457, + "language_loss": 0.86074531, + "learning_rate": 0.0009297576199013063, + "loss": 0.87205136, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.36108398, + "step": 1017, + "time_per_iteration": 2.678986072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01400492, + "balance_loss_mlp": 1.36921108, + "epoch": 0.19584455559830705, + "flos": 1454969157120.0, + "grad_norm": 0.11724614930420041, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74402618, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.3125, + "step": 1018, + "time_per_iteration": 4.915104627609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214569, + "balance_loss_mlp": 1.18538666, + "epoch": 0.1960369372835706, + "flos": 1590320369664.0, + "grad_norm": 0.08011150215373515, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.8064087, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.29101562, + "step": 1019, + "time_per_iteration": 5.440853834152222 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100562, + "balance_loss_mlp": 1.06565762, + "epoch": 0.19622931896883417, + "flos": 615709237248.0, + "grad_norm": 0.05949147024105531, + "language_loss": 0.86637676, + "learning_rate": 0.0009292791720892659, + "loss": 0.8773824, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.34960938, + "step": 1020, + "time_per_iteration": 2.8909873962402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110075, + "balance_loss_mlp": 1.06534433, + "epoch": 0.19642170065409773, + "flos": 465950790144.0, + "grad_norm": 0.08017401986968183, + "language_loss": 0.8851831, + "learning_rate": 0.0009291193560807218, + "loss": 0.89619064, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.35424805, + "step": 1021, + "time_per_iteration": 2.5876846313476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108676, + "balance_loss_mlp": 1.07329464, + "epoch": 0.19661408233936128, + "flos": 515040477696.0, + "grad_norm": 0.061421548763730266, + "language_loss": 0.86832839, + "learning_rate": 0.0009289593734732688, + "loss": 0.87941515, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.35400391, + "step": 1022, + "time_per_iteration": 2.5790083408355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116018, + "balance_loss_mlp": 1.08097017, + "epoch": 0.19680646402462484, + "flos": 392427624960.0, + "grad_norm": 0.06446420344630455, + "language_loss": 0.93862659, + "learning_rate": 0.0009287992243290175, + "loss": 0.94978678, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.35083008, + "step": 1023, + "time_per_iteration": 2.474393844604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126064, + "balance_loss_mlp": 1.09120703, + "epoch": 0.19699884570988843, + "flos": 626122566144.0, + "grad_norm": 0.06850198630338038, + "language_loss": 0.90312016, + "learning_rate": 0.0009286389087101435, + "loss": 0.91438079, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.34887695, + "step": 1024, + "time_per_iteration": 2.835756540298462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143856, + "balance_loss_mlp": 1.10885596, + "epoch": 0.197191227395152, + "flos": 557710742016.0, + "grad_norm": 0.06824019021489727, + "language_loss": 0.88388735, + "learning_rate": 0.0009284784266788864, + "loss": 0.8953259, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.3503418, + "step": 1025, + "time_per_iteration": 2.702479839324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144786, + "balance_loss_mlp": 1.11033428, + "epoch": 0.19738360908041555, + "flos": 664681061376.0, + "grad_norm": 0.08832519553576638, + "language_loss": 0.92221844, + "learning_rate": 0.0009283177782975512, + "loss": 0.93366635, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.34472656, + "step": 1026, + "time_per_iteration": 2.9851789474487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132889, + "balance_loss_mlp": 1.09850955, + "epoch": 0.1975759907656791, + "flos": 522244094976.0, + "grad_norm": 0.07134152927872167, + "language_loss": 0.87642545, + "learning_rate": 0.000928156963628507, + "loss": 0.88775432, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.34423828, + "step": 1027, + "time_per_iteration": 2.61114239692688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131535, + "balance_loss_mlp": 1.09686899, + "epoch": 0.19776837245094267, + "flos": 462233309184.0, + "grad_norm": 0.0723355054215018, + "language_loss": 0.88370252, + "learning_rate": 0.0009279959827341877, + "loss": 0.8950178, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.34692383, + "step": 1028, + "time_per_iteration": 2.7794618606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118297, + "balance_loss_mlp": 1.08248627, + "epoch": 0.19796075413620623, + "flos": 502809887232.0, + "grad_norm": 0.08314527790784168, + "language_loss": 0.87832725, + "learning_rate": 0.0009278348356770915, + "loss": 0.88951027, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.3581543, + "step": 1029, + "time_per_iteration": 2.5507349967956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111156, + "balance_loss_mlp": 1.07565451, + "epoch": 0.1981531358214698, + "flos": 507281038848.0, + "grad_norm": 0.08630189211983, + "language_loss": 0.85379845, + "learning_rate": 0.0009276735225197814, + "loss": 0.864914, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.359375, + "step": 1030, + "time_per_iteration": 2.597379207611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102081, + "balance_loss_mlp": 1.06650949, + "epoch": 0.19834551750673335, + "flos": 531275148288.0, + "grad_norm": 0.0907652175310469, + "language_loss": 0.85545719, + "learning_rate": 0.0009275120433248847, + "loss": 0.86647797, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.35571289, + "step": 1031, + "time_per_iteration": 2.687185287475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110602, + "balance_loss_mlp": 1.07545948, + "epoch": 0.1985378991919969, + "flos": 775147691520.0, + "grad_norm": 0.07461022440082729, + "language_loss": 0.85621846, + "learning_rate": 0.0009273503981550931, + "loss": 0.86732447, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.35205078, + "step": 1032, + "time_per_iteration": 3.0693371295928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101575, + "balance_loss_mlp": 1.06586027, + "epoch": 0.1987302808772605, + "flos": 434063411712.0, + "grad_norm": 0.15106160662845974, + "language_loss": 0.86904788, + "learning_rate": 0.0009271885870731626, + "loss": 0.88006359, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.35717773, + "step": 1033, + "time_per_iteration": 2.506413459777832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111392, + "balance_loss_mlp": 1.07536733, + "epoch": 0.19892266256252406, + "flos": 553342897152.0, + "grad_norm": 0.08761306204685197, + "language_loss": 0.88616383, + "learning_rate": 0.0009270266101419143, + "loss": 0.89727777, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.36035156, + "step": 1034, + "time_per_iteration": 2.6036899089813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098535, + "balance_loss_mlp": 1.06425047, + "epoch": 0.19911504424778761, + "flos": 549596302848.0, + "grad_norm": 0.06384965023316368, + "language_loss": 0.84987146, + "learning_rate": 0.0009268644674242328, + "loss": 0.86085683, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.34301758, + "step": 1035, + "time_per_iteration": 2.7015764713287354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113397, + "balance_loss_mlp": 1.07806361, + "epoch": 0.19930742593305117, + "flos": 518024636928.0, + "grad_norm": 0.07882877348480413, + "language_loss": 0.80515361, + "learning_rate": 0.0009267021589830678, + "loss": 0.81628758, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.35327148, + "step": 1036, + "time_per_iteration": 2.643951892852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01709033, + "balance_loss_mlp": 1.66611803, + "epoch": 0.19949980761831473, + "flos": 1508516849664.0, + "grad_norm": 0.11391778300632174, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.79336113, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.4296875, + "step": 1037, + "time_per_iteration": 4.949443101882935 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103323, + "balance_loss_mlp": 1.0683465, + "epoch": 0.1996921893035783, + "flos": 697803738624.0, + "grad_norm": 0.08774205983796875, + "language_loss": 0.92838657, + "learning_rate": 0.000926377045182406, + "loss": 0.93941981, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.35009766, + "step": 1038, + "time_per_iteration": 2.9512856006622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112905, + "balance_loss_mlp": 1.07821524, + "epoch": 0.19988457098884185, + "flos": 726682226688.0, + "grad_norm": 0.06255968137292814, + "language_loss": 0.87761998, + "learning_rate": 0.0009262142399491296, + "loss": 0.888749, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.34716797, + "step": 1039, + "time_per_iteration": 3.0552709102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112668, + "balance_loss_mlp": 1.09187126, + "epoch": 0.2000769526741054, + "flos": 560275881984.0, + "grad_norm": 0.06862779420362043, + "language_loss": 0.87532222, + "learning_rate": 0.0009260512692448105, + "loss": 0.88658899, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.34863281, + "step": 1040, + "time_per_iteration": 2.6962392330169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140867, + "balance_loss_mlp": 1.10615349, + "epoch": 0.200269334359369, + "flos": 571758594048.0, + "grad_norm": 0.07166596959521815, + "language_loss": 0.84091032, + "learning_rate": 0.000925888133132719, + "loss": 0.852319, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.34741211, + "step": 1041, + "time_per_iteration": 2.791015148162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01724521, + "balance_loss_mlp": 1.67225933, + "epoch": 0.20046171604463256, + "flos": 1485362340864.0, + "grad_norm": 0.16089622263247963, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.8133496, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.5234375, + "step": 1042, + "time_per_iteration": 4.978717565536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116458, + "balance_loss_mlp": 1.08169639, + "epoch": 0.20065409772989612, + "flos": 496266808320.0, + "grad_norm": 0.06766738281342395, + "language_loss": 0.80769098, + "learning_rate": 0.0009255613649386244, + "loss": 0.81885552, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.34790039, + "step": 1043, + "time_per_iteration": 2.6604766845703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122505, + "balance_loss_mlp": 1.08709943, + "epoch": 0.20084647941515968, + "flos": 579094631424.0, + "grad_norm": 0.07361728486384381, + "language_loss": 0.78999138, + "learning_rate": 0.0009253977329834838, + "loss": 0.80121642, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.35449219, + "step": 1044, + "time_per_iteration": 2.7036681175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108398, + "balance_loss_mlp": 1.07227719, + "epoch": 0.20103886110042324, + "flos": 641775273984.0, + "grad_norm": 0.08623717161971375, + "language_loss": 0.86596096, + "learning_rate": 0.0009252339358742965, + "loss": 0.87704492, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.36108398, + "step": 1045, + "time_per_iteration": 2.874620199203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118791, + "balance_loss_mlp": 1.08369565, + "epoch": 0.2012312427856868, + "flos": 441720953856.0, + "grad_norm": 0.06963930913543727, + "language_loss": 0.82984746, + "learning_rate": 0.000925069973674654, + "loss": 0.84103537, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.35107422, + "step": 1046, + "time_per_iteration": 2.628878116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106078, + "balance_loss_mlp": 1.07017231, + "epoch": 0.20142362447095036, + "flos": 554135855616.0, + "grad_norm": 0.07870556033127275, + "language_loss": 0.88610631, + "learning_rate": 0.000924905846448212, + "loss": 0.89716709, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.35913086, + "step": 1047, + "time_per_iteration": 2.747220754623413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109682, + "balance_loss_mlp": 1.0750165, + "epoch": 0.20161600615621392, + "flos": 669988841472.0, + "grad_norm": 0.10747792176710873, + "language_loss": 0.85372317, + "learning_rate": 0.0009247415542586906, + "loss": 0.86482, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.34667969, + "step": 1048, + "time_per_iteration": 2.8556973934173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119311, + "balance_loss_mlp": 1.08285666, + "epoch": 0.2018083878414775, + "flos": 572788689408.0, + "grad_norm": 0.2214820598260846, + "language_loss": 0.83177209, + "learning_rate": 0.0009245770971698735, + "loss": 0.84296525, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.36450195, + "step": 1049, + "time_per_iteration": 2.9050869941711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132964, + "balance_loss_mlp": 1.09798741, + "epoch": 0.20200076952674106, + "flos": 425624495616.0, + "grad_norm": 0.08175342307012821, + "language_loss": 0.88327754, + "learning_rate": 0.0009244124752456087, + "loss": 0.89460719, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.34985352, + "step": 1050, + "time_per_iteration": 2.5141613483428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151097, + "balance_loss_mlp": 1.11557305, + "epoch": 0.20219315121200462, + "flos": 536326852608.0, + "grad_norm": 0.06393011823673703, + "language_loss": 0.85371649, + "learning_rate": 0.0009242476885498081, + "loss": 0.86522746, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.35522461, + "step": 1051, + "time_per_iteration": 2.727687358856201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176333, + "balance_loss_mlp": 1.14171457, + "epoch": 0.20238553289726818, + "flos": 477634323456.0, + "grad_norm": 0.09914193731013146, + "language_loss": 0.80802011, + "learning_rate": 0.0009240827371464474, + "loss": 0.81978351, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.34643555, + "step": 1052, + "time_per_iteration": 2.552121877670288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191475, + "balance_loss_mlp": 1.15521157, + "epoch": 0.20257791458253174, + "flos": 1151611430400.0, + "grad_norm": 0.1023503287046967, + "language_loss": 0.83863074, + "learning_rate": 0.0009239176210995666, + "loss": 0.85054547, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.36230469, + "step": 1053, + "time_per_iteration": 3.47882342338562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190284, + "balance_loss_mlp": 1.15561819, + "epoch": 0.2027702962677953, + "flos": 666606011904.0, + "grad_norm": 0.09115683042396579, + "language_loss": 0.93677175, + "learning_rate": 0.0009237523404732695, + "loss": 0.94867456, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.34692383, + "step": 1054, + "time_per_iteration": 2.8701720237731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173476, + "balance_loss_mlp": 1.13838029, + "epoch": 0.20296267795305886, + "flos": 641011428864.0, + "grad_norm": 0.10782024136876088, + "language_loss": 0.8421399, + "learning_rate": 0.0009235868953317235, + "loss": 0.85387468, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.3515625, + "step": 1055, + "time_per_iteration": 2.8210723400115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161281, + "balance_loss_mlp": 1.12682986, + "epoch": 0.20315505963832242, + "flos": 930187777536.0, + "grad_norm": 0.07346272336072437, + "language_loss": 0.85227096, + "learning_rate": 0.0009234212857391602, + "loss": 0.86388373, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.3449707, + "step": 1056, + "time_per_iteration": 3.2212936878204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153084, + "balance_loss_mlp": 1.11727369, + "epoch": 0.20334744132358598, + "flos": 561818128896.0, + "grad_norm": 0.054845505201833546, + "language_loss": 0.89240777, + "learning_rate": 0.000923255511759875, + "loss": 0.90393853, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.3581543, + "step": 1057, + "time_per_iteration": 2.834444522857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156175, + "balance_loss_mlp": 1.12146115, + "epoch": 0.20353982300884957, + "flos": 643902455808.0, + "grad_norm": 0.10969304378799022, + "language_loss": 0.84913409, + "learning_rate": 0.000923089573458227, + "loss": 0.86069584, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.34716797, + "step": 1058, + "time_per_iteration": 2.8832740783691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115137, + "balance_loss_mlp": 1.1168946, + "epoch": 0.20373220469411313, + "flos": 651101690880.0, + "grad_norm": 0.24205150411640483, + "language_loss": 0.83790255, + "learning_rate": 0.0009229234708986392, + "loss": 0.84941626, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.3449707, + "step": 1059, + "time_per_iteration": 2.8837289810180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01633401, + "balance_loss_mlp": 1.57885134, + "epoch": 0.2039245863793767, + "flos": 1436939136000.0, + "grad_norm": 0.08953482343612705, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.83300292, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.546875, + "step": 1060, + "time_per_iteration": 4.667459011077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158699, + "balance_loss_mlp": 1.1247009, + "epoch": 0.20411696806464025, + "flos": 596678082048.0, + "grad_norm": 0.0736942782322193, + "language_loss": 0.84963936, + "learning_rate": 0.0009225907732636548, + "loss": 0.86122632, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.34033203, + "step": 1061, + "time_per_iteration": 2.7532095909118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164545, + "balance_loss_mlp": 1.12954497, + "epoch": 0.2043093497499038, + "flos": 573530775552.0, + "grad_norm": 0.09512005659435491, + "language_loss": 0.8641578, + "learning_rate": 0.0009224241783174227, + "loss": 0.87580323, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.35009766, + "step": 1062, + "time_per_iteration": 2.683047294616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147761, + "balance_loss_mlp": 1.11347604, + "epoch": 0.20450173143516737, + "flos": 630061217280.0, + "grad_norm": 0.07955707081408017, + "language_loss": 0.85456479, + "learning_rate": 0.0009222574193715802, + "loss": 0.86604244, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.34326172, + "step": 1063, + "time_per_iteration": 2.8293161392211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139319, + "balance_loss_mlp": 1.10474837, + "epoch": 0.20469411312043093, + "flos": 573718450176.0, + "grad_norm": 0.08617592440024102, + "language_loss": 0.85715151, + "learning_rate": 0.000922090496490869, + "loss": 0.8685447, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.34619141, + "step": 1064, + "time_per_iteration": 2.749298334121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124198, + "balance_loss_mlp": 1.08865011, + "epoch": 0.20488649480569449, + "flos": 636748300800.0, + "grad_norm": 0.06572729358097257, + "language_loss": 0.89767212, + "learning_rate": 0.0009219234097400937, + "loss": 0.90891409, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.35595703, + "step": 1065, + "time_per_iteration": 2.8508355617523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107138, + "balance_loss_mlp": 1.07175696, + "epoch": 0.20507887649095807, + "flos": 975383894016.0, + "grad_norm": 0.05918330788086957, + "language_loss": 0.82970631, + "learning_rate": 0.0009217561591841237, + "loss": 0.8407777, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.35400391, + "step": 1066, + "time_per_iteration": 3.3216452598571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102073, + "balance_loss_mlp": 1.06566656, + "epoch": 0.20527125817622163, + "flos": 485940819456.0, + "grad_norm": 0.09526156176010836, + "language_loss": 0.81088316, + "learning_rate": 0.0009215887448878913, + "loss": 0.82190394, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.36401367, + "step": 1067, + "time_per_iteration": 2.596022129058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099777, + "balance_loss_mlp": 1.06191611, + "epoch": 0.2054636398614852, + "flos": 526921860096.0, + "grad_norm": 0.072135210200994, + "language_loss": 0.84963661, + "learning_rate": 0.0009214211669163922, + "loss": 0.86063439, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.37841797, + "step": 1068, + "time_per_iteration": 4.440082311630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096187, + "balance_loss_mlp": 1.05923223, + "epoch": 0.20565602154674875, + "flos": 557898416640.0, + "grad_norm": 0.07010547570027807, + "language_loss": 0.93398243, + "learning_rate": 0.0009212534253346862, + "loss": 0.94494426, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.36938477, + "step": 1069, + "time_per_iteration": 2.699843406677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096083, + "balance_loss_mlp": 1.05912852, + "epoch": 0.2058484032320123, + "flos": 503976784896.0, + "grad_norm": 0.07799270520419531, + "language_loss": 0.83685625, + "learning_rate": 0.0009210855202078964, + "loss": 0.84781706, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.36962891, + "step": 1070, + "time_per_iteration": 2.5999720096588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010932, + "balance_loss_mlp": 1.05810475, + "epoch": 0.20604078491727587, + "flos": 432950358528.0, + "grad_norm": 0.0723710550133871, + "language_loss": 0.86933672, + "learning_rate": 0.0009209174516012091, + "loss": 0.88026869, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.35131836, + "step": 1071, + "time_per_iteration": 2.503551483154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092567, + "balance_loss_mlp": 1.05794883, + "epoch": 0.20623316660253943, + "flos": 608421252096.0, + "grad_norm": 0.05962541016594441, + "language_loss": 0.88928151, + "learning_rate": 0.0009207492195798747, + "loss": 0.90020716, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.34667969, + "step": 1072, + "time_per_iteration": 2.8607378005981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094226, + "balance_loss_mlp": 1.05972731, + "epoch": 0.206425548287803, + "flos": 480184906752.0, + "grad_norm": 0.06398863953592046, + "language_loss": 0.84846818, + "learning_rate": 0.0009205808242092061, + "loss": 0.85941041, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.34521484, + "step": 1073, + "time_per_iteration": 2.644134044647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095063, + "balance_loss_mlp": 1.06080186, + "epoch": 0.20661792997306658, + "flos": 949007937024.0, + "grad_norm": 0.06666861242543158, + "language_loss": 0.82488537, + "learning_rate": 0.0009204122655545808, + "loss": 0.83583593, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.34277344, + "step": 1074, + "time_per_iteration": 3.3254919052124023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110011, + "balance_loss_mlp": 1.07582152, + "epoch": 0.20681031165833014, + "flos": 603206604288.0, + "grad_norm": 0.0719401545163873, + "language_loss": 0.81125832, + "learning_rate": 0.0009202435436814388, + "loss": 0.82235849, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.34228516, + "step": 1075, + "time_per_iteration": 2.704252243041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105303, + "balance_loss_mlp": 1.0707798, + "epoch": 0.2070026933435937, + "flos": 708665200128.0, + "grad_norm": 0.06775779875999222, + "language_loss": 0.89715004, + "learning_rate": 0.0009200746586552836, + "loss": 0.90820301, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.34545898, + "step": 1076, + "time_per_iteration": 2.897177219390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102531, + "balance_loss_mlp": 1.06869972, + "epoch": 0.20719507502885726, + "flos": 829456409088.0, + "grad_norm": 0.12065235325240355, + "language_loss": 0.83624744, + "learning_rate": 0.0009199056105416825, + "loss": 0.84727275, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.33862305, + "step": 1077, + "time_per_iteration": 3.0771028995513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106086, + "balance_loss_mlp": 1.07218289, + "epoch": 0.20738745671412082, + "flos": 637993774080.0, + "grad_norm": 0.06486814220319007, + "language_loss": 0.8622663, + "learning_rate": 0.0009197363994062654, + "loss": 0.8733272, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.33935547, + "step": 1078, + "time_per_iteration": 2.807009696960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112785, + "balance_loss_mlp": 1.07914448, + "epoch": 0.20757983839938438, + "flos": 685258845696.0, + "grad_norm": 0.06985523034062016, + "language_loss": 0.84313667, + "learning_rate": 0.0009195670253147262, + "loss": 0.85426456, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.33642578, + "step": 1079, + "time_per_iteration": 2.9738564491271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114515, + "balance_loss_mlp": 1.0817802, + "epoch": 0.20777222008464794, + "flos": 519024208896.0, + "grad_norm": 0.09202653272357895, + "language_loss": 0.81912923, + "learning_rate": 0.0009193974883328216, + "loss": 0.8302744, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.32739258, + "step": 1080, + "time_per_iteration": 2.639878511428833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121501, + "balance_loss_mlp": 1.08721614, + "epoch": 0.2079646017699115, + "flos": 511136732160.0, + "grad_norm": 0.059797822691547486, + "language_loss": 0.86745334, + "learning_rate": 0.0009192277885263718, + "loss": 0.87866837, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.34326172, + "step": 1081, + "time_per_iteration": 4.060026407241821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120473, + "balance_loss_mlp": 1.08671248, + "epoch": 0.20815698345517505, + "flos": 931409929728.0, + "grad_norm": 0.0682125291941454, + "language_loss": 0.86169523, + "learning_rate": 0.0009190579259612602, + "loss": 0.87289995, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.33789062, + "step": 1082, + "time_per_iteration": 3.2795815467834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134326, + "balance_loss_mlp": 1.10132933, + "epoch": 0.20834936514043864, + "flos": 632114205696.0, + "grad_norm": 0.06852391956291448, + "language_loss": 0.86675245, + "learning_rate": 0.000918887900703433, + "loss": 0.87809569, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.33007812, + "step": 1083, + "time_per_iteration": 2.813777208328247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137242, + "balance_loss_mlp": 1.1025995, + "epoch": 0.2085417468257022, + "flos": 394170693120.0, + "grad_norm": 0.07184608102087402, + "language_loss": 0.90139276, + "learning_rate": 0.0009187177128188999, + "loss": 0.91276515, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.34667969, + "step": 1084, + "time_per_iteration": 2.4950854778289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01361857, + "balance_loss_mlp": 1.30883229, + "epoch": 0.20873412851096576, + "flos": 1401387969024.0, + "grad_norm": 0.057507491560350586, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78518397, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.53125, + "step": 1085, + "time_per_iteration": 4.9323132038116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116843, + "balance_loss_mlp": 1.08279717, + "epoch": 0.20892651019622932, + "flos": 447599112192.0, + "grad_norm": 0.0734883897044225, + "language_loss": 0.85634506, + "learning_rate": 0.000918376849434071, + "loss": 0.86751348, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.34057617, + "step": 1086, + "time_per_iteration": 2.504467487335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110856, + "balance_loss_mlp": 1.07680964, + "epoch": 0.20911889188149288, + "flos": 492863629824.0, + "grad_norm": 0.07305298195252904, + "language_loss": 0.90630972, + "learning_rate": 0.0009182061740661098, + "loss": 0.91741836, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.34057617, + "step": 1087, + "time_per_iteration": 2.5760254859924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111346, + "balance_loss_mlp": 1.0785315, + "epoch": 0.20931127356675644, + "flos": 840928946688.0, + "grad_norm": 0.05349746945174757, + "language_loss": 0.84760422, + "learning_rate": 0.0009180353363361127, + "loss": 0.85873878, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.34912109, + "step": 1088, + "time_per_iteration": 3.0988333225250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111767, + "balance_loss_mlp": 1.07593286, + "epoch": 0.20950365525202, + "flos": 756796013568.0, + "grad_norm": 0.0658577902216117, + "language_loss": 0.81715566, + "learning_rate": 0.0009178643363104044, + "loss": 0.82827336, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.35864258, + "step": 1089, + "time_per_iteration": 3.1410629749298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106557, + "balance_loss_mlp": 1.07155704, + "epoch": 0.20969603693728356, + "flos": 472301812224.0, + "grad_norm": 0.10460691940838339, + "language_loss": 0.90569937, + "learning_rate": 0.0009176931740553735, + "loss": 0.91676497, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.35009766, + "step": 1090, + "time_per_iteration": 2.529330253601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112911, + "balance_loss_mlp": 1.07698107, + "epoch": 0.20988841862254715, + "flos": 976507121664.0, + "grad_norm": 0.07113631656774884, + "language_loss": 0.82557011, + "learning_rate": 0.0009175218496374708, + "loss": 0.83669925, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.359375, + "step": 1091, + "time_per_iteration": 3.347742795944214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110472, + "balance_loss_mlp": 1.07356465, + "epoch": 0.2100808003078107, + "flos": 1092697731072.0, + "grad_norm": 0.08284412758413852, + "language_loss": 0.85813856, + "learning_rate": 0.0009173503631232103, + "loss": 0.86924326, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.36914062, + "step": 1092, + "time_per_iteration": 3.378859758377075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103445, + "balance_loss_mlp": 1.06684804, + "epoch": 0.21027318199307427, + "flos": 1012567468032.0, + "grad_norm": 0.09413161778101656, + "language_loss": 0.81595004, + "learning_rate": 0.0009171787145791691, + "loss": 0.82698447, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.36621094, + "step": 1093, + "time_per_iteration": 3.215574026107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099602, + "balance_loss_mlp": 1.06214595, + "epoch": 0.21046556367833782, + "flos": 521141216256.0, + "grad_norm": 0.0806437411167059, + "language_loss": 0.80327773, + "learning_rate": 0.000917006904071987, + "loss": 0.81427377, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.37451172, + "step": 1094, + "time_per_iteration": 2.6117537021636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100105, + "balance_loss_mlp": 1.06377053, + "epoch": 0.21065794536360138, + "flos": 603437948928.0, + "grad_norm": 0.08991830585001004, + "language_loss": 0.87576157, + "learning_rate": 0.0009168349316683669, + "loss": 0.88676262, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.36352539, + "step": 1095, + "time_per_iteration": 2.740950345993042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101819, + "balance_loss_mlp": 1.06650949, + "epoch": 0.21085032704886494, + "flos": 603045070848.0, + "grad_norm": 0.06267137937039592, + "language_loss": 0.8218863, + "learning_rate": 0.0009166627974350741, + "loss": 0.83290446, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.35327148, + "step": 1096, + "time_per_iteration": 2.887326240539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098665, + "balance_loss_mlp": 1.06206763, + "epoch": 0.2110427087341285, + "flos": 637382697984.0, + "grad_norm": 0.07019696164219995, + "language_loss": 0.89238816, + "learning_rate": 0.0009164905014389373, + "loss": 0.90337479, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.3659668, + "step": 1097, + "time_per_iteration": 2.7609455585479736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105326, + "balance_loss_mlp": 1.06908655, + "epoch": 0.21123509041939206, + "flos": 522667496448.0, + "grad_norm": 0.06528725154368942, + "language_loss": 0.8638711, + "learning_rate": 0.0009163180437468476, + "loss": 0.87492442, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.36254883, + "step": 1098, + "time_per_iteration": 2.5998973846435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096402, + "balance_loss_mlp": 1.06009042, + "epoch": 0.21142747210465565, + "flos": 450938271744.0, + "grad_norm": 0.06547964129234486, + "language_loss": 0.85908926, + "learning_rate": 0.000916145424425759, + "loss": 0.87005323, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.36303711, + "step": 1099, + "time_per_iteration": 2.6804425716400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101989, + "balance_loss_mlp": 1.06601155, + "epoch": 0.2116198537899192, + "flos": 875813630976.0, + "grad_norm": 0.08063804967749887, + "language_loss": 0.90475744, + "learning_rate": 0.0009159726435426885, + "loss": 0.91577733, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.35986328, + "step": 1100, + "time_per_iteration": 3.1017394065856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100921, + "balance_loss_mlp": 1.06499124, + "epoch": 0.21181223547518277, + "flos": 523410992640.0, + "grad_norm": 0.08023517310436831, + "language_loss": 0.90250683, + "learning_rate": 0.0009157997011647154, + "loss": 0.9135161, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.359375, + "step": 1101, + "time_per_iteration": 2.5878560543060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096622, + "balance_loss_mlp": 1.06045425, + "epoch": 0.21200461716044633, + "flos": 572014669824.0, + "grad_norm": 0.05508329212621071, + "language_loss": 0.86001104, + "learning_rate": 0.0009156265973589817, + "loss": 0.87097728, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.36206055, + "step": 1102, + "time_per_iteration": 2.7933261394500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097006, + "balance_loss_mlp": 1.06121981, + "epoch": 0.2121969988457099, + "flos": 544869075456.0, + "grad_norm": 0.06583201442001711, + "language_loss": 0.89802408, + "learning_rate": 0.0009154533321926926, + "loss": 0.90899414, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.35791016, + "step": 1103, + "time_per_iteration": 2.647494316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096343, + "balance_loss_mlp": 1.0598892, + "epoch": 0.21238938053097345, + "flos": 843489704448.0, + "grad_norm": 0.06603869229078199, + "language_loss": 0.87027407, + "learning_rate": 0.0009152799057331156, + "loss": 0.88123751, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.36499023, + "step": 1104, + "time_per_iteration": 3.1623916625976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097231, + "balance_loss_mlp": 1.06134939, + "epoch": 0.212581762216237, + "flos": 445984081920.0, + "grad_norm": 0.07161611233178561, + "language_loss": 0.90831178, + "learning_rate": 0.0009151063180475805, + "loss": 0.91928405, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.35913086, + "step": 1105, + "time_per_iteration": 2.5515594482421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099591, + "balance_loss_mlp": 1.06516361, + "epoch": 0.21277414390150057, + "flos": 514129655808.0, + "grad_norm": 0.08899576142412509, + "language_loss": 0.83941323, + "learning_rate": 0.0009149325692034803, + "loss": 0.85040915, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.34472656, + "step": 1106, + "time_per_iteration": 2.561875343322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300575, + "balance_loss_mlp": 1.25708735, + "epoch": 0.21296652558676413, + "flos": 1484790552576.0, + "grad_norm": 0.05662804479307553, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80504, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.43554688, + "step": 1107, + "time_per_iteration": 4.880220174789429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104038, + "balance_loss_mlp": 1.06870413, + "epoch": 0.21315890727202771, + "flos": 845689669632.0, + "grad_norm": 0.06711298172071122, + "language_loss": 0.87037283, + "learning_rate": 0.0009145845883094678, + "loss": 0.88141322, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.35375977, + "step": 1108, + "time_per_iteration": 3.0598409175872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105103, + "balance_loss_mlp": 1.06931639, + "epoch": 0.21335128895729127, + "flos": 629086376448.0, + "grad_norm": 0.06803775359788228, + "language_loss": 0.8464098, + "learning_rate": 0.000914410356394654, + "loss": 0.85746086, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.35839844, + "step": 1109, + "time_per_iteration": 2.776258945465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103906, + "balance_loss_mlp": 1.06799972, + "epoch": 0.21354367064255483, + "flos": 710649787392.0, + "grad_norm": 0.052025780444459935, + "language_loss": 0.84733951, + "learning_rate": 0.0009142359635914709, + "loss": 0.85837853, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.35913086, + "step": 1110, + "time_per_iteration": 3.057307243347168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096278, + "balance_loss_mlp": 1.05996692, + "epoch": 0.2137360523278184, + "flos": 455950688256.0, + "grad_norm": 0.10914443694781037, + "language_loss": 0.84286684, + "learning_rate": 0.0009140614099676245, + "loss": 0.85382962, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.36328125, + "step": 1111, + "time_per_iteration": 2.6110692024230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087884, + "balance_loss_mlp": 1.0517633, + "epoch": 0.21392843401308195, + "flos": 665749034496.0, + "grad_norm": 0.09545242357915729, + "language_loss": 0.82540983, + "learning_rate": 0.0009138866955908821, + "loss": 0.83628869, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.36132812, + "step": 1112, + "time_per_iteration": 2.870765209197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100291, + "balance_loss_mlp": 1.06445658, + "epoch": 0.2141208156983455, + "flos": 748656843264.0, + "grad_norm": 0.06321568237144509, + "language_loss": 0.8048408, + "learning_rate": 0.0009137118205290738, + "loss": 0.8158437, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.35864258, + "step": 1113, + "time_per_iteration": 4.381570100784302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097039, + "balance_loss_mlp": 1.06091869, + "epoch": 0.21431319738360907, + "flos": 418898124288.0, + "grad_norm": 0.06328361159326604, + "language_loss": 0.89779603, + "learning_rate": 0.0009135367848500924, + "loss": 0.90876651, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.36157227, + "step": 1114, + "time_per_iteration": 2.511164665222168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096629, + "balance_loss_mlp": 1.06034184, + "epoch": 0.21450557906887263, + "flos": 608849035776.0, + "grad_norm": 0.08987717155463379, + "language_loss": 0.86417669, + "learning_rate": 0.0009133615886218927, + "loss": 0.87514299, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.36303711, + "step": 1115, + "time_per_iteration": 2.7101125717163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089806, + "balance_loss_mlp": 1.05337584, + "epoch": 0.21469796075413622, + "flos": 561649393152.0, + "grad_norm": 0.07119429557645003, + "language_loss": 0.87869287, + "learning_rate": 0.0009131862319124917, + "loss": 0.88959092, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.36425781, + "step": 1116, + "time_per_iteration": 2.6387155055999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092992, + "balance_loss_mlp": 1.05648971, + "epoch": 0.21489034243939978, + "flos": 594363225600.0, + "grad_norm": 0.06965010238630005, + "language_loss": 0.83447617, + "learning_rate": 0.0009130107147899691, + "loss": 0.84540606, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.36499023, + "step": 1117, + "time_per_iteration": 2.723092794418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094347, + "balance_loss_mlp": 1.05805993, + "epoch": 0.21508272412466334, + "flos": 441661317120.0, + "grad_norm": 0.055087901571477416, + "language_loss": 0.84983969, + "learning_rate": 0.0009128350373224665, + "loss": 0.8607831, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.36352539, + "step": 1118, + "time_per_iteration": 2.5449509620666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178954, + "balance_loss_mlp": 1.14500344, + "epoch": 0.2152751058099269, + "flos": 1495397348352.0, + "grad_norm": 0.021865185871831474, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82635385, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.33984375, + "step": 1119, + "time_per_iteration": 4.641271114349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103175, + "balance_loss_mlp": 1.06648207, + "epoch": 0.21546748749519046, + "flos": 493759895040.0, + "grad_norm": 0.07523243301623007, + "language_loss": 0.85678464, + "learning_rate": 0.0009124832016254005, + "loss": 0.86781639, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.36694336, + "step": 1120, + "time_per_iteration": 2.655371904373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109795, + "balance_loss_mlp": 1.06163859, + "epoch": 0.21565986918045402, + "flos": 634241387520.0, + "grad_norm": 0.07092227494936269, + "language_loss": 0.87677884, + "learning_rate": 0.0009123070435324316, + "loss": 0.88775837, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.36352539, + "step": 1121, + "time_per_iteration": 2.777632236480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166186, + "balance_loss_mlp": 1.13337982, + "epoch": 0.21585225086571758, + "flos": 1582502704128.0, + "grad_norm": 0.01899876446696313, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.7904197, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.328125, + "step": 1122, + "time_per_iteration": 4.966520547866821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089218, + "balance_loss_mlp": 1.0522635, + "epoch": 0.21604463255098114, + "flos": 683799556608.0, + "grad_norm": 0.060329223802114536, + "language_loss": 0.86415493, + "learning_rate": 0.0009119542471995752, + "loss": 0.87504709, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.36938477, + "step": 1123, + "time_per_iteration": 2.8373889923095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090311, + "balance_loss_mlp": 1.05438125, + "epoch": 0.2162370142362447, + "flos": 780660675072.0, + "grad_norm": 0.06176848453484022, + "language_loss": 0.81323773, + "learning_rate": 0.0009117776090966554, + "loss": 0.82414079, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.359375, + "step": 1124, + "time_per_iteration": 2.999127149581909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087355, + "balance_loss_mlp": 1.0507102, + "epoch": 0.21642939592150828, + "flos": 1001745294336.0, + "grad_norm": 0.07470238986110685, + "language_loss": 0.86757743, + "learning_rate": 0.0009116008111274899, + "loss": 0.87845105, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.36669922, + "step": 1125, + "time_per_iteration": 3.3534371852874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160744, + "balance_loss_mlp": 1.13022673, + "epoch": 0.21662177760677184, + "flos": 1481867440128.0, + "grad_norm": 0.021433456679081614, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80267668, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.3046875, + "step": 1126, + "time_per_iteration": 4.8522608280181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086571, + "balance_loss_mlp": 1.04975939, + "epoch": 0.2168141592920354, + "flos": 887030092800.0, + "grad_norm": 0.07895568764354688, + "language_loss": 0.85050654, + "learning_rate": 0.0009112467358650396, + "loss": 0.86137229, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.36816406, + "step": 1127, + "time_per_iteration": 3.157684803009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090796, + "balance_loss_mlp": 1.05472374, + "epoch": 0.21700654097729896, + "flos": 545682382848.0, + "grad_norm": 0.05660039583272807, + "language_loss": 0.86175025, + "learning_rate": 0.0009110694587092192, + "loss": 0.87265825, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.36108398, + "step": 1128, + "time_per_iteration": 2.755575656890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088206, + "balance_loss_mlp": 1.052562, + "epoch": 0.21719892266256252, + "flos": 509270008320.0, + "grad_norm": 0.077592311143443, + "language_loss": 0.81304091, + "learning_rate": 0.0009108920219620815, + "loss": 0.82392299, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.35693359, + "step": 1129, + "time_per_iteration": 2.639261484146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091101, + "balance_loss_mlp": 1.05548096, + "epoch": 0.21739130434782608, + "flos": 543150738432.0, + "grad_norm": 0.06998872933736075, + "language_loss": 0.8949976, + "learning_rate": 0.0009107144256925133, + "loss": 0.90590858, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.35620117, + "step": 1130, + "time_per_iteration": 2.685058832168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096157, + "balance_loss_mlp": 1.0606091, + "epoch": 0.21758368603308964, + "flos": 616564804608.0, + "grad_norm": 0.08228743876345572, + "language_loss": 0.81527102, + "learning_rate": 0.0009105366699694638, + "loss": 0.82623267, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.35546875, + "step": 1131, + "time_per_iteration": 2.726532220840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087405, + "balance_loss_mlp": 1.0526911, + "epoch": 0.2177760677183532, + "flos": 634813175808.0, + "grad_norm": 0.05363867293402688, + "language_loss": 0.81731898, + "learning_rate": 0.0009103587548619439, + "loss": 0.82819301, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.34741211, + "step": 1132, + "time_per_iteration": 2.856782913208008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096099, + "balance_loss_mlp": 1.05978799, + "epoch": 0.2179684494036168, + "flos": 532181587968.0, + "grad_norm": 0.0659512575968049, + "language_loss": 0.85836411, + "learning_rate": 0.0009101806804390261, + "loss": 0.8693251, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.36328125, + "step": 1133, + "time_per_iteration": 2.789860725402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093043, + "balance_loss_mlp": 1.056494, + "epoch": 0.21816083108888035, + "flos": 474980433408.0, + "grad_norm": 0.06887538910693401, + "language_loss": 0.90261114, + "learning_rate": 0.0009100024467698453, + "loss": 0.91354156, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.3659668, + "step": 1134, + "time_per_iteration": 2.6074166297912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095464, + "balance_loss_mlp": 1.05786586, + "epoch": 0.2183532127741439, + "flos": 577198794240.0, + "grad_norm": 0.07516267041517319, + "language_loss": 0.82424915, + "learning_rate": 0.0009098240539235981, + "loss": 0.83520383, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.37573242, + "step": 1135, + "time_per_iteration": 2.6695401668548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095721, + "balance_loss_mlp": 1.05809808, + "epoch": 0.21854559445940747, + "flos": 593832135168.0, + "grad_norm": 0.07818229339121877, + "language_loss": 0.87811279, + "learning_rate": 0.0009096455019695423, + "loss": 0.88907003, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.3762207, + "step": 1136, + "time_per_iteration": 4.259606838226318 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088969, + "balance_loss_mlp": 1.05180001, + "epoch": 0.21873797614467103, + "flos": 408464446464.0, + "grad_norm": 0.07138569527580692, + "language_loss": 0.89539087, + "learning_rate": 0.000909466790976998, + "loss": 0.90628058, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.37182617, + "step": 1137, + "time_per_iteration": 2.4586610794067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086709, + "balance_loss_mlp": 1.0483948, + "epoch": 0.21893035782993459, + "flos": 893824865280.0, + "grad_norm": 0.07428895088203294, + "language_loss": 0.82083362, + "learning_rate": 0.0009092879210153473, + "loss": 0.83170068, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.38305664, + "step": 1138, + "time_per_iteration": 3.097928524017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087285, + "balance_loss_mlp": 1.04944801, + "epoch": 0.21912273951519814, + "flos": 467392702464.0, + "grad_norm": 0.07001266476470332, + "language_loss": 0.88581419, + "learning_rate": 0.0009091088921540333, + "loss": 0.89668703, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.37817383, + "step": 1139, + "time_per_iteration": 2.5904369354248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138075, + "balance_loss_mlp": 1.11270714, + "epoch": 0.2193151212004617, + "flos": 1531262665728.0, + "grad_norm": 0.032290681216211516, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76646751, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.25390625, + "step": 1140, + "time_per_iteration": 4.913591623306274 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090399, + "balance_loss_mlp": 1.05353999, + "epoch": 0.2195075028857253, + "flos": 590901820416.0, + "grad_norm": 0.1397659602768512, + "language_loss": 0.84288347, + "learning_rate": 0.0009087503580104985, + "loss": 0.85378748, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.36865234, + "step": 1141, + "time_per_iteration": 2.6825575828552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103171, + "balance_loss_mlp": 1.06602514, + "epoch": 0.21969988457098885, + "flos": 636033917952.0, + "grad_norm": 0.0722566511462073, + "language_loss": 0.79141879, + "learning_rate": 0.0009085708528674728, + "loss": 0.80245048, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.37133789, + "step": 1142, + "time_per_iteration": 2.8078551292419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104306, + "balance_loss_mlp": 1.06551528, + "epoch": 0.2198922662562524, + "flos": 911974311936.0, + "grad_norm": 0.06720954872782575, + "language_loss": 0.8638975, + "learning_rate": 0.0009083911891031745, + "loss": 0.87494051, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.38793945, + "step": 1143, + "time_per_iteration": 3.1356892585754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110101, + "balance_loss_mlp": 1.07328963, + "epoch": 0.22008464794151597, + "flos": 822603409920.0, + "grad_norm": 0.08162422903338651, + "language_loss": 0.91253042, + "learning_rate": 0.0009082113667873553, + "loss": 0.92363143, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.3684082, + "step": 1144, + "time_per_iteration": 3.1446304321289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112165, + "balance_loss_mlp": 1.07387483, + "epoch": 0.22027702962677953, + "flos": 459416475648.0, + "grad_norm": 0.0676762249982335, + "language_loss": 0.90471655, + "learning_rate": 0.0009080313859898283, + "loss": 0.91583818, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.38256836, + "step": 1145, + "time_per_iteration": 2.5298025608062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110814, + "balance_loss_mlp": 1.07082736, + "epoch": 0.2204694113120431, + "flos": 530998723584.0, + "grad_norm": 0.13336101787368373, + "language_loss": 0.91929018, + "learning_rate": 0.0009078512467804684, + "loss": 0.93037164, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.37304688, + "step": 1146, + "time_per_iteration": 2.6156158447265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105973, + "balance_loss_mlp": 1.06882787, + "epoch": 0.22066179299730665, + "flos": 522382307328.0, + "grad_norm": 0.06165136945539885, + "language_loss": 0.89993024, + "learning_rate": 0.0009076709492292119, + "loss": 0.91098994, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.37133789, + "step": 1147, + "time_per_iteration": 2.617534875869751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095299, + "balance_loss_mlp": 1.06032324, + "epoch": 0.2208541746825702, + "flos": 546188742144.0, + "grad_norm": 0.11177878536303132, + "language_loss": 0.88637269, + "learning_rate": 0.0009074904934060562, + "loss": 0.89732569, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.34985352, + "step": 1148, + "time_per_iteration": 2.6782190799713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086783, + "balance_loss_mlp": 1.05237889, + "epoch": 0.22104655636783377, + "flos": 708404742144.0, + "grad_norm": 0.0637571078176039, + "language_loss": 0.84905714, + "learning_rate": 0.0009073098793810607, + "loss": 0.85992491, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.34423828, + "step": 1149, + "time_per_iteration": 2.956638813018799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085311, + "balance_loss_mlp": 1.04969168, + "epoch": 0.22123893805309736, + "flos": 584594468352.0, + "grad_norm": 0.07731387173425769, + "language_loss": 0.8803097, + "learning_rate": 0.000907129107224346, + "loss": 0.89116287, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.35595703, + "step": 1150, + "time_per_iteration": 2.724456548690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081381, + "balance_loss_mlp": 1.04623771, + "epoch": 0.22143131973836092, + "flos": 492002270208.0, + "grad_norm": 0.0527541061714234, + "language_loss": 0.88156152, + "learning_rate": 0.0009069481770060939, + "loss": 0.89237529, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.35180664, + "step": 1151, + "time_per_iteration": 2.6539950370788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084376, + "balance_loss_mlp": 1.04811299, + "epoch": 0.22162370142362448, + "flos": 1079227459584.0, + "grad_norm": 0.06610336138884995, + "language_loss": 0.83768857, + "learning_rate": 0.000906767088796548, + "loss": 0.84853232, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.36279297, + "step": 1152, + "time_per_iteration": 3.4304041862487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087423, + "balance_loss_mlp": 1.05147004, + "epoch": 0.22181608310888803, + "flos": 492258345984.0, + "grad_norm": 0.06692160227790218, + "language_loss": 0.87012255, + "learning_rate": 0.0009065858426660127, + "loss": 0.88099682, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.35986328, + "step": 1153, + "time_per_iteration": 2.639326333999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089801, + "balance_loss_mlp": 1.05480099, + "epoch": 0.2220084647941516, + "flos": 723687892992.0, + "grad_norm": 0.07963844060104928, + "language_loss": 0.84658396, + "learning_rate": 0.0009064044386848543, + "loss": 0.85748196, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.3503418, + "step": 1154, + "time_per_iteration": 2.904387950897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094705, + "balance_loss_mlp": 1.05992007, + "epoch": 0.22220084647941515, + "flos": 488988997632.0, + "grad_norm": 0.07985092329826342, + "language_loss": 0.88786525, + "learning_rate": 0.0009062228769234997, + "loss": 0.89881229, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.34838867, + "step": 1155, + "time_per_iteration": 2.547041416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095087, + "balance_loss_mlp": 1.05977738, + "epoch": 0.2223932281646787, + "flos": 536025696768.0, + "grad_norm": 0.067267193175655, + "language_loss": 0.80872244, + "learning_rate": 0.0009060411574524376, + "loss": 0.81967336, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.35327148, + "step": 1156, + "time_per_iteration": 2.647101402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100356, + "balance_loss_mlp": 1.06561852, + "epoch": 0.22258560984994227, + "flos": 931034580480.0, + "grad_norm": 0.07018019580992392, + "language_loss": 0.87947989, + "learning_rate": 0.0009058592803422178, + "loss": 0.8904835, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.34765625, + "step": 1157, + "time_per_iteration": 3.161827564239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087783, + "balance_loss_mlp": 1.05688405, + "epoch": 0.22277799153520586, + "flos": 1198998443520.0, + "grad_norm": 0.0269537140509509, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79798073, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.30859375, + "step": 1158, + "time_per_iteration": 4.827271223068237 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100633, + "balance_loss_mlp": 1.06608617, + "epoch": 0.22297037322046942, + "flos": 501052262400.0, + "grad_norm": 0.10870396219255896, + "language_loss": 0.89957273, + "learning_rate": 0.00090549505348681, + "loss": 0.91057909, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.34594727, + "step": 1159, + "time_per_iteration": 2.5724213123321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115899, + "balance_loss_mlp": 1.08144796, + "epoch": 0.22316275490573298, + "flos": 752413612032.0, + "grad_norm": 0.06607938149323832, + "language_loss": 0.83976638, + "learning_rate": 0.0009053127038830275, + "loss": 0.85092539, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.3449707, + "step": 1160, + "time_per_iteration": 2.979442834854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108838, + "balance_loss_mlp": 1.07538772, + "epoch": 0.22335513659099654, + "flos": 514553057280.0, + "grad_norm": 0.07010640296313479, + "language_loss": 0.86946774, + "learning_rate": 0.000905130196922898, + "loss": 0.88055611, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.3347168, + "step": 1161, + "time_per_iteration": 2.582780361175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114166, + "balance_loss_mlp": 1.0797379, + "epoch": 0.2235475182762601, + "flos": 484286501376.0, + "grad_norm": 0.056850955952103474, + "language_loss": 0.86954904, + "learning_rate": 0.0009049475326772769, + "loss": 0.88069069, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.34472656, + "step": 1162, + "time_per_iteration": 2.572434902191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116085, + "balance_loss_mlp": 1.08270645, + "epoch": 0.22373989996152366, + "flos": 469698794496.0, + "grad_norm": 0.07142312953148652, + "language_loss": 0.82233834, + "learning_rate": 0.0009047647112170811, + "loss": 0.83349919, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.33398438, + "step": 1163, + "time_per_iteration": 2.7467033863067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104962, + "balance_loss_mlp": 1.07115388, + "epoch": 0.22393228164678722, + "flos": 1270512594432.0, + "grad_norm": 0.07009650422776509, + "language_loss": 0.87291974, + "learning_rate": 0.0009045817326132876, + "loss": 0.88396937, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.33837891, + "step": 1164, + "time_per_iteration": 3.6699986457824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096597, + "balance_loss_mlp": 1.06150198, + "epoch": 0.22412466333205078, + "flos": 596052449280.0, + "grad_norm": 0.07687995911666942, + "language_loss": 0.8312459, + "learning_rate": 0.0009043985969369357, + "loss": 0.84221184, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.35131836, + "step": 1165, + "time_per_iteration": 2.8716225624084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099545, + "balance_loss_mlp": 1.06461644, + "epoch": 0.22431704501731436, + "flos": 608136062976.0, + "grad_norm": 0.062241931717823204, + "language_loss": 0.84419966, + "learning_rate": 0.0009042153042591245, + "loss": 0.85519511, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.34960938, + "step": 1166, + "time_per_iteration": 2.8038439750671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094194, + "balance_loss_mlp": 1.05971861, + "epoch": 0.22450942670257792, + "flos": 906203842560.0, + "grad_norm": 0.05754676867835885, + "language_loss": 0.85229421, + "learning_rate": 0.0009040318546510146, + "loss": 0.86323619, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.3449707, + "step": 1167, + "time_per_iteration": 3.166391372680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101578, + "balance_loss_mlp": 1.06672144, + "epoch": 0.22470180838784148, + "flos": 565032222720.0, + "grad_norm": 0.06328547350255756, + "language_loss": 0.84822267, + "learning_rate": 0.0009038482481838275, + "loss": 0.85923845, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.34887695, + "step": 1168, + "time_per_iteration": 2.6582534313201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092575, + "balance_loss_mlp": 1.05726552, + "epoch": 0.22489419007310504, + "flos": 834109443072.0, + "grad_norm": 0.05398415615287821, + "language_loss": 0.8685748, + "learning_rate": 0.0009036644849288455, + "loss": 0.87950051, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.35327148, + "step": 1169, + "time_per_iteration": 3.131391763687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102043, + "balance_loss_mlp": 1.06735337, + "epoch": 0.2250865717583686, + "flos": 580788237312.0, + "grad_norm": 0.06156740204868492, + "language_loss": 0.85189641, + "learning_rate": 0.0009034805649574118, + "loss": 0.86291689, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.34716797, + "step": 1170, + "time_per_iteration": 2.662177801132202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093313, + "balance_loss_mlp": 1.05991113, + "epoch": 0.22527895344363216, + "flos": 600091435008.0, + "grad_norm": 0.07489985201842045, + "language_loss": 0.85256809, + "learning_rate": 0.0009032964883409308, + "loss": 0.86350119, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.33422852, + "step": 1171, + "time_per_iteration": 2.872305393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102332, + "balance_loss_mlp": 0.9971894, + "epoch": 0.22547133512889572, + "flos": 1440009073152.0, + "grad_norm": 0.01784679187957182, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74073857, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.26171875, + "step": 1172, + "time_per_iteration": 4.968618154525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090705, + "balance_loss_mlp": 1.05649197, + "epoch": 0.22566371681415928, + "flos": 490377065472.0, + "grad_norm": 0.05674331384718379, + "language_loss": 0.87210125, + "learning_rate": 0.0009029278654587462, + "loss": 0.88300836, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.3425293, + "step": 1173, + "time_per_iteration": 2.5812408924102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085168, + "balance_loss_mlp": 1.05043077, + "epoch": 0.22585609849942284, + "flos": 604334214144.0, + "grad_norm": 0.06970392839419266, + "language_loss": 0.82089472, + "learning_rate": 0.0009027433193361548, + "loss": 0.83174634, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.34765625, + "step": 1174, + "time_per_iteration": 2.7284860610961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090288, + "balance_loss_mlp": 1.0550499, + "epoch": 0.22604848018468643, + "flos": 635280247296.0, + "grad_norm": 0.05615396633220104, + "language_loss": 0.86867499, + "learning_rate": 0.00090255861685474, + "loss": 0.87957788, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.3527832, + "step": 1175, + "time_per_iteration": 2.7265548706054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085885, + "balance_loss_mlp": 1.05040812, + "epoch": 0.22624086186995, + "flos": 479633467392.0, + "grad_norm": 0.06159717434172949, + "language_loss": 0.91109395, + "learning_rate": 0.0009023737580862095, + "loss": 0.92195278, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.35473633, + "step": 1176, + "time_per_iteration": 2.5320050716400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089039, + "balance_loss_mlp": 1.05468273, + "epoch": 0.22643324355521355, + "flos": 495566982144.0, + "grad_norm": 0.05820331342721636, + "language_loss": 0.82901466, + "learning_rate": 0.0009021887431023321, + "loss": 0.83990508, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.34399414, + "step": 1177, + "time_per_iteration": 2.619271755218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094278, + "balance_loss_mlp": 1.05939722, + "epoch": 0.2266256252404771, + "flos": 561271071744.0, + "grad_norm": 0.05650773027793175, + "language_loss": 0.86773884, + "learning_rate": 0.0009020035719749369, + "loss": 0.8786816, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.34912109, + "step": 1178, + "time_per_iteration": 2.7209300994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010885, + "balance_loss_mlp": 1.05536032, + "epoch": 0.22681800692574067, + "flos": 579353527296.0, + "grad_norm": 0.07505314575513819, + "language_loss": 0.77450001, + "learning_rate": 0.0009018182447759136, + "loss": 0.78538495, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.33154297, + "step": 1179, + "time_per_iteration": 2.957627534866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092549, + "balance_loss_mlp": 1.05793107, + "epoch": 0.22701038861100423, + "flos": 739842577920.0, + "grad_norm": 0.0724719412784609, + "language_loss": 0.79327267, + "learning_rate": 0.0009016327615772126, + "loss": 0.80419827, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.34619141, + "step": 1180, + "time_per_iteration": 2.9636237621307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098683, + "balance_loss_mlp": 1.06425512, + "epoch": 0.2272027702962678, + "flos": 576996562944.0, + "grad_norm": 0.06868963719018656, + "language_loss": 0.87725425, + "learning_rate": 0.0009014471224508451, + "loss": 0.88824105, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.34448242, + "step": 1181, + "time_per_iteration": 2.6756978034973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101065, + "balance_loss_mlp": 1.06725717, + "epoch": 0.22739515198153135, + "flos": 544012098048.0, + "grad_norm": 0.08625014316755293, + "language_loss": 0.8279528, + "learning_rate": 0.0009012613274688823, + "loss": 0.83896345, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.33837891, + "step": 1182, + "time_per_iteration": 2.679690361022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106597, + "balance_loss_mlp": 1.0716213, + "epoch": 0.22758753366679493, + "flos": 439932805632.0, + "grad_norm": 0.07160666852762332, + "language_loss": 0.87420428, + "learning_rate": 0.0009010753767034565, + "loss": 0.8852703, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.35009766, + "step": 1183, + "time_per_iteration": 2.56422758102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110957, + "balance_loss_mlp": 1.07514668, + "epoch": 0.2277799153520585, + "flos": 729104772096.0, + "grad_norm": 0.07593119142071596, + "language_loss": 0.7905606, + "learning_rate": 0.0009008892702267599, + "loss": 0.80167019, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.35839844, + "step": 1184, + "time_per_iteration": 2.96954345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138099, + "balance_loss_mlp": 1.10255075, + "epoch": 0.22797229703732205, + "flos": 526641053184.0, + "grad_norm": 0.08993468677273868, + "language_loss": 0.88719535, + "learning_rate": 0.0009007030081110457, + "loss": 0.89857626, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.35571289, + "step": 1185, + "time_per_iteration": 2.639239549636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124853, + "balance_loss_mlp": 1.08923352, + "epoch": 0.2281646787225856, + "flos": 535159954944.0, + "grad_norm": 0.08461110053036625, + "language_loss": 0.84618473, + "learning_rate": 0.000900516590428627, + "loss": 0.85743326, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.35668945, + "step": 1186, + "time_per_iteration": 2.6506764888763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120731, + "balance_loss_mlp": 1.08637488, + "epoch": 0.22835706040784917, + "flos": 541107924480.0, + "grad_norm": 0.07299458038970587, + "language_loss": 0.89267749, + "learning_rate": 0.0009003300172518778, + "loss": 0.90388483, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.34399414, + "step": 1187, + "time_per_iteration": 2.6919267177581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107415, + "balance_loss_mlp": 1.07291603, + "epoch": 0.22854944209311273, + "flos": 790297012224.0, + "grad_norm": 0.06786881834878318, + "language_loss": 0.83963048, + "learning_rate": 0.0009001432886532321, + "loss": 0.85070467, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.34521484, + "step": 1188, + "time_per_iteration": 2.9668681621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103209, + "balance_loss_mlp": 1.07002091, + "epoch": 0.2287418237783763, + "flos": 469047020544.0, + "grad_norm": 0.06096375157572686, + "language_loss": 0.86560941, + "learning_rate": 0.0008999564047051843, + "loss": 0.87664151, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.33203125, + "step": 1189, + "time_per_iteration": 2.520157814025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103848, + "balance_loss_mlp": 1.07070816, + "epoch": 0.22893420546363985, + "flos": 467786990592.0, + "grad_norm": 0.07257222459915597, + "language_loss": 0.84934878, + "learning_rate": 0.0008997693654802894, + "loss": 0.86038733, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.33154297, + "step": 1190, + "time_per_iteration": 2.6376004219055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117207, + "balance_loss_mlp": 1.08375657, + "epoch": 0.22912658714890344, + "flos": 625974179328.0, + "grad_norm": 0.056681488577390256, + "language_loss": 0.86392069, + "learning_rate": 0.0008995821710511625, + "loss": 0.87509274, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.3347168, + "step": 1191, + "time_per_iteration": 2.727444887161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116667, + "balance_loss_mlp": 1.08369398, + "epoch": 0.229318968834167, + "flos": 502785156096.0, + "grad_norm": 0.06323137320540088, + "language_loss": 0.85004956, + "learning_rate": 0.0008993948214904786, + "loss": 0.86121625, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.32983398, + "step": 1192, + "time_per_iteration": 2.5774295330047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086531, + "balance_loss_mlp": 1.06097257, + "epoch": 0.22951135051943056, + "flos": 1374108544512.0, + "grad_norm": 0.030992800338245956, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79508746, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.25585938, + "step": 1193, + "time_per_iteration": 4.854384422302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122219, + "balance_loss_mlp": 1.08934152, + "epoch": 0.22970373220469412, + "flos": 644045050368.0, + "grad_norm": 0.06852039575110529, + "language_loss": 0.7808823, + "learning_rate": 0.0008990196572654427, + "loss": 0.79210448, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.32861328, + "step": 1194, + "time_per_iteration": 2.873081684112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112553, + "balance_loss_mlp": 1.07943714, + "epoch": 0.22989611388995768, + "flos": 499945001472.0, + "grad_norm": 0.05701230798072306, + "language_loss": 0.87415946, + "learning_rate": 0.0008988318427467426, + "loss": 0.88528502, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.33105469, + "step": 1195, + "time_per_iteration": 2.702685832977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109796, + "balance_loss_mlp": 1.06522477, + "epoch": 0.23008849557522124, + "flos": 1096071796224.0, + "grad_norm": 0.06940657308766013, + "language_loss": 0.85968834, + "learning_rate": 0.0008986438733877887, + "loss": 0.87066793, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.32739258, + "step": 1196, + "time_per_iteration": 3.4571969509124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096888, + "balance_loss_mlp": 1.06482017, + "epoch": 0.2302808772604848, + "flos": 683313546240.0, + "grad_norm": 0.04726997036122248, + "language_loss": 0.83756924, + "learning_rate": 0.0008984557492615576, + "loss": 0.8485381, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.32055664, + "step": 1197, + "time_per_iteration": 2.9306819438934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090156, + "balance_loss_mlp": 1.05718327, + "epoch": 0.23047325894574835, + "flos": 528664928256.0, + "grad_norm": 0.05994921168989351, + "language_loss": 0.89349306, + "learning_rate": 0.0008982674704410854, + "loss": 0.90439463, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.32983398, + "step": 1198, + "time_per_iteration": 2.706496238708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089604, + "balance_loss_mlp": 1.05648804, + "epoch": 0.23066564063101191, + "flos": 682427455488.0, + "grad_norm": 0.06548245075345789, + "language_loss": 0.7739616, + "learning_rate": 0.0008980790369994682, + "loss": 0.78485769, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.33129883, + "step": 1199, + "time_per_iteration": 2.962169647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109754, + "balance_loss_mlp": 1.06375623, + "epoch": 0.2308580223162755, + "flos": 558247624704.0, + "grad_norm": 0.06722903582933262, + "language_loss": 0.86851013, + "learning_rate": 0.000897890449009863, + "loss": 0.87948549, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.33813477, + "step": 1200, + "time_per_iteration": 2.6820433139801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092921, + "balance_loss_mlp": 1.05877972, + "epoch": 0.23105040400153906, + "flos": 555406060032.0, + "grad_norm": 0.051980143810921, + "language_loss": 0.89933294, + "learning_rate": 0.0008977017065454853, + "loss": 0.91026211, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.34179688, + "step": 1201, + "time_per_iteration": 2.6699435710906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098988, + "balance_loss_mlp": 1.0640595, + "epoch": 0.23124278568680262, + "flos": 704474855424.0, + "grad_norm": 0.0699249838794834, + "language_loss": 0.80333388, + "learning_rate": 0.0008975128096796121, + "loss": 0.81432372, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.34936523, + "step": 1202, + "time_per_iteration": 2.891552448272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096208, + "balance_loss_mlp": 1.0627346, + "epoch": 0.23143516737206618, + "flos": 612469002240.0, + "grad_norm": 0.08096245126913681, + "language_loss": 0.85447264, + "learning_rate": 0.0008973237584855794, + "loss": 0.86543471, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.33496094, + "step": 1203, + "time_per_iteration": 2.897143840789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109386, + "balance_loss_mlp": 1.06007552, + "epoch": 0.23162754905732974, + "flos": 389030238720.0, + "grad_norm": 0.07003086272099243, + "language_loss": 0.82261837, + "learning_rate": 0.0008971345530367832, + "loss": 0.83355689, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.33789062, + "step": 1204, + "time_per_iteration": 2.4648683071136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090052, + "balance_loss_mlp": 1.05619669, + "epoch": 0.2318199307425933, + "flos": 667481928192.0, + "grad_norm": 0.0706025487590865, + "language_loss": 0.84670615, + "learning_rate": 0.0008969451934066799, + "loss": 0.85760665, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.33862305, + "step": 1205, + "time_per_iteration": 2.7628865242004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096032, + "balance_loss_mlp": 1.06274843, + "epoch": 0.23201231242785686, + "flos": 666093860352.0, + "grad_norm": 0.07866862210425928, + "language_loss": 0.79702371, + "learning_rate": 0.0008967556796687854, + "loss": 0.80798399, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.33276367, + "step": 1206, + "time_per_iteration": 2.8876569271087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099743, + "balance_loss_mlp": 1.06746101, + "epoch": 0.23220469411312042, + "flos": 748498281984.0, + "grad_norm": 0.05955020850576899, + "language_loss": 0.83383894, + "learning_rate": 0.0008965660118966752, + "loss": 0.84483635, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.32275391, + "step": 1207, + "time_per_iteration": 2.8915722370147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092945, + "balance_loss_mlp": 1.06087792, + "epoch": 0.232397075798384, + "flos": 666763163136.0, + "grad_norm": 0.05733195861059391, + "language_loss": 0.89860612, + "learning_rate": 0.0008963761901639851, + "loss": 0.90953553, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.32055664, + "step": 1208, + "time_per_iteration": 2.839872121810913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100551, + "balance_loss_mlp": 1.06843603, + "epoch": 0.23258945748364757, + "flos": 609937357824.0, + "grad_norm": 0.0677808606719883, + "language_loss": 0.83122128, + "learning_rate": 0.0008961862145444103, + "loss": 0.84222686, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.32104492, + "step": 1209, + "time_per_iteration": 2.723395824432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109998, + "balance_loss_mlp": 1.07726288, + "epoch": 0.23278183916891113, + "flos": 489397842432.0, + "grad_norm": 0.06757554355714504, + "language_loss": 0.8539983, + "learning_rate": 0.0008959960851117059, + "loss": 0.86509824, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.32739258, + "step": 1210, + "time_per_iteration": 2.5843160152435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112763, + "balance_loss_mlp": 1.08055305, + "epoch": 0.23297422085417469, + "flos": 511314232320.0, + "grad_norm": 0.06719057665627333, + "language_loss": 0.83744979, + "learning_rate": 0.0008958058019396868, + "loss": 0.84857744, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.32202148, + "step": 1211, + "time_per_iteration": 2.790137529373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110465, + "balance_loss_mlp": 1.07865953, + "epoch": 0.23316660253943824, + "flos": 546145072128.0, + "grad_norm": 0.061561154104104274, + "language_loss": 0.86634141, + "learning_rate": 0.0008956153651022274, + "loss": 0.877446, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.31787109, + "step": 1212, + "time_per_iteration": 2.6943769454956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107151, + "balance_loss_mlp": 1.07506013, + "epoch": 0.2333589842247018, + "flos": 509998947840.0, + "grad_norm": 0.056352889191353187, + "language_loss": 0.84060359, + "learning_rate": 0.0008954247746732618, + "loss": 0.85167515, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.32080078, + "step": 1213, + "time_per_iteration": 2.635540723800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105894, + "balance_loss_mlp": 1.07504261, + "epoch": 0.23355136590996536, + "flos": 662834686464.0, + "grad_norm": 0.059598265922157306, + "language_loss": 0.90450746, + "learning_rate": 0.0008952340307267837, + "loss": 0.91556644, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.30810547, + "step": 1214, + "time_per_iteration": 2.8842196464538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098908, + "balance_loss_mlp": 1.06817579, + "epoch": 0.23374374759522892, + "flos": 508206417408.0, + "grad_norm": 0.059513387141436946, + "language_loss": 0.83485198, + "learning_rate": 0.0008950431333368468, + "loss": 0.84584105, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.30688477, + "step": 1215, + "time_per_iteration": 2.606269121170044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098575, + "balance_loss_mlp": 1.06662679, + "epoch": 0.2339361292804925, + "flos": 1293964028928.0, + "grad_norm": 0.05495395288746111, + "language_loss": 0.84313607, + "learning_rate": 0.0008948520825775634, + "loss": 0.85412186, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.31933594, + "step": 1216, + "time_per_iteration": 3.6454994678497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099032, + "balance_loss_mlp": 1.06782317, + "epoch": 0.23412851096575607, + "flos": 705617021952.0, + "grad_norm": 0.06066187191945671, + "language_loss": 0.83935732, + "learning_rate": 0.0008946608785231067, + "loss": 0.85034764, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.31176758, + "step": 1217, + "time_per_iteration": 2.9157872200012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098088, + "balance_loss_mlp": 1.06599677, + "epoch": 0.23432089265101963, + "flos": 438036968448.0, + "grad_norm": 0.058216777953853424, + "language_loss": 0.84654021, + "learning_rate": 0.0008944695212477084, + "loss": 0.85752106, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.32080078, + "step": 1218, + "time_per_iteration": 2.473067045211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103602, + "balance_loss_mlp": 1.07158232, + "epoch": 0.2345132743362832, + "flos": 480697058304.0, + "grad_norm": 0.06075167680795146, + "language_loss": 0.86133409, + "learning_rate": 0.0008942780108256599, + "loss": 0.87237012, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.32006836, + "step": 1219, + "time_per_iteration": 2.581594705581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100646, + "balance_loss_mlp": 1.06819737, + "epoch": 0.23470565602154675, + "flos": 411231817728.0, + "grad_norm": 0.07971641299609675, + "language_loss": 0.86269408, + "learning_rate": 0.0008940863473313121, + "loss": 0.87370056, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.32446289, + "step": 1220, + "time_per_iteration": 2.453798532485962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108448, + "balance_loss_mlp": 1.0764761, + "epoch": 0.2348980377068103, + "flos": 545189170176.0, + "grad_norm": 0.07248436265958902, + "language_loss": 0.87226778, + "learning_rate": 0.0008938945308390756, + "loss": 0.88335222, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.31958008, + "step": 1221, + "time_per_iteration": 2.6299164295196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092799, + "balance_loss_mlp": 1.06099391, + "epoch": 0.23509041939207387, + "flos": 575465900544.0, + "grad_norm": 0.0746326386118845, + "language_loss": 0.86801684, + "learning_rate": 0.00089370256142342, + "loss": 0.87894481, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.31787109, + "step": 1222, + "time_per_iteration": 2.7373716831207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099884, + "balance_loss_mlp": 1.0675782, + "epoch": 0.23528280107733743, + "flos": 588568025088.0, + "grad_norm": 0.06792905088784162, + "language_loss": 0.84961808, + "learning_rate": 0.0008935104391588746, + "loss": 0.86061692, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.32299805, + "step": 1223, + "time_per_iteration": 2.786801338195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101357, + "balance_loss_mlp": 1.06850326, + "epoch": 0.235475182762601, + "flos": 822948235776.0, + "grad_norm": 0.053660170998325075, + "language_loss": 0.8281433, + "learning_rate": 0.0008933181641200276, + "loss": 0.83915687, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.32861328, + "step": 1224, + "time_per_iteration": 3.1502432823181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102432, + "balance_loss_mlp": 1.06948209, + "epoch": 0.23566756444786457, + "flos": 679865287680.0, + "grad_norm": 0.06465671729424353, + "language_loss": 0.85675979, + "learning_rate": 0.0008931257363815271, + "loss": 0.86778408, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.32958984, + "step": 1225, + "time_per_iteration": 2.9370880126953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110561, + "balance_loss_mlp": 1.07370961, + "epoch": 0.23585994613312813, + "flos": 701481931776.0, + "grad_norm": 0.07282820073226746, + "language_loss": 0.89753437, + "learning_rate": 0.0008929331560180798, + "loss": 0.9085905, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.31884766, + "step": 1226, + "time_per_iteration": 2.977869749069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122954, + "balance_loss_mlp": 1.09045768, + "epoch": 0.2360523278183917, + "flos": 523923144192.0, + "grad_norm": 0.053569811561680475, + "language_loss": 0.90818799, + "learning_rate": 0.0008927404231044525, + "loss": 0.91941756, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.32495117, + "step": 1227, + "time_per_iteration": 2.683979034423828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111641, + "balance_loss_mlp": 1.07909656, + "epoch": 0.23624470950365525, + "flos": 524027860992.0, + "grad_norm": 0.06109587035495086, + "language_loss": 0.81612283, + "learning_rate": 0.0008925475377154703, + "loss": 0.82723922, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.32543945, + "step": 1228, + "time_per_iteration": 2.734614610671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119771, + "balance_loss_mlp": 1.08577275, + "epoch": 0.2364370911889188, + "flos": 596525313024.0, + "grad_norm": 0.06451716518904643, + "language_loss": 0.82344091, + "learning_rate": 0.0008923544999260183, + "loss": 0.83463866, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.34033203, + "step": 1229, + "time_per_iteration": 2.740309000015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108587, + "balance_loss_mlp": 1.07561386, + "epoch": 0.23662947287418237, + "flos": 756519588864.0, + "grad_norm": 0.0665465772726836, + "language_loss": 0.91460836, + "learning_rate": 0.00089216130981104, + "loss": 0.92569423, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.32983398, + "step": 1230, + "time_per_iteration": 3.1343088150024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103991, + "balance_loss_mlp": 1.07120848, + "epoch": 0.23682185455944593, + "flos": 545907935232.0, + "grad_norm": 0.061759964990198334, + "language_loss": 0.81970417, + "learning_rate": 0.000891967967445539, + "loss": 0.83074409, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.32788086, + "step": 1231, + "time_per_iteration": 2.67669677734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100144, + "balance_loss_mlp": 1.06829166, + "epoch": 0.2370142362447095, + "flos": 661977709056.0, + "grad_norm": 0.04660382532121484, + "language_loss": 0.88927996, + "learning_rate": 0.0008917744729045772, + "loss": 0.90028143, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.31835938, + "step": 1232, + "time_per_iteration": 2.87488055229187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098328, + "balance_loss_mlp": 1.06695223, + "epoch": 0.23720661792997308, + "flos": 683361598464.0, + "grad_norm": 0.054845027384176535, + "language_loss": 0.83439517, + "learning_rate": 0.0008915808262632757, + "loss": 0.84537846, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.31347656, + "step": 1233, + "time_per_iteration": 2.884615659713745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111142, + "balance_loss_mlp": 1.0800519, + "epoch": 0.23739899961523664, + "flos": 558631738368.0, + "grad_norm": 0.058607558308664987, + "language_loss": 0.93242431, + "learning_rate": 0.0008913870275968148, + "loss": 0.94353569, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.31054688, + "step": 1234, + "time_per_iteration": 2.7355458736419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110869, + "balance_loss_mlp": 1.07740974, + "epoch": 0.2375913813005002, + "flos": 889144128000.0, + "grad_norm": 0.0661901036623414, + "language_loss": 0.87537754, + "learning_rate": 0.0008911930769804342, + "loss": 0.88646448, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.3125, + "step": 1235, + "time_per_iteration": 3.247985363006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115104, + "balance_loss_mlp": 1.08396649, + "epoch": 0.23778376298576376, + "flos": 640810607616.0, + "grad_norm": 0.053926277509791044, + "language_loss": 0.90842855, + "learning_rate": 0.0008909989744894318, + "loss": 0.91957957, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.31103516, + "step": 1236, + "time_per_iteration": 2.8457424640655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116546, + "balance_loss_mlp": 1.08598089, + "epoch": 0.23797614467102732, + "flos": 616540073472.0, + "grad_norm": 0.07410834458794652, + "language_loss": 0.81166267, + "learning_rate": 0.0008908047201991649, + "loss": 0.82282805, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.30517578, + "step": 1237, + "time_per_iteration": 2.743232011795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103348, + "balance_loss_mlp": 1.07218719, + "epoch": 0.23816852635629088, + "flos": 623941539840.0, + "grad_norm": 0.0897055957170317, + "language_loss": 0.8615526, + "learning_rate": 0.0008906103141850502, + "loss": 0.87258613, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.3112793, + "step": 1238, + "time_per_iteration": 2.8931751251220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103231, + "balance_loss_mlp": 1.07164085, + "epoch": 0.23836090804155444, + "flos": 521180504064.0, + "grad_norm": 0.0595559706342315, + "language_loss": 0.87583494, + "learning_rate": 0.0008904157565225621, + "loss": 0.88686728, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.31567383, + "step": 1239, + "time_per_iteration": 2.681567430496216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096601, + "balance_loss_mlp": 1.06546402, + "epoch": 0.238553289726818, + "flos": 1153527616512.0, + "grad_norm": 0.07926394914951292, + "language_loss": 0.81636947, + "learning_rate": 0.000890221047287235, + "loss": 0.82733548, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.31103516, + "step": 1240, + "time_per_iteration": 3.5042829513549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096214, + "balance_loss_mlp": 1.06450391, + "epoch": 0.23874567141208156, + "flos": 499600175616.0, + "grad_norm": 0.06383986480013222, + "language_loss": 0.90398014, + "learning_rate": 0.0008900261865546615, + "loss": 0.91494226, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.31689453, + "step": 1241, + "time_per_iteration": 2.656243324279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098327, + "balance_loss_mlp": 1.06533027, + "epoch": 0.23893805309734514, + "flos": 556657325568.0, + "grad_norm": 0.07463092576288201, + "language_loss": 0.84907639, + "learning_rate": 0.0008898311744004936, + "loss": 0.86005968, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.33007812, + "step": 1242, + "time_per_iteration": 2.7337045669555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089408, + "balance_loss_mlp": 1.05583906, + "epoch": 0.2391304347826087, + "flos": 549009957888.0, + "grad_norm": 0.057670085451747476, + "language_loss": 0.86718595, + "learning_rate": 0.0008896360109004414, + "loss": 0.87808001, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.3359375, + "step": 1243, + "time_per_iteration": 2.6334750652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090579, + "balance_loss_mlp": 1.05667567, + "epoch": 0.23932281646787226, + "flos": 515794148352.0, + "grad_norm": 0.055695642571784755, + "language_loss": 0.84363699, + "learning_rate": 0.0008894406961302742, + "loss": 0.85454273, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.33935547, + "step": 1244, + "time_per_iteration": 2.612278699874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092282, + "balance_loss_mlp": 1.05840266, + "epoch": 0.23951519815313582, + "flos": 743353445376.0, + "grad_norm": 0.053835846346086756, + "language_loss": 0.83682489, + "learning_rate": 0.0008892452301658201, + "loss": 0.84774774, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.33911133, + "step": 1245, + "time_per_iteration": 2.999476432800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095498, + "balance_loss_mlp": 1.06169045, + "epoch": 0.23970757983839938, + "flos": 553855048704.0, + "grad_norm": 0.07830491582761978, + "language_loss": 0.83242297, + "learning_rate": 0.0008890496130829653, + "loss": 0.84337801, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.33837891, + "step": 1246, + "time_per_iteration": 2.6750991344451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093391, + "balance_loss_mlp": 1.05913019, + "epoch": 0.23989996152366294, + "flos": 480416251392.0, + "grad_norm": 0.06104300334873528, + "language_loss": 0.85340333, + "learning_rate": 0.0008888538449576555, + "loss": 0.86433721, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.34301758, + "step": 1247, + "time_per_iteration": 2.5646800994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095388, + "balance_loss_mlp": 1.06131816, + "epoch": 0.2400923432089265, + "flos": 485069285376.0, + "grad_norm": 0.05789610317969602, + "language_loss": 0.82348001, + "learning_rate": 0.0008886579258658944, + "loss": 0.83443391, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.34082031, + "step": 1248, + "time_per_iteration": 2.562016487121582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087072, + "balance_loss_mlp": 1.05283499, + "epoch": 0.24028472489419006, + "flos": 623247505920.0, + "grad_norm": 0.05381401206887855, + "language_loss": 0.84731787, + "learning_rate": 0.0008884618558837446, + "loss": 0.85818857, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.34277344, + "step": 1249, + "time_per_iteration": 2.8163750171661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093014, + "balance_loss_mlp": 1.05927801, + "epoch": 0.24047710657945365, + "flos": 601302002688.0, + "grad_norm": 0.06053052424994898, + "language_loss": 0.86413568, + "learning_rate": 0.0008882656350873273, + "loss": 0.8750658, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.33764648, + "step": 1250, + "time_per_iteration": 2.844723701477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088368, + "balance_loss_mlp": 1.05546594, + "epoch": 0.2406694882647172, + "flos": 841199579136.0, + "grad_norm": 0.06849099956300345, + "language_loss": 0.87088066, + "learning_rate": 0.0008880692635528219, + "loss": 0.88176429, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.32910156, + "step": 1251, + "time_per_iteration": 3.0528526306152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108089, + "balance_loss_mlp": 1.048823, + "epoch": 0.24086186994998077, + "flos": 526789440000.0, + "grad_norm": 0.06290905233547327, + "language_loss": 0.88876319, + "learning_rate": 0.0008878727413564669, + "loss": 0.89957213, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.32055664, + "step": 1252, + "time_per_iteration": 2.758507251739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077691, + "balance_loss_mlp": 1.05194211, + "epoch": 0.24105425163524433, + "flos": 1337464673280.0, + "grad_norm": 0.04466256972049361, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81213295, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.2578125, + "step": 1253, + "time_per_iteration": 4.847649097442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088304, + "balance_loss_mlp": 1.05616474, + "epoch": 0.24124663332050789, + "flos": 613822164480.0, + "grad_norm": 0.059681429897919615, + "language_loss": 0.78408957, + "learning_rate": 0.0008874792452834528, + "loss": 0.79497254, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.32128906, + "step": 1254, + "time_per_iteration": 2.754746198654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091821, + "balance_loss_mlp": 1.06061172, + "epoch": 0.24143901500577145, + "flos": 575278225920.0, + "grad_norm": 0.07362958371245172, + "language_loss": 0.87187612, + "learning_rate": 0.0008872822715595626, + "loss": 0.88279426, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.31176758, + "step": 1255, + "time_per_iteration": 2.662929058074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109326, + "balance_loss_mlp": 1.06200314, + "epoch": 0.241631396691035, + "flos": 494941349376.0, + "grad_norm": 0.08064600620778418, + "language_loss": 0.86789644, + "learning_rate": 0.0008870851474793598, + "loss": 0.87882906, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.31225586, + "step": 1256, + "time_per_iteration": 2.550830841064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096341, + "balance_loss_mlp": 1.06434524, + "epoch": 0.24182377837629856, + "flos": 635891323392.0, + "grad_norm": 0.05836545436632832, + "language_loss": 0.89218223, + "learning_rate": 0.0008868878731193752, + "loss": 0.90314561, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.31982422, + "step": 1257, + "time_per_iteration": 2.850184440612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095001, + "balance_loss_mlp": 1.06400657, + "epoch": 0.24201616006156215, + "flos": 514938580992.0, + "grad_norm": 0.05536217997614851, + "language_loss": 0.89056414, + "learning_rate": 0.0008866904485561973, + "loss": 0.90151417, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.30957031, + "step": 1258, + "time_per_iteration": 2.7176461219787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107248, + "balance_loss_mlp": 1.0765636, + "epoch": 0.2422085417468257, + "flos": 614837703168.0, + "grad_norm": 0.0620425495695956, + "language_loss": 0.82697642, + "learning_rate": 0.000886492873866473, + "loss": 0.83804893, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.30639648, + "step": 1259, + "time_per_iteration": 2.881246328353882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106717, + "balance_loss_mlp": 1.07631803, + "epoch": 0.24240092343208927, + "flos": 585515464704.0, + "grad_norm": 0.0764912621319216, + "language_loss": 0.84458697, + "learning_rate": 0.000886295149126908, + "loss": 0.85565412, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.3034668, + "step": 1260, + "time_per_iteration": 2.711789846420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102095, + "balance_loss_mlp": 1.07148254, + "epoch": 0.24259330511735283, + "flos": 761930675712.0, + "grad_norm": 0.05050860424869067, + "language_loss": 0.85437667, + "learning_rate": 0.0008860972744142655, + "loss": 0.86539763, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.30566406, + "step": 1261, + "time_per_iteration": 2.924192190170288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101503, + "balance_loss_mlp": 1.07146263, + "epoch": 0.2427856868026164, + "flos": 626566316544.0, + "grad_norm": 0.05198228858732316, + "language_loss": 0.81767958, + "learning_rate": 0.0008858992498053671, + "loss": 0.82869458, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.30004883, + "step": 1262, + "time_per_iteration": 2.8300395011901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069733, + "balance_loss_mlp": 1.04455626, + "epoch": 0.24297806848787995, + "flos": 1510840470528.0, + "grad_norm": 0.04093384265265131, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77658486, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.25195312, + "step": 1263, + "time_per_iteration": 4.837641716003418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103166, + "balance_loss_mlp": 1.07217157, + "epoch": 0.2431704501731435, + "flos": 541669538304.0, + "grad_norm": 0.05948216339756903, + "language_loss": 0.83247912, + "learning_rate": 0.0008855027512063817, + "loss": 0.84351087, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.30957031, + "step": 1264, + "time_per_iteration": 2.7277276515960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102812, + "balance_loss_mlp": 1.07191277, + "epoch": 0.24336283185840707, + "flos": 523588492800.0, + "grad_norm": 0.06194442365761257, + "language_loss": 0.8589493, + "learning_rate": 0.0008853042773702292, + "loss": 0.86997747, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.30859375, + "step": 1265, + "time_per_iteration": 2.7305567264556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103281, + "balance_loss_mlp": 1.07197642, + "epoch": 0.24355521354367063, + "flos": 536839004160.0, + "grad_norm": 0.0568893751116151, + "language_loss": 0.87145638, + "learning_rate": 0.0008851056539456896, + "loss": 0.88248914, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.31274414, + "step": 1266, + "time_per_iteration": 2.6886072158813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109953, + "balance_loss_mlp": 1.06767774, + "epoch": 0.24374759522893422, + "flos": 930050975232.0, + "grad_norm": 0.06669847345827673, + "language_loss": 0.81623918, + "learning_rate": 0.0008849068810098755, + "loss": 0.82723451, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.31835938, + "step": 1267, + "time_per_iteration": 3.302135705947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092019, + "balance_loss_mlp": 1.06049967, + "epoch": 0.24393997691419778, + "flos": 427564002816.0, + "grad_norm": 0.06302829877877653, + "language_loss": 0.82764143, + "learning_rate": 0.0008847079586399575, + "loss": 0.83856159, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.31494141, + "step": 1268, + "time_per_iteration": 2.469602584838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088761, + "balance_loss_mlp": 1.05755162, + "epoch": 0.24413235859946134, + "flos": 578582479872.0, + "grad_norm": 0.062034835544456234, + "language_loss": 0.85665154, + "learning_rate": 0.0008845088869131641, + "loss": 0.86753917, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.31176758, + "step": 1269, + "time_per_iteration": 2.6822941303253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090407, + "balance_loss_mlp": 1.05864954, + "epoch": 0.2443247402847249, + "flos": 529600481280.0, + "grad_norm": 0.06778965234687388, + "language_loss": 0.88905638, + "learning_rate": 0.0008843096659067818, + "loss": 0.8999604, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.31738281, + "step": 1270, + "time_per_iteration": 2.594064235687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087142, + "balance_loss_mlp": 1.05555153, + "epoch": 0.24451712196998845, + "flos": 695996651520.0, + "grad_norm": 0.05697237066827103, + "language_loss": 0.85987377, + "learning_rate": 0.000884110295698155, + "loss": 0.87074518, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.31567383, + "step": 1271, + "time_per_iteration": 2.974696636199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083849, + "balance_loss_mlp": 1.0512805, + "epoch": 0.24470950365525201, + "flos": 529575750144.0, + "grad_norm": 0.06068289501227115, + "language_loss": 0.85902673, + "learning_rate": 0.0008839107763646861, + "loss": 0.86986518, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.32568359, + "step": 1272, + "time_per_iteration": 2.607771158218384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085636, + "balance_loss_mlp": 1.0507555, + "epoch": 0.24490188534051557, + "flos": 491091448320.0, + "grad_norm": 0.061464799303267155, + "language_loss": 0.9008882, + "learning_rate": 0.0008837111079838353, + "loss": 0.91174459, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.34912109, + "step": 1273, + "time_per_iteration": 2.708512306213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107918, + "balance_loss_mlp": 1.0463264, + "epoch": 0.24509426702577913, + "flos": 473916842496.0, + "grad_norm": 0.06335862765515422, + "language_loss": 0.89847112, + "learning_rate": 0.000883511290633121, + "loss": 0.9092629, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.32861328, + "step": 1274, + "time_per_iteration": 2.5415730476379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077904, + "balance_loss_mlp": 1.04423904, + "epoch": 0.24528664871104272, + "flos": 550329624576.0, + "grad_norm": 0.04937694398035677, + "language_loss": 0.92408085, + "learning_rate": 0.000883311324390119, + "loss": 0.93485993, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.33691406, + "step": 1275, + "time_per_iteration": 2.734423875808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108035, + "balance_loss_mlp": 1.0457077, + "epoch": 0.24547903039630628, + "flos": 825546871296.0, + "grad_norm": 0.07292672859625873, + "language_loss": 0.80929816, + "learning_rate": 0.0008831112093324629, + "loss": 0.82010162, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.34667969, + "step": 1276, + "time_per_iteration": 3.0507287979125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076455, + "balance_loss_mlp": 1.04209912, + "epoch": 0.24567141208156984, + "flos": 591325221888.0, + "grad_norm": 0.0707858001482728, + "language_loss": 0.88982868, + "learning_rate": 0.0008829109455378444, + "loss": 0.90059322, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.34375, + "step": 1277, + "time_per_iteration": 2.6684513092041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076963, + "balance_loss_mlp": 1.04284549, + "epoch": 0.2458637937668334, + "flos": 547611715584.0, + "grad_norm": 0.05561589900472309, + "language_loss": 0.86233819, + "learning_rate": 0.000882710533084013, + "loss": 0.87310779, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.34155273, + "step": 1278, + "time_per_iteration": 2.623353958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074564, + "balance_loss_mlp": 1.04013681, + "epoch": 0.24605617545209696, + "flos": 515641379328.0, + "grad_norm": 0.04936271772538766, + "language_loss": 0.89139968, + "learning_rate": 0.0008825099720487755, + "loss": 0.90214527, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.34448242, + "step": 1279, + "time_per_iteration": 2.6549813747406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069233, + "balance_loss_mlp": 1.04853857, + "epoch": 0.24624855713736052, + "flos": 1510953951744.0, + "grad_norm": 0.028817901818472227, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76330376, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.20703125, + "step": 1280, + "time_per_iteration": 4.85357141494751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066002, + "balance_loss_mlp": 1.04521215, + "epoch": 0.24644093882262408, + "flos": 1526826419712.0, + "grad_norm": 0.026145975527968417, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79010111, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.20800781, + "step": 1281, + "time_per_iteration": 4.780989408493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083115, + "balance_loss_mlp": 1.04983163, + "epoch": 0.24663332050788764, + "flos": 658811667456.0, + "grad_norm": 0.06975718656823436, + "language_loss": 0.89050984, + "learning_rate": 0.0008819073982335619, + "loss": 0.90134096, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.33300781, + "step": 1282, + "time_per_iteration": 2.8345205783843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085728, + "balance_loss_mlp": 1.05361331, + "epoch": 0.24682570219315123, + "flos": 541510977024.0, + "grad_norm": 0.062337694406813374, + "language_loss": 0.84269708, + "learning_rate": 0.0008817062436519235, + "loss": 0.85355437, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.32104492, + "step": 1283, + "time_per_iteration": 2.6846866607666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089504, + "balance_loss_mlp": 1.05612516, + "epoch": 0.24701808387841478, + "flos": 440455131648.0, + "grad_norm": 0.06365108043104846, + "language_loss": 0.89943874, + "learning_rate": 0.0008815049408787788, + "loss": 0.91033375, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.33398438, + "step": 1284, + "time_per_iteration": 2.5116872787475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081689, + "balance_loss_mlp": 1.04916823, + "epoch": 0.24721046556367834, + "flos": 467826278400.0, + "grad_norm": 0.059551230096427064, + "language_loss": 0.85302055, + "learning_rate": 0.0008813034899922805, + "loss": 0.86383736, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.32519531, + "step": 1285, + "time_per_iteration": 2.5286993980407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080955, + "balance_loss_mlp": 1.04931688, + "epoch": 0.2474028472489419, + "flos": 504183398400.0, + "grad_norm": 0.06660544793665324, + "language_loss": 0.89506048, + "learning_rate": 0.0008811018910706387, + "loss": 0.90586996, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.31616211, + "step": 1286, + "time_per_iteration": 2.552616834640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079345, + "balance_loss_mlp": 1.04756403, + "epoch": 0.24759522893420546, + "flos": 479707660800.0, + "grad_norm": 0.07038813341767636, + "language_loss": 0.81879961, + "learning_rate": 0.0008809001441921211, + "loss": 0.82959306, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.31762695, + "step": 1287, + "time_per_iteration": 2.704249143600464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082412, + "balance_loss_mlp": 1.05132163, + "epoch": 0.24778761061946902, + "flos": 533446000128.0, + "grad_norm": 0.054805193397824324, + "language_loss": 0.85345185, + "learning_rate": 0.0008806982494350528, + "loss": 0.86427593, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.31054688, + "step": 1288, + "time_per_iteration": 2.65993070602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084976, + "balance_loss_mlp": 1.05359983, + "epoch": 0.24797999230473258, + "flos": 559513446912.0, + "grad_norm": 0.05430799794632807, + "language_loss": 0.90285796, + "learning_rate": 0.0008804962068778161, + "loss": 0.91370773, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.31347656, + "step": 1289, + "time_per_iteration": 2.8633711338043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086217, + "balance_loss_mlp": 1.05515075, + "epoch": 0.24817237398999614, + "flos": 623912426496.0, + "grad_norm": 0.06485439157304855, + "language_loss": 0.81069577, + "learning_rate": 0.0008802940165988511, + "loss": 0.82155788, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.31030273, + "step": 1290, + "time_per_iteration": 2.877063274383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084315, + "balance_loss_mlp": 1.05341625, + "epoch": 0.2483647556752597, + "flos": 611981581824.0, + "grad_norm": 0.058113292585204916, + "language_loss": 0.88358063, + "learning_rate": 0.000880091678676655, + "loss": 0.89442384, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.30859375, + "step": 1291, + "time_per_iteration": 2.800182342529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088307, + "balance_loss_mlp": 1.05814719, + "epoch": 0.2485571373605233, + "flos": 583270419456.0, + "grad_norm": 0.05744202885681841, + "language_loss": 0.88709044, + "learning_rate": 0.0008798891931897821, + "loss": 0.89797354, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.30126953, + "step": 1292, + "time_per_iteration": 2.8186981678009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090758, + "balance_loss_mlp": 1.06009781, + "epoch": 0.24874951904578685, + "flos": 494503391232.0, + "grad_norm": 0.06335011869227863, + "language_loss": 0.84085584, + "learning_rate": 0.0008796865602168447, + "loss": 0.85176343, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.30615234, + "step": 1293, + "time_per_iteration": 2.5642354488372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093183, + "balance_loss_mlp": 1.06218874, + "epoch": 0.2489419007310504, + "flos": 455925957120.0, + "grad_norm": 0.055204532335327836, + "language_loss": 0.88449144, + "learning_rate": 0.0008794837798365115, + "loss": 0.89542329, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.30957031, + "step": 1294, + "time_per_iteration": 2.640967607498169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103465, + "balance_loss_mlp": 1.07256651, + "epoch": 0.24913428241631397, + "flos": 485198733312.0, + "grad_norm": 0.05342912575045942, + "language_loss": 0.88282919, + "learning_rate": 0.0008792808521275089, + "loss": 0.8938638, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.30859375, + "step": 1295, + "time_per_iteration": 2.743216037750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106969, + "balance_loss_mlp": 1.07638037, + "epoch": 0.24932666410157753, + "flos": 518654651904.0, + "grad_norm": 0.05542201073335728, + "language_loss": 0.87427896, + "learning_rate": 0.0008790777771686206, + "loss": 0.88534868, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.30541992, + "step": 1296, + "time_per_iteration": 2.5764553546905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109723, + "balance_loss_mlp": 1.07934809, + "epoch": 0.2495190457868411, + "flos": 472365831168.0, + "grad_norm": 0.061211557913471215, + "language_loss": 0.85332036, + "learning_rate": 0.0008788745550386872, + "loss": 0.86441755, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.30322266, + "step": 1297, + "time_per_iteration": 2.635064125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111335, + "balance_loss_mlp": 1.08226037, + "epoch": 0.24971142747210465, + "flos": 745559202816.0, + "grad_norm": 0.055423812451341224, + "language_loss": 0.79893327, + "learning_rate": 0.0008786711858166063, + "loss": 0.81006682, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.31054688, + "step": 1298, + "time_per_iteration": 3.002070903778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113917, + "balance_loss_mlp": 1.08387578, + "epoch": 0.2499038091573682, + "flos": 749222839296.0, + "grad_norm": 0.06342841372026603, + "language_loss": 0.8358891, + "learning_rate": 0.0008784676695813332, + "loss": 0.84702826, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.29980469, + "step": 1299, + "time_per_iteration": 2.941793918609619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116177, + "balance_loss_mlp": 1.08573055, + "epoch": 0.2500961908426318, + "flos": 744741513216.0, + "grad_norm": 0.05313888632052142, + "language_loss": 0.84205985, + "learning_rate": 0.0008782640064118796, + "loss": 0.85322165, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.30395508, + "step": 1300, + "time_per_iteration": 2.9038445949554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113921, + "balance_loss_mlp": 1.11441469, + "epoch": 0.2502885725278953, + "flos": 1416652180992.0, + "grad_norm": 0.03742785755303804, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77323961, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.24804688, + "step": 1301, + "time_per_iteration": 4.97193169593811 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108076, + "balance_loss_mlp": 1.0781548, + "epoch": 0.2504809542131589, + "flos": 514961902080.0, + "grad_norm": 0.06725713094725487, + "language_loss": 0.86707664, + "learning_rate": 0.0008778562395867648, + "loss": 0.87815738, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.29882812, + "step": 1302, + "time_per_iteration": 2.6434335708618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109494, + "balance_loss_mlp": 1.064852, + "epoch": 0.25067333589842244, + "flos": 525562905600.0, + "grad_norm": 0.0573305289073435, + "language_loss": 0.83713615, + "learning_rate": 0.0008776521360894127, + "loss": 0.84808552, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.30029297, + "step": 1303, + "time_per_iteration": 2.664281129837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087429, + "balance_loss_mlp": 1.06206167, + "epoch": 0.25086571758368603, + "flos": 1473085108224.0, + "grad_norm": 0.030879512397293623, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80049491, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.25390625, + "step": 1304, + "time_per_iteration": 4.7838218212127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096771, + "balance_loss_mlp": 1.06682515, + "epoch": 0.2510580992689496, + "flos": 528128045568.0, + "grad_norm": 0.05889583885024225, + "language_loss": 0.90380585, + "learning_rate": 0.0008772434893213186, + "loss": 0.91477358, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.29882812, + "step": 1305, + "time_per_iteration": 2.619591236114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092992, + "balance_loss_mlp": 1.06228364, + "epoch": 0.25125048095421315, + "flos": 517192390656.0, + "grad_norm": 0.05643683756415757, + "language_loss": 0.84055364, + "learning_rate": 0.0008770389462092276, + "loss": 0.85148358, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.30664062, + "step": 1306, + "time_per_iteration": 2.646378517150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090271, + "balance_loss_mlp": 1.05860949, + "epoch": 0.25144286263947674, + "flos": 620160039936.0, + "grad_norm": 0.07421628365380602, + "language_loss": 0.86343837, + "learning_rate": 0.0008768342567176357, + "loss": 0.87434107, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.31640625, + "step": 1307, + "time_per_iteration": 2.807349681854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089217, + "balance_loss_mlp": 1.0562675, + "epoch": 0.25163524432474027, + "flos": 503534444544.0, + "grad_norm": 0.06024308313144323, + "language_loss": 0.90521109, + "learning_rate": 0.0008766294209260107, + "loss": 0.91610324, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.32958984, + "step": 1308, + "time_per_iteration": 2.652209758758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087002, + "balance_loss_mlp": 1.05510211, + "epoch": 0.25182762601000386, + "flos": 508821875712.0, + "grad_norm": 0.07044022402077256, + "language_loss": 0.90948963, + "learning_rate": 0.0008764244389138767, + "loss": 0.92035961, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.31884766, + "step": 1309, + "time_per_iteration": 2.583214044570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086884, + "balance_loss_mlp": 1.05386305, + "epoch": 0.2520200076952674, + "flos": 633596815872.0, + "grad_norm": 0.07007920023055086, + "language_loss": 0.82157373, + "learning_rate": 0.000876219310760815, + "loss": 0.83244258, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.33032227, + "step": 1310, + "time_per_iteration": 2.8652145862579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010922, + "balance_loss_mlp": 1.05956042, + "epoch": 0.252212389380531, + "flos": 494385527808.0, + "grad_norm": 0.05921747328918915, + "language_loss": 0.81032491, + "learning_rate": 0.0008760140365464631, + "loss": 0.82124686, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.32641602, + "step": 1311, + "time_per_iteration": 2.620199203491211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090848, + "balance_loss_mlp": 1.05799365, + "epoch": 0.2524047710657945, + "flos": 490298489856.0, + "grad_norm": 0.06933033432447253, + "language_loss": 0.87204492, + "learning_rate": 0.0008758086163505156, + "loss": 0.88295335, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.32861328, + "step": 1312, + "time_per_iteration": 2.5809056758880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085438, + "balance_loss_mlp": 1.05253649, + "epoch": 0.2525971527510581, + "flos": 647136898560.0, + "grad_norm": 0.05785086559723577, + "language_loss": 0.89221275, + "learning_rate": 0.0008756030502527239, + "loss": 0.90306717, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.32910156, + "step": 1313, + "time_per_iteration": 2.8305885791778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084056, + "balance_loss_mlp": 1.05201209, + "epoch": 0.2527895344363217, + "flos": 568991222784.0, + "grad_norm": 0.05540107069612798, + "language_loss": 0.90540659, + "learning_rate": 0.0008753973383328954, + "loss": 0.91624713, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.3203125, + "step": 1314, + "time_per_iteration": 2.8095338344573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084266, + "balance_loss_mlp": 1.0518887, + "epoch": 0.2529819161215852, + "flos": 513795004416.0, + "grad_norm": 0.06960735937341114, + "language_loss": 0.83534479, + "learning_rate": 0.0008751914806708952, + "loss": 0.84618747, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.32373047, + "step": 1315, + "time_per_iteration": 2.6356046199798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084811, + "balance_loss_mlp": 1.05357838, + "epoch": 0.2531742978068488, + "flos": 530979784704.0, + "grad_norm": 0.05966295966929829, + "language_loss": 0.82178831, + "learning_rate": 0.0008749854773466439, + "loss": 0.83263648, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.31201172, + "step": 1316, + "time_per_iteration": 2.673590898513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083614, + "balance_loss_mlp": 1.05199969, + "epoch": 0.25336667949211233, + "flos": 596362369536.0, + "grad_norm": 0.060440864571565875, + "language_loss": 0.84378719, + "learning_rate": 0.0008747793284401192, + "loss": 0.85462332, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.31591797, + "step": 1317, + "time_per_iteration": 2.672581195831299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078628, + "balance_loss_mlp": 1.04701352, + "epoch": 0.2535590611773759, + "flos": 601764691968.0, + "grad_norm": 0.06760844062466466, + "language_loss": 0.85858786, + "learning_rate": 0.0008745730340313551, + "loss": 0.8693741, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.31591797, + "step": 1318, + "time_per_iteration": 2.7483184337615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088603, + "balance_loss_mlp": 1.05775118, + "epoch": 0.25375144286263945, + "flos": 495079561728.0, + "grad_norm": 0.06356165501521222, + "language_loss": 0.84280074, + "learning_rate": 0.0008743665942004422, + "loss": 0.85368681, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.30834961, + "step": 1319, + "time_per_iteration": 2.659477472305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094218, + "balance_loss_mlp": 1.06362879, + "epoch": 0.25394382454790304, + "flos": 512219261952.0, + "grad_norm": 0.06511177952096096, + "language_loss": 0.92719352, + "learning_rate": 0.0008741600090275277, + "loss": 0.93813574, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.30541992, + "step": 1320, + "time_per_iteration": 2.6192221641540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088577, + "balance_loss_mlp": 1.05758274, + "epoch": 0.25413620623316663, + "flos": 958586047488.0, + "grad_norm": 0.06459884228420558, + "language_loss": 0.84290528, + "learning_rate": 0.0008739532785928151, + "loss": 0.853791, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.30957031, + "step": 1321, + "time_per_iteration": 3.438142776489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166929, + "balance_loss_mlp": 1.14528096, + "epoch": 0.25432858791843016, + "flos": 1576445635584.0, + "grad_norm": 0.062216562760273944, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.7606051, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.21679688, + "step": 1322, + "time_per_iteration": 4.881207466125488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109523, + "balance_loss_mlp": 1.06502271, + "epoch": 0.25452096960369375, + "flos": 583530877440.0, + "grad_norm": 0.0660267567978659, + "language_loss": 0.8296389, + "learning_rate": 0.0008735393822590908, + "loss": 0.84059119, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.30151367, + "step": 1323, + "time_per_iteration": 2.7254581451416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097272, + "balance_loss_mlp": 1.06723142, + "epoch": 0.2547133512889573, + "flos": 508344629760.0, + "grad_norm": 0.07409821223339019, + "language_loss": 0.87412238, + "learning_rate": 0.0008733322165207681, + "loss": 0.88509512, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.30029297, + "step": 1324, + "time_per_iteration": 2.6910648345947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102689, + "balance_loss_mlp": 1.07295775, + "epoch": 0.25490573297422087, + "flos": 782266940928.0, + "grad_norm": 0.06686348955430095, + "language_loss": 0.83012944, + "learning_rate": 0.0008731249058420247, + "loss": 0.84115636, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.29663086, + "step": 1325, + "time_per_iteration": 3.0301432609558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105708, + "balance_loss_mlp": 1.07499993, + "epoch": 0.2550981146594844, + "flos": 509610451968.0, + "grad_norm": 0.057218587703981125, + "language_loss": 0.90547103, + "learning_rate": 0.0008729174503033459, + "loss": 0.91652811, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.30664062, + "step": 1326, + "time_per_iteration": 2.668544292449951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107604, + "balance_loss_mlp": 1.07706285, + "epoch": 0.255290496344748, + "flos": 676360212480.0, + "grad_norm": 0.08872727493885958, + "language_loss": 0.82430828, + "learning_rate": 0.0008727098499852728, + "loss": 0.83538437, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.30493164, + "step": 1327, + "time_per_iteration": 2.8206427097320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102439, + "balance_loss_mlp": 1.07175469, + "epoch": 0.2554828780300115, + "flos": 537524273664.0, + "grad_norm": 0.05995612334517853, + "language_loss": 0.8945381, + "learning_rate": 0.0008725021049684034, + "loss": 0.90556252, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.30639648, + "step": 1328, + "time_per_iteration": 2.7788021564483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110018, + "balance_loss_mlp": 1.06906641, + "epoch": 0.2556752597152751, + "flos": 823828534272.0, + "grad_norm": 0.07693053452424695, + "language_loss": 0.82675111, + "learning_rate": 0.000872294215333391, + "loss": 0.83775294, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.31079102, + "step": 1329, + "time_per_iteration": 3.208423137664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089607, + "balance_loss_mlp": 1.05820751, + "epoch": 0.2558676414005387, + "flos": 570517502976.0, + "grad_norm": 0.05833009001407562, + "language_loss": 0.83099753, + "learning_rate": 0.0008720861811609457, + "loss": 0.84189361, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.3137207, + "step": 1330, + "time_per_iteration": 2.723451614379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082701, + "balance_loss_mlp": 1.05122948, + "epoch": 0.2560600230858022, + "flos": 486419475456.0, + "grad_norm": 0.06841234134213905, + "language_loss": 0.83759737, + "learning_rate": 0.0008718780025318338, + "loss": 0.84842432, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.31445312, + "step": 1331, + "time_per_iteration": 2.7594637870788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084055, + "balance_loss_mlp": 1.05244088, + "epoch": 0.2562524047710658, + "flos": 512874008064.0, + "grad_norm": 0.059488371229756976, + "language_loss": 0.83890998, + "learning_rate": 0.0008716696795268771, + "loss": 0.84975058, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.31591797, + "step": 1332, + "time_per_iteration": 2.719435453414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086447, + "balance_loss_mlp": 1.05516648, + "epoch": 0.25644478645632934, + "flos": 634498873344.0, + "grad_norm": 0.09040651922247907, + "language_loss": 0.85621184, + "learning_rate": 0.0008714612122269538, + "loss": 0.86707628, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.3125, + "step": 1333, + "time_per_iteration": 2.846071481704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087221, + "balance_loss_mlp": 1.05517721, + "epoch": 0.25663716814159293, + "flos": 436353537024.0, + "grad_norm": 0.06079891504044088, + "language_loss": 0.8881824, + "learning_rate": 0.0008712526007129982, + "loss": 0.89905459, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.3203125, + "step": 1334, + "time_per_iteration": 2.5539238452911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084143, + "balance_loss_mlp": 1.05226636, + "epoch": 0.25682954982685646, + "flos": 497892013056.0, + "grad_norm": 0.06135189476637687, + "language_loss": 0.90600282, + "learning_rate": 0.0008710438450660003, + "loss": 0.91684425, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.31835938, + "step": 1335, + "time_per_iteration": 2.6957638263702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081701, + "balance_loss_mlp": 1.04984844, + "epoch": 0.25702193151212005, + "flos": 457471176192.0, + "grad_norm": 0.09152684925001835, + "language_loss": 0.86861122, + "learning_rate": 0.0008708349453670064, + "loss": 0.87942821, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.31835938, + "step": 1336, + "time_per_iteration": 2.569918632507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080209, + "balance_loss_mlp": 1.04854655, + "epoch": 0.2572143131973836, + "flos": 598002130944.0, + "grad_norm": 0.055029840901202824, + "language_loss": 0.91123867, + "learning_rate": 0.0008706259016971185, + "loss": 0.92204076, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.31640625, + "step": 1337, + "time_per_iteration": 2.7755186557769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077567, + "balance_loss_mlp": 1.04554725, + "epoch": 0.25740669488264717, + "flos": 698004559872.0, + "grad_norm": 0.08019888390454845, + "language_loss": 0.82668757, + "learning_rate": 0.0008704167141374944, + "loss": 0.83746326, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.32006836, + "step": 1338, + "time_per_iteration": 2.8559322357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073148, + "balance_loss_mlp": 1.04184318, + "epoch": 0.25759907656791076, + "flos": 502130409984.0, + "grad_norm": 0.06412343972447931, + "language_loss": 0.88389909, + "learning_rate": 0.0008702073827693482, + "loss": 0.89463055, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.31274414, + "step": 1339, + "time_per_iteration": 2.725090265274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077981, + "balance_loss_mlp": 1.04662943, + "epoch": 0.2577914582531743, + "flos": 773541425664.0, + "grad_norm": 0.06471871877048396, + "language_loss": 0.88798392, + "learning_rate": 0.0008699979076739494, + "loss": 0.89876378, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.31323242, + "step": 1340, + "time_per_iteration": 2.9663493633270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074686, + "balance_loss_mlp": 1.04354882, + "epoch": 0.2579838399384379, + "flos": 459431032320.0, + "grad_norm": 0.0844279622703065, + "language_loss": 0.88438749, + "learning_rate": 0.0008697882889326234, + "loss": 0.89513433, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.31103516, + "step": 1341, + "time_per_iteration": 2.5622262954711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081353, + "balance_loss_mlp": 1.05047798, + "epoch": 0.2581762216237014, + "flos": 568917029376.0, + "grad_norm": 0.07114901487039385, + "language_loss": 0.86560714, + "learning_rate": 0.0008695785266267515, + "loss": 0.87642074, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.30834961, + "step": 1342, + "time_per_iteration": 2.7169957160949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083185, + "balance_loss_mlp": 1.05309629, + "epoch": 0.258368603308965, + "flos": 603906430464.0, + "grad_norm": 0.06303738321086937, + "language_loss": 0.82804394, + "learning_rate": 0.0008693686208377704, + "loss": 0.83887577, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.30053711, + "step": 1343, + "time_per_iteration": 2.8591935634613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090607, + "balance_loss_mlp": 1.06142426, + "epoch": 0.2585609849942285, + "flos": 491204929536.0, + "grad_norm": 0.06465186244058573, + "language_loss": 0.88812125, + "learning_rate": 0.0008691585716471733, + "loss": 0.89902723, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.29150391, + "step": 1344, + "time_per_iteration": 2.6713430881500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099449, + "balance_loss_mlp": 1.07119632, + "epoch": 0.2587533666794921, + "flos": 640455607296.0, + "grad_norm": 0.0588719911399204, + "language_loss": 0.85261089, + "learning_rate": 0.0008689483791365079, + "loss": 0.86360538, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.28271484, + "step": 1345, + "time_per_iteration": 2.820528030395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112565, + "balance_loss_mlp": 1.08457518, + "epoch": 0.2589457483647557, + "flos": 576564397056.0, + "grad_norm": 0.06280839806958106, + "language_loss": 0.89176255, + "learning_rate": 0.0008687380433873786, + "loss": 0.90288818, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.28027344, + "step": 1346, + "time_per_iteration": 2.8161351680755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122151, + "balance_loss_mlp": 1.09442306, + "epoch": 0.25913813005001923, + "flos": 535164337152.0, + "grad_norm": 0.09019918884346267, + "language_loss": 0.82469404, + "learning_rate": 0.0008685275644814448, + "loss": 0.83591551, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.27734375, + "step": 1347, + "time_per_iteration": 2.693267822265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122406, + "balance_loss_mlp": 1.09403384, + "epoch": 0.2593305117352828, + "flos": 720713908224.0, + "grad_norm": 0.0763626786758855, + "language_loss": 0.83996952, + "learning_rate": 0.0008683169425004216, + "loss": 0.85119361, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.28393555, + "step": 1348, + "time_per_iteration": 2.9267332553863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104312, + "balance_loss_mlp": 1.07582057, + "epoch": 0.25952289342054635, + "flos": 709782635520.0, + "grad_norm": 0.0999879699530973, + "language_loss": 0.82942533, + "learning_rate": 0.0008681061775260799, + "loss": 0.84046841, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.28491211, + "step": 1349, + "time_per_iteration": 2.8389806747436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104623, + "balance_loss_mlp": 1.0761795, + "epoch": 0.25971527510580994, + "flos": 455688820224.0, + "grad_norm": 0.06848449496170159, + "language_loss": 0.9182089, + "learning_rate": 0.0008678952696402458, + "loss": 0.92925513, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.28442383, + "step": 1350, + "time_per_iteration": 2.520573377609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091244, + "balance_loss_mlp": 1.06270587, + "epoch": 0.25990765679107347, + "flos": 612223100928.0, + "grad_norm": 0.06363942150358032, + "language_loss": 0.86753285, + "learning_rate": 0.000867684218924801, + "loss": 0.87844533, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.28564453, + "step": 1351, + "time_per_iteration": 2.9015109539031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094999, + "balance_loss_mlp": 1.07382762, + "epoch": 0.26010003847633706, + "flos": 1537105766400.0, + "grad_norm": 0.03643594447100183, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80042088, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.21191406, + "step": 1352, + "time_per_iteration": 4.897913217544556 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089345, + "balance_loss_mlp": 1.05987692, + "epoch": 0.2602924201616006, + "flos": 715947393024.0, + "grad_norm": 0.05004222260192376, + "language_loss": 0.8488791, + "learning_rate": 0.0008672616893328834, + "loss": 0.85977256, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.29394531, + "step": 1353, + "time_per_iteration": 2.930330991744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089197, + "balance_loss_mlp": 1.05925155, + "epoch": 0.2604848018468642, + "flos": 643241917440.0, + "grad_norm": 0.06508424080641521, + "language_loss": 0.90170342, + "learning_rate": 0.0008670502106204512, + "loss": 0.91259539, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.29882812, + "step": 1354, + "time_per_iteration": 2.8581433296203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088042, + "balance_loss_mlp": 1.05821621, + "epoch": 0.26067718353212777, + "flos": 516783545856.0, + "grad_norm": 0.07357469643966064, + "language_loss": 0.81904948, + "learning_rate": 0.0008668385894064892, + "loss": 0.82992983, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.2980957, + "step": 1355, + "time_per_iteration": 2.6258199214935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086225, + "balance_loss_mlp": 1.05565977, + "epoch": 0.2608695652173913, + "flos": 822361890816.0, + "grad_norm": 0.05598612189883674, + "language_loss": 0.88435078, + "learning_rate": 0.0008666268257731562, + "loss": 0.89521307, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.30517578, + "step": 1356, + "time_per_iteration": 3.0935704708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096188, + "balance_loss_mlp": 1.06557548, + "epoch": 0.2610619469026549, + "flos": 1007451744768.0, + "grad_norm": 0.05877228431721195, + "language_loss": 0.85582316, + "learning_rate": 0.0008664149198026662, + "loss": 0.86678505, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.3059082, + "step": 1357, + "time_per_iteration": 3.3150172233581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093826, + "balance_loss_mlp": 1.06407189, + "epoch": 0.2612543285879184, + "flos": 536523291648.0, + "grad_norm": 0.08010917030088013, + "language_loss": 0.88609982, + "learning_rate": 0.0008662028715772883, + "loss": 0.8970381, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.29736328, + "step": 1358, + "time_per_iteration": 2.652510166168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117948, + "balance_loss_mlp": 1.08781219, + "epoch": 0.261446710273182, + "flos": 519166803456.0, + "grad_norm": 0.068011575409632, + "language_loss": 0.8599565, + "learning_rate": 0.0008659906811793467, + "loss": 0.87113595, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.30078125, + "step": 1359, + "time_per_iteration": 2.6895272731781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120144, + "balance_loss_mlp": 1.08917356, + "epoch": 0.26163909195844554, + "flos": 582975055872.0, + "grad_norm": 0.06541737550876531, + "language_loss": 0.89626461, + "learning_rate": 0.0008657783486912215, + "loss": 0.90746599, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.30932617, + "step": 1360, + "time_per_iteration": 2.762763738632202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112864, + "balance_loss_mlp": 1.09752679, + "epoch": 0.2618314736437091, + "flos": 958362057216.0, + "grad_norm": 0.08393806981558949, + "language_loss": 0.89884281, + "learning_rate": 0.0008655658741953472, + "loss": 0.91012919, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.31079102, + "step": 1361, + "time_per_iteration": 3.2099156379699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108189, + "balance_loss_mlp": 1.07740927, + "epoch": 0.26202385532897265, + "flos": 574530347520.0, + "grad_norm": 0.05266132623937494, + "language_loss": 0.88221049, + "learning_rate": 0.0008653532577742136, + "loss": 0.89329231, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.30761719, + "step": 1362, + "time_per_iteration": 2.6699323654174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097872, + "balance_loss_mlp": 1.06756878, + "epoch": 0.26221623701423624, + "flos": 445240585728.0, + "grad_norm": 0.06436829867728516, + "language_loss": 0.86740243, + "learning_rate": 0.0008651404995103659, + "loss": 0.87838113, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.30273438, + "step": 1363, + "time_per_iteration": 2.5310258865356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094148, + "balance_loss_mlp": 1.06286716, + "epoch": 0.26240861869949983, + "flos": 535459700736.0, + "grad_norm": 0.05795299669830668, + "language_loss": 0.8642996, + "learning_rate": 0.0008649275994864041, + "loss": 0.87524116, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.3125, + "step": 1364, + "time_per_iteration": 2.675330638885498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110185, + "balance_loss_mlp": 1.07066512, + "epoch": 0.26260100038476336, + "flos": 564940500480.0, + "grad_norm": 0.05147405231292679, + "language_loss": 0.83778602, + "learning_rate": 0.0008647145577849834, + "loss": 0.84880447, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.31152344, + "step": 1365, + "time_per_iteration": 2.817330837249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099582, + "balance_loss_mlp": 1.06913614, + "epoch": 0.26279338207002695, + "flos": 612745426944.0, + "grad_norm": 0.05119291352940178, + "language_loss": 0.82886052, + "learning_rate": 0.0008645013744888139, + "loss": 0.83985633, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.30395508, + "step": 1366, + "time_per_iteration": 2.9056894779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093325, + "balance_loss_mlp": 1.06318903, + "epoch": 0.2629857637552905, + "flos": 522555425280.0, + "grad_norm": 0.08887633390516779, + "language_loss": 0.8772788, + "learning_rate": 0.0008642880496806607, + "loss": 0.88821203, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.30102539, + "step": 1367, + "time_per_iteration": 2.8175759315490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094726, + "balance_loss_mlp": 1.0635649, + "epoch": 0.26317814544055407, + "flos": 534273864192.0, + "grad_norm": 0.0720053964715196, + "language_loss": 0.84128964, + "learning_rate": 0.0008640745834433437, + "loss": 0.85223687, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.3112793, + "step": 1368, + "time_per_iteration": 2.7703893184661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085473, + "balance_loss_mlp": 1.05559897, + "epoch": 0.2633705271258176, + "flos": 555235762176.0, + "grad_norm": 0.058958451803685384, + "language_loss": 0.86905044, + "learning_rate": 0.000863860975859738, + "loss": 0.87990516, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.29833984, + "step": 1369, + "time_per_iteration": 2.913543224334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093496, + "balance_loss_mlp": 1.06309724, + "epoch": 0.2635629088110812, + "flos": 552136711680.0, + "grad_norm": 0.07885033776141591, + "language_loss": 0.87845421, + "learning_rate": 0.0008636472270127733, + "loss": 0.8893891, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.3034668, + "step": 1370, + "time_per_iteration": 2.6615941524505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093443, + "balance_loss_mlp": 1.06368852, + "epoch": 0.2637552904963448, + "flos": 455752839168.0, + "grad_norm": 0.06686078076555955, + "language_loss": 0.90047085, + "learning_rate": 0.0008634333369854345, + "loss": 0.91140521, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.29736328, + "step": 1371, + "time_per_iteration": 2.611501932144165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109652, + "balance_loss_mlp": 1.06666958, + "epoch": 0.2639476721816083, + "flos": 612847323648.0, + "grad_norm": 0.05135890593758564, + "language_loss": 0.87519878, + "learning_rate": 0.0008632193058607608, + "loss": 0.88616395, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.29833984, + "step": 1372, + "time_per_iteration": 2.7420408725738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096239, + "balance_loss_mlp": 1.06681848, + "epoch": 0.2641400538668719, + "flos": 571645112832.0, + "grad_norm": 0.07070265457366111, + "language_loss": 0.80896008, + "learning_rate": 0.0008630051337218466, + "loss": 0.81992251, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.29394531, + "step": 1373, + "time_per_iteration": 2.694157123565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097092, + "balance_loss_mlp": 1.06762338, + "epoch": 0.2643324355521354, + "flos": 581979866112.0, + "grad_norm": 0.06318549857397857, + "language_loss": 0.8188293, + "learning_rate": 0.0008627908206518409, + "loss": 0.82980019, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.29418945, + "step": 1374, + "time_per_iteration": 2.703380823135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023426, + "balance_loss_mlp": 1.00330341, + "epoch": 0.264524817237399, + "flos": 1543845284352.0, + "grad_norm": 0.017765090827900253, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76174676, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.20117188, + "step": 1375, + "time_per_iteration": 4.995063781738281 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092603, + "balance_loss_mlp": 1.06237197, + "epoch": 0.26471719892266254, + "flos": 517783117824.0, + "grad_norm": 0.0561933760173491, + "language_loss": 0.9114545, + "learning_rate": 0.0008623617720514241, + "loss": 0.92238057, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.30224609, + "step": 1376, + "time_per_iteration": 2.666578769683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093572, + "balance_loss_mlp": 1.06276798, + "epoch": 0.26490958060792613, + "flos": 516936314880.0, + "grad_norm": 0.06268473823371516, + "language_loss": 0.84907627, + "learning_rate": 0.0008621470366875848, + "loss": 0.86001205, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.30761719, + "step": 1377, + "time_per_iteration": 2.576968193054199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087995, + "balance_loss_mlp": 1.05661869, + "epoch": 0.26510196229318966, + "flos": 596298350592.0, + "grad_norm": 0.05801174228437736, + "language_loss": 0.87514544, + "learning_rate": 0.0008619321607257966, + "loss": 0.88602537, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.31347656, + "step": 1378, + "time_per_iteration": 2.6873912811279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083836, + "balance_loss_mlp": 1.05396187, + "epoch": 0.26529434397845325, + "flos": 685488780288.0, + "grad_norm": 0.06612008054140536, + "language_loss": 0.81601393, + "learning_rate": 0.000861717144249482, + "loss": 0.82685226, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.2980957, + "step": 1379, + "time_per_iteration": 2.861531972885132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082319, + "balance_loss_mlp": 1.05220687, + "epoch": 0.26548672566371684, + "flos": 424127328768.0, + "grad_norm": 0.06041061044303736, + "language_loss": 0.89415485, + "learning_rate": 0.0008615019873421175, + "loss": 0.90497804, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.30053711, + "step": 1380, + "time_per_iteration": 2.4596564769744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080185, + "balance_loss_mlp": 1.04973865, + "epoch": 0.26567910734898037, + "flos": 489619012608.0, + "grad_norm": 0.12029414194163875, + "language_loss": 0.85435975, + "learning_rate": 0.0008612866900872349, + "loss": 0.86516166, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.30395508, + "step": 1381, + "time_per_iteration": 2.5492422580718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078246, + "balance_loss_mlp": 1.0483005, + "epoch": 0.26587148903424396, + "flos": 533947977216.0, + "grad_norm": 0.06111803920627532, + "language_loss": 0.87957448, + "learning_rate": 0.0008610712525684197, + "loss": 0.89035696, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.29882812, + "step": 1382, + "time_per_iteration": 2.632847309112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083959, + "balance_loss_mlp": 1.05356061, + "epoch": 0.2660638707195075, + "flos": 1017067732992.0, + "grad_norm": 0.07781171288722535, + "language_loss": 0.84130585, + "learning_rate": 0.0008608556748693121, + "loss": 0.85214543, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.3034668, + "step": 1383, + "time_per_iteration": 3.246919631958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108693, + "balance_loss_mlp": 1.05522013, + "epoch": 0.2662562524047711, + "flos": 523712148480.0, + "grad_norm": 0.052993237489823604, + "language_loss": 0.85963714, + "learning_rate": 0.000860639957073607, + "loss": 0.87050641, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.31689453, + "step": 1384, + "time_per_iteration": 2.7504889965057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086729, + "balance_loss_mlp": 1.05537665, + "epoch": 0.2664486340900346, + "flos": 552107598336.0, + "grad_norm": 0.06878538642870029, + "language_loss": 0.87610686, + "learning_rate": 0.0008604240992650534, + "loss": 0.88697416, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.31347656, + "step": 1385, + "time_per_iteration": 2.6546881198883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082661, + "balance_loss_mlp": 1.05135679, + "epoch": 0.2666410157752982, + "flos": 469895233536.0, + "grad_norm": 0.05853696199287041, + "language_loss": 0.89197159, + "learning_rate": 0.0008602081015274545, + "loss": 0.90279818, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.31274414, + "step": 1386, + "time_per_iteration": 2.7526328563690186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091919, + "balance_loss_mlp": 1.06061459, + "epoch": 0.2668333974605617, + "flos": 569645968896.0, + "grad_norm": 0.05264786586341277, + "language_loss": 0.83147365, + "learning_rate": 0.0008599919639446684, + "loss": 0.8423928, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.31274414, + "step": 1387, + "time_per_iteration": 2.6775026321411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093309, + "balance_loss_mlp": 1.06126583, + "epoch": 0.2670257791458253, + "flos": 398755325952.0, + "grad_norm": 0.06747698326814106, + "language_loss": 0.79790741, + "learning_rate": 0.000859775686600607, + "loss": 0.80884051, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.3203125, + "step": 1388, + "time_per_iteration": 2.535508871078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090634, + "balance_loss_mlp": 1.05921042, + "epoch": 0.2672181608310889, + "flos": 515587534848.0, + "grad_norm": 0.06336986871451572, + "language_loss": 0.84764999, + "learning_rate": 0.0008595592695792367, + "loss": 0.85855639, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.31396484, + "step": 1389, + "time_per_iteration": 2.6549055576324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097148, + "balance_loss_mlp": 1.06593931, + "epoch": 0.26741054251635243, + "flos": 507270864384.0, + "grad_norm": 0.055901377362424544, + "language_loss": 0.90619266, + "learning_rate": 0.0008593427129645778, + "loss": 0.91716409, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.31176758, + "step": 1390, + "time_per_iteration": 2.6070477962493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096463, + "balance_loss_mlp": 1.06542134, + "epoch": 0.267602924201616, + "flos": 576357783552.0, + "grad_norm": 0.06788313950064188, + "language_loss": 0.85213327, + "learning_rate": 0.0008591260168407052, + "loss": 0.86309791, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.31005859, + "step": 1391, + "time_per_iteration": 2.794921398162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092372, + "balance_loss_mlp": 1.05963671, + "epoch": 0.26779530588687955, + "flos": 523731087360.0, + "grad_norm": 0.052723370404498295, + "language_loss": 0.82993329, + "learning_rate": 0.0008589091812917479, + "loss": 0.84085703, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.32739258, + "step": 1392, + "time_per_iteration": 2.634734869003296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088674, + "balance_loss_mlp": 1.05727446, + "epoch": 0.26798768757214314, + "flos": 556508938752.0, + "grad_norm": 0.06846284491975779, + "language_loss": 0.85420829, + "learning_rate": 0.0008586922064018887, + "loss": 0.86509502, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.3137207, + "step": 1393, + "time_per_iteration": 2.662095308303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108591, + "balance_loss_mlp": 1.05408156, + "epoch": 0.2681800692574067, + "flos": 930246004224.0, + "grad_norm": 0.07721778370466406, + "language_loss": 0.89049023, + "learning_rate": 0.0008584750922553651, + "loss": 0.90134937, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.31811523, + "step": 1394, + "time_per_iteration": 3.15010666847229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082121, + "balance_loss_mlp": 1.05053067, + "epoch": 0.26837245094267026, + "flos": 700771931136.0, + "grad_norm": 0.054821616219537066, + "language_loss": 0.83275163, + "learning_rate": 0.0008582578389364677, + "loss": 0.8435728, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.31567383, + "step": 1395, + "time_per_iteration": 2.9199917316436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086932, + "balance_loss_mlp": 1.05469775, + "epoch": 0.26856483262793385, + "flos": 592892199936.0, + "grad_norm": 0.049938668546041676, + "language_loss": 0.91772366, + "learning_rate": 0.0008580404465295422, + "loss": 0.92859298, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.32226562, + "step": 1396, + "time_per_iteration": 2.8488125801086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079266, + "balance_loss_mlp": 1.04891562, + "epoch": 0.2687572143131974, + "flos": 713943866880.0, + "grad_norm": 0.06204428603549851, + "language_loss": 0.87966394, + "learning_rate": 0.0008578229151189876, + "loss": 0.89045662, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.30297852, + "step": 1397, + "time_per_iteration": 2.92258620262146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081241, + "balance_loss_mlp": 1.04867268, + "epoch": 0.26894959599846097, + "flos": 467481452544.0, + "grad_norm": 0.06429333021146523, + "language_loss": 0.81249309, + "learning_rate": 0.0008576052447892573, + "loss": 0.82330555, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.32568359, + "step": 1398, + "time_per_iteration": 2.551042318344116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083897, + "balance_loss_mlp": 1.05163908, + "epoch": 0.2691419776837245, + "flos": 468470850048.0, + "grad_norm": 0.0671833421183549, + "language_loss": 0.86040235, + "learning_rate": 0.000857387435624858, + "loss": 0.87124133, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.32250977, + "step": 1399, + "time_per_iteration": 2.5816056728363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086843, + "balance_loss_mlp": 1.05382252, + "epoch": 0.2693343593689881, + "flos": 937244418048.0, + "grad_norm": 0.05003222473195782, + "language_loss": 0.87953913, + "learning_rate": 0.0008571694877103513, + "loss": 0.89040762, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.33032227, + "step": 1400, + "time_per_iteration": 3.256469488143921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108756, + "balance_loss_mlp": 1.05542135, + "epoch": 0.2695267410542516, + "flos": 577303511040.0, + "grad_norm": 0.056643414184275494, + "language_loss": 0.87665725, + "learning_rate": 0.0008569514011303515, + "loss": 0.88753277, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.32128906, + "step": 1401, + "time_per_iteration": 2.782273054122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084611, + "balance_loss_mlp": 1.05275857, + "epoch": 0.2697191227395152, + "flos": 556539462144.0, + "grad_norm": 0.06127144796082157, + "language_loss": 0.8767277, + "learning_rate": 0.0008567331759695277, + "loss": 0.88757378, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.31835938, + "step": 1402, + "time_per_iteration": 2.696514368057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084545, + "balance_loss_mlp": 1.05178595, + "epoch": 0.26991150442477874, + "flos": 529024310784.0, + "grad_norm": 0.07491599518741582, + "language_loss": 0.86524475, + "learning_rate": 0.0008565148123126023, + "loss": 0.87609023, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.32763672, + "step": 1403, + "time_per_iteration": 2.6686785221099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088194, + "balance_loss_mlp": 1.05510116, + "epoch": 0.2701038861100423, + "flos": 531737837568.0, + "grad_norm": 0.050644669708274456, + "language_loss": 0.8574301, + "learning_rate": 0.0008562963102443516, + "loss": 0.86831206, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.33105469, + "step": 1404, + "time_per_iteration": 2.693836212158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085916, + "balance_loss_mlp": 1.05232334, + "epoch": 0.2702962677953059, + "flos": 734908737024.0, + "grad_norm": 0.06951419199959312, + "language_loss": 0.84958577, + "learning_rate": 0.0008560776698496056, + "loss": 0.8604449, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.33618164, + "step": 1405, + "time_per_iteration": 2.892805814743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084222, + "balance_loss_mlp": 1.05093896, + "epoch": 0.27048864948056944, + "flos": 574453181952.0, + "grad_norm": 0.07287556066439085, + "language_loss": 0.85794389, + "learning_rate": 0.0008558588912132481, + "loss": 0.8687861, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.33300781, + "step": 1406, + "time_per_iteration": 2.821922540664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098005, + "balance_loss_mlp": 1.07587957, + "epoch": 0.27068103116583303, + "flos": 1423091953152.0, + "grad_norm": 0.044578698770804955, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77556992, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.22167969, + "step": 1407, + "time_per_iteration": 4.952622652053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082949, + "balance_loss_mlp": 1.05016637, + "epoch": 0.27087341285109656, + "flos": 531742219776.0, + "grad_norm": 0.05991157104862915, + "language_loss": 0.82959783, + "learning_rate": 0.0008554209195555016, + "loss": 0.84042734, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.32788086, + "step": 1408, + "time_per_iteration": 2.6937575340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085788, + "balance_loss_mlp": 1.05403042, + "epoch": 0.27106579453636015, + "flos": 581108332032.0, + "grad_norm": 0.06960051295953752, + "language_loss": 0.88047969, + "learning_rate": 0.0008552017267041483, + "loss": 0.89133757, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.31738281, + "step": 1409, + "time_per_iteration": 2.7926084995269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093218, + "balance_loss_mlp": 1.06134176, + "epoch": 0.2712581762216237, + "flos": 506533160448.0, + "grad_norm": 0.07424010893339522, + "language_loss": 0.8324914, + "learning_rate": 0.0008549823959512549, + "loss": 0.8434236, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.31860352, + "step": 1410, + "time_per_iteration": 2.660325050354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098331, + "balance_loss_mlp": 1.06724083, + "epoch": 0.27145055790688727, + "flos": 997019476992.0, + "grad_norm": 0.062062202361739795, + "language_loss": 0.86755967, + "learning_rate": 0.0008547629273819728, + "loss": 0.87854296, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.31054688, + "step": 1411, + "time_per_iteration": 3.3994545936584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098737, + "balance_loss_mlp": 1.06736147, + "epoch": 0.2716429395921508, + "flos": 546420086784.0, + "grad_norm": 0.06335672358829844, + "language_loss": 0.83453959, + "learning_rate": 0.0008545433210815074, + "loss": 0.84552693, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.31347656, + "step": 1412, + "time_per_iteration": 2.644434690475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102874, + "balance_loss_mlp": 1.07123613, + "epoch": 0.2718353212774144, + "flos": 572954605056.0, + "grad_norm": 0.06340025797507488, + "language_loss": 0.87345338, + "learning_rate": 0.0008543235771351176, + "loss": 0.88448215, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.31616211, + "step": 1413, + "time_per_iteration": 2.7854721546173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098411, + "balance_loss_mlp": 1.0675596, + "epoch": 0.272027702962678, + "flos": 643986823680.0, + "grad_norm": 0.05399278560092938, + "language_loss": 0.84545946, + "learning_rate": 0.0008541036956281154, + "loss": 0.85644352, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.30834961, + "step": 1414, + "time_per_iteration": 2.8788704872131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091962, + "balance_loss_mlp": 1.06056201, + "epoch": 0.2722200846479415, + "flos": 653410755072.0, + "grad_norm": 0.07883268546047513, + "language_loss": 0.81883514, + "learning_rate": 0.0008538836766458665, + "loss": 0.82975471, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.3137207, + "step": 1415, + "time_per_iteration": 2.8526153564453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087599, + "balance_loss_mlp": 1.05732012, + "epoch": 0.2724124663332051, + "flos": 579346324992.0, + "grad_norm": 0.060849568603238105, + "language_loss": 0.84889638, + "learning_rate": 0.0008536635202737897, + "loss": 0.85977244, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.30224609, + "step": 1416, + "time_per_iteration": 2.837353467941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089772, + "balance_loss_mlp": 1.05903983, + "epoch": 0.2726048480184686, + "flos": 537178037760.0, + "grad_norm": 0.07898075745209039, + "language_loss": 0.82057679, + "learning_rate": 0.0008534432265973573, + "loss": 0.83147448, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.30688477, + "step": 1417, + "time_per_iteration": 2.5948355197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091815, + "balance_loss_mlp": 1.05891299, + "epoch": 0.2727972297037322, + "flos": 995360776704.0, + "grad_norm": 0.06605458024108496, + "language_loss": 0.87714171, + "learning_rate": 0.000853222795702095, + "loss": 0.88805991, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.32910156, + "step": 1418, + "time_per_iteration": 3.4183547496795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109188, + "balance_loss_mlp": 1.05842948, + "epoch": 0.27298961138899575, + "flos": 605924513280.0, + "grad_norm": 0.04642939327926388, + "language_loss": 0.83471483, + "learning_rate": 0.0008530022276735813, + "loss": 0.84563363, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.33447266, + "step": 1419, + "time_per_iteration": 2.711695432662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086506, + "balance_loss_mlp": 1.05293703, + "epoch": 0.27318199307425933, + "flos": 529059216384.0, + "grad_norm": 0.05938997521105461, + "language_loss": 0.85724676, + "learning_rate": 0.0008527815225974489, + "loss": 0.86811179, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.3359375, + "step": 1420, + "time_per_iteration": 2.648448944091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086523, + "balance_loss_mlp": 1.05407453, + "epoch": 0.2733743747595229, + "flos": 408809272320.0, + "grad_norm": 0.07492898694353861, + "language_loss": 0.87982917, + "learning_rate": 0.0008525606805593829, + "loss": 0.89069438, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.32446289, + "step": 1421, + "time_per_iteration": 2.4182560443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082421, + "balance_loss_mlp": 1.04997277, + "epoch": 0.27356675644478645, + "flos": 515976030720.0, + "grad_norm": 0.06962089633364145, + "language_loss": 0.82760686, + "learning_rate": 0.0008523397016451213, + "loss": 0.83843112, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.32446289, + "step": 1422, + "time_per_iteration": 2.587892532348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083477, + "balance_loss_mlp": 1.05021799, + "epoch": 0.27375913813005004, + "flos": 1051914539520.0, + "grad_norm": 0.053513553181154576, + "language_loss": 0.8711561, + "learning_rate": 0.0008521185859404564, + "loss": 0.88199091, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.33276367, + "step": 1423, + "time_per_iteration": 3.372192859649658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108477, + "balance_loss_mlp": 1.0513202, + "epoch": 0.27395151981531357, + "flos": 624507535872.0, + "grad_norm": 0.059986100163812936, + "language_loss": 0.89238524, + "learning_rate": 0.0008518973335312326, + "loss": 0.90323293, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.33447266, + "step": 1424, + "time_per_iteration": 2.791482448577881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082662, + "balance_loss_mlp": 1.04921198, + "epoch": 0.27414390150057716, + "flos": 550112836608.0, + "grad_norm": 0.06956472940992567, + "language_loss": 0.8333236, + "learning_rate": 0.0008516759445033477, + "loss": 0.84415025, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.3347168, + "step": 1425, + "time_per_iteration": 2.6889073848724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082757, + "balance_loss_mlp": 1.05088091, + "epoch": 0.2743362831858407, + "flos": 539596200960.0, + "grad_norm": 0.0615305422895171, + "language_loss": 0.84459686, + "learning_rate": 0.0008514544189427526, + "loss": 0.85542446, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.31860352, + "step": 1426, + "time_per_iteration": 2.797384738922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094312, + "balance_loss_mlp": 1.06143463, + "epoch": 0.2745286648711043, + "flos": 468352986624.0, + "grad_norm": 0.061840511174045036, + "language_loss": 0.86558306, + "learning_rate": 0.0008512327569354511, + "loss": 0.87652624, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.32885742, + "step": 1427, + "time_per_iteration": 2.533623695373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096344, + "balance_loss_mlp": 1.06418157, + "epoch": 0.2747210465563678, + "flos": 472617524736.0, + "grad_norm": 0.06551541099381472, + "language_loss": 0.83328068, + "learning_rate": 0.0008510109585675001, + "loss": 0.84424412, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.3215332, + "step": 1428, + "time_per_iteration": 2.623915672302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125702, + "balance_loss_mlp": 1.10653293, + "epoch": 0.2749134282416314, + "flos": 1314345070080.0, + "grad_norm": 0.06717437310459566, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82279044, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.19140625, + "step": 1429, + "time_per_iteration": 4.737167596817017 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096832, + "balance_loss_mlp": 1.06517005, + "epoch": 0.275105809926895, + "flos": 970445670912.0, + "grad_norm": 0.06718416370196487, + "language_loss": 0.80457842, + "learning_rate": 0.0008505669530941415, + "loss": 0.81554675, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.31640625, + "step": 1430, + "time_per_iteration": 3.380617141723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103498, + "balance_loss_mlp": 1.07169294, + "epoch": 0.2752981916121585, + "flos": 527089185792.0, + "grad_norm": 0.06498994038544256, + "language_loss": 0.83560073, + "learning_rate": 0.000850344746161112, + "loss": 0.8466357, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.31787109, + "step": 1431, + "time_per_iteration": 2.5917775630950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110093, + "balance_loss_mlp": 1.06883883, + "epoch": 0.2754905732974221, + "flos": 453487444992.0, + "grad_norm": 0.06649249705457211, + "language_loss": 0.87664711, + "learning_rate": 0.0008501224032121894, + "loss": 0.88765645, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.32080078, + "step": 1432, + "time_per_iteration": 2.493826150894165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101828, + "balance_loss_mlp": 1.06906962, + "epoch": 0.27568295498268564, + "flos": 497216918016.0, + "grad_norm": 0.06530156063230687, + "language_loss": 0.8172394, + "learning_rate": 0.0008498999243336946, + "loss": 0.82825768, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.32763672, + "step": 1433, + "time_per_iteration": 2.625955581665039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104254, + "balance_loss_mlp": 1.07275844, + "epoch": 0.2758753366679492, + "flos": 607890161664.0, + "grad_norm": 0.056445052388478564, + "language_loss": 0.87110436, + "learning_rate": 0.0008496773096120021, + "loss": 0.88214689, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.31469727, + "step": 1434, + "time_per_iteration": 2.8644402027130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093048, + "balance_loss_mlp": 1.06169593, + "epoch": 0.27606771835321275, + "flos": 739803290112.0, + "grad_norm": 0.07767765628739494, + "language_loss": 0.84306771, + "learning_rate": 0.0008494545591335381, + "loss": 0.85399818, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.31323242, + "step": 1435, + "time_per_iteration": 2.9069130420684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094657, + "balance_loss_mlp": 1.06366265, + "epoch": 0.27626010003847634, + "flos": 554279860224.0, + "grad_norm": 0.04344696113506711, + "language_loss": 0.86938953, + "learning_rate": 0.0008492316729847823, + "loss": 0.88033605, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.30957031, + "step": 1436, + "time_per_iteration": 2.844926595687866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091812, + "balance_loss_mlp": 1.06050754, + "epoch": 0.2764524817237399, + "flos": 542270439936.0, + "grad_norm": 0.055139322891005815, + "language_loss": 0.79749823, + "learning_rate": 0.0008490086512522664, + "loss": 0.80841637, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.31274414, + "step": 1437, + "time_per_iteration": 2.722158670425415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092682, + "balance_loss_mlp": 1.06121063, + "epoch": 0.27664486340900346, + "flos": 406027344384.0, + "grad_norm": 0.06334111858493886, + "language_loss": 0.90728873, + "learning_rate": 0.0008487854940225755, + "loss": 0.91821557, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.31445312, + "step": 1438, + "time_per_iteration": 2.43622088432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091549, + "balance_loss_mlp": 1.05991077, + "epoch": 0.27683724509426705, + "flos": 521884712448.0, + "grad_norm": 0.05907133214000555, + "language_loss": 0.89962572, + "learning_rate": 0.0008485622013823466, + "loss": 0.91054124, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.31616211, + "step": 1439, + "time_per_iteration": 2.61773943901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093806, + "balance_loss_mlp": 1.06154847, + "epoch": 0.2770296267795306, + "flos": 535085761536.0, + "grad_norm": 0.06492331678063241, + "language_loss": 0.82635379, + "learning_rate": 0.00084833877341827, + "loss": 0.83729184, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.32250977, + "step": 1440, + "time_per_iteration": 2.625870704650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092721, + "balance_loss_mlp": 1.06139278, + "epoch": 0.27722200846479417, + "flos": 487747906560.0, + "grad_norm": 0.06674971698169922, + "language_loss": 0.80478823, + "learning_rate": 0.000848115210217088, + "loss": 0.81571543, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.31298828, + "step": 1441, + "time_per_iteration": 2.577364444732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086558, + "balance_loss_mlp": 1.05410933, + "epoch": 0.2774143901500577, + "flos": 618012509184.0, + "grad_norm": 0.055312199129178424, + "language_loss": 0.81684244, + "learning_rate": 0.0008478915118655952, + "loss": 0.82770801, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.32446289, + "step": 1442, + "time_per_iteration": 2.714303493499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089692, + "balance_loss_mlp": 1.05710077, + "epoch": 0.2776067718353213, + "flos": 513563659776.0, + "grad_norm": 0.049794988647852687, + "language_loss": 0.86386287, + "learning_rate": 0.0008476676784506393, + "loss": 0.87475979, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.32592773, + "step": 1443, + "time_per_iteration": 2.68835711479187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087996, + "balance_loss_mlp": 1.05664372, + "epoch": 0.2777991535205848, + "flos": 1003985957376.0, + "grad_norm": 0.05900532389488003, + "language_loss": 0.82031631, + "learning_rate": 0.0008474437100591201, + "loss": 0.83119631, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.31323242, + "step": 1444, + "time_per_iteration": 3.3359997272491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084677, + "balance_loss_mlp": 1.05160809, + "epoch": 0.2779915352058484, + "flos": 550005147648.0, + "grad_norm": 0.054436577911169556, + "language_loss": 0.85231566, + "learning_rate": 0.0008472196067779898, + "loss": 0.86316246, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.33081055, + "step": 1445, + "time_per_iteration": 2.7946455478668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080884, + "balance_loss_mlp": 1.04850721, + "epoch": 0.278183916891112, + "flos": 873444930048.0, + "grad_norm": 0.08667298623079295, + "language_loss": 0.85239732, + "learning_rate": 0.0008469953686942531, + "loss": 0.86320615, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.32373047, + "step": 1446, + "time_per_iteration": 3.0761613845825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081651, + "balance_loss_mlp": 1.04927349, + "epoch": 0.2783762985763755, + "flos": 623782978560.0, + "grad_norm": 0.07591437330096602, + "language_loss": 0.8283245, + "learning_rate": 0.0008467709958949668, + "loss": 0.83914101, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.32373047, + "step": 1447, + "time_per_iteration": 2.7922093868255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082166, + "balance_loss_mlp": 1.0504328, + "epoch": 0.2785686802616391, + "flos": 581571021312.0, + "grad_norm": 0.0636917665663464, + "language_loss": 0.86192262, + "learning_rate": 0.0008465464884672403, + "loss": 0.8727442, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.31713867, + "step": 1448, + "time_per_iteration": 2.679574966430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108413, + "balance_loss_mlp": 1.05211091, + "epoch": 0.27876106194690264, + "flos": 587032980480.0, + "grad_norm": 0.06494062959974968, + "language_loss": 0.85664314, + "learning_rate": 0.0008463218464982348, + "loss": 0.86748445, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.32006836, + "step": 1449, + "time_per_iteration": 2.8746044635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086598, + "balance_loss_mlp": 1.05524611, + "epoch": 0.27895344363216623, + "flos": 875621574144.0, + "grad_norm": 0.05859002353759583, + "language_loss": 0.87554371, + "learning_rate": 0.0008460970700751645, + "loss": 0.88640976, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.31323242, + "step": 1450, + "time_per_iteration": 3.0630292892456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085303, + "balance_loss_mlp": 1.05447531, + "epoch": 0.27914582531742976, + "flos": 603630005760.0, + "grad_norm": 0.06644970008868617, + "language_loss": 0.8732717, + "learning_rate": 0.000845872159285295, + "loss": 0.8841247, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.30786133, + "step": 1451, + "time_per_iteration": 2.7334539890289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149095, + "balance_loss_mlp": 1.13173842, + "epoch": 0.27933820700269335, + "flos": 1496892953088.0, + "grad_norm": 0.04059568749878616, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78915942, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.17382812, + "step": 1452, + "time_per_iteration": 4.913143634796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087672, + "balance_loss_mlp": 1.05617714, + "epoch": 0.2795305886879569, + "flos": 1031445854208.0, + "grad_norm": 0.05755695164820471, + "language_loss": 0.86085773, + "learning_rate": 0.0008454219349544836, + "loss": 0.87173438, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.31469727, + "step": 1453, + "time_per_iteration": 3.3649299144744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086718, + "balance_loss_mlp": 1.05569983, + "epoch": 0.27972297037322047, + "flos": 606766934016.0, + "grad_norm": 0.059728326526783365, + "language_loss": 0.8137995, + "learning_rate": 0.000845196621588334, + "loss": 0.82466674, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.30981445, + "step": 1454, + "time_per_iteration": 2.7774734497070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082042, + "balance_loss_mlp": 1.05095196, + "epoch": 0.27991535205848406, + "flos": 630085948416.0, + "grad_norm": 0.0559695634724148, + "language_loss": 0.76184201, + "learning_rate": 0.0008449711742049706, + "loss": 0.77266252, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.31054688, + "step": 1455, + "time_per_iteration": 2.75393009185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107969, + "balance_loss_mlp": 1.04814696, + "epoch": 0.2801077337437476, + "flos": 549034689024.0, + "grad_norm": 0.06397369460964857, + "language_loss": 0.83309555, + "learning_rate": 0.0008447455928919196, + "loss": 0.84389246, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.31518555, + "step": 1456, + "time_per_iteration": 2.6542584896087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082481, + "balance_loss_mlp": 1.05177259, + "epoch": 0.2803001154290112, + "flos": 486516989952.0, + "grad_norm": 0.06274060179370718, + "language_loss": 0.86886203, + "learning_rate": 0.0008445198777367595, + "loss": 0.87968683, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.30664062, + "step": 1457, + "time_per_iteration": 2.6488282680511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089589, + "balance_loss_mlp": 1.05883336, + "epoch": 0.2804924971142747, + "flos": 521820693504.0, + "grad_norm": 0.06557026121847803, + "language_loss": 0.8106361, + "learning_rate": 0.0008442940288271208, + "loss": 0.82153201, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.30712891, + "step": 1458, + "time_per_iteration": 2.67258882522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096326, + "balance_loss_mlp": 1.06454456, + "epoch": 0.2806848787995383, + "flos": 527410690560.0, + "grad_norm": 0.07361561415976156, + "language_loss": 0.86939961, + "learning_rate": 0.0008440680462506856, + "loss": 0.88036287, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.31762695, + "step": 1459, + "time_per_iteration": 2.7335550785064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105041, + "balance_loss_mlp": 1.07354569, + "epoch": 0.2808772604848018, + "flos": 485246785536.0, + "grad_norm": 0.05419081251366802, + "language_loss": 0.86197531, + "learning_rate": 0.0008438419300951883, + "loss": 0.87302566, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.31469727, + "step": 1460, + "time_per_iteration": 2.6306796073913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106375, + "balance_loss_mlp": 1.07459426, + "epoch": 0.2810696421700654, + "flos": 617840801280.0, + "grad_norm": 0.08520166677325354, + "language_loss": 0.8634038, + "learning_rate": 0.0008436156804484148, + "loss": 0.87446761, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.31762695, + "step": 1461, + "time_per_iteration": 2.761599063873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110218, + "balance_loss_mlp": 1.0698266, + "epoch": 0.28126202385532895, + "flos": 454521922560.0, + "grad_norm": 0.06649626079325978, + "language_loss": 0.88025403, + "learning_rate": 0.0008433892973982031, + "loss": 0.89127588, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.32348633, + "step": 1462, + "time_per_iteration": 2.572810173034668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110576, + "balance_loss_mlp": 1.07333505, + "epoch": 0.28145440554059253, + "flos": 530447284224.0, + "grad_norm": 0.06397092621415032, + "language_loss": 0.85030043, + "learning_rate": 0.0008431627810324431, + "loss": 0.86135799, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.32421875, + "step": 1463, + "time_per_iteration": 2.6855740547180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109382, + "balance_loss_mlp": 1.0774579, + "epoch": 0.2816467872258561, + "flos": 451996070400.0, + "grad_norm": 0.06457367310459801, + "language_loss": 0.81006026, + "learning_rate": 0.000842936131439076, + "loss": 0.82115412, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.3190918, + "step": 1464, + "time_per_iteration": 2.5868756771087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102286, + "balance_loss_mlp": 1.07188725, + "epoch": 0.28183916891111965, + "flos": 472464755712.0, + "grad_norm": 0.06483114531916107, + "language_loss": 0.87564301, + "learning_rate": 0.0008427093487060951, + "loss": 0.88666582, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.3034668, + "step": 1465, + "time_per_iteration": 2.6775078773498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104224, + "balance_loss_mlp": 1.07294393, + "epoch": 0.28203155059638324, + "flos": 556770806784.0, + "grad_norm": 0.05163652452488039, + "language_loss": 0.84608126, + "learning_rate": 0.000842482432921545, + "loss": 0.85712349, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.3125, + "step": 1466, + "time_per_iteration": 2.844379186630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090816, + "balance_loss_mlp": 1.05955911, + "epoch": 0.28222393228164677, + "flos": 416756385792.0, + "grad_norm": 0.05726454257462379, + "language_loss": 0.86823475, + "learning_rate": 0.0008422553841735225, + "loss": 0.87914288, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.31225586, + "step": 1467, + "time_per_iteration": 2.4838902950286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087334, + "balance_loss_mlp": 1.05624461, + "epoch": 0.28241631396691036, + "flos": 604629577728.0, + "grad_norm": 0.07863392491108157, + "language_loss": 0.8442952, + "learning_rate": 0.0008420282025501757, + "loss": 0.85516858, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.31054688, + "step": 1468, + "time_per_iteration": 2.7528913021087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108248, + "balance_loss_mlp": 1.05169988, + "epoch": 0.2826086956521739, + "flos": 572698529280.0, + "grad_norm": 0.056003117579575636, + "language_loss": 0.852718, + "learning_rate": 0.0008418008881397043, + "loss": 0.86354285, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.30737305, + "step": 1469, + "time_per_iteration": 2.6801319122314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078886, + "balance_loss_mlp": 1.0479157, + "epoch": 0.2828010773374375, + "flos": 842367886848.0, + "grad_norm": 0.04937894089719141, + "language_loss": 0.82587177, + "learning_rate": 0.0008415734410303595, + "loss": 0.83666062, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.30932617, + "step": 1470, + "time_per_iteration": 3.1880481243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076388, + "balance_loss_mlp": 1.04551327, + "epoch": 0.28299345902270107, + "flos": 542402860032.0, + "grad_norm": 0.053571151454841835, + "language_loss": 0.90790403, + "learning_rate": 0.0008413458613104444, + "loss": 0.91866791, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.30834961, + "step": 1471, + "time_per_iteration": 2.6801347732543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107956, + "balance_loss_mlp": 1.04832768, + "epoch": 0.2831858407079646, + "flos": 571320635904.0, + "grad_norm": 0.054274543729309115, + "language_loss": 0.82964969, + "learning_rate": 0.0008411181490683129, + "loss": 0.84044528, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.31201172, + "step": 1472, + "time_per_iteration": 2.732304096221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107702, + "balance_loss_mlp": 1.04619205, + "epoch": 0.2833782223932282, + "flos": 763491861504.0, + "grad_norm": 0.05901735675502878, + "language_loss": 0.82318664, + "learning_rate": 0.0008408903043923707, + "loss": 0.83395684, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.30786133, + "step": 1473, + "time_per_iteration": 3.0503528118133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079988, + "balance_loss_mlp": 1.04906487, + "epoch": 0.2835706040784917, + "flos": 538793068032.0, + "grad_norm": 0.06313039437285956, + "language_loss": 0.81015414, + "learning_rate": 0.0008406623273710754, + "loss": 0.82095402, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.30883789, + "step": 1474, + "time_per_iteration": 2.606189727783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081508, + "balance_loss_mlp": 1.05008459, + "epoch": 0.2837629857637553, + "flos": 530329420800.0, + "grad_norm": 0.06295911479055617, + "language_loss": 0.82597101, + "learning_rate": 0.0008404342180929351, + "loss": 0.83678609, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.31396484, + "step": 1475, + "time_per_iteration": 2.620607614517212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073574, + "balance_loss_mlp": 1.04222226, + "epoch": 0.28395536744901884, + "flos": 539763526656.0, + "grad_norm": 0.06425181584365489, + "language_loss": 0.81938702, + "learning_rate": 0.00084020597664651, + "loss": 0.83012277, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.31323242, + "step": 1476, + "time_per_iteration": 2.7725043296813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083628, + "balance_loss_mlp": 1.05232406, + "epoch": 0.2841477491342824, + "flos": 573344510976.0, + "grad_norm": 0.06074887859321084, + "language_loss": 0.83907133, + "learning_rate": 0.0008399776031204111, + "loss": 0.84990764, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.31274414, + "step": 1477, + "time_per_iteration": 2.7300467491149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092258, + "balance_loss_mlp": 1.06081057, + "epoch": 0.28434013081954596, + "flos": 571802264064.0, + "grad_norm": 0.05838491012274946, + "language_loss": 0.80185568, + "learning_rate": 0.0008397490976033009, + "loss": 0.81277823, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.31420898, + "step": 1478, + "time_per_iteration": 2.650667905807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080543, + "balance_loss_mlp": 1.062042, + "epoch": 0.28453251250480954, + "flos": 1552554832896.0, + "grad_norm": 0.03640521186287318, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78960192, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.18457031, + "step": 1479, + "time_per_iteration": 4.764774322509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108592, + "balance_loss_mlp": 1.07654858, + "epoch": 0.28472489419007313, + "flos": 748720862208.0, + "grad_norm": 0.05702144306517339, + "language_loss": 0.85150903, + "learning_rate": 0.0008392916909509525, + "loss": 0.86259496, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.3203125, + "step": 1480, + "time_per_iteration": 3.0437960624694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104226, + "balance_loss_mlp": 1.07289815, + "epoch": 0.28491727587533666, + "flos": 489914376192.0, + "grad_norm": 0.06780557774925215, + "language_loss": 0.84802043, + "learning_rate": 0.0008390627899932954, + "loss": 0.85906273, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.31298828, + "step": 1481, + "time_per_iteration": 2.596781015396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100766, + "balance_loss_mlp": 1.0693903, + "epoch": 0.28510965756060025, + "flos": 728671196160.0, + "grad_norm": 0.07875184362779108, + "language_loss": 0.88996881, + "learning_rate": 0.000838833757399789, + "loss": 0.90097642, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.31347656, + "step": 1482, + "time_per_iteration": 2.94795560836792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084024, + "balance_loss_mlp": 1.05274367, + "epoch": 0.2853020392458638, + "flos": 551300083200.0, + "grad_norm": 0.07597770471398792, + "language_loss": 0.80484587, + "learning_rate": 0.0008386045932593515, + "loss": 0.81568611, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.3125, + "step": 1483, + "time_per_iteration": 2.6795289516448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079675, + "balance_loss_mlp": 1.0484184, + "epoch": 0.28549442093112737, + "flos": 754456425984.0, + "grad_norm": 0.05859914190414705, + "language_loss": 0.86136287, + "learning_rate": 0.0008383752976609525, + "loss": 0.8721596, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.31225586, + "step": 1484, + "time_per_iteration": 2.900468349456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080474, + "balance_loss_mlp": 1.04878783, + "epoch": 0.2856868026163909, + "flos": 538311439872.0, + "grad_norm": 0.0559282187978278, + "language_loss": 0.80215633, + "learning_rate": 0.0008381458706936123, + "loss": 0.81296104, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.31665039, + "step": 1485, + "time_per_iteration": 2.6815216541290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081909, + "balance_loss_mlp": 1.05031872, + "epoch": 0.2858791843016545, + "flos": 583487207424.0, + "grad_norm": 0.06658109550051822, + "language_loss": 0.87213105, + "learning_rate": 0.0008379163124464025, + "loss": 0.88295019, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.31567383, + "step": 1486, + "time_per_iteration": 2.7246947288513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098145, + "balance_loss_mlp": 1.06572032, + "epoch": 0.286071565986918, + "flos": 644503357440.0, + "grad_norm": 0.06266105362217729, + "language_loss": 0.76595891, + "learning_rate": 0.0008376866230084452, + "loss": 0.77694035, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.32421875, + "step": 1487, + "time_per_iteration": 2.8626444339752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102537, + "balance_loss_mlp": 1.07006407, + "epoch": 0.2862639476721816, + "flos": 491120561664.0, + "grad_norm": 0.07368717199594518, + "language_loss": 0.86109662, + "learning_rate": 0.000837456802468914, + "loss": 0.87212193, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.32470703, + "step": 1488, + "time_per_iteration": 2.5964457988739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109506, + "balance_loss_mlp": 1.07736683, + "epoch": 0.2864563293574452, + "flos": 521363796480.0, + "grad_norm": 0.0834333673185767, + "language_loss": 0.85148358, + "learning_rate": 0.0008372268509170331, + "loss": 0.86257863, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.32128906, + "step": 1489, + "time_per_iteration": 2.690129518508911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109667, + "balance_loss_mlp": 1.06500769, + "epoch": 0.2866487110427087, + "flos": 546834723840.0, + "grad_norm": 0.06354137393554884, + "language_loss": 0.84668255, + "learning_rate": 0.0008369967684420779, + "loss": 0.85764927, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.31640625, + "step": 1490, + "time_per_iteration": 2.71195912361145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084028, + "balance_loss_mlp": 1.0523901, + "epoch": 0.2868410927279723, + "flos": 481977437184.0, + "grad_norm": 0.054809792311278624, + "language_loss": 0.84395373, + "learning_rate": 0.0008367665551333736, + "loss": 0.85479403, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.31616211, + "step": 1491, + "time_per_iteration": 2.604795217514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083682, + "balance_loss_mlp": 1.05223465, + "epoch": 0.28703347441323585, + "flos": 724578365952.0, + "grad_norm": 0.06594588712207736, + "language_loss": 0.85254663, + "learning_rate": 0.0008365362110802977, + "loss": 0.86338341, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.31420898, + "step": 1492, + "time_per_iteration": 2.8853299617767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086881, + "balance_loss_mlp": 1.05619645, + "epoch": 0.28722585609849943, + "flos": 634670581248.0, + "grad_norm": 0.057648204576232445, + "language_loss": 0.82509673, + "learning_rate": 0.0008363057363722773, + "loss": 0.83596557, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.30664062, + "step": 1493, + "time_per_iteration": 2.8410117626190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088416, + "balance_loss_mlp": 1.05916238, + "epoch": 0.28741823778376296, + "flos": 509974216704.0, + "grad_norm": 0.06315135639172008, + "language_loss": 0.8381595, + "learning_rate": 0.0008360751310987906, + "loss": 0.84904373, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.29199219, + "step": 1494, + "time_per_iteration": 2.6032519340515137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088448, + "balance_loss_mlp": 1.05821633, + "epoch": 0.28761061946902655, + "flos": 603458297856.0, + "grad_norm": 0.0504042487563093, + "language_loss": 0.85491359, + "learning_rate": 0.0008358443953493666, + "loss": 0.865798, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.30175781, + "step": 1495, + "time_per_iteration": 2.859473943710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095118, + "balance_loss_mlp": 1.06586444, + "epoch": 0.28780300115429014, + "flos": 406977454080.0, + "grad_norm": 0.05765908021852543, + "language_loss": 0.87930727, + "learning_rate": 0.0008356135292135851, + "loss": 0.89025843, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.29223633, + "step": 1496, + "time_per_iteration": 2.5534088611602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092831, + "balance_loss_mlp": 1.06357718, + "epoch": 0.28799538283955367, + "flos": 374726310912.0, + "grad_norm": 0.06886872222290924, + "language_loss": 0.91869086, + "learning_rate": 0.0008353825327810758, + "loss": 0.92961913, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.29223633, + "step": 1497, + "time_per_iteration": 2.4516804218292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109953, + "balance_loss_mlp": 1.0700376, + "epoch": 0.28818776452481726, + "flos": 591645316608.0, + "grad_norm": 0.06787386534843613, + "language_loss": 0.81638563, + "learning_rate": 0.00083515140614152, + "loss": 0.8273809, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.29467773, + "step": 1498, + "time_per_iteration": 2.6799356937408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100433, + "balance_loss_mlp": 1.07136989, + "epoch": 0.2883801462100808, + "flos": 534819511296.0, + "grad_norm": 0.07094138317708479, + "language_loss": 0.861467, + "learning_rate": 0.0008349201493846485, + "loss": 0.87247133, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.2902832, + "step": 1499, + "time_per_iteration": 2.6408841609954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101106, + "balance_loss_mlp": 1.07190013, + "epoch": 0.2885725278953444, + "flos": 479850255360.0, + "grad_norm": 0.05864167405563355, + "language_loss": 0.88756049, + "learning_rate": 0.0008346887626002432, + "loss": 0.89857149, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.29174805, + "step": 1500, + "time_per_iteration": 2.527707099914551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102169, + "balance_loss_mlp": 1.07277215, + "epoch": 0.2887649095806079, + "flos": 463798877184.0, + "grad_norm": 0.05528939811548228, + "language_loss": 0.8596012, + "learning_rate": 0.000834457245878137, + "loss": 0.87062287, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.29345703, + "step": 1501, + "time_per_iteration": 2.6287105083465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097625, + "balance_loss_mlp": 1.0678941, + "epoch": 0.2889572912658715, + "flos": 930631527936.0, + "grad_norm": 0.05829487367290223, + "language_loss": 0.81370407, + "learning_rate": 0.000834225599308212, + "loss": 0.82468033, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.296875, + "step": 1502, + "time_per_iteration": 3.2405459880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097665, + "balance_loss_mlp": 1.06762409, + "epoch": 0.28914967295113503, + "flos": 569848200192.0, + "grad_norm": 0.0632270740356206, + "language_loss": 0.85299563, + "learning_rate": 0.0008339938229804016, + "loss": 0.86397231, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.30029297, + "step": 1503, + "time_per_iteration": 2.736917495727539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238462, + "balance_loss_mlp": 1.22091448, + "epoch": 0.2893420546363986, + "flos": 1485803119104.0, + "grad_norm": 0.0713987899259734, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76673281, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.17578125, + "step": 1504, + "time_per_iteration": 4.942230701446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085898, + "balance_loss_mlp": 1.0553329, + "epoch": 0.2895344363216622, + "flos": 469938903552.0, + "grad_norm": 0.06317842242163065, + "language_loss": 0.83872586, + "learning_rate": 0.0008335298814111094, + "loss": 0.84958482, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.30517578, + "step": 1505, + "time_per_iteration": 2.552032232284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082854, + "balance_loss_mlp": 1.05138254, + "epoch": 0.28972681800692573, + "flos": 647909508096.0, + "grad_norm": 0.05888591645587949, + "language_loss": 0.87955916, + "learning_rate": 0.0008332977163497455, + "loss": 0.89038765, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.31445312, + "step": 1506, + "time_per_iteration": 2.792531728744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080802, + "balance_loss_mlp": 1.0491637, + "epoch": 0.2899191996921893, + "flos": 571955033088.0, + "grad_norm": 0.058262801056698586, + "language_loss": 0.83412617, + "learning_rate": 0.0008330654218907325, + "loss": 0.84493423, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.31616211, + "step": 1507, + "time_per_iteration": 2.67161226272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082791, + "balance_loss_mlp": 1.05151033, + "epoch": 0.29011158137745285, + "flos": 661037773824.0, + "grad_norm": 0.053562219876337476, + "language_loss": 0.8135345, + "learning_rate": 0.0008328329981242548, + "loss": 0.8243624, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.3125, + "step": 1508, + "time_per_iteration": 2.8886146545410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082272, + "balance_loss_mlp": 1.05006218, + "epoch": 0.29030396306271644, + "flos": 535933974528.0, + "grad_norm": 0.059525688681207785, + "language_loss": 0.87796283, + "learning_rate": 0.0008326004451405475, + "loss": 0.88878554, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.32202148, + "step": 1509, + "time_per_iteration": 2.7613890171051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081166, + "balance_loss_mlp": 1.04919386, + "epoch": 0.29049634474798, + "flos": 511707110400.0, + "grad_norm": 0.06566805569484924, + "language_loss": 0.82636976, + "learning_rate": 0.0008323677630298957, + "loss": 0.83718145, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.31958008, + "step": 1510, + "time_per_iteration": 2.5723018646240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082627, + "balance_loss_mlp": 1.0500108, + "epoch": 0.29068872643324356, + "flos": 613454017536.0, + "grad_norm": 0.0587639353811087, + "language_loss": 0.84588593, + "learning_rate": 0.0008321349518826345, + "loss": 0.85671222, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.32617188, + "step": 1511, + "time_per_iteration": 2.7943453788757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085904, + "balance_loss_mlp": 1.05417013, + "epoch": 0.2908811081185071, + "flos": 546164011008.0, + "grad_norm": 0.07149106056529789, + "language_loss": 0.94572604, + "learning_rate": 0.0008319020117891491, + "loss": 0.95658505, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.31713867, + "step": 1512, + "time_per_iteration": 2.6216046810150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083214, + "balance_loss_mlp": 1.05095613, + "epoch": 0.2910734898037707, + "flos": 604516096512.0, + "grad_norm": 0.062137158428294176, + "language_loss": 0.87139338, + "learning_rate": 0.0008316689428398751, + "loss": 0.88222551, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.32250977, + "step": 1513, + "time_per_iteration": 2.7016332149505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082979, + "balance_loss_mlp": 1.05217493, + "epoch": 0.29126587148903427, + "flos": 574383370752.0, + "grad_norm": 0.048438835392173675, + "language_loss": 0.88380623, + "learning_rate": 0.0008314357451252979, + "loss": 0.89463598, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.30761719, + "step": 1514, + "time_per_iteration": 2.7707033157348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108486, + "balance_loss_mlp": 1.05329311, + "epoch": 0.2914582531742978, + "flos": 570802692096.0, + "grad_norm": 0.17247024929444854, + "language_loss": 0.87881547, + "learning_rate": 0.0008312024187359527, + "loss": 0.88966405, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.31542969, + "step": 1515, + "time_per_iteration": 2.6432881355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071467, + "balance_loss_mlp": 1.04083025, + "epoch": 0.2916506348595614, + "flos": 730523363328.0, + "grad_norm": 0.05532389066983382, + "language_loss": 0.86925149, + "learning_rate": 0.000830968963762425, + "loss": 0.8799662, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.3059082, + "step": 1516, + "time_per_iteration": 3.024911403656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070719, + "balance_loss_mlp": 1.03955793, + "epoch": 0.2918430165448249, + "flos": 510220118016.0, + "grad_norm": 0.06371457252332635, + "language_loss": 0.83926201, + "learning_rate": 0.0008307353802953497, + "loss": 0.84996927, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.3112793, + "step": 1517, + "time_per_iteration": 2.6853716373443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072896, + "balance_loss_mlp": 1.04202044, + "epoch": 0.2920353982300885, + "flos": 630096122880.0, + "grad_norm": 0.04882989118503786, + "language_loss": 0.86122108, + "learning_rate": 0.0008305016684254125, + "loss": 0.87195003, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.30859375, + "step": 1518, + "time_per_iteration": 2.799062728881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077466, + "balance_loss_mlp": 1.04589891, + "epoch": 0.29222777991535204, + "flos": 501411644928.0, + "grad_norm": 0.06769299348115199, + "language_loss": 0.86794329, + "learning_rate": 0.0008302678282433479, + "loss": 0.87871796, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.31542969, + "step": 1519, + "time_per_iteration": 2.607813835144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079374, + "balance_loss_mlp": 1.0473547, + "epoch": 0.2924201616006156, + "flos": 486522782208.0, + "grad_norm": 0.06836141022194388, + "language_loss": 0.84857148, + "learning_rate": 0.0008300338598399411, + "loss": 0.85936522, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.32006836, + "step": 1520, + "time_per_iteration": 2.6339783668518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079776, + "balance_loss_mlp": 1.04677844, + "epoch": 0.2926125432858792, + "flos": 476211350016.0, + "grad_norm": 0.07756319993269217, + "language_loss": 0.94405806, + "learning_rate": 0.0008297997633060263, + "loss": 0.9548558, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.33007812, + "step": 1521, + "time_per_iteration": 2.534118175506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072717, + "balance_loss_mlp": 1.03991103, + "epoch": 0.29280492497114274, + "flos": 676379151360.0, + "grad_norm": 0.05829817081366362, + "language_loss": 0.85078239, + "learning_rate": 0.0008295655387324883, + "loss": 0.86150956, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.328125, + "step": 1522, + "time_per_iteration": 2.8296775817871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072427, + "balance_loss_mlp": 1.04031241, + "epoch": 0.29299730665640633, + "flos": 458175384576.0, + "grad_norm": 0.07682732219120929, + "language_loss": 0.8501184, + "learning_rate": 0.0008293311862102609, + "loss": 0.8608427, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.32104492, + "step": 1523, + "time_per_iteration": 2.5440309047698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077196, + "balance_loss_mlp": 1.044366, + "epoch": 0.29318968834166986, + "flos": 446343464448.0, + "grad_norm": 0.0685602534850527, + "language_loss": 0.88674849, + "learning_rate": 0.0008290967058303275, + "loss": 0.89752042, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.32836914, + "step": 1524, + "time_per_iteration": 2.47611403465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107333, + "balance_loss_mlp": 1.04138136, + "epoch": 0.29338207002693345, + "flos": 450085676544.0, + "grad_norm": 0.06274350285183052, + "language_loss": 0.86149156, + "learning_rate": 0.0008288620976837219, + "loss": 0.87222481, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.31933594, + "step": 1525, + "time_per_iteration": 2.497141122817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076595, + "balance_loss_mlp": 1.04409802, + "epoch": 0.293574451712197, + "flos": 502027103232.0, + "grad_norm": 0.056882926132582716, + "language_loss": 0.82547259, + "learning_rate": 0.000828627361861527, + "loss": 0.8362385, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.32495117, + "step": 1526, + "time_per_iteration": 2.567631959915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074146, + "balance_loss_mlp": 1.04157782, + "epoch": 0.29376683339746057, + "flos": 696158184960.0, + "grad_norm": 0.06286177552115993, + "language_loss": 0.84273493, + "learning_rate": 0.0008283924984548752, + "loss": 0.85347635, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.32568359, + "step": 1527, + "time_per_iteration": 2.8300318717956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075655, + "balance_loss_mlp": 1.04270601, + "epoch": 0.2939592150827241, + "flos": 478353088512.0, + "grad_norm": 0.05246647038375997, + "language_loss": 0.84726572, + "learning_rate": 0.0008281575075549485, + "loss": 0.85802233, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.32958984, + "step": 1528, + "time_per_iteration": 2.574363946914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144512, + "balance_loss_mlp": 1.12400758, + "epoch": 0.2941515967679877, + "flos": 1484482042368.0, + "grad_norm": 0.05743835109314035, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78497207, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.20507812, + "step": 1529, + "time_per_iteration": 4.712693452835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085379, + "balance_loss_mlp": 1.05316901, + "epoch": 0.2943439784532513, + "flos": 673848916992.0, + "grad_norm": 0.06778682509264199, + "language_loss": 0.90275097, + "learning_rate": 0.0008276871436402469, + "loss": 0.9136048, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.32202148, + "step": 1530, + "time_per_iteration": 2.839327812194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098938, + "balance_loss_mlp": 1.06801534, + "epoch": 0.2945363601385148, + "flos": 576031896576.0, + "grad_norm": 0.05712547612295055, + "language_loss": 0.87684029, + "learning_rate": 0.000827451770808083, + "loss": 0.88782966, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.30908203, + "step": 1531, + "time_per_iteration": 2.6601221561431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101215, + "balance_loss_mlp": 1.06921971, + "epoch": 0.2947287418237784, + "flos": 480416251392.0, + "grad_norm": 0.06660356736231628, + "language_loss": 0.82939392, + "learning_rate": 0.0008272162708478674, + "loss": 0.84040606, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.31982422, + "step": 1532, + "time_per_iteration": 2.5689916610717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093792, + "balance_loss_mlp": 1.06234503, + "epoch": 0.2949211235090419, + "flos": 557917355520.0, + "grad_norm": 0.09954158315547566, + "language_loss": 0.86026615, + "learning_rate": 0.000826980643851029, + "loss": 0.87120402, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.31420898, + "step": 1533, + "time_per_iteration": 2.668490409851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096886, + "balance_loss_mlp": 1.06560588, + "epoch": 0.2951135051943055, + "flos": 483646311936.0, + "grad_norm": 0.06068587162994625, + "language_loss": 0.84473491, + "learning_rate": 0.0008267448899090464, + "loss": 0.85570371, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.3125, + "step": 1534, + "time_per_iteration": 2.5667166709899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111174, + "balance_loss_mlp": 1.08053756, + "epoch": 0.29530588687956905, + "flos": 550015322112.0, + "grad_norm": 0.07629507960375684, + "language_loss": 0.80660546, + "learning_rate": 0.0008265090091134473, + "loss": 0.81771713, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.3059082, + "step": 1535, + "time_per_iteration": 2.8708250522613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108767, + "balance_loss_mlp": 1.07793915, + "epoch": 0.29549826856483263, + "flos": 672731481600.0, + "grad_norm": 0.06117244877185189, + "language_loss": 0.80140841, + "learning_rate": 0.0008262730015558088, + "loss": 0.81249607, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.30786133, + "step": 1536, + "time_per_iteration": 2.872954845428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100004, + "balance_loss_mlp": 1.06960511, + "epoch": 0.29569065025009617, + "flos": 764300786688.0, + "grad_norm": 0.058742702923310866, + "language_loss": 0.82196116, + "learning_rate": 0.0008260368673277574, + "loss": 0.8329612, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.3034668, + "step": 1537, + "time_per_iteration": 3.1321218013763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099555, + "balance_loss_mlp": 1.06963336, + "epoch": 0.29588303193535975, + "flos": 543398049792.0, + "grad_norm": 0.0781542924594719, + "language_loss": 0.83699298, + "learning_rate": 0.0008258006065209682, + "loss": 0.84798855, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.29882812, + "step": 1538, + "time_per_iteration": 2.7713711261749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108634, + "balance_loss_mlp": 1.0791415, + "epoch": 0.29607541362062334, + "flos": 596648968704.0, + "grad_norm": 0.060396297474130736, + "language_loss": 0.80198979, + "learning_rate": 0.0008255642192271657, + "loss": 0.81307614, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.29443359, + "step": 1539, + "time_per_iteration": 2.770426034927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104575, + "balance_loss_mlp": 1.07525003, + "epoch": 0.29626779530588687, + "flos": 609588149760.0, + "grad_norm": 0.061957869610313854, + "language_loss": 0.8370012, + "learning_rate": 0.0008253277055381241, + "loss": 0.8480469, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.29296875, + "step": 1540, + "time_per_iteration": 2.818236827850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101049, + "balance_loss_mlp": 1.07196212, + "epoch": 0.29646017699115046, + "flos": 867050237952.0, + "grad_norm": 0.0808235318545815, + "language_loss": 0.85973728, + "learning_rate": 0.0008250910655456658, + "loss": 0.8707478, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.29052734, + "step": 1541, + "time_per_iteration": 3.122596502304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097236, + "balance_loss_mlp": 1.06888783, + "epoch": 0.296652558676414, + "flos": 495616444416.0, + "grad_norm": 0.06915250684599016, + "language_loss": 0.83763367, + "learning_rate": 0.0008248542993416625, + "loss": 0.84860599, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.28369141, + "step": 1542, + "time_per_iteration": 2.5910961627960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093651, + "balance_loss_mlp": 1.06408739, + "epoch": 0.2968449403616776, + "flos": 571275555840.0, + "grad_norm": 0.05605218699384054, + "language_loss": 0.8378318, + "learning_rate": 0.0008246174070180352, + "loss": 0.84876835, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.29516602, + "step": 1543, + "time_per_iteration": 2.6633899211883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010919, + "balance_loss_mlp": 1.06312323, + "epoch": 0.2970373220469411, + "flos": 793799115264.0, + "grad_norm": 0.07006000939384768, + "language_loss": 0.83787405, + "learning_rate": 0.0008243803886667537, + "loss": 0.84879309, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.28759766, + "step": 1544, + "time_per_iteration": 3.114450216293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092222, + "balance_loss_mlp": 1.0623486, + "epoch": 0.2972297037322047, + "flos": 660736617984.0, + "grad_norm": 0.06063612617340172, + "language_loss": 0.78866625, + "learning_rate": 0.0008241432443798364, + "loss": 0.79958844, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.2980957, + "step": 1545, + "time_per_iteration": 2.830487012863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095453, + "balance_loss_mlp": 1.06491208, + "epoch": 0.29742208541746823, + "flos": 596849789952.0, + "grad_norm": 0.05072672460675934, + "language_loss": 0.85210156, + "learning_rate": 0.0008239059742493512, + "loss": 0.86305606, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.30493164, + "step": 1546, + "time_per_iteration": 2.7311577796936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096869, + "balance_loss_mlp": 1.06654167, + "epoch": 0.2976144671027318, + "flos": 769519816704.0, + "grad_norm": 0.06216195389248957, + "language_loss": 0.87149853, + "learning_rate": 0.0008236685783674142, + "loss": 0.88246721, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.30273438, + "step": 1547, + "time_per_iteration": 3.122184991836548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195158, + "balance_loss_mlp": 1.17408168, + "epoch": 0.2978068487879954, + "flos": 1483980065280.0, + "grad_norm": 0.0711099730375168, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77416348, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.2109375, + "step": 1548, + "time_per_iteration": 4.884527683258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112807, + "balance_loss_mlp": 1.08190823, + "epoch": 0.29799923047325894, + "flos": 475079357952.0, + "grad_norm": 0.0721948840315393, + "language_loss": 0.82155961, + "learning_rate": 0.0008231934097178955, + "loss": 0.83268768, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.30859375, + "step": 1549, + "time_per_iteration": 2.6486034393310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099898, + "balance_loss_mlp": 1.06845081, + "epoch": 0.2981916121585225, + "flos": 759464460288.0, + "grad_norm": 0.06744191732210313, + "language_loss": 0.85654205, + "learning_rate": 0.0008229556371347903, + "loss": 0.86754102, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.31420898, + "step": 1550, + "time_per_iteration": 2.973072052001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096299, + "balance_loss_mlp": 1.06530416, + "epoch": 0.29838399384378606, + "flos": 874642351104.0, + "grad_norm": 0.063776129703287, + "language_loss": 0.79039407, + "learning_rate": 0.0008227177391691874, + "loss": 0.80135703, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.30957031, + "step": 1551, + "time_per_iteration": 3.121493339538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091, + "balance_loss_mlp": 1.05948138, + "epoch": 0.29857637552904964, + "flos": 579389995008.0, + "grad_norm": 0.06994546641795159, + "language_loss": 0.89363164, + "learning_rate": 0.0008224797159134463, + "loss": 0.90454161, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.31494141, + "step": 1552, + "time_per_iteration": 2.714345932006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085005, + "balance_loss_mlp": 1.05272293, + "epoch": 0.2987687572143132, + "flos": 836048950272.0, + "grad_norm": 0.0687696840960861, + "language_loss": 0.83498526, + "learning_rate": 0.0008222415674599765, + "loss": 0.84583527, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.32275391, + "step": 1553, + "time_per_iteration": 3.0709471702575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108737, + "balance_loss_mlp": 1.05482578, + "epoch": 0.29896113889957676, + "flos": 566800022016.0, + "grad_norm": 0.05942841135237563, + "language_loss": 0.83069479, + "learning_rate": 0.0008220032939012349, + "loss": 0.84156853, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.32543945, + "step": 1554, + "time_per_iteration": 2.6579041481018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084574, + "balance_loss_mlp": 1.05069458, + "epoch": 0.29915352058484035, + "flos": 498370669056.0, + "grad_norm": 0.05066559322117623, + "language_loss": 0.87862611, + "learning_rate": 0.0008217648953297277, + "loss": 0.88947189, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.33886719, + "step": 1555, + "time_per_iteration": 2.854501962661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080889, + "balance_loss_mlp": 1.04836845, + "epoch": 0.2993459022701039, + "flos": 591837373440.0, + "grad_norm": 0.06306800858294438, + "language_loss": 0.78177649, + "learning_rate": 0.0008215263718380095, + "loss": 0.79258537, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.32519531, + "step": 1556, + "time_per_iteration": 2.679813861846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072521, + "balance_loss_mlp": 1.03988135, + "epoch": 0.29953828395536747, + "flos": 572107802112.0, + "grad_norm": 0.05857921257987888, + "language_loss": 0.84453404, + "learning_rate": 0.0008212877235186833, + "loss": 0.8552593, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.32641602, + "step": 1557, + "time_per_iteration": 2.674333333969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074496, + "balance_loss_mlp": 1.0575211, + "epoch": 0.299730665640631, + "flos": 1503855051264.0, + "grad_norm": 0.03849586533955073, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78812063, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.16992188, + "step": 1558, + "time_per_iteration": 4.915595531463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073624, + "balance_loss_mlp": 1.04193807, + "epoch": 0.2999230473258946, + "flos": 513538928640.0, + "grad_norm": 0.06731849387550101, + "language_loss": 0.80882478, + "learning_rate": 0.0008208100527678611, + "loss": 0.81956106, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.31665039, + "step": 1559, + "time_per_iteration": 2.584726572036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107407, + "balance_loss_mlp": 1.04162097, + "epoch": 0.3001154290111581, + "flos": 834128381952.0, + "grad_norm": 0.07382200765663921, + "language_loss": 0.78279877, + "learning_rate": 0.0008205710305218135, + "loss": 0.79353946, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.32446289, + "step": 1560, + "time_per_iteration": 3.0383710861206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074163, + "balance_loss_mlp": 1.04302561, + "epoch": 0.3003078106964217, + "flos": 556485617664.0, + "grad_norm": 0.058207727477831525, + "language_loss": 0.89512408, + "learning_rate": 0.0008203318838190541, + "loss": 0.90586567, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.31103516, + "step": 1561, + "time_per_iteration": 2.76627516746521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077695, + "balance_loss_mlp": 1.04662895, + "epoch": 0.30050019238168524, + "flos": 525897556992.0, + "grad_norm": 0.06168132254821995, + "language_loss": 0.85111785, + "learning_rate": 0.0008200926127524281, + "loss": 0.86189479, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.31030273, + "step": 1562, + "time_per_iteration": 2.6629600524902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077501, + "balance_loss_mlp": 1.04641104, + "epoch": 0.3006925740669488, + "flos": 577582907904.0, + "grad_norm": 0.05613480590592382, + "language_loss": 0.82944739, + "learning_rate": 0.0008198532174148289, + "loss": 0.84022236, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.31054688, + "step": 1563, + "time_per_iteration": 2.7358763217926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059198, + "balance_loss_mlp": 1.042413, + "epoch": 0.3008849557522124, + "flos": 1489408528896.0, + "grad_norm": 0.031593282863211954, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81745368, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.16796875, + "step": 1564, + "time_per_iteration": 4.9148335456848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082495, + "balance_loss_mlp": 1.05264509, + "epoch": 0.30107733743747594, + "flos": 509565371904.0, + "grad_norm": 0.06408713771925002, + "language_loss": 0.88499033, + "learning_rate": 0.0008193740542985244, + "loss": 0.89581525, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.2980957, + "step": 1565, + "time_per_iteration": 2.6895992755889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010784, + "balance_loss_mlp": 1.04955089, + "epoch": 0.30126971912273953, + "flos": 587425858560.0, + "grad_norm": 0.05458149708053591, + "language_loss": 0.86310005, + "learning_rate": 0.0008191342867058467, + "loss": 0.87388408, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.28833008, + "step": 1566, + "time_per_iteration": 2.7972991466522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087074, + "balance_loss_mlp": 1.05708098, + "epoch": 0.30146210080800306, + "flos": 601822918656.0, + "grad_norm": 0.07332398387540356, + "language_loss": 0.8337127, + "learning_rate": 0.0008188943952142509, + "loss": 0.84458339, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.29931641, + "step": 1567, + "time_per_iteration": 2.7908260822296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090521, + "balance_loss_mlp": 1.06203008, + "epoch": 0.30165448249326665, + "flos": 917424686592.0, + "grad_norm": 0.06528974392408285, + "language_loss": 0.82496703, + "learning_rate": 0.0008186543799168711, + "loss": 0.83587217, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.28491211, + "step": 1568, + "time_per_iteration": 3.1478142738342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090151, + "balance_loss_mlp": 1.06170726, + "epoch": 0.3018468641785302, + "flos": 776953368576.0, + "grad_norm": 0.05489125757590388, + "language_loss": 0.87973905, + "learning_rate": 0.0008184142409068892, + "loss": 0.89064056, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.28466797, + "step": 1569, + "time_per_iteration": 3.0216779708862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085926, + "balance_loss_mlp": 1.05767381, + "epoch": 0.30203924586379377, + "flos": 522101500416.0, + "grad_norm": 0.055531787765466835, + "language_loss": 0.86334872, + "learning_rate": 0.000818173978277536, + "loss": 0.87420803, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.2824707, + "step": 1570, + "time_per_iteration": 2.679858922958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092107, + "balance_loss_mlp": 1.06378245, + "epoch": 0.3022316275490573, + "flos": 524288318976.0, + "grad_norm": 0.07890485552513911, + "language_loss": 0.83764422, + "learning_rate": 0.000817933592122089, + "loss": 0.84856522, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.28344727, + "step": 1571, + "time_per_iteration": 2.7156453132629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097909, + "balance_loss_mlp": 1.06936991, + "epoch": 0.3024240092343209, + "flos": 479672755200.0, + "grad_norm": 0.06172775968750255, + "language_loss": 0.83209121, + "learning_rate": 0.0008176930825338749, + "loss": 0.84307027, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.28564453, + "step": 1572, + "time_per_iteration": 2.6125760078430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092858, + "balance_loss_mlp": 1.06474876, + "epoch": 0.3026163909195845, + "flos": 686901579264.0, + "grad_norm": 0.07609523017386281, + "language_loss": 0.88406599, + "learning_rate": 0.0008174524496062679, + "loss": 0.8949945, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.28100586, + "step": 1573, + "time_per_iteration": 2.9266738891601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093192, + "balance_loss_mlp": 1.06472516, + "epoch": 0.302808772604848, + "flos": 542654553600.0, + "grad_norm": 0.061281594343297996, + "language_loss": 0.85176635, + "learning_rate": 0.0008172116934326894, + "loss": 0.86269826, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.28466797, + "step": 1574, + "time_per_iteration": 2.78182315826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093702, + "balance_loss_mlp": 1.06499696, + "epoch": 0.3030011542901116, + "flos": 474852395520.0, + "grad_norm": 0.061003462460527645, + "language_loss": 0.87581599, + "learning_rate": 0.0008169708141066097, + "loss": 0.88675308, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.28686523, + "step": 1575, + "time_per_iteration": 2.579521894454956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095615, + "balance_loss_mlp": 1.06631374, + "epoch": 0.30319353597537513, + "flos": 481233940992.0, + "grad_norm": 0.06494361929352876, + "language_loss": 0.90285015, + "learning_rate": 0.0008167298117215465, + "loss": 0.91380632, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.29272461, + "step": 1576, + "time_per_iteration": 2.576373815536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109664, + "balance_loss_mlp": 1.06729078, + "epoch": 0.3033859176606387, + "flos": 704455916544.0, + "grad_norm": 0.06029453435911351, + "language_loss": 0.87511861, + "learning_rate": 0.0008164886863710649, + "loss": 0.88608503, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.29296875, + "step": 1577, + "time_per_iteration": 2.913679599761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097606, + "balance_loss_mlp": 1.06847095, + "epoch": 0.30357829934590225, + "flos": 764344456704.0, + "grad_norm": 0.06219192746352704, + "language_loss": 0.86087388, + "learning_rate": 0.0008162474381487783, + "loss": 0.87184995, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.29101562, + "step": 1578, + "time_per_iteration": 3.0120038986206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089575, + "balance_loss_mlp": 1.05979693, + "epoch": 0.30377068103116583, + "flos": 532082663424.0, + "grad_norm": 0.07133259007734825, + "language_loss": 0.84352636, + "learning_rate": 0.0008160060671483475, + "loss": 0.85442215, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.29711914, + "step": 1579, + "time_per_iteration": 2.6448450088500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087505, + "balance_loss_mlp": 1.05729711, + "epoch": 0.3039630627164294, + "flos": 509934928896.0, + "grad_norm": 0.06969729270721756, + "language_loss": 0.83291966, + "learning_rate": 0.0008157645734634809, + "loss": 0.8437947, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.30200195, + "step": 1580, + "time_per_iteration": 2.623994827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219684, + "balance_loss_mlp": 1.20118308, + "epoch": 0.30415544440169295, + "flos": 1505206803456.0, + "grad_norm": 0.06785469110901753, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78116179, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.18457031, + "step": 1581, + "time_per_iteration": 4.945984840393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134498, + "balance_loss_mlp": 1.11723626, + "epoch": 0.30434782608695654, + "flos": 1457976637440.0, + "grad_norm": 0.04727039603147748, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74348998, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.17285156, + "step": 1582, + "time_per_iteration": 4.907581567764282 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094198, + "balance_loss_mlp": 1.06482506, + "epoch": 0.3045402077722201, + "flos": 482312088576.0, + "grad_norm": 0.06103997784231323, + "language_loss": 0.83613545, + "learning_rate": 0.000815039357240067, + "loss": 0.84707743, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.29345703, + "step": 1583, + "time_per_iteration": 2.6569504737854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098856, + "balance_loss_mlp": 1.07053173, + "epoch": 0.30473258945748366, + "flos": 543220549632.0, + "grad_norm": 0.05926881191118497, + "language_loss": 0.85445809, + "learning_rate": 0.0008147973737554952, + "loss": 0.86544669, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.28344727, + "step": 1584, + "time_per_iteration": 2.8048319816589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105359, + "balance_loss_mlp": 1.07682085, + "epoch": 0.3049249711427472, + "flos": 566789847552.0, + "grad_norm": 0.06192456547731419, + "language_loss": 0.85451925, + "learning_rate": 0.000814555268055744, + "loss": 0.86557281, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.28540039, + "step": 1585, + "time_per_iteration": 2.6496644020080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111676, + "balance_loss_mlp": 1.08265996, + "epoch": 0.3051173528280108, + "flos": 527970894336.0, + "grad_norm": 0.06812003210241727, + "language_loss": 0.87046736, + "learning_rate": 0.0008143130402348073, + "loss": 0.88158417, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.28979492, + "step": 1586, + "time_per_iteration": 2.6643214225769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105818, + "balance_loss_mlp": 1.07644498, + "epoch": 0.3053097345132743, + "flos": 586097427456.0, + "grad_norm": 0.055468457342214825, + "language_loss": 0.79345113, + "learning_rate": 0.0008140706903867265, + "loss": 0.80450928, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.29345703, + "step": 1587, + "time_per_iteration": 2.793938159942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095768, + "balance_loss_mlp": 1.06610858, + "epoch": 0.3055021161985379, + "flos": 606810604032.0, + "grad_norm": 0.06572122415162869, + "language_loss": 0.90151691, + "learning_rate": 0.0008138282186055897, + "loss": 0.91247463, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.29614258, + "step": 1588, + "time_per_iteration": 2.7083215713500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093617, + "balance_loss_mlp": 1.06414866, + "epoch": 0.3056944978838015, + "flos": 573594794496.0, + "grad_norm": 0.07456080522357873, + "language_loss": 0.82026887, + "learning_rate": 0.0008135856249855331, + "loss": 0.83120513, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.29467773, + "step": 1589, + "time_per_iteration": 2.6640753746032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108636, + "balance_loss_mlp": 1.05720115, + "epoch": 0.305886879569065, + "flos": 633640485888.0, + "grad_norm": 0.06169186885540492, + "language_loss": 0.89804673, + "learning_rate": 0.0008133429096207398, + "loss": 0.90891039, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.29125977, + "step": 1590, + "time_per_iteration": 2.7599587440490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180768, + "balance_loss_mlp": 1.16407835, + "epoch": 0.3060792612543286, + "flos": 1368227414016.0, + "grad_norm": 0.058161185258212886, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76493025, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.16699219, + "step": 1591, + "time_per_iteration": 4.928807973861694 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092058, + "balance_loss_mlp": 1.06149244, + "epoch": 0.30627164293959214, + "flos": 518290887168.0, + "grad_norm": 0.05378358074526122, + "language_loss": 0.86363673, + "learning_rate": 0.0008128571140339123, + "loss": 0.87455726, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.30517578, + "step": 1592, + "time_per_iteration": 2.6374073028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093229, + "balance_loss_mlp": 1.06182945, + "epoch": 0.3064640246248557, + "flos": 455354168832.0, + "grad_norm": 0.059608258439458016, + "language_loss": 0.87261879, + "learning_rate": 0.0008126140340004805, + "loss": 0.88355112, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.3137207, + "step": 1593, + "time_per_iteration": 2.5177900791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106947, + "balance_loss_mlp": 1.07528496, + "epoch": 0.30665640631011926, + "flos": 849718480896.0, + "grad_norm": 0.05384575425533411, + "language_loss": 0.82083076, + "learning_rate": 0.0008123708325995172, + "loss": 0.83190024, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.31640625, + "step": 1594, + "time_per_iteration": 3.230646848678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106567, + "balance_loss_mlp": 1.07466626, + "epoch": 0.30684878799538284, + "flos": 757996406784.0, + "grad_norm": 0.05828956025392548, + "language_loss": 0.79435146, + "learning_rate": 0.0008121275099254414, + "loss": 0.80541706, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.31884766, + "step": 1595, + "time_per_iteration": 2.902198553085327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100784, + "balance_loss_mlp": 1.07000458, + "epoch": 0.3070411696806464, + "flos": 517320428544.0, + "grad_norm": 0.0810481792888773, + "language_loss": 0.87996, + "learning_rate": 0.0008118840660727194, + "loss": 0.89096785, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.30761719, + "step": 1596, + "time_per_iteration": 2.6448442935943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086051, + "balance_loss_mlp": 1.05465174, + "epoch": 0.30723355136590996, + "flos": 843883992576.0, + "grad_norm": 0.06221817840069264, + "language_loss": 0.87278962, + "learning_rate": 0.0008116405011358644, + "loss": 0.88365012, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.3137207, + "step": 1597, + "time_per_iteration": 3.1513490676879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084783, + "balance_loss_mlp": 1.05455184, + "epoch": 0.30742593305117355, + "flos": 465905710080.0, + "grad_norm": 0.05780846158028219, + "language_loss": 0.79670262, + "learning_rate": 0.0008113968152094369, + "loss": 0.80755049, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.30175781, + "step": 1598, + "time_per_iteration": 2.5093207359313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081718, + "balance_loss_mlp": 1.05160582, + "epoch": 0.3076183147364371, + "flos": 686286120960.0, + "grad_norm": 0.05742950260468591, + "language_loss": 0.822034, + "learning_rate": 0.0008111530083880438, + "loss": 0.83285123, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.30078125, + "step": 1599, + "time_per_iteration": 2.9002020359039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083164, + "balance_loss_mlp": 1.05333805, + "epoch": 0.30781069642170067, + "flos": 613729032192.0, + "grad_norm": 0.066825138462863, + "language_loss": 0.86253393, + "learning_rate": 0.0008109090807663399, + "loss": 0.87336552, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.29760742, + "step": 1600, + "time_per_iteration": 2.8091297149658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078593, + "balance_loss_mlp": 1.04921985, + "epoch": 0.3080030781069642, + "flos": 590021521920.0, + "grad_norm": 0.05248494232095894, + "language_loss": 0.88362008, + "learning_rate": 0.0008106650324390257, + "loss": 0.89440602, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.29370117, + "step": 1601, + "time_per_iteration": 2.8476614952087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080904, + "balance_loss_mlp": 1.05072021, + "epoch": 0.3081954597922278, + "flos": 562353601536.0, + "grad_norm": 0.06836714374526962, + "language_loss": 0.81128752, + "learning_rate": 0.0008104208635008493, + "loss": 0.82209659, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.30151367, + "step": 1602, + "time_per_iteration": 2.6952836513519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108624, + "balance_loss_mlp": 1.05665243, + "epoch": 0.3083878414774913, + "flos": 447599112192.0, + "grad_norm": 0.06376665529861299, + "language_loss": 0.81538713, + "learning_rate": 0.0008101765740466058, + "loss": 0.82624954, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.29541016, + "step": 1603, + "time_per_iteration": 2.4948389530181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080977, + "balance_loss_mlp": 1.05098414, + "epoch": 0.3085802231627549, + "flos": 493297205760.0, + "grad_norm": 0.06931980864978393, + "language_loss": 0.84338289, + "learning_rate": 0.0008099321641711364, + "loss": 0.85419261, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.29931641, + "step": 1604, + "time_per_iteration": 2.707308769226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093892, + "balance_loss_mlp": 1.06249225, + "epoch": 0.3087726048480185, + "flos": 487437986304.0, + "grad_norm": 0.060864651717696075, + "language_loss": 0.83160985, + "learning_rate": 0.0008096876339693295, + "loss": 0.84254879, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.3137207, + "step": 1605, + "time_per_iteration": 2.731968402862549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094701, + "balance_loss_mlp": 1.06353974, + "epoch": 0.308964986533282, + "flos": 730265877504.0, + "grad_norm": 0.06509347225319946, + "language_loss": 0.8101337, + "learning_rate": 0.0008094429835361206, + "loss": 0.8210808, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.3112793, + "step": 1606, + "time_per_iteration": 2.9290759563446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089997, + "balance_loss_mlp": 1.05914617, + "epoch": 0.3091573682185456, + "flos": 605131554816.0, + "grad_norm": 0.057098253953708926, + "language_loss": 0.8565855, + "learning_rate": 0.0008091982129664908, + "loss": 0.86748546, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.30810547, + "step": 1607, + "time_per_iteration": 2.698822021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087412, + "balance_loss_mlp": 1.05558348, + "epoch": 0.30934974990380915, + "flos": 460081396224.0, + "grad_norm": 0.06809183454795278, + "language_loss": 0.82921505, + "learning_rate": 0.0008089533223554687, + "loss": 0.8400892, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.31811523, + "step": 1608, + "time_per_iteration": 2.7226502895355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108171, + "balance_loss_mlp": 1.05116844, + "epoch": 0.30954213158907273, + "flos": 553142075904.0, + "grad_norm": 0.05457453553086006, + "language_loss": 0.85192972, + "learning_rate": 0.0008087083117981294, + "loss": 0.86274683, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.30493164, + "step": 1609, + "time_per_iteration": 2.8990776538848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079, + "balance_loss_mlp": 1.04733825, + "epoch": 0.30973451327433627, + "flos": 552776901120.0, + "grad_norm": 0.05682891267097286, + "language_loss": 0.87723553, + "learning_rate": 0.0008084631813895943, + "loss": 0.88802552, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.31665039, + "step": 1610, + "time_per_iteration": 2.8217973709106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077424, + "balance_loss_mlp": 1.04538095, + "epoch": 0.30992689495959985, + "flos": 565430893056.0, + "grad_norm": 0.06653230383850259, + "language_loss": 0.83695799, + "learning_rate": 0.0008082179312250315, + "loss": 0.84773219, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.3203125, + "step": 1611, + "time_per_iteration": 2.6502630710601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157933, + "balance_loss_mlp": 1.13905036, + "epoch": 0.3101192766448634, + "flos": 1441621131264.0, + "grad_norm": 0.03907624866068961, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.81013775, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.18847656, + "step": 1612, + "time_per_iteration": 4.846347332000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142611, + "balance_loss_mlp": 1.12401426, + "epoch": 0.31031165833012697, + "flos": 1531086575616.0, + "grad_norm": 0.03590336133433786, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77771938, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.18554688, + "step": 1613, + "time_per_iteration": 5.076608896255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085959, + "balance_loss_mlp": 1.05432057, + "epoch": 0.31050404001539056, + "flos": 991534196736.0, + "grad_norm": 0.06574200684353006, + "language_loss": 0.81847739, + "learning_rate": 0.0008074814631475545, + "loss": 0.829337, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.31616211, + "step": 1614, + "time_per_iteration": 3.354888916015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086834, + "balance_loss_mlp": 1.05552983, + "epoch": 0.3106964217006541, + "flos": 445748355072.0, + "grad_norm": 0.058665683967318874, + "language_loss": 0.79078931, + "learning_rate": 0.0008072357349114907, + "loss": 0.80165768, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.31274414, + "step": 1615, + "time_per_iteration": 2.66959810256958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085653, + "balance_loss_mlp": 1.05427742, + "epoch": 0.3108888033859177, + "flos": 510259405824.0, + "grad_norm": 0.07028059658598983, + "language_loss": 0.88604105, + "learning_rate": 0.0008069898873959363, + "loss": 0.89689755, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.31347656, + "step": 1616, + "time_per_iteration": 2.652873992919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081821, + "balance_loss_mlp": 1.04932451, + "epoch": 0.3110811850711812, + "flos": 520471913472.0, + "grad_norm": 0.0549356144381418, + "language_loss": 0.85724425, + "learning_rate": 0.0008067439206963375, + "loss": 0.86806244, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.32495117, + "step": 1617, + "time_per_iteration": 2.651966094970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078025, + "balance_loss_mlp": 1.04707837, + "epoch": 0.3112735667564448, + "flos": 686085299712.0, + "grad_norm": 0.06196009796144799, + "language_loss": 0.86023569, + "learning_rate": 0.0008064978349081873, + "loss": 0.87101597, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.30908203, + "step": 1618, + "time_per_iteration": 2.9655556678771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076741, + "balance_loss_mlp": 1.04403007, + "epoch": 0.31146594844170833, + "flos": 532786871808.0, + "grad_norm": 0.05286958899784421, + "language_loss": 0.86531937, + "learning_rate": 0.0008062516301270245, + "loss": 0.87608671, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.32714844, + "step": 1619, + "time_per_iteration": 2.6688730716705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077635, + "balance_loss_mlp": 1.04668832, + "epoch": 0.3116583301269719, + "flos": 679187220480.0, + "grad_norm": 0.04767982292239376, + "language_loss": 0.88103712, + "learning_rate": 0.0008060053064484343, + "loss": 0.89181346, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.30908203, + "step": 1620, + "time_per_iteration": 2.9296655654907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078439, + "balance_loss_mlp": 1.04794526, + "epoch": 0.31185071181223545, + "flos": 585855908352.0, + "grad_norm": 0.062218975842766755, + "language_loss": 0.85253787, + "learning_rate": 0.0008057588639680482, + "loss": 0.86332226, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.3046875, + "step": 1621, + "time_per_iteration": 2.7567451000213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077048, + "balance_loss_mlp": 1.04686427, + "epoch": 0.31204309349749904, + "flos": 725090517504.0, + "grad_norm": 0.06694670244497776, + "language_loss": 0.82797694, + "learning_rate": 0.0008055123027815434, + "loss": 0.83874738, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.30151367, + "step": 1622, + "time_per_iteration": 2.9208602905273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077079, + "balance_loss_mlp": 1.04610825, + "epoch": 0.3122354751827626, + "flos": 576558604800.0, + "grad_norm": 0.1782498685509151, + "language_loss": 0.84590065, + "learning_rate": 0.0008052656229846436, + "loss": 0.85667145, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.30932617, + "step": 1623, + "time_per_iteration": 2.7155866622924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073968, + "balance_loss_mlp": 1.04328322, + "epoch": 0.31242785686802615, + "flos": 575672514048.0, + "grad_norm": 0.060959339396114136, + "language_loss": 0.90353578, + "learning_rate": 0.0008050188246731182, + "loss": 0.91427553, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.30664062, + "step": 1624, + "time_per_iteration": 2.6797330379486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076074, + "balance_loss_mlp": 1.04412627, + "epoch": 0.31262023855328974, + "flos": 736490271744.0, + "grad_norm": 0.055606567643031936, + "language_loss": 0.81689882, + "learning_rate": 0.0008047719079427834, + "loss": 0.82765961, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.31933594, + "step": 1625, + "time_per_iteration": 3.0065042972564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130441, + "balance_loss_mlp": 1.11031902, + "epoch": 0.3128126202385533, + "flos": 1558395113472.0, + "grad_norm": 0.04475298972307083, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75482148, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.20117188, + "step": 1626, + "time_per_iteration": 4.811471462249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079382, + "balance_loss_mlp": 1.04688525, + "epoch": 0.31300500192381686, + "flos": 514666538496.0, + "grad_norm": 0.07327685166102689, + "language_loss": 0.86126161, + "learning_rate": 0.0008042777196091757, + "loss": 0.87205535, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.32495117, + "step": 1627, + "time_per_iteration": 2.673499584197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108474, + "balance_loss_mlp": 1.05241048, + "epoch": 0.3131973836090804, + "flos": 526370420736.0, + "grad_norm": 0.055253724304277024, + "language_loss": 0.81718934, + "learning_rate": 0.0008040304481977643, + "loss": 0.82803679, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.32324219, + "step": 1628, + "time_per_iteration": 2.655608654022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088465, + "balance_loss_mlp": 1.0556109, + "epoch": 0.313389765294344, + "flos": 822473961984.0, + "grad_norm": 0.07469207399290811, + "language_loss": 0.86699098, + "learning_rate": 0.0008037830587512649, + "loss": 0.87787557, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.32861328, + "step": 1629, + "time_per_iteration": 3.092052459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108569, + "balance_loss_mlp": 1.0538609, + "epoch": 0.31358214697960757, + "flos": 393604697088.0, + "grad_norm": 0.05491200172004239, + "language_loss": 0.78946573, + "learning_rate": 0.0008035355513657224, + "loss": 0.80032265, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.31811523, + "step": 1630, + "time_per_iteration": 2.539320468902588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082653, + "balance_loss_mlp": 1.05111051, + "epoch": 0.3137745286648711, + "flos": 571611617280.0, + "grad_norm": 0.05139869194515267, + "language_loss": 0.92925692, + "learning_rate": 0.0008032879261372279, + "loss": 0.94008344, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.31518555, + "step": 1631, + "time_per_iteration": 2.779520034790039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107671, + "balance_loss_mlp": 1.05868566, + "epoch": 0.3139669103501347, + "flos": 1497614690304.0, + "grad_norm": 0.031013784922197977, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80712551, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.18066406, + "step": 1632, + "time_per_iteration": 5.371822357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078996, + "balance_loss_mlp": 1.04828787, + "epoch": 0.3141592920353982, + "flos": 525090041856.0, + "grad_norm": 0.055553714952817974, + "language_loss": 0.87074977, + "learning_rate": 0.0008027923225359748, + "loss": 0.8815397, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.30688477, + "step": 1633, + "time_per_iteration": 2.6381123065948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078973, + "balance_loss_mlp": 1.04797852, + "epoch": 0.3143516737206618, + "flos": 592989714432.0, + "grad_norm": 0.05859649155609266, + "language_loss": 0.88228178, + "learning_rate": 0.0008025443443556267, + "loss": 0.89307147, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.30957031, + "step": 1634, + "time_per_iteration": 2.7031404972076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078155, + "balance_loss_mlp": 1.04785156, + "epoch": 0.31454405540592534, + "flos": 648034573824.0, + "grad_norm": 0.052081770011180493, + "language_loss": 0.88152099, + "learning_rate": 0.000802296248717147, + "loss": 0.89230251, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.30273438, + "step": 1635, + "time_per_iteration": 2.9598543643951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080703, + "balance_loss_mlp": 1.05080533, + "epoch": 0.3147364370911889, + "flos": 642543501312.0, + "grad_norm": 0.066530556652877, + "language_loss": 0.78616363, + "learning_rate": 0.0008020480357168554, + "loss": 0.79697067, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.29833984, + "step": 1636, + "time_per_iteration": 2.797565221786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010823, + "balance_loss_mlp": 1.05261683, + "epoch": 0.31492881877645246, + "flos": 471607778304.0, + "grad_norm": 0.1046412191682548, + "language_loss": 0.87883365, + "learning_rate": 0.0008017997054511165, + "loss": 0.88965666, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.29638672, + "step": 1637, + "time_per_iteration": 2.559032440185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078208, + "balance_loss_mlp": 1.04733276, + "epoch": 0.31512120046171604, + "flos": 629135838720.0, + "grad_norm": 0.05513941849331592, + "language_loss": 0.85624552, + "learning_rate": 0.0008015512580163407, + "loss": 0.86702752, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.30834961, + "step": 1638, + "time_per_iteration": 2.779050827026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074987, + "balance_loss_mlp": 1.04363525, + "epoch": 0.31531358214697963, + "flos": 703460726784.0, + "grad_norm": 0.05557291013478606, + "language_loss": 0.81019449, + "learning_rate": 0.0008013026935089838, + "loss": 0.82094443, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.31323242, + "step": 1639, + "time_per_iteration": 2.89715838432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076962, + "balance_loss_mlp": 1.04701638, + "epoch": 0.31550596383224316, + "flos": 572275127808.0, + "grad_norm": 0.06613944709877946, + "language_loss": 0.8358075, + "learning_rate": 0.0008010540120255472, + "loss": 0.84657711, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.29882812, + "step": 1640, + "time_per_iteration": 2.651386260986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077047, + "balance_loss_mlp": 1.0463388, + "epoch": 0.31569834551750675, + "flos": 658047822336.0, + "grad_norm": 0.07317243700129339, + "language_loss": 0.86339968, + "learning_rate": 0.0008008052136625774, + "loss": 0.87417012, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.30688477, + "step": 1641, + "time_per_iteration": 2.7859702110290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077184, + "balance_loss_mlp": 1.04642797, + "epoch": 0.3158907272027703, + "flos": 566002681344.0, + "grad_norm": 0.05078324108170858, + "language_loss": 0.86915755, + "learning_rate": 0.0008005562985166666, + "loss": 0.87992936, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.30712891, + "step": 1642, + "time_per_iteration": 2.770359516143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078554, + "balance_loss_mlp": 1.04775047, + "epoch": 0.31608310888803387, + "flos": 536622216192.0, + "grad_norm": 0.048579646337906, + "language_loss": 0.85256124, + "learning_rate": 0.0008003072666844524, + "loss": 0.86334682, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.30761719, + "step": 1643, + "time_per_iteration": 2.6892380714416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081754, + "balance_loss_mlp": 1.05076003, + "epoch": 0.3162754905732974, + "flos": 486428239872.0, + "grad_norm": 0.06943709441331726, + "language_loss": 0.82542813, + "learning_rate": 0.0008000581182626173, + "loss": 0.83624566, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.30981445, + "step": 1644, + "time_per_iteration": 2.550408124923706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085079, + "balance_loss_mlp": 1.05496669, + "epoch": 0.316467872258561, + "flos": 529792538112.0, + "grad_norm": 0.05777646040930187, + "language_loss": 0.86256635, + "learning_rate": 0.0007998088533478894, + "loss": 0.87341708, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.30053711, + "step": 1645, + "time_per_iteration": 2.646522283554077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081027, + "balance_loss_mlp": 1.05019915, + "epoch": 0.3166602539438245, + "flos": 443197771776.0, + "grad_norm": 0.07748310873558778, + "language_loss": 0.84388101, + "learning_rate": 0.000799559472037042, + "loss": 0.85469127, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.30786133, + "step": 1646, + "time_per_iteration": 2.562495708465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081594, + "balance_loss_mlp": 1.05112433, + "epoch": 0.3168526356290881, + "flos": 645513103872.0, + "grad_norm": 0.0644603274178606, + "language_loss": 0.87469906, + "learning_rate": 0.0007993099744268932, + "loss": 0.88551497, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.30419922, + "step": 1647, + "time_per_iteration": 2.905468225479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074972, + "balance_loss_mlp": 1.04414475, + "epoch": 0.3170450173143517, + "flos": 585889403904.0, + "grad_norm": 0.06139744482341488, + "language_loss": 0.87846816, + "learning_rate": 0.000799060360614307, + "loss": 0.88921791, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.30786133, + "step": 1648, + "time_per_iteration": 2.6811182498931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083311, + "balance_loss_mlp": 1.05250716, + "epoch": 0.3172373989996152, + "flos": 826763231232.0, + "grad_norm": 0.05150264807756507, + "language_loss": 0.83281147, + "learning_rate": 0.0007988106306961917, + "loss": 0.84364462, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.30761719, + "step": 1649, + "time_per_iteration": 3.132918119430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078108, + "balance_loss_mlp": 1.04840076, + "epoch": 0.3174297806848788, + "flos": 527153204736.0, + "grad_norm": 0.0787550229152594, + "language_loss": 0.84213352, + "learning_rate": 0.0007985607847695014, + "loss": 0.85291457, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.29663086, + "step": 1650, + "time_per_iteration": 2.690056085586548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078554, + "balance_loss_mlp": 1.04784608, + "epoch": 0.31762216237014235, + "flos": 712855544832.0, + "grad_norm": 0.0566788479410698, + "language_loss": 0.82883936, + "learning_rate": 0.0007983108229312345, + "loss": 0.83962488, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.30664062, + "step": 1651, + "time_per_iteration": 2.918217182159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077737, + "balance_loss_mlp": 1.04679036, + "epoch": 0.31781454405540593, + "flos": 483567736320.0, + "grad_norm": 0.0674507609019882, + "language_loss": 0.86496019, + "learning_rate": 0.0007980607452784351, + "loss": 0.87573761, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.30908203, + "step": 1652, + "time_per_iteration": 2.5508391857147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081941, + "balance_loss_mlp": 1.052019, + "epoch": 0.31800692574066947, + "flos": 548483249664.0, + "grad_norm": 0.06063063486045483, + "language_loss": 0.90349394, + "learning_rate": 0.0007978105519081919, + "loss": 0.91431332, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.29858398, + "step": 1653, + "time_per_iteration": 2.6819515228271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079168, + "balance_loss_mlp": 1.04910302, + "epoch": 0.31819930742593305, + "flos": 516640951296.0, + "grad_norm": 0.0738675373878511, + "language_loss": 0.87538201, + "learning_rate": 0.0007975602429176385, + "loss": 0.88617373, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.30004883, + "step": 1654, + "time_per_iteration": 2.586261034011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084486, + "balance_loss_mlp": 1.05356312, + "epoch": 0.31839168911119664, + "flos": 455748456960.0, + "grad_norm": 0.051475836139836105, + "language_loss": 0.81585073, + "learning_rate": 0.0007973098184039536, + "loss": 0.82669556, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.30883789, + "step": 1655, + "time_per_iteration": 2.66395902633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083198, + "balance_loss_mlp": 1.05291927, + "epoch": 0.3185840707964602, + "flos": 625719513600.0, + "grad_norm": 0.059751712008043044, + "language_loss": 0.86801946, + "learning_rate": 0.0007970592784643602, + "loss": 0.87885141, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.30224609, + "step": 1656, + "time_per_iteration": 2.9186086654663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087926, + "balance_loss_mlp": 1.05855238, + "epoch": 0.31877645248172376, + "flos": 567213249024.0, + "grad_norm": 0.07875703275612048, + "language_loss": 0.85285407, + "learning_rate": 0.0007968086231961272, + "loss": 0.86373335, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.29321289, + "step": 1657, + "time_per_iteration": 2.6505343914031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089245, + "balance_loss_mlp": 1.05941832, + "epoch": 0.3189688341669873, + "flos": 489338205696.0, + "grad_norm": 0.08653253817480935, + "language_loss": 0.8381049, + "learning_rate": 0.0007965578526965671, + "loss": 0.84899735, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.29785156, + "step": 1658, + "time_per_iteration": 2.5884180068969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089397, + "balance_loss_mlp": 1.05995274, + "epoch": 0.3191612158522509, + "flos": 575948938752.0, + "grad_norm": 0.05523051502884026, + "language_loss": 0.86312473, + "learning_rate": 0.0007963069670630377, + "loss": 0.87401861, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.29394531, + "step": 1659, + "time_per_iteration": 2.750601291656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089678, + "balance_loss_mlp": 1.05997133, + "epoch": 0.3193535975375144, + "flos": 537867689472.0, + "grad_norm": 0.06732717892338919, + "language_loss": 0.8810066, + "learning_rate": 0.0007960559663929416, + "loss": 0.89190334, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.29663086, + "step": 1660, + "time_per_iteration": 2.6370737552642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096524, + "balance_loss_mlp": 1.06633985, + "epoch": 0.319545979222778, + "flos": 733954245120.0, + "grad_norm": 0.0532651376254825, + "language_loss": 0.87495023, + "learning_rate": 0.0007958048507837259, + "loss": 0.88591546, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.30151367, + "step": 1661, + "time_per_iteration": 2.942779779434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093392, + "balance_loss_mlp": 1.06316066, + "epoch": 0.31973836090804153, + "flos": 764136433152.0, + "grad_norm": 0.07710421129836972, + "language_loss": 0.87092876, + "learning_rate": 0.0007955536203328822, + "loss": 0.8818627, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.30175781, + "step": 1662, + "time_per_iteration": 2.8991520404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100595, + "balance_loss_mlp": 1.07072091, + "epoch": 0.3199307425933051, + "flos": 560252560896.0, + "grad_norm": 0.05380031942726595, + "language_loss": 0.8344577, + "learning_rate": 0.0007953022751379469, + "loss": 0.84546363, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.2980957, + "step": 1663, + "time_per_iteration": 2.795117139816284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102553, + "balance_loss_mlp": 1.07239294, + "epoch": 0.3201231242785687, + "flos": 751019751936.0, + "grad_norm": 0.0657811186180598, + "language_loss": 0.81884921, + "learning_rate": 0.000795050815296501, + "loss": 0.82987475, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.30151367, + "step": 1664, + "time_per_iteration": 2.969935894012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099283, + "balance_loss_mlp": 1.06890798, + "epoch": 0.32031550596383224, + "flos": 496157709312.0, + "grad_norm": 0.058736361347452894, + "language_loss": 0.93026185, + "learning_rate": 0.0007947992409061695, + "loss": 0.94125462, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.30322266, + "step": 1665, + "time_per_iteration": 2.585144281387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092624, + "balance_loss_mlp": 1.06182027, + "epoch": 0.3205078876490958, + "flos": 731294562816.0, + "grad_norm": 0.05523611327933496, + "language_loss": 0.8654207, + "learning_rate": 0.0007945475520646226, + "loss": 0.87634689, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.30761719, + "step": 1666, + "time_per_iteration": 2.9349849224090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092223, + "balance_loss_mlp": 1.06249237, + "epoch": 0.32070026933435936, + "flos": 549177283584.0, + "grad_norm": 0.05521997897435197, + "language_loss": 0.84546125, + "learning_rate": 0.0007942957488695743, + "loss": 0.85638344, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.296875, + "step": 1667, + "time_per_iteration": 2.6538572311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083745, + "balance_loss_mlp": 1.0539664, + "epoch": 0.32089265101962294, + "flos": 744949536768.0, + "grad_norm": 0.05331163349230756, + "language_loss": 0.81038171, + "learning_rate": 0.0007940438314187833, + "loss": 0.82121915, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.29760742, + "step": 1668, + "time_per_iteration": 3.009927988052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108075, + "balance_loss_mlp": 1.05016077, + "epoch": 0.3210850327048865, + "flos": 493937395200.0, + "grad_norm": 0.06087879277496283, + "language_loss": 0.80221838, + "learning_rate": 0.0007937917998100529, + "loss": 0.81302583, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.30541992, + "step": 1669, + "time_per_iteration": 2.5703017711639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072786, + "balance_loss_mlp": 1.0426501, + "epoch": 0.32127741439015006, + "flos": 530383265280.0, + "grad_norm": 0.07064769089672658, + "language_loss": 0.78527176, + "learning_rate": 0.0007935396541412302, + "loss": 0.79599965, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.30102539, + "step": 1670, + "time_per_iteration": 2.625499725341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081422, + "balance_loss_mlp": 1.05099988, + "epoch": 0.3214697960754136, + "flos": 500948955648.0, + "grad_norm": 0.0720065018777928, + "language_loss": 0.8546167, + "learning_rate": 0.0007932873945102068, + "loss": 0.86543095, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.30395508, + "step": 1671, + "time_per_iteration": 2.6188762187957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074685, + "balance_loss_mlp": 1.05713737, + "epoch": 0.3216621777606772, + "flos": 1382579394048.0, + "grad_norm": 0.027722134190714592, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76836461, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.17578125, + "step": 1672, + "time_per_iteration": 4.9278037548065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081072, + "balance_loss_mlp": 1.05057812, + "epoch": 0.32185455944594077, + "flos": 571260999168.0, + "grad_norm": 0.053011814820585035, + "language_loss": 0.86121267, + "learning_rate": 0.0007927825337533461, + "loss": 0.87202334, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.3046875, + "step": 1673, + "time_per_iteration": 2.6787123680114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075926, + "balance_loss_mlp": 1.0452652, + "epoch": 0.3220469411312043, + "flos": 543652715520.0, + "grad_norm": 0.06681709765508774, + "language_loss": 0.84770656, + "learning_rate": 0.0007925299328235131, + "loss": 0.85846579, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.30615234, + "step": 1674, + "time_per_iteration": 2.638434410095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080022, + "balance_loss_mlp": 1.04890847, + "epoch": 0.3222393228164679, + "flos": 490884834816.0, + "grad_norm": 0.06949369164102485, + "language_loss": 0.84795958, + "learning_rate": 0.000792277218323488, + "loss": 0.85875976, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.31103516, + "step": 1675, + "time_per_iteration": 2.5852880477905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077792, + "balance_loss_mlp": 1.04653537, + "epoch": 0.3224317045017314, + "flos": 490145720832.0, + "grad_norm": 0.06490362841252771, + "language_loss": 0.84737194, + "learning_rate": 0.0007920243903513833, + "loss": 0.85814989, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.31225586, + "step": 1676, + "time_per_iteration": 2.558058261871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083649, + "balance_loss_mlp": 1.0523684, + "epoch": 0.322624086186995, + "flos": 575505188352.0, + "grad_norm": 0.0667244817356676, + "language_loss": 0.83645618, + "learning_rate": 0.0007917714490053556, + "loss": 0.84729266, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.3125, + "step": 1677, + "time_per_iteration": 2.6619315147399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082413, + "balance_loss_mlp": 1.05046487, + "epoch": 0.32281646787225854, + "flos": 628974305280.0, + "grad_norm": 0.05833648566333407, + "language_loss": 0.85744321, + "learning_rate": 0.0007915183943836055, + "loss": 0.8682673, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.31933594, + "step": 1678, + "time_per_iteration": 2.8658525943756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079268, + "balance_loss_mlp": 1.04729617, + "epoch": 0.3230088495575221, + "flos": 781036024320.0, + "grad_norm": 0.06725353636254193, + "language_loss": 0.84315777, + "learning_rate": 0.0007912652265843773, + "loss": 0.8539505, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.31958008, + "step": 1679, + "time_per_iteration": 3.0343000888824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108303, + "balance_loss_mlp": 1.05019951, + "epoch": 0.3232012312427857, + "flos": 535839432192.0, + "grad_norm": 0.062193961969532426, + "language_loss": 0.81564045, + "learning_rate": 0.0007910119457059597, + "loss": 0.82647079, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.32836914, + "step": 1680, + "time_per_iteration": 2.6963257789611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085423, + "balance_loss_mlp": 1.05333161, + "epoch": 0.32339361292804925, + "flos": 704515553280.0, + "grad_norm": 0.0682304205879652, + "language_loss": 0.80304003, + "learning_rate": 0.0007907585518466849, + "loss": 0.81389421, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.32080078, + "step": 1681, + "time_per_iteration": 2.969540596008301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081665, + "balance_loss_mlp": 1.05026531, + "epoch": 0.32358599461331283, + "flos": 452099377152.0, + "grad_norm": 0.06175447283803796, + "language_loss": 0.89361274, + "learning_rate": 0.000790505045104929, + "loss": 0.90442938, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.3137207, + "step": 1682, + "time_per_iteration": 2.5148813724517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082889, + "balance_loss_mlp": 1.05108356, + "epoch": 0.32377837629857636, + "flos": 600597794304.0, + "grad_norm": 0.061424377243362256, + "language_loss": 0.87097234, + "learning_rate": 0.0007902514255791125, + "loss": 0.88180125, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.31787109, + "step": 1683, + "time_per_iteration": 2.7773754596710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078151, + "balance_loss_mlp": 1.04696608, + "epoch": 0.32397075798383995, + "flos": 807180636672.0, + "grad_norm": 0.06766194852988328, + "language_loss": 0.87911332, + "learning_rate": 0.0007899976933676986, + "loss": 0.88989484, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.31176758, + "step": 1684, + "time_per_iteration": 2.9700520038604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078008, + "balance_loss_mlp": 1.04589295, + "epoch": 0.3241631396691035, + "flos": 601414073856.0, + "grad_norm": 0.061649412189834635, + "language_loss": 0.87300712, + "learning_rate": 0.0007897438485691955, + "loss": 0.88378721, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.32104492, + "step": 1685, + "time_per_iteration": 2.6798696517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077233, + "balance_loss_mlp": 1.04483223, + "epoch": 0.32435552135436707, + "flos": 473980861440.0, + "grad_norm": 0.06379930216662907, + "language_loss": 0.823452, + "learning_rate": 0.0007894898912821542, + "loss": 0.83422434, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.32397461, + "step": 1686, + "time_per_iteration": 2.5478906631469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071757, + "balance_loss_mlp": 1.03978539, + "epoch": 0.3245479030396306, + "flos": 537824019456.0, + "grad_norm": 0.05321818652056826, + "language_loss": 0.86522776, + "learning_rate": 0.0007892358216051695, + "loss": 0.87594533, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.31958008, + "step": 1687, + "time_per_iteration": 2.735633134841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075777, + "balance_loss_mlp": 1.04251742, + "epoch": 0.3247402847248942, + "flos": 547394927616.0, + "grad_norm": 0.0608133700269358, + "language_loss": 0.91922832, + "learning_rate": 0.0007889816396368803, + "loss": 0.92998612, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.33276367, + "step": 1688, + "time_per_iteration": 2.6234939098358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077878, + "balance_loss_mlp": 1.04497576, + "epoch": 0.3249326664101578, + "flos": 377941814784.0, + "grad_norm": 0.0630363811740232, + "language_loss": 0.85370868, + "learning_rate": 0.0007887273454759687, + "loss": 0.86448747, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.32910156, + "step": 1689, + "time_per_iteration": 2.4698379039764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074583, + "balance_loss_mlp": 1.04184794, + "epoch": 0.3251250480954213, + "flos": 527818125312.0, + "grad_norm": 0.06604183912716106, + "language_loss": 0.82445431, + "learning_rate": 0.0007884729392211603, + "loss": 0.83520007, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.32739258, + "step": 1690, + "time_per_iteration": 2.6488864421844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081602, + "balance_loss_mlp": 1.04920113, + "epoch": 0.3253174297806849, + "flos": 449435312640.0, + "grad_norm": 0.06849578130600678, + "language_loss": 0.85280114, + "learning_rate": 0.0007882184209712245, + "loss": 0.86361718, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.32397461, + "step": 1691, + "time_per_iteration": 2.5209100246429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080531, + "balance_loss_mlp": 1.04874992, + "epoch": 0.32550981146594843, + "flos": 703855014912.0, + "grad_norm": 0.06225581397596747, + "language_loss": 0.8573736, + "learning_rate": 0.000787963790824974, + "loss": 0.8681789, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.31762695, + "step": 1692, + "time_per_iteration": 2.9696617126464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092322, + "balance_loss_mlp": 1.06054115, + "epoch": 0.325702193151212, + "flos": 392491643904.0, + "grad_norm": 0.0857009989212748, + "language_loss": 0.89660913, + "learning_rate": 0.0007877090488812651, + "loss": 0.90753233, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.31762695, + "step": 1693, + "time_per_iteration": 2.431861639022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086739, + "balance_loss_mlp": 1.05553031, + "epoch": 0.32589457483647555, + "flos": 577223525376.0, + "grad_norm": 0.07076453254267401, + "language_loss": 0.8368417, + "learning_rate": 0.0007874541952389973, + "loss": 0.84770912, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.31176758, + "step": 1694, + "time_per_iteration": 2.647468328475952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084594, + "balance_loss_mlp": 1.05293202, + "epoch": 0.32608695652173914, + "flos": 498092834304.0, + "grad_norm": 0.060562687008333366, + "language_loss": 0.86582285, + "learning_rate": 0.0007871992299971136, + "loss": 0.87666881, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.31640625, + "step": 1695, + "time_per_iteration": 2.553171396255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092239, + "balance_loss_mlp": 1.0608871, + "epoch": 0.32627933820700267, + "flos": 590858150400.0, + "grad_norm": 0.05969457295977618, + "language_loss": 0.84301764, + "learning_rate": 0.0007869441532546001, + "loss": 0.85394001, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.31323242, + "step": 1696, + "time_per_iteration": 2.752049446105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093733, + "balance_loss_mlp": 1.06247652, + "epoch": 0.32647171989226625, + "flos": 608790809088.0, + "grad_norm": 0.05927141137383595, + "language_loss": 0.79686946, + "learning_rate": 0.0007866889651104867, + "loss": 0.80780673, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.31225586, + "step": 1697, + "time_per_iteration": 2.7691686153411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109533, + "balance_loss_mlp": 1.06388259, + "epoch": 0.32666410157752984, + "flos": 476896619520.0, + "grad_norm": 0.0715366482234757, + "language_loss": 0.83218181, + "learning_rate": 0.000786433665663846, + "loss": 0.84313512, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.31420898, + "step": 1698, + "time_per_iteration": 2.717372179031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098821, + "balance_loss_mlp": 1.06816053, + "epoch": 0.3268564832627934, + "flos": 718060018176.0, + "grad_norm": 0.05645489658390659, + "language_loss": 0.86431837, + "learning_rate": 0.0007861782550137942, + "loss": 0.87530661, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.30615234, + "step": 1699, + "time_per_iteration": 2.9035465717315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103768, + "balance_loss_mlp": 1.07394195, + "epoch": 0.32704886494805696, + "flos": 768469372416.0, + "grad_norm": 0.11170286971508382, + "language_loss": 0.85853553, + "learning_rate": 0.0007859227332594901, + "loss": 0.86957312, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.29785156, + "step": 1700, + "time_per_iteration": 2.9302797317504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093978, + "balance_loss_mlp": 1.06508183, + "epoch": 0.3272412466333205, + "flos": 849540980736.0, + "grad_norm": 0.07200471053268022, + "language_loss": 0.84801477, + "learning_rate": 0.0007856671005001365, + "loss": 0.85895455, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.28881836, + "step": 1701, + "time_per_iteration": 3.1760013103485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090985, + "balance_loss_mlp": 1.06225514, + "epoch": 0.3274336283185841, + "flos": 831224208384.0, + "grad_norm": 0.07453437515979243, + "language_loss": 0.81870627, + "learning_rate": 0.0007854113568349787, + "loss": 0.82961613, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.28686523, + "step": 1702, + "time_per_iteration": 3.1038365364074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087868, + "balance_loss_mlp": 1.05770779, + "epoch": 0.3276260100038476, + "flos": 691721938944.0, + "grad_norm": 0.07528598974040544, + "language_loss": 0.80317354, + "learning_rate": 0.0007851555023633052, + "loss": 0.81405228, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.30102539, + "step": 1703, + "time_per_iteration": 2.847515106201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085228, + "balance_loss_mlp": 1.0558784, + "epoch": 0.3278183916891112, + "flos": 435831211008.0, + "grad_norm": 0.08040178147570827, + "language_loss": 0.82301831, + "learning_rate": 0.0007848995371844474, + "loss": 0.83387053, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.29296875, + "step": 1704, + "time_per_iteration": 2.5442426204681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098029, + "balance_loss_mlp": 1.06872725, + "epoch": 0.3280107733743748, + "flos": 460883119104.0, + "grad_norm": 0.06101842979524802, + "language_loss": 0.80441558, + "learning_rate": 0.0007846434613977801, + "loss": 0.81539583, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.29296875, + "step": 1705, + "time_per_iteration": 2.5023465156555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091561, + "balance_loss_mlp": 1.06242633, + "epoch": 0.3282031550596383, + "flos": 679018484736.0, + "grad_norm": 0.07007502801083235, + "language_loss": 0.78621399, + "learning_rate": 0.0007843872751027203, + "loss": 0.79712963, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.29125977, + "step": 1706, + "time_per_iteration": 2.790001392364502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094895, + "balance_loss_mlp": 1.06549811, + "epoch": 0.3283955367449019, + "flos": 544821023232.0, + "grad_norm": 0.05836443006497643, + "language_loss": 0.87259293, + "learning_rate": 0.0007841309783987287, + "loss": 0.88354194, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.29345703, + "step": 1707, + "time_per_iteration": 2.7478153705596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097713, + "balance_loss_mlp": 1.0684588, + "epoch": 0.32858791843016544, + "flos": 481017153024.0, + "grad_norm": 0.05888352709782848, + "language_loss": 0.89055538, + "learning_rate": 0.0007838745713853084, + "loss": 0.90153247, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.29199219, + "step": 1708, + "time_per_iteration": 2.588653802871704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088275, + "balance_loss_mlp": 1.05925906, + "epoch": 0.328780300115429, + "flos": 566529389568.0, + "grad_norm": 0.06397878577513526, + "language_loss": 0.8386358, + "learning_rate": 0.0007836180541620053, + "loss": 0.8495186, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.29003906, + "step": 1709, + "time_per_iteration": 2.7023067474365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091144, + "balance_loss_mlp": 1.06191421, + "epoch": 0.32897268180069256, + "flos": 475787948544.0, + "grad_norm": 0.05521592697878337, + "language_loss": 0.86435962, + "learning_rate": 0.0007833614268284082, + "loss": 0.87527102, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.29199219, + "step": 1710, + "time_per_iteration": 2.538080930709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090653, + "balance_loss_mlp": 1.0721513, + "epoch": 0.32916506348595614, + "flos": 1576517008896.0, + "grad_norm": 0.029520146980468998, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75200427, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.18457031, + "step": 1711, + "time_per_iteration": 4.909448862075806 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089192, + "balance_loss_mlp": 1.05965161, + "epoch": 0.3293574451712197, + "flos": 482646739968.0, + "grad_norm": 0.07803051984240059, + "language_loss": 0.78501904, + "learning_rate": 0.0007828478422289016, + "loss": 0.79591095, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.29492188, + "step": 1712, + "time_per_iteration": 2.5883195400238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092173, + "balance_loss_mlp": 1.06210816, + "epoch": 0.32954982685648326, + "flos": 622266872832.0, + "grad_norm": 0.05953292046858541, + "language_loss": 0.88987601, + "learning_rate": 0.0007825908851623833, + "loss": 0.90079772, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.30004883, + "step": 1713, + "time_per_iteration": 2.7441718578338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089127, + "balance_loss_mlp": 1.05973005, + "epoch": 0.32974220854174685, + "flos": 544697367552.0, + "grad_norm": 0.06609176393308323, + "language_loss": 0.8478905, + "learning_rate": 0.0007823338183843533, + "loss": 0.85878181, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.29394531, + "step": 1714, + "time_per_iteration": 2.6771602630615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092243, + "balance_loss_mlp": 1.06291747, + "epoch": 0.3299345902270104, + "flos": 981740708352.0, + "grad_norm": 0.10875146541446083, + "language_loss": 0.80569458, + "learning_rate": 0.0007820766419946141, + "loss": 0.81661701, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.29321289, + "step": 1715, + "time_per_iteration": 3.3068225383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108798, + "balance_loss_mlp": 1.07052732, + "epoch": 0.33012697191227397, + "flos": 1402857432576.0, + "grad_norm": 0.03503617860008252, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80760461, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.17480469, + "step": 1716, + "time_per_iteration": 5.048320293426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091106, + "balance_loss_mlp": 1.06201911, + "epoch": 0.3303193535975375, + "flos": 504897781248.0, + "grad_norm": 0.06576145610663801, + "language_loss": 0.76379126, + "learning_rate": 0.0007815619607794288, + "loss": 0.77470231, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.29052734, + "step": 1717, + "time_per_iteration": 2.6151187419891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094733, + "balance_loss_mlp": 1.06440604, + "epoch": 0.3305117352828011, + "flos": 937602390528.0, + "grad_norm": 0.08930544150493325, + "language_loss": 0.82491159, + "learning_rate": 0.0007813044561538001, + "loss": 0.835859, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.30273438, + "step": 1718, + "time_per_iteration": 3.1329195499420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089209, + "balance_loss_mlp": 1.05928707, + "epoch": 0.3307041169680646, + "flos": 721176597504.0, + "grad_norm": 0.06440748712139703, + "language_loss": 0.88832355, + "learning_rate": 0.0007810468423160958, + "loss": 0.8992157, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.29882812, + "step": 1719, + "time_per_iteration": 2.8785343170166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091515, + "balance_loss_mlp": 1.06195092, + "epoch": 0.3308964986533282, + "flos": 583315499520.0, + "grad_norm": 0.05842798757545397, + "language_loss": 0.81825691, + "learning_rate": 0.0007807891193663306, + "loss": 0.82917207, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.29492188, + "step": 1720, + "time_per_iteration": 2.775949478149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108911, + "balance_loss_mlp": 1.05956948, + "epoch": 0.33108888033859174, + "flos": 473340672000.0, + "grad_norm": 0.1056737351826848, + "language_loss": 0.82154363, + "learning_rate": 0.0007805312874045614, + "loss": 0.83243477, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.29516602, + "step": 1721, + "time_per_iteration": 2.528573513031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089843, + "balance_loss_mlp": 1.06054103, + "epoch": 0.3312812620238553, + "flos": 385913659392.0, + "grad_norm": 0.06879892565652022, + "language_loss": 0.86894739, + "learning_rate": 0.0007802733465308874, + "loss": 0.87984586, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.29272461, + "step": 1722, + "time_per_iteration": 2.4575133323669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087083, + "balance_loss_mlp": 1.05811512, + "epoch": 0.3314736437091189, + "flos": 494292395520.0, + "grad_norm": 0.06801648197756033, + "language_loss": 0.84311831, + "learning_rate": 0.0007800152968454501, + "loss": 0.85398912, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.28930664, + "step": 1723, + "time_per_iteration": 2.729114294052124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091782, + "balance_loss_mlp": 1.06300533, + "epoch": 0.33166602539438245, + "flos": 653346736128.0, + "grad_norm": 0.049597969001903774, + "language_loss": 0.90648681, + "learning_rate": 0.0007797571384484334, + "loss": 0.91740465, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.28759766, + "step": 1724, + "time_per_iteration": 2.8813512325286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084172, + "balance_loss_mlp": 1.05463219, + "epoch": 0.33185840707964603, + "flos": 520550489088.0, + "grad_norm": 0.060917196813517045, + "language_loss": 0.91917408, + "learning_rate": 0.0007794988714400633, + "loss": 0.9300158, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.29516602, + "step": 1725, + "time_per_iteration": 2.6094837188720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088265, + "balance_loss_mlp": 1.05896294, + "epoch": 0.33205078876490957, + "flos": 436712919552.0, + "grad_norm": 0.06883363868640566, + "language_loss": 0.85331756, + "learning_rate": 0.0007792404959206079, + "loss": 0.86420023, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.29272461, + "step": 1726, + "time_per_iteration": 2.4982993602752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083027, + "balance_loss_mlp": 1.05396366, + "epoch": 0.33224317045017315, + "flos": 768400971264.0, + "grad_norm": 0.0595205364190525, + "language_loss": 0.81498575, + "learning_rate": 0.0007789820119903774, + "loss": 0.82581604, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.29052734, + "step": 1727, + "time_per_iteration": 2.9797775745391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059791, + "balance_loss_mlp": 1.04043114, + "epoch": 0.3324355521354367, + "flos": 1465656090624.0, + "grad_norm": 0.028746370774938412, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79552454, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.19335938, + "step": 1728, + "time_per_iteration": 4.892562627792358 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090227, + "balance_loss_mlp": 1.05982828, + "epoch": 0.3326279338207003, + "flos": 496415195136.0, + "grad_norm": 0.10868743625457102, + "language_loss": 0.83712173, + "learning_rate": 0.0007784647192990428, + "loss": 0.84802401, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.3034668, + "step": 1729, + "time_per_iteration": 2.721163749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093021, + "balance_loss_mlp": 1.06283677, + "epoch": 0.33282031550596386, + "flos": 635600342016.0, + "grad_norm": 0.06834187729314575, + "language_loss": 0.80591226, + "learning_rate": 0.0007782059107387696, + "loss": 0.81684244, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.30151367, + "step": 1730, + "time_per_iteration": 2.8358583450317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097893, + "balance_loss_mlp": 1.06768548, + "epoch": 0.3330126971912274, + "flos": 689210643456.0, + "grad_norm": 0.06518025115488765, + "language_loss": 0.88646144, + "learning_rate": 0.0007779469941693826, + "loss": 0.89744031, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.30175781, + "step": 1731, + "time_per_iteration": 2.8069489002227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105874, + "balance_loss_mlp": 1.0744741, + "epoch": 0.333205078876491, + "flos": 566184563712.0, + "grad_norm": 0.0738487456517703, + "language_loss": 0.76712036, + "learning_rate": 0.0007776879696914029, + "loss": 0.77817911, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.3137207, + "step": 1732, + "time_per_iteration": 2.8068690299987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116621, + "balance_loss_mlp": 1.08479202, + "epoch": 0.3333974605617545, + "flos": 640618550784.0, + "grad_norm": 0.06155067702851775, + "language_loss": 0.88390094, + "learning_rate": 0.000777428837405392, + "loss": 0.89506716, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.31811523, + "step": 1733, + "time_per_iteration": 2.8412673473358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107208, + "balance_loss_mlp": 1.07530773, + "epoch": 0.3335898422470181, + "flos": 461597501952.0, + "grad_norm": 0.0682339524169846, + "language_loss": 0.86804128, + "learning_rate": 0.0007771695974119544, + "loss": 0.87911332, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.31884766, + "step": 1734, + "time_per_iteration": 2.512354612350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103901, + "balance_loss_mlp": 1.07159579, + "epoch": 0.33378222393228163, + "flos": 852504791040.0, + "grad_norm": 0.0845052703087739, + "language_loss": 0.75201118, + "learning_rate": 0.0007769102498117359, + "loss": 0.7630502, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.32299805, + "step": 1735, + "time_per_iteration": 3.107100248336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090989, + "balance_loss_mlp": 1.05777764, + "epoch": 0.3339746056175452, + "flos": 954256080384.0, + "grad_norm": 0.061332510780765306, + "language_loss": 0.79977, + "learning_rate": 0.000776650794705424, + "loss": 0.81067985, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.33227539, + "step": 1736, + "time_per_iteration": 3.259875535964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092848, + "balance_loss_mlp": 1.06116199, + "epoch": 0.33416698730280875, + "flos": 544559155200.0, + "grad_norm": 0.05236613872795896, + "language_loss": 0.82229674, + "learning_rate": 0.0007763912321937483, + "loss": 0.83322519, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.31665039, + "step": 1737, + "time_per_iteration": 2.704059600830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088373, + "balance_loss_mlp": 1.05506587, + "epoch": 0.33435936898807234, + "flos": 1013652817920.0, + "grad_norm": 0.07890071498287932, + "language_loss": 0.82297349, + "learning_rate": 0.0007761315623774799, + "loss": 0.83385718, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.33325195, + "step": 1738, + "time_per_iteration": 3.399148464202881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089424, + "balance_loss_mlp": 1.0574522, + "epoch": 0.3345517506733359, + "flos": 614935217664.0, + "grad_norm": 0.09967891290955513, + "language_loss": 0.87632757, + "learning_rate": 0.0007758717853574313, + "loss": 0.88722181, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.31958008, + "step": 1739, + "time_per_iteration": 2.772089958190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103829, + "balance_loss_mlp": 1.0729773, + "epoch": 0.33474413235859946, + "flos": 494350622208.0, + "grad_norm": 0.06672668023604937, + "language_loss": 0.90074134, + "learning_rate": 0.0007756119012344571, + "loss": 0.91177964, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.30810547, + "step": 1740, + "time_per_iteration": 2.5482232570648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108279, + "balance_loss_mlp": 1.07707, + "epoch": 0.33493651404386304, + "flos": 628105743360.0, + "grad_norm": 0.07840140242610649, + "language_loss": 0.84438574, + "learning_rate": 0.0007753519101094535, + "loss": 0.85546857, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.31176758, + "step": 1741, + "time_per_iteration": 2.749004602432251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102131, + "balance_loss_mlp": 1.07173228, + "epoch": 0.3351288957291266, + "flos": 513474909696.0, + "grad_norm": 0.07002932741488781, + "language_loss": 0.86241812, + "learning_rate": 0.0007750918120833575, + "loss": 0.87343943, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.3034668, + "step": 1742, + "time_per_iteration": 2.600731611251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110577, + "balance_loss_mlp": 1.0753479, + "epoch": 0.33532127741439016, + "flos": 647008860672.0, + "grad_norm": 0.07258867640739639, + "language_loss": 0.87368989, + "learning_rate": 0.0007748316072571485, + "loss": 0.88474762, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.30395508, + "step": 1743, + "time_per_iteration": 2.7698371410369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109732, + "balance_loss_mlp": 1.07902408, + "epoch": 0.3355136590996537, + "flos": 768134721024.0, + "grad_norm": 0.05763877458348602, + "language_loss": 0.79041934, + "learning_rate": 0.0007745712957318467, + "loss": 0.80151671, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.30664062, + "step": 1744, + "time_per_iteration": 2.967310667037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104284, + "balance_loss_mlp": 1.07412386, + "epoch": 0.3357060407849173, + "flos": 595259490816.0, + "grad_norm": 0.052786515694630796, + "language_loss": 0.86410165, + "learning_rate": 0.0007743108776085141, + "loss": 0.87514448, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.30102539, + "step": 1745, + "time_per_iteration": 2.771803855895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102511, + "balance_loss_mlp": 1.07049131, + "epoch": 0.3358984224701808, + "flos": 598288730112.0, + "grad_norm": 0.06089020802257528, + "language_loss": 0.82798052, + "learning_rate": 0.0007740503529882543, + "loss": 0.83900565, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.32006836, + "step": 1746, + "time_per_iteration": 2.805392026901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095402, + "balance_loss_mlp": 1.064551, + "epoch": 0.3360908041554444, + "flos": 578055771648.0, + "grad_norm": 0.0569869068698716, + "language_loss": 0.90718448, + "learning_rate": 0.0007737897219722114, + "loss": 0.9181385, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.30810547, + "step": 1747, + "time_per_iteration": 2.699065923690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090863, + "balance_loss_mlp": 1.05970204, + "epoch": 0.336283185840708, + "flos": 513332315136.0, + "grad_norm": 0.07943976371979472, + "language_loss": 0.80688596, + "learning_rate": 0.0007735289846615716, + "loss": 0.81779456, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.31152344, + "step": 1748, + "time_per_iteration": 2.6637260913848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094297, + "balance_loss_mlp": 1.06356478, + "epoch": 0.3364755675259715, + "flos": 524716102656.0, + "grad_norm": 0.06884386609789231, + "language_loss": 0.81979561, + "learning_rate": 0.0007732681411575621, + "loss": 0.83073854, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.30712891, + "step": 1749, + "time_per_iteration": 2.673060417175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087083, + "balance_loss_mlp": 1.0555166, + "epoch": 0.3366679492112351, + "flos": 554594162688.0, + "grad_norm": 0.052237930998467595, + "language_loss": 0.87234819, + "learning_rate": 0.0007730071915614514, + "loss": 0.88321906, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.31542969, + "step": 1750, + "time_per_iteration": 2.707857370376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089836, + "balance_loss_mlp": 1.05896115, + "epoch": 0.33686033089649864, + "flos": 427051851264.0, + "grad_norm": 0.08336153438972979, + "language_loss": 0.88963622, + "learning_rate": 0.0007727461359745489, + "loss": 0.90053463, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.30859375, + "step": 1751, + "time_per_iteration": 2.482837438583374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093668, + "balance_loss_mlp": 1.06307864, + "epoch": 0.3370527125817622, + "flos": 541452750336.0, + "grad_norm": 0.05330176149069141, + "language_loss": 0.86016554, + "learning_rate": 0.0007724849744982056, + "loss": 0.87110221, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.30541992, + "step": 1752, + "time_per_iteration": 2.690420389175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097033, + "balance_loss_mlp": 1.06668198, + "epoch": 0.33724509426702576, + "flos": 541836864000.0, + "grad_norm": 0.0643678921459399, + "language_loss": 0.81981385, + "learning_rate": 0.0007722237072338131, + "loss": 0.8307842, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.30322266, + "step": 1753, + "time_per_iteration": 2.7154347896575928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097395, + "balance_loss_mlp": 1.06694901, + "epoch": 0.33743747595228935, + "flos": 472557888000.0, + "grad_norm": 0.07107791288081117, + "language_loss": 0.85213387, + "learning_rate": 0.0007719623342828046, + "loss": 0.8631078, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.30419922, + "step": 1754, + "time_per_iteration": 2.5009355545043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109586, + "balance_loss_mlp": 1.06426978, + "epoch": 0.33762985763755293, + "flos": 469564964352.0, + "grad_norm": 0.06326183968549627, + "language_loss": 0.84134084, + "learning_rate": 0.000771700855746654, + "loss": 0.85229945, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.31567383, + "step": 1755, + "time_per_iteration": 2.63323712348938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082281, + "balance_loss_mlp": 1.05071473, + "epoch": 0.33782223932281646, + "flos": 492002270208.0, + "grad_norm": 0.06130822269954804, + "language_loss": 0.88395244, + "learning_rate": 0.0007714392717268763, + "loss": 0.89477527, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.31542969, + "step": 1756, + "time_per_iteration": 2.6147336959838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083496, + "balance_loss_mlp": 1.05219221, + "epoch": 0.33801462100808005, + "flos": 464827562496.0, + "grad_norm": 0.05731341996908033, + "language_loss": 0.86388242, + "learning_rate": 0.0007711775823250273, + "loss": 0.87471741, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.31298828, + "step": 1757, + "time_per_iteration": 2.5304934978485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085861, + "balance_loss_mlp": 1.05455685, + "epoch": 0.3382070026933436, + "flos": 795319603200.0, + "grad_norm": 0.061357664780502266, + "language_loss": 0.83481395, + "learning_rate": 0.0007709157876427039, + "loss": 0.84567261, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.31274414, + "step": 1758, + "time_per_iteration": 3.1116981506347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074204, + "balance_loss_mlp": 1.04189849, + "epoch": 0.33839938437860717, + "flos": 508181686272.0, + "grad_norm": 0.0592835704233285, + "language_loss": 0.85574573, + "learning_rate": 0.0007706538877815439, + "loss": 0.86648774, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.32299805, + "step": 1759, + "time_per_iteration": 2.635298728942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077747, + "balance_loss_mlp": 1.04730105, + "epoch": 0.3385917660638707, + "flos": 483986755584.0, + "grad_norm": 0.04672826561746397, + "language_loss": 0.83449262, + "learning_rate": 0.0007703918828432259, + "loss": 0.84527004, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.30419922, + "step": 1760, + "time_per_iteration": 2.664783477783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071091, + "balance_loss_mlp": 1.04023945, + "epoch": 0.3387841477491343, + "flos": 545071306752.0, + "grad_norm": 0.061026274734732225, + "language_loss": 0.88914752, + "learning_rate": 0.000770129772929469, + "loss": 0.89985847, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.30810547, + "step": 1761, + "time_per_iteration": 2.7082738876342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070304, + "balance_loss_mlp": 1.03914273, + "epoch": 0.3389765294343978, + "flos": 719487373824.0, + "grad_norm": 0.058866792995701266, + "language_loss": 0.88234216, + "learning_rate": 0.0007698675581420334, + "loss": 0.89304519, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.3112793, + "step": 1762, + "time_per_iteration": 2.9119746685028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107099, + "balance_loss_mlp": 1.03966177, + "epoch": 0.3391689111196614, + "flos": 699596269056.0, + "grad_norm": 0.06738514708484569, + "language_loss": 0.78819811, + "learning_rate": 0.0007696052385827199, + "loss": 0.79890805, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.31298828, + "step": 1763, + "time_per_iteration": 2.9451980590820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107403, + "balance_loss_mlp": 1.04172421, + "epoch": 0.339361292804925, + "flos": 626806425600.0, + "grad_norm": 0.0719800357998311, + "language_loss": 0.78192145, + "learning_rate": 0.00076934281435337, + "loss": 0.79266179, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.32299805, + "step": 1764, + "time_per_iteration": 2.8267600536346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071904, + "balance_loss_mlp": 1.03931201, + "epoch": 0.33955367449018853, + "flos": 609302960640.0, + "grad_norm": 0.06414673033674093, + "language_loss": 0.85701221, + "learning_rate": 0.0007690802855558658, + "loss": 0.86773127, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.32592773, + "step": 1765, + "time_per_iteration": 2.8825321197509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060485, + "balance_loss_mlp": 1.04322386, + "epoch": 0.3397460561754521, + "flos": 1452494177280.0, + "grad_norm": 0.027152559638010845, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.7743544, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.17285156, + "step": 1766, + "time_per_iteration": 4.890359401702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078911, + "balance_loss_mlp": 1.04684353, + "epoch": 0.33993843786071565, + "flos": 487068429312.0, + "grad_norm": 0.06170687350837257, + "language_loss": 0.89089799, + "learning_rate": 0.0007685549146641262, + "loss": 0.90168703, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.32055664, + "step": 1767, + "time_per_iteration": 2.539238691329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077091, + "balance_loss_mlp": 1.04557216, + "epoch": 0.34013081954597923, + "flos": 417115768320.0, + "grad_norm": 0.05571629344022593, + "language_loss": 0.8822673, + "learning_rate": 0.0007682920727738579, + "loss": 0.89303821, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.31494141, + "step": 1768, + "time_per_iteration": 2.512801170349121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080619, + "balance_loss_mlp": 1.04931498, + "epoch": 0.34032320123124277, + "flos": 437293472256.0, + "grad_norm": 0.06175400371418068, + "language_loss": 0.8474735, + "learning_rate": 0.000768029126723369, + "loss": 0.85827971, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.31274414, + "step": 1769, + "time_per_iteration": 2.5238869190216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075433, + "balance_loss_mlp": 1.04515338, + "epoch": 0.34051558291650635, + "flos": 457353312768.0, + "grad_norm": 0.06596681609056877, + "language_loss": 0.81544566, + "learning_rate": 0.0007677660766147447, + "loss": 0.82620001, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.30224609, + "step": 1770, + "time_per_iteration": 2.5212690830230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037514, + "balance_loss_mlp": 1.02063394, + "epoch": 0.3407079646017699, + "flos": 1558029938688.0, + "grad_norm": 0.014856007486746849, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73508459, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.16894531, + "step": 1771, + "time_per_iteration": 4.967731475830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081508, + "balance_loss_mlp": 1.05113387, + "epoch": 0.3409003462870335, + "flos": 492312190464.0, + "grad_norm": 0.075322249241395, + "language_loss": 0.79792535, + "learning_rate": 0.0007672396646316306, + "loss": 0.8087405, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.30322266, + "step": 1772, + "time_per_iteration": 2.524365186691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084918, + "balance_loss_mlp": 1.05451918, + "epoch": 0.34109272797229706, + "flos": 808145303040.0, + "grad_norm": 0.05910937608565349, + "language_loss": 0.80291271, + "learning_rate": 0.000766976302961512, + "loss": 0.81376183, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.30371094, + "step": 1773, + "time_per_iteration": 3.002929925918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086798, + "balance_loss_mlp": 1.0563519, + "epoch": 0.3412851096575606, + "flos": 469903997952.0, + "grad_norm": 0.0625889066862488, + "language_loss": 0.81081951, + "learning_rate": 0.0007667128376420003, + "loss": 0.82168746, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.30395508, + "step": 1774, + "time_per_iteration": 2.5821964740753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083599, + "balance_loss_mlp": 1.05336761, + "epoch": 0.3414774913428242, + "flos": 595402085376.0, + "grad_norm": 0.06267075227744807, + "language_loss": 0.84329379, + "learning_rate": 0.0007664492687753817, + "loss": 0.85412979, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.30175781, + "step": 1775, + "time_per_iteration": 2.7457377910614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077211, + "balance_loss_mlp": 1.04769528, + "epoch": 0.3416698730280877, + "flos": 527202667008.0, + "grad_norm": 0.054581176728495925, + "language_loss": 0.81518859, + "learning_rate": 0.000766185596463983, + "loss": 0.8259607, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.29516602, + "step": 1776, + "time_per_iteration": 2.655543804168701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078997, + "balance_loss_mlp": 1.04993343, + "epoch": 0.3418622547133513, + "flos": 874272794112.0, + "grad_norm": 0.06969464274274284, + "language_loss": 0.76725864, + "learning_rate": 0.0007659218208101706, + "loss": 0.77804863, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.29003906, + "step": 1777, + "time_per_iteration": 3.1378567218780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093158, + "balance_loss_mlp": 1.06411862, + "epoch": 0.34205463639861483, + "flos": 603462680064.0, + "grad_norm": 0.0529989301900612, + "language_loss": 0.84699291, + "learning_rate": 0.0007656579419163515, + "loss": 0.85792446, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.29052734, + "step": 1778, + "time_per_iteration": 2.8120994567871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091459, + "balance_loss_mlp": 1.06239629, + "epoch": 0.3422470180838784, + "flos": 463547183616.0, + "grad_norm": 0.06282493199141514, + "language_loss": 0.76994503, + "learning_rate": 0.0007653939598849724, + "loss": 0.78085959, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.2902832, + "step": 1779, + "time_per_iteration": 2.5995492935180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087203, + "balance_loss_mlp": 1.07051396, + "epoch": 0.34243939976914195, + "flos": 1585584377856.0, + "grad_norm": 0.04507156484415478, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83967406, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.16699219, + "step": 1780, + "time_per_iteration": 4.9175097942352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102047, + "balance_loss_mlp": 1.07186341, + "epoch": 0.34263178145440554, + "flos": 872662146048.0, + "grad_norm": 0.05745476314946865, + "language_loss": 0.79740059, + "learning_rate": 0.000764865686819522, + "loss": 0.80842102, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.30151367, + "step": 1781, + "time_per_iteration": 3.1022064685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097852, + "balance_loss_mlp": 1.06907511, + "epoch": 0.3428241631396691, + "flos": 506630674944.0, + "grad_norm": 0.061017866945560745, + "language_loss": 0.85627258, + "learning_rate": 0.0007646013959905449, + "loss": 0.8672511, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.28759766, + "step": 1782, + "time_per_iteration": 2.625312566757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090603, + "balance_loss_mlp": 1.06030035, + "epoch": 0.34301654482493266, + "flos": 879669324288.0, + "grad_norm": 0.05493462983431466, + "language_loss": 0.80768538, + "learning_rate": 0.0007643370024341949, + "loss": 0.81859136, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.30249023, + "step": 1783, + "time_per_iteration": 3.1206953525543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092054, + "balance_loss_mlp": 1.06284761, + "epoch": 0.34320892651019624, + "flos": 431537559552.0, + "grad_norm": 0.04934338548004703, + "language_loss": 0.8289808, + "learning_rate": 0.0007640725062531195, + "loss": 0.83990133, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.29174805, + "step": 1784, + "time_per_iteration": 2.518277645111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092006, + "balance_loss_mlp": 1.06165504, + "epoch": 0.3434013081954598, + "flos": 463404589056.0, + "grad_norm": 0.061838155255473454, + "language_loss": 0.8616311, + "learning_rate": 0.0007638079075500047, + "loss": 0.8725512, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.30297852, + "step": 1785, + "time_per_iteration": 2.566340684890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056366, + "balance_loss_mlp": 1.04101145, + "epoch": 0.34359368988072336, + "flos": 1556499276288.0, + "grad_norm": 0.03141321768780463, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76237035, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.15332031, + "step": 1786, + "time_per_iteration": 4.984891891479492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081906, + "balance_loss_mlp": 1.05088782, + "epoch": 0.3437860715659869, + "flos": 495267236352.0, + "grad_norm": 0.0502662811310507, + "language_loss": 0.83153242, + "learning_rate": 0.0007632784029886026, + "loss": 0.84235144, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.30981445, + "step": 1787, + "time_per_iteration": 2.6574935913085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078963, + "balance_loss_mlp": 1.04832625, + "epoch": 0.3439784532512505, + "flos": 717942154752.0, + "grad_norm": 0.058652751735253, + "language_loss": 0.85391539, + "learning_rate": 0.0007630134973358873, + "loss": 0.86470503, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.3059082, + "step": 1788, + "time_per_iteration": 2.920311450958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088088, + "balance_loss_mlp": 1.05702209, + "epoch": 0.34417083493651407, + "flos": 565598218752.0, + "grad_norm": 0.05633660644162356, + "language_loss": 0.86888337, + "learning_rate": 0.0007627484895722763, + "loss": 0.87976426, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.31030273, + "step": 1789, + "time_per_iteration": 2.648061513900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082316, + "balance_loss_mlp": 1.05268025, + "epoch": 0.3443632166217776, + "flos": 795988905984.0, + "grad_norm": 0.08125120447961011, + "language_loss": 0.79987907, + "learning_rate": 0.0007624833798006552, + "loss": 0.8107022, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.29614258, + "step": 1790, + "time_per_iteration": 3.083303689956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082489, + "balance_loss_mlp": 1.05249596, + "epoch": 0.3445555983070412, + "flos": 569045067264.0, + "grad_norm": 0.06337905919609309, + "language_loss": 0.83924425, + "learning_rate": 0.0007622181681239483, + "loss": 0.85006905, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.29931641, + "step": 1791, + "time_per_iteration": 2.6613235473632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078031, + "balance_loss_mlp": 1.04677427, + "epoch": 0.3447479799923047, + "flos": 568524151296.0, + "grad_norm": 0.05139164694864183, + "language_loss": 0.84563744, + "learning_rate": 0.0007619528546451202, + "loss": 0.85641772, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.31225586, + "step": 1792, + "time_per_iteration": 2.7847092151641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082474, + "balance_loss_mlp": 1.05183685, + "epoch": 0.3449403616775683, + "flos": 967323299328.0, + "grad_norm": 0.060391852587241154, + "language_loss": 0.8357141, + "learning_rate": 0.0007616874394671745, + "loss": 0.84653878, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.3059082, + "step": 1793, + "time_per_iteration": 3.3427343368530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087726, + "balance_loss_mlp": 1.05632687, + "epoch": 0.34513274336283184, + "flos": 568340858880.0, + "grad_norm": 0.07229882199780847, + "language_loss": 0.85033429, + "learning_rate": 0.0007614219226931547, + "loss": 0.86121154, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.3137207, + "step": 1794, + "time_per_iteration": 2.6797611713409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090025, + "balance_loss_mlp": 1.05931664, + "epoch": 0.3453251250480954, + "flos": 460715793408.0, + "grad_norm": 0.057715322830613675, + "language_loss": 0.84206641, + "learning_rate": 0.0007611563044261435, + "loss": 0.85296667, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.30664062, + "step": 1795, + "time_per_iteration": 2.5259828567504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086711, + "balance_loss_mlp": 1.05543017, + "epoch": 0.34551750673335896, + "flos": 415397431296.0, + "grad_norm": 0.06328741897936851, + "language_loss": 0.86560625, + "learning_rate": 0.0007608905847692631, + "loss": 0.87647337, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.3125, + "step": 1796, + "time_per_iteration": 2.472182035446167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081946, + "balance_loss_mlp": 1.05014098, + "epoch": 0.34570988841862255, + "flos": 587540749824.0, + "grad_norm": 0.053847624873276365, + "language_loss": 0.86582637, + "learning_rate": 0.0007606247638256749, + "loss": 0.8766458, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.31787109, + "step": 1797, + "time_per_iteration": 2.842547655105591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147955, + "balance_loss_mlp": 1.13145602, + "epoch": 0.34590227010388613, + "flos": 1566835439616.0, + "grad_norm": 0.06482996241123744, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79318249, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.16503906, + "step": 1798, + "time_per_iteration": 4.918993949890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075567, + "balance_loss_mlp": 1.06011796, + "epoch": 0.34609465178914967, + "flos": 1536950177280.0, + "grad_norm": 0.04230684388330953, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80402768, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.15429688, + "step": 1799, + "time_per_iteration": 4.791706323623657 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077617, + "balance_loss_mlp": 1.04724216, + "epoch": 0.34628703347441325, + "flos": 609075998208.0, + "grad_norm": 0.06124115711212235, + "language_loss": 0.85762143, + "learning_rate": 0.0007598266943068686, + "loss": 0.86839759, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.30322266, + "step": 1800, + "time_per_iteration": 2.743213415145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083873, + "balance_loss_mlp": 1.05266404, + "epoch": 0.3464794151596768, + "flos": 473084596224.0, + "grad_norm": 0.13184352245004016, + "language_loss": 0.83900499, + "learning_rate": 0.0007595604692488507, + "loss": 0.84984374, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.31176758, + "step": 1801, + "time_per_iteration": 2.542065382003784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082571, + "balance_loss_mlp": 1.05105186, + "epoch": 0.34667179684494037, + "flos": 605397805056.0, + "grad_norm": 0.0617697315453188, + "language_loss": 0.82875979, + "learning_rate": 0.0007592941434205215, + "loss": 0.83958554, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.31494141, + "step": 1802, + "time_per_iteration": 2.803941488265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077417, + "balance_loss_mlp": 1.06292093, + "epoch": 0.3468641785302039, + "flos": 1564053511680.0, + "grad_norm": 0.03209988868756776, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74648476, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.14453125, + "step": 1803, + "time_per_iteration": 5.115894794464111 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073735, + "balance_loss_mlp": 1.04176331, + "epoch": 0.3470565602154675, + "flos": 906902258688.0, + "grad_norm": 0.057797440709038125, + "language_loss": 0.7980904, + "learning_rate": 0.0007587611898665566, + "loss": 0.80882776, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.31958008, + "step": 1804, + "time_per_iteration": 3.0783464908599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080437, + "balance_loss_mlp": 1.04958522, + "epoch": 0.347248941900731, + "flos": 638613614592.0, + "grad_norm": 0.052922401600576395, + "language_loss": 0.8228178, + "learning_rate": 0.0007584945623478315, + "loss": 0.83362216, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.30810547, + "step": 1805, + "time_per_iteration": 2.8341996669769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107388, + "balance_loss_mlp": 1.04178858, + "epoch": 0.3474413235859946, + "flos": 847009336320.0, + "grad_norm": 0.05986711270473425, + "language_loss": 0.81165981, + "learning_rate": 0.000758227834472617, + "loss": 0.82239866, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.32080078, + "step": 1806, + "time_per_iteration": 3.0486085414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082567, + "balance_loss_mlp": 1.04971278, + "epoch": 0.3476337052712582, + "flos": 515395478016.0, + "grad_norm": 0.06433807190471491, + "language_loss": 0.77163357, + "learning_rate": 0.0007579610063444664, + "loss": 0.78245926, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.32861328, + "step": 1807, + "time_per_iteration": 2.7597365379333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073013, + "balance_loss_mlp": 1.04068375, + "epoch": 0.34782608695652173, + "flos": 913161558528.0, + "grad_norm": 0.06573509148212295, + "language_loss": 0.8740322, + "learning_rate": 0.0007576940780669712, + "loss": 0.88476229, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.32324219, + "step": 1808, + "time_per_iteration": 3.2193737030029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073839, + "balance_loss_mlp": 1.04060304, + "epoch": 0.3480184686417853, + "flos": 773374099968.0, + "grad_norm": 0.07068655640298144, + "language_loss": 0.84018815, + "learning_rate": 0.0007574270497437624, + "loss": 0.85092652, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.33251953, + "step": 1809, + "time_per_iteration": 2.958071708679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074122, + "balance_loss_mlp": 1.04255509, + "epoch": 0.34821085032704885, + "flos": 576549840384.0, + "grad_norm": 0.05267537563651592, + "language_loss": 0.88190216, + "learning_rate": 0.000757159921478509, + "loss": 0.89264333, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.31542969, + "step": 1810, + "time_per_iteration": 2.743820905685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011251, + "balance_loss_mlp": 1.10993648, + "epoch": 0.34840323201231244, + "flos": 1524176911872.0, + "grad_norm": 0.032772528197798495, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75575733, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.15136719, + "step": 1811, + "time_per_iteration": 4.734825372695923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077463, + "balance_loss_mlp": 1.04713607, + "epoch": 0.34859561369757597, + "flos": 508910625792.0, + "grad_norm": 0.06138203683055377, + "language_loss": 0.87334222, + "learning_rate": 0.0007566253655367423, + "loss": 0.88411689, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.30273438, + "step": 1812, + "time_per_iteration": 2.5963358879089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081319, + "balance_loss_mlp": 1.04946637, + "epoch": 0.34878799538283956, + "flos": 548390117376.0, + "grad_norm": 0.05073723218815133, + "language_loss": 0.89626348, + "learning_rate": 0.000756357938067762, + "loss": 0.90707672, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.31835938, + "step": 1813, + "time_per_iteration": 2.6791560649871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088512, + "balance_loss_mlp": 1.05615854, + "epoch": 0.34898037706810314, + "flos": 983251021824.0, + "grad_norm": 0.07107132576327291, + "language_loss": 0.82739902, + "learning_rate": 0.0007560904110718033, + "loss": 0.83828408, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.32324219, + "step": 1814, + "time_per_iteration": 3.251187801361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084533, + "balance_loss_mlp": 1.05244136, + "epoch": 0.3491727587533667, + "flos": 681298435584.0, + "grad_norm": 0.056660731031110724, + "language_loss": 0.83390886, + "learning_rate": 0.0007558227846527297, + "loss": 0.84475422, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.32080078, + "step": 1815, + "time_per_iteration": 2.852786064147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086181, + "balance_loss_mlp": 1.05358887, + "epoch": 0.34936514043863026, + "flos": 393811310592.0, + "grad_norm": 0.06752757018776132, + "language_loss": 0.83192128, + "learning_rate": 0.0007555550589144429, + "loss": 0.84278309, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.32592773, + "step": 1816, + "time_per_iteration": 2.4226694107055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108673, + "balance_loss_mlp": 1.05568814, + "epoch": 0.3495575221238938, + "flos": 461120256000.0, + "grad_norm": 0.05637535729014081, + "language_loss": 0.84440207, + "learning_rate": 0.000755287233960883, + "loss": 0.85526937, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.31005859, + "step": 1817, + "time_per_iteration": 2.556528329849243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081988, + "balance_loss_mlp": 1.04963493, + "epoch": 0.3497499038091574, + "flos": 723859600896.0, + "grad_norm": 0.06861190177202381, + "language_loss": 0.77555025, + "learning_rate": 0.0007550193098960292, + "loss": 0.7863701, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.32348633, + "step": 1818, + "time_per_iteration": 2.9168636798858643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081065, + "balance_loss_mlp": 1.04902124, + "epoch": 0.3499422854944209, + "flos": 827364132864.0, + "grad_norm": 0.04890635253674866, + "language_loss": 0.85897982, + "learning_rate": 0.0007547512868238988, + "loss": 0.86979043, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.3203125, + "step": 1819, + "time_per_iteration": 3.147949695587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086999, + "balance_loss_mlp": 1.05583739, + "epoch": 0.3501346671796845, + "flos": 493214247936.0, + "grad_norm": 0.07359678742691168, + "language_loss": 0.83527619, + "learning_rate": 0.0007544831648485473, + "loss": 0.84614623, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.3112793, + "step": 1820, + "time_per_iteration": 2.683906078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084002, + "balance_loss_mlp": 1.05272126, + "epoch": 0.35032704886494803, + "flos": 578479173120.0, + "grad_norm": 0.07119738396785501, + "language_loss": 0.81087327, + "learning_rate": 0.0007542149440740694, + "loss": 0.82171333, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.3125, + "step": 1821, + "time_per_iteration": 2.738029718399048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107983, + "balance_loss_mlp": 1.04850197, + "epoch": 0.3505194305502116, + "flos": 584383472640.0, + "grad_norm": 0.07229829340096756, + "language_loss": 0.8569001, + "learning_rate": 0.000753946624604597, + "loss": 0.86769843, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.31298828, + "step": 1822, + "time_per_iteration": 2.7263731956481934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079169, + "balance_loss_mlp": 1.04795969, + "epoch": 0.3507118122354752, + "flos": 526705072128.0, + "grad_norm": 0.05660966900473529, + "language_loss": 0.87968546, + "learning_rate": 0.0007536782065443015, + "loss": 0.89047718, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.31176758, + "step": 1823, + "time_per_iteration": 2.64158034324646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108426, + "balance_loss_mlp": 1.05386138, + "epoch": 0.35090419392073874, + "flos": 511269152256.0, + "grad_norm": 0.06227259781784348, + "language_loss": 0.74483079, + "learning_rate": 0.0007534096899973919, + "loss": 0.75567335, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.3034668, + "step": 1824, + "time_per_iteration": 2.609548807144165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079276, + "balance_loss_mlp": 1.04804349, + "epoch": 0.3510965756060023, + "flos": 563728522752.0, + "grad_norm": 0.05520550621954613, + "language_loss": 0.82636261, + "learning_rate": 0.0007531410750681154, + "loss": 0.83715534, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.31201172, + "step": 1825, + "time_per_iteration": 2.7306325435638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094474, + "balance_loss_mlp": 1.06352782, + "epoch": 0.35128895729126586, + "flos": 1020107146752.0, + "grad_norm": 0.04890512262044313, + "language_loss": 0.86351258, + "learning_rate": 0.0007528723618607575, + "loss": 0.8744573, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.30908203, + "step": 1826, + "time_per_iteration": 3.4343338012695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088582, + "balance_loss_mlp": 1.05782557, + "epoch": 0.35148133897652944, + "flos": 587972915712.0, + "grad_norm": 0.05382597898667073, + "language_loss": 0.82364488, + "learning_rate": 0.0007526035504796422, + "loss": 0.83453071, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.30737305, + "step": 1827, + "time_per_iteration": 2.7783889770507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088781, + "balance_loss_mlp": 1.05721426, + "epoch": 0.351673720661793, + "flos": 495054830592.0, + "grad_norm": 0.07196751046410012, + "language_loss": 0.86701363, + "learning_rate": 0.0007523346410291312, + "loss": 0.87790149, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.31542969, + "step": 1828, + "time_per_iteration": 2.7925262451171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096578, + "balance_loss_mlp": 1.06434393, + "epoch": 0.35186610234705656, + "flos": 762339520512.0, + "grad_norm": 0.05953464089235074, + "language_loss": 0.84491026, + "learning_rate": 0.0007520656336136245, + "loss": 0.85587609, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.32226562, + "step": 1829, + "time_per_iteration": 2.9498770236968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095972, + "balance_loss_mlp": 1.0648104, + "epoch": 0.3520584840323201, + "flos": 625822820352.0, + "grad_norm": 0.05500553487662277, + "language_loss": 0.87983966, + "learning_rate": 0.0007517965283375599, + "loss": 0.89079928, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.3112793, + "step": 1830, + "time_per_iteration": 2.838120698928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097926, + "balance_loss_mlp": 1.06566763, + "epoch": 0.3522508657175837, + "flos": 537124193280.0, + "grad_norm": 0.053691241766720514, + "language_loss": 0.89336729, + "learning_rate": 0.0007515273253054132, + "loss": 0.90434659, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.32250977, + "step": 1831, + "time_per_iteration": 2.6600866317749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092956, + "balance_loss_mlp": 1.06191444, + "epoch": 0.35244324740284727, + "flos": 567105560064.0, + "grad_norm": 0.05928754583625919, + "language_loss": 0.82674569, + "learning_rate": 0.0007512580246216988, + "loss": 0.83767527, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.31005859, + "step": 1832, + "time_per_iteration": 2.7806639671325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089641, + "balance_loss_mlp": 1.05752611, + "epoch": 0.3526356290881108, + "flos": 512809989120.0, + "grad_norm": 0.0631616677310412, + "language_loss": 0.84810489, + "learning_rate": 0.000750988626390968, + "loss": 0.85900134, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.32104492, + "step": 1833, + "time_per_iteration": 2.5985615253448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087885, + "balance_loss_mlp": 1.0560801, + "epoch": 0.3528280107733744, + "flos": 595496627712.0, + "grad_norm": 0.053730319302775706, + "language_loss": 0.84857321, + "learning_rate": 0.0007507191307178108, + "loss": 0.85945207, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.31787109, + "step": 1834, + "time_per_iteration": 2.822472095489502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088823, + "balance_loss_mlp": 1.05785227, + "epoch": 0.3530203924586379, + "flos": 550969814016.0, + "grad_norm": 0.07238185360826516, + "language_loss": 0.74172056, + "learning_rate": 0.0007504495377068543, + "loss": 0.75260878, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.30932617, + "step": 1835, + "time_per_iteration": 2.758622884750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094119, + "balance_loss_mlp": 1.06250441, + "epoch": 0.3532127741439015, + "flos": 652662876672.0, + "grad_norm": 0.06860617015764896, + "language_loss": 0.81217551, + "learning_rate": 0.0007501798474627642, + "loss": 0.82311678, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.31591797, + "step": 1836, + "time_per_iteration": 2.932610034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095464, + "balance_loss_mlp": 1.06568563, + "epoch": 0.35340515582916504, + "flos": 722452594176.0, + "grad_norm": 0.06442397939494823, + "language_loss": 0.83527768, + "learning_rate": 0.0007499100600902433, + "loss": 0.8462323, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.29736328, + "step": 1837, + "time_per_iteration": 3.0089991092681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089306, + "balance_loss_mlp": 1.05845428, + "epoch": 0.35359753751442863, + "flos": 594619301376.0, + "grad_norm": 0.06893251529793973, + "language_loss": 0.83798671, + "learning_rate": 0.0007496401756940324, + "loss": 0.84887969, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.30810547, + "step": 1838, + "time_per_iteration": 2.6746418476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090814, + "balance_loss_mlp": 1.06029606, + "epoch": 0.3537899191996922, + "flos": 632384838144.0, + "grad_norm": 0.06403380726847299, + "language_loss": 0.82561135, + "learning_rate": 0.0007493701943789098, + "loss": 0.83651948, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.3046875, + "step": 1839, + "time_per_iteration": 2.7678062915802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092399, + "balance_loss_mlp": 1.06307316, + "epoch": 0.35398230088495575, + "flos": 506118523392.0, + "grad_norm": 0.057234368489623245, + "language_loss": 0.82641804, + "learning_rate": 0.000749100116249692, + "loss": 0.83734202, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.29272461, + "step": 1840, + "time_per_iteration": 2.6124982833862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091953, + "balance_loss_mlp": 1.0616498, + "epoch": 0.35417468257021933, + "flos": 507783015936.0, + "grad_norm": 0.09225915028059628, + "language_loss": 0.86273944, + "learning_rate": 0.0007488299414112321, + "loss": 0.87365901, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.30249023, + "step": 1841, + "time_per_iteration": 2.615434169769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087223, + "balance_loss_mlp": 1.05737281, + "epoch": 0.35436706425548287, + "flos": 656133046272.0, + "grad_norm": 0.0557731038759208, + "language_loss": 0.77796137, + "learning_rate": 0.0007485596699684215, + "loss": 0.78883362, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.2980957, + "step": 1842, + "time_per_iteration": 2.83414626121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087281, + "balance_loss_mlp": 1.05561948, + "epoch": 0.35455944594074645, + "flos": 652322433024.0, + "grad_norm": 0.04938820360777142, + "language_loss": 0.85113978, + "learning_rate": 0.000748289302026189, + "loss": 0.86201257, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.31640625, + "step": 1843, + "time_per_iteration": 2.8805251121520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084484, + "balance_loss_mlp": 1.05403841, + "epoch": 0.35475182762601, + "flos": 848240252928.0, + "grad_norm": 0.06499404847276229, + "language_loss": 0.85830677, + "learning_rate": 0.0007480188376895004, + "loss": 0.86915159, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.30395508, + "step": 1844, + "time_per_iteration": 3.0965142250061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062747, + "balance_loss_mlp": 1.04624832, + "epoch": 0.3549442093112736, + "flos": 1520644133376.0, + "grad_norm": 0.026974392702602535, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74874085, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.16503906, + "step": 1845, + "time_per_iteration": 5.003226280212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088616, + "balance_loss_mlp": 1.05738342, + "epoch": 0.3551365909965371, + "flos": 651087134208.0, + "grad_norm": 0.11496133406812095, + "language_loss": 0.78570682, + "learning_rate": 0.0007474776202528074, + "loss": 0.79659295, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.31201172, + "step": 1846, + "time_per_iteration": 2.9579098224639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089072, + "balance_loss_mlp": 1.05736208, + "epoch": 0.3553289726818007, + "flos": 897094213632.0, + "grad_norm": 0.06294098896241457, + "language_loss": 0.81369591, + "learning_rate": 0.000747206867362922, + "loss": 0.82458663, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.31689453, + "step": 1847, + "time_per_iteration": 3.0886905193328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109789, + "balance_loss_mlp": 1.06656218, + "epoch": 0.3555213543670643, + "flos": 688181958144.0, + "grad_norm": 0.060378794046525276, + "language_loss": 0.83593512, + "learning_rate": 0.0007469360184988194, + "loss": 0.84691405, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.31298828, + "step": 1848, + "time_per_iteration": 2.861438512802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109845, + "balance_loss_mlp": 1.06724131, + "epoch": 0.3557137360523278, + "flos": 538305647616.0, + "grad_norm": 0.06250375704468988, + "language_loss": 0.86663848, + "learning_rate": 0.0007466650737656518, + "loss": 0.87762296, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.31176758, + "step": 1849, + "time_per_iteration": 2.620384454727173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098996, + "balance_loss_mlp": 1.06754851, + "epoch": 0.3559061177375914, + "flos": 402039230976.0, + "grad_norm": 0.05619364173691644, + "language_loss": 0.90150386, + "learning_rate": 0.0007463940332686098, + "loss": 0.91249382, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.31420898, + "step": 1850, + "time_per_iteration": 2.499337911605835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097613, + "balance_loss_mlp": 1.06711888, + "epoch": 0.35609849942285493, + "flos": 696238170624.0, + "grad_norm": 0.05220134930851383, + "language_loss": 0.8454684, + "learning_rate": 0.0007461228971129205, + "loss": 0.85644454, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.30444336, + "step": 1851, + "time_per_iteration": 2.91583251953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090798, + "balance_loss_mlp": 1.06049538, + "epoch": 0.3562908811081185, + "flos": 568660953600.0, + "grad_norm": 0.06507053577711389, + "language_loss": 0.85374135, + "learning_rate": 0.0007458516654038483, + "loss": 0.8646493, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.30297852, + "step": 1852, + "time_per_iteration": 2.710845947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093158, + "balance_loss_mlp": 1.06221175, + "epoch": 0.35648326279338205, + "flos": 682081219584.0, + "grad_norm": 0.055267605083424515, + "language_loss": 0.86826843, + "learning_rate": 0.0007455803382466946, + "loss": 0.87919998, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.30908203, + "step": 1853, + "time_per_iteration": 2.8157601356506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089896, + "balance_loss_mlp": 1.05894923, + "epoch": 0.35667564447864564, + "flos": 628840475136.0, + "grad_norm": 0.06143674576014299, + "language_loss": 0.87150055, + "learning_rate": 0.0007453089157467979, + "loss": 0.8823995, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.30908203, + "step": 1854, + "time_per_iteration": 2.7985024452209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101035, + "balance_loss_mlp": 1.06946826, + "epoch": 0.35686802616390917, + "flos": 813685837824.0, + "grad_norm": 0.06203911404438901, + "language_loss": 0.82222199, + "learning_rate": 0.0007450373980095341, + "loss": 0.83323234, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.31542969, + "step": 1855, + "time_per_iteration": 3.0960283279418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101415, + "balance_loss_mlp": 1.07108843, + "epoch": 0.35706040784917276, + "flos": 525922288128.0, + "grad_norm": 0.05169641299516589, + "language_loss": 0.86845142, + "learning_rate": 0.0007447657851403155, + "loss": 0.87946558, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.30322266, + "step": 1856, + "time_per_iteration": 2.6420810222625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106839, + "balance_loss_mlp": 1.07689333, + "epoch": 0.35725278953443634, + "flos": 511698345984.0, + "grad_norm": 0.07027910399075639, + "language_loss": 0.78771162, + "learning_rate": 0.0007444940772445915, + "loss": 0.79878008, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.29907227, + "step": 1857, + "time_per_iteration": 2.748770236968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109389, + "balance_loss_mlp": 1.06420684, + "epoch": 0.3574451712196999, + "flos": 487162971648.0, + "grad_norm": 0.057407361829253975, + "language_loss": 0.80228555, + "learning_rate": 0.0007442222744278484, + "loss": 0.81322443, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.29663086, + "step": 1858, + "time_per_iteration": 2.652111530303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094475, + "balance_loss_mlp": 1.06410074, + "epoch": 0.35763755290496346, + "flos": 550384879104.0, + "grad_norm": 0.045384089682170406, + "language_loss": 0.8399753, + "learning_rate": 0.0007439503767956099, + "loss": 0.85092002, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.30371094, + "step": 1859, + "time_per_iteration": 2.703261375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044367, + "balance_loss_mlp": 1.03111064, + "epoch": 0.357829934590227, + "flos": 1503300791808.0, + "grad_norm": 0.02493030642290896, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80715972, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.1328125, + "step": 1860, + "time_per_iteration": 4.983760833740234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092897, + "balance_loss_mlp": 1.06242704, + "epoch": 0.3580223162754906, + "flos": 568410670080.0, + "grad_norm": 0.05045998946960442, + "language_loss": 0.85959804, + "learning_rate": 0.000743406297506922, + "loss": 0.87052703, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.30419922, + "step": 1861, + "time_per_iteration": 2.740078926086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090008, + "balance_loss_mlp": 1.05956221, + "epoch": 0.3582146979607541, + "flos": 626153089536.0, + "grad_norm": 0.05968554082553822, + "language_loss": 0.8392486, + "learning_rate": 0.0007431341160617031, + "loss": 0.85014868, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.30395508, + "step": 1862, + "time_per_iteration": 2.8886373043060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076671, + "balance_loss_mlp": 1.04631984, + "epoch": 0.3584070796460177, + "flos": 507010406400.0, + "grad_norm": 0.053643840261235066, + "language_loss": 0.88015211, + "learning_rate": 0.0007428618402234491, + "loss": 0.89091879, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.30297852, + "step": 1863, + "time_per_iteration": 2.687030553817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074194, + "balance_loss_mlp": 1.04334283, + "epoch": 0.3585994613312813, + "flos": 606190763520.0, + "grad_norm": 0.062332671108041963, + "language_loss": 0.80358481, + "learning_rate": 0.0007425894700978668, + "loss": 0.81432676, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.30810547, + "step": 1864, + "time_per_iteration": 2.7334656715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072556, + "balance_loss_mlp": 1.04101336, + "epoch": 0.3587918430165448, + "flos": 1412338484736.0, + "grad_norm": 0.050645747658019255, + "language_loss": 0.79510379, + "learning_rate": 0.0007423170057906996, + "loss": 0.80582935, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.31542969, + "step": 1865, + "time_per_iteration": 3.8669073581695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076041, + "balance_loss_mlp": 1.04452205, + "epoch": 0.3589842247018084, + "flos": 478313800704.0, + "grad_norm": 0.06345597879427126, + "language_loss": 0.86289865, + "learning_rate": 0.0007420444474077275, + "loss": 0.87365907, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.31518555, + "step": 1866, + "time_per_iteration": 2.5648367404937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080689, + "balance_loss_mlp": 1.04878831, + "epoch": 0.35917660638707194, + "flos": 504464205312.0, + "grad_norm": 0.058480526362169126, + "language_loss": 0.89744091, + "learning_rate": 0.0007417717950547671, + "loss": 0.90824777, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.31884766, + "step": 1867, + "time_per_iteration": 2.5665245056152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074714, + "balance_loss_mlp": 1.0600276, + "epoch": 0.3593689880723355, + "flos": 1491294191616.0, + "grad_norm": 0.04131149216661822, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77071321, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.14648438, + "step": 1868, + "time_per_iteration": 4.900072813034058 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091662, + "balance_loss_mlp": 1.06035757, + "epoch": 0.35956136975759906, + "flos": 528369564672.0, + "grad_norm": 0.04948067344873762, + "language_loss": 0.84714514, + "learning_rate": 0.0007412262088623299, + "loss": 0.85806173, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.31274414, + "step": 1869, + "time_per_iteration": 2.72872257232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109305, + "balance_loss_mlp": 1.06255615, + "epoch": 0.35975375144286265, + "flos": 534647803392.0, + "grad_norm": 0.0631690153505957, + "language_loss": 0.79514921, + "learning_rate": 0.0007409532752346684, + "loss": 0.80607969, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.30444336, + "step": 1870, + "time_per_iteration": 2.646813154220581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084384, + "balance_loss_mlp": 1.05436683, + "epoch": 0.3599461331281262, + "flos": 504695549952.0, + "grad_norm": 0.05200384527654752, + "language_loss": 0.88430232, + "learning_rate": 0.0007406802480606491, + "loss": 0.89514613, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.29956055, + "step": 1871, + "time_per_iteration": 2.6335039138793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088571, + "balance_loss_mlp": 1.05819631, + "epoch": 0.36013851481338977, + "flos": 511283708928.0, + "grad_norm": 0.058340376963862656, + "language_loss": 0.90469301, + "learning_rate": 0.0007404071274462707, + "loss": 0.91557872, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.3034668, + "step": 1872, + "time_per_iteration": 2.579155206680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088392, + "balance_loss_mlp": 1.05911398, + "epoch": 0.36033089649865335, + "flos": 547330908672.0, + "grad_norm": 0.06288764850432389, + "language_loss": 0.83945811, + "learning_rate": 0.0007401339134975682, + "loss": 0.85034204, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.29272461, + "step": 1873, + "time_per_iteration": 2.6590254306793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089736, + "balance_loss_mlp": 1.06024313, + "epoch": 0.3605232781839169, + "flos": 458416903680.0, + "grad_norm": 0.07025897777145818, + "language_loss": 0.84501064, + "learning_rate": 0.0007398606063206122, + "loss": 0.85590804, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.29467773, + "step": 1874, + "time_per_iteration": 2.6330654621124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084755, + "balance_loss_mlp": 1.05545354, + "epoch": 0.36071565986918047, + "flos": 509309296128.0, + "grad_norm": 0.05525815693458704, + "language_loss": 0.78668261, + "learning_rate": 0.0007395872060215101, + "loss": 0.79753017, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.29296875, + "step": 1875, + "time_per_iteration": 2.665205955505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087119, + "balance_loss_mlp": 1.05853248, + "epoch": 0.360908041554444, + "flos": 558931484160.0, + "grad_norm": 0.05566722247490556, + "language_loss": 0.88191175, + "learning_rate": 0.0007393137127064056, + "loss": 0.89278299, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.28588867, + "step": 1876, + "time_per_iteration": 2.67520809173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083262, + "balance_loss_mlp": 1.05479455, + "epoch": 0.3611004232397076, + "flos": 523588492800.0, + "grad_norm": 0.05183280051917729, + "language_loss": 0.84175742, + "learning_rate": 0.0007390401264814779, + "loss": 0.85258996, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.28491211, + "step": 1877, + "time_per_iteration": 2.621708631515503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083828, + "balance_loss_mlp": 1.05559897, + "epoch": 0.3612928049249711, + "flos": 540728193024.0, + "grad_norm": 0.059598774698536174, + "language_loss": 0.84762645, + "learning_rate": 0.0007387664474529427, + "loss": 0.85846466, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.28222656, + "step": 1878, + "time_per_iteration": 2.64604115486145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085745, + "balance_loss_mlp": 1.0567776, + "epoch": 0.3614851866102347, + "flos": 552289480704.0, + "grad_norm": 0.05278661870548292, + "language_loss": 0.90893793, + "learning_rate": 0.0007384926757270518, + "loss": 0.91979533, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.28955078, + "step": 1879, + "time_per_iteration": 2.63849139213562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094605, + "balance_loss_mlp": 1.0652554, + "epoch": 0.36167756829549824, + "flos": 771734338560.0, + "grad_norm": 0.05095981973878578, + "language_loss": 0.79965544, + "learning_rate": 0.0007382188114100924, + "loss": 0.81060153, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.29296875, + "step": 1880, + "time_per_iteration": 2.967137098312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096998, + "balance_loss_mlp": 1.06731534, + "epoch": 0.36186994998076183, + "flos": 711560609280.0, + "grad_norm": 0.0523610100033388, + "language_loss": 0.81541228, + "learning_rate": 0.0007379448546083884, + "loss": 0.82638228, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.29663086, + "step": 1881, + "time_per_iteration": 2.935075283050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089574, + "balance_loss_mlp": 1.06036723, + "epoch": 0.3620623316660254, + "flos": 747209138688.0, + "grad_norm": 0.056326792126263736, + "language_loss": 0.88131809, + "learning_rate": 0.0007376708054282992, + "loss": 0.89221382, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.29174805, + "step": 1882, + "time_per_iteration": 2.9548256397247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080549, + "balance_loss_mlp": 1.05074644, + "epoch": 0.36225471335128895, + "flos": 482312088576.0, + "grad_norm": 0.053377968629185854, + "language_loss": 0.8395232, + "learning_rate": 0.0007373966639762201, + "loss": 0.85032874, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.29785156, + "step": 1883, + "time_per_iteration": 2.5978147983551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079871, + "balance_loss_mlp": 1.05085516, + "epoch": 0.36244709503655254, + "flos": 506655406080.0, + "grad_norm": 0.055969169447774005, + "language_loss": 0.88542271, + "learning_rate": 0.0007371224303585822, + "loss": 0.8962214, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.29003906, + "step": 1884, + "time_per_iteration": 2.573521614074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122192, + "balance_loss_mlp": 1.10817313, + "epoch": 0.36263947672181607, + "flos": 1393302643200.0, + "grad_norm": 0.05390094690370155, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81479263, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.140625, + "step": 1885, + "time_per_iteration": 4.762617826461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077599, + "balance_loss_mlp": 1.04722452, + "epoch": 0.36283185840707965, + "flos": 652991735808.0, + "grad_norm": 0.05279204841925659, + "language_loss": 0.8277564, + "learning_rate": 0.0007365736870525335, + "loss": 0.83853239, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.30322266, + "step": 1886, + "time_per_iteration": 2.8206799030303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071958, + "balance_loss_mlp": 1.04182231, + "epoch": 0.3630242400923432, + "flos": 488619440640.0, + "grad_norm": 0.0631822735743998, + "language_loss": 0.82252121, + "learning_rate": 0.000736299177577164, + "loss": 0.83324087, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.30102539, + "step": 1887, + "time_per_iteration": 2.5644423961639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075611, + "balance_loss_mlp": 1.04516482, + "epoch": 0.3632166217776068, + "flos": 516892644864.0, + "grad_norm": 0.06952119877485304, + "language_loss": 0.83928037, + "learning_rate": 0.0007360245763623174, + "loss": 0.8500365, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.30395508, + "step": 1888, + "time_per_iteration": 2.68868088722229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076643, + "balance_loss_mlp": 1.04614949, + "epoch": 0.36340900346287036, + "flos": 645881250816.0, + "grad_norm": 0.05500458280543127, + "language_loss": 0.89759338, + "learning_rate": 0.0007357498835146039, + "loss": 0.90835977, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.30444336, + "step": 1889, + "time_per_iteration": 2.841135263442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078037, + "balance_loss_mlp": 1.04716182, + "epoch": 0.3636013851481339, + "flos": 553057708032.0, + "grad_norm": 0.05518095134274227, + "language_loss": 0.86945391, + "learning_rate": 0.0007354750991406684, + "loss": 0.8802343, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.30834961, + "step": 1890, + "time_per_iteration": 2.6954762935638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079646, + "balance_loss_mlp": 1.04810333, + "epoch": 0.3637937668333975, + "flos": 546395355648.0, + "grad_norm": 0.060964398763012274, + "language_loss": 0.80524838, + "learning_rate": 0.0007352002233471919, + "loss": 0.81604487, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.31518555, + "step": 1891, + "time_per_iteration": 2.6167404651641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079464, + "balance_loss_mlp": 1.04973292, + "epoch": 0.363986148518661, + "flos": 537838576128.0, + "grad_norm": 0.06807309201777603, + "language_loss": 0.79092562, + "learning_rate": 0.0007349252562408906, + "loss": 0.80172026, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.296875, + "step": 1892, + "time_per_iteration": 2.6944479942321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091379, + "balance_loss_mlp": 1.06071806, + "epoch": 0.3641785302039246, + "flos": 659895607296.0, + "grad_norm": 0.05563142804906438, + "language_loss": 0.81399196, + "learning_rate": 0.0007346501979285158, + "loss": 0.82490575, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.30615234, + "step": 1893, + "time_per_iteration": 2.8852903842926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074867, + "balance_loss_mlp": 1.06208813, + "epoch": 0.36437091188918813, + "flos": 1467911158272.0, + "grad_norm": 0.02944776437417564, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.8161397, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.12792969, + "step": 1894, + "time_per_iteration": 4.784174680709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114227, + "balance_loss_mlp": 1.0819447, + "epoch": 0.3645632935744517, + "flos": 597012733440.0, + "grad_norm": 0.051755500006301046, + "language_loss": 0.8558799, + "learning_rate": 0.0007340998081127308, + "loss": 0.86702216, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.32275391, + "step": 1895, + "time_per_iteration": 2.807494878768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121943, + "balance_loss_mlp": 1.09023345, + "epoch": 0.36475567525971525, + "flos": 599214108672.0, + "grad_norm": 0.06567695066031824, + "language_loss": 0.90748346, + "learning_rate": 0.0007338244768230007, + "loss": 0.9187029, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.31689453, + "step": 1896, + "time_per_iteration": 2.7678794860839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118221, + "balance_loss_mlp": 1.08694077, + "epoch": 0.36494805694497884, + "flos": 798047686656.0, + "grad_norm": 0.07782470610585689, + "language_loss": 0.8913762, + "learning_rate": 0.0007335490547545578, + "loss": 0.90255845, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.3125, + "step": 1897, + "time_per_iteration": 3.0801138877868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112607, + "balance_loss_mlp": 1.0822562, + "epoch": 0.3651404386302424, + "flos": 637023315456.0, + "grad_norm": 0.05264242736204855, + "language_loss": 0.82653165, + "learning_rate": 0.0007332735420143308, + "loss": 0.83765769, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.30297852, + "step": 1898, + "time_per_iteration": 2.7581489086151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094572, + "balance_loss_mlp": 1.06338716, + "epoch": 0.36533282031550596, + "flos": 491337349632.0, + "grad_norm": 0.06387883695900265, + "language_loss": 0.8681283, + "learning_rate": 0.0007329979387092826, + "loss": 0.87907398, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.31152344, + "step": 1899, + "time_per_iteration": 2.586489677429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090283, + "balance_loss_mlp": 1.05964673, + "epoch": 0.36552520200076954, + "flos": 855587874816.0, + "grad_norm": 0.054083416077733606, + "language_loss": 0.83626556, + "learning_rate": 0.0007327222449464124, + "loss": 0.84716845, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.3059082, + "step": 1900, + "time_per_iteration": 3.2495076656341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010843, + "balance_loss_mlp": 1.0518986, + "epoch": 0.3657175836860331, + "flos": 483449872896.0, + "grad_norm": 0.05500564094416643, + "language_loss": 0.88598847, + "learning_rate": 0.0007324464608327538, + "loss": 0.89683151, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.32397461, + "step": 1901, + "time_per_iteration": 2.617971897125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079363, + "balance_loss_mlp": 1.04786777, + "epoch": 0.36590996537129666, + "flos": 434561006592.0, + "grad_norm": 0.0538418205513684, + "language_loss": 0.88291639, + "learning_rate": 0.0007321705864753758, + "loss": 0.89371002, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.31469727, + "step": 1902, + "time_per_iteration": 2.69343638420105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074418, + "balance_loss_mlp": 1.04294717, + "epoch": 0.3661023470565602, + "flos": 711880704000.0, + "grad_norm": 0.056477009868628435, + "language_loss": 0.84098166, + "learning_rate": 0.0007318946219813823, + "loss": 0.85172582, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.31469727, + "step": 1903, + "time_per_iteration": 3.010847568511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074916, + "balance_loss_mlp": 1.04232407, + "epoch": 0.3662947287418238, + "flos": 564495340032.0, + "grad_norm": 0.05768945263904951, + "language_loss": 0.89714533, + "learning_rate": 0.000731618567457912, + "loss": 0.90789449, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.32592773, + "step": 1904, + "time_per_iteration": 2.6410703659057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076588, + "balance_loss_mlp": 1.0440681, + "epoch": 0.3664871104270873, + "flos": 789391982592.0, + "grad_norm": 0.05570087619571841, + "language_loss": 0.86445332, + "learning_rate": 0.000731342423012139, + "loss": 0.87521917, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.32519531, + "step": 1905, + "time_per_iteration": 3.054703712463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074863, + "balance_loss_mlp": 1.04312992, + "epoch": 0.3666794921123509, + "flos": 752202616320.0, + "grad_norm": 0.05663901457074664, + "language_loss": 0.82393479, + "learning_rate": 0.0007310661887512722, + "loss": 0.83468342, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.31713867, + "step": 1906, + "time_per_iteration": 3.0096654891967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076944, + "balance_loss_mlp": 1.04532969, + "epoch": 0.3668718737976145, + "flos": 523264015872.0, + "grad_norm": 0.07427377535541638, + "language_loss": 0.8207258, + "learning_rate": 0.0007307898647825549, + "loss": 0.83149529, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.31591797, + "step": 1907, + "time_per_iteration": 2.67525315284729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075733, + "balance_loss_mlp": 1.04347432, + "epoch": 0.367064255482878, + "flos": 571698957312.0, + "grad_norm": 0.07021562329929035, + "language_loss": 0.89152002, + "learning_rate": 0.0007305134512132659, + "loss": 0.90227735, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.32250977, + "step": 1908, + "time_per_iteration": 2.6483352184295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079044, + "balance_loss_mlp": 1.0476923, + "epoch": 0.3672566371681416, + "flos": 446880347136.0, + "grad_norm": 0.07878350898766671, + "language_loss": 0.83255082, + "learning_rate": 0.0007302369481507183, + "loss": 0.84334129, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.31323242, + "step": 1909, + "time_per_iteration": 2.5106606483459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108859, + "balance_loss_mlp": 1.09207463, + "epoch": 0.36744901885340514, + "flos": 1539275208192.0, + "grad_norm": 0.039316944601114644, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.8107062, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.16796875, + "step": 1910, + "time_per_iteration": 4.845642566680908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073802, + "balance_loss_mlp": 1.04287899, + "epoch": 0.36764140053866873, + "flos": 563417192448.0, + "grad_norm": 0.05282525969479425, + "language_loss": 0.8551507, + "learning_rate": 0.000729683673975274, + "loss": 0.86588871, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.30883789, + "step": 1911, + "time_per_iteration": 2.643991470336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077837, + "balance_loss_mlp": 1.04648542, + "epoch": 0.36783378222393226, + "flos": 1216168971264.0, + "grad_norm": 0.06579029503933971, + "language_loss": 0.83071077, + "learning_rate": 0.0007294069030771774, + "loss": 0.84148908, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.31323242, + "step": 1912, + "time_per_iteration": 3.6745028495788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081127, + "balance_loss_mlp": 1.05053759, + "epoch": 0.36802616390919585, + "flos": 498476947968.0, + "grad_norm": 0.055639286508135585, + "language_loss": 0.90529931, + "learning_rate": 0.0007291300431154224, + "loss": 0.91611063, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.30541992, + "step": 1913, + "time_per_iteration": 2.6364145278930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020102, + "balance_loss_mlp": 1.00503433, + "epoch": 0.36821854559445943, + "flos": 1581281961984.0, + "grad_norm": 0.014819520409209537, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71409839, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.15039062, + "step": 1914, + "time_per_iteration": 4.986552000045776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089166, + "balance_loss_mlp": 1.05895889, + "epoch": 0.36841092727972297, + "flos": 835261784064.0, + "grad_norm": 0.07166131614104637, + "language_loss": 0.80129957, + "learning_rate": 0.0007285760564309179, + "loss": 0.81219125, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.30151367, + "step": 1915, + "time_per_iteration": 3.105180025100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082922, + "balance_loss_mlp": 1.05362058, + "epoch": 0.36860330896498655, + "flos": 689517591552.0, + "grad_norm": 0.07315246202889085, + "language_loss": 0.85023272, + "learning_rate": 0.0007282989299232448, + "loss": 0.86106199, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.29272461, + "step": 1916, + "time_per_iteration": 3.0501549243927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085882, + "balance_loss_mlp": 1.05710506, + "epoch": 0.3687956906502501, + "flos": 553919067648.0, + "grad_norm": 0.0682472178493412, + "language_loss": 0.83468378, + "learning_rate": 0.0007280217147820668, + "loss": 0.84554267, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.28735352, + "step": 1917, + "time_per_iteration": 2.61570143699646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096949, + "balance_loss_mlp": 1.06836295, + "epoch": 0.3689880723355137, + "flos": 576426184704.0, + "grad_norm": 0.06368361877082852, + "language_loss": 0.79183483, + "learning_rate": 0.0007277444111150079, + "loss": 0.80280429, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.28613281, + "step": 1918, + "time_per_iteration": 2.7004950046539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108995, + "balance_loss_mlp": 1.06124449, + "epoch": 0.3691804540207772, + "flos": 528615465984.0, + "grad_norm": 0.07280537378335762, + "language_loss": 0.84052753, + "learning_rate": 0.0007274670190297272, + "loss": 0.85142708, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.28710938, + "step": 1919, + "time_per_iteration": 2.598128080368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098824, + "balance_loss_mlp": 1.06902122, + "epoch": 0.3693728357060408, + "flos": 560729806848.0, + "grad_norm": 0.05243134255501039, + "language_loss": 0.82081646, + "learning_rate": 0.0007271895386339179, + "loss": 0.83180475, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.29736328, + "step": 1920, + "time_per_iteration": 2.7843308448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093148, + "balance_loss_mlp": 1.06360769, + "epoch": 0.3695652173913043, + "flos": 579488919552.0, + "grad_norm": 0.058714378397154585, + "language_loss": 0.83102447, + "learning_rate": 0.0007269119700353073, + "loss": 0.8419559, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.29492188, + "step": 1921, + "time_per_iteration": 2.7665817737579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089369, + "balance_loss_mlp": 1.06052053, + "epoch": 0.3697575990765679, + "flos": 512629516800.0, + "grad_norm": 0.04695414461356542, + "language_loss": 0.84780574, + "learning_rate": 0.0007266343133416571, + "loss": 0.85869944, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.28833008, + "step": 1922, + "time_per_iteration": 2.779585361480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065569, + "balance_loss_mlp": 1.05011928, + "epoch": 0.3699499807618315, + "flos": 1569826953216.0, + "grad_norm": 0.04139595668748732, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78182483, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.15429688, + "step": 1923, + "time_per_iteration": 4.841213703155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085527, + "balance_loss_mlp": 1.05591547, + "epoch": 0.37014236244709503, + "flos": 497093262336.0, + "grad_norm": 0.07673769099321799, + "language_loss": 0.84293365, + "learning_rate": 0.0007260787361004556, + "loss": 0.85378897, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.2956543, + "step": 1924, + "time_per_iteration": 2.5501017570495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023058, + "balance_loss_mlp": 1.00875258, + "epoch": 0.3703347441323586, + "flos": 1443562048512.0, + "grad_norm": 0.01226438472350035, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74784565, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.14257812, + "step": 1925, + "time_per_iteration": 4.9058191776275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079083, + "balance_loss_mlp": 1.05040073, + "epoch": 0.37052712581762215, + "flos": 563324060160.0, + "grad_norm": 0.0733591012555623, + "language_loss": 0.87266588, + "learning_rate": 0.0007255228077730903, + "loss": 0.88345671, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.28686523, + "step": 1926, + "time_per_iteration": 2.6776785850524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080805, + "balance_loss_mlp": 1.05281413, + "epoch": 0.37071950750288574, + "flos": 925706451456.0, + "grad_norm": 0.05143591599053885, + "language_loss": 0.81313562, + "learning_rate": 0.0007252447122218632, + "loss": 0.82394373, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.2800293, + "step": 1927, + "time_per_iteration": 3.1710472106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077781, + "balance_loss_mlp": 1.04907489, + "epoch": 0.37091188918814927, + "flos": 418090609152.0, + "grad_norm": 0.07597924069729044, + "language_loss": 0.88653511, + "learning_rate": 0.0007249665292228834, + "loss": 0.89731288, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.28686523, + "step": 1928, + "time_per_iteration": 2.580092191696167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108352, + "balance_loss_mlp": 1.0547905, + "epoch": 0.37110427087341286, + "flos": 462941899776.0, + "grad_norm": 0.05796370091963761, + "language_loss": 0.8379482, + "learning_rate": 0.000724688258884151, + "loss": 0.84878337, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.28710938, + "step": 1929, + "time_per_iteration": 2.6322267055511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086107, + "balance_loss_mlp": 1.05740142, + "epoch": 0.3712966525586764, + "flos": 849303843840.0, + "grad_norm": 0.049384577339976525, + "language_loss": 0.86327779, + "learning_rate": 0.0007244099013137002, + "loss": 0.87413883, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.28710938, + "step": 1930, + "time_per_iteration": 3.09224009513855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087908, + "balance_loss_mlp": 1.05951214, + "epoch": 0.37148903424394, + "flos": 925555092480.0, + "grad_norm": 0.06129670734370297, + "language_loss": 0.88767004, + "learning_rate": 0.0007241314566195993, + "loss": 0.89854914, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.28393555, + "step": 1931, + "time_per_iteration": 3.238381862640381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094186, + "balance_loss_mlp": 1.06531322, + "epoch": 0.37168141592920356, + "flos": 519565473792.0, + "grad_norm": 0.05545779345638414, + "language_loss": 0.85434037, + "learning_rate": 0.0007238529249099496, + "loss": 0.86528224, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.28833008, + "step": 1932, + "time_per_iteration": 2.632279872894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159138, + "balance_loss_mlp": 1.1475507, + "epoch": 0.3718737976144671, + "flos": 1445107267584.0, + "grad_norm": 0.054961579821259376, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.79016018, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.11572266, + "step": 1933, + "time_per_iteration": 4.920037746429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098131, + "balance_loss_mlp": 1.06902027, + "epoch": 0.3720661792997307, + "flos": 759218558976.0, + "grad_norm": 0.06411393233522368, + "language_loss": 0.80432916, + "learning_rate": 0.000723295600876581, + "loss": 0.81531054, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.29101562, + "step": 1934, + "time_per_iteration": 3.060438632965088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093478, + "balance_loss_mlp": 1.06510615, + "epoch": 0.3722585609849942, + "flos": 516686031360.0, + "grad_norm": 0.054125512250282885, + "language_loss": 0.87856102, + "learning_rate": 0.0007230168087692344, + "loss": 0.88949579, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.28393555, + "step": 1935, + "time_per_iteration": 2.655176877975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095042, + "balance_loss_mlp": 1.06607461, + "epoch": 0.3724509426702578, + "flos": 782114171904.0, + "grad_norm": 0.053712544631880174, + "language_loss": 0.82501912, + "learning_rate": 0.0007227379300790839, + "loss": 0.83596957, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.28955078, + "step": 1936, + "time_per_iteration": 3.05722713470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086223, + "balance_loss_mlp": 1.05668318, + "epoch": 0.37264332435552133, + "flos": 391502246400.0, + "grad_norm": 0.05452705072121448, + "language_loss": 0.85148442, + "learning_rate": 0.0007224589649143997, + "loss": 0.86234665, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.29492188, + "step": 1937, + "time_per_iteration": 2.593818187713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089067, + "balance_loss_mlp": 1.06021869, + "epoch": 0.3728357060407849, + "flos": 542599299072.0, + "grad_norm": 0.08689315573767935, + "language_loss": 0.80660325, + "learning_rate": 0.0007221799133834861, + "loss": 0.81749392, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.28833008, + "step": 1938, + "time_per_iteration": 2.6238772869110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087089, + "balance_loss_mlp": 1.05869377, + "epoch": 0.3730280877260485, + "flos": 433344646656.0, + "grad_norm": 0.06550449761554421, + "language_loss": 0.81904262, + "learning_rate": 0.00072190077559468, + "loss": 0.8299135, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.28417969, + "step": 1939, + "time_per_iteration": 2.5338878631591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086155, + "balance_loss_mlp": 1.05649543, + "epoch": 0.37322046941131204, + "flos": 531230068224.0, + "grad_norm": 0.05171807924061888, + "language_loss": 0.89000612, + "learning_rate": 0.0007216215516563527, + "loss": 0.90086764, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.29589844, + "step": 1940, + "time_per_iteration": 2.717912435531616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083323, + "balance_loss_mlp": 1.05449796, + "epoch": 0.3734128510965756, + "flos": 531294087168.0, + "grad_norm": 0.06398735943962416, + "language_loss": 0.83462608, + "learning_rate": 0.0007213422416769083, + "loss": 0.84545934, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.28808594, + "step": 1941, + "time_per_iteration": 2.6354072093963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107949, + "balance_loss_mlp": 1.0511179, + "epoch": 0.37360523278183916, + "flos": 500195284992.0, + "grad_norm": 0.05310409823342424, + "language_loss": 0.75118601, + "learning_rate": 0.0007210628457647849, + "loss": 0.76198089, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.28369141, + "step": 1942, + "time_per_iteration": 2.573251724243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080746, + "balance_loss_mlp": 1.05118251, + "epoch": 0.37379761446710275, + "flos": 547652413440.0, + "grad_norm": 0.05561530112530558, + "language_loss": 0.78689432, + "learning_rate": 0.000720783364028453, + "loss": 0.79770184, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.29516602, + "step": 1943, + "time_per_iteration": 2.782897472381592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078848, + "balance_loss_mlp": 1.04935515, + "epoch": 0.3739899961523663, + "flos": 475517316096.0, + "grad_norm": 0.05583674557333592, + "language_loss": 0.87426305, + "learning_rate": 0.0007205037965764177, + "loss": 0.88505149, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.29467773, + "step": 1944, + "time_per_iteration": 2.577195167541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076729, + "balance_loss_mlp": 1.04740369, + "epoch": 0.37418237783762986, + "flos": 611626581504.0, + "grad_norm": 0.05970518460248593, + "language_loss": 0.8568424, + "learning_rate": 0.0007202241435172161, + "loss": 0.86760962, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.29296875, + "step": 1945, + "time_per_iteration": 2.7356040477752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077347, + "balance_loss_mlp": 1.04849827, + "epoch": 0.3743747595228934, + "flos": 765953694720.0, + "grad_norm": 0.057784843601785166, + "language_loss": 0.88219595, + "learning_rate": 0.0007199444049594198, + "loss": 0.89296943, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.28833008, + "step": 1946, + "time_per_iteration": 2.997744560241699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075997, + "balance_loss_mlp": 1.04681468, + "epoch": 0.374567141208157, + "flos": 524120993280.0, + "grad_norm": 0.05996621635377081, + "language_loss": 0.83343232, + "learning_rate": 0.0007196645810116322, + "loss": 0.84419227, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.29150391, + "step": 1947, + "time_per_iteration": 2.6596434116363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071198, + "balance_loss_mlp": 1.04308891, + "epoch": 0.37475952289342057, + "flos": 681067090944.0, + "grad_norm": 0.07792528533349045, + "language_loss": 0.8387686, + "learning_rate": 0.0007193846717824912, + "loss": 0.84948057, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.28149414, + "step": 1948, + "time_per_iteration": 2.87357759475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067752, + "balance_loss_mlp": 1.04031014, + "epoch": 0.3749519045786841, + "flos": 460061047296.0, + "grad_norm": 0.06284621907245236, + "language_loss": 0.88014293, + "learning_rate": 0.0007191046773806669, + "loss": 0.89082038, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.27514648, + "step": 1949, + "time_per_iteration": 2.616118907928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073776, + "balance_loss_mlp": 1.04473686, + "epoch": 0.3751442862639477, + "flos": 954471458304.0, + "grad_norm": 0.06080214721481266, + "language_loss": 0.83072305, + "learning_rate": 0.0007188245979148631, + "loss": 0.84146082, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.29003906, + "step": 1950, + "time_per_iteration": 3.212918281555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080402, + "balance_loss_mlp": 1.05164886, + "epoch": 0.3753366679492112, + "flos": 527483473920.0, + "grad_norm": 0.06034460157863772, + "language_loss": 0.87560785, + "learning_rate": 0.0007185444334938157, + "loss": 0.88641185, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.28735352, + "step": 1951, + "time_per_iteration": 2.6847927570343018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074706, + "balance_loss_mlp": 1.04635811, + "epoch": 0.3755290496344748, + "flos": 521535504384.0, + "grad_norm": 0.07362347851216991, + "language_loss": 0.85023165, + "learning_rate": 0.0007182641842262947, + "loss": 0.86097872, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.28320312, + "step": 1952, + "time_per_iteration": 2.6011481285095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080682, + "balance_loss_mlp": 1.05252457, + "epoch": 0.37572143131973834, + "flos": 620810403840.0, + "grad_norm": 0.05143100601063952, + "language_loss": 0.77525514, + "learning_rate": 0.0007179838502211022, + "loss": 0.78606194, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.28198242, + "step": 1953, + "time_per_iteration": 2.8322203159332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082839, + "balance_loss_mlp": 1.05487227, + "epoch": 0.37591381300500193, + "flos": 770635842048.0, + "grad_norm": 0.06528688845841664, + "language_loss": 0.86487108, + "learning_rate": 0.0007177034315870738, + "loss": 0.87569952, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.27978516, + "step": 1954, + "time_per_iteration": 2.9551377296447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077022, + "balance_loss_mlp": 1.04896057, + "epoch": 0.37610619469026546, + "flos": 520191106560.0, + "grad_norm": 0.059767476828271, + "language_loss": 0.90968794, + "learning_rate": 0.0007174229284330773, + "loss": 0.9204582, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.28076172, + "step": 1955, + "time_per_iteration": 2.5916919708251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076033, + "balance_loss_mlp": 1.0481143, + "epoch": 0.37629857637552905, + "flos": 598524456960.0, + "grad_norm": 0.06317358450106399, + "language_loss": 0.87043428, + "learning_rate": 0.0007171423408680141, + "loss": 0.88119459, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.27954102, + "step": 1956, + "time_per_iteration": 2.8243377208709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072634, + "balance_loss_mlp": 1.04352272, + "epoch": 0.37649095806079264, + "flos": 564687396864.0, + "grad_norm": 0.057758823731725896, + "language_loss": 0.89565909, + "learning_rate": 0.0007168616690008176, + "loss": 0.90638542, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.29125977, + "step": 1957, + "time_per_iteration": 2.6314306259155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074053, + "balance_loss_mlp": 1.04572916, + "epoch": 0.37668333974605617, + "flos": 592196755968.0, + "grad_norm": 0.055146864479517985, + "language_loss": 0.86279052, + "learning_rate": 0.0007165809129404545, + "loss": 0.87353098, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.28320312, + "step": 1958, + "time_per_iteration": 2.7625439167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074993, + "balance_loss_mlp": 1.044595, + "epoch": 0.37687572143131975, + "flos": 419257506816.0, + "grad_norm": 0.06141204693847206, + "language_loss": 0.85977095, + "learning_rate": 0.0007163000727959239, + "loss": 0.87052089, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.30371094, + "step": 1959, + "time_per_iteration": 2.473407506942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061387, + "balance_loss_mlp": 1.04622388, + "epoch": 0.3770681031165833, + "flos": 1356484243968.0, + "grad_norm": 0.02935416999593297, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79020452, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.15136719, + "step": 1960, + "time_per_iteration": 4.8784215450286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079277, + "balance_loss_mlp": 1.04973722, + "epoch": 0.3772604848018469, + "flos": 644592107520.0, + "grad_norm": 0.05722982355969982, + "language_loss": 0.84446192, + "learning_rate": 0.00071573814069052, + "loss": 0.85525477, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.29541016, + "step": 1961, + "time_per_iteration": 2.929955244064331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078902, + "balance_loss_mlp": 1.05031538, + "epoch": 0.3774528664871104, + "flos": 901265619456.0, + "grad_norm": 0.053564242831421076, + "language_loss": 0.88053226, + "learning_rate": 0.0007154570489478081, + "loss": 0.8913213, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.28540039, + "step": 1962, + "time_per_iteration": 3.1691505908966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079242, + "balance_loss_mlp": 1.05001187, + "epoch": 0.377645248172374, + "flos": 787717315584.0, + "grad_norm": 0.05213464978332433, + "language_loss": 0.86570239, + "learning_rate": 0.0007151758735572514, + "loss": 0.87649477, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.29174805, + "step": 1963, + "time_per_iteration": 2.9893381595611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080371, + "balance_loss_mlp": 1.05190408, + "epoch": 0.3778376298576376, + "flos": 586417522176.0, + "grad_norm": 0.06256473208381459, + "language_loss": 0.80730724, + "learning_rate": 0.0007148946146280119, + "loss": 0.81811094, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.28442383, + "step": 1964, + "time_per_iteration": 2.8270015716552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015118, + "balance_loss_mlp": 1.00214851, + "epoch": 0.3780300115429011, + "flos": 1396014759936.0, + "grad_norm": 0.01808471901321765, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73207271, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.12988281, + "step": 1965, + "time_per_iteration": 4.895836353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018206, + "balance_loss_mlp": 1.00561714, + "epoch": 0.3782223932281647, + "flos": 1356935348736.0, + "grad_norm": 0.021930840707602553, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76360154, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.12597656, + "step": 1966, + "time_per_iteration": 5.0023956298828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091314, + "balance_loss_mlp": 1.06358576, + "epoch": 0.37841477491342823, + "flos": 703811344896.0, + "grad_norm": 0.04479252262380658, + "language_loss": 0.83477217, + "learning_rate": 0.0007140503377003022, + "loss": 0.84568524, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.27734375, + "step": 1967, + "time_per_iteration": 3.0142691135406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097939, + "balance_loss_mlp": 1.07011509, + "epoch": 0.3786071565986918, + "flos": 528856985088.0, + "grad_norm": 0.049620821678558774, + "language_loss": 0.8500334, + "learning_rate": 0.000713768745708599, + "loss": 0.86101276, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.27856445, + "step": 1968, + "time_per_iteration": 2.6556408405303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109518, + "balance_loss_mlp": 1.06807137, + "epoch": 0.37879953828395535, + "flos": 992872802304.0, + "grad_norm": 0.05249502952466034, + "language_loss": 0.7739228, + "learning_rate": 0.0007134870707245085, + "loss": 0.78487462, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.27148438, + "step": 1969, + "time_per_iteration": 3.2944319248199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097317, + "balance_loss_mlp": 1.0706377, + "epoch": 0.37899191996921894, + "flos": 626358292992.0, + "grad_norm": 0.06611086672726225, + "language_loss": 0.84358507, + "learning_rate": 0.0007132053128573864, + "loss": 0.85455823, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.26733398, + "step": 1970, + "time_per_iteration": 2.745910167694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101147, + "balance_loss_mlp": 1.07422984, + "epoch": 0.37918430165448247, + "flos": 686005314048.0, + "grad_norm": 0.07389156257299019, + "language_loss": 0.83986598, + "learning_rate": 0.0007129234722166211, + "loss": 0.8508774, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.26977539, + "step": 1971, + "time_per_iteration": 2.8552701473236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095612, + "balance_loss_mlp": 1.06881404, + "epoch": 0.37937668333974606, + "flos": 475374721536.0, + "grad_norm": 0.0464186232668544, + "language_loss": 0.90731955, + "learning_rate": 0.0007126415489116328, + "loss": 0.91827571, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.26818848, + "step": 1972, + "time_per_iteration": 2.6738507747650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089531, + "balance_loss_mlp": 1.06185079, + "epoch": 0.37956906502500964, + "flos": 707271340032.0, + "grad_norm": 0.05397666452651625, + "language_loss": 0.81034803, + "learning_rate": 0.0007123595430518736, + "loss": 0.82124341, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.27685547, + "step": 1973, + "time_per_iteration": 2.8551318645477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089653, + "balance_loss_mlp": 1.06225908, + "epoch": 0.3797614467102732, + "flos": 426421836288.0, + "grad_norm": 0.07183677804285386, + "language_loss": 0.86159599, + "learning_rate": 0.0007120774547468282, + "loss": 0.87249249, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.27416992, + "step": 1974, + "time_per_iteration": 2.5466248989105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091836, + "balance_loss_mlp": 1.06477594, + "epoch": 0.37995382839553676, + "flos": 481588941312.0, + "grad_norm": 0.057862181788604236, + "language_loss": 0.81643212, + "learning_rate": 0.0007117952841060128, + "loss": 0.82735044, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.27099609, + "step": 1975, + "time_per_iteration": 2.6863863468170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010857, + "balance_loss_mlp": 1.05813885, + "epoch": 0.3801462100808003, + "flos": 560286056448.0, + "grad_norm": 0.06251241790432795, + "language_loss": 0.83861643, + "learning_rate": 0.0007115130312389756, + "loss": 0.84947342, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.27587891, + "step": 1976, + "time_per_iteration": 2.6821115016937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088536, + "balance_loss_mlp": 1.0602119, + "epoch": 0.3803385917660639, + "flos": 464699524608.0, + "grad_norm": 0.063889045898505, + "language_loss": 0.79037011, + "learning_rate": 0.0007112306962552973, + "loss": 0.80125546, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.28320312, + "step": 1977, + "time_per_iteration": 2.5958874225616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086598, + "balance_loss_mlp": 1.05877423, + "epoch": 0.3805309734513274, + "flos": 521614080000.0, + "grad_norm": 0.055122671956433805, + "language_loss": 0.85178941, + "learning_rate": 0.0007109482792645896, + "loss": 0.8626554, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.27832031, + "step": 1978, + "time_per_iteration": 2.706073760986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081892, + "balance_loss_mlp": 1.05363917, + "epoch": 0.380723355136591, + "flos": 591128782848.0, + "grad_norm": 0.06407360303991923, + "language_loss": 0.83617824, + "learning_rate": 0.0007106657803764969, + "loss": 0.84699714, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.2824707, + "step": 1979, + "time_per_iteration": 2.7429239749908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078619, + "balance_loss_mlp": 1.05022287, + "epoch": 0.38091573682185453, + "flos": 622394910720.0, + "grad_norm": 0.07177583644367627, + "language_loss": 0.8165133, + "learning_rate": 0.0007103831997006948, + "loss": 0.82729954, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.28393555, + "step": 1980, + "time_per_iteration": 2.7360527515411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072489, + "balance_loss_mlp": 1.04361689, + "epoch": 0.3811081185071181, + "flos": 568716208128.0, + "grad_norm": 0.06360208542685557, + "language_loss": 0.85186386, + "learning_rate": 0.0007101005373468908, + "loss": 0.86258882, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.28833008, + "step": 1981, + "time_per_iteration": 2.925529718399048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066146, + "balance_loss_mlp": 1.03775024, + "epoch": 0.3813005001923817, + "flos": 584550798336.0, + "grad_norm": 0.051682910059599525, + "language_loss": 0.86574209, + "learning_rate": 0.0007098177934248242, + "loss": 0.87640351, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.28369141, + "step": 1982, + "time_per_iteration": 2.7813186645507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066502, + "balance_loss_mlp": 1.03770101, + "epoch": 0.38149288187764524, + "flos": 621287649792.0, + "grad_norm": 0.06153978169673806, + "language_loss": 0.85434651, + "learning_rate": 0.0007095349680442661, + "loss": 0.86501151, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.2878418, + "step": 1983, + "time_per_iteration": 2.878678321838379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069354, + "balance_loss_mlp": 1.04062414, + "epoch": 0.3816852635629088, + "flos": 570414196224.0, + "grad_norm": 0.05550499316869274, + "language_loss": 0.78828371, + "learning_rate": 0.0007092520613150188, + "loss": 0.79897726, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.28710938, + "step": 1984, + "time_per_iteration": 2.667602300643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069998, + "balance_loss_mlp": 1.04057729, + "epoch": 0.38187764524817236, + "flos": 565313029632.0, + "grad_norm": 0.04940974411679134, + "language_loss": 0.81105816, + "learning_rate": 0.0007089690733469165, + "loss": 0.82175809, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.29394531, + "step": 1985, + "time_per_iteration": 2.7445921897888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077693, + "balance_loss_mlp": 1.04924965, + "epoch": 0.38207002693343595, + "flos": 630932751360.0, + "grad_norm": 0.0710841944315155, + "language_loss": 0.82154202, + "learning_rate": 0.000708686004249825, + "loss": 0.8323189, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.28442383, + "step": 1986, + "time_per_iteration": 2.803262948989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075438, + "balance_loss_mlp": 1.0459218, + "epoch": 0.3822624086186995, + "flos": 548507980800.0, + "grad_norm": 0.053095768122865476, + "language_loss": 0.91283715, + "learning_rate": 0.0007084028541336413, + "loss": 0.92359161, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.29467773, + "step": 1987, + "time_per_iteration": 2.693894147872925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075901, + "balance_loss_mlp": 1.04807711, + "epoch": 0.38245479030396307, + "flos": 613571880960.0, + "grad_norm": 0.04978295407195845, + "language_loss": 0.86100876, + "learning_rate": 0.0007081196231082942, + "loss": 0.87176782, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.27807617, + "step": 1988, + "time_per_iteration": 2.8127198219299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079083, + "balance_loss_mlp": 1.05097318, + "epoch": 0.38264717198922665, + "flos": 667787466240.0, + "grad_norm": 0.05417702481979702, + "language_loss": 0.80060172, + "learning_rate": 0.0007078363112837436, + "loss": 0.81139255, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.28125, + "step": 1989, + "time_per_iteration": 2.8839027881622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077016, + "balance_loss_mlp": 1.04866838, + "epoch": 0.3828395536744902, + "flos": 454521922560.0, + "grad_norm": 0.05590772319077314, + "language_loss": 0.84895635, + "learning_rate": 0.000707552918769981, + "loss": 0.85972643, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.28344727, + "step": 1990, + "time_per_iteration": 2.4921815395355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075886, + "balance_loss_mlp": 1.04837239, + "epoch": 0.3830319353597538, + "flos": 499191330816.0, + "grad_norm": 0.05219115858491499, + "language_loss": 0.8389315, + "learning_rate": 0.000707269445677029, + "loss": 0.84969032, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.27563477, + "step": 1991, + "time_per_iteration": 2.716742753982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080405, + "balance_loss_mlp": 1.05205727, + "epoch": 0.3832243170450173, + "flos": 743787021312.0, + "grad_norm": 0.061454112768806295, + "language_loss": 0.85369635, + "learning_rate": 0.0007069858921149416, + "loss": 0.8645004, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.28344727, + "step": 1992, + "time_per_iteration": 2.953749418258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077015, + "balance_loss_mlp": 1.04919195, + "epoch": 0.3834166987302809, + "flos": 577937908224.0, + "grad_norm": 0.04324001999537677, + "language_loss": 0.86024761, + "learning_rate": 0.0007067022581938043, + "loss": 0.87101781, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.27880859, + "step": 1993, + "time_per_iteration": 2.818094491958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072064, + "balance_loss_mlp": 1.04502726, + "epoch": 0.3836090804155444, + "flos": 536194432512.0, + "grad_norm": 0.06003802076808944, + "language_loss": 0.83055973, + "learning_rate": 0.0007064185440237334, + "loss": 0.84128034, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.27075195, + "step": 1994, + "time_per_iteration": 2.7304775714874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078237, + "balance_loss_mlp": 1.05043745, + "epoch": 0.383801462100808, + "flos": 601587191808.0, + "grad_norm": 0.054248337050939024, + "language_loss": 0.84367561, + "learning_rate": 0.0007061347497148764, + "loss": 0.85445797, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.27807617, + "step": 1995, + "time_per_iteration": 2.747483015060425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074409, + "balance_loss_mlp": 1.04706264, + "epoch": 0.38399384378607154, + "flos": 572427896832.0, + "grad_norm": 0.06054830939074019, + "language_loss": 0.86660719, + "learning_rate": 0.0007058508753774122, + "loss": 0.87735128, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.27392578, + "step": 1996, + "time_per_iteration": 2.6960108280181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078362, + "balance_loss_mlp": 1.05165958, + "epoch": 0.38418622547133513, + "flos": 536513117184.0, + "grad_norm": 0.05196412840141252, + "language_loss": 0.86974967, + "learning_rate": 0.0007055669211215505, + "loss": 0.88053334, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.26733398, + "step": 1997, + "time_per_iteration": 2.6327381134033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076337, + "balance_loss_mlp": 1.04775071, + "epoch": 0.3843786071565987, + "flos": 572673798144.0, + "grad_norm": 0.06669720231739994, + "language_loss": 0.77213579, + "learning_rate": 0.0007052828870575322, + "loss": 0.78289914, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.28588867, + "step": 1998, + "time_per_iteration": 2.6813313961029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085238, + "balance_loss_mlp": 1.05808222, + "epoch": 0.38457098884186225, + "flos": 728361275904.0, + "grad_norm": 0.053007093293579055, + "language_loss": 0.8636111, + "learning_rate": 0.0007049987732956291, + "loss": 0.87446344, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.27197266, + "step": 1999, + "time_per_iteration": 2.9743165969848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071346, + "balance_loss_mlp": 1.04323626, + "epoch": 0.38476337052712584, + "flos": 583123442688.0, + "grad_norm": 0.046114011394728885, + "language_loss": 0.82846403, + "learning_rate": 0.0007047145799461439, + "loss": 0.83917749, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.28149414, + "step": 2000, + "time_per_iteration": 2.85295033454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077125, + "balance_loss_mlp": 1.0488013, + "epoch": 0.38495575221238937, + "flos": 552787075584.0, + "grad_norm": 0.06118237782788499, + "language_loss": 0.8185212, + "learning_rate": 0.00070443030711941, + "loss": 0.82929248, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.28295898, + "step": 2001, + "time_per_iteration": 2.7602195739746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078319, + "balance_loss_mlp": 1.04918385, + "epoch": 0.38514813389765296, + "flos": 654173190144.0, + "grad_norm": 0.06801983854699947, + "language_loss": 0.82348108, + "learning_rate": 0.0007041459549257924, + "loss": 0.83426422, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.29101562, + "step": 2002, + "time_per_iteration": 2.8562166690826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074316, + "balance_loss_mlp": 1.04565787, + "epoch": 0.3853405155829165, + "flos": 867715158528.0, + "grad_norm": 0.07124544558687326, + "language_loss": 0.7826004, + "learning_rate": 0.0007038615234756859, + "loss": 0.79334354, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.28662109, + "step": 2003, + "time_per_iteration": 3.1888484954833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071637, + "balance_loss_mlp": 1.0429796, + "epoch": 0.3855328972681801, + "flos": 546164011008.0, + "grad_norm": 0.060193135665447615, + "language_loss": 0.83578098, + "learning_rate": 0.000703577012879517, + "loss": 0.8464973, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.28662109, + "step": 2004, + "time_per_iteration": 2.6438684463500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069967, + "balance_loss_mlp": 1.04185688, + "epoch": 0.3857252789534436, + "flos": 533819939328.0, + "grad_norm": 0.05830751128665357, + "language_loss": 0.8852784, + "learning_rate": 0.0007032924232477423, + "loss": 0.89597809, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.28149414, + "step": 2005, + "time_per_iteration": 2.6632285118103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071337, + "balance_loss_mlp": 1.04253602, + "epoch": 0.3859176606387072, + "flos": 491514849792.0, + "grad_norm": 0.05522600702951118, + "language_loss": 0.8025552, + "learning_rate": 0.0007030077546908493, + "loss": 0.81326854, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.28808594, + "step": 2006, + "time_per_iteration": 2.6748647689819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107831, + "balance_loss_mlp": 1.06600749, + "epoch": 0.3861100423239708, + "flos": 1486278955008.0, + "grad_norm": 0.04192005891791234, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84142971, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.12255859, + "step": 2007, + "time_per_iteration": 4.758062124252319 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084632, + "balance_loss_mlp": 1.05614078, + "epoch": 0.3863024240092343, + "flos": 473493441024.0, + "grad_norm": 0.06495221526254255, + "language_loss": 0.79320729, + "learning_rate": 0.0007024381812438117, + "loss": 0.80405354, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.28515625, + "step": 2008, + "time_per_iteration": 2.557239532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095356, + "balance_loss_mlp": 1.06607771, + "epoch": 0.3864948056944979, + "flos": 716258723328.0, + "grad_norm": 0.09570560546772983, + "language_loss": 0.83017313, + "learning_rate": 0.0007021532765747951, + "loss": 0.84112668, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.29248047, + "step": 2009, + "time_per_iteration": 2.984100580215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088616, + "balance_loss_mlp": 1.06031561, + "epoch": 0.38668718737976143, + "flos": 727302067200.0, + "grad_norm": 0.05400711762269546, + "language_loss": 0.78963518, + "learning_rate": 0.0007018682934229162, + "loss": 0.80052131, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.28295898, + "step": 2010, + "time_per_iteration": 2.9302892684936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080883, + "balance_loss_mlp": 1.05220175, + "epoch": 0.386879569065025, + "flos": 525218079744.0, + "grad_norm": 0.05212566321061033, + "language_loss": 0.82523775, + "learning_rate": 0.0007015832318988152, + "loss": 0.83604658, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.28662109, + "step": 2011, + "time_per_iteration": 2.65934157371521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102736, + "balance_loss_mlp": 1.0158205, + "epoch": 0.38707195075028855, + "flos": 1527036005376.0, + "grad_norm": 0.016832038405886617, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74917436, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.11523438, + "step": 2012, + "time_per_iteration": 4.964378595352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076687, + "balance_loss_mlp": 1.04776716, + "epoch": 0.38726433243555214, + "flos": 557045821440.0, + "grad_norm": 0.05730560331399072, + "language_loss": 0.83868068, + "learning_rate": 0.0007010128741766604, + "loss": 0.84944755, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.28857422, + "step": 2013, + "time_per_iteration": 2.7196977138519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069593, + "balance_loss_mlp": 1.04005277, + "epoch": 0.38745671412081567, + "flos": 553431647232.0, + "grad_norm": 0.0608937159393576, + "language_loss": 0.843593, + "learning_rate": 0.0007007275782000391, + "loss": 0.85428894, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.29492188, + "step": 2014, + "time_per_iteration": 2.635704517364502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072342, + "balance_loss_mlp": 1.04351759, + "epoch": 0.38764909580607926, + "flos": 458175384576.0, + "grad_norm": 0.061731808628827385, + "language_loss": 0.84906852, + "learning_rate": 0.0007004422042940605, + "loss": 0.85979199, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.2878418, + "step": 2015, + "time_per_iteration": 2.500502824783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072405, + "balance_loss_mlp": 1.04246008, + "epoch": 0.38784147749134285, + "flos": 521973462528.0, + "grad_norm": 0.06410146749924231, + "language_loss": 0.89413089, + "learning_rate": 0.0007001567525695169, + "loss": 0.90485489, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.29931641, + "step": 2016, + "time_per_iteration": 2.6305129528045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072622, + "balance_loss_mlp": 1.04410672, + "epoch": 0.3880338591766064, + "flos": 665696600064.0, + "grad_norm": 0.057933083917186774, + "language_loss": 0.83612067, + "learning_rate": 0.0006998712231372303, + "loss": 0.84684694, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.28491211, + "step": 2017, + "time_per_iteration": 3.0175724029541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070577, + "balance_loss_mlp": 1.04141831, + "epoch": 0.38822624086186996, + "flos": 593660427264.0, + "grad_norm": 0.04866320553491467, + "language_loss": 0.86211008, + "learning_rate": 0.0006995856161080532, + "loss": 0.87281585, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.29101562, + "step": 2018, + "time_per_iteration": 2.879014015197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071313, + "balance_loss_mlp": 1.04193974, + "epoch": 0.3884186225471335, + "flos": 612256596480.0, + "grad_norm": 0.05910223086818918, + "language_loss": 0.81994784, + "learning_rate": 0.0006992999315928679, + "loss": 0.83066106, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.29345703, + "step": 2019, + "time_per_iteration": 2.794605255126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078638, + "balance_loss_mlp": 1.04826391, + "epoch": 0.3886110042323971, + "flos": 606737820672.0, + "grad_norm": 0.0551019421553566, + "language_loss": 0.86098075, + "learning_rate": 0.0006990141697025871, + "loss": 0.8717671, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.3034668, + "step": 2020, + "time_per_iteration": 2.808492422103882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056045, + "balance_loss_mlp": 1.04388523, + "epoch": 0.3888033859176606, + "flos": 1527289108992.0, + "grad_norm": 0.03291843471702338, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77415681, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.12158203, + "step": 2021, + "time_per_iteration": 4.747381687164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070034, + "balance_loss_mlp": 1.04109025, + "epoch": 0.3889957676029242, + "flos": 692145340416.0, + "grad_norm": 0.0700535467402408, + "language_loss": 0.82436341, + "learning_rate": 0.0006984424142405392, + "loss": 0.83506376, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.28930664, + "step": 2022, + "time_per_iteration": 2.8081154823303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070367, + "balance_loss_mlp": 1.04144704, + "epoch": 0.3891881492881878, + "flos": 514937170944.0, + "grad_norm": 0.06604387927811756, + "language_loss": 0.81889653, + "learning_rate": 0.0006981564208907474, + "loss": 0.82960021, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.2890625, + "step": 2023, + "time_per_iteration": 2.615868091583252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067731, + "balance_loss_mlp": 1.03947854, + "epoch": 0.3893805309734513, + "flos": 628770663936.0, + "grad_norm": 0.05337785231387105, + "language_loss": 0.90169919, + "learning_rate": 0.0006978703506098102, + "loss": 0.91237652, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.2824707, + "step": 2024, + "time_per_iteration": 2.7487242221832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070747, + "balance_loss_mlp": 1.04292357, + "epoch": 0.3895729126587149, + "flos": 543894234624.0, + "grad_norm": 0.05102180718564601, + "language_loss": 0.87631416, + "learning_rate": 0.00069758420350879, + "loss": 0.88702166, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.27832031, + "step": 2025, + "time_per_iteration": 2.6278607845306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066065, + "balance_loss_mlp": 1.03802657, + "epoch": 0.38976529434397844, + "flos": 617987778048.0, + "grad_norm": 0.05496821729843788, + "language_loss": 0.85941356, + "learning_rate": 0.000697297979698779, + "loss": 0.87007421, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.28051758, + "step": 2026, + "time_per_iteration": 2.773711919784546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072256, + "balance_loss_mlp": 1.0449574, + "epoch": 0.38995767602924203, + "flos": 834518287872.0, + "grad_norm": 0.054849440695872026, + "language_loss": 0.83735013, + "learning_rate": 0.0006970116792908992, + "loss": 0.84807271, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.27368164, + "step": 2027, + "time_per_iteration": 3.1274263858795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071715, + "balance_loss_mlp": 1.04348612, + "epoch": 0.39015005771450556, + "flos": 541343651328.0, + "grad_norm": 0.0501662810644282, + "language_loss": 0.80959415, + "learning_rate": 0.000696725302396302, + "loss": 0.82031131, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.28222656, + "step": 2028, + "time_per_iteration": 2.653289318084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078388, + "balance_loss_mlp": 1.050946, + "epoch": 0.39034243939976915, + "flos": 1007102536704.0, + "grad_norm": 0.053195529027894116, + "language_loss": 0.85790342, + "learning_rate": 0.0006964388491261692, + "loss": 0.86868727, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.2746582, + "step": 2029, + "time_per_iteration": 3.2972047328948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082882, + "balance_loss_mlp": 1.0550828, + "epoch": 0.3905348210850327, + "flos": 678723121152.0, + "grad_norm": 0.06114884672927749, + "language_loss": 0.87352717, + "learning_rate": 0.0006961523195917114, + "loss": 0.88435602, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.27832031, + "step": 2030, + "time_per_iteration": 2.8415944576263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083514, + "balance_loss_mlp": 1.0548079, + "epoch": 0.39072720277029627, + "flos": 548606905344.0, + "grad_norm": 0.056999957489140544, + "language_loss": 0.78065526, + "learning_rate": 0.0006958657139041696, + "loss": 0.79149044, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.28686523, + "step": 2031, + "time_per_iteration": 2.750596761703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027529, + "balance_loss_mlp": 1.01660919, + "epoch": 0.39091958445555985, + "flos": 1546912401408.0, + "grad_norm": 0.015090316928766313, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77740502, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.109375, + "step": 2032, + "time_per_iteration": 4.916932106018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080774, + "balance_loss_mlp": 1.05371356, + "epoch": 0.3911119661408234, + "flos": 503741058048.0, + "grad_norm": 0.058882626995900515, + "language_loss": 0.77978921, + "learning_rate": 0.0006952922745149434, + "loss": 0.7905969, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.27099609, + "step": 2033, + "time_per_iteration": 2.6288254261016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076329, + "balance_loss_mlp": 1.04802871, + "epoch": 0.391304347826087, + "flos": 556967245824.0, + "grad_norm": 0.059683993490508125, + "language_loss": 0.8774389, + "learning_rate": 0.000695005441035888, + "loss": 0.88820225, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.28295898, + "step": 2034, + "time_per_iteration": 2.6451032161712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021075, + "balance_loss_mlp": 1.01001287, + "epoch": 0.3914967295113505, + "flos": 1499309858304.0, + "grad_norm": 0.012767183735830537, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74744511, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.11083984, + "step": 2035, + "time_per_iteration": 4.875540018081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081101, + "balance_loss_mlp": 1.05346835, + "epoch": 0.3916891111966141, + "flos": 706715518464.0, + "grad_norm": 0.05871453648610719, + "language_loss": 0.8120997, + "learning_rate": 0.0006944315470656863, + "loss": 0.82291067, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.27685547, + "step": 2036, + "time_per_iteration": 2.9991486072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079422, + "balance_loss_mlp": 1.05193281, + "epoch": 0.3918814928818776, + "flos": 556085537280.0, + "grad_norm": 0.05954449002694624, + "language_loss": 0.90806162, + "learning_rate": 0.000694144486797345, + "loss": 0.91885585, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.27539062, + "step": 2037, + "time_per_iteration": 2.652540445327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016452, + "balance_loss_mlp": 1.00543678, + "epoch": 0.3920738745671412, + "flos": 1537845032448.0, + "grad_norm": 0.010331538207496795, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80536884, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.11035156, + "step": 2038, + "time_per_iteration": 4.696615695953369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077334, + "balance_loss_mlp": 1.04920101, + "epoch": 0.39226625625240474, + "flos": 498594811392.0, + "grad_norm": 0.05886678367995608, + "language_loss": 0.89078939, + "learning_rate": 0.0006935701402514156, + "loss": 0.90156269, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.28149414, + "step": 2039, + "time_per_iteration": 2.555340051651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013561, + "balance_loss_mlp": 1.00254571, + "epoch": 0.39245863793766833, + "flos": 1346465203200.0, + "grad_norm": 0.009976601144167605, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74048454, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.11035156, + "step": 2040, + "time_per_iteration": 4.91499400138855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.04941869, + "epoch": 0.3926510196229319, + "flos": 1345614474240.0, + "grad_norm": 0.0656092448350418, + "language_loss": 0.84421289, + "learning_rate": 0.0006929954931031422, + "loss": 0.8549906, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.28344727, + "step": 2041, + "time_per_iteration": 3.729060649871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079221, + "balance_loss_mlp": 1.0521127, + "epoch": 0.39284340130819545, + "flos": 499333925376.0, + "grad_norm": 0.05672023255092622, + "language_loss": 0.88579351, + "learning_rate": 0.0006927080570819805, + "loss": 0.8965857, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.27148438, + "step": 2042, + "time_per_iteration": 2.5964105129241943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082321, + "balance_loss_mlp": 1.05557048, + "epoch": 0.39303578299345904, + "flos": 520077625344.0, + "grad_norm": 0.07129276434353096, + "language_loss": 0.81115568, + "learning_rate": 0.0006924205462449161, + "loss": 0.82197881, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.26806641, + "step": 2043, + "time_per_iteration": 2.585873603820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080679, + "balance_loss_mlp": 1.0537734, + "epoch": 0.39322816467872257, + "flos": 907529301504.0, + "grad_norm": 0.07610386660927036, + "language_loss": 0.8177464, + "learning_rate": 0.0006921329607035702, + "loss": 0.8285532, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.26940918, + "step": 2044, + "time_per_iteration": 3.238981246948242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087504, + "balance_loss_mlp": 1.0611347, + "epoch": 0.39342054636398616, + "flos": 517330603008.0, + "grad_norm": 0.0570655681013956, + "language_loss": 0.87757248, + "learning_rate": 0.0006918453005695938, + "loss": 0.88844752, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.26416016, + "step": 2045, + "time_per_iteration": 2.6602108478546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091027, + "balance_loss_mlp": 1.06491971, + "epoch": 0.3936129280492497, + "flos": 547646621184.0, + "grad_norm": 0.055879562404771856, + "language_loss": 0.84307766, + "learning_rate": 0.0006915575659546662, + "loss": 0.85398793, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.26147461, + "step": 2046, + "time_per_iteration": 2.6592600345611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091389, + "balance_loss_mlp": 1.06476951, + "epoch": 0.3938053097345133, + "flos": 525858269184.0, + "grad_norm": 0.06494345942268129, + "language_loss": 0.80426449, + "learning_rate": 0.0006912697569704959, + "loss": 0.81517833, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.26623535, + "step": 2047, + "time_per_iteration": 2.613070011138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080678, + "balance_loss_mlp": 1.0539515, + "epoch": 0.39399769141977686, + "flos": 471390990336.0, + "grad_norm": 0.06871552578761372, + "language_loss": 0.86815077, + "learning_rate": 0.0006909818737288205, + "loss": 0.87895757, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.26745605, + "step": 2048, + "time_per_iteration": 2.5862643718719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086213, + "balance_loss_mlp": 1.05919969, + "epoch": 0.3941900731050404, + "flos": 501490220544.0, + "grad_norm": 0.055462609864315775, + "language_loss": 0.80754077, + "learning_rate": 0.000690693916341406, + "loss": 0.81840289, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.27075195, + "step": 2049, + "time_per_iteration": 2.668114185333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010802, + "balance_loss_mlp": 1.0532347, + "epoch": 0.394382454790304, + "flos": 580577241600.0, + "grad_norm": 0.05123788091691057, + "language_loss": 0.8241666, + "learning_rate": 0.0006904058849200475, + "loss": 0.83496863, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.27001953, + "step": 2050, + "time_per_iteration": 2.7161009311676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084281, + "balance_loss_mlp": 1.05679107, + "epoch": 0.3945748364755675, + "flos": 513563659776.0, + "grad_norm": 0.06391064418382593, + "language_loss": 0.84741384, + "learning_rate": 0.0006901177795765683, + "loss": 0.8582567, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.27514648, + "step": 2051, + "time_per_iteration": 2.6012356281280518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082278, + "balance_loss_mlp": 1.05540872, + "epoch": 0.3947672181608311, + "flos": 593683748352.0, + "grad_norm": 0.059538956745971455, + "language_loss": 0.8114661, + "learning_rate": 0.0006898296004228213, + "loss": 0.82228893, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.26879883, + "step": 2052, + "time_per_iteration": 2.739016056060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091682, + "balance_loss_mlp": 1.07909358, + "epoch": 0.39495959984609463, + "flos": 1546829443584.0, + "grad_norm": 0.0435951911950544, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79218423, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.12597656, + "step": 2053, + "time_per_iteration": 4.853093385696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107698, + "balance_loss_mlp": 1.0498004, + "epoch": 0.3951519815313582, + "flos": 496271190528.0, + "grad_norm": 0.061585922129253, + "language_loss": 0.79790258, + "learning_rate": 0.0006892530211320763, + "loss": 0.80867237, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.2722168, + "step": 2054, + "time_per_iteration": 2.695810317993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077935, + "balance_loss_mlp": 1.05135143, + "epoch": 0.39534436321662175, + "flos": 530934704640.0, + "grad_norm": 0.06739666157176663, + "language_loss": 0.83483803, + "learning_rate": 0.000688964621218926, + "loss": 0.84561741, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.26611328, + "step": 2055, + "time_per_iteration": 2.5957767963409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070974, + "balance_loss_mlp": 1.04496288, + "epoch": 0.39553674490188534, + "flos": 702224017920.0, + "grad_norm": 0.05900978816729325, + "language_loss": 0.79760778, + "learning_rate": 0.0006886761479432037, + "loss": 0.80831754, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.26037598, + "step": 2056, + "time_per_iteration": 2.823195457458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075075, + "balance_loss_mlp": 1.0479672, + "epoch": 0.3957291265871489, + "flos": 409552768512.0, + "grad_norm": 0.06325658180551426, + "language_loss": 0.84495139, + "learning_rate": 0.0006883876014169045, + "loss": 0.85570216, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.27148438, + "step": 2057, + "time_per_iteration": 2.504899263381958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_mlp": 1.05080771, + "epoch": 0.39592150827241246, + "flos": 618204566016.0, + "grad_norm": 0.05952155235087993, + "language_loss": 0.90666497, + "learning_rate": 0.000688098981752052, + "loss": 0.91744673, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.27441406, + "step": 2058, + "time_per_iteration": 2.705845832824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079753, + "balance_loss_mlp": 1.05207229, + "epoch": 0.39611388995767605, + "flos": 820986969600.0, + "grad_norm": 0.057037005783434964, + "language_loss": 0.80068249, + "learning_rate": 0.0006878102890606982, + "loss": 0.81147999, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.27709961, + "step": 2059, + "time_per_iteration": 3.086745500564575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108134, + "balance_loss_mlp": 1.0542556, + "epoch": 0.3963062716429396, + "flos": 491977539072.0, + "grad_norm": 0.07822530462482143, + "language_loss": 0.80866635, + "learning_rate": 0.0006875215234549239, + "loss": 0.8194797, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.27124023, + "step": 2060, + "time_per_iteration": 2.5814599990844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080083, + "balance_loss_mlp": 1.05221188, + "epoch": 0.39649865332820317, + "flos": 584466430464.0, + "grad_norm": 0.06673254145899743, + "language_loss": 0.85142004, + "learning_rate": 0.0006872326850468376, + "loss": 0.86222088, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.27880859, + "step": 2061, + "time_per_iteration": 2.6693742275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081472, + "balance_loss_mlp": 1.05343366, + "epoch": 0.3966910350134667, + "flos": 458328153600.0, + "grad_norm": 0.06184749895138045, + "language_loss": 0.78875667, + "learning_rate": 0.0006869437739485762, + "loss": 0.79957139, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.28051758, + "step": 2062, + "time_per_iteration": 2.612020969390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108316, + "balance_loss_mlp": 1.05493176, + "epoch": 0.3968834166987303, + "flos": 508388299776.0, + "grad_norm": 0.07174128592683177, + "language_loss": 0.92295337, + "learning_rate": 0.0006866547902723053, + "loss": 0.93378496, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.2824707, + "step": 2063, + "time_per_iteration": 2.676013469696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108135, + "balance_loss_mlp": 1.05300224, + "epoch": 0.3970757983839938, + "flos": 572349321216.0, + "grad_norm": 0.05898261192449876, + "language_loss": 0.80094039, + "learning_rate": 0.000686365734130218, + "loss": 0.81175387, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.28369141, + "step": 2064, + "time_per_iteration": 2.7021024227142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071448, + "balance_loss_mlp": 1.0426228, + "epoch": 0.3972681800692574, + "flos": 481391092224.0, + "grad_norm": 0.09101918864834832, + "language_loss": 0.83948302, + "learning_rate": 0.000686076605634536, + "loss": 0.85019755, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.28808594, + "step": 2065, + "time_per_iteration": 2.6558356285095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068247, + "balance_loss_mlp": 1.03963661, + "epoch": 0.397460561754521, + "flos": 487683887616.0, + "grad_norm": 0.05840936356543045, + "language_loss": 0.83999312, + "learning_rate": 0.0006857874048975088, + "loss": 0.85067558, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.28613281, + "step": 2066, + "time_per_iteration": 2.556900978088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068316, + "balance_loss_mlp": 1.04027796, + "epoch": 0.3976529434397845, + "flos": 421768802304.0, + "grad_norm": 0.07585091480167282, + "language_loss": 0.87176585, + "learning_rate": 0.0006854981320314142, + "loss": 0.88244903, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.28027344, + "step": 2067, + "time_per_iteration": 2.445798635482788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107237, + "balance_loss_mlp": 1.04426003, + "epoch": 0.3978453251250481, + "flos": 545331764736.0, + "grad_norm": 0.08763476788371415, + "language_loss": 0.86982906, + "learning_rate": 0.0006852087871485579, + "loss": 0.88055265, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.28125, + "step": 2068, + "time_per_iteration": 2.6390161514282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076434, + "balance_loss_mlp": 1.04861069, + "epoch": 0.39803770681031164, + "flos": 650548841472.0, + "grad_norm": 0.065510260101048, + "language_loss": 0.82088625, + "learning_rate": 0.0006849193703612735, + "loss": 0.83165061, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.27856445, + "step": 2069, + "time_per_iteration": 2.763023614883423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071, + "balance_loss_mlp": 1.04346275, + "epoch": 0.39823008849557523, + "flos": 739734888960.0, + "grad_norm": 0.058439166966186944, + "language_loss": 0.77565378, + "learning_rate": 0.0006846298817819225, + "loss": 0.78636372, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.27563477, + "step": 2070, + "time_per_iteration": 2.948054790496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070331, + "balance_loss_mlp": 1.04296088, + "epoch": 0.39842247018083876, + "flos": 384825337344.0, + "grad_norm": 0.06370866866163034, + "language_loss": 0.80921137, + "learning_rate": 0.0006843403215228945, + "loss": 0.8199147, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.27392578, + "step": 2071, + "time_per_iteration": 2.440274953842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075017, + "balance_loss_mlp": 1.04771829, + "epoch": 0.39861485186610235, + "flos": 533431443456.0, + "grad_norm": 0.05754797735781241, + "language_loss": 0.80491692, + "learning_rate": 0.0006840506896966065, + "loss": 0.81566709, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.2734375, + "step": 2072, + "time_per_iteration": 2.7141849994659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076402, + "balance_loss_mlp": 1.04874492, + "epoch": 0.39880723355136594, + "flos": 642834482688.0, + "grad_norm": 0.06436648215160112, + "language_loss": 0.82351565, + "learning_rate": 0.0006837609864155038, + "loss": 0.83427966, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.27685547, + "step": 2073, + "time_per_iteration": 2.8728160858154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107952, + "balance_loss_mlp": 1.05267441, + "epoch": 0.39899961523662947, + "flos": 515587534848.0, + "grad_norm": 0.06075069456973031, + "language_loss": 0.83255166, + "learning_rate": 0.0006834712117920592, + "loss": 0.84334683, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.26855469, + "step": 2074, + "time_per_iteration": 2.6078460216522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081959, + "balance_loss_mlp": 1.05458879, + "epoch": 0.39919199692189306, + "flos": 464148085248.0, + "grad_norm": 0.08105254072349301, + "language_loss": 0.85028476, + "learning_rate": 0.0006831813659387729, + "loss": 0.86110437, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.27416992, + "step": 2075, + "time_per_iteration": 2.5435502529144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080066, + "balance_loss_mlp": 1.05236197, + "epoch": 0.3993843786071566, + "flos": 531382837248.0, + "grad_norm": 0.05543733258884828, + "language_loss": 0.84105802, + "learning_rate": 0.0006828914489681733, + "loss": 0.85185862, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.27758789, + "step": 2076, + "time_per_iteration": 2.716728687286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079901, + "balance_loss_mlp": 1.05186319, + "epoch": 0.3995767602924202, + "flos": 503701770240.0, + "grad_norm": 0.05894989539880716, + "language_loss": 0.8515023, + "learning_rate": 0.0006826014609928162, + "loss": 0.86230129, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.28027344, + "step": 2077, + "time_per_iteration": 2.740797996520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036252, + "balance_loss_mlp": 1.02490366, + "epoch": 0.3997691419776837, + "flos": 1453780500480.0, + "grad_norm": 0.025465037646940157, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84235638, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.11328125, + "step": 2078, + "time_per_iteration": 4.832703590393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080719, + "balance_loss_mlp": 1.05287147, + "epoch": 0.3999615236629473, + "flos": 530418170880.0, + "grad_norm": 0.11662193334808049, + "language_loss": 0.8017869, + "learning_rate": 0.0006820212724781896, + "loss": 0.81259406, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.27880859, + "step": 2079, + "time_per_iteration": 2.6742663383483887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076717, + "balance_loss_mlp": 1.0488224, + "epoch": 0.4001539053482108, + "flos": 694823961600.0, + "grad_norm": 0.08177152300224107, + "language_loss": 0.83806193, + "learning_rate": 0.0006817310721641694, + "loss": 0.84882903, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.27905273, + "step": 2080, + "time_per_iteration": 2.8349008560180664 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 173365568, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4716113890902016.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/training_args.bin b/sft_pretrain/Full_smoe_share/checkpoint-2080/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2c6286920da78be894d16b2c1ec77f899cd590e0 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b25bd416aaf59aaeb5c9268446dadaf85f4d00dfc3ac3dfec454141b47f814d1 +size 7992 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-2080/zero_to_fp32.py b/sft_pretrain/Full_smoe_share/checkpoint-2080/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-2080/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/added_tokens.json b/sft_pretrain/Full_smoe_share/checkpoint-3120/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/config.json b/sft_pretrain/Full_smoe_share/checkpoint-3120/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bb9b0c4407eef6bd7d8c22453f95c43fd6ef0981 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_share", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/generation_config.json b/sft_pretrain/Full_smoe_share/checkpoint-3120/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3de0b432e9170561d0a11b2fba3b8d6465ae7f5 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ae6e8cc4a6de0886b95661a75d9c10df15f358a4e7cebc0296c3b6049941f52 +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6874c6f76958e15f6d327576fc6b4f179853dc5 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:396a9767f14871bbb167d913cd8f547b63e3d48f13017a5dfb29467b96a963ba +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0c619a6182b94b49c79783d3bf4ea7fa46fc846 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7f437b40c72488ed444ff02c0eb64e4a0a7a8a2ad73196abcf1a42b42677f88 +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..707a7820144f082fcb1459bb638661311026a146 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:457b25363facf47ef7c6150146b8df67ea639e1c24b532edc28b7d9e3074b3b7 +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..748243ca6702afc48f4aba1bf247b1ed4593d396 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b980a2682d912eeb4bc1425e2695aaff0c01a401df2f8a39e3024802191c146 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c4651dadf863537eced63f65260454a4e0c9a14 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f48fcf007aea2e8bf4c61736c9e5273e3c3021ef2ab632d3732c5e7152125942 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bbce7db42afac8077feef21bb0ab610fc9fea075 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e13f9cc1bcc98d732f0fa4e95323724a293fcdff8c08c4235192cc11705b7d32 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..46f9507e20843ca44090df67a0a55a3f0733b42c --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/global_step3120/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a243b3a22b87dcdc8fb64b738bb9a59d44977b47783ce757494e59956988b2b7 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/latest b/sft_pretrain/Full_smoe_share/checkpoint-3120/latest new file mode 100644 index 0000000000000000000000000000000000000000..804da059f781bacb3f274fb2103e4bc7f9bb7407 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/latest @@ -0,0 +1 @@ +global_step3120 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_share/checkpoint-3120/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_share/checkpoint-3120/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8836b2158909eb280c99bdecaed07115ecce089a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3f8c081fed22c52dfa3e42d23c252b8796141caae9f4cbb4a082f3f7c34f7dc +size 3759020544 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/model.safetensors.index.json b/sft_pretrain/Full_smoe_share/checkpoint-3120/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..90a56cf0153ed9062ff35ee2b372f7ab9e796c6a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731420128 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/rng_state_0.pth b/sft_pretrain/Full_smoe_share/checkpoint-3120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/rng_state_1.pth b/sft_pretrain/Full_smoe_share/checkpoint-3120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/rng_state_2.pth b/sft_pretrain/Full_smoe_share/checkpoint-3120/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/rng_state_3.pth b/sft_pretrain/Full_smoe_share/checkpoint-3120/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/special_tokens_map.json b/sft_pretrain/Full_smoe_share/checkpoint-3120/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/tokenizer.model b/sft_pretrain/Full_smoe_share/checkpoint-3120/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/tokenizer_config.json b/sft_pretrain/Full_smoe_share/checkpoint-3120/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/trainer_state.json b/sft_pretrain/Full_smoe_share/checkpoint-3120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e7c8c5a85de7d6bc8681edc631682abe66f0f05e --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/trainer_state.json @@ -0,0 +1,46833 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6002308580223162, + "eval_steps": 500, + "global_step": 3120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03958175, + "balance_loss_mlp": 3.00755191, + "epoch": 0.00019238168526356292, + "flos": 470464353792.0, + "grad_norm": 28.914608756113072, + "language_loss": 3.87018156, + "learning_rate": 0.0, + "loss": 2.58113432, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 9.5, + "step": 1, + "time_per_iteration": 23.802019834518433 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01915335, + "balance_loss_mlp": 1.25005209, + "epoch": 0.00038476337052712584, + "flos": 504311436288.0, + "grad_norm": 4.8593923560988435, + "language_loss": 2.35405588, + "learning_rate": 0.00013726078121135892, + "loss": 2.37320924, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.65625, + "step": 2, + "time_per_iteration": 2.8532886505126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01920846, + "balance_loss_mlp": 1.25708926, + "epoch": 0.0005771450557906887, + "flos": 598869282816.0, + "grad_norm": 3.0028031994213777, + "language_loss": 1.96315837, + "learning_rate": 0.00021755319103969496, + "loss": 1.9823668, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.640625, + "step": 3, + "time_per_iteration": 2.841437339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01900548, + "balance_loss_mlp": 1.26196778, + "epoch": 0.0007695267410542517, + "flos": 580133491200.0, + "grad_norm": 1.731178632358193, + "language_loss": 1.51703906, + "learning_rate": 0.00027452156242271784, + "loss": 1.53604448, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.3828125, + "step": 4, + "time_per_iteration": 2.7456114292144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01846218, + "balance_loss_mlp": 1.25188851, + "epoch": 0.0009619084263178145, + "flos": 485857861632.0, + "grad_norm": 2.5417144067747603, + "language_loss": 1.52625787, + "learning_rate": 0.0003187096642208417, + "loss": 1.54472005, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 5.93359375, + "step": 5, + "time_per_iteration": 2.6199026107788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0183984, + "balance_loss_mlp": 1.27068734, + "epoch": 0.0011542901115813775, + "flos": 559744791552.0, + "grad_norm": 1.334824335042464, + "language_loss": 1.40782702, + "learning_rate": 0.0003548139722510539, + "loss": 1.42622542, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 5.69921875, + "step": 6, + "time_per_iteration": 2.747270107269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0199186, + "balance_loss_mlp": 1.44254375, + "epoch": 0.0013466717968449403, + "flos": 533721014784.0, + "grad_norm": 1.092177996343933, + "language_loss": 1.36706996, + "learning_rate": 0.00038533972973918044, + "loss": 1.38698864, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.5, + "step": 7, + "time_per_iteration": 2.6748878955841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02260733, + "balance_loss_mlp": 1.72209811, + "epoch": 0.0015390534821085034, + "flos": 492037175808.0, + "grad_norm": 0.8384078813871362, + "language_loss": 1.30779457, + "learning_rate": 0.0004117823436340768, + "loss": 1.3304019, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.390625, + "step": 8, + "time_per_iteration": 2.6130077838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02549259, + "balance_loss_mlp": 2.01024222, + "epoch": 0.0017314351673720662, + "flos": 564402207744.0, + "grad_norm": 0.9225645938984937, + "language_loss": 1.40127456, + "learning_rate": 0.00043510638207938993, + "loss": 1.42676711, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.39453125, + "step": 9, + "time_per_iteration": 2.8516194820404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02769124, + "balance_loss_mlp": 2.22057033, + "epoch": 0.001923816852635629, + "flos": 593132308992.0, + "grad_norm": 2.3673640139094667, + "language_loss": 1.25222194, + "learning_rate": 0.00045597044543220066, + "loss": 1.27991319, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.4921875, + "step": 10, + "time_per_iteration": 2.6775431632995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02889683, + "balance_loss_mlp": 2.31366348, + "epoch": 0.002116198537899192, + "flos": 609308752896.0, + "grad_norm": 3.9279002976271125, + "language_loss": 1.24874163, + "learning_rate": 0.00047484428652143135, + "loss": 1.27763844, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 5.765625, + "step": 11, + "time_per_iteration": 2.978304386138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0309849, + "balance_loss_mlp": 2.49538684, + "epoch": 0.002308580223162755, + "flos": 544869075456.0, + "grad_norm": 1.4997276509751025, + "language_loss": 1.30425894, + "learning_rate": 0.0004920747534624128, + "loss": 1.33524382, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 6.01953125, + "step": 12, + "time_per_iteration": 2.660757064819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0325611, + "balance_loss_mlp": 2.63698483, + "epoch": 0.002500961908426318, + "flos": 644458277376.0, + "grad_norm": 0.27573519674031227, + "language_loss": 1.29333067, + "learning_rate": 0.0005079252465375872, + "loss": 1.32589173, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 6.1875, + "step": 13, + "time_per_iteration": 2.905634880065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03517619, + "balance_loss_mlp": 2.87789392, + "epoch": 0.0026933435936898806, + "flos": 487605312000.0, + "grad_norm": 0.5949349515444387, + "language_loss": 1.16881835, + "learning_rate": 0.0005226005109505393, + "loss": 1.20399451, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 6.39453125, + "step": 14, + "time_per_iteration": 2.6116466522216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03647219, + "balance_loss_mlp": 2.99872088, + "epoch": 0.0028857252789534437, + "flos": 434368949760.0, + "grad_norm": 0.7718254129229014, + "language_loss": 1.22867727, + "learning_rate": 0.0005362628552605367, + "loss": 1.26514947, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 6.484375, + "step": 15, + "time_per_iteration": 2.80147123336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03485084, + "balance_loss_mlp": 2.81407928, + "epoch": 0.0030781069642170067, + "flos": 596465676288.0, + "grad_norm": 0.7401604798059911, + "language_loss": 1.27103257, + "learning_rate": 0.0005490431248454357, + "loss": 1.30588341, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 6.71484375, + "step": 16, + "time_per_iteration": 2.72638201713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03091961, + "balance_loss_mlp": 2.46329856, + "epoch": 0.0032704886494805694, + "flos": 1537360432128.0, + "grad_norm": 0.30683115050750837, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78797078, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 6.28125, + "step": 17, + "time_per_iteration": 6.094223260879517 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03189654, + "balance_loss_mlp": 2.50453377, + "epoch": 0.0034628703347441324, + "flos": 473720403456.0, + "grad_norm": 0.3045463524910074, + "language_loss": 1.13145232, + "learning_rate": 0.0005723671632907488, + "loss": 1.16334891, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 6.859375, + "step": 18, + "time_per_iteration": 2.6759910583496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03092663, + "balance_loss_mlp": 2.39648056, + "epoch": 0.0036552520200076955, + "flos": 448303320576.0, + "grad_norm": 0.23602477180386344, + "language_loss": 1.18155861, + "learning_rate": 0.0005830738490244919, + "loss": 1.21248519, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 6.97265625, + "step": 19, + "time_per_iteration": 2.505410671234131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03039888, + "balance_loss_mlp": 2.32653999, + "epoch": 0.003847633705271258, + "flos": 635881148928.0, + "grad_norm": 0.24009706761990102, + "language_loss": 1.19359791, + "learning_rate": 0.0005932312266435596, + "loss": 1.22399676, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 7.12890625, + "step": 20, + "time_per_iteration": 2.78657603263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03040938, + "balance_loss_mlp": 2.32339382, + "epoch": 0.004040015390534821, + "flos": 589222771200.0, + "grad_norm": 0.17079239690828452, + "language_loss": 1.14516783, + "learning_rate": 0.0006028929207788754, + "loss": 1.17557728, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 7.171875, + "step": 21, + "time_per_iteration": 2.7249202728271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03095818, + "balance_loss_mlp": 2.35843754, + "epoch": 0.004232397075798384, + "flos": 756253338624.0, + "grad_norm": 0.14242736472953105, + "language_loss": 1.17636526, + "learning_rate": 0.0006121050677327902, + "loss": 1.20732355, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 7.3671875, + "step": 22, + "time_per_iteration": 2.8816165924072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03158898, + "balance_loss_mlp": 2.41388798, + "epoch": 0.004424778761061947, + "flos": 526434439680.0, + "grad_norm": 0.2087285570273359, + "language_loss": 1.07450879, + "learning_rate": 0.0006209076479463684, + "loss": 1.10609782, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 7.44140625, + "step": 23, + "time_per_iteration": 2.6234865188598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03183939, + "balance_loss_mlp": 2.43282533, + "epoch": 0.00461716044632551, + "flos": 547907079168.0, + "grad_norm": 0.1648031444861348, + "language_loss": 1.17208815, + "learning_rate": 0.0006293355346737718, + "loss": 1.20392752, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 7.50390625, + "step": 24, + "time_per_iteration": 2.6747982501983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03230874, + "balance_loss_mlp": 2.47976065, + "epoch": 0.004809542131589073, + "flos": 567293234688.0, + "grad_norm": 0.19727819873357916, + "language_loss": 1.13454294, + "learning_rate": 0.0006374193284416834, + "loss": 1.16685176, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 7.5078125, + "step": 25, + "time_per_iteration": 2.778822898864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0329228, + "balance_loss_mlp": 2.5568068, + "epoch": 0.005001923816852636, + "flos": 470391418368.0, + "grad_norm": 0.1350276315355779, + "language_loss": 1.11706781, + "learning_rate": 0.0006451860277489461, + "loss": 1.14999056, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 7.34765625, + "step": 26, + "time_per_iteration": 2.595344305038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03271905, + "balance_loss_mlp": 2.55016398, + "epoch": 0.005194305502116198, + "flos": 415283950080.0, + "grad_norm": 0.16347516382600882, + "language_loss": 1.19968891, + "learning_rate": 0.0006526595731190848, + "loss": 1.23240781, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 7.21484375, + "step": 27, + "time_per_iteration": 2.4664127826690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03288089, + "balance_loss_mlp": 2.59610367, + "epoch": 0.005386687187379761, + "flos": 628466535936.0, + "grad_norm": 0.1428829159478278, + "language_loss": 1.13108253, + "learning_rate": 0.0006598612921618983, + "loss": 1.16396332, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 6.92578125, + "step": 28, + "time_per_iteration": 2.804295778274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03294075, + "balance_loss_mlp": 2.62612176, + "epoch": 0.005579068872643324, + "flos": 886100332032.0, + "grad_norm": 0.20851883498814452, + "language_loss": 1.0600431, + "learning_rate": 0.0006668102665011454, + "loss": 1.09298372, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 6.68359375, + "step": 29, + "time_per_iteration": 3.255702495574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03199031, + "balance_loss_mlp": 2.59096837, + "epoch": 0.005771450557906887, + "flos": 547287238656.0, + "grad_norm": 0.2979528071454863, + "language_loss": 1.15479767, + "learning_rate": 0.0006735236364718957, + "loss": 1.18678796, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 6.0703125, + "step": 30, + "time_per_iteration": 2.7074596881866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03173184, + "balance_loss_mlp": 2.61356831, + "epoch": 0.00596383224317045, + "flos": 531766950912.0, + "grad_norm": 0.19339065750569648, + "language_loss": 1.13838637, + "learning_rate": 0.0006800168558381346, + "loss": 1.17011821, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 5.60546875, + "step": 31, + "time_per_iteration": 2.6867663860321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03044372, + "balance_loss_mlp": 2.54197669, + "epoch": 0.0061562139284340135, + "flos": 588813926400.0, + "grad_norm": 0.19192711986346297, + "language_loss": 1.17224455, + "learning_rate": 0.0006863039060567947, + "loss": 1.20268822, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 5.01953125, + "step": 32, + "time_per_iteration": 2.7029900550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02954172, + "balance_loss_mlp": 2.48954153, + "epoch": 0.006348595613697576, + "flos": 617929551360.0, + "grad_norm": 0.18120318877382763, + "language_loss": 1.09236336, + "learning_rate": 0.0006923974775611263, + "loss": 1.12190521, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 4.640625, + "step": 33, + "time_per_iteration": 2.7966651916503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02822322, + "balance_loss_mlp": 2.40728283, + "epoch": 0.006540977298961139, + "flos": 777564444672.0, + "grad_norm": 0.145871801521796, + "language_loss": 1.05915022, + "learning_rate": 0.0006983091239737814, + "loss": 1.0873735, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 4.15625, + "step": 34, + "time_per_iteration": 2.9987330436706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02690136, + "balance_loss_mlp": 2.31496024, + "epoch": 0.006733358984224702, + "flos": 666837356544.0, + "grad_norm": 0.3134152992972928, + "language_loss": 1.04935622, + "learning_rate": 0.0007040493939600222, + "loss": 1.07625759, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 3.75, + "step": 35, + "time_per_iteration": 2.8552193641662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02568493, + "balance_loss_mlp": 2.22154617, + "epoch": 0.006925740669488265, + "flos": 564092287488.0, + "grad_norm": 0.17701612022333574, + "language_loss": 1.05792356, + "learning_rate": 0.0007096279445021078, + "loss": 1.08360851, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 3.47070312, + "step": 36, + "time_per_iteration": 2.7224435806274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02489254, + "balance_loss_mlp": 2.16557646, + "epoch": 0.007118122354751828, + "flos": 549583156224.0, + "grad_norm": 0.13856321956275922, + "language_loss": 1.12953377, + "learning_rate": 0.0007150536386503726, + "loss": 1.15442634, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 3.23632812, + "step": 37, + "time_per_iteration": 2.868824005126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02371099, + "balance_loss_mlp": 2.08385229, + "epoch": 0.007310504040015391, + "flos": 702161409024.0, + "grad_norm": 0.1045684718913455, + "language_loss": 1.04885924, + "learning_rate": 0.0007203346302358509, + "loss": 1.0725702, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 2.87304688, + "step": 38, + "time_per_iteration": 2.9964613914489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.022844, + "balance_loss_mlp": 2.01431966, + "epoch": 0.007502885725278953, + "flos": 599022051840.0, + "grad_norm": 0.11457879899925279, + "language_loss": 1.09371829, + "learning_rate": 0.000725478437577282, + "loss": 1.11656225, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 2.703125, + "step": 39, + "time_per_iteration": 2.7697911262512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02209938, + "balance_loss_mlp": 1.9577868, + "epoch": 0.007695267410542516, + "flos": 560000867328.0, + "grad_norm": 0.09741634912607965, + "language_loss": 1.05106318, + "learning_rate": 0.0007304920078549186, + "loss": 1.07316256, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 2.51953125, + "step": 40, + "time_per_iteration": 2.6858811378479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02127988, + "balance_loss_mlp": 1.89738917, + "epoch": 0.007887649095806078, + "flos": 507906671616.0, + "grad_norm": 0.1027173821952558, + "language_loss": 1.0668, + "learning_rate": 0.0007353817735343603, + "loss": 1.08807993, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 2.30273438, + "step": 41, + "time_per_iteration": 2.7466464042663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0203117, + "balance_loss_mlp": 1.82136178, + "epoch": 0.008080030781069641, + "flos": 503642133504.0, + "grad_norm": 0.13433083641106106, + "language_loss": 1.02085233, + "learning_rate": 0.0007401537019902344, + "loss": 1.04116416, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 2.10058594, + "step": 42, + "time_per_iteration": 2.6472368240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01955875, + "balance_loss_mlp": 1.77000403, + "epoch": 0.008272412466333205, + "flos": 517764178944.0, + "grad_norm": 0.1211736659455407, + "language_loss": 1.05737603, + "learning_rate": 0.0007448133392900729, + "loss": 1.07693481, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 1.85742188, + "step": 43, + "time_per_iteration": 2.716550588607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01955604, + "balance_loss_mlp": 1.78737581, + "epoch": 0.008464794151596768, + "flos": 607673373696.0, + "grad_norm": 0.16872872054008078, + "language_loss": 1.01187599, + "learning_rate": 0.0007493658489441491, + "loss": 1.03143215, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 1.68261719, + "step": 44, + "time_per_iteration": 2.875014066696167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01891991, + "balance_loss_mlp": 1.7426461, + "epoch": 0.00865717583686033, + "flos": 537661075968.0, + "grad_norm": 0.13908928982797317, + "language_loss": 1.04866791, + "learning_rate": 0.0007538160463002316, + "loss": 1.06758785, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 1.4921875, + "step": 45, + "time_per_iteration": 2.6579010486602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01770341, + "balance_loss_mlp": 1.64674437, + "epoch": 0.008849557522123894, + "flos": 507758284800.0, + "grad_norm": 0.10189568444589565, + "language_loss": 1.07831812, + "learning_rate": 0.0007581684291577274, + "loss": 1.09602141, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 1.234375, + "step": 46, + "time_per_iteration": 2.640967845916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01721967, + "balance_loss_mlp": 1.61086416, + "epoch": 0.009041939207387457, + "flos": 625048800768.0, + "grad_norm": 0.13316435244960997, + "language_loss": 1.10805786, + "learning_rate": 0.0007624272050891776, + "loss": 1.12527752, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 1.11230469, + "step": 47, + "time_per_iteration": 2.8335459232330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01578117, + "balance_loss_mlp": 1.4876132, + "epoch": 0.00923432089265102, + "flos": 549124849152.0, + "grad_norm": 0.11283146306838601, + "language_loss": 1.0112282, + "learning_rate": 0.0007665963158851307, + "loss": 1.02700949, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.90478516, + "step": 48, + "time_per_iteration": 2.8267853260040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01494271, + "balance_loss_mlp": 1.41659403, + "epoch": 0.009426702577914583, + "flos": 562202242560.0, + "grad_norm": 0.11438710989386189, + "language_loss": 1.09804726, + "learning_rate": 0.0007706794594783609, + "loss": 1.11299002, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.77587891, + "step": 49, + "time_per_iteration": 2.767359495162964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450716, + "balance_loss_mlp": 1.37876153, + "epoch": 0.009619084263178146, + "flos": 616486228992.0, + "grad_norm": 0.12814906604020712, + "language_loss": 1.08643568, + "learning_rate": 0.0007746801096530423, + "loss": 1.10094285, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.71972656, + "step": 50, + "time_per_iteration": 2.8213155269622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0143922, + "balance_loss_mlp": 1.37599134, + "epoch": 0.009811465948441709, + "flos": 541176325632.0, + "grad_norm": 0.19317362931311696, + "language_loss": 1.13336241, + "learning_rate": 0.0007786015338021173, + "loss": 1.14775467, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.63183594, + "step": 51, + "time_per_iteration": 2.670414924621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01421394, + "balance_loss_mlp": 1.36116982, + "epoch": 0.010003847633705272, + "flos": 535608087552.0, + "grad_norm": 0.10636608126159033, + "language_loss": 1.06046486, + "learning_rate": 0.0007824468089603051, + "loss": 1.0746789, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.60205078, + "step": 52, + "time_per_iteration": 2.650749683380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01398771, + "balance_loss_mlp": 1.34627175, + "epoch": 0.010196229318968833, + "flos": 908867907072.0, + "grad_norm": 0.08734537144859746, + "language_loss": 1.05057502, + "learning_rate": 0.0007862188363098669, + "loss": 1.0645628, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.52587891, + "step": 53, + "time_per_iteration": 3.1914114952087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339892, + "balance_loss_mlp": 1.29123116, + "epoch": 0.010388611004232396, + "flos": 585594040320.0, + "grad_norm": 0.12892942806844523, + "language_loss": 1.05977488, + "learning_rate": 0.0007899203543304438, + "loss": 1.07317376, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.48608398, + "step": 54, + "time_per_iteration": 2.7370150089263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129116, + "balance_loss_mlp": 1.24609876, + "epoch": 0.01058099268949596, + "flos": 502233716736.0, + "grad_norm": 0.10351520483586135, + "language_loss": 1.19524932, + "learning_rate": 0.0007935539507422731, + "loss": 1.20816088, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.45068359, + "step": 55, + "time_per_iteration": 2.5938258171081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241218, + "balance_loss_mlp": 1.19842196, + "epoch": 0.010773374374759523, + "flos": 544170659328.0, + "grad_norm": 0.14579553174668378, + "language_loss": 1.11398613, + "learning_rate": 0.0007971220733732573, + "loss": 1.12639832, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.42822266, + "step": 56, + "time_per_iteration": 2.69441556930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214647, + "balance_loss_mlp": 1.1754272, + "epoch": 0.010965756060023086, + "flos": 525874235904.0, + "grad_norm": 0.08690334212617827, + "language_loss": 1.05753016, + "learning_rate": 0.0008006270400641869, + "loss": 1.06967664, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.39208984, + "step": 57, + "time_per_iteration": 2.72200345993042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172174, + "balance_loss_mlp": 1.13638771, + "epoch": 0.011158137745286649, + "flos": 576653147136.0, + "grad_norm": 0.1589230608581115, + "language_loss": 1.07195449, + "learning_rate": 0.0008040710477125043, + "loss": 1.08367622, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.35791016, + "step": 58, + "time_per_iteration": 2.7268636226654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116856, + "balance_loss_mlp": 1.13193893, + "epoch": 0.011350519430550212, + "flos": 529024310784.0, + "grad_norm": 0.10215076611006164, + "language_loss": 1.07557666, + "learning_rate": 0.0008074561805429771, + "loss": 1.08726227, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.36645508, + "step": 59, + "time_per_iteration": 2.6336522102355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116508, + "balance_loss_mlp": 1.13067603, + "epoch": 0.011542901115813775, + "flos": 555608291328.0, + "grad_norm": 0.1141641229712409, + "language_loss": 1.06040812, + "learning_rate": 0.0008107844176832545, + "loss": 1.07205892, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.34399414, + "step": 60, + "time_per_iteration": 2.6922121047973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181661, + "balance_loss_mlp": 1.14883125, + "epoch": 0.011735282801077338, + "flos": 571826995200.0, + "grad_norm": 0.13546354224487772, + "language_loss": 1.07509732, + "learning_rate": 0.0008140576401132568, + "loss": 1.08691382, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.32836914, + "step": 61, + "time_per_iteration": 2.632707357406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183098, + "balance_loss_mlp": 1.15415382, + "epoch": 0.0119276644863409, + "flos": 615309156864.0, + "grad_norm": 0.21921646489667587, + "language_loss": 1.08552384, + "learning_rate": 0.0008172776370494935, + "loss": 1.09735489, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.28955078, + "step": 62, + "time_per_iteration": 2.736295700073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169478, + "balance_loss_mlp": 1.14103436, + "epoch": 0.012120046171604464, + "flos": 500835474432.0, + "grad_norm": 0.08851801033761798, + "language_loss": 1.15278125, + "learning_rate": 0.0008204461118185703, + "loss": 1.16447616, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.28417969, + "step": 63, + "time_per_iteration": 2.6189370155334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164028, + "balance_loss_mlp": 1.13801682, + "epoch": 0.012312427856868027, + "flos": 473109327360.0, + "grad_norm": 0.09949063345381139, + "language_loss": 1.0443747, + "learning_rate": 0.0008235646872681536, + "loss": 1.05601501, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.26025391, + "step": 64, + "time_per_iteration": 2.5901291370391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163729, + "balance_loss_mlp": 1.13857555, + "epoch": 0.012504809542131588, + "flos": 538094651904.0, + "grad_norm": 0.13431360680602436, + "language_loss": 1.04092753, + "learning_rate": 0.0008266349107584288, + "loss": 1.05256474, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.25146484, + "step": 65, + "time_per_iteration": 2.6860554218292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162931, + "balance_loss_mlp": 1.13891053, + "epoch": 0.012697191227395151, + "flos": 608450365440.0, + "grad_norm": 0.1102068865315058, + "language_loss": 1.07257366, + "learning_rate": 0.0008296582587724851, + "loss": 1.08420289, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.24023438, + "step": 66, + "time_per_iteration": 2.7269198894500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160507, + "balance_loss_mlp": 1.1370945, + "epoch": 0.012889572912658714, + "flos": 767750607360.0, + "grad_norm": 0.08100484164865049, + "language_loss": 1.05156851, + "learning_rate": 0.0008326361411800136, + "loss": 1.06317365, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.23400879, + "step": 67, + "time_per_iteration": 2.984511613845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164469, + "balance_loss_mlp": 1.14209354, + "epoch": 0.013081954597922277, + "flos": 533604561408.0, + "grad_norm": 0.7331609098323609, + "language_loss": 1.05716372, + "learning_rate": 0.0008355699051851403, + "loss": 1.06880832, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.22363281, + "step": 68, + "time_per_iteration": 2.7606749534606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236513, + "balance_loss_mlp": 1.21256447, + "epoch": 0.01327433628318584, + "flos": 572826567168.0, + "grad_norm": 0.09768789722348739, + "language_loss": 1.12206995, + "learning_rate": 0.0008384608389860635, + "loss": 1.13443518, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.23950195, + "step": 69, + "time_per_iteration": 2.687361001968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308666, + "balance_loss_mlp": 1.28513408, + "epoch": 0.013466717968449404, + "flos": 497029243392.0, + "grad_norm": 0.20600635395561306, + "language_loss": 1.02831006, + "learning_rate": 0.000841310175171381, + "loss": 1.04139686, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.23510742, + "step": 70, + "time_per_iteration": 2.5935816764831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01326501, + "balance_loss_mlp": 1.30259991, + "epoch": 0.013659099653712967, + "flos": 565234454016.0, + "grad_norm": 0.21749814226597305, + "language_loss": 1.00826097, + "learning_rate": 0.000844119093875517, + "loss": 1.0215261, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.2388916, + "step": 71, + "time_per_iteration": 2.706749439239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327473, + "balance_loss_mlp": 1.30280876, + "epoch": 0.01385148133897653, + "flos": 573540950016.0, + "grad_norm": 0.15663283615990556, + "language_loss": 1.06174731, + "learning_rate": 0.0008468887257134666, + "loss": 1.0750221, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.24682617, + "step": 72, + "time_per_iteration": 2.6893503665924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01307936, + "balance_loss_mlp": 1.28290248, + "epoch": 0.014043863024240093, + "flos": 576539665920.0, + "grad_norm": 0.165113983041647, + "language_loss": 1.08480573, + "learning_rate": 0.0008496201545131264, + "loss": 1.09788513, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.25012207, + "step": 73, + "time_per_iteration": 2.722555637359619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228575, + "balance_loss_mlp": 1.20456624, + "epoch": 0.014236244709503656, + "flos": 938287660032.0, + "grad_norm": 0.08819174949442792, + "language_loss": 1.05711758, + "learning_rate": 0.0008523144198617317, + "loss": 1.06940317, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.2401123, + "step": 74, + "time_per_iteration": 3.1970512866973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197377, + "balance_loss_mlp": 1.17341638, + "epoch": 0.014428626394767219, + "flos": 528231352320.0, + "grad_norm": 0.4509181854760719, + "language_loss": 1.05384588, + "learning_rate": 0.0008549725194813783, + "loss": 1.06581974, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.23962402, + "step": 75, + "time_per_iteration": 2.6595916748046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153799, + "balance_loss_mlp": 1.13126826, + "epoch": 0.014621008080030782, + "flos": 803371433472.0, + "grad_norm": 0.13717241934186405, + "language_loss": 1.0561651, + "learning_rate": 0.0008575954114472099, + "loss": 1.06770301, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.2253418, + "step": 76, + "time_per_iteration": 3.126678943634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146131, + "balance_loss_mlp": 1.12299228, + "epoch": 0.014813389765294343, + "flos": 696588788736.0, + "grad_norm": 0.24880809118993477, + "language_loss": 1.04725742, + "learning_rate": 0.0008601840162606118, + "loss": 1.05871868, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.23132324, + "step": 77, + "time_per_iteration": 3.0479044914245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125538, + "balance_loss_mlp": 1.10244715, + "epoch": 0.015005771450557906, + "flos": 596702813184.0, + "grad_norm": 0.18599993070264256, + "language_loss": 1.10793126, + "learning_rate": 0.000862739218788641, + "loss": 1.11918664, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.23083496, + "step": 78, + "time_per_iteration": 2.8093104362487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206073, + "balance_loss_mlp": 1.18093228, + "epoch": 0.01519815313582147, + "flos": 549148170240.0, + "grad_norm": 0.1007392116308827, + "language_loss": 1.07089067, + "learning_rate": 0.0008652618700799138, + "loss": 1.08295143, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.25146484, + "step": 79, + "time_per_iteration": 2.657278060913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312607, + "balance_loss_mlp": 1.28511751, + "epoch": 0.015390534821085032, + "flos": 430306642944.0, + "grad_norm": 0.10464806869950885, + "language_loss": 1.06340718, + "learning_rate": 0.0008677527890662774, + "loss": 1.07653332, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.27514648, + "step": 80, + "time_per_iteration": 2.541733741760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01403725, + "balance_loss_mlp": 1.37456632, + "epoch": 0.015582916506348595, + "flos": 523854743040.0, + "grad_norm": 0.15378710965831335, + "language_loss": 1.0758636, + "learning_rate": 0.0008702127641587799, + "loss": 1.08990085, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.29125977, + "step": 81, + "time_per_iteration": 2.6628620624542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01387899, + "balance_loss_mlp": 1.36045754, + "epoch": 0.015775298191612157, + "flos": 575151598080.0, + "grad_norm": 0.16587297874586884, + "language_loss": 1.02605438, + "learning_rate": 0.0008726425547457192, + "loss": 1.03993344, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.27490234, + "step": 82, + "time_per_iteration": 2.774146795272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365036, + "balance_loss_mlp": 1.34021688, + "epoch": 0.01596767987687572, + "flos": 610040664576.0, + "grad_norm": 0.16158882984955267, + "language_loss": 1.02648211, + "learning_rate": 0.0008750428925998964, + "loss": 1.04013252, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.24829102, + "step": 83, + "time_per_iteration": 2.745786190032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01321379, + "balance_loss_mlp": 1.29746556, + "epoch": 0.016160061562139283, + "flos": 566864040960.0, + "grad_norm": 0.12210664974135504, + "language_loss": 1.08113122, + "learning_rate": 0.0008774144832015932, + "loss": 1.09434509, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.23937988, + "step": 84, + "time_per_iteration": 2.695239543914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01701738, + "balance_loss_mlp": 1.6791358, + "epoch": 0.016352443247402846, + "flos": 1410557234688.0, + "grad_norm": 0.2213803749296612, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76476049, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.22558594, + "step": 85, + "time_per_iteration": 4.597177982330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228049, + "balance_loss_mlp": 1.20597172, + "epoch": 0.01654482493266641, + "flos": 730177127424.0, + "grad_norm": 0.08119704963525505, + "language_loss": 1.03748381, + "learning_rate": 0.0008820741205014318, + "loss": 1.04976428, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.22070312, + "step": 86, + "time_per_iteration": 2.881804943084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193732, + "balance_loss_mlp": 1.17282319, + "epoch": 0.016737206617929972, + "flos": 536016932352.0, + "grad_norm": 0.06752942516789381, + "language_loss": 1.04735541, + "learning_rate": 0.0008843634575408404, + "loss": 1.05929279, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.20922852, + "step": 87, + "time_per_iteration": 2.681497812271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197059, + "balance_loss_mlp": 1.17523217, + "epoch": 0.016929588303193535, + "flos": 536706584064.0, + "grad_norm": 0.068849585693396, + "language_loss": 1.06270838, + "learning_rate": 0.0008866266301555082, + "loss": 1.0746789, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.21826172, + "step": 88, + "time_per_iteration": 2.7393336296081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188909, + "balance_loss_mlp": 1.16813099, + "epoch": 0.017121969988457098, + "flos": 526498458624.0, + "grad_norm": 0.11163273932728453, + "language_loss": 1.06937528, + "learning_rate": 0.0008888642296509615, + "loss": 1.08126438, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.20776367, + "step": 89, + "time_per_iteration": 2.5859603881835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190142, + "balance_loss_mlp": 1.16919696, + "epoch": 0.01731435167372066, + "flos": 625304876544.0, + "grad_norm": 0.08151329596812326, + "language_loss": 1.11272717, + "learning_rate": 0.0008910768275115906, + "loss": 1.12462866, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.20947266, + "step": 90, + "time_per_iteration": 2.7672746181488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188461, + "balance_loss_mlp": 1.16750431, + "epoch": 0.017506733358984224, + "flos": 496157709312.0, + "grad_norm": 0.10059554630111206, + "language_loss": 1.06862557, + "learning_rate": 0.0008932649762767675, + "loss": 1.08051026, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.20947266, + "step": 91, + "time_per_iteration": 2.5685906410217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164871, + "balance_loss_mlp": 1.14348471, + "epoch": 0.017699115044247787, + "flos": 745613047296.0, + "grad_norm": 0.10996439779682221, + "language_loss": 1.10012543, + "learning_rate": 0.0008954292103690864, + "loss": 1.11177421, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.21398926, + "step": 92, + "time_per_iteration": 2.974438428878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164995, + "balance_loss_mlp": 1.14351392, + "epoch": 0.01789149672951135, + "flos": 515257265664.0, + "grad_norm": 0.07660536936337886, + "language_loss": 1.12072349, + "learning_rate": 0.0008975700468778296, + "loss": 1.13237333, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.21496582, + "step": 93, + "time_per_iteration": 2.5806186199188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162029, + "balance_loss_mlp": 1.14116728, + "epoch": 0.018083878414774913, + "flos": 585850116096.0, + "grad_norm": 0.0766138268717318, + "language_loss": 1.04864383, + "learning_rate": 0.0008996879863005366, + "loss": 1.06026423, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.20874023, + "step": 94, + "time_per_iteration": 2.6688339710235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153951, + "balance_loss_mlp": 1.13311303, + "epoch": 0.018276260100038477, + "flos": 497103436800.0, + "grad_norm": 0.05852633811132637, + "language_loss": 1.05006421, + "learning_rate": 0.0009017835132453337, + "loss": 1.06160367, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.20849609, + "step": 95, + "time_per_iteration": 2.5905888080596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168717, + "balance_loss_mlp": 1.14677107, + "epoch": 0.01846864178530204, + "flos": 639765955584.0, + "grad_norm": 0.10434292302548942, + "language_loss": 1.05011988, + "learning_rate": 0.0009038570970964896, + "loss": 1.06180692, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.21960449, + "step": 96, + "time_per_iteration": 2.819176197052002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143672, + "balance_loss_mlp": 1.12233388, + "epoch": 0.018661023470565603, + "flos": 511411746816.0, + "grad_norm": 0.06578690538752763, + "language_loss": 1.02219808, + "learning_rate": 0.0009059091926454854, + "loss": 1.0336349, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.21362305, + "step": 97, + "time_per_iteration": 2.6332285404205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128399, + "balance_loss_mlp": 1.10685802, + "epoch": 0.018853405155829166, + "flos": 930710103552.0, + "grad_norm": 0.06319745463615938, + "language_loss": 1.01510525, + "learning_rate": 0.0009079402406897198, + "loss": 1.02638912, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.2154541, + "step": 98, + "time_per_iteration": 3.231128454208374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115682, + "balance_loss_mlp": 1.09365261, + "epoch": 0.01904578684109273, + "flos": 576209396736.0, + "grad_norm": 0.08014689887623593, + "language_loss": 1.0309999, + "learning_rate": 0.0009099506686008212, + "loss": 1.0421567, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.22045898, + "step": 99, + "time_per_iteration": 2.7899162769317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108676, + "balance_loss_mlp": 1.08750439, + "epoch": 0.019238168526356292, + "flos": 558173431296.0, + "grad_norm": 0.07479046847477189, + "language_loss": 1.06245041, + "learning_rate": 0.0009119408908644013, + "loss": 1.07353711, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.21179199, + "step": 100, + "time_per_iteration": 2.76654314994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111244, + "balance_loss_mlp": 1.09094632, + "epoch": 0.019430550211619855, + "flos": 723539506176.0, + "grad_norm": 0.1293510891653682, + "language_loss": 1.11089611, + "learning_rate": 0.0009139113095929519, + "loss": 1.12202048, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.21496582, + "step": 101, + "time_per_iteration": 2.9448165893554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113187, + "balance_loss_mlp": 1.09134769, + "epoch": 0.019622931896883418, + "flos": 499235000832.0, + "grad_norm": 0.0662757157914564, + "language_loss": 1.05513644, + "learning_rate": 0.0009158623150134762, + "loss": 1.06626844, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.21838379, + "step": 102, + "time_per_iteration": 2.561089277267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132871, + "balance_loss_mlp": 1.11103153, + "epoch": 0.01981531358214698, + "flos": 508916418048.0, + "grad_norm": 0.12924626158025887, + "language_loss": 1.05462444, + "learning_rate": 0.000917794285931332, + "loss": 1.06595314, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.21850586, + "step": 103, + "time_per_iteration": 2.6556921005249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150426, + "balance_loss_mlp": 1.12918282, + "epoch": 0.020007695267410544, + "flos": 521087371776.0, + "grad_norm": 0.12259017558591545, + "language_loss": 0.9774698, + "learning_rate": 0.0009197075901716639, + "loss": 0.98897398, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.21264648, + "step": 104, + "time_per_iteration": 2.721444845199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141997, + "balance_loss_mlp": 1.12036085, + "epoch": 0.020200076952674107, + "flos": 533013834240.0, + "grad_norm": 0.06848283791602199, + "language_loss": 1.07568073, + "learning_rate": 0.0009216025849997171, + "loss": 1.08710074, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.21655273, + "step": 105, + "time_per_iteration": 2.785515785217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138939, + "balance_loss_mlp": 1.11743319, + "epoch": 0.020392458637937667, + "flos": 684430981632.0, + "grad_norm": 0.05548353541402364, + "language_loss": 1.02272427, + "learning_rate": 0.0009234796175212258, + "loss": 1.03411365, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.21520996, + "step": 106, + "time_per_iteration": 2.917363166809082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131099, + "balance_loss_mlp": 1.10940301, + "epoch": 0.02058484032320123, + "flos": 701791852032.0, + "grad_norm": 0.08012311925806644, + "language_loss": 1.06108189, + "learning_rate": 0.000925339025064007, + "loss": 1.07239294, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.21691895, + "step": 107, + "time_per_iteration": 2.9934780597686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137249, + "balance_loss_mlp": 1.11515951, + "epoch": 0.020777222008464793, + "flos": 638772175872.0, + "grad_norm": 0.050481524705402105, + "language_loss": 0.98984301, + "learning_rate": 0.0009271811355418027, + "loss": 1.00121546, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.2208252, + "step": 108, + "time_per_iteration": 2.8833272457122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119308, + "balance_loss_mlp": 1.09725404, + "epoch": 0.020969603693728356, + "flos": 681785856000.0, + "grad_norm": 0.04498034405706927, + "language_loss": 1.05478954, + "learning_rate": 0.0009290062678013548, + "loss": 1.06598258, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.22058105, + "step": 109, + "time_per_iteration": 2.839287042617798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126832, + "balance_loss_mlp": 1.1043849, + "epoch": 0.02116198537899192, + "flos": 533140462080.0, + "grad_norm": 0.08965534617549129, + "language_loss": 1.03900754, + "learning_rate": 0.0009308147319536321, + "loss": 1.0502758, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.2244873, + "step": 110, + "time_per_iteration": 2.664785385131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127322, + "balance_loss_mlp": 1.10527992, + "epoch": 0.021354367064255482, + "flos": 717168135168.0, + "grad_norm": 0.07991094573250712, + "language_loss": 1.10446882, + "learning_rate": 0.0009326068296900676, + "loss": 1.11574197, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.22045898, + "step": 111, + "time_per_iteration": 2.826704502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118777, + "balance_loss_mlp": 1.09644949, + "epoch": 0.021546748749519045, + "flos": 519290459136.0, + "grad_norm": 0.05764113319631223, + "language_loss": 1.01306438, + "learning_rate": 0.0009343828545846161, + "loss": 1.02425218, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.2232666, + "step": 112, + "time_per_iteration": 2.774557113647461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130616, + "balance_loss_mlp": 1.10844338, + "epoch": 0.021739130434782608, + "flos": 504912337920.0, + "grad_norm": 0.11711254624088742, + "language_loss": 1.04517794, + "learning_rate": 0.0009361430923823841, + "loss": 1.0564841, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.22192383, + "step": 113, + "time_per_iteration": 2.5728189945220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143235, + "balance_loss_mlp": 1.12140775, + "epoch": 0.02193151212004617, + "flos": 463251820032.0, + "grad_norm": 0.09177669908726471, + "language_loss": 1.08950138, + "learning_rate": 0.0009378878212755459, + "loss": 1.10093367, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.21826172, + "step": 114, + "time_per_iteration": 2.4959068298339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118923, + "balance_loss_mlp": 1.09746575, + "epoch": 0.022123893805309734, + "flos": 552008673792.0, + "grad_norm": 0.05600308486582556, + "language_loss": 0.98889154, + "learning_rate": 0.0009396173121672103, + "loss": 1.00008082, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.21472168, + "step": 115, + "time_per_iteration": 2.687619924545288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131221, + "balance_loss_mlp": 1.11031187, + "epoch": 0.022316275490573297, + "flos": 635920436736.0, + "grad_norm": 0.06813536890625224, + "language_loss": 1.0438683, + "learning_rate": 0.0009413318289238633, + "loss": 1.05518055, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.20922852, + "step": 116, + "time_per_iteration": 2.7658987045288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115598, + "balance_loss_mlp": 1.09498656, + "epoch": 0.02250865717583686, + "flos": 798535107072.0, + "grad_norm": 0.10996119273554948, + "language_loss": 0.97187698, + "learning_rate": 0.0009430316286169771, + "loss": 0.98303294, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.20617676, + "step": 117, + "time_per_iteration": 3.027139186859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120875, + "balance_loss_mlp": 1.10050249, + "epoch": 0.022701038861100423, + "flos": 455851763712.0, + "grad_norm": 0.06369887166042827, + "language_loss": 1.02379179, + "learning_rate": 0.0009447169617543361, + "loss": 1.03500056, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.20373535, + "step": 118, + "time_per_iteration": 2.619460344314575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114928, + "balance_loss_mlp": 1.09472179, + "epoch": 0.022893420546363986, + "flos": 582812112384.0, + "grad_norm": 0.07832492020107534, + "language_loss": 1.08849907, + "learning_rate": 0.0009463880725016029, + "loss": 1.09964836, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.20214844, + "step": 119, + "time_per_iteration": 2.689627170562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108375, + "balance_loss_mlp": 1.08852673, + "epoch": 0.02308580223162755, + "flos": 561010613760.0, + "grad_norm": 0.05815728344132157, + "language_loss": 1.03645778, + "learning_rate": 0.0009480451988946134, + "loss": 1.0475415, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.19848633, + "step": 120, + "time_per_iteration": 2.8202247619628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111521, + "balance_loss_mlp": 1.09197092, + "epoch": 0.023278183916891113, + "flos": 770966111232.0, + "grad_norm": 0.09156908943756899, + "language_loss": 1.05033565, + "learning_rate": 0.0009496885730428627, + "loss": 1.06145096, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.1953125, + "step": 121, + "time_per_iteration": 3.060826539993286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111873, + "balance_loss_mlp": 1.09195304, + "epoch": 0.023470565602154676, + "flos": 553111552512.0, + "grad_norm": 0.07227042142752892, + "language_loss": 1.03125668, + "learning_rate": 0.0009513184213246156, + "loss": 1.04237533, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.19909668, + "step": 122, + "time_per_iteration": 2.693777322769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116574, + "balance_loss_mlp": 1.09648705, + "epoch": 0.02366294728741824, + "flos": 559744791552.0, + "grad_norm": 0.10676768106860933, + "language_loss": 1.06918037, + "learning_rate": 0.0009529349645740552, + "loss": 1.08034611, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.20080566, + "step": 123, + "time_per_iteration": 2.7788801193237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108243, + "balance_loss_mlp": 1.0888958, + "epoch": 0.0238553289726818, + "flos": 468313698816.0, + "grad_norm": 0.06448608913203197, + "language_loss": 1.05440235, + "learning_rate": 0.0009545384182608524, + "loss": 1.06548476, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.19335938, + "step": 124, + "time_per_iteration": 2.542592763900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125435, + "balance_loss_mlp": 1.10582459, + "epoch": 0.024047710657945365, + "flos": 559763730432.0, + "grad_norm": 0.07866021425619718, + "language_loss": 1.03027701, + "learning_rate": 0.0009561289926625252, + "loss": 1.04153132, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.19604492, + "step": 125, + "time_per_iteration": 2.790811538696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114447, + "balance_loss_mlp": 1.09582675, + "epoch": 0.024240092343208928, + "flos": 504528224256.0, + "grad_norm": 0.05023162105608455, + "language_loss": 1.0775013, + "learning_rate": 0.0009577068930299292, + "loss": 1.08864582, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.1862793, + "step": 126, + "time_per_iteration": 2.634744882583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131099, + "balance_loss_mlp": 1.11309838, + "epoch": 0.02443247402847249, + "flos": 435516908544.0, + "grad_norm": 0.11313548721486262, + "language_loss": 1.02903807, + "learning_rate": 0.0009592723197462087, + "loss": 1.04034901, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.18017578, + "step": 127, + "time_per_iteration": 2.673091411590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135261, + "balance_loss_mlp": 1.11693859, + "epoch": 0.024624855713736054, + "flos": 683445966336.0, + "grad_norm": 0.09449576280815732, + "language_loss": 0.99720573, + "learning_rate": 0.0009608254684795125, + "loss": 1.00855827, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.18334961, + "step": 128, + "time_per_iteration": 2.9315080642700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125442, + "balance_loss_mlp": 1.10695267, + "epoch": 0.024817237398999614, + "flos": 524721894912.0, + "grad_norm": 0.06510984253988934, + "language_loss": 1.02999425, + "learning_rate": 0.0009623665303297678, + "loss": 1.04124868, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.18493652, + "step": 129, + "time_per_iteration": 2.7419071197509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109401, + "balance_loss_mlp": 1.09171033, + "epoch": 0.025009619084263177, + "flos": 655350262272.0, + "grad_norm": 0.11817944884573778, + "language_loss": 1.06827164, + "learning_rate": 0.0009638956919697878, + "loss": 1.07936561, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.17712402, + "step": 130, + "time_per_iteration": 2.898789405822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109182, + "balance_loss_mlp": 1.09152734, + "epoch": 0.02520200076952674, + "flos": 454187271168.0, + "grad_norm": 0.08339763042198223, + "language_loss": 0.98782563, + "learning_rate": 0.0009654131357809714, + "loss": 0.99891746, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.17663574, + "step": 131, + "time_per_iteration": 2.5997226238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110117, + "balance_loss_mlp": 1.09165168, + "epoch": 0.025394382454790303, + "flos": 839427397632.0, + "grad_norm": 0.07600036723868295, + "language_loss": 1.07807457, + "learning_rate": 0.0009669190399838441, + "loss": 1.08917582, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.18469238, + "step": 132, + "time_per_iteration": 3.099355459213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123354, + "balance_loss_mlp": 1.10540128, + "epoch": 0.025586764140053866, + "flos": 580725628416.0, + "grad_norm": 0.1018451896089413, + "language_loss": 1.01215065, + "learning_rate": 0.0009684135787636724, + "loss": 1.02338421, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.17956543, + "step": 133, + "time_per_iteration": 2.8484303951263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110859, + "balance_loss_mlp": 1.09306097, + "epoch": 0.02577914582531743, + "flos": 789893959680.0, + "grad_norm": 0.0768854449505878, + "language_loss": 1.05274129, + "learning_rate": 0.0009698969223913726, + "loss": 1.06384993, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.17822266, + "step": 134, + "time_per_iteration": 3.0583713054656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099954, + "balance_loss_mlp": 1.08200145, + "epoch": 0.025971527510580992, + "flos": 594683320320.0, + "grad_norm": 0.06563028697143787, + "language_loss": 1.07862437, + "learning_rate": 0.0009713692373399265, + "loss": 1.08962393, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.1796875, + "step": 135, + "time_per_iteration": 2.6854658126831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01638015, + "balance_loss_mlp": 1.62485397, + "epoch": 0.026163909195844555, + "flos": 1576771522560.0, + "grad_norm": 0.19726256755033653, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81094241, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.13183594, + "step": 136, + "time_per_iteration": 5.296766042709351 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01420299, + "balance_loss_mlp": 1.40761507, + "epoch": 0.026356290881108118, + "flos": 1501306030080.0, + "grad_norm": 0.11305854818728235, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.7923134, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.12695312, + "step": 137, + "time_per_iteration": 4.982319355010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113827, + "balance_loss_mlp": 1.12156892, + "epoch": 0.02654867256637168, + "flos": 596841025536.0, + "grad_norm": 0.17869099152539902, + "language_loss": 1.01327038, + "learning_rate": 0.0009757216201974225, + "loss": 1.02465308, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.16699219, + "step": 138, + "time_per_iteration": 2.8622727394104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186505, + "balance_loss_mlp": 1.16889763, + "epoch": 0.026741054251635244, + "flos": 544761386496.0, + "grad_norm": 0.08591345057859309, + "language_loss": 1.05914044, + "learning_rate": 0.0009771514130396581, + "loss": 1.07100558, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.17614746, + "step": 139, + "time_per_iteration": 2.67812442779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120454, + "balance_loss_mlp": 1.18700433, + "epoch": 0.026933435936898807, + "flos": 506591387136.0, + "grad_norm": 0.10724594122721719, + "language_loss": 1.05634308, + "learning_rate": 0.00097857095638274, + "loss": 1.06838858, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.17541504, + "step": 140, + "time_per_iteration": 2.597321033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120509, + "balance_loss_mlp": 1.1880548, + "epoch": 0.02712581762216237, + "flos": 740513290752.0, + "grad_norm": 0.08882077115516282, + "language_loss": 0.97595245, + "learning_rate": 0.0009799803961288726, + "loss": 0.98800337, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.17053223, + "step": 141, + "time_per_iteration": 3.017937421798706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177855, + "balance_loss_mlp": 1.16135645, + "epoch": 0.027318199307425933, + "flos": 848023464960.0, + "grad_norm": 0.07711499257167788, + "language_loss": 1.03052521, + "learning_rate": 0.000981379875086876, + "loss": 1.04230392, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.16491699, + "step": 142, + "time_per_iteration": 3.0336825847625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154055, + "balance_loss_mlp": 1.13728189, + "epoch": 0.027510580992689496, + "flos": 575288400384.0, + "grad_norm": 0.06449204224600169, + "language_loss": 0.98759103, + "learning_rate": 0.0009827695330590185, + "loss": 0.99913156, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.16784668, + "step": 143, + "time_per_iteration": 2.635596990585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131533, + "balance_loss_mlp": 1.11474872, + "epoch": 0.02770296267795306, + "flos": 772079164416.0, + "grad_norm": 0.07528415949234718, + "language_loss": 0.98083055, + "learning_rate": 0.0009841495069248256, + "loss": 0.9921459, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.16796875, + "step": 144, + "time_per_iteration": 2.9648232460021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123928, + "balance_loss_mlp": 1.10686922, + "epoch": 0.027895344363216622, + "flos": 569123642880.0, + "grad_norm": 0.10995634154815045, + "language_loss": 0.97452384, + "learning_rate": 0.0009855199307219871, + "loss": 0.98576319, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.1706543, + "step": 145, + "time_per_iteration": 2.6601850986480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113445, + "balance_loss_mlp": 1.09632671, + "epoch": 0.028087726048480186, + "flos": 547099564032.0, + "grad_norm": 0.09468853295775125, + "language_loss": 0.98972148, + "learning_rate": 0.0009868809357244854, + "loss": 1.00085592, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.17138672, + "step": 146, + "time_per_iteration": 2.7714684009552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109349, + "balance_loss_mlp": 1.09192085, + "epoch": 0.02828010773374375, + "flos": 524519663616.0, + "grad_norm": 0.08177620360389791, + "language_loss": 1.02921426, + "learning_rate": 0.0009882326505180556, + "loss": 1.04030776, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.17443848, + "step": 147, + "time_per_iteration": 2.6678037643432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121816, + "balance_loss_mlp": 1.10459065, + "epoch": 0.02847248941900731, + "flos": 772108277760.0, + "grad_norm": 0.15200564524835, + "language_loss": 1.01768231, + "learning_rate": 0.0009895752010730906, + "loss": 1.02890062, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.17236328, + "step": 148, + "time_per_iteration": 2.944622755050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139991, + "balance_loss_mlp": 1.12333786, + "epoch": 0.028664871104270875, + "flos": 534150208512.0, + "grad_norm": 0.10043611919636293, + "language_loss": 1.0762012, + "learning_rate": 0.0009909087108150867, + "loss": 1.08760118, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.16662598, + "step": 149, + "time_per_iteration": 2.730631113052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123808, + "balance_loss_mlp": 1.10708272, + "epoch": 0.028857252789534438, + "flos": 367557599232.0, + "grad_norm": 0.08772923811196923, + "language_loss": 1.08558857, + "learning_rate": 0.0009922333006927371, + "loss": 1.09682679, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.1673584, + "step": 150, + "time_per_iteration": 2.5662901401519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108027, + "balance_loss_mlp": 1.09107542, + "epoch": 0.029049634474798, + "flos": 515232534528.0, + "grad_norm": 0.10678098958344774, + "language_loss": 1.02281368, + "learning_rate": 0.0009935490892437632, + "loss": 1.03389382, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.16967773, + "step": 151, + "time_per_iteration": 2.573842763900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110516, + "balance_loss_mlp": 1.0892458, + "epoch": 0.029242016160061564, + "flos": 587840495616.0, + "grad_norm": 0.07022496172976629, + "language_loss": 1.00216019, + "learning_rate": 0.0009948561926585687, + "loss": 1.01321173, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.15905762, + "step": 152, + "time_per_iteration": 2.762035608291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101658, + "balance_loss_mlp": 1.08582664, + "epoch": 0.029434397845325123, + "flos": 551816616960.0, + "grad_norm": 0.08132441134663608, + "language_loss": 1.04400539, + "learning_rate": 0.0009961547248418122, + "loss": 1.05502188, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.15820312, + "step": 153, + "time_per_iteration": 2.6525814533233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092159, + "balance_loss_mlp": 1.07619703, + "epoch": 0.029626779530588686, + "flos": 603221160960.0, + "grad_norm": 0.064379562707883, + "language_loss": 1.01020789, + "learning_rate": 0.0009974447974719707, + "loss": 1.02112949, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.1595459, + "step": 154, + "time_per_iteration": 2.814805746078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011076, + "balance_loss_mlp": 1.09136379, + "epoch": 0.02981916121585225, + "flos": 620808993792.0, + "grad_norm": 0.09363682514066085, + "language_loss": 1.02673674, + "learning_rate": 0.0009987265200589763, + "loss": 1.03781271, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.16235352, + "step": 155, + "time_per_iteration": 2.7394251823425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083912, + "balance_loss_mlp": 1.06821227, + "epoch": 0.030011542901115813, + "flos": 661322962944.0, + "grad_norm": 0.05837038305695058, + "language_loss": 1.02287054, + "learning_rate": 0.001, + "loss": 1.03370976, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.15686035, + "step": 156, + "time_per_iteration": 2.875622272491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091789, + "balance_loss_mlp": 1.07507551, + "epoch": 0.030203924586379376, + "flos": 651258842112.0, + "grad_norm": 0.08525763952586639, + "language_loss": 1.00171304, + "learning_rate": 0.0009999999029413921, + "loss": 1.01263094, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.16723633, + "step": 157, + "time_per_iteration": 2.8360915184020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110493, + "balance_loss_mlp": 1.09382772, + "epoch": 0.03039630627164294, + "flos": 531083091456.0, + "grad_norm": 0.08254544257661527, + "language_loss": 1.01840436, + "learning_rate": 0.0009999996117656068, + "loss": 1.02950931, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.16674805, + "step": 158, + "time_per_iteration": 2.801180124282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096617, + "balance_loss_mlp": 1.08086896, + "epoch": 0.030588687956906502, + "flos": 585914135040.0, + "grad_norm": 0.070993780506174, + "language_loss": 0.95558536, + "learning_rate": 0.0009999991264727564, + "loss": 0.96655154, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.15734863, + "step": 159, + "time_per_iteration": 2.818821668624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096163, + "balance_loss_mlp": 1.08046305, + "epoch": 0.030781069642170065, + "flos": 513026777088.0, + "grad_norm": 0.07077353312716703, + "language_loss": 1.06054807, + "learning_rate": 0.0009999984470630296, + "loss": 1.0715096, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.15686035, + "step": 160, + "time_per_iteration": 2.6040687561035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109657, + "balance_loss_mlp": 1.08113289, + "epoch": 0.030973451327433628, + "flos": 717766064640.0, + "grad_norm": 0.055279151578571405, + "language_loss": 0.94481659, + "learning_rate": 0.0009999975735366902, + "loss": 0.95578229, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.1541748, + "step": 161, + "time_per_iteration": 3.1012368202209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096261, + "balance_loss_mlp": 1.08034658, + "epoch": 0.03116583301269719, + "flos": 1109312133120.0, + "grad_norm": 0.0762466753512266, + "language_loss": 0.96279925, + "learning_rate": 0.0009999965058940775, + "loss": 0.97376186, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.15905762, + "step": 162, + "time_per_iteration": 3.5481724739074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092073, + "balance_loss_mlp": 1.07657552, + "epoch": 0.031358214697960754, + "flos": 450676403712.0, + "grad_norm": 0.0783935068916601, + "language_loss": 1.02822053, + "learning_rate": 0.0009999952441356057, + "loss": 1.03914118, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.15490723, + "step": 163, + "time_per_iteration": 2.513836145401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104121, + "balance_loss_mlp": 1.08844459, + "epoch": 0.031550596383224314, + "flos": 1254701325312.0, + "grad_norm": 0.06003254057509557, + "language_loss": 1.03039443, + "learning_rate": 0.000999993788261765, + "loss": 1.04143572, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.15661621, + "step": 164, + "time_per_iteration": 3.625434398651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097956, + "balance_loss_mlp": 1.08191097, + "epoch": 0.03174297806848788, + "flos": 667841310720.0, + "grad_norm": 0.071706058438464, + "language_loss": 1.04424524, + "learning_rate": 0.00099999213827312, + "loss": 1.0552249, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.16040039, + "step": 165, + "time_per_iteration": 2.7834768295288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111157, + "balance_loss_mlp": 1.09566009, + "epoch": 0.03193535975375144, + "flos": 551033832960.0, + "grad_norm": 0.12829100736108065, + "language_loss": 0.99657446, + "learning_rate": 0.000999990294170312, + "loss": 1.00768602, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.15478516, + "step": 166, + "time_per_iteration": 2.637387752532959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101169, + "balance_loss_mlp": 1.08545709, + "epoch": 0.032127741439015006, + "flos": 543377700864.0, + "grad_norm": 0.06852414366650764, + "language_loss": 1.03638864, + "learning_rate": 0.0009999882559540566, + "loss": 1.04740036, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.15698242, + "step": 167, + "time_per_iteration": 2.6875667572021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098336, + "balance_loss_mlp": 1.0833509, + "epoch": 0.032320123124278566, + "flos": 548104928256.0, + "grad_norm": 0.05076681603646914, + "language_loss": 1.00191641, + "learning_rate": 0.000999986023625145, + "loss": 1.01289976, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.14953613, + "step": 168, + "time_per_iteration": 2.7518744468688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03792956, + "balance_loss_mlp": 3.75500011, + "epoch": 0.03251250480954213, + "flos": 1305156865536.0, + "grad_norm": 0.6529032341502935, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.82717371, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 0.37890625, + "step": 169, + "time_per_iteration": 4.917760133743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167126, + "balance_loss_mlp": 1.15130675, + "epoch": 0.03270488649480569, + "flos": 560866609152.0, + "grad_norm": 0.09865002272530259, + "language_loss": 1.00644767, + "learning_rate": 0.0009999809766328958, + "loss": 1.01811886, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.15808105, + "step": 170, + "time_per_iteration": 2.65771746635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120248, + "balance_loss_mlp": 1.18527782, + "epoch": 0.03289726818006926, + "flos": 482120031744.0, + "grad_norm": 0.08799874436989415, + "language_loss": 1.02774751, + "learning_rate": 0.0009999781619715177, + "loss": 1.03977239, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.17211914, + "step": 171, + "time_per_iteration": 2.562926769256592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122766, + "balance_loss_mlp": 1.21033943, + "epoch": 0.03308964986533282, + "flos": 674355276288.0, + "grad_norm": 0.08542539222295185, + "language_loss": 1.02671802, + "learning_rate": 0.000999975153201402, + "loss": 1.03899455, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.17321777, + "step": 172, + "time_per_iteration": 2.8269002437591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267174, + "balance_loss_mlp": 1.24883962, + "epoch": 0.033282031550596385, + "flos": 608937785856.0, + "grad_norm": 0.120181629337785, + "language_loss": 1.00698161, + "learning_rate": 0.0009999719503237174, + "loss": 1.01965332, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.18347168, + "step": 173, + "time_per_iteration": 2.758136749267578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254087, + "balance_loss_mlp": 1.23402381, + "epoch": 0.033474413235859944, + "flos": 467801547264.0, + "grad_norm": 0.13932237496235436, + "language_loss": 1.08850026, + "learning_rate": 0.0009999685533397073, + "loss": 1.10104108, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.20056152, + "step": 174, + "time_per_iteration": 2.6060163974761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268538, + "balance_loss_mlp": 1.24870133, + "epoch": 0.03366679492112351, + "flos": 579365263872.0, + "grad_norm": 0.0855521850526334, + "language_loss": 1.01282525, + "learning_rate": 0.00099996496225069, + "loss": 1.02551055, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.19824219, + "step": 175, + "time_per_iteration": 2.6688973903656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312845, + "balance_loss_mlp": 1.29124486, + "epoch": 0.03385917660638707, + "flos": 637378315776.0, + "grad_norm": 0.0738431594221532, + "language_loss": 1.03378773, + "learning_rate": 0.0009999611770580604, + "loss": 1.04691625, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.21606445, + "step": 176, + "time_per_iteration": 2.8642566204071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01345291, + "balance_loss_mlp": 1.32329679, + "epoch": 0.03405155829165064, + "flos": 441587123712.0, + "grad_norm": 0.09985791713424727, + "language_loss": 1.02061462, + "learning_rate": 0.0009999571977632876, + "loss": 1.03406763, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.21984863, + "step": 177, + "time_per_iteration": 2.620537757873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0133899, + "balance_loss_mlp": 1.31619775, + "epoch": 0.034243939976914196, + "flos": 466097766912.0, + "grad_norm": 0.09257746092300488, + "language_loss": 1.05255055, + "learning_rate": 0.0009999530243679166, + "loss": 1.06594038, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.2277832, + "step": 178, + "time_per_iteration": 2.5526390075683594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01321119, + "balance_loss_mlp": 1.29928029, + "epoch": 0.03443632166217776, + "flos": 778919016960.0, + "grad_norm": 0.07612740556433409, + "language_loss": 1.00229979, + "learning_rate": 0.0009999486568735675, + "loss": 1.0155108, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.21850586, + "step": 179, + "time_per_iteration": 3.084320068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01314096, + "balance_loss_mlp": 1.29238796, + "epoch": 0.03462870334744132, + "flos": 1263284246016.0, + "grad_norm": 0.08380095909791664, + "language_loss": 1.00181103, + "learning_rate": 0.0009999440952819362, + "loss": 1.01495194, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.21716309, + "step": 180, + "time_per_iteration": 3.6467599868774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288371, + "balance_loss_mlp": 1.26746202, + "epoch": 0.03482108503270489, + "flos": 606899354112.0, + "grad_norm": 0.10452638314540276, + "language_loss": 1.00434995, + "learning_rate": 0.0009999393395947935, + "loss": 1.01723361, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.20935059, + "step": 181, + "time_per_iteration": 2.8092122077941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271899, + "balance_loss_mlp": 1.25226557, + "epoch": 0.03501346671796845, + "flos": 538010284032.0, + "grad_norm": 0.1078936362641923, + "language_loss": 1.03725255, + "learning_rate": 0.0009999343898139858, + "loss": 1.04997146, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.19616699, + "step": 182, + "time_per_iteration": 2.6274633407592773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260451, + "balance_loss_mlp": 1.23960137, + "epoch": 0.035205848403232015, + "flos": 518231250432.0, + "grad_norm": 0.13163794074334914, + "language_loss": 1.02352095, + "learning_rate": 0.0009999292459414348, + "loss": 1.03612542, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.20849609, + "step": 183, + "time_per_iteration": 2.5587446689605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241243, + "balance_loss_mlp": 1.22079897, + "epoch": 0.035398230088495575, + "flos": 472134486528.0, + "grad_norm": 0.11087783412260319, + "language_loss": 1.06915629, + "learning_rate": 0.0009999239079791374, + "loss": 1.08156872, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.2043457, + "step": 184, + "time_per_iteration": 2.5720386505126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264671, + "balance_loss_mlp": 1.24370217, + "epoch": 0.03559061177375914, + "flos": 511820591616.0, + "grad_norm": 0.08935796417892215, + "language_loss": 0.99749458, + "learning_rate": 0.0009999183759291659, + "loss": 1.01014113, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.2097168, + "step": 185, + "time_per_iteration": 2.7049641609191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283684, + "balance_loss_mlp": 1.26222682, + "epoch": 0.0357829934590227, + "flos": 477146903040.0, + "grad_norm": 0.1506087846083958, + "language_loss": 1.02522779, + "learning_rate": 0.0009999126497936682, + "loss": 1.03806448, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.21459961, + "step": 186, + "time_per_iteration": 2.5040838718414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266329, + "balance_loss_mlp": 1.24443007, + "epoch": 0.03597537514428627, + "flos": 644350588416.0, + "grad_norm": 0.07597181242921475, + "language_loss": 1.04941225, + "learning_rate": 0.0009999067295748676, + "loss": 1.0620755, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.21899414, + "step": 187, + "time_per_iteration": 2.8635194301605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01276828, + "balance_loss_mlp": 1.25491714, + "epoch": 0.03616775682954983, + "flos": 580916275200.0, + "grad_norm": 0.10348177684206804, + "language_loss": 1.02588224, + "learning_rate": 0.000999900615275062, + "loss": 1.03865051, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.21911621, + "step": 188, + "time_per_iteration": 2.6797780990600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272596, + "balance_loss_mlp": 1.25078082, + "epoch": 0.03636013851481339, + "flos": 382210735104.0, + "grad_norm": 0.11548780673963775, + "language_loss": 1.08482468, + "learning_rate": 0.0009998943068966256, + "loss": 1.09755063, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.21826172, + "step": 189, + "time_per_iteration": 2.446465253829956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282253, + "balance_loss_mlp": 1.25919747, + "epoch": 0.03655252020007695, + "flos": 582954706944.0, + "grad_norm": 0.10548213053156746, + "language_loss": 1.03159523, + "learning_rate": 0.0009998878044420072, + "loss": 1.04441762, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.23071289, + "step": 190, + "time_per_iteration": 2.715607166290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282371, + "balance_loss_mlp": 1.2598052, + "epoch": 0.03674490188534051, + "flos": 471376433664.0, + "grad_norm": 0.11932481378659279, + "language_loss": 0.98991239, + "learning_rate": 0.0009998811079137318, + "loss": 1.00273609, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.22558594, + "step": 191, + "time_per_iteration": 2.6401267051696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260121, + "balance_loss_mlp": 1.2387228, + "epoch": 0.03693728357060408, + "flos": 528113488896.0, + "grad_norm": 0.10247339740719702, + "language_loss": 1.0056088, + "learning_rate": 0.0009998742173143987, + "loss": 1.01821005, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.21411133, + "step": 192, + "time_per_iteration": 2.6355819702148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261897, + "balance_loss_mlp": 1.24107122, + "epoch": 0.03712966525586764, + "flos": 798657352704.0, + "grad_norm": 0.19022984523402262, + "language_loss": 1.00051641, + "learning_rate": 0.0009998671326466833, + "loss": 1.01313543, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.20837402, + "step": 193, + "time_per_iteration": 3.009938955307007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264185, + "balance_loss_mlp": 1.24278712, + "epoch": 0.037322046941131205, + "flos": 829628116992.0, + "grad_norm": 0.16347382701944235, + "language_loss": 1.01202989, + "learning_rate": 0.0009998598539133362, + "loss": 1.02467179, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.21386719, + "step": 194, + "time_per_iteration": 3.032041311264038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320641, + "balance_loss_mlp": 1.29752648, + "epoch": 0.037514428626394765, + "flos": 437460797952.0, + "grad_norm": 0.09447382654807665, + "language_loss": 1.02349281, + "learning_rate": 0.0009998523811171828, + "loss": 1.0366993, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.23132324, + "step": 195, + "time_per_iteration": 2.5140883922576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01385941, + "balance_loss_mlp": 1.36191988, + "epoch": 0.03770681031165833, + "flos": 511372459008.0, + "grad_norm": 0.174477259749112, + "language_loss": 1.02751505, + "learning_rate": 0.0009998447142611248, + "loss": 1.04137444, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.24047852, + "step": 196, + "time_per_iteration": 2.6540584564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01374932, + "balance_loss_mlp": 1.3512454, + "epoch": 0.03789919199692189, + "flos": 807102061056.0, + "grad_norm": 0.19785353386832685, + "language_loss": 0.95925725, + "learning_rate": 0.0009998368533481387, + "loss": 0.97300661, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.23657227, + "step": 197, + "time_per_iteration": 3.0361931324005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132819, + "balance_loss_mlp": 1.30602896, + "epoch": 0.03809157368218546, + "flos": 690274234368.0, + "grad_norm": 0.07201942870831356, + "language_loss": 0.98943031, + "learning_rate": 0.0009998287983812762, + "loss": 1.00271225, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.22155762, + "step": 198, + "time_per_iteration": 2.8737523555755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316145, + "balance_loss_mlp": 1.2943778, + "epoch": 0.03828395536744902, + "flos": 517675428864.0, + "grad_norm": 0.07974969111573339, + "language_loss": 1.04380584, + "learning_rate": 0.0009998205493636646, + "loss": 1.05696738, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.21789551, + "step": 199, + "time_per_iteration": 2.6439247131347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323551, + "balance_loss_mlp": 1.30098474, + "epoch": 0.038476337052712584, + "flos": 581389138944.0, + "grad_norm": 0.08769997267084173, + "language_loss": 0.97346306, + "learning_rate": 0.0009998121062985063, + "loss": 0.98669851, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.22583008, + "step": 200, + "time_per_iteration": 2.738266944885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01342622, + "balance_loss_mlp": 1.3199718, + "epoch": 0.03866871873797614, + "flos": 576791359488.0, + "grad_norm": 0.1288031319123161, + "language_loss": 0.99576765, + "learning_rate": 0.0009998034691890794, + "loss": 1.0091939, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.2265625, + "step": 201, + "time_per_iteration": 2.815068244934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01322045, + "balance_loss_mlp": 1.29940701, + "epoch": 0.03886110042323971, + "flos": 540472117248.0, + "grad_norm": 0.1480539814519598, + "language_loss": 1.04135096, + "learning_rate": 0.0009997946380387369, + "loss": 1.05457139, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.22619629, + "step": 202, + "time_per_iteration": 2.6735482215881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272248, + "balance_loss_mlp": 1.24913371, + "epoch": 0.03905348210850327, + "flos": 717694843392.0, + "grad_norm": 0.10058314649993264, + "language_loss": 1.06271195, + "learning_rate": 0.0009997856128509076, + "loss": 1.07543445, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.23132324, + "step": 203, + "time_per_iteration": 2.858497142791748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238452, + "balance_loss_mlp": 1.21574211, + "epoch": 0.039245863793766836, + "flos": 427268639232.0, + "grad_norm": 0.07713628959924962, + "language_loss": 1.01241136, + "learning_rate": 0.0009997763936290952, + "loss": 1.02479577, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.22705078, + "step": 204, + "time_per_iteration": 2.5389275550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254542, + "balance_loss_mlp": 1.22998452, + "epoch": 0.039438245479030395, + "flos": 662804163072.0, + "grad_norm": 0.10588145989282294, + "language_loss": 1.06408, + "learning_rate": 0.0009997669803768789, + "loss": 1.07662535, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.24560547, + "step": 205, + "time_per_iteration": 2.8234684467315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249653, + "balance_loss_mlp": 1.2262044, + "epoch": 0.03963062716429396, + "flos": 635063459328.0, + "grad_norm": 0.1260931618436919, + "language_loss": 1.01299226, + "learning_rate": 0.0009997573730979134, + "loss": 1.02548885, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.23461914, + "step": 206, + "time_per_iteration": 2.7586512565612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03194186, + "balance_loss_mlp": 2.85391545, + "epoch": 0.03982300884955752, + "flos": 1417813286400.0, + "grad_norm": 0.3208039945146043, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.82387388, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 3.40625, + "step": 207, + "time_per_iteration": 4.668841123580933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287285, + "balance_loss_mlp": 1.26177394, + "epoch": 0.04001539053482109, + "flos": 688769713152.0, + "grad_norm": 0.15196225676568717, + "language_loss": 1.00590456, + "learning_rate": 0.0009997375764747294, + "loss": 1.01877737, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.25512695, + "step": 208, + "time_per_iteration": 3.0460121631622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275833, + "balance_loss_mlp": 1.25076318, + "epoch": 0.04020777222008465, + "flos": 533363042304.0, + "grad_norm": 0.09666220749273949, + "language_loss": 0.97800297, + "learning_rate": 0.0009997273871381967, + "loss": 0.99076128, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.25085449, + "step": 209, + "time_per_iteration": 2.7027134895324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126115, + "balance_loss_mlp": 1.23683095, + "epoch": 0.040400153905348214, + "flos": 567661381632.0, + "grad_norm": 0.09901686865787228, + "language_loss": 1.02878523, + "learning_rate": 0.0009997170037902862, + "loss": 1.04139662, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.2434082, + "step": 210, + "time_per_iteration": 2.7203080654144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228259, + "balance_loss_mlp": 1.20371389, + "epoch": 0.040592535590611774, + "flos": 713130559488.0, + "grad_norm": 0.11653422944125434, + "language_loss": 1.0505805, + "learning_rate": 0.0009997064264350292, + "loss": 1.06286311, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.24536133, + "step": 211, + "time_per_iteration": 2.8774335384368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239303, + "balance_loss_mlp": 1.21149194, + "epoch": 0.04078491727587533, + "flos": 577824427008.0, + "grad_norm": 0.06455145782580095, + "language_loss": 0.99545413, + "learning_rate": 0.0009996956550765317, + "loss": 1.00784707, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.27770996, + "step": 212, + "time_per_iteration": 2.6957452297210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222017, + "balance_loss_mlp": 1.19556475, + "epoch": 0.0409772989611389, + "flos": 552033404928.0, + "grad_norm": 0.1270361519775568, + "language_loss": 0.94278163, + "learning_rate": 0.0009996846897189762, + "loss": 0.95500183, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.26452637, + "step": 213, + "time_per_iteration": 2.6380836963653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223712, + "balance_loss_mlp": 1.19798708, + "epoch": 0.04116968064640246, + "flos": 555347833344.0, + "grad_norm": 0.1000627367739684, + "language_loss": 1.00583601, + "learning_rate": 0.0009996735303666193, + "loss": 1.01807308, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.25720215, + "step": 214, + "time_per_iteration": 2.7703840732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205703, + "balance_loss_mlp": 1.18167019, + "epoch": 0.041362062331666026, + "flos": 578204158464.0, + "grad_norm": 0.10044224354438386, + "language_loss": 1.02544665, + "learning_rate": 0.0009996621770237937, + "loss": 1.0375036, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.24035645, + "step": 215, + "time_per_iteration": 2.747954845428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194773, + "balance_loss_mlp": 1.17049026, + "epoch": 0.041554444016929586, + "flos": 611130396672.0, + "grad_norm": 0.07439915791739656, + "language_loss": 0.98184484, + "learning_rate": 0.0009996506296949073, + "loss": 0.99379259, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.24267578, + "step": 216, + "time_per_iteration": 2.957000732421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178169, + "balance_loss_mlp": 1.15088165, + "epoch": 0.04174682570219315, + "flos": 527857413120.0, + "grad_norm": 0.07228572223559625, + "language_loss": 0.98363817, + "learning_rate": 0.0009996388883844428, + "loss": 0.99541986, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.27294922, + "step": 217, + "time_per_iteration": 2.625004529953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163207, + "balance_loss_mlp": 1.13855505, + "epoch": 0.04193920738745671, + "flos": 511258977792.0, + "grad_norm": 0.0709878545566638, + "language_loss": 1.02471972, + "learning_rate": 0.0009996269530969588, + "loss": 1.0363518, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.24645996, + "step": 218, + "time_per_iteration": 2.577202796936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153651, + "balance_loss_mlp": 1.13201451, + "epoch": 0.04213158907272028, + "flos": 571226093568.0, + "grad_norm": 0.081462998095588, + "language_loss": 1.00934064, + "learning_rate": 0.0009996148238370888, + "loss": 1.02087712, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.21655273, + "step": 219, + "time_per_iteration": 2.75849986076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128401, + "balance_loss_mlp": 1.10447621, + "epoch": 0.04232397075798384, + "flos": 963803667456.0, + "grad_norm": 0.08476688765369866, + "language_loss": 0.96862441, + "learning_rate": 0.0009996025006095421, + "loss": 0.97990847, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.23962402, + "step": 220, + "time_per_iteration": 3.316199541091919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03366003, + "balance_loss_mlp": 3.11881113, + "epoch": 0.042516352443247404, + "flos": 1468814777856.0, + "grad_norm": 0.3512460928075295, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.81149149, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 2.46875, + "step": 221, + "time_per_iteration": 5.585368633270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113685, + "balance_loss_mlp": 1.11290038, + "epoch": 0.042708734128510964, + "flos": 654419091456.0, + "grad_norm": 0.07993960649684186, + "language_loss": 0.97486591, + "learning_rate": 0.0009995772722706307, + "loss": 0.98623443, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.23950195, + "step": 222, + "time_per_iteration": 2.8408098220825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182736, + "balance_loss_mlp": 1.15682042, + "epoch": 0.04290111581377453, + "flos": 431601578496.0, + "grad_norm": 0.11511868264512252, + "language_loss": 1.11370254, + "learning_rate": 0.0009995643671690604, + "loss": 1.12553, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.25927734, + "step": 223, + "time_per_iteration": 2.4770917892456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194194, + "balance_loss_mlp": 1.16939855, + "epoch": 0.04309349749903809, + "flos": 644379701760.0, + "grad_norm": 0.13725027562770867, + "language_loss": 0.98326594, + "learning_rate": 0.0009995512681194023, + "loss": 0.99520785, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.24804688, + "step": 224, + "time_per_iteration": 2.901346445083618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011754, + "balance_loss_mlp": 1.14950812, + "epoch": 0.04328587918430166, + "flos": 830861853696.0, + "grad_norm": 0.06929706927237234, + "language_loss": 0.96731412, + "learning_rate": 0.0009995379751267417, + "loss": 0.97906816, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.25891113, + "step": 225, + "time_per_iteration": 3.238084316253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170568, + "balance_loss_mlp": 1.14375746, + "epoch": 0.043478260869565216, + "flos": 524804852736.0, + "grad_norm": 0.07435013646684872, + "language_loss": 0.98210657, + "learning_rate": 0.0009995244881962398, + "loss": 0.99381226, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.26843262, + "step": 226, + "time_per_iteration": 2.6193530559539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162667, + "balance_loss_mlp": 1.1352731, + "epoch": 0.04367064255482878, + "flos": 439253328384.0, + "grad_norm": 0.08505882003862496, + "language_loss": 0.98532695, + "learning_rate": 0.0009995108073331323, + "loss": 0.99695361, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.27416992, + "step": 227, + "time_per_iteration": 2.621875524520874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167442, + "balance_loss_mlp": 1.13921285, + "epoch": 0.04386302424009234, + "flos": 507109330944.0, + "grad_norm": 0.06754882710561792, + "language_loss": 1.01820612, + "learning_rate": 0.0009994969325427309, + "loss": 1.02988064, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.28222656, + "step": 228, + "time_per_iteration": 2.6876742839813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182366, + "balance_loss_mlp": 1.1523968, + "epoch": 0.04405540592535591, + "flos": 540432829440.0, + "grad_norm": 0.06680156886068128, + "language_loss": 0.97377843, + "learning_rate": 0.0009994828638304218, + "loss": 0.98560202, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.29980469, + "step": 229, + "time_per_iteration": 2.6631240844726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198543, + "balance_loss_mlp": 1.16969442, + "epoch": 0.04424778761061947, + "flos": 446136850944.0, + "grad_norm": 0.08411507650901279, + "language_loss": 1.03665459, + "learning_rate": 0.0009994686012016675, + "loss": 1.04864001, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.28833008, + "step": 230, + "time_per_iteration": 2.499721050262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122651, + "balance_loss_mlp": 1.19675517, + "epoch": 0.044440169295883035, + "flos": 700383435264.0, + "grad_norm": 0.09876086989002084, + "language_loss": 1.02814984, + "learning_rate": 0.000999454144662005, + "loss": 1.04041505, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.29711914, + "step": 231, + "time_per_iteration": 2.911175489425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224486, + "balance_loss_mlp": 1.19466019, + "epoch": 0.044632550981146595, + "flos": 588055873536.0, + "grad_norm": 0.10057378611284366, + "language_loss": 0.96611959, + "learning_rate": 0.0009994394942170468, + "loss": 0.97836453, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.2980957, + "step": 232, + "time_per_iteration": 2.7470107078552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012208, + "balance_loss_mlp": 1.19083118, + "epoch": 0.04482493266641016, + "flos": 554534525952.0, + "grad_norm": 0.06893435559553937, + "language_loss": 0.94648588, + "learning_rate": 0.0009994246498724808, + "loss": 0.95869386, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.29956055, + "step": 233, + "time_per_iteration": 2.7436845302581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206879, + "balance_loss_mlp": 1.17860246, + "epoch": 0.04501731435167372, + "flos": 722500646400.0, + "grad_norm": 0.08371813790363081, + "language_loss": 0.97381985, + "learning_rate": 0.00099940961163407, + "loss": 0.9858886, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.28295898, + "step": 234, + "time_per_iteration": 2.876405715942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119556, + "balance_loss_mlp": 1.16654444, + "epoch": 0.04520969603693728, + "flos": 511539784704.0, + "grad_norm": 0.08201306351282911, + "language_loss": 1.00061524, + "learning_rate": 0.0009993943795076528, + "loss": 1.01257086, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.2902832, + "step": 235, + "time_per_iteration": 2.6432723999023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168701, + "balance_loss_mlp": 1.13873136, + "epoch": 0.04540207772220085, + "flos": 364854246912.0, + "grad_norm": 0.12052684551098608, + "language_loss": 1.01575673, + "learning_rate": 0.0009993789534991427, + "loss": 1.02744377, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.29907227, + "step": 236, + "time_per_iteration": 2.4240100383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136514, + "balance_loss_mlp": 1.10954857, + "epoch": 0.045594459407464406, + "flos": 522407038464.0, + "grad_norm": 0.0561052231541492, + "language_loss": 0.96778214, + "learning_rate": 0.0009993633336145287, + "loss": 0.97914726, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.26977539, + "step": 237, + "time_per_iteration": 2.638850688934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130626, + "balance_loss_mlp": 1.10363674, + "epoch": 0.04578684109272797, + "flos": 671442338304.0, + "grad_norm": 0.06334524880145487, + "language_loss": 1.0125159, + "learning_rate": 0.0009993475198598752, + "loss": 1.02382219, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.26989746, + "step": 238, + "time_per_iteration": 3.0008814334869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109235, + "balance_loss_mlp": 1.08395052, + "epoch": 0.04597922277799153, + "flos": 541387321344.0, + "grad_norm": 0.08922144233736891, + "language_loss": 0.97379184, + "learning_rate": 0.0009993315122413212, + "loss": 0.98488414, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.25305176, + "step": 239, + "time_per_iteration": 2.620474100112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121059, + "balance_loss_mlp": 1.09551263, + "epoch": 0.0461716044632551, + "flos": 458732616192.0, + "grad_norm": 0.09980166654849132, + "language_loss": 0.97848725, + "learning_rate": 0.0009993153107650818, + "loss": 0.98969781, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.25537109, + "step": 240, + "time_per_iteration": 2.5547702312469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111352, + "balance_loss_mlp": 1.08719897, + "epoch": 0.04636398614851866, + "flos": 455009342976.0, + "grad_norm": 0.09180653876933564, + "language_loss": 0.96700346, + "learning_rate": 0.0009992989154374468, + "loss": 0.97813869, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.2635498, + "step": 241, + "time_per_iteration": 2.5366051197052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105107, + "balance_loss_mlp": 1.07833242, + "epoch": 0.046556367833782225, + "flos": 556558401024.0, + "grad_norm": 0.07962621760937992, + "language_loss": 1.03585958, + "learning_rate": 0.0009992823262647817, + "loss": 1.04691052, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26782227, + "step": 242, + "time_per_iteration": 2.726482391357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100596, + "balance_loss_mlp": 1.07384586, + "epoch": 0.046748749519045785, + "flos": 592625949696.0, + "grad_norm": 0.0814561151731407, + "language_loss": 0.97787237, + "learning_rate": 0.0009992655432535264, + "loss": 0.98887837, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.26782227, + "step": 243, + "time_per_iteration": 2.765273332595825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098204, + "balance_loss_mlp": 1.07214487, + "epoch": 0.04694113120430935, + "flos": 569596506624.0, + "grad_norm": 0.0750228199707575, + "language_loss": 0.98452473, + "learning_rate": 0.0009992485664101973, + "loss": 0.99550676, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.26037598, + "step": 244, + "time_per_iteration": 2.696781635284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_mlp": 1.08732188, + "epoch": 0.04713351288957291, + "flos": 863401158144.0, + "grad_norm": 0.08629455000399752, + "language_loss": 1.00806224, + "learning_rate": 0.000999231395741385, + "loss": 1.01922584, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.2902832, + "step": 245, + "time_per_iteration": 3.1403207778930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117221, + "balance_loss_mlp": 1.08958876, + "epoch": 0.04732589457483648, + "flos": 536961249792.0, + "grad_norm": 0.07729478564770192, + "language_loss": 0.986202, + "learning_rate": 0.0009992140312537557, + "loss": 0.99737418, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.27661133, + "step": 246, + "time_per_iteration": 2.7038867473602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111137, + "balance_loss_mlp": 1.08410013, + "epoch": 0.04751827626010004, + "flos": 761566910976.0, + "grad_norm": 0.08592122791377885, + "language_loss": 0.93525487, + "learning_rate": 0.000999196472954051, + "loss": 0.94636625, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.27050781, + "step": 247, + "time_per_iteration": 2.9575722217559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0471772, + "balance_loss_mlp": 4.51020002, + "epoch": 0.0477106579453636, + "flos": 1578961313280.0, + "grad_norm": 0.4683520251238934, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.84142572, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 2.078125, + "step": 248, + "time_per_iteration": 5.452638387680054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200681, + "balance_loss_mlp": 1.17252362, + "epoch": 0.04790303963062716, + "flos": 457535195136.0, + "grad_norm": 0.13106789232715058, + "language_loss": 1.01118052, + "learning_rate": 0.0009991607749457578, + "loss": 1.02318728, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.28173828, + "step": 249, + "time_per_iteration": 2.5066423416137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256525, + "balance_loss_mlp": 1.22541094, + "epoch": 0.04809542131589073, + "flos": 782079266304.0, + "grad_norm": 0.1327983626735717, + "language_loss": 0.98959935, + "learning_rate": 0.0009991426352510286, + "loss": 1.0021646, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.31103516, + "step": 250, + "time_per_iteration": 3.0130999088287354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250303, + "balance_loss_mlp": 1.22023845, + "epoch": 0.04828780300115429, + "flos": 558995503104.0, + "grad_norm": 0.11435576550904086, + "language_loss": 1.00191545, + "learning_rate": 0.0009991243017719422, + "loss": 1.01441836, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.30053711, + "step": 251, + "time_per_iteration": 2.6584134101867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190864, + "balance_loss_mlp": 1.16108572, + "epoch": 0.048480184686417856, + "flos": 501682277376.0, + "grad_norm": 0.08343855539664048, + "language_loss": 0.94829702, + "learning_rate": 0.0009991057745156165, + "loss": 0.96020567, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.29760742, + "step": 252, + "time_per_iteration": 2.6125926971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03043524, + "balance_loss_mlp": 2.97905564, + "epoch": 0.048672566371681415, + "flos": 1535585430528.0, + "grad_norm": 0.48807257564671885, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.84954512, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 0.64453125, + "step": 253, + "time_per_iteration": 5.0318169593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205448, + "balance_loss_mlp": 1.17426276, + "epoch": 0.04886494805694498, + "flos": 537665458176.0, + "grad_norm": 0.15081419889398517, + "language_loss": 1.02692831, + "learning_rate": 0.0009990681387000943, + "loss": 1.03898275, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.31152344, + "step": 254, + "time_per_iteration": 2.7569847106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231579, + "balance_loss_mlp": 1.20053661, + "epoch": 0.04905732974220854, + "flos": 679841966592.0, + "grad_norm": 0.10308088004196624, + "language_loss": 0.98562324, + "learning_rate": 0.0009990490301555093, + "loss": 0.99793905, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.31054688, + "step": 255, + "time_per_iteration": 2.9826152324676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01973911, + "balance_loss_mlp": 1.89609146, + "epoch": 0.04924971142747211, + "flos": 1420408949760.0, + "grad_norm": 0.14603633134579833, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.8118906, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.77734375, + "step": 256, + "time_per_iteration": 4.873262643814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01994546, + "balance_loss_mlp": 1.91596293, + "epoch": 0.04944209311273567, + "flos": 1557202074624.0, + "grad_norm": 0.1290240934598827, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.81237286, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.78515625, + "step": 257, + "time_per_iteration": 4.981585502624512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01945028, + "balance_loss_mlp": 1.87979627, + "epoch": 0.04963447479799923, + "flos": 1569985514496.0, + "grad_norm": 0.10634084131038181, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71920907, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.65234375, + "step": 258, + "time_per_iteration": 4.869063138961792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231874, + "balance_loss_mlp": 1.20192897, + "epoch": 0.049826856483262794, + "flos": 625063357440.0, + "grad_norm": 0.1721871775998346, + "language_loss": 0.93400717, + "learning_rate": 0.0009989706585723202, + "loss": 0.9463259, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.29956055, + "step": 259, + "time_per_iteration": 2.828618049621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226271, + "balance_loss_mlp": 1.1963017, + "epoch": 0.05001923816852635, + "flos": 503912765952.0, + "grad_norm": 0.13941406884376095, + "language_loss": 0.9926306, + "learning_rate": 0.0009989505813633442, + "loss": 1.0048933, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.29931641, + "step": 260, + "time_per_iteration": 2.7033097743988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167993, + "balance_loss_mlp": 1.13833416, + "epoch": 0.05021161985378992, + "flos": 587066476032.0, + "grad_norm": 0.078052738900574, + "language_loss": 0.99695522, + "learning_rate": 0.000998930310444573, + "loss": 1.00863528, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.29663086, + "step": 261, + "time_per_iteration": 2.739182949066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120344, + "balance_loss_mlp": 1.09104276, + "epoch": 0.05040400153905348, + "flos": 633029409792.0, + "grad_norm": 0.10502347912179442, + "language_loss": 0.97120214, + "learning_rate": 0.0009989098458238765, + "loss": 0.98240554, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.29296875, + "step": 262, + "time_per_iteration": 2.81984806060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109888, + "balance_loss_mlp": 1.07910872, + "epoch": 0.050596383224317046, + "flos": 553344307200.0, + "grad_norm": 0.1022419163820973, + "language_loss": 0.96531391, + "learning_rate": 0.0009988891875091998, + "loss": 0.97641277, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.30761719, + "step": 263, + "time_per_iteration": 2.816471576690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119537, + "balance_loss_mlp": 1.08949661, + "epoch": 0.050788764909580605, + "flos": 549389689344.0, + "grad_norm": 0.07930699495869925, + "language_loss": 0.91512978, + "learning_rate": 0.0009988683355085636, + "loss": 0.92632508, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.30004883, + "step": 264, + "time_per_iteration": 2.7963876724243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116935, + "balance_loss_mlp": 1.1386174, + "epoch": 0.05098114659484417, + "flos": 604812870144.0, + "grad_norm": 0.1164382368145933, + "language_loss": 1.00062299, + "learning_rate": 0.000998847289830063, + "loss": 1.01231647, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.30688477, + "step": 265, + "time_per_iteration": 2.8219666481018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180582, + "balance_loss_mlp": 1.14922965, + "epoch": 0.05117352828010773, + "flos": 438317775360.0, + "grad_norm": 0.14769195776656788, + "language_loss": 0.92838919, + "learning_rate": 0.0009988260504818682, + "loss": 0.94019508, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.31323242, + "step": 266, + "time_per_iteration": 2.527987480163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159175, + "balance_loss_mlp": 1.12753642, + "epoch": 0.0513659099653713, + "flos": 504784300032.0, + "grad_norm": 0.1223822648996979, + "language_loss": 0.99088645, + "learning_rate": 0.000998804617472226, + "loss": 1.00247824, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.31616211, + "step": 267, + "time_per_iteration": 2.6469640731811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129085, + "balance_loss_mlp": 1.09735131, + "epoch": 0.05155829165063486, + "flos": 695183344128.0, + "grad_norm": 0.09065118463065669, + "language_loss": 0.94319087, + "learning_rate": 0.0009987829908094568, + "loss": 0.95448172, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.31713867, + "step": 268, + "time_per_iteration": 2.821777105331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131208, + "balance_loss_mlp": 1.10014248, + "epoch": 0.051750673335898424, + "flos": 1347751830528.0, + "grad_norm": 0.11182301329739544, + "language_loss": 1.00247467, + "learning_rate": 0.0009987611705019569, + "loss": 1.01378679, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.31030273, + "step": 269, + "time_per_iteration": 4.288902521133423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117374, + "balance_loss_mlp": 1.08509207, + "epoch": 0.051943055021161984, + "flos": 489362936832.0, + "grad_norm": 0.06856601771993416, + "language_loss": 0.99786204, + "learning_rate": 0.0009987391565581978, + "loss": 1.00903583, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.32275391, + "step": 270, + "time_per_iteration": 2.634683132171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119573, + "balance_loss_mlp": 1.08681393, + "epoch": 0.05213543670642555, + "flos": 545504882688.0, + "grad_norm": 0.08930504281721281, + "language_loss": 0.92515171, + "learning_rate": 0.000998716948986726, + "loss": 0.93634748, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.32763672, + "step": 271, + "time_per_iteration": 2.7899389266967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120606, + "balance_loss_mlp": 1.08970654, + "epoch": 0.05232781839168911, + "flos": 603285179904.0, + "grad_norm": 0.10701715244821809, + "language_loss": 0.94677854, + "learning_rate": 0.0009986945477961633, + "loss": 0.95798463, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.30859375, + "step": 272, + "time_per_iteration": 2.703118324279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108581, + "balance_loss_mlp": 1.07789683, + "epoch": 0.052520200076952676, + "flos": 538218307584.0, + "grad_norm": 0.050944004487463904, + "language_loss": 1.00078344, + "learning_rate": 0.0009986719529952066, + "loss": 1.01186931, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.30639648, + "step": 273, + "time_per_iteration": 2.85548734664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097085, + "balance_loss_mlp": 1.06668699, + "epoch": 0.052712581762216236, + "flos": 463148513280.0, + "grad_norm": 0.06235958359183371, + "language_loss": 0.99016273, + "learning_rate": 0.000998649164592628, + "loss": 1.00113368, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.3034668, + "step": 274, + "time_per_iteration": 2.5841050148010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104755, + "balance_loss_mlp": 1.07507145, + "epoch": 0.0529049634474798, + "flos": 547749927936.0, + "grad_norm": 0.10062534885586208, + "language_loss": 0.96764064, + "learning_rate": 0.0009986261825972748, + "loss": 0.97868812, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.29663086, + "step": 275, + "time_per_iteration": 2.6752514839172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107504, + "balance_loss_mlp": 1.07798743, + "epoch": 0.05309734513274336, + "flos": 617727320064.0, + "grad_norm": 0.08071716286169645, + "language_loss": 0.98941195, + "learning_rate": 0.000998603007018069, + "loss": 1.00048697, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.29541016, + "step": 276, + "time_per_iteration": 2.8236005306243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118213, + "balance_loss_mlp": 1.08767152, + "epoch": 0.05328972681800693, + "flos": 605220304896.0, + "grad_norm": 0.07622563991542974, + "language_loss": 0.96909779, + "learning_rate": 0.0009985796378640089, + "loss": 0.98027998, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.30517578, + "step": 277, + "time_per_iteration": 2.7089598178863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110773, + "balance_loss_mlp": 1.07940567, + "epoch": 0.05348210850327049, + "flos": 604197411840.0, + "grad_norm": 0.07841820465234402, + "language_loss": 0.95740211, + "learning_rate": 0.0009985560751441665, + "loss": 0.96847939, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.28320312, + "step": 278, + "time_per_iteration": 2.834015369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108783, + "balance_loss_mlp": 1.07831299, + "epoch": 0.053674490188534055, + "flos": 630480236544.0, + "grad_norm": 0.07361828218816212, + "language_loss": 0.9799974, + "learning_rate": 0.00099853231886769, + "loss": 0.99108523, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.30444336, + "step": 279, + "time_per_iteration": 2.8200764656066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108628, + "balance_loss_mlp": 1.07937431, + "epoch": 0.053866871873797614, + "flos": 478939433472.0, + "grad_norm": 0.07512382427920342, + "language_loss": 0.98746061, + "learning_rate": 0.0009985083690438024, + "loss": 0.99854696, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.29223633, + "step": 280, + "time_per_iteration": 2.75639271736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113716, + "balance_loss_mlp": 1.08310306, + "epoch": 0.054059253559061174, + "flos": 787673645568.0, + "grad_norm": 0.09326847112688041, + "language_loss": 0.89231437, + "learning_rate": 0.0009984842256818016, + "loss": 0.90345156, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.3059082, + "step": 281, + "time_per_iteration": 3.0839526653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121888, + "balance_loss_mlp": 1.09182298, + "epoch": 0.05425163524432474, + "flos": 628076630016.0, + "grad_norm": 0.062071298051891176, + "language_loss": 0.99695373, + "learning_rate": 0.0009984598887910613, + "loss": 1.00817263, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.30029297, + "step": 282, + "time_per_iteration": 2.7197024822235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123523, + "balance_loss_mlp": 1.09283888, + "epoch": 0.0544440169295883, + "flos": 615453161472.0, + "grad_norm": 0.08448232068887077, + "language_loss": 0.95169044, + "learning_rate": 0.0009984353583810297, + "loss": 0.96292561, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.30664062, + "step": 283, + "time_per_iteration": 2.8440537452697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127605, + "balance_loss_mlp": 1.09811282, + "epoch": 0.05463639861485187, + "flos": 647471549952.0, + "grad_norm": 0.07597313108733957, + "language_loss": 0.97190034, + "learning_rate": 0.0009984106344612302, + "loss": 0.98317641, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.29492188, + "step": 284, + "time_per_iteration": 2.7592926025390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139329, + "balance_loss_mlp": 1.10843039, + "epoch": 0.054828780300115426, + "flos": 796845883392.0, + "grad_norm": 0.08116128158624439, + "language_loss": 0.93187618, + "learning_rate": 0.0009983857170412615, + "loss": 0.94326949, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.30859375, + "step": 285, + "time_per_iteration": 2.99845027923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151704, + "balance_loss_mlp": 1.12080526, + "epoch": 0.05502116198537899, + "flos": 549414420480.0, + "grad_norm": 0.07339397608587311, + "language_loss": 0.92728812, + "learning_rate": 0.000998360606130798, + "loss": 0.93880516, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.30859375, + "step": 286, + "time_per_iteration": 2.835510492324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.020519, + "balance_loss_mlp": 2.03492451, + "epoch": 0.05521354367064255, + "flos": 1406967791616.0, + "grad_norm": 0.132236598943482, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71125019, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.16992188, + "step": 287, + "time_per_iteration": 4.860529184341431 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144586, + "balance_loss_mlp": 1.11304367, + "epoch": 0.05540592535590612, + "flos": 645123197952.0, + "grad_norm": 0.09086643312306038, + "language_loss": 0.98494267, + "learning_rate": 0.0009983098038774552, + "loss": 0.99638855, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.31518555, + "step": 288, + "time_per_iteration": 2.7743642330169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0188948, + "balance_loss_mlp": 1.87336278, + "epoch": 0.05559830704116968, + "flos": 1510293413376.0, + "grad_norm": 0.09551417356683237, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.80059707, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.16113281, + "step": 289, + "time_per_iteration": 4.792251348495483 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132847, + "balance_loss_mlp": 1.10242462, + "epoch": 0.055790688726433245, + "flos": 508078379520.0, + "grad_norm": 0.0647793178171594, + "language_loss": 0.95675349, + "learning_rate": 0.0009982582277800948, + "loss": 0.96808195, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.30371094, + "step": 290, + "time_per_iteration": 2.6280908584594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129981, + "balance_loss_mlp": 1.09931993, + "epoch": 0.055983070411696804, + "flos": 657570576384.0, + "grad_norm": 0.06216394577533418, + "language_loss": 1.02967191, + "learning_rate": 0.0009982321495648908, + "loss": 1.04097176, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.30639648, + "step": 291, + "time_per_iteration": 2.823817491531372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152465, + "balance_loss_mlp": 1.11880052, + "epoch": 0.05617545209696037, + "flos": 587051919360.0, + "grad_norm": 0.0720353654192766, + "language_loss": 0.94905466, + "learning_rate": 0.0009982058779188115, + "loss": 0.96057928, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.33666992, + "step": 292, + "time_per_iteration": 2.716226577758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143466, + "balance_loss_mlp": 1.11175609, + "epoch": 0.05636783378222393, + "flos": 611331217920.0, + "grad_norm": 0.0752196942414692, + "language_loss": 1.02053797, + "learning_rate": 0.0009981794128520567, + "loss": 1.03197265, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.31689453, + "step": 293, + "time_per_iteration": 2.80366587638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140969, + "balance_loss_mlp": 1.10878265, + "epoch": 0.0565602154674875, + "flos": 667847102976.0, + "grad_norm": 0.08694547176554791, + "language_loss": 0.9927811, + "learning_rate": 0.000998152754374901, + "loss": 1.0041908, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.32202148, + "step": 294, + "time_per_iteration": 2.8787214756011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125522, + "balance_loss_mlp": 1.09493268, + "epoch": 0.05675259715275106, + "flos": 616963474944.0, + "grad_norm": 0.06320951422559969, + "language_loss": 0.95261526, + "learning_rate": 0.0009981259024976943, + "loss": 0.96387053, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.30566406, + "step": 295, + "time_per_iteration": 2.709763765335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130922, + "balance_loss_mlp": 1.1013341, + "epoch": 0.05694497883801462, + "flos": 751424214528.0, + "grad_norm": 0.09363516749561916, + "language_loss": 0.92460728, + "learning_rate": 0.0009980988572308612, + "loss": 0.93591654, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.2956543, + "step": 296, + "time_per_iteration": 2.975036859512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106731, + "balance_loss_mlp": 1.07781124, + "epoch": 0.05713736052327818, + "flos": 711669708288.0, + "grad_norm": 0.09684297288520326, + "language_loss": 0.95852935, + "learning_rate": 0.0009980716185849015, + "loss": 0.96959662, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.28881836, + "step": 297, + "time_per_iteration": 2.9913201332092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121697, + "balance_loss_mlp": 1.09196591, + "epoch": 0.05732974220854175, + "flos": 468737100288.0, + "grad_norm": 0.06404931541311756, + "language_loss": 0.92133576, + "learning_rate": 0.0009980441865703904, + "loss": 0.9325527, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.29711914, + "step": 298, + "time_per_iteration": 2.660911798477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118174, + "balance_loss_mlp": 1.08896804, + "epoch": 0.05752212389380531, + "flos": 601143441408.0, + "grad_norm": 0.07725734784298466, + "language_loss": 1.00405884, + "learning_rate": 0.000998016561197978, + "loss": 1.01524067, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.29150391, + "step": 299, + "time_per_iteration": 2.7028987407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115684, + "balance_loss_mlp": 1.0875026, + "epoch": 0.057714505579068875, + "flos": 678344799744.0, + "grad_norm": 0.0924919324941274, + "language_loss": 0.92369866, + "learning_rate": 0.0009979887424783895, + "loss": 0.93485552, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.28173828, + "step": 300, + "time_per_iteration": 2.920323610305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121145, + "balance_loss_mlp": 1.09222448, + "epoch": 0.057906887264332435, + "flos": 595604316672.0, + "grad_norm": 0.08285851214595771, + "language_loss": 0.91748977, + "learning_rate": 0.0009979607304224248, + "loss": 0.92870122, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.2890625, + "step": 301, + "time_per_iteration": 2.725109815597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124265, + "balance_loss_mlp": 1.09512997, + "epoch": 0.058099268949596, + "flos": 551855904768.0, + "grad_norm": 0.08389393001078431, + "language_loss": 0.98122084, + "learning_rate": 0.000997932525040959, + "loss": 0.99246347, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.29101562, + "step": 302, + "time_per_iteration": 2.6472513675689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102086, + "balance_loss_mlp": 1.07419097, + "epoch": 0.05829165063485956, + "flos": 507906671616.0, + "grad_norm": 0.09664842170862178, + "language_loss": 1.00482607, + "learning_rate": 0.000997904126344943, + "loss": 1.01584697, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.27880859, + "step": 303, + "time_per_iteration": 2.6413466930389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108073, + "balance_loss_mlp": 1.07920086, + "epoch": 0.05848403232012313, + "flos": 614949774336.0, + "grad_norm": 0.07742483031734765, + "language_loss": 0.96304786, + "learning_rate": 0.0009978755343454018, + "loss": 0.9741286, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.28881836, + "step": 304, + "time_per_iteration": 2.7825212478637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108843, + "balance_loss_mlp": 1.0789448, + "epoch": 0.05867641400538669, + "flos": 499835902464.0, + "grad_norm": 0.09214287188489759, + "language_loss": 0.97051907, + "learning_rate": 0.0009978467490534355, + "loss": 0.98160744, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.29858398, + "step": 305, + "time_per_iteration": 2.625734806060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105646, + "balance_loss_mlp": 1.0759151, + "epoch": 0.05886879569065025, + "flos": 531019072512.0, + "grad_norm": 0.07804737007565601, + "language_loss": 0.94819117, + "learning_rate": 0.00099781777048022, + "loss": 0.95924759, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.296875, + "step": 306, + "time_per_iteration": 2.71183180809021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095659, + "balance_loss_mlp": 1.06554723, + "epoch": 0.05906117737591381, + "flos": 488811497472.0, + "grad_norm": 0.08882969665455022, + "language_loss": 0.96051329, + "learning_rate": 0.0009977885986370057, + "loss": 0.97146988, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.30126953, + "step": 307, + "time_per_iteration": 2.551680088043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101228, + "balance_loss_mlp": 1.0711869, + "epoch": 0.05925355906117737, + "flos": 591213150720.0, + "grad_norm": 0.07969081592203556, + "language_loss": 0.92546368, + "learning_rate": 0.000997759233535118, + "loss": 0.93647587, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.30029297, + "step": 308, + "time_per_iteration": 2.81258487701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118732, + "balance_loss_mlp": 1.08861959, + "epoch": 0.05944594074644094, + "flos": 563373522432.0, + "grad_norm": 0.08786467203130244, + "language_loss": 0.97749913, + "learning_rate": 0.0009977296751859576, + "loss": 0.98868644, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.30102539, + "step": 309, + "time_per_iteration": 2.7263362407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105319, + "balance_loss_mlp": 1.07611227, + "epoch": 0.0596383224317045, + "flos": 538483147776.0, + "grad_norm": 0.06446924521708428, + "language_loss": 1.00202072, + "learning_rate": 0.0009976999236009998, + "loss": 1.01307392, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.29174805, + "step": 310, + "time_per_iteration": 2.762798309326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104, + "balance_loss_mlp": 1.0751754, + "epoch": 0.059830704116968066, + "flos": 560684726784.0, + "grad_norm": 0.07707725190270151, + "language_loss": 1.00980616, + "learning_rate": 0.0009976699787917955, + "loss": 1.02084613, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.2878418, + "step": 311, + "time_per_iteration": 2.681075096130371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02018517, + "balance_loss_mlp": 1.99772644, + "epoch": 0.060023085802231625, + "flos": 1569759962112.0, + "grad_norm": 0.13809188064678232, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.75461507, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.20800781, + "step": 312, + "time_per_iteration": 4.931787014007568 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115143, + "balance_loss_mlp": 1.08445871, + "epoch": 0.06021546748749519, + "flos": 482415395328.0, + "grad_norm": 0.08749443672960691, + "language_loss": 0.93570709, + "learning_rate": 0.0009976095095472243, + "loss": 0.94685858, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.30688477, + "step": 313, + "time_per_iteration": 2.5869529247283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101349, + "balance_loss_mlp": 1.07152247, + "epoch": 0.06040784917275875, + "flos": 619889407488.0, + "grad_norm": 0.1052711311589574, + "language_loss": 0.94373065, + "learning_rate": 0.0009975789851353334, + "loss": 0.95474416, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.29785156, + "step": 314, + "time_per_iteration": 2.825021505355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091881, + "balance_loss_mlp": 1.06434321, + "epoch": 0.06060023085802232, + "flos": 483292721664.0, + "grad_norm": 0.0790023799752532, + "language_loss": 0.96930784, + "learning_rate": 0.0009975482675461487, + "loss": 0.98022664, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.27563477, + "step": 315, + "time_per_iteration": 2.657176971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092449, + "balance_loss_mlp": 1.06493592, + "epoch": 0.06079261254328588, + "flos": 581620483584.0, + "grad_norm": 0.08103250083402935, + "language_loss": 0.94523442, + "learning_rate": 0.0009975173567915952, + "loss": 0.95615894, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.27502441, + "step": 316, + "time_per_iteration": 2.7485179901123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087945, + "balance_loss_mlp": 1.06031179, + "epoch": 0.060984994228549444, + "flos": 687492306432.0, + "grad_norm": 0.09749512289660646, + "language_loss": 0.88217789, + "learning_rate": 0.000997486252883674, + "loss": 0.89305735, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.27685547, + "step": 317, + "time_per_iteration": 2.848203420639038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083831, + "balance_loss_mlp": 1.05665123, + "epoch": 0.061177375913813004, + "flos": 1314284327424.0, + "grad_norm": 0.0666962391969605, + "language_loss": 0.94262481, + "learning_rate": 0.0009974549558344602, + "loss": 0.95346314, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.27197266, + "step": 318, + "time_per_iteration": 3.6451311111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095985, + "balance_loss_mlp": 1.06921029, + "epoch": 0.06136975759907657, + "flos": 574072040448.0, + "grad_norm": 0.08376464388690433, + "language_loss": 1.02536392, + "learning_rate": 0.000997423465656105, + "loss": 1.03632367, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.26831055, + "step": 319, + "time_per_iteration": 2.7345075607299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091659, + "balance_loss_mlp": 1.06395483, + "epoch": 0.06156213928434013, + "flos": 527281242624.0, + "grad_norm": 0.0893807265100656, + "language_loss": 1.00347686, + "learning_rate": 0.0009973917823608335, + "loss": 1.01439345, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.27734375, + "step": 320, + "time_per_iteration": 2.59576153755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092889, + "balance_loss_mlp": 1.0656141, + "epoch": 0.061754520969603696, + "flos": 495238123008.0, + "grad_norm": 0.0805868867251315, + "language_loss": 0.95831037, + "learning_rate": 0.0009973599059609462, + "loss": 0.96923929, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.27294922, + "step": 321, + "time_per_iteration": 2.7188515663146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098538, + "balance_loss_mlp": 1.07090497, + "epoch": 0.061946902654867256, + "flos": 439839673344.0, + "grad_norm": 0.07327098118113982, + "language_loss": 0.93067813, + "learning_rate": 0.000997327836468819, + "loss": 0.94166344, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.27685547, + "step": 322, + "time_per_iteration": 2.6020476818084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112868, + "balance_loss_mlp": 1.08469939, + "epoch": 0.06213928434013082, + "flos": 598490961408.0, + "grad_norm": 0.08699924077148347, + "language_loss": 0.95677376, + "learning_rate": 0.000997295573896902, + "loss": 0.96790254, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.28137207, + "step": 323, + "time_per_iteration": 2.829726457595825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01600081, + "balance_loss_mlp": 1.58253336, + "epoch": 0.06233166602539438, + "flos": 1449393716736.0, + "grad_norm": 0.0733345350087818, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.82796121, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.17578125, + "step": 324, + "time_per_iteration": 4.711047410964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01522296, + "balance_loss_mlp": 1.50503409, + "epoch": 0.06252404771065795, + "flos": 1462504453632.0, + "grad_norm": 0.05691363452686859, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80094236, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.17285156, + "step": 325, + "time_per_iteration": 4.9186623096466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221563, + "balance_loss_mlp": 1.19023478, + "epoch": 0.06271642939592151, + "flos": 464059335168.0, + "grad_norm": 0.14041524981394118, + "language_loss": 0.90815508, + "learning_rate": 0.000997197627828043, + "loss": 0.9203707, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.31323242, + "step": 326, + "time_per_iteration": 2.5453081130981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200774, + "balance_loss_mlp": 1.17032802, + "epoch": 0.06290881108118507, + "flos": 532111776768.0, + "grad_norm": 0.12119005069833769, + "language_loss": 0.85965139, + "learning_rate": 0.0009971645930629716, + "loss": 0.87165916, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.30419922, + "step": 327, + "time_per_iteration": 2.7031009197235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169691, + "balance_loss_mlp": 1.13969803, + "epoch": 0.06310119276644863, + "flos": 673262572032.0, + "grad_norm": 0.07816671551275867, + "language_loss": 0.99088198, + "learning_rate": 0.0009971313652814872, + "loss": 1.00257885, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.29956055, + "step": 328, + "time_per_iteration": 2.8222203254699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157161, + "balance_loss_mlp": 1.12542796, + "epoch": 0.0632935744517122, + "flos": 770404497408.0, + "grad_norm": 0.09350719298211221, + "language_loss": 0.96469927, + "learning_rate": 0.0009970979444964903, + "loss": 0.97627091, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.31713867, + "step": 329, + "time_per_iteration": 2.965010643005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142135, + "balance_loss_mlp": 1.11214232, + "epoch": 0.06348595613697576, + "flos": 561649393152.0, + "grad_norm": 0.10929900711039164, + "language_loss": 0.9773742, + "learning_rate": 0.0009970643307209556, + "loss": 0.98879552, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.29980469, + "step": 330, + "time_per_iteration": 2.816967248916626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122149, + "balance_loss_mlp": 1.09279943, + "epoch": 0.06367833782223932, + "flos": 675891730944.0, + "grad_norm": 0.09151857562667157, + "language_loss": 0.94555062, + "learning_rate": 0.0009970305239679334, + "loss": 0.95677209, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.29321289, + "step": 331, + "time_per_iteration": 2.8171606063842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103955, + "balance_loss_mlp": 1.07594109, + "epoch": 0.06387071950750288, + "flos": 495035891712.0, + "grad_norm": 0.0852127129346853, + "language_loss": 0.98894572, + "learning_rate": 0.0009969965242505483, + "loss": 0.99998534, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.28027344, + "step": 332, + "time_per_iteration": 2.663892984390259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110112, + "balance_loss_mlp": 1.08111989, + "epoch": 0.06406310119276645, + "flos": 533170985472.0, + "grad_norm": 0.06505292490812643, + "language_loss": 0.94837928, + "learning_rate": 0.0009969623315820007, + "loss": 0.9594804, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.28979492, + "step": 333, + "time_per_iteration": 2.7053513526916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100319, + "balance_loss_mlp": 1.07256722, + "epoch": 0.06425548287803001, + "flos": 455940513792.0, + "grad_norm": 0.09842187194277592, + "language_loss": 0.95016736, + "learning_rate": 0.000996927945975565, + "loss": 0.96117055, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.27758789, + "step": 334, + "time_per_iteration": 2.599308490753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113066, + "balance_loss_mlp": 1.08405077, + "epoch": 0.06444786456329357, + "flos": 559817574912.0, + "grad_norm": 0.0758688902805758, + "language_loss": 0.9173829, + "learning_rate": 0.0009968933674445906, + "loss": 0.92851353, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.29003906, + "step": 335, + "time_per_iteration": 2.6885735988616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117272, + "balance_loss_mlp": 1.08863783, + "epoch": 0.06464024624855713, + "flos": 665769383424.0, + "grad_norm": 0.08483114639707492, + "language_loss": 0.94787967, + "learning_rate": 0.0009968585960025028, + "loss": 0.95905232, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.28613281, + "step": 336, + "time_per_iteration": 3.0145304203033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01664619, + "balance_loss_mlp": 1.64468718, + "epoch": 0.0648326279338207, + "flos": 1520578704384.0, + "grad_norm": 0.07989076612991787, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.79317814, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.19921875, + "step": 337, + "time_per_iteration": 4.812415361404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113857, + "balance_loss_mlp": 1.08729684, + "epoch": 0.06502500961908426, + "flos": 1142872768512.0, + "grad_norm": 0.10710041073234706, + "language_loss": 0.93311036, + "learning_rate": 0.0009967884744390583, + "loss": 0.94424891, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.26611328, + "step": 338, + "time_per_iteration": 3.551198959350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010994, + "balance_loss_mlp": 1.07226825, + "epoch": 0.06521739130434782, + "flos": 582339248640.0, + "grad_norm": 0.09192445713744875, + "language_loss": 0.93620086, + "learning_rate": 0.0009967531243449256, + "loss": 0.94719481, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.27148438, + "step": 339, + "time_per_iteration": 2.659802198410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093825, + "balance_loss_mlp": 1.06592965, + "epoch": 0.06540977298961138, + "flos": 497398800384.0, + "grad_norm": 0.08159898153834201, + "language_loss": 1.01212323, + "learning_rate": 0.000996717581394126, + "loss": 1.02306151, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.27905273, + "step": 340, + "time_per_iteration": 2.570789337158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085985, + "balance_loss_mlp": 1.05887651, + "epoch": 0.06560215467487496, + "flos": 542613855744.0, + "grad_norm": 0.08632134404445381, + "language_loss": 1.01338696, + "learning_rate": 0.000996681845600459, + "loss": 1.02424693, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.27124023, + "step": 341, + "time_per_iteration": 2.676576852798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092801, + "balance_loss_mlp": 1.06526327, + "epoch": 0.06579453636013852, + "flos": 413230961664.0, + "grad_norm": 0.09337377055156564, + "language_loss": 0.93410671, + "learning_rate": 0.0009966459169777982, + "loss": 0.94503474, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.27563477, + "step": 342, + "time_per_iteration": 2.5015692710876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093959, + "balance_loss_mlp": 1.06565928, + "epoch": 0.06598691804540208, + "flos": 560354457600.0, + "grad_norm": 0.06741983677161045, + "language_loss": 1.02151966, + "learning_rate": 0.0009966097955400924, + "loss": 1.03245926, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.28320312, + "step": 343, + "time_per_iteration": 2.679197311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108203, + "balance_loss_mlp": 1.054111, + "epoch": 0.06617929973066564, + "flos": 571789117440.0, + "grad_norm": 0.10243167176705169, + "language_loss": 0.95901835, + "learning_rate": 0.0009965734813013652, + "loss": 0.96983862, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.27954102, + "step": 344, + "time_per_iteration": 2.7926278114318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094237, + "balance_loss_mlp": 1.06638968, + "epoch": 0.06637168141592921, + "flos": 490234470912.0, + "grad_norm": 0.07573309355987462, + "language_loss": 0.97904384, + "learning_rate": 0.0009965369742757151, + "loss": 0.98998624, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.27856445, + "step": 345, + "time_per_iteration": 2.5709216594696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094661, + "balance_loss_mlp": 1.06564522, + "epoch": 0.06656406310119277, + "flos": 1078735656960.0, + "grad_norm": 0.07452264052062355, + "language_loss": 0.94766545, + "learning_rate": 0.0009965002744773152, + "loss": 0.95861208, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.28979492, + "step": 346, + "time_per_iteration": 3.500114679336548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110333, + "balance_loss_mlp": 1.0740993, + "epoch": 0.06675644478645633, + "flos": 513421065216.0, + "grad_norm": 0.06770544307121987, + "language_loss": 0.92343372, + "learning_rate": 0.0009964633819204139, + "loss": 0.93446708, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.29223633, + "step": 347, + "time_per_iteration": 2.660534143447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01495519, + "balance_loss_mlp": 1.47739971, + "epoch": 0.06694882647171989, + "flos": 1446359943168.0, + "grad_norm": 0.07316018638585145, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83296633, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.18164062, + "step": 348, + "time_per_iteration": 4.936125040054321 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01453408, + "balance_loss_mlp": 1.43557465, + "epoch": 0.06714120815698346, + "flos": 1551230784000.0, + "grad_norm": 0.05966333264944154, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76607287, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.17871094, + "step": 349, + "time_per_iteration": 4.916368722915649 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121413, + "balance_loss_mlp": 1.09161115, + "epoch": 0.06733358984224702, + "flos": 879689673216.0, + "grad_norm": 0.09818918049538049, + "language_loss": 0.91932184, + "learning_rate": 0.000996351547842304, + "loss": 0.93053597, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.29760742, + "step": 350, + "time_per_iteration": 3.1482698917388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116118, + "balance_loss_mlp": 1.08686399, + "epoch": 0.06752597152751058, + "flos": 518654651904.0, + "grad_norm": 0.08574695638310478, + "language_loss": 0.9006294, + "learning_rate": 0.0009963138843953744, + "loss": 0.91179061, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.29223633, + "step": 351, + "time_per_iteration": 2.6286985874176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112662, + "balance_loss_mlp": 1.09572136, + "epoch": 0.06771835321277414, + "flos": 539366266368.0, + "grad_norm": 0.062103550545623463, + "language_loss": 0.94588864, + "learning_rate": 0.000996276028262306, + "loss": 0.95715487, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.30859375, + "step": 352, + "time_per_iteration": 2.8076047897338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118319, + "balance_loss_mlp": 1.08899331, + "epoch": 0.0679107348980377, + "flos": 460430604288.0, + "grad_norm": 0.08848881047736162, + "language_loss": 1.00543904, + "learning_rate": 0.0009962379794577964, + "loss": 1.01662219, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.29296875, + "step": 353, + "time_per_iteration": 2.6257643699645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126251, + "balance_loss_mlp": 1.09525669, + "epoch": 0.06810311658330127, + "flos": 635601752064.0, + "grad_norm": 0.07023516682391727, + "language_loss": 0.91387081, + "learning_rate": 0.000996199737996617, + "loss": 0.92513329, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.31005859, + "step": 354, + "time_per_iteration": 2.9115777015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108023, + "balance_loss_mlp": 1.07862616, + "epoch": 0.06829549826856483, + "flos": 464443448832.0, + "grad_norm": 0.10590106261560671, + "language_loss": 0.99111325, + "learning_rate": 0.0009961613038936149, + "loss": 1.00219345, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.29345703, + "step": 355, + "time_per_iteration": 2.632269859313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106903, + "balance_loss_mlp": 1.07848334, + "epoch": 0.06848787995382839, + "flos": 634335929856.0, + "grad_norm": 0.06351615461114794, + "language_loss": 0.92452097, + "learning_rate": 0.000996122677163711, + "loss": 0.93559003, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.28417969, + "step": 356, + "time_per_iteration": 2.8401455879211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116364, + "balance_loss_mlp": 1.08880246, + "epoch": 0.06868026163909195, + "flos": 806023913472.0, + "grad_norm": 0.08494375059258584, + "language_loss": 0.98204505, + "learning_rate": 0.000996083857821902, + "loss": 0.99320877, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.27612305, + "step": 357, + "time_per_iteration": 3.0309877395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123871, + "balance_loss_mlp": 1.09387815, + "epoch": 0.06887264332435553, + "flos": 438997252608.0, + "grad_norm": 0.09643576242322613, + "language_loss": 0.95811963, + "learning_rate": 0.0009960448458832588, + "loss": 0.96935833, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.30004883, + "step": 358, + "time_per_iteration": 2.6974990367889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119311, + "balance_loss_mlp": 1.09053433, + "epoch": 0.06906502500961909, + "flos": 484513463808.0, + "grad_norm": 0.08018524599206517, + "language_loss": 0.95721531, + "learning_rate": 0.000996005641362927, + "loss": 0.96840835, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.28735352, + "step": 359, + "time_per_iteration": 2.589519739151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125244, + "balance_loss_mlp": 1.09663391, + "epoch": 0.06925740669488265, + "flos": 733293706752.0, + "grad_norm": 0.08939873306910956, + "language_loss": 0.98375708, + "learning_rate": 0.0009959662442761274, + "loss": 0.99500948, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.28613281, + "step": 360, + "time_per_iteration": 2.9202845096588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121734, + "balance_loss_mlp": 1.09360027, + "epoch": 0.0694497883801462, + "flos": 552127947264.0, + "grad_norm": 0.08129648248307358, + "language_loss": 0.92418718, + "learning_rate": 0.000995926654638155, + "loss": 0.93540448, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.28149414, + "step": 361, + "time_per_iteration": 2.807333469390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125246, + "balance_loss_mlp": 1.09706521, + "epoch": 0.06964217006540978, + "flos": 677708992512.0, + "grad_norm": 0.09207283388165423, + "language_loss": 0.94086993, + "learning_rate": 0.00099588687246438, + "loss": 0.95212233, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.28222656, + "step": 362, + "time_per_iteration": 2.841938018798828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144139, + "balance_loss_mlp": 1.1155293, + "epoch": 0.06983455175067334, + "flos": 523987163136.0, + "grad_norm": 0.09456174795196681, + "language_loss": 1.01274741, + "learning_rate": 0.0009958468977702471, + "loss": 1.02418876, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.28588867, + "step": 363, + "time_per_iteration": 2.633852958679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01648964, + "balance_loss_mlp": 1.62617075, + "epoch": 0.0700269334359369, + "flos": 1575943658496.0, + "grad_norm": 0.13616610145697036, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81383669, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.22753906, + "step": 364, + "time_per_iteration": 4.863068580627441 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011272, + "balance_loss_mlp": 1.09961534, + "epoch": 0.07021931512120046, + "flos": 1012848274944.0, + "grad_norm": 0.09005148424800312, + "language_loss": 0.90165555, + "learning_rate": 0.0009957663708830612, + "loss": 0.91292757, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.27612305, + "step": 365, + "time_per_iteration": 3.281414031982422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123586, + "balance_loss_mlp": 1.09442711, + "epoch": 0.07041169680646403, + "flos": 822622348800.0, + "grad_norm": 0.09334468540758137, + "language_loss": 0.91653895, + "learning_rate": 0.0009957258187212714, + "loss": 0.92777479, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.29174805, + "step": 366, + "time_per_iteration": 3.038696050643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01445219, + "balance_loss_mlp": 1.42652738, + "epoch": 0.07060407849172759, + "flos": 1413670993920.0, + "grad_norm": 0.06427367616648676, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80640084, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.18652344, + "step": 367, + "time_per_iteration": 4.7983925342559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116044, + "balance_loss_mlp": 1.08788657, + "epoch": 0.07079646017699115, + "flos": 512652837888.0, + "grad_norm": 0.13146714334583684, + "language_loss": 0.89768213, + "learning_rate": 0.0009956441370400167, + "loss": 0.90884256, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.28173828, + "step": 368, + "time_per_iteration": 2.6321308612823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119626, + "balance_loss_mlp": 1.09201741, + "epoch": 0.07098884186225471, + "flos": 540240772608.0, + "grad_norm": 0.12272393932614807, + "language_loss": 0.9541142, + "learning_rate": 0.0009956030075522636, + "loss": 0.96531045, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.27636719, + "step": 369, + "time_per_iteration": 2.772404909133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114007, + "balance_loss_mlp": 1.08685124, + "epoch": 0.07118122354751828, + "flos": 548419230720.0, + "grad_norm": 0.09366652552108264, + "language_loss": 0.95805156, + "learning_rate": 0.0009955616856543587, + "loss": 0.96919167, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.2722168, + "step": 370, + "time_per_iteration": 2.628877878189087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113669, + "balance_loss_mlp": 1.08608413, + "epoch": 0.07137360523278184, + "flos": 620612554752.0, + "grad_norm": 0.08609469252939483, + "language_loss": 0.88399851, + "learning_rate": 0.0009955201713623448, + "loss": 0.89513522, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.27612305, + "step": 371, + "time_per_iteration": 2.7591450214385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328242, + "balance_loss_mlp": 1.31155288, + "epoch": 0.0715659869180454, + "flos": 1501850115072.0, + "grad_norm": 0.05190160953718325, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78000963, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.16699219, + "step": 372, + "time_per_iteration": 4.995140552520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101786, + "balance_loss_mlp": 1.07563186, + "epoch": 0.07175836860330896, + "flos": 495246887424.0, + "grad_norm": 0.13457072532657127, + "language_loss": 1.02136469, + "learning_rate": 0.0009954365656605333, + "loss": 1.03238261, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.26184082, + "step": 373, + "time_per_iteration": 2.56646990776062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106565, + "balance_loss_mlp": 1.07979035, + "epoch": 0.07195075028857253, + "flos": 785387902464.0, + "grad_norm": 0.08663326270818063, + "language_loss": 0.94899744, + "learning_rate": 0.0009953944742831947, + "loss": 0.96006304, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.26831055, + "step": 374, + "time_per_iteration": 2.9695053100585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102573, + "balance_loss_mlp": 1.07596529, + "epoch": 0.0721431319738361, + "flos": 592799067648.0, + "grad_norm": 0.09289035836035217, + "language_loss": 0.97933537, + "learning_rate": 0.0009953521905766642, + "loss": 0.99036103, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.26647949, + "step": 375, + "time_per_iteration": 2.942178249359131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113342, + "balance_loss_mlp": 1.08630502, + "epoch": 0.07233551365909965, + "flos": 547981272576.0, + "grad_norm": 0.10463311528366259, + "language_loss": 0.97135454, + "learning_rate": 0.0009953097145573577, + "loss": 0.98248798, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.27075195, + "step": 376, + "time_per_iteration": 2.6447842121124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115303, + "balance_loss_mlp": 1.08645439, + "epoch": 0.07252789534436321, + "flos": 957170428416.0, + "grad_norm": 0.10778381820568583, + "language_loss": 0.93408906, + "learning_rate": 0.000995267046241766, + "loss": 0.94524205, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.28808594, + "step": 377, + "time_per_iteration": 3.281200647354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106472, + "balance_loss_mlp": 1.07807684, + "epoch": 0.07272027702962677, + "flos": 507398902272.0, + "grad_norm": 0.08395054735439604, + "language_loss": 0.93929148, + "learning_rate": 0.0009952241856464547, + "loss": 0.95035625, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.28393555, + "step": 378, + "time_per_iteration": 2.6047444343566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011332, + "balance_loss_mlp": 1.10265875, + "epoch": 0.07291265871489035, + "flos": 612128558592.0, + "grad_norm": 0.10390894184481733, + "language_loss": 0.9941417, + "learning_rate": 0.0009951811327880632, + "loss": 1.00547373, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.30541992, + "step": 379, + "time_per_iteration": 2.726473331451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142545, + "balance_loss_mlp": 1.11162257, + "epoch": 0.0731050404001539, + "flos": 495502963200.0, + "grad_norm": 0.10097597522795056, + "language_loss": 0.93640876, + "learning_rate": 0.0009951378876833063, + "loss": 0.94783425, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.30908203, + "step": 380, + "time_per_iteration": 2.5623717308044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113686, + "balance_loss_mlp": 1.10598469, + "epoch": 0.07329742208541747, + "flos": 639677205504.0, + "grad_norm": 0.09709945532148136, + "language_loss": 1.0008266, + "learning_rate": 0.0009950944503489736, + "loss": 1.01219511, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.30834961, + "step": 381, + "time_per_iteration": 2.7424862384796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125905, + "balance_loss_mlp": 1.0951966, + "epoch": 0.07348980377068103, + "flos": 815999284224.0, + "grad_norm": 0.08729931882910318, + "language_loss": 0.94688666, + "learning_rate": 0.0009950508208019285, + "loss": 0.95814574, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.30664062, + "step": 382, + "time_per_iteration": 3.011807441711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115566, + "balance_loss_mlp": 1.08612156, + "epoch": 0.0736821854559446, + "flos": 508383917568.0, + "grad_norm": 0.09192641530722392, + "language_loss": 0.98937929, + "learning_rate": 0.0009950069990591096, + "loss": 1.00053501, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.29418945, + "step": 383, + "time_per_iteration": 2.634744644165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266456, + "balance_loss_mlp": 1.25081599, + "epoch": 0.07387456714120816, + "flos": 1553801716224.0, + "grad_norm": 0.07157218635827683, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77667826, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.15625, + "step": 384, + "time_per_iteration": 4.909826993942261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123479, + "balance_loss_mlp": 1.093009, + "epoch": 0.07406694882647172, + "flos": 525219489792.0, + "grad_norm": 0.09152581134979716, + "language_loss": 0.9216727, + "learning_rate": 0.0009949187790542777, + "loss": 0.93290746, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.30419922, + "step": 385, + "time_per_iteration": 2.7053511142730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126669, + "balance_loss_mlp": 1.09605598, + "epoch": 0.07425933051173528, + "flos": 497468611584.0, + "grad_norm": 0.0847962235917395, + "language_loss": 0.87653643, + "learning_rate": 0.0009948743808265148, + "loss": 0.88780314, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.30566406, + "step": 386, + "time_per_iteration": 2.678089141845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138501, + "balance_loss_mlp": 1.10865068, + "epoch": 0.07445171219699885, + "flos": 504740630016.0, + "grad_norm": 0.08492617281736899, + "language_loss": 0.97336739, + "learning_rate": 0.0009948297904714782, + "loss": 0.98475236, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.29833984, + "step": 387, + "time_per_iteration": 2.7185778617858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146366, + "balance_loss_mlp": 1.11620593, + "epoch": 0.07464409388226241, + "flos": 553693515264.0, + "grad_norm": 0.07151378861674496, + "language_loss": 0.90523744, + "learning_rate": 0.0009947850080064796, + "loss": 0.91670114, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.30151367, + "step": 388, + "time_per_iteration": 2.799166202545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158036, + "balance_loss_mlp": 1.12689841, + "epoch": 0.07483647556752597, + "flos": 776511028224.0, + "grad_norm": 0.11664332596196766, + "language_loss": 0.94951898, + "learning_rate": 0.0009947400334489047, + "loss": 0.96109939, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.31103516, + "step": 389, + "time_per_iteration": 3.0231211185455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146122, + "balance_loss_mlp": 1.11646235, + "epoch": 0.07502885725278953, + "flos": 612256596480.0, + "grad_norm": 0.09913116245985863, + "language_loss": 0.85822582, + "learning_rate": 0.0009946948668162145, + "loss": 0.86968708, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.29638672, + "step": 390, + "time_per_iteration": 2.8080904483795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129912, + "balance_loss_mlp": 1.09910846, + "epoch": 0.0752212389380531, + "flos": 688324552704.0, + "grad_norm": 0.1060751216039937, + "language_loss": 0.91006148, + "learning_rate": 0.0009946495081259441, + "loss": 0.92136061, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.30786133, + "step": 391, + "time_per_iteration": 2.853335380554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125709, + "balance_loss_mlp": 1.09528649, + "epoch": 0.07541362062331666, + "flos": 765362967552.0, + "grad_norm": 0.10996734320487103, + "language_loss": 0.93701887, + "learning_rate": 0.0009946039573957035, + "loss": 0.94827592, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.30371094, + "step": 392, + "time_per_iteration": 2.926420211791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107845, + "balance_loss_mlp": 1.07887673, + "epoch": 0.07560600230858022, + "flos": 588460336128.0, + "grad_norm": 0.10253812696642157, + "language_loss": 0.91059798, + "learning_rate": 0.000994558214643177, + "loss": 0.92167646, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.28979492, + "step": 393, + "time_per_iteration": 2.783536434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103583, + "balance_loss_mlp": 1.07344699, + "epoch": 0.07579838399384378, + "flos": 749508028416.0, + "grad_norm": 0.08274248346409746, + "language_loss": 0.91916323, + "learning_rate": 0.000994512279886123, + "loss": 0.93019903, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.30078125, + "step": 394, + "time_per_iteration": 3.0799474716186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099135, + "balance_loss_mlp": 1.06902301, + "epoch": 0.07599076567910736, + "flos": 523185440256.0, + "grad_norm": 0.06927054930208885, + "language_loss": 0.93251747, + "learning_rate": 0.0009944661531423758, + "loss": 0.9435088, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.30078125, + "step": 395, + "time_per_iteration": 2.6641883850097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103492, + "balance_loss_mlp": 1.07383251, + "epoch": 0.07618314736437092, + "flos": 550812662784.0, + "grad_norm": 0.09904896099194287, + "language_loss": 0.91404933, + "learning_rate": 0.000994419834429843, + "loss": 0.92508423, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.29638672, + "step": 396, + "time_per_iteration": 2.661850690841675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114049, + "balance_loss_mlp": 1.08257747, + "epoch": 0.07637552904963447, + "flos": 697901253120.0, + "grad_norm": 0.10979610845710805, + "language_loss": 0.93416023, + "learning_rate": 0.0009943733237665069, + "loss": 0.94530076, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.31445312, + "step": 397, + "time_per_iteration": 2.854339361190796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111663, + "balance_loss_mlp": 1.08561158, + "epoch": 0.07656791073489803, + "flos": 579066928128.0, + "grad_norm": 0.07380051857889673, + "language_loss": 0.9521122, + "learning_rate": 0.0009943266211704248, + "loss": 0.96327847, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.30981445, + "step": 398, + "time_per_iteration": 2.958059787750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110413, + "balance_loss_mlp": 1.0786798, + "epoch": 0.0767602924201616, + "flos": 416923711488.0, + "grad_norm": 0.09100164928673704, + "language_loss": 0.97291386, + "learning_rate": 0.000994279726659728, + "loss": 0.98401797, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.31713867, + "step": 399, + "time_per_iteration": 2.5242953300476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128703, + "balance_loss_mlp": 1.09413218, + "epoch": 0.07695267410542517, + "flos": 482671471104.0, + "grad_norm": 0.09258616119375639, + "language_loss": 0.92782032, + "learning_rate": 0.0009942326402526231, + "loss": 0.93910736, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.34594727, + "step": 400, + "time_per_iteration": 2.5207695960998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143337, + "balance_loss_mlp": 1.10955346, + "epoch": 0.07714505579068873, + "flos": 530742647808.0, + "grad_norm": 0.07710774358121592, + "language_loss": 0.92332727, + "learning_rate": 0.0009941853619673902, + "loss": 0.93476063, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.33789062, + "step": 401, + "time_per_iteration": 2.6304752826690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142379, + "balance_loss_mlp": 1.10947704, + "epoch": 0.07733743747595229, + "flos": 804635845632.0, + "grad_norm": 0.09709488616354546, + "language_loss": 0.95104444, + "learning_rate": 0.0009941378918223844, + "loss": 0.96246827, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.32885742, + "step": 402, + "time_per_iteration": 3.0903730392456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136269, + "balance_loss_mlp": 1.10186553, + "epoch": 0.07752981916121585, + "flos": 622192679424.0, + "grad_norm": 0.09176808059924663, + "language_loss": 0.88839906, + "learning_rate": 0.0009940902298360354, + "loss": 0.89976174, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.34423828, + "step": 403, + "time_per_iteration": 2.7252347469329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129357, + "balance_loss_mlp": 1.09478593, + "epoch": 0.07772220084647942, + "flos": 727961195520.0, + "grad_norm": 0.08094022735558755, + "language_loss": 0.96807957, + "learning_rate": 0.0009940423760268473, + "loss": 0.9793731, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.34619141, + "step": 404, + "time_per_iteration": 2.912560224533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136255, + "balance_loss_mlp": 1.0998956, + "epoch": 0.07791458253174298, + "flos": 555149984256.0, + "grad_norm": 0.1131644160055788, + "language_loss": 0.90535253, + "learning_rate": 0.0009939943304133982, + "loss": 0.91671515, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.36352539, + "step": 405, + "time_per_iteration": 2.691524028778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128492, + "balance_loss_mlp": 1.09301567, + "epoch": 0.07810696421700654, + "flos": 552919495680.0, + "grad_norm": 0.0877419108538044, + "language_loss": 0.97356665, + "learning_rate": 0.0009939460930143416, + "loss": 0.9848516, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.35522461, + "step": 406, + "time_per_iteration": 2.624150276184082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130223, + "balance_loss_mlp": 1.09484172, + "epoch": 0.0782993459022701, + "flos": 650323289088.0, + "grad_norm": 0.0945833964014614, + "language_loss": 0.92588282, + "learning_rate": 0.0009938976638484043, + "loss": 0.93718511, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.35400391, + "step": 407, + "time_per_iteration": 2.943443775177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132372, + "balance_loss_mlp": 1.09625125, + "epoch": 0.07849172758753367, + "flos": 495926364672.0, + "grad_norm": 0.11302097827133319, + "language_loss": 0.90334702, + "learning_rate": 0.0009938490429343887, + "loss": 0.91467071, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.36157227, + "step": 408, + "time_per_iteration": 2.5614538192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156856, + "balance_loss_mlp": 1.11870956, + "epoch": 0.07868410927279723, + "flos": 577696389120.0, + "grad_norm": 0.08706398753077066, + "language_loss": 0.9151262, + "learning_rate": 0.0009938002302911709, + "loss": 0.92669487, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.38134766, + "step": 409, + "time_per_iteration": 2.7606911659240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185423, + "balance_loss_mlp": 1.14758611, + "epoch": 0.07887649095806079, + "flos": 522698019840.0, + "grad_norm": 0.11763043112663725, + "language_loss": 0.93195748, + "learning_rate": 0.0009937512259377015, + "loss": 0.94381177, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.37841797, + "step": 410, + "time_per_iteration": 2.664318323135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187972, + "balance_loss_mlp": 1.15199518, + "epoch": 0.07906887264332435, + "flos": 556958481408.0, + "grad_norm": 0.10450629225071802, + "language_loss": 0.93972069, + "learning_rate": 0.000993702029893006, + "loss": 0.95160043, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.359375, + "step": 411, + "time_per_iteration": 2.78944730758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182604, + "balance_loss_mlp": 1.14679348, + "epoch": 0.07926125432858792, + "flos": 821641715712.0, + "grad_norm": 0.0999267349206771, + "language_loss": 0.93036819, + "learning_rate": 0.0009936526421761838, + "loss": 0.94219422, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.3581543, + "step": 412, + "time_per_iteration": 3.070317268371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138713, + "balance_loss_mlp": 1.1031884, + "epoch": 0.07945363601385148, + "flos": 562072794624.0, + "grad_norm": 0.103699157973277, + "language_loss": 0.95454085, + "learning_rate": 0.000993603062806409, + "loss": 0.96592796, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.35546875, + "step": 413, + "time_per_iteration": 2.6778509616851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111475, + "balance_loss_mlp": 1.080966, + "epoch": 0.07964601769911504, + "flos": 517615792128.0, + "grad_norm": 0.1031900517026183, + "language_loss": 0.96687901, + "learning_rate": 0.0009935532918029298, + "loss": 0.97802651, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.33813477, + "step": 414, + "time_per_iteration": 2.598691701889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115143, + "balance_loss_mlp": 1.08016729, + "epoch": 0.0798383993843786, + "flos": 538956011520.0, + "grad_norm": 0.10374121868926973, + "language_loss": 0.91896659, + "learning_rate": 0.0009935033291850694, + "loss": 0.93011802, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.35009766, + "step": 415, + "time_per_iteration": 2.6626100540161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136571, + "balance_loss_mlp": 1.10121322, + "epoch": 0.08003078106964218, + "flos": 484901959680.0, + "grad_norm": 0.1007950470797911, + "language_loss": 0.94399852, + "learning_rate": 0.0009934531749722247, + "loss": 0.95536423, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.35351562, + "step": 416, + "time_per_iteration": 2.6062543392181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161455, + "balance_loss_mlp": 1.12671685, + "epoch": 0.08022316275490574, + "flos": 517999905792.0, + "grad_norm": 0.14193661609984684, + "language_loss": 0.91743952, + "learning_rate": 0.0009934028291838672, + "loss": 0.92905408, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.34790039, + "step": 417, + "time_per_iteration": 2.7159759998321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170119, + "balance_loss_mlp": 1.134166, + "epoch": 0.0804155444401693, + "flos": 493755512832.0, + "grad_norm": 0.12060272101738621, + "language_loss": 0.87969685, + "learning_rate": 0.0009933522918395433, + "loss": 0.89139807, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.35961914, + "step": 418, + "time_per_iteration": 2.6525259017944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288605, + "balance_loss_mlp": 1.26361907, + "epoch": 0.08060792612543285, + "flos": 1580567579136.0, + "grad_norm": 0.05680606480361405, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79539704, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.24902344, + "step": 419, + "time_per_iteration": 4.8565216064453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147061, + "balance_loss_mlp": 1.11074984, + "epoch": 0.08080030781069643, + "flos": 525090041856.0, + "grad_norm": 0.12828879348175987, + "language_loss": 1.03302395, + "learning_rate": 0.000993250642561551, + "loss": 1.04449451, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.36279297, + "step": 420, + "time_per_iteration": 2.6118712425231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139372, + "balance_loss_mlp": 1.10284615, + "epoch": 0.08099268949595999, + "flos": 546459374592.0, + "grad_norm": 0.09279765906948532, + "language_loss": 0.90646845, + "learning_rate": 0.0009931995306673466, + "loss": 0.91786218, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.36499023, + "step": 421, + "time_per_iteration": 2.7097063064575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137947, + "balance_loss_mlp": 1.10170722, + "epoch": 0.08118507118122355, + "flos": 510116811264.0, + "grad_norm": 0.12264346802799699, + "language_loss": 0.9584164, + "learning_rate": 0.000993148227296103, + "loss": 0.96979594, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.36254883, + "step": 422, + "time_per_iteration": 2.6224865913391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112855, + "balance_loss_mlp": 1.093431, + "epoch": 0.08137745286648711, + "flos": 720339969024.0, + "grad_norm": 0.09272021371299098, + "language_loss": 0.85445499, + "learning_rate": 0.000993096732467738, + "loss": 0.86574042, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.3515625, + "step": 423, + "time_per_iteration": 2.9733965396881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140481, + "balance_loss_mlp": 1.10407472, + "epoch": 0.08156983455175067, + "flos": 679313848320.0, + "grad_norm": 0.12206645659912072, + "language_loss": 0.90398526, + "learning_rate": 0.0009930450462022435, + "loss": 0.91539013, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.36376953, + "step": 424, + "time_per_iteration": 2.8079323768615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300116, + "balance_loss_mlp": 1.2751298, + "epoch": 0.08176221623701424, + "flos": 1452577135104.0, + "grad_norm": 0.07506497844528874, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80489922, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.24902344, + "step": 425, + "time_per_iteration": 4.905512809753418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121946, + "balance_loss_mlp": 1.08668423, + "epoch": 0.0819545979222778, + "flos": 1556034071040.0, + "grad_norm": 0.10499242287280508, + "language_loss": 0.89529157, + "learning_rate": 0.0009929410994402065, + "loss": 0.90651101, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.35327148, + "step": 426, + "time_per_iteration": 3.7398970127105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141941, + "balance_loss_mlp": 1.1045804, + "epoch": 0.08214697960754136, + "flos": 512456398848.0, + "grad_norm": 0.10023640482449404, + "language_loss": 0.93921095, + "learning_rate": 0.0009928888389840196, + "loss": 0.95063031, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.3737793, + "step": 427, + "time_per_iteration": 2.71114182472229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119992, + "balance_loss_mlp": 1.08430111, + "epoch": 0.08233936129280492, + "flos": 594850646016.0, + "grad_norm": 0.11276239209208863, + "language_loss": 0.96473306, + "learning_rate": 0.0009928363871714147, + "loss": 0.97593296, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.35742188, + "step": 428, + "time_per_iteration": 2.719052314758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118917, + "balance_loss_mlp": 1.0826056, + "epoch": 0.08253174297806849, + "flos": 571758594048.0, + "grad_norm": 0.08720961611908505, + "language_loss": 0.91275012, + "learning_rate": 0.0009927837440227556, + "loss": 0.92393929, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.36303711, + "step": 429, + "time_per_iteration": 2.854044198989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098875, + "balance_loss_mlp": 1.06418514, + "epoch": 0.08272412466333205, + "flos": 623065623552.0, + "grad_norm": 0.07075242488451733, + "language_loss": 0.87952864, + "learning_rate": 0.0009927309095584798, + "loss": 0.89051735, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.34692383, + "step": 430, + "time_per_iteration": 2.9898674488067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102101, + "balance_loss_mlp": 1.06748247, + "epoch": 0.08291650634859561, + "flos": 513745542144.0, + "grad_norm": 0.11797379038125863, + "language_loss": 0.97102249, + "learning_rate": 0.0009926778837991, + "loss": 0.9820435, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.34643555, + "step": 431, + "time_per_iteration": 2.577531099319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111279, + "balance_loss_mlp": 1.07582581, + "epoch": 0.08310888803385917, + "flos": 667073083392.0, + "grad_norm": 0.09137951270996447, + "language_loss": 0.95161557, + "learning_rate": 0.000992624666765202, + "loss": 0.96272832, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.35498047, + "step": 432, + "time_per_iteration": 2.841384172439575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141665, + "balance_loss_mlp": 1.10540199, + "epoch": 0.08330126971912274, + "flos": 582995404800.0, + "grad_norm": 0.1226792169188856, + "language_loss": 0.92907685, + "learning_rate": 0.000992571258477447, + "loss": 0.94049346, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.36279297, + "step": 433, + "time_per_iteration": 2.810683250427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131869, + "balance_loss_mlp": 1.0957005, + "epoch": 0.0834936514043863, + "flos": 561064458240.0, + "grad_norm": 0.09107414958413955, + "language_loss": 0.88094407, + "learning_rate": 0.0009925176589565695, + "loss": 0.8922627, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.36206055, + "step": 434, + "time_per_iteration": 2.7925446033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112767, + "balance_loss_mlp": 1.09081006, + "epoch": 0.08368603308964986, + "flos": 494272046592.0, + "grad_norm": 0.12869710653201102, + "language_loss": 0.96048987, + "learning_rate": 0.0009924638682233791, + "loss": 0.97176659, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.36865234, + "step": 435, + "time_per_iteration": 2.578301191329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293384, + "balance_loss_mlp": 1.26963747, + "epoch": 0.08387841477491342, + "flos": 1388322312192.0, + "grad_norm": 0.05787730041443156, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80857974, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.23730469, + "step": 436, + "time_per_iteration": 4.577009201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105613, + "balance_loss_mlp": 1.07092249, + "epoch": 0.084070796460177, + "flos": 798642796032.0, + "grad_norm": 0.09893423016048233, + "language_loss": 0.86262441, + "learning_rate": 0.0009923557132036668, + "loss": 0.87368047, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.34716797, + "step": 437, + "time_per_iteration": 3.0512332916259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111937, + "balance_loss_mlp": 1.07641208, + "epoch": 0.08426317814544056, + "flos": 558681200640.0, + "grad_norm": 0.08022134137003532, + "language_loss": 0.92201281, + "learning_rate": 0.0009923013489591345, + "loss": 0.93313217, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.35571289, + "step": 438, + "time_per_iteration": 2.74950909614563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101532, + "balance_loss_mlp": 1.06724763, + "epoch": 0.08445555983070412, + "flos": 810057106944.0, + "grad_norm": 0.100162941065544, + "language_loss": 0.90520388, + "learning_rate": 0.0009922467935862681, + "loss": 0.91621923, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.34326172, + "step": 439, + "time_per_iteration": 3.0904464721679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117546, + "balance_loss_mlp": 1.08307123, + "epoch": 0.08464794151596768, + "flos": 509939311104.0, + "grad_norm": 0.0868598025723284, + "language_loss": 0.93269211, + "learning_rate": 0.0009921920471062478, + "loss": 0.94386756, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.34521484, + "step": 440, + "time_per_iteration": 2.5794718265533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129423, + "balance_loss_mlp": 1.09458995, + "epoch": 0.08484032320123125, + "flos": 556149556224.0, + "grad_norm": 0.08760481485615552, + "language_loss": 0.90004873, + "learning_rate": 0.0009921371095403281, + "loss": 0.91134298, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.34863281, + "step": 441, + "time_per_iteration": 2.6602251529693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146723, + "balance_loss_mlp": 1.11010158, + "epoch": 0.08503270488649481, + "flos": 527103742464.0, + "grad_norm": 0.0774335957746243, + "language_loss": 0.93349928, + "learning_rate": 0.0009920819809098379, + "loss": 0.9449665, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.3659668, + "step": 442, + "time_per_iteration": 2.601776123046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154219, + "balance_loss_mlp": 1.11693072, + "epoch": 0.08522508657175837, + "flos": 613989490176.0, + "grad_norm": 0.07362842569129122, + "language_loss": 0.88841242, + "learning_rate": 0.0009920266612361798, + "loss": 0.89995468, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.37255859, + "step": 443, + "time_per_iteration": 2.730400800704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134029, + "balance_loss_mlp": 1.09712195, + "epoch": 0.08541746825702193, + "flos": 619495119360.0, + "grad_norm": 0.07691784169579122, + "language_loss": 0.90311241, + "learning_rate": 0.0009919711505408308, + "loss": 0.91445279, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.36889648, + "step": 444, + "time_per_iteration": 2.784175395965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136596, + "balance_loss_mlp": 1.0992831, + "epoch": 0.08560984994228549, + "flos": 482671471104.0, + "grad_norm": 0.10632405925705127, + "language_loss": 0.87768185, + "learning_rate": 0.000991915448845342, + "loss": 0.8890478, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.37329102, + "step": 445, + "time_per_iteration": 2.5208120346069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131693, + "balance_loss_mlp": 1.09581065, + "epoch": 0.08580223162754906, + "flos": 516897027072.0, + "grad_norm": 0.08773057765175464, + "language_loss": 0.96764338, + "learning_rate": 0.000991859556171339, + "loss": 0.97896028, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.35888672, + "step": 446, + "time_per_iteration": 2.62111759185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121751, + "balance_loss_mlp": 1.08582091, + "epoch": 0.08599461331281262, + "flos": 531215511552.0, + "grad_norm": 0.09700121256693707, + "language_loss": 0.97393352, + "learning_rate": 0.000991803472540521, + "loss": 0.98515099, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.359375, + "step": 447, + "time_per_iteration": 2.6023707389831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106051, + "balance_loss_mlp": 1.07155204, + "epoch": 0.08618699499807618, + "flos": 789966743040.0, + "grad_norm": 0.08203891217845936, + "language_loss": 0.9339667, + "learning_rate": 0.0009917471979746615, + "loss": 0.94502723, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.34521484, + "step": 448, + "time_per_iteration": 3.032045841217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108032, + "balance_loss_mlp": 1.07288861, + "epoch": 0.08637937668333974, + "flos": 565707317760.0, + "grad_norm": 0.07141468257554369, + "language_loss": 0.93266523, + "learning_rate": 0.0009916907324956086, + "loss": 0.94374555, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.35180664, + "step": 449, + "time_per_iteration": 2.7145769596099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124616, + "balance_loss_mlp": 1.08820987, + "epoch": 0.08657175836860331, + "flos": 444930665472.0, + "grad_norm": 0.07969277456361384, + "language_loss": 0.88546509, + "learning_rate": 0.0009916340761252837, + "loss": 0.89671123, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.36376953, + "step": 450, + "time_per_iteration": 2.623152017593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137489, + "balance_loss_mlp": 1.10108209, + "epoch": 0.08676414005386687, + "flos": 843789450240.0, + "grad_norm": 0.11402885145068274, + "language_loss": 0.86408567, + "learning_rate": 0.0009915772288856832, + "loss": 0.87546057, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.36474609, + "step": 451, + "time_per_iteration": 3.069053888320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137973, + "balance_loss_mlp": 1.10178065, + "epoch": 0.08695652173913043, + "flos": 602995608576.0, + "grad_norm": 0.09443027615205003, + "language_loss": 0.88496101, + "learning_rate": 0.000991520190798877, + "loss": 0.89634073, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.36206055, + "step": 452, + "time_per_iteration": 2.8196520805358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146235, + "balance_loss_mlp": 1.10906577, + "epoch": 0.08714890342439399, + "flos": 730423028736.0, + "grad_norm": 0.10286670415776202, + "language_loss": 0.95532084, + "learning_rate": 0.0009914629618870089, + "loss": 0.96678317, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.37158203, + "step": 453, + "time_per_iteration": 2.8787243366241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247018, + "balance_loss_mlp": 1.22422564, + "epoch": 0.08734128510965757, + "flos": 1481518232064.0, + "grad_norm": 0.049899161357568285, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79922891, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.22753906, + "step": 454, + "time_per_iteration": 4.787290811538696 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212398, + "balance_loss_mlp": 1.19036818, + "epoch": 0.08753366679492113, + "flos": 1522214083584.0, + "grad_norm": 0.0324381166824538, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82640362, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.22070312, + "step": 455, + "time_per_iteration": 4.818731784820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120368, + "balance_loss_mlp": 1.08324623, + "epoch": 0.08772604848018468, + "flos": 720935078400.0, + "grad_norm": 0.09487211541236003, + "language_loss": 0.89355373, + "learning_rate": 0.0009912901304235883, + "loss": 0.90475744, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.37133789, + "step": 456, + "time_per_iteration": 2.8851993083953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011174, + "balance_loss_mlp": 1.08108902, + "epoch": 0.08791843016544824, + "flos": 707926086144.0, + "grad_norm": 0.09303414624011808, + "language_loss": 0.85744059, + "learning_rate": 0.000991232138434397, + "loss": 0.86861455, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.36352539, + "step": 457, + "time_per_iteration": 2.8450586795806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118359, + "balance_loss_mlp": 1.08126163, + "epoch": 0.08811081185071182, + "flos": 472799407104.0, + "grad_norm": 0.11356405017629323, + "language_loss": 0.91543031, + "learning_rate": 0.000991173955731976, + "loss": 0.92661393, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.37084961, + "step": 458, + "time_per_iteration": 2.6324169635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117714, + "balance_loss_mlp": 1.08190393, + "epoch": 0.08830319353597538, + "flos": 684647769600.0, + "grad_norm": 0.08091220448679284, + "language_loss": 0.98039645, + "learning_rate": 0.0009911155823389137, + "loss": 0.99157357, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.3581543, + "step": 459, + "time_per_iteration": 2.9783670902252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121069, + "balance_loss_mlp": 1.08451915, + "epoch": 0.08849557522123894, + "flos": 573235411968.0, + "grad_norm": 0.0940583187075056, + "language_loss": 0.93095994, + "learning_rate": 0.000991057018277873, + "loss": 0.94217062, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.36499023, + "step": 460, + "time_per_iteration": 2.742830276489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112002, + "balance_loss_mlp": 1.08380461, + "epoch": 0.0886879569065025, + "flos": 564303283200.0, + "grad_norm": 0.10556048763009983, + "language_loss": 0.92411214, + "learning_rate": 0.0009909982635715898, + "loss": 0.93531239, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.36279297, + "step": 461, + "time_per_iteration": 2.613490581512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111287, + "balance_loss_mlp": 1.07595301, + "epoch": 0.08888033859176607, + "flos": 563609249280.0, + "grad_norm": 0.07908948831956038, + "language_loss": 0.92236221, + "learning_rate": 0.0009909393182428751, + "loss": 0.93347514, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.35351562, + "step": 462, + "time_per_iteration": 2.654144048690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109293, + "balance_loss_mlp": 1.07331538, + "epoch": 0.08907272027702963, + "flos": 465517214208.0, + "grad_norm": 0.06646518051532449, + "language_loss": 0.87202108, + "learning_rate": 0.000990880182314614, + "loss": 0.88311398, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.359375, + "step": 463, + "time_per_iteration": 2.705138921737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108897, + "balance_loss_mlp": 1.07473207, + "epoch": 0.08926510196229319, + "flos": 681200921088.0, + "grad_norm": 0.06803924695737752, + "language_loss": 0.88676465, + "learning_rate": 0.0009908208558097643, + "loss": 0.89785367, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.34204102, + "step": 464, + "time_per_iteration": 2.971322536468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120032, + "balance_loss_mlp": 1.08412576, + "epoch": 0.08945748364755675, + "flos": 596411831808.0, + "grad_norm": 0.15708102336048957, + "language_loss": 0.90012753, + "learning_rate": 0.000990761338751359, + "loss": 0.91132784, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.35913086, + "step": 465, + "time_per_iteration": 2.7719008922576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01301625, + "balance_loss_mlp": 1.28073931, + "epoch": 0.08964986533282032, + "flos": 1585082400768.0, + "grad_norm": 0.06799997970585842, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74961245, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.20898438, + "step": 466, + "time_per_iteration": 4.991540193557739 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131475, + "balance_loss_mlp": 1.09637952, + "epoch": 0.08984224701808388, + "flos": 533268499968.0, + "grad_norm": 0.10779867371948758, + "language_loss": 0.9214865, + "learning_rate": 0.0009906417330663815, + "loss": 0.93280125, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.35131836, + "step": 467, + "time_per_iteration": 2.7089412212371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124394, + "balance_loss_mlp": 1.08917928, + "epoch": 0.09003462870334744, + "flos": 478702296576.0, + "grad_norm": 0.08471126953208015, + "language_loss": 0.88495421, + "learning_rate": 0.0009905816444862442, + "loss": 0.89619815, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.35253906, + "step": 468, + "time_per_iteration": 2.616262435913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129147, + "balance_loss_mlp": 1.09371758, + "epoch": 0.090227010388611, + "flos": 653307448320.0, + "grad_norm": 0.07702844129808738, + "language_loss": 0.87126988, + "learning_rate": 0.0009905213654454216, + "loss": 0.88256133, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.35473633, + "step": 469, + "time_per_iteration": 2.9097750186920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143119, + "balance_loss_mlp": 1.10678387, + "epoch": 0.09041939207387456, + "flos": 617894645760.0, + "grad_norm": 0.09194049655048094, + "language_loss": 0.92914081, + "learning_rate": 0.0009904608959673158, + "loss": 0.9405719, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.36328125, + "step": 470, + "time_per_iteration": 2.8030929565429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141965, + "balance_loss_mlp": 1.10491443, + "epoch": 0.09061177375913813, + "flos": 454137808896.0, + "grad_norm": 0.10933441897375067, + "language_loss": 0.92262268, + "learning_rate": 0.000990400236075403, + "loss": 0.93404239, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.37036133, + "step": 471, + "time_per_iteration": 2.4859976768493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117092, + "balance_loss_mlp": 1.08183014, + "epoch": 0.0908041554444017, + "flos": 543982984704.0, + "grad_norm": 0.08808088949589198, + "language_loss": 0.90884256, + "learning_rate": 0.0009903393857932338, + "loss": 0.92001355, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.35302734, + "step": 472, + "time_per_iteration": 2.6540582180023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115458, + "balance_loss_mlp": 1.07933736, + "epoch": 0.09099653712966525, + "flos": 564052999680.0, + "grad_norm": 0.08261940405294126, + "language_loss": 0.88272375, + "learning_rate": 0.0009902783451444317, + "loss": 0.89387828, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.36108398, + "step": 473, + "time_per_iteration": 2.7061197757720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116197, + "balance_loss_mlp": 1.0812211, + "epoch": 0.09118891881492881, + "flos": 474300956160.0, + "grad_norm": 0.11656166861680099, + "language_loss": 0.93563545, + "learning_rate": 0.0009902171141526956, + "loss": 0.94679749, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.34960938, + "step": 474, + "time_per_iteration": 2.524653911590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111201, + "balance_loss_mlp": 1.0760566, + "epoch": 0.09138130050019239, + "flos": 545579076096.0, + "grad_norm": 0.07692578036886621, + "language_loss": 0.81933677, + "learning_rate": 0.000990155692841797, + "loss": 0.83045685, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.35961914, + "step": 475, + "time_per_iteration": 2.9645543098449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106767, + "balance_loss_mlp": 1.07281613, + "epoch": 0.09157368218545595, + "flos": 732397441536.0, + "grad_norm": 0.08052092373184025, + "language_loss": 0.93009984, + "learning_rate": 0.0009900940812355818, + "loss": 0.94116753, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.33959961, + "step": 476, + "time_per_iteration": 2.8816893100738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107557, + "balance_loss_mlp": 1.07289076, + "epoch": 0.0917660638707195, + "flos": 610709967360.0, + "grad_norm": 0.14442514829584613, + "language_loss": 0.87309504, + "learning_rate": 0.00099003227935797, + "loss": 0.88417065, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.34716797, + "step": 477, + "time_per_iteration": 2.760615825653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122629, + "balance_loss_mlp": 1.08827257, + "epoch": 0.09195844555598306, + "flos": 655561257984.0, + "grad_norm": 0.12539398809889843, + "language_loss": 0.9113583, + "learning_rate": 0.000989970287232955, + "loss": 0.92258459, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.34399414, + "step": 478, + "time_per_iteration": 2.826150894165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119533, + "balance_loss_mlp": 1.08720374, + "epoch": 0.09215082724124664, + "flos": 476339387904.0, + "grad_norm": 0.06731886459053077, + "language_loss": 0.89701962, + "learning_rate": 0.0009899081048846043, + "loss": 0.90821493, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.32324219, + "step": 479, + "time_per_iteration": 2.580028772354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143549, + "balance_loss_mlp": 1.1092639, + "epoch": 0.0923432089265102, + "flos": 524051182080.0, + "grad_norm": 0.1155425244176876, + "language_loss": 0.9372611, + "learning_rate": 0.0009898457323370593, + "loss": 0.94869661, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.34301758, + "step": 480, + "time_per_iteration": 2.6090288162231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135277, + "balance_loss_mlp": 1.10132647, + "epoch": 0.09253559061177376, + "flos": 545302651392.0, + "grad_norm": 0.08946460297910715, + "language_loss": 0.92488086, + "learning_rate": 0.000989783169614535, + "loss": 0.93623364, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.33984375, + "step": 481, + "time_per_iteration": 2.6434848308563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130787, + "balance_loss_mlp": 1.28212094, + "epoch": 0.09272797229703732, + "flos": 1537222219776.0, + "grad_norm": 0.06384431456169105, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80060625, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.2578125, + "step": 482, + "time_per_iteration": 4.903714656829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120119, + "balance_loss_mlp": 1.08695483, + "epoch": 0.09292035398230089, + "flos": 689501624832.0, + "grad_norm": 0.0974321715773629, + "language_loss": 0.90389109, + "learning_rate": 0.000989657473741779, + "loss": 0.91509223, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.33178711, + "step": 483, + "time_per_iteration": 2.841749668121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132007, + "balance_loss_mlp": 1.09858036, + "epoch": 0.09311273566756445, + "flos": 509482414080.0, + "grad_norm": 0.07196755449742197, + "language_loss": 0.91361248, + "learning_rate": 0.0009895943406403465, + "loss": 0.9249326, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.33447266, + "step": 484, + "time_per_iteration": 2.728733539581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146209, + "balance_loss_mlp": 1.11137581, + "epoch": 0.09330511735282801, + "flos": 659111413248.0, + "grad_norm": 0.10097789553078372, + "language_loss": 0.84299308, + "learning_rate": 0.0009895310174615338, + "loss": 0.85445517, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.34863281, + "step": 485, + "time_per_iteration": 2.74460506439209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214233, + "balance_loss_mlp": 1.19239426, + "epoch": 0.09349749903809157, + "flos": 1452054809088.0, + "grad_norm": 0.04007792490845654, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76932752, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.21875, + "step": 486, + "time_per_iteration": 4.653090715408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135045, + "balance_loss_mlp": 1.10161829, + "epoch": 0.09368988072335514, + "flos": 520614508032.0, + "grad_norm": 0.07938978312310574, + "language_loss": 0.89514428, + "learning_rate": 0.0009894038009701782, + "loss": 0.90649474, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.33447266, + "step": 487, + "time_per_iteration": 2.6534616947174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145632, + "balance_loss_mlp": 1.1106087, + "epoch": 0.0938822624086187, + "flos": 497502107136.0, + "grad_norm": 0.09344776572677456, + "language_loss": 0.87733328, + "learning_rate": 0.0009893399077070253, + "loss": 0.88878953, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.35083008, + "step": 488, + "time_per_iteration": 2.5616586208343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130599, + "balance_loss_mlp": 1.09702933, + "epoch": 0.09407464409388226, + "flos": 532948405248.0, + "grad_norm": 0.08887912188605798, + "language_loss": 0.87485397, + "learning_rate": 0.0009892758244652718, + "loss": 0.8861599, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.3359375, + "step": 489, + "time_per_iteration": 2.6878652572631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114876, + "balance_loss_mlp": 1.08078194, + "epoch": 0.09426702577914582, + "flos": 585736634880.0, + "grad_norm": 0.08770205653150476, + "language_loss": 0.91117108, + "learning_rate": 0.0009892115512697968, + "loss": 0.92231989, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.34130859, + "step": 490, + "time_per_iteration": 2.67647123336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113824, + "balance_loss_mlp": 1.0808506, + "epoch": 0.0944594074644094, + "flos": 503081929728.0, + "grad_norm": 0.06826247830552083, + "language_loss": 0.94586283, + "learning_rate": 0.0009891470881455537, + "loss": 0.95700109, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.32983398, + "step": 491, + "time_per_iteration": 2.7388105392456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109252, + "balance_loss_mlp": 1.07627821, + "epoch": 0.09465178914967295, + "flos": 570748847616.0, + "grad_norm": 0.08083030362482532, + "language_loss": 0.90903842, + "learning_rate": 0.0009890824351175692, + "loss": 0.92013097, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.32983398, + "step": 492, + "time_per_iteration": 2.710557222366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108934, + "balance_loss_mlp": 1.07586551, + "epoch": 0.09484417083493651, + "flos": 549098707968.0, + "grad_norm": 0.07986708443523517, + "language_loss": 0.96040058, + "learning_rate": 0.0009890175922109435, + "loss": 0.97148991, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.33081055, + "step": 493, + "time_per_iteration": 2.748145341873169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119937, + "balance_loss_mlp": 1.08686852, + "epoch": 0.09503655252020007, + "flos": 823552109568.0, + "grad_norm": 0.1003982234968368, + "language_loss": 0.93827844, + "learning_rate": 0.0009889525594508513, + "loss": 0.94947779, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.33081055, + "step": 494, + "time_per_iteration": 2.9940547943115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113416, + "balance_loss_mlp": 1.08037138, + "epoch": 0.09522893420546363, + "flos": 404397757440.0, + "grad_norm": 0.06206488721584602, + "language_loss": 0.88783181, + "learning_rate": 0.0009888873368625404, + "loss": 0.89896601, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.33056641, + "step": 495, + "time_per_iteration": 2.5122387409210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129, + "balance_loss_mlp": 1.09557426, + "epoch": 0.0954213158907272, + "flos": 690707810304.0, + "grad_norm": 0.08099902604416225, + "language_loss": 0.9180485, + "learning_rate": 0.0009888219244713326, + "loss": 0.92933846, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.33447266, + "step": 496, + "time_per_iteration": 2.8516368865966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145932, + "balance_loss_mlp": 1.11152768, + "epoch": 0.09561369757599077, + "flos": 518739019776.0, + "grad_norm": 0.09295440988952328, + "language_loss": 0.91113585, + "learning_rate": 0.0009887563223026229, + "loss": 0.92259514, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.34423828, + "step": 497, + "time_per_iteration": 2.7165610790252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226975, + "balance_loss_mlp": 1.20780587, + "epoch": 0.09580607926125433, + "flos": 1384825849344.0, + "grad_norm": 0.04473280554485948, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80295134, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.19140625, + "step": 498, + "time_per_iteration": 4.9133830070495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158093, + "balance_loss_mlp": 1.12261629, + "epoch": 0.09599846094651789, + "flos": 717090969600.0, + "grad_norm": 0.0716278208231272, + "language_loss": 0.91129965, + "learning_rate": 0.0009886245487346482, + "loss": 0.92288053, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.35522461, + "step": 499, + "time_per_iteration": 3.074453353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151408, + "balance_loss_mlp": 1.1164794, + "epoch": 0.09619084263178146, + "flos": 385824909312.0, + "grad_norm": 0.09258819117654143, + "language_loss": 0.93041325, + "learning_rate": 0.0009885583773865422, + "loss": 0.94192737, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.34912109, + "step": 500, + "time_per_iteration": 2.4206974506378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129637, + "balance_loss_mlp": 1.09482849, + "epoch": 0.09638322431704502, + "flos": 533869401600.0, + "grad_norm": 0.08421486249996342, + "language_loss": 0.90840685, + "learning_rate": 0.0009884920163632524, + "loss": 0.9197033, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.34814453, + "step": 501, + "time_per_iteration": 2.653083324432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133899, + "balance_loss_mlp": 1.09875655, + "epoch": 0.09657560600230858, + "flos": 500426629632.0, + "grad_norm": 0.08831216016047307, + "language_loss": 0.92406952, + "learning_rate": 0.000988425465690543, + "loss": 0.93540847, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.35180664, + "step": 502, + "time_per_iteration": 2.5902318954467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129227, + "balance_loss_mlp": 1.09363079, + "epoch": 0.09676798768757214, + "flos": 528995197440.0, + "grad_norm": 0.08884204924947281, + "language_loss": 0.89819443, + "learning_rate": 0.0009883587253942505, + "loss": 0.90948665, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.35595703, + "step": 503, + "time_per_iteration": 2.7927231788635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134871, + "balance_loss_mlp": 1.09956098, + "epoch": 0.09696036937283571, + "flos": 463379857920.0, + "grad_norm": 0.08422879575374595, + "language_loss": 0.96091402, + "learning_rate": 0.0009882917955002862, + "loss": 0.97226262, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.35302734, + "step": 504, + "time_per_iteration": 2.538280963897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117737, + "balance_loss_mlp": 1.08297515, + "epoch": 0.09715275105809927, + "flos": 534716204544.0, + "grad_norm": 0.07639016770494517, + "language_loss": 0.89420688, + "learning_rate": 0.0009882246760346343, + "loss": 0.9053843, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.34790039, + "step": 505, + "time_per_iteration": 2.6242942810058594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124058, + "balance_loss_mlp": 1.08834267, + "epoch": 0.09734513274336283, + "flos": 454713979392.0, + "grad_norm": 0.11518068103281653, + "language_loss": 0.92468822, + "learning_rate": 0.0009881573670233533, + "loss": 0.93592882, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.35742188, + "step": 506, + "time_per_iteration": 2.516587734222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114247, + "balance_loss_mlp": 1.08074903, + "epoch": 0.09753751442862639, + "flos": 508551243264.0, + "grad_norm": 0.07574597822432369, + "language_loss": 0.8811729, + "learning_rate": 0.0009880898684925747, + "loss": 0.89231527, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.33520508, + "step": 507, + "time_per_iteration": 2.693880081176758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107378, + "balance_loss_mlp": 1.07402313, + "epoch": 0.09772989611388996, + "flos": 484030425600.0, + "grad_norm": 0.07603441014422499, + "language_loss": 0.86951101, + "learning_rate": 0.0009880221804685037, + "loss": 0.88058472, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.33374023, + "step": 508, + "time_per_iteration": 2.5847270488739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01468428, + "balance_loss_mlp": 1.44983101, + "epoch": 0.09792227779915352, + "flos": 1565306339328.0, + "grad_norm": 0.12348847609036423, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80812848, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.18554688, + "step": 509, + "time_per_iteration": 4.756307601928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123963, + "balance_loss_mlp": 1.09103727, + "epoch": 0.09811465948441708, + "flos": 587529165312.0, + "grad_norm": 0.08757433726580034, + "language_loss": 0.93106389, + "learning_rate": 0.0009878862360456733, + "loss": 0.9423036, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.32910156, + "step": 510, + "time_per_iteration": 2.6813509464263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110714, + "balance_loss_mlp": 1.07759809, + "epoch": 0.09830704116968064, + "flos": 612719285760.0, + "grad_norm": 0.08240718915912659, + "language_loss": 0.86918676, + "learning_rate": 0.0009878179796996922, + "loss": 0.88029397, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.33129883, + "step": 511, + "time_per_iteration": 2.7128310203552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113196, + "balance_loss_mlp": 1.08112836, + "epoch": 0.09849942285494422, + "flos": 538528227840.0, + "grad_norm": 0.07802243599022093, + "language_loss": 0.90101254, + "learning_rate": 0.0009877495339659754, + "loss": 0.91214454, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.32055664, + "step": 512, + "time_per_iteration": 2.8097684383392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102205, + "balance_loss_mlp": 1.07035255, + "epoch": 0.09869180454020778, + "flos": 620193535488.0, + "grad_norm": 0.09144065810451378, + "language_loss": 0.850245, + "learning_rate": 0.000987680898871096, + "loss": 0.86126709, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.31835938, + "step": 513, + "time_per_iteration": 2.7469441890716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108259, + "balance_loss_mlp": 1.07502341, + "epoch": 0.09888418622547133, + "flos": 811375363584.0, + "grad_norm": 0.10540688433367246, + "language_loss": 0.85520494, + "learning_rate": 0.0009876120744417, + "loss": 0.86628759, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.33251953, + "step": 514, + "time_per_iteration": 2.9515652656555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101037, + "balance_loss_mlp": 1.06818295, + "epoch": 0.0990765679107349, + "flos": 535548450816.0, + "grad_norm": 0.09508855922632749, + "language_loss": 0.93521011, + "learning_rate": 0.0009875430607045078, + "loss": 0.94622052, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.32861328, + "step": 515, + "time_per_iteration": 2.7193381786346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109471, + "balance_loss_mlp": 1.06164145, + "epoch": 0.09926894959599845, + "flos": 587607740928.0, + "grad_norm": 0.07449645219133615, + "language_loss": 0.90591514, + "learning_rate": 0.000987473857686313, + "loss": 0.91686225, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.33081055, + "step": 516, + "time_per_iteration": 2.7179975509643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115382, + "balance_loss_mlp": 1.08100188, + "epoch": 0.09946133128126203, + "flos": 640947409920.0, + "grad_norm": 0.10856360121839106, + "language_loss": 0.92182052, + "learning_rate": 0.0009874044654139824, + "loss": 0.9329744, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.34423828, + "step": 517, + "time_per_iteration": 2.7596991062164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135092, + "balance_loss_mlp": 1.10104585, + "epoch": 0.09965371296652559, + "flos": 465546327552.0, + "grad_norm": 0.10414801938878855, + "language_loss": 0.9130857, + "learning_rate": 0.0009873348839144563, + "loss": 0.92443669, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.34082031, + "step": 518, + "time_per_iteration": 2.5228705406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117267, + "balance_loss_mlp": 1.1381228, + "epoch": 0.09984609465178915, + "flos": 483365505024.0, + "grad_norm": 0.09626367264756285, + "language_loss": 0.94683075, + "learning_rate": 0.000987265113214749, + "loss": 0.95855749, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.34545898, + "step": 519, + "time_per_iteration": 2.5458812713623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118966, + "balance_loss_mlp": 1.15339625, + "epoch": 0.1000384763370527, + "flos": 568764260352.0, + "grad_norm": 0.12320854939875277, + "language_loss": 0.94298297, + "learning_rate": 0.0009871951533419476, + "loss": 0.95487958, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.36279297, + "step": 520, + "time_per_iteration": 2.663461208343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156318, + "balance_loss_mlp": 1.12010193, + "epoch": 0.10023085802231628, + "flos": 545515057152.0, + "grad_norm": 0.08720896475780489, + "language_loss": 0.86881042, + "learning_rate": 0.0009871250043232132, + "loss": 0.8803736, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.36206055, + "step": 521, + "time_per_iteration": 2.7820796966552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140376, + "balance_loss_mlp": 1.1049943, + "epoch": 0.10042323970757984, + "flos": 503208557568.0, + "grad_norm": 0.08876661910472074, + "language_loss": 0.85204661, + "learning_rate": 0.0009870546661857797, + "loss": 0.86345041, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.35375977, + "step": 522, + "time_per_iteration": 2.634274482727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152268, + "balance_loss_mlp": 1.11583781, + "epoch": 0.1006156213928434, + "flos": 770084402688.0, + "grad_norm": 0.08623162465623763, + "language_loss": 0.92886114, + "learning_rate": 0.0009869841389569553, + "loss": 0.94038385, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.36401367, + "step": 523, + "time_per_iteration": 3.0027353763580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151068, + "balance_loss_mlp": 1.11571026, + "epoch": 0.10080800307810696, + "flos": 489786338304.0, + "grad_norm": 0.07820731611640971, + "language_loss": 0.86882633, + "learning_rate": 0.0009869134226641206, + "loss": 0.880337, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.35424805, + "step": 524, + "time_per_iteration": 2.5850446224212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159546, + "balance_loss_mlp": 1.12330627, + "epoch": 0.10100038476337053, + "flos": 454478252544.0, + "grad_norm": 0.07931950894681525, + "language_loss": 0.86448371, + "learning_rate": 0.0009868425173347303, + "loss": 0.8760792, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.36254883, + "step": 525, + "time_per_iteration": 2.6873726844787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171885, + "balance_loss_mlp": 1.13617015, + "epoch": 0.10119276644863409, + "flos": 556155348480.0, + "grad_norm": 0.09671662269899156, + "language_loss": 0.94872439, + "learning_rate": 0.0009867714229963125, + "loss": 0.96044326, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.35717773, + "step": 526, + "time_per_iteration": 2.697547197341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155786, + "balance_loss_mlp": 1.12083411, + "epoch": 0.10138514813389765, + "flos": 515990587392.0, + "grad_norm": 0.10324452979849556, + "language_loss": 0.9236598, + "learning_rate": 0.000986700139676468, + "loss": 0.93521762, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.34960938, + "step": 527, + "time_per_iteration": 2.5702626705169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171342, + "balance_loss_mlp": 1.1346494, + "epoch": 0.10157752981916121, + "flos": 500323322880.0, + "grad_norm": 0.08227699709590157, + "language_loss": 0.89510548, + "learning_rate": 0.0009866286674028717, + "loss": 0.90681893, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.36694336, + "step": 528, + "time_per_iteration": 2.699542284011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141961, + "balance_loss_mlp": 1.1081537, + "epoch": 0.10176991150442478, + "flos": 656444376576.0, + "grad_norm": 0.0843490367773928, + "language_loss": 0.8638742, + "learning_rate": 0.0009865570062032717, + "loss": 0.87529385, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.33837891, + "step": 529, + "time_per_iteration": 2.941728353500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114882, + "balance_loss_mlp": 1.11420166, + "epoch": 0.10196229318968834, + "flos": 572974953984.0, + "grad_norm": 0.07671472850746988, + "language_loss": 0.9148134, + "learning_rate": 0.0009864851561054893, + "loss": 0.9263016, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.34643555, + "step": 530, + "time_per_iteration": 2.7894959449768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147452, + "balance_loss_mlp": 1.1134541, + "epoch": 0.1021546748749519, + "flos": 517946061312.0, + "grad_norm": 0.08702044825545475, + "language_loss": 0.90471494, + "learning_rate": 0.0009864131171374191, + "loss": 0.91618943, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.34033203, + "step": 531, + "time_per_iteration": 2.6681158542633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144724, + "balance_loss_mlp": 1.11139297, + "epoch": 0.10234705656021546, + "flos": 609470286336.0, + "grad_norm": 0.0664826941787488, + "language_loss": 0.89538574, + "learning_rate": 0.0009863408893270292, + "loss": 0.90683293, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.33349609, + "step": 532, + "time_per_iteration": 2.7965428829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129812, + "balance_loss_mlp": 1.09576535, + "epoch": 0.10253943824547904, + "flos": 601473710592.0, + "grad_norm": 0.08878024025613328, + "language_loss": 0.84706688, + "learning_rate": 0.0009862684727023605, + "loss": 0.858365, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.34082031, + "step": 533, + "time_per_iteration": 2.7238268852233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116987, + "balance_loss_mlp": 1.08453798, + "epoch": 0.1027318199307426, + "flos": 662647011840.0, + "grad_norm": 0.1682383439962665, + "language_loss": 0.87668955, + "learning_rate": 0.0009861958672915283, + "loss": 0.8878594, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.32446289, + "step": 534, + "time_per_iteration": 2.7945988178253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096267, + "balance_loss_mlp": 1.06415248, + "epoch": 0.10292420161600616, + "flos": 682962928128.0, + "grad_norm": 0.0654465541126679, + "language_loss": 0.88598454, + "learning_rate": 0.0009861230731227201, + "loss": 0.89694726, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.32104492, + "step": 535, + "time_per_iteration": 2.8504462242126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094849, + "balance_loss_mlp": 1.06180418, + "epoch": 0.10311658330126972, + "flos": 490042414080.0, + "grad_norm": 0.09703481929017231, + "language_loss": 0.90092826, + "learning_rate": 0.0009860500902241973, + "loss": 0.91187674, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.33056641, + "step": 536, + "time_per_iteration": 2.6230618953704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093921, + "balance_loss_mlp": 1.06028032, + "epoch": 0.10330896498653329, + "flos": 431508446208.0, + "grad_norm": 0.07541190921269121, + "language_loss": 0.94890571, + "learning_rate": 0.0009859769186242942, + "loss": 0.95984495, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.33642578, + "step": 537, + "time_per_iteration": 2.5023155212402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090493, + "balance_loss_mlp": 1.05802083, + "epoch": 0.10350134667179685, + "flos": 549330052608.0, + "grad_norm": 0.08038513642950565, + "language_loss": 0.87629044, + "learning_rate": 0.0009859035583514187, + "loss": 0.88719535, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.32470703, + "step": 538, + "time_per_iteration": 2.617408514022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102021, + "balance_loss_mlp": 1.06885695, + "epoch": 0.10369372835706041, + "flos": 640327569408.0, + "grad_norm": 0.08463096218018039, + "language_loss": 0.88947332, + "learning_rate": 0.0009858300094340517, + "loss": 0.9004935, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.33178711, + "step": 539, + "time_per_iteration": 2.7788918018341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102321, + "balance_loss_mlp": 1.06989646, + "epoch": 0.10388611004232397, + "flos": 521500598784.0, + "grad_norm": 0.08363201697238119, + "language_loss": 0.84166092, + "learning_rate": 0.0009857562719007473, + "loss": 0.85268414, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.32421875, + "step": 540, + "time_per_iteration": 2.6021273136138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106344, + "balance_loss_mlp": 1.07349014, + "epoch": 0.10407849172758753, + "flos": 702111946752.0, + "grad_norm": 0.07699058030721453, + "language_loss": 0.86313522, + "learning_rate": 0.0009856823457801331, + "loss": 0.87419868, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.32861328, + "step": 541, + "time_per_iteration": 2.898247003555298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121037, + "balance_loss_mlp": 1.0881114, + "epoch": 0.1042708734128511, + "flos": 502652736000.0, + "grad_norm": 0.09427475874312204, + "language_loss": 0.92884254, + "learning_rate": 0.00098560823110091, + "loss": 0.94005299, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.3293457, + "step": 542, + "time_per_iteration": 2.628246784210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117484, + "balance_loss_mlp": 1.08441556, + "epoch": 0.10446325509811466, + "flos": 485331153408.0, + "grad_norm": 0.09038961872332987, + "language_loss": 0.93836176, + "learning_rate": 0.000985533927891851, + "loss": 0.94953668, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.33081055, + "step": 543, + "time_per_iteration": 2.6802377700805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104521, + "balance_loss_mlp": 1.07114232, + "epoch": 0.10465563678337822, + "flos": 568365590016.0, + "grad_norm": 0.07979198382497373, + "language_loss": 0.91847962, + "learning_rate": 0.0009854594361818044, + "loss": 0.9295249, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.33398438, + "step": 544, + "time_per_iteration": 2.6934244632720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097218, + "balance_loss_mlp": 1.06372046, + "epoch": 0.10484801846864178, + "flos": 625806853632.0, + "grad_norm": 0.070981397623147, + "language_loss": 0.91175914, + "learning_rate": 0.0009853847559996897, + "loss": 0.92273128, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.33520508, + "step": 545, + "time_per_iteration": 2.7615010738372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121637, + "balance_loss_mlp": 1.08713746, + "epoch": 0.10504040015390535, + "flos": 743063874048.0, + "grad_norm": 0.07225830349373973, + "language_loss": 0.90024251, + "learning_rate": 0.0009853098873745, + "loss": 0.91145885, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.34545898, + "step": 546, + "time_per_iteration": 2.995853900909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128427, + "balance_loss_mlp": 1.09407067, + "epoch": 0.10523278183916891, + "flos": 586382616576.0, + "grad_norm": 0.08430865527250554, + "language_loss": 0.89361405, + "learning_rate": 0.0009852348303353027, + "loss": 0.90489835, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.34399414, + "step": 547, + "time_per_iteration": 2.7888100147247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141582, + "balance_loss_mlp": 1.106511, + "epoch": 0.10542516352443247, + "flos": 869270552064.0, + "grad_norm": 0.07123259169118071, + "language_loss": 0.82929194, + "learning_rate": 0.000985159584911237, + "loss": 0.84070778, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.35107422, + "step": 548, + "time_per_iteration": 3.11181902885437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141867, + "balance_loss_mlp": 1.10658062, + "epoch": 0.10561754520969603, + "flos": 505182970368.0, + "grad_norm": 0.1040806422735416, + "language_loss": 0.89825702, + "learning_rate": 0.0009850841511315162, + "loss": 0.90967572, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.35327148, + "step": 549, + "time_per_iteration": 2.638000726699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130362, + "balance_loss_mlp": 1.09493339, + "epoch": 0.1058099268949596, + "flos": 559690947072.0, + "grad_norm": 0.07056487851665215, + "language_loss": 0.9078036, + "learning_rate": 0.0009850085290254256, + "loss": 0.9191072, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.35424805, + "step": 550, + "time_per_iteration": 2.774028778076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117666, + "balance_loss_mlp": 1.08273757, + "epoch": 0.10600230858022316, + "flos": 561773048832.0, + "grad_norm": 0.06745406591759516, + "language_loss": 0.87385082, + "learning_rate": 0.0009849327186223246, + "loss": 0.88502753, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.34936523, + "step": 551, + "time_per_iteration": 2.7669272422790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110145, + "balance_loss_mlp": 1.06845236, + "epoch": 0.10619469026548672, + "flos": 494079989760.0, + "grad_norm": 0.0691737715515626, + "language_loss": 0.94504517, + "learning_rate": 0.000984856719951646, + "loss": 0.95605963, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.33007812, + "step": 552, + "time_per_iteration": 2.5428550243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111854, + "balance_loss_mlp": 1.07747412, + "epoch": 0.10638707195075028, + "flos": 675843678720.0, + "grad_norm": 0.09712099675981889, + "language_loss": 0.91101605, + "learning_rate": 0.0009847805330428943, + "loss": 0.92213452, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.34399414, + "step": 553, + "time_per_iteration": 2.9055614471435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122894, + "balance_loss_mlp": 1.08846664, + "epoch": 0.10657945363601386, + "flos": 487811925504.0, + "grad_norm": 0.09294887941398464, + "language_loss": 0.92195344, + "learning_rate": 0.0009847041579256481, + "loss": 0.93318236, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.34448242, + "step": 554, + "time_per_iteration": 2.5995588302612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124167, + "balance_loss_mlp": 1.08859539, + "epoch": 0.10677183532127742, + "flos": 482706376704.0, + "grad_norm": 0.08058010800108027, + "language_loss": 0.94049567, + "learning_rate": 0.0009846275946295592, + "loss": 0.9517374, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.35595703, + "step": 555, + "time_per_iteration": 2.6564345359802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114817, + "balance_loss_mlp": 1.07919669, + "epoch": 0.10696421700654098, + "flos": 655917668352.0, + "grad_norm": 0.06398894491712905, + "language_loss": 0.86843902, + "learning_rate": 0.0009845508431843518, + "loss": 0.87958717, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.35620117, + "step": 556, + "time_per_iteration": 3.0014877319335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112252, + "balance_loss_mlp": 1.07675159, + "epoch": 0.10715659869180454, + "flos": 567483881472.0, + "grad_norm": 0.06905237280169106, + "language_loss": 0.87712479, + "learning_rate": 0.0009844739036198233, + "loss": 0.88824731, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.35522461, + "step": 557, + "time_per_iteration": 2.6663765907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126883, + "balance_loss_mlp": 1.09026217, + "epoch": 0.10734898037706811, + "flos": 540432829440.0, + "grad_norm": 0.08117667522677224, + "language_loss": 0.94649851, + "learning_rate": 0.0009843967759658448, + "loss": 0.95776731, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.36621094, + "step": 558, + "time_per_iteration": 2.6776351928710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01325803, + "balance_loss_mlp": 1.29795551, + "epoch": 0.10754136206233167, + "flos": 1475870008320.0, + "grad_norm": 0.07702272040631068, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74093556, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.27929688, + "step": 559, + "time_per_iteration": 4.862372398376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112585, + "balance_loss_mlp": 1.08906162, + "epoch": 0.10773374374759523, + "flos": 512155243008.0, + "grad_norm": 0.07411063690195181, + "language_loss": 0.94592023, + "learning_rate": 0.000984241956509384, + "loss": 0.95717871, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.36767578, + "step": 560, + "time_per_iteration": 2.6602537631988525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152944, + "balance_loss_mlp": 1.11455846, + "epoch": 0.10792612543285879, + "flos": 496261016064.0, + "grad_norm": 0.08630165838839422, + "language_loss": 0.89956963, + "learning_rate": 0.0009841642647670078, + "loss": 0.91109908, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.38378906, + "step": 561, + "time_per_iteration": 2.5539767742156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153249, + "balance_loss_mlp": 1.11433935, + "epoch": 0.10811850711812235, + "flos": 735131317248.0, + "grad_norm": 0.09499730641116207, + "language_loss": 0.84606594, + "learning_rate": 0.0009840863850553944, + "loss": 0.85759842, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.38867188, + "step": 562, + "time_per_iteration": 2.972862720489502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139333, + "balance_loss_mlp": 1.10261655, + "epoch": 0.10831088880338592, + "flos": 611257024512.0, + "grad_norm": 0.08740431235801023, + "language_loss": 0.90812922, + "learning_rate": 0.0009840083174047782, + "loss": 0.91952258, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.3671875, + "step": 563, + "time_per_iteration": 2.728081464767456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133161, + "balance_loss_mlp": 1.09739876, + "epoch": 0.10850327048864948, + "flos": 556022928384.0, + "grad_norm": 0.09202985623691126, + "language_loss": 0.85552108, + "learning_rate": 0.0009839300618454685, + "loss": 0.8668527, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.35791016, + "step": 564, + "time_per_iteration": 2.833817958831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130452, + "balance_loss_mlp": 1.09538078, + "epoch": 0.10869565217391304, + "flos": 602902476288.0, + "grad_norm": 0.06834466327041812, + "language_loss": 0.90596354, + "learning_rate": 0.0009838516184078466, + "loss": 0.91726804, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.35131836, + "step": 565, + "time_per_iteration": 2.8160781860351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154605, + "balance_loss_mlp": 1.1185081, + "epoch": 0.1088880338591766, + "flos": 525922288128.0, + "grad_norm": 0.07188227567019471, + "language_loss": 0.87634718, + "learning_rate": 0.0009837729871223669, + "loss": 0.88789332, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.36083984, + "step": 566, + "time_per_iteration": 2.62117600440979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177195, + "balance_loss_mlp": 1.1406219, + "epoch": 0.10908041554444017, + "flos": 619986921984.0, + "grad_norm": 0.08533641778088655, + "language_loss": 0.88115579, + "learning_rate": 0.0009836941680195568, + "loss": 0.89292771, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.36547852, + "step": 567, + "time_per_iteration": 2.828911542892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165998, + "balance_loss_mlp": 1.12994933, + "epoch": 0.10927279722970373, + "flos": 897740195328.0, + "grad_norm": 0.08003102464580239, + "language_loss": 0.83622086, + "learning_rate": 0.0009836151611300166, + "loss": 0.84788084, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.3605957, + "step": 568, + "time_per_iteration": 3.2273471355438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114699, + "balance_loss_mlp": 1.11177564, + "epoch": 0.10946517891496729, + "flos": 528408852480.0, + "grad_norm": 0.13762061821089808, + "language_loss": 0.94344527, + "learning_rate": 0.0009835359664844194, + "loss": 0.95491517, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.35253906, + "step": 569, + "time_per_iteration": 2.61690616607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424326, + "balance_loss_mlp": 1.39514339, + "epoch": 0.10965756060023085, + "flos": 1559944714752.0, + "grad_norm": 0.09677893451051751, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82461131, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.29101562, + "step": 570, + "time_per_iteration": 4.929012298583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129446, + "balance_loss_mlp": 1.09449339, + "epoch": 0.10984994228549443, + "flos": 512820163584.0, + "grad_norm": 0.10645850756285262, + "language_loss": 0.9142105, + "learning_rate": 0.0009833770140481118, + "loss": 0.92550498, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.34985352, + "step": 571, + "time_per_iteration": 2.6662757396698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122373, + "balance_loss_mlp": 1.08689654, + "epoch": 0.11004232397075799, + "flos": 954314307072.0, + "grad_norm": 0.12031633973381815, + "language_loss": 0.82440388, + "learning_rate": 0.000983297256319112, + "loss": 0.83562756, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.35522461, + "step": 572, + "time_per_iteration": 3.218076467514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134932, + "balance_loss_mlp": 1.09850204, + "epoch": 0.11023470565602154, + "flos": 487921024512.0, + "grad_norm": 0.08427819288291502, + "language_loss": 0.86899912, + "learning_rate": 0.000983217310957477, + "loss": 0.88034844, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.36425781, + "step": 573, + "time_per_iteration": 2.778571128845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144342, + "balance_loss_mlp": 1.10803151, + "epoch": 0.1104270873412851, + "flos": 655521970176.0, + "grad_norm": 0.06509507329480971, + "language_loss": 0.90168923, + "learning_rate": 0.000983137177994244, + "loss": 0.91313267, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.36352539, + "step": 574, + "time_per_iteration": 2.872412919998169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137496, + "balance_loss_mlp": 1.10221016, + "epoch": 0.11061946902654868, + "flos": 723097165824.0, + "grad_norm": 0.06653120926816534, + "language_loss": 0.85785711, + "learning_rate": 0.0009830568574605235, + "loss": 0.86923206, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.35302734, + "step": 575, + "time_per_iteration": 2.923383951187134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145913, + "balance_loss_mlp": 1.10984039, + "epoch": 0.11081185071181224, + "flos": 835113397248.0, + "grad_norm": 0.0865486301410286, + "language_loss": 0.87525302, + "learning_rate": 0.0009829763493874992, + "loss": 0.88671219, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.36083984, + "step": 576, + "time_per_iteration": 3.032942056655884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134136, + "balance_loss_mlp": 1.09753847, + "epoch": 0.1110042323970758, + "flos": 608776252416.0, + "grad_norm": 0.08630194081372794, + "language_loss": 0.93183506, + "learning_rate": 0.0009828956538064264, + "loss": 0.94317639, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.36621094, + "step": 577, + "time_per_iteration": 2.8152406215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125176, + "balance_loss_mlp": 1.0888648, + "epoch": 0.11119661408233936, + "flos": 595643604480.0, + "grad_norm": 0.07101537919866721, + "language_loss": 0.90824157, + "learning_rate": 0.0009828147707486344, + "loss": 0.91949332, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.36328125, + "step": 578, + "time_per_iteration": 2.724550485610962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118046, + "balance_loss_mlp": 1.08209252, + "epoch": 0.11138899576760293, + "flos": 555573385728.0, + "grad_norm": 0.08130034202286071, + "language_loss": 0.86348194, + "learning_rate": 0.0009827337002455245, + "loss": 0.8746624, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.35961914, + "step": 579, + "time_per_iteration": 2.652369976043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111199, + "balance_loss_mlp": 1.07579851, + "epoch": 0.11158137745286649, + "flos": 689418667008.0, + "grad_norm": 0.06366605788409145, + "language_loss": 0.88115346, + "learning_rate": 0.0009826524423285712, + "loss": 0.89227337, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.36181641, + "step": 580, + "time_per_iteration": 2.947925567626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108192, + "balance_loss_mlp": 1.07192874, + "epoch": 0.11177375913813005, + "flos": 762688728576.0, + "grad_norm": 0.08930617061108917, + "language_loss": 0.88938302, + "learning_rate": 0.0009825709970293218, + "loss": 0.90046495, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.36303711, + "step": 581, + "time_per_iteration": 2.8744056224823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103655, + "balance_loss_mlp": 1.06731987, + "epoch": 0.11196614082339361, + "flos": 806211588096.0, + "grad_norm": 0.07222891797599594, + "language_loss": 0.95056951, + "learning_rate": 0.0009824893643793956, + "loss": 0.96160614, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.36328125, + "step": 582, + "time_per_iteration": 3.051945209503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104834, + "balance_loss_mlp": 1.06811786, + "epoch": 0.11215852250865718, + "flos": 558350931456.0, + "grad_norm": 0.0803498647914251, + "language_loss": 0.88078201, + "learning_rate": 0.0009824075444104857, + "loss": 0.89183033, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.3671875, + "step": 583, + "time_per_iteration": 2.6833813190460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111764, + "balance_loss_mlp": 1.07507193, + "epoch": 0.11235090419392074, + "flos": 513322140672.0, + "grad_norm": 0.08148632832875594, + "language_loss": 0.93207705, + "learning_rate": 0.000982325537154357, + "loss": 0.94319463, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.36694336, + "step": 584, + "time_per_iteration": 2.582225799560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113364, + "balance_loss_mlp": 1.07574129, + "epoch": 0.1125432858791843, + "flos": 491209311744.0, + "grad_norm": 0.08313203670373176, + "language_loss": 0.93823397, + "learning_rate": 0.0009822433426428484, + "loss": 0.94936764, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.37597656, + "step": 585, + "time_per_iteration": 2.568070888519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113305, + "balance_loss_mlp": 1.07594514, + "epoch": 0.11273566756444786, + "flos": 510476193792.0, + "grad_norm": 0.07694998173228458, + "language_loss": 0.86627567, + "learning_rate": 0.0009821609609078697, + "loss": 0.87740874, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.37304688, + "step": 586, + "time_per_iteration": 2.658702850341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103349, + "balance_loss_mlp": 1.06775331, + "epoch": 0.11292804924971142, + "flos": 622149009408.0, + "grad_norm": 0.10421690738013599, + "language_loss": 0.89634144, + "learning_rate": 0.0009820783919814045, + "loss": 0.90737498, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.35620117, + "step": 587, + "time_per_iteration": 2.803866386413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109643, + "balance_loss_mlp": 1.07295036, + "epoch": 0.113120430934975, + "flos": 477811823616.0, + "grad_norm": 0.07979925286699333, + "language_loss": 0.82699567, + "learning_rate": 0.0009819956358955095, + "loss": 0.83809209, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.36669922, + "step": 588, + "time_per_iteration": 2.5929653644561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110362, + "balance_loss_mlp": 1.07433677, + "epoch": 0.11331281262023855, + "flos": 466801975296.0, + "grad_norm": 0.07216149622243874, + "language_loss": 0.83354205, + "learning_rate": 0.0009819126926823127, + "loss": 0.84464574, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.36035156, + "step": 589, + "time_per_iteration": 2.5229685306549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122798, + "balance_loss_mlp": 1.08658195, + "epoch": 0.11350519430550211, + "flos": 650164727808.0, + "grad_norm": 0.08255396626581768, + "language_loss": 0.86631322, + "learning_rate": 0.000981829562374016, + "loss": 0.87754118, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.36279297, + "step": 590, + "time_per_iteration": 2.7939858436584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124668, + "balance_loss_mlp": 1.08804727, + "epoch": 0.11369757599076567, + "flos": 557547798528.0, + "grad_norm": 0.07763031144810686, + "language_loss": 0.97565413, + "learning_rate": 0.0009817462450028933, + "loss": 0.98690081, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.3659668, + "step": 591, + "time_per_iteration": 2.651886224746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115476, + "balance_loss_mlp": 1.07918823, + "epoch": 0.11388995767602925, + "flos": 570774988800.0, + "grad_norm": 0.0679599519530346, + "language_loss": 0.85396111, + "learning_rate": 0.0009816627406012916, + "loss": 0.86511576, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.36303711, + "step": 592, + "time_per_iteration": 2.8203041553497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117009, + "balance_loss_mlp": 1.08079314, + "epoch": 0.1140823393612928, + "flos": 740069540352.0, + "grad_norm": 0.07941270182617734, + "language_loss": 0.84330916, + "learning_rate": 0.0009815790492016295, + "loss": 0.85447925, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.36254883, + "step": 593, + "time_per_iteration": 2.952115058898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111293, + "balance_loss_mlp": 1.07529223, + "epoch": 0.11427472104655637, + "flos": 698694211584.0, + "grad_norm": 0.08575724683449225, + "language_loss": 0.86948562, + "learning_rate": 0.0009814951708363993, + "loss": 0.88059855, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.35986328, + "step": 594, + "time_per_iteration": 2.851818084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259601, + "balance_loss_mlp": 1.23633182, + "epoch": 0.11446710273181993, + "flos": 1476387952128.0, + "grad_norm": 0.04120161092279284, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79250586, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.23242188, + "step": 595, + "time_per_iteration": 4.775157928466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107308, + "balance_loss_mlp": 1.07159305, + "epoch": 0.1146594844170835, + "flos": 494641603584.0, + "grad_norm": 0.06441778711855077, + "language_loss": 0.87857854, + "learning_rate": 0.0009813268533395648, + "loss": 0.8896516, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.35717773, + "step": 596, + "time_per_iteration": 2.5812032222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117343, + "balance_loss_mlp": 1.08096087, + "epoch": 0.11485186610234706, + "flos": 474596319744.0, + "grad_norm": 0.07680000680618568, + "language_loss": 0.87010378, + "learning_rate": 0.0009812424142733073, + "loss": 0.8812772, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.36401367, + "step": 597, + "time_per_iteration": 2.5546822547912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108076, + "balance_loss_mlp": 1.07212269, + "epoch": 0.11504424778761062, + "flos": 730858014720.0, + "grad_norm": 0.05681390422854521, + "language_loss": 0.8607024, + "learning_rate": 0.000981157788372175, + "loss": 0.87178314, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.35961914, + "step": 598, + "time_per_iteration": 3.0337140560150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111428, + "balance_loss_mlp": 1.07851696, + "epoch": 0.11523662947287418, + "flos": 545539788288.0, + "grad_norm": 0.06941688855783729, + "language_loss": 0.89018178, + "learning_rate": 0.0009810729756690223, + "loss": 0.90132457, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.35791016, + "step": 599, + "time_per_iteration": 2.7217423915863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105745, + "balance_loss_mlp": 1.06981504, + "epoch": 0.11542901115813775, + "flos": 774737436672.0, + "grad_norm": 0.06146114558588388, + "language_loss": 0.91738331, + "learning_rate": 0.0009809879761967766, + "loss": 0.92844075, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.35961914, + "step": 600, + "time_per_iteration": 2.9604732990264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111848, + "balance_loss_mlp": 1.08178735, + "epoch": 0.11562139284340131, + "flos": 730585972224.0, + "grad_norm": 0.09570347165582511, + "language_loss": 0.86368775, + "learning_rate": 0.0009809027899884378, + "loss": 0.87487245, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.36669922, + "step": 601, + "time_per_iteration": 2.9237759113311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114897, + "balance_loss_mlp": 1.07787061, + "epoch": 0.11581377452866487, + "flos": 535589148672.0, + "grad_norm": 0.05752007897304988, + "language_loss": 0.88791043, + "learning_rate": 0.0009808174170770779, + "loss": 0.89905941, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.37036133, + "step": 602, + "time_per_iteration": 2.8171939849853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192093, + "balance_loss_mlp": 1.1680603, + "epoch": 0.11600615621392843, + "flos": 1554968613888.0, + "grad_norm": 0.017614530082332158, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86090338, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.24023438, + "step": 603, + "time_per_iteration": 4.935450077056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109032, + "balance_loss_mlp": 1.07360268, + "epoch": 0.116198537899192, + "flos": 537178037760.0, + "grad_norm": 0.08737735767926022, + "language_loss": 0.93595141, + "learning_rate": 0.0009806461112779462, + "loss": 0.94704169, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.35449219, + "step": 604, + "time_per_iteration": 2.644521951675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110331, + "balance_loss_mlp": 1.07454431, + "epoch": 0.11639091958445556, + "flos": 453970483200.0, + "grad_norm": 0.09922875403821595, + "language_loss": 0.8811909, + "learning_rate": 0.0009805601784566814, + "loss": 0.89229423, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.3581543, + "step": 605, + "time_per_iteration": 2.4736506938934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107193, + "balance_loss_mlp": 1.07209802, + "epoch": 0.11658330126971912, + "flos": 554815332864.0, + "grad_norm": 0.08013857685507157, + "language_loss": 0.95075512, + "learning_rate": 0.0009804740590654089, + "loss": 0.9618271, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.35131836, + "step": 606, + "time_per_iteration": 2.665424346923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121356, + "balance_loss_mlp": 1.08540201, + "epoch": 0.11677568295498268, + "flos": 716025968640.0, + "grad_norm": 0.09308217257663119, + "language_loss": 0.89792109, + "learning_rate": 0.0009803877531375635, + "loss": 0.90913463, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.359375, + "step": 607, + "time_per_iteration": 2.854362964630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123257, + "balance_loss_mlp": 1.08725595, + "epoch": 0.11696806464024626, + "flos": 609474668544.0, + "grad_norm": 0.12019278373574431, + "language_loss": 0.90837669, + "learning_rate": 0.0009803012607066523, + "loss": 0.91960925, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.36035156, + "step": 608, + "time_per_iteration": 2.7351131439208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132428, + "balance_loss_mlp": 1.0963558, + "epoch": 0.11716044632550981, + "flos": 520127087616.0, + "grad_norm": 0.06325710240785508, + "language_loss": 0.89651906, + "learning_rate": 0.0009802145818062543, + "loss": 0.90784335, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.36083984, + "step": 609, + "time_per_iteration": 2.706399440765381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126801, + "balance_loss_mlp": 1.09060943, + "epoch": 0.11735282801077337, + "flos": 507246133248.0, + "grad_norm": 0.08665503616765245, + "language_loss": 0.91646838, + "learning_rate": 0.0009801277164700212, + "loss": 0.9277364, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.36230469, + "step": 610, + "time_per_iteration": 2.591233730316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116515, + "balance_loss_mlp": 1.08137226, + "epoch": 0.11754520969603693, + "flos": 686339965440.0, + "grad_norm": 0.07536960859650275, + "language_loss": 0.8969053, + "learning_rate": 0.0009800406647316776, + "loss": 0.90807045, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.35180664, + "step": 611, + "time_per_iteration": 2.8590939044952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199931, + "balance_loss_mlp": 1.17360973, + "epoch": 0.1177375913813005, + "flos": 1541673022464.0, + "grad_norm": 0.02828241364524735, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.7811439, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.26367188, + "step": 612, + "time_per_iteration": 4.794836759567261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126093, + "balance_loss_mlp": 1.08999705, + "epoch": 0.11792997306656407, + "flos": 520269682176.0, + "grad_norm": 0.07086643363198573, + "language_loss": 0.88838685, + "learning_rate": 0.000979866002183916, + "loss": 0.89964771, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.36132812, + "step": 613, + "time_per_iteration": 2.6570141315460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113543, + "balance_loss_mlp": 1.07711244, + "epoch": 0.11812235475182763, + "flos": 665980379136.0, + "grad_norm": 0.0718552990374983, + "language_loss": 0.89756042, + "learning_rate": 0.0009797783914423082, + "loss": 0.90869588, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.36425781, + "step": 614, + "time_per_iteration": 2.8077588081359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104148, + "balance_loss_mlp": 1.06867135, + "epoch": 0.11831473643709119, + "flos": 621021399552.0, + "grad_norm": 0.06673690234795807, + "language_loss": 0.84267712, + "learning_rate": 0.0009796905944342094, + "loss": 0.85371858, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.35498047, + "step": 615, + "time_per_iteration": 2.848975896835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109381, + "balance_loss_mlp": 1.07271254, + "epoch": 0.11850711812235475, + "flos": 456438108672.0, + "grad_norm": 0.05638104592328917, + "language_loss": 0.88746947, + "learning_rate": 0.0009796026111937057, + "loss": 0.89856327, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.36645508, + "step": 616, + "time_per_iteration": 2.6446924209594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099952, + "balance_loss_mlp": 1.06347418, + "epoch": 0.11869949980761832, + "flos": 513598565376.0, + "grad_norm": 0.0626967176734064, + "language_loss": 0.88544255, + "learning_rate": 0.0009795144417549552, + "loss": 0.89644206, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.36474609, + "step": 617, + "time_per_iteration": 2.69419527053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103508, + "balance_loss_mlp": 1.0669111, + "epoch": 0.11889188149288188, + "flos": 534732171264.0, + "grad_norm": 0.05994069078035177, + "language_loss": 0.89591199, + "learning_rate": 0.0009794260861521883, + "loss": 0.90694714, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.36621094, + "step": 618, + "time_per_iteration": 2.771303653717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098344, + "balance_loss_mlp": 1.06262898, + "epoch": 0.11908426317814544, + "flos": 498344527872.0, + "grad_norm": 0.09079788596459537, + "language_loss": 0.86586368, + "learning_rate": 0.0009793375444197075, + "loss": 0.87684715, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.35742188, + "step": 619, + "time_per_iteration": 2.6239778995513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103508, + "balance_loss_mlp": 1.06724489, + "epoch": 0.119276644863409, + "flos": 659598833664.0, + "grad_norm": 0.07776663130635876, + "language_loss": 0.84681749, + "learning_rate": 0.000979248816591888, + "loss": 0.85785258, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.36254883, + "step": 620, + "time_per_iteration": 2.7932288646698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106745, + "balance_loss_mlp": 1.07043433, + "epoch": 0.11946902654867257, + "flos": 758396487168.0, + "grad_norm": 0.06665125523581683, + "language_loss": 0.85644066, + "learning_rate": 0.0009791599027031766, + "loss": 0.86750811, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.36303711, + "step": 621, + "time_per_iteration": 3.0138871669769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108028, + "balance_loss_mlp": 1.0721699, + "epoch": 0.11966140823393613, + "flos": 680697533952.0, + "grad_norm": 0.06722173914854768, + "language_loss": 0.85452718, + "learning_rate": 0.0009790708027880932, + "loss": 0.86560744, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.359375, + "step": 622, + "time_per_iteration": 2.8520967960357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217123, + "balance_loss_mlp": 1.192518, + "epoch": 0.11985378991919969, + "flos": 1450268070912.0, + "grad_norm": 0.04692620020290901, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78644413, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.24511719, + "step": 623, + "time_per_iteration": 4.820342302322388 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117536, + "balance_loss_mlp": 1.08251202, + "epoch": 0.12004617160446325, + "flos": 527586780672.0, + "grad_norm": 0.0795104629545964, + "language_loss": 0.93134129, + "learning_rate": 0.0009788920450172487, + "loss": 0.94251657, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.35058594, + "step": 624, + "time_per_iteration": 2.617030143737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112825, + "balance_loss_mlp": 1.09265435, + "epoch": 0.12023855328972682, + "flos": 473980861440.0, + "grad_norm": 0.07884849751459712, + "language_loss": 0.90174961, + "learning_rate": 0.0009788023872308875, + "loss": 0.91303217, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.35620117, + "step": 625, + "time_per_iteration": 2.5254392623901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218941, + "balance_loss_mlp": 1.19519401, + "epoch": 0.12043093497499038, + "flos": 1530954155520.0, + "grad_norm": 0.02704118444179952, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76647937, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.23730469, + "step": 626, + "time_per_iteration": 4.7286646366119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114575, + "balance_loss_mlp": 1.07936025, + "epoch": 0.12062331666025394, + "flos": 539571469824.0, + "grad_norm": 0.06954804859514781, + "language_loss": 0.9379338, + "learning_rate": 0.0009786225140303285, + "loss": 0.94907951, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.35253906, + "step": 627, + "time_per_iteration": 2.648557424545288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117384, + "balance_loss_mlp": 1.08155024, + "epoch": 0.1208156983455175, + "flos": 511634327040.0, + "grad_norm": 0.07877419782543724, + "language_loss": 0.91490531, + "learning_rate": 0.0009785322986859634, + "loss": 0.92607915, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.35864258, + "step": 628, + "time_per_iteration": 2.7282159328460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125787, + "balance_loss_mlp": 1.09014332, + "epoch": 0.12100808003078108, + "flos": 596195043840.0, + "grad_norm": 0.07794762914430453, + "language_loss": 0.92512405, + "learning_rate": 0.0009784418975588838, + "loss": 0.936382, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.35668945, + "step": 629, + "time_per_iteration": 2.709716320037842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117597, + "balance_loss_mlp": 1.08099949, + "epoch": 0.12120046171604464, + "flos": 522698019840.0, + "grad_norm": 0.06704717834334661, + "language_loss": 0.92910212, + "learning_rate": 0.0009783513106841862, + "loss": 0.94027811, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.3659668, + "step": 630, + "time_per_iteration": 2.7247745990753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268595, + "balance_loss_mlp": 1.24303675, + "epoch": 0.1213928434013082, + "flos": 1553605277184.0, + "grad_norm": 0.050831706918094084, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.78001297, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.25585938, + "step": 631, + "time_per_iteration": 4.973435163497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108872, + "balance_loss_mlp": 1.07263255, + "epoch": 0.12158522508657175, + "flos": 495143580672.0, + "grad_norm": 0.05936012058015608, + "language_loss": 0.87115383, + "learning_rate": 0.0009781695798326854, + "loss": 0.88224256, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.36303711, + "step": 632, + "time_per_iteration": 2.6066951751708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107124, + "balance_loss_mlp": 1.07109857, + "epoch": 0.12177760677183531, + "flos": 475335433728.0, + "grad_norm": 0.07579280109985519, + "language_loss": 0.87447512, + "learning_rate": 0.0009780784359264365, + "loss": 0.88554639, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.3605957, + "step": 633, + "time_per_iteration": 2.6627957820892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232879, + "balance_loss_mlp": 1.20541322, + "epoch": 0.12196998845709889, + "flos": 1467630351360.0, + "grad_norm": 0.035928730821781295, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75421578, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.27539062, + "step": 634, + "time_per_iteration": 4.774393796920776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097947, + "balance_loss_mlp": 1.06185055, + "epoch": 0.12216237014236245, + "flos": 586279309824.0, + "grad_norm": 0.06269897945868624, + "language_loss": 0.87202692, + "learning_rate": 0.000977895591329867, + "loss": 0.88300645, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.36108398, + "step": 635, + "time_per_iteration": 2.805889129638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108158, + "balance_loss_mlp": 1.0710839, + "epoch": 0.12235475182762601, + "flos": 597721324032.0, + "grad_norm": 0.0813284132777598, + "language_loss": 0.86332333, + "learning_rate": 0.000977803890710533, + "loss": 0.87440491, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.37060547, + "step": 636, + "time_per_iteration": 2.740208864212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104442, + "balance_loss_mlp": 1.06927526, + "epoch": 0.12254713351288957, + "flos": 497487550464.0, + "grad_norm": 0.05990721463683031, + "language_loss": 0.92840338, + "learning_rate": 0.0009777120045912774, + "loss": 0.93944776, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.35205078, + "step": 637, + "time_per_iteration": 2.599487543106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099731, + "balance_loss_mlp": 1.06246591, + "epoch": 0.12273951519815314, + "flos": 605565130752.0, + "grad_norm": 0.06926890859373311, + "language_loss": 0.89462954, + "learning_rate": 0.0009776199330077736, + "loss": 0.90562689, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.37231445, + "step": 638, + "time_per_iteration": 2.7127702236175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109285, + "balance_loss_mlp": 1.07263994, + "epoch": 0.1229318968834167, + "flos": 597578729472.0, + "grad_norm": 0.06829584029278382, + "language_loss": 0.91875821, + "learning_rate": 0.0009775276759957667, + "loss": 0.92985106, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.36645508, + "step": 639, + "time_per_iteration": 2.7092959880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109534, + "balance_loss_mlp": 1.07269859, + "epoch": 0.12312427856868026, + "flos": 678082931712.0, + "grad_norm": 0.08396579350539743, + "language_loss": 0.8972953, + "learning_rate": 0.0009774352335910745, + "loss": 0.90839064, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.36816406, + "step": 640, + "time_per_iteration": 2.810391664505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104708, + "balance_loss_mlp": 1.067729, + "epoch": 0.12331666025394382, + "flos": 608656978944.0, + "grad_norm": 0.07323302973942612, + "language_loss": 0.94222069, + "learning_rate": 0.000977342605829586, + "loss": 0.95326775, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.36962891, + "step": 641, + "time_per_iteration": 2.7107834815979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112179, + "balance_loss_mlp": 1.07624888, + "epoch": 0.12350904193920739, + "flos": 762172194816.0, + "grad_norm": 0.07665420533577341, + "language_loss": 0.85291827, + "learning_rate": 0.0009772497927472623, + "loss": 0.86404008, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.359375, + "step": 642, + "time_per_iteration": 3.0403058528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116924, + "balance_loss_mlp": 1.08006442, + "epoch": 0.12370142362447095, + "flos": 540699079680.0, + "grad_norm": 0.07222690714452404, + "language_loss": 0.84284675, + "learning_rate": 0.0009771567943801368, + "loss": 0.85401607, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.3684082, + "step": 643, + "time_per_iteration": 2.684351682662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112565, + "balance_loss_mlp": 1.07615817, + "epoch": 0.12389380530973451, + "flos": 547848852480.0, + "grad_norm": 0.07333206449495522, + "language_loss": 0.88927472, + "learning_rate": 0.0009770636107643152, + "loss": 0.9004004, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.36450195, + "step": 644, + "time_per_iteration": 2.697791337966919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124284, + "balance_loss_mlp": 1.0884738, + "epoch": 0.12408618699499807, + "flos": 540048715776.0, + "grad_norm": 0.07501614361753556, + "language_loss": 0.87213039, + "learning_rate": 0.0009769702419359738, + "loss": 0.88337326, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.35864258, + "step": 645, + "time_per_iteration": 2.614753246307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132185, + "balance_loss_mlp": 1.09604049, + "epoch": 0.12427856868026164, + "flos": 745451513856.0, + "grad_norm": 0.08258832766371556, + "language_loss": 0.88905025, + "learning_rate": 0.000976876687931362, + "loss": 0.90037215, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.36181641, + "step": 646, + "time_per_iteration": 2.9785215854644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125302, + "balance_loss_mlp": 1.08853781, + "epoch": 0.1244709503655252, + "flos": 533460556800.0, + "grad_norm": 0.0911173559535341, + "language_loss": 0.84276652, + "learning_rate": 0.0009767829487868005, + "loss": 0.85401952, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.36767578, + "step": 647, + "time_per_iteration": 2.578190326690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115883, + "balance_loss_mlp": 1.07911873, + "epoch": 0.12466333205078876, + "flos": 507847034880.0, + "grad_norm": 0.07020857762254842, + "language_loss": 0.88315135, + "learning_rate": 0.000976689024538682, + "loss": 0.89431018, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.36743164, + "step": 648, + "time_per_iteration": 2.6223652362823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111573, + "balance_loss_mlp": 1.07841754, + "epoch": 0.12485571373605232, + "flos": 681023420928.0, + "grad_norm": 0.08555408637061691, + "language_loss": 0.86419356, + "learning_rate": 0.0009765949152234716, + "loss": 0.87535083, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.37280273, + "step": 649, + "time_per_iteration": 2.882483959197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304556, + "balance_loss_mlp": 1.27480125, + "epoch": 0.1250480954213159, + "flos": 1329402668544.0, + "grad_norm": 0.07016402939707722, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79990637, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.296875, + "step": 650, + "time_per_iteration": 4.66938042640686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109485, + "balance_loss_mlp": 1.05882525, + "epoch": 0.12524047710657946, + "flos": 938140683264.0, + "grad_norm": 0.06927891842453628, + "language_loss": 0.81679136, + "learning_rate": 0.0009764061415379919, + "loss": 0.82773983, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.36035156, + "step": 651, + "time_per_iteration": 3.2698771953582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096413, + "balance_loss_mlp": 1.05874252, + "epoch": 0.12543285879184302, + "flos": 513642235392.0, + "grad_norm": 0.07412805631018828, + "language_loss": 0.88318801, + "learning_rate": 0.0009763114772410109, + "loss": 0.89415216, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.37646484, + "step": 652, + "time_per_iteration": 2.5928001403808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115398, + "balance_loss_mlp": 1.0775615, + "epoch": 0.12562524047710658, + "flos": 717991617024.0, + "grad_norm": 0.06901346528680578, + "language_loss": 0.85726613, + "learning_rate": 0.0009762166280235146, + "loss": 0.86842012, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.37817383, + "step": 653, + "time_per_iteration": 2.954763412475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135328, + "balance_loss_mlp": 1.0974437, + "epoch": 0.12581762216237014, + "flos": 563441923584.0, + "grad_norm": 0.10573688852470094, + "language_loss": 0.86465615, + "learning_rate": 0.0009761215939223267, + "loss": 0.87600946, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.37866211, + "step": 654, + "time_per_iteration": 2.715884208679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132858, + "balance_loss_mlp": 1.09599805, + "epoch": 0.1260100038476337, + "flos": 481642785792.0, + "grad_norm": 0.09937756240260763, + "language_loss": 0.85917866, + "learning_rate": 0.0009760263749743428, + "loss": 0.87050724, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.3684082, + "step": 655, + "time_per_iteration": 2.565927505493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115454, + "balance_loss_mlp": 1.07847536, + "epoch": 0.12620238553289725, + "flos": 575269461504.0, + "grad_norm": 0.07472608136964497, + "language_loss": 0.89487195, + "learning_rate": 0.0009759309712165299, + "loss": 0.90602648, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.36962891, + "step": 656, + "time_per_iteration": 2.721547842025757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095241, + "balance_loss_mlp": 1.06002665, + "epoch": 0.12639476721816084, + "flos": 530909973504.0, + "grad_norm": 0.06565081457641837, + "language_loss": 0.92494375, + "learning_rate": 0.0009758353826859272, + "loss": 0.9358961, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.3527832, + "step": 657, + "time_per_iteration": 2.6744871139526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095397, + "balance_loss_mlp": 1.05891895, + "epoch": 0.1265871489034244, + "flos": 689654393856.0, + "grad_norm": 0.09523432489761414, + "language_loss": 0.88095021, + "learning_rate": 0.0009757396094196456, + "loss": 0.89190418, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36499023, + "step": 658, + "time_per_iteration": 2.909353256225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103392, + "balance_loss_mlp": 1.06801057, + "epoch": 0.12677953058868796, + "flos": 536863735296.0, + "grad_norm": 0.06690202483268812, + "language_loss": 0.8320483, + "learning_rate": 0.0009756436514548673, + "loss": 0.84308219, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.35449219, + "step": 659, + "time_per_iteration": 2.865816831588745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096361, + "balance_loss_mlp": 1.06143236, + "epoch": 0.12697191227395152, + "flos": 518749194240.0, + "grad_norm": 0.06842887259152383, + "language_loss": 0.87790155, + "learning_rate": 0.0009755475088288466, + "loss": 0.88886517, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.34985352, + "step": 660, + "time_per_iteration": 2.727024793624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095382, + "balance_loss_mlp": 1.06145549, + "epoch": 0.12716429395921508, + "flos": 566341714944.0, + "grad_norm": 0.09688683984474739, + "language_loss": 0.89628965, + "learning_rate": 0.0009754511815789095, + "loss": 0.90724349, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.33959961, + "step": 661, + "time_per_iteration": 2.857279062271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099651, + "balance_loss_mlp": 1.06441295, + "epoch": 0.12735667564447864, + "flos": 513844466688.0, + "grad_norm": 0.0675215866547423, + "language_loss": 0.85062414, + "learning_rate": 0.0009753546697424533, + "loss": 0.86162066, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.3527832, + "step": 662, + "time_per_iteration": 2.670924425125122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112335, + "balance_loss_mlp": 1.07750201, + "epoch": 0.1275490573297422, + "flos": 541023556608.0, + "grad_norm": 0.0877117205425541, + "language_loss": 0.89430654, + "learning_rate": 0.0009752579733569475, + "loss": 0.90542984, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.34887695, + "step": 663, + "time_per_iteration": 2.708876609802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270721, + "balance_loss_mlp": 1.24678338, + "epoch": 0.12774143901500576, + "flos": 1557872787456.0, + "grad_norm": 0.04579657173262409, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7615211, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.23925781, + "step": 664, + "time_per_iteration": 4.956411123275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112296, + "balance_loss_mlp": 1.07724893, + "epoch": 0.12793382070026935, + "flos": 613462781952.0, + "grad_norm": 0.07589772420679435, + "language_loss": 0.88920283, + "learning_rate": 0.0009750640270890217, + "loss": 0.90032578, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.35083008, + "step": 665, + "time_per_iteration": 2.7128844261169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111829, + "balance_loss_mlp": 1.08357668, + "epoch": 0.1281262023855329, + "flos": 707386231296.0, + "grad_norm": 0.09170618066625874, + "language_loss": 0.9529534, + "learning_rate": 0.0009749667772818983, + "loss": 0.9641363, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.34765625, + "step": 666, + "time_per_iteration": 3.001779794692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119074, + "balance_loss_mlp": 1.16718388, + "epoch": 0.12831858407079647, + "flos": 1424250086400.0, + "grad_norm": 0.026171542208985103, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78126681, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.23535156, + "step": 667, + "time_per_iteration": 4.816860914230347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097707, + "balance_loss_mlp": 1.06239688, + "epoch": 0.12851096575606002, + "flos": 448869316608.0, + "grad_norm": 0.08174433959814813, + "language_loss": 0.94348264, + "learning_rate": 0.0009747717245101093, + "loss": 0.95445979, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.35351562, + "step": 668, + "time_per_iteration": 2.5237252712249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092064, + "balance_loss_mlp": 1.05851901, + "epoch": 0.12870334744132358, + "flos": 479697486336.0, + "grad_norm": 0.09843416488997592, + "language_loss": 0.84683162, + "learning_rate": 0.00097467392162117, + "loss": 0.85775226, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.33544922, + "step": 669, + "time_per_iteration": 2.6030120849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103326, + "balance_loss_mlp": 1.06987596, + "epoch": 0.12889572912658714, + "flos": 638633963520.0, + "grad_norm": 0.06975318327908253, + "language_loss": 0.90683615, + "learning_rate": 0.0009745759344474708, + "loss": 0.91786939, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.3347168, + "step": 670, + "time_per_iteration": 2.81622576713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112132, + "balance_loss_mlp": 1.08779824, + "epoch": 0.1290881108118507, + "flos": 509693409792.0, + "grad_norm": 0.09191121702256037, + "language_loss": 0.88668084, + "learning_rate": 0.0009744777630270536, + "loss": 0.89789402, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.33544922, + "step": 671, + "time_per_iteration": 2.573746681213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131545, + "balance_loss_mlp": 1.09673548, + "epoch": 0.12928049249711426, + "flos": 670746894336.0, + "grad_norm": 0.0798229463492689, + "language_loss": 0.92632008, + "learning_rate": 0.000974379407398032, + "loss": 0.93763554, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.34863281, + "step": 672, + "time_per_iteration": 2.8804330825805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128596, + "balance_loss_mlp": 1.09471667, + "epoch": 0.12947287418237785, + "flos": 793158925824.0, + "grad_norm": 0.060594592327224854, + "language_loss": 0.81539643, + "learning_rate": 0.0009742808675985913, + "loss": 0.82668233, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.33911133, + "step": 673, + "time_per_iteration": 3.093003273010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144697, + "balance_loss_mlp": 1.11019778, + "epoch": 0.1296652558676414, + "flos": 485222054400.0, + "grad_norm": 0.09187527541403225, + "language_loss": 0.90132761, + "learning_rate": 0.0009741821436669876, + "loss": 0.91277468, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.34521484, + "step": 674, + "time_per_iteration": 2.585315227508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122846, + "balance_loss_mlp": 1.08925223, + "epoch": 0.12985763755290497, + "flos": 453226987008.0, + "grad_norm": 0.08498532425721701, + "language_loss": 0.91794449, + "learning_rate": 0.0009740832356415492, + "loss": 0.92917299, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.3359375, + "step": 675, + "time_per_iteration": 2.4971120357513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112081, + "balance_loss_mlp": 1.08714533, + "epoch": 0.13005001923816853, + "flos": 824719007232.0, + "grad_norm": 0.07677288344190451, + "language_loss": 0.87289226, + "learning_rate": 0.0009739841435606756, + "loss": 0.88410038, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.33691406, + "step": 676, + "time_per_iteration": 3.04789137840271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110492, + "balance_loss_mlp": 1.07670832, + "epoch": 0.1302424009234321, + "flos": 531107822592.0, + "grad_norm": 0.05631932912809994, + "language_loss": 0.89408028, + "learning_rate": 0.0009738848674628377, + "loss": 0.90518522, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.33789062, + "step": 677, + "time_per_iteration": 2.7033560276031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115712, + "balance_loss_mlp": 1.08161807, + "epoch": 0.13043478260869565, + "flos": 525626924544.0, + "grad_norm": 0.06061927769746001, + "language_loss": 0.88112855, + "learning_rate": 0.000973785407386578, + "loss": 0.8922857, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.34130859, + "step": 678, + "time_per_iteration": 2.7593955993652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111886, + "balance_loss_mlp": 1.07671893, + "epoch": 0.1306271642939592, + "flos": 625862108160.0, + "grad_norm": 0.0561156652888081, + "language_loss": 0.86748564, + "learning_rate": 0.0009736857633705103, + "loss": 0.87860453, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.35180664, + "step": 679, + "time_per_iteration": 2.859600067138672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104313, + "balance_loss_mlp": 1.07002795, + "epoch": 0.13081954597922277, + "flos": 550438723584.0, + "grad_norm": 0.058910355701146846, + "language_loss": 0.92178285, + "learning_rate": 0.0009735859354533196, + "loss": 0.93282604, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.34301758, + "step": 680, + "time_per_iteration": 2.7124130725860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097833, + "balance_loss_mlp": 1.06321418, + "epoch": 0.13101192766448633, + "flos": 536651329536.0, + "grad_norm": 0.0839399897160516, + "language_loss": 0.91048056, + "learning_rate": 0.0009734859236737628, + "loss": 0.92145896, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.34643555, + "step": 681, + "time_per_iteration": 2.627246856689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097126, + "balance_loss_mlp": 1.06102967, + "epoch": 0.13120430934974991, + "flos": 503258019840.0, + "grad_norm": 0.07457249787820815, + "language_loss": 0.92922121, + "learning_rate": 0.0009733857280706678, + "loss": 0.94019246, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.36108398, + "step": 682, + "time_per_iteration": 2.656088352203369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011015, + "balance_loss_mlp": 1.06669104, + "epoch": 0.13139669103501347, + "flos": 614014221312.0, + "grad_norm": 0.08799075641073119, + "language_loss": 0.83452725, + "learning_rate": 0.000973285348682934, + "loss": 0.84554225, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.34838867, + "step": 683, + "time_per_iteration": 2.714932441711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250838, + "balance_loss_mlp": 1.22547078, + "epoch": 0.13158907272027703, + "flos": 1484163357696.0, + "grad_norm": 0.05910904833943088, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.7914921, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.25390625, + "step": 684, + "time_per_iteration": 4.823149681091309 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102069, + "balance_loss_mlp": 1.06754637, + "epoch": 0.1317814544055406, + "flos": 985049344512.0, + "grad_norm": 0.06093749611395137, + "language_loss": 0.84928876, + "learning_rate": 0.0009730840387095046, + "loss": 0.86030942, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.34570312, + "step": 685, + "time_per_iteration": 3.2810635566711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112887, + "balance_loss_mlp": 1.07876921, + "epoch": 0.13197383609080415, + "flos": 611163892224.0, + "grad_norm": 0.0719979787644836, + "language_loss": 0.90753949, + "learning_rate": 0.0009729831082019642, + "loss": 0.91866839, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.34155273, + "step": 686, + "time_per_iteration": 2.8016421794891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121765, + "balance_loss_mlp": 1.08740878, + "epoch": 0.1321662177760677, + "flos": 494116305408.0, + "grad_norm": 0.06743381273529321, + "language_loss": 0.88199198, + "learning_rate": 0.0009728819940660958, + "loss": 0.89320958, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.34375, + "step": 687, + "time_per_iteration": 2.753110885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123642, + "balance_loss_mlp": 1.08966768, + "epoch": 0.13235859946133127, + "flos": 495591713280.0, + "grad_norm": 0.07411002639607889, + "language_loss": 0.84702134, + "learning_rate": 0.0009727806963411557, + "loss": 0.85825777, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.34008789, + "step": 688, + "time_per_iteration": 2.638277292251587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118088, + "balance_loss_mlp": 1.08342147, + "epoch": 0.13255098114659483, + "flos": 511417539072.0, + "grad_norm": 0.07589947069642403, + "language_loss": 0.86972356, + "learning_rate": 0.000972679215066471, + "loss": 0.88090444, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.34692383, + "step": 689, + "time_per_iteration": 2.6977994441986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102626, + "balance_loss_mlp": 1.06865191, + "epoch": 0.13274336283185842, + "flos": 547114120704.0, + "grad_norm": 0.07819243817703804, + "language_loss": 0.98617494, + "learning_rate": 0.0009725775502814401, + "loss": 0.99720132, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.33984375, + "step": 690, + "time_per_iteration": 2.648946523666382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094739, + "balance_loss_mlp": 1.05864239, + "epoch": 0.13293574451712198, + "flos": 640465781760.0, + "grad_norm": 0.059114915842817355, + "language_loss": 0.84878647, + "learning_rate": 0.0009724757020255327, + "loss": 0.85973388, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.36108398, + "step": 691, + "time_per_iteration": 2.8732690811157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082897, + "balance_loss_mlp": 1.04782593, + "epoch": 0.13312812620238554, + "flos": 491234042880.0, + "grad_norm": 0.07438205452368939, + "language_loss": 0.87005877, + "learning_rate": 0.0009723736703382902, + "loss": 0.88088775, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.35107422, + "step": 692, + "time_per_iteration": 2.554645299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107941, + "balance_loss_mlp": 1.04352796, + "epoch": 0.1333205078876491, + "flos": 508693837824.0, + "grad_norm": 0.08618570028449021, + "language_loss": 0.82726276, + "learning_rate": 0.0009722714552593244, + "loss": 0.8380568, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.35888672, + "step": 693, + "time_per_iteration": 2.6300699710845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108349, + "balance_loss_mlp": 1.04763222, + "epoch": 0.13351288957291266, + "flos": 418474722816.0, + "grad_norm": 0.09336455895373029, + "language_loss": 0.93701726, + "learning_rate": 0.000972169056828319, + "loss": 0.94785213, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.35864258, + "step": 694, + "time_per_iteration": 2.4744653701782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089355, + "balance_loss_mlp": 1.05309105, + "epoch": 0.13370527125817622, + "flos": 615614694912.0, + "grad_norm": 0.09775538219544704, + "language_loss": 0.87267971, + "learning_rate": 0.0009720664750850283, + "loss": 0.88357329, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.36279297, + "step": 695, + "time_per_iteration": 2.819199562072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087558, + "balance_loss_mlp": 1.05196249, + "epoch": 0.13389765294343978, + "flos": 625757391360.0, + "grad_norm": 0.08995446617022443, + "language_loss": 0.92670894, + "learning_rate": 0.0009719637100692784, + "loss": 0.93758452, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.35644531, + "step": 696, + "time_per_iteration": 2.710566997528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089346, + "balance_loss_mlp": 1.05460882, + "epoch": 0.13409003462870334, + "flos": 609391710720.0, + "grad_norm": 0.07471473065547057, + "language_loss": 0.82606006, + "learning_rate": 0.0009718607618209661, + "loss": 0.83695352, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.34765625, + "step": 697, + "time_per_iteration": 2.860895872116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100417, + "balance_loss_mlp": 1.06677604, + "epoch": 0.13428241631396692, + "flos": 683499810816.0, + "grad_norm": 0.06757273414028586, + "language_loss": 0.87573737, + "learning_rate": 0.0009717576303800595, + "loss": 0.88674152, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.33666992, + "step": 698, + "time_per_iteration": 3.044128894805908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105218, + "balance_loss_mlp": 1.07102871, + "epoch": 0.13447479799923048, + "flos": 508565799936.0, + "grad_norm": 0.06392403589518669, + "language_loss": 0.85563833, + "learning_rate": 0.0009716543157865975, + "loss": 0.86669052, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.34228516, + "step": 699, + "time_per_iteration": 2.6879220008850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124277, + "balance_loss_mlp": 1.08968258, + "epoch": 0.13466717968449404, + "flos": 897124737024.0, + "grad_norm": 0.10281325358067626, + "language_loss": 0.83577156, + "learning_rate": 0.0009715508180806907, + "loss": 0.84701437, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.34643555, + "step": 700, + "time_per_iteration": 3.1908302307128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132528, + "balance_loss_mlp": 1.09848189, + "epoch": 0.1348595613697576, + "flos": 989501557248.0, + "grad_norm": 0.07337445630948206, + "language_loss": 0.89328271, + "learning_rate": 0.0009714471373025202, + "loss": 0.90460801, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.34082031, + "step": 701, + "time_per_iteration": 3.438918113708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121396, + "balance_loss_mlp": 1.08704007, + "epoch": 0.13505194305502116, + "flos": 487580580864.0, + "grad_norm": 0.06971370423164719, + "language_loss": 0.88653499, + "learning_rate": 0.0009713432734923386, + "loss": 0.89774895, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.34399414, + "step": 702, + "time_per_iteration": 2.640204668045044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118965, + "balance_loss_mlp": 1.08372688, + "epoch": 0.13524432474028472, + "flos": 613103399424.0, + "grad_norm": 0.06937758634579687, + "language_loss": 0.8635335, + "learning_rate": 0.0009712392266904696, + "loss": 0.87472308, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.3527832, + "step": 703, + "time_per_iteration": 2.7081639766693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107456, + "balance_loss_mlp": 1.07381546, + "epoch": 0.13543670642554828, + "flos": 904425868800.0, + "grad_norm": 0.059624368341773884, + "language_loss": 0.8470363, + "learning_rate": 0.0009711349969373076, + "loss": 0.8581109, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.33666992, + "step": 704, + "time_per_iteration": 3.185788154602051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120367, + "balance_loss_mlp": 1.08629751, + "epoch": 0.13562908811081184, + "flos": 550335416832.0, + "grad_norm": 0.06837289886431508, + "language_loss": 0.80139232, + "learning_rate": 0.0009710305842733178, + "loss": 0.81259602, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.34106445, + "step": 705, + "time_per_iteration": 2.7622249126434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119894, + "balance_loss_mlp": 1.08534753, + "epoch": 0.1358214697960754, + "flos": 507797572608.0, + "grad_norm": 0.07938339172549091, + "language_loss": 0.89516854, + "learning_rate": 0.0009709259887390373, + "loss": 0.90636754, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.34570312, + "step": 706, + "time_per_iteration": 2.5919415950775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112775, + "balance_loss_mlp": 1.09141469, + "epoch": 0.136013851481339, + "flos": 528640197120.0, + "grad_norm": 0.10398540964391637, + "language_loss": 0.90775406, + "learning_rate": 0.0009708212103750737, + "loss": 0.9190315, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.36328125, + "step": 707, + "time_per_iteration": 2.601414680480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118489, + "balance_loss_mlp": 1.0827502, + "epoch": 0.13620623316660255, + "flos": 658772379648.0, + "grad_norm": 0.10289617102375577, + "language_loss": 0.87215245, + "learning_rate": 0.0009707162492221051, + "loss": 0.88333738, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.35766602, + "step": 708, + "time_per_iteration": 2.9150781631469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107244, + "balance_loss_mlp": 1.07193458, + "epoch": 0.1363986148518661, + "flos": 671583522816.0, + "grad_norm": 0.07053364895365258, + "language_loss": 0.88057113, + "learning_rate": 0.0009706111053208815, + "loss": 0.89164358, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.35375977, + "step": 709, + "time_per_iteration": 2.8282904624938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104491, + "balance_loss_mlp": 1.06801295, + "epoch": 0.13659099653712967, + "flos": 472828520448.0, + "grad_norm": 0.06130049777218646, + "language_loss": 0.85717642, + "learning_rate": 0.0009705057787122232, + "loss": 0.86822134, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.36499023, + "step": 710, + "time_per_iteration": 2.577875852584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115861, + "balance_loss_mlp": 1.07890666, + "epoch": 0.13678337822239323, + "flos": 452483490816.0, + "grad_norm": 0.06671527486676954, + "language_loss": 0.91032815, + "learning_rate": 0.0009704002694370216, + "loss": 0.92148674, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.36962891, + "step": 711, + "time_per_iteration": 2.5226385593414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113129, + "balance_loss_mlp": 1.09509826, + "epoch": 0.13697575990765679, + "flos": 519373416960.0, + "grad_norm": 0.06767720569390717, + "language_loss": 0.8601349, + "learning_rate": 0.0009702945775362388, + "loss": 0.8714478, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.36206055, + "step": 712, + "time_per_iteration": 2.6134419441223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129624, + "balance_loss_mlp": 1.09214449, + "epoch": 0.13716814159292035, + "flos": 480145618944.0, + "grad_norm": 0.06923332159298135, + "language_loss": 0.86543357, + "learning_rate": 0.0009701887030509086, + "loss": 0.87672985, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.37426758, + "step": 713, + "time_per_iteration": 2.6801493167877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123971, + "balance_loss_mlp": 1.08735013, + "epoch": 0.1373605232781839, + "flos": 545376844800.0, + "grad_norm": 0.08447530320779993, + "language_loss": 0.90941691, + "learning_rate": 0.0009700826460221346, + "loss": 0.92065662, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.36645508, + "step": 714, + "time_per_iteration": 2.6499831676483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124692, + "balance_loss_mlp": 1.0878799, + "epoch": 0.1375529049634475, + "flos": 708473143296.0, + "grad_norm": 0.08158263793675288, + "language_loss": 0.92094153, + "learning_rate": 0.0009699764064910921, + "loss": 0.93218845, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.36816406, + "step": 715, + "time_per_iteration": 2.8663330078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100593, + "balance_loss_mlp": 1.0652591, + "epoch": 0.13774528664871105, + "flos": 486452971008.0, + "grad_norm": 0.0638700652453299, + "language_loss": 0.86489999, + "learning_rate": 0.0009698699844990268, + "loss": 0.87590599, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.35351562, + "step": 716, + "time_per_iteration": 2.680769443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097936, + "balance_loss_mlp": 1.06236374, + "epoch": 0.1379376683339746, + "flos": 679885636608.0, + "grad_norm": 0.06268585455781102, + "language_loss": 0.87917447, + "learning_rate": 0.0009697633800872555, + "loss": 0.89015377, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.35595703, + "step": 717, + "time_per_iteration": 2.965280532836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095044, + "balance_loss_mlp": 1.05956769, + "epoch": 0.13813005001923817, + "flos": 610628419584.0, + "grad_norm": 0.06824665625382514, + "language_loss": 0.9079777, + "learning_rate": 0.0009696565932971655, + "loss": 0.91892809, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.35498047, + "step": 718, + "time_per_iteration": 2.896911144256592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089845, + "balance_loss_mlp": 1.05451119, + "epoch": 0.13832243170450173, + "flos": 588431222784.0, + "grad_norm": 0.09498294885790176, + "language_loss": 0.89284754, + "learning_rate": 0.0009695496241702153, + "loss": 0.90374601, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.35375977, + "step": 719, + "time_per_iteration": 2.7762036323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100053, + "balance_loss_mlp": 1.0647912, + "epoch": 0.1385148133897653, + "flos": 699674844672.0, + "grad_norm": 0.06645840883514359, + "language_loss": 0.85660797, + "learning_rate": 0.0009694424727479339, + "loss": 0.86760849, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.3527832, + "step": 720, + "time_per_iteration": 2.899481773376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105877, + "balance_loss_mlp": 1.06997156, + "epoch": 0.13870719507502885, + "flos": 597977399808.0, + "grad_norm": 0.0836580120862117, + "language_loss": 0.88687581, + "learning_rate": 0.0009693351390719213, + "loss": 0.89793456, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.35913086, + "step": 721, + "time_per_iteration": 2.7242469787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116308, + "balance_loss_mlp": 1.08071184, + "epoch": 0.1388995767602924, + "flos": 586279309824.0, + "grad_norm": 0.0677561083547336, + "language_loss": 0.90886325, + "learning_rate": 0.000969227623183848, + "loss": 0.9200263, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.35595703, + "step": 722, + "time_per_iteration": 2.819762706756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122367, + "balance_loss_mlp": 1.08719993, + "epoch": 0.139091958445556, + "flos": 650810709504.0, + "grad_norm": 0.06096675577850975, + "language_loss": 0.9079504, + "learning_rate": 0.0009691199251254554, + "loss": 0.91917408, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.35180664, + "step": 723, + "time_per_iteration": 2.9057154655456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111876, + "balance_loss_mlp": 1.08368921, + "epoch": 0.13928434013081956, + "flos": 575446961664.0, + "grad_norm": 0.07869545166834224, + "language_loss": 0.86502081, + "learning_rate": 0.0009690120449385555, + "loss": 0.87620842, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.35107422, + "step": 724, + "time_per_iteration": 2.753779411315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117404, + "balance_loss_mlp": 1.08164096, + "epoch": 0.13947672181608312, + "flos": 562954503168.0, + "grad_norm": 0.05745765153927115, + "language_loss": 0.92949581, + "learning_rate": 0.0009689039826650312, + "loss": 0.94066983, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.35791016, + "step": 725, + "time_per_iteration": 2.7707176208496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01358579, + "balance_loss_mlp": 1.33788455, + "epoch": 0.13966910350134668, + "flos": 1520699387904.0, + "grad_norm": 0.08980106345901108, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77881646, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.20703125, + "step": 726, + "time_per_iteration": 4.990100383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122131, + "balance_loss_mlp": 1.08632064, + "epoch": 0.13986148518661023, + "flos": 499604557824.0, + "grad_norm": 0.08882129772973828, + "language_loss": 0.8687858, + "learning_rate": 0.0009686873120259941, + "loss": 0.88000709, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.35839844, + "step": 727, + "time_per_iteration": 2.598994255065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124101, + "balance_loss_mlp": 1.08914924, + "epoch": 0.1400538668718738, + "flos": 598381862400.0, + "grad_norm": 0.060515823337661194, + "language_loss": 0.86860693, + "learning_rate": 0.0009685787037446004, + "loss": 0.879848, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.34985352, + "step": 728, + "time_per_iteration": 2.818753957748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117164, + "balance_loss_mlp": 1.08252215, + "epoch": 0.14024624855713735, + "flos": 593757941760.0, + "grad_norm": 0.07103959200550099, + "language_loss": 0.86954272, + "learning_rate": 0.0009684699135448201, + "loss": 0.88071442, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.34667969, + "step": 729, + "time_per_iteration": 2.7140605449676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117139, + "balance_loss_mlp": 1.08190084, + "epoch": 0.1404386302424009, + "flos": 506335311360.0, + "grad_norm": 0.05207553557344927, + "language_loss": 0.91554511, + "learning_rate": 0.0009683609414688895, + "loss": 0.92671645, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.3527832, + "step": 730, + "time_per_iteration": 2.700392961502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115792, + "balance_loss_mlp": 1.08076811, + "epoch": 0.14063101192766447, + "flos": 573132105216.0, + "grad_norm": 0.0649489891311747, + "language_loss": 0.85963869, + "learning_rate": 0.0009682517875591154, + "loss": 0.87079668, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.35058594, + "step": 731, + "time_per_iteration": 2.7288033962249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108316, + "balance_loss_mlp": 1.07329249, + "epoch": 0.14082339361292806, + "flos": 564333806592.0, + "grad_norm": 0.08055333626892905, + "language_loss": 0.8568505, + "learning_rate": 0.0009681424518578749, + "loss": 0.86793363, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.35058594, + "step": 732, + "time_per_iteration": 2.7607100009918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098228, + "balance_loss_mlp": 1.06337106, + "epoch": 0.14101577529819162, + "flos": 463336187904.0, + "grad_norm": 0.057006483972196494, + "language_loss": 0.87377727, + "learning_rate": 0.000968032934407616, + "loss": 0.8847596, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.34912109, + "step": 733, + "time_per_iteration": 2.5924746990203857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109135, + "balance_loss_mlp": 1.05708933, + "epoch": 0.14120815698345518, + "flos": 595791991296.0, + "grad_norm": 0.06839942690263572, + "language_loss": 0.81019294, + "learning_rate": 0.0009679232352508571, + "loss": 0.82110655, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.34301758, + "step": 734, + "time_per_iteration": 2.7993721961975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100792, + "balance_loss_mlp": 1.06455231, + "epoch": 0.14140053866871874, + "flos": 534864591360.0, + "grad_norm": 0.05863508932167985, + "language_loss": 0.80278933, + "learning_rate": 0.0009678133544301871, + "loss": 0.8137973, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.36254883, + "step": 735, + "time_per_iteration": 2.673874855041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094272, + "balance_loss_mlp": 1.05881953, + "epoch": 0.1415929203539823, + "flos": 520013606400.0, + "grad_norm": 0.05551108490857041, + "language_loss": 0.91367602, + "learning_rate": 0.0009677032919882658, + "loss": 0.92461878, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.35473633, + "step": 736, + "time_per_iteration": 2.654606580734253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096366, + "balance_loss_mlp": 1.06012654, + "epoch": 0.14178530203924586, + "flos": 482095300608.0, + "grad_norm": 0.07346959128329188, + "language_loss": 0.91181809, + "learning_rate": 0.000967593047967823, + "loss": 0.92278177, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.36230469, + "step": 737, + "time_per_iteration": 2.559713125228882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096247, + "balance_loss_mlp": 1.06096137, + "epoch": 0.14197768372450942, + "flos": 676339863552.0, + "grad_norm": 0.08415375039396082, + "language_loss": 0.86267197, + "learning_rate": 0.0009674826224116593, + "loss": 0.87363446, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.35302734, + "step": 738, + "time_per_iteration": 2.809206485748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097639, + "balance_loss_mlp": 1.06197131, + "epoch": 0.14217006540977298, + "flos": 445802199552.0, + "grad_norm": 0.07057178035488912, + "language_loss": 0.86339009, + "learning_rate": 0.0009673720153626455, + "loss": 0.87436646, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.35668945, + "step": 739, + "time_per_iteration": 2.612968683242798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104371, + "balance_loss_mlp": 1.06848931, + "epoch": 0.14236244709503657, + "flos": 496261016064.0, + "grad_norm": 0.07271668848978735, + "language_loss": 0.87052834, + "learning_rate": 0.0009672612268637235, + "loss": 0.88157207, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.35913086, + "step": 740, + "time_per_iteration": 2.61069393157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110664, + "balance_loss_mlp": 1.0753777, + "epoch": 0.14255482878030012, + "flos": 648022989312.0, + "grad_norm": 0.0891355718419961, + "language_loss": 0.84501529, + "learning_rate": 0.0009671502569579048, + "loss": 0.85612196, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.35302734, + "step": 741, + "time_per_iteration": 2.735647201538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110487, + "balance_loss_mlp": 1.07122874, + "epoch": 0.14274721046556368, + "flos": 535888894464.0, + "grad_norm": 0.08695556970227908, + "language_loss": 0.89623845, + "learning_rate": 0.0009670391056882719, + "loss": 0.90728712, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.33666992, + "step": 742, + "time_per_iteration": 2.7107605934143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112128, + "balance_loss_mlp": 1.07879674, + "epoch": 0.14293959215082724, + "flos": 956677215744.0, + "grad_norm": 0.07027307452403737, + "language_loss": 0.88442421, + "learning_rate": 0.0009669277730979776, + "loss": 0.89554548, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.33349609, + "step": 743, + "time_per_iteration": 3.188511371612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106141, + "balance_loss_mlp": 1.07295275, + "epoch": 0.1431319738360908, + "flos": 692766590976.0, + "grad_norm": 0.060274127994165407, + "language_loss": 0.85487998, + "learning_rate": 0.0009668162592302449, + "loss": 0.86594141, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.33203125, + "step": 744, + "time_per_iteration": 2.912363290786743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111089, + "balance_loss_mlp": 1.07715416, + "epoch": 0.14332435552135436, + "flos": 565174817280.0, + "grad_norm": 0.05989361998422495, + "language_loss": 0.86368543, + "learning_rate": 0.0009667045641283676, + "loss": 0.8747943, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.33764648, + "step": 745, + "time_per_iteration": 2.705873489379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105291, + "balance_loss_mlp": 1.07246089, + "epoch": 0.14351673720661792, + "flos": 738045665280.0, + "grad_norm": 0.07442691981713179, + "language_loss": 0.94493437, + "learning_rate": 0.0009665926878357092, + "loss": 0.95598727, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.32836914, + "step": 746, + "time_per_iteration": 2.941594362258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112013, + "balance_loss_mlp": 1.07865858, + "epoch": 0.14370911889188148, + "flos": 548951731200.0, + "grad_norm": 0.0692560914525881, + "language_loss": 0.91247988, + "learning_rate": 0.0009664806303957043, + "loss": 0.92359996, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.33374023, + "step": 747, + "time_per_iteration": 2.70877742767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112762, + "balance_loss_mlp": 1.0790261, + "epoch": 0.14390150057714507, + "flos": 589973469696.0, + "grad_norm": 0.06347995643195156, + "language_loss": 0.87284487, + "learning_rate": 0.0009663683918518571, + "loss": 0.88397241, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.33764648, + "step": 748, + "time_per_iteration": 2.9173765182495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128804, + "balance_loss_mlp": 1.09583056, + "epoch": 0.14409388226240863, + "flos": 590773782528.0, + "grad_norm": 0.07165520049303264, + "language_loss": 0.85690349, + "learning_rate": 0.0009662559722477428, + "loss": 0.8681916, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.32983398, + "step": 749, + "time_per_iteration": 2.6703925132751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293618, + "balance_loss_mlp": 1.26653337, + "epoch": 0.1442862639476722, + "flos": 1510418479104.0, + "grad_norm": 0.05750783583060037, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77456594, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.27148438, + "step": 750, + "time_per_iteration": 5.001406192779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114826, + "balance_loss_mlp": 1.11492896, + "epoch": 0.14447864563293575, + "flos": 496493770752.0, + "grad_norm": 0.0903406164143912, + "language_loss": 0.88906193, + "learning_rate": 0.0009660305900333632, + "loss": 0.90054452, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.33349609, + "step": 751, + "time_per_iteration": 2.6897666454315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151429, + "balance_loss_mlp": 1.11859906, + "epoch": 0.1446710273181993, + "flos": 589400271360.0, + "grad_norm": 0.07731756572669998, + "language_loss": 0.82109559, + "learning_rate": 0.0009659176275105992, + "loss": 0.83260989, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.32836914, + "step": 752, + "time_per_iteration": 2.7144923210144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156541, + "balance_loss_mlp": 1.12294829, + "epoch": 0.14486340900346287, + "flos": 585521256960.0, + "grad_norm": 0.08104938710710845, + "language_loss": 0.8584373, + "learning_rate": 0.0009658044841025701, + "loss": 0.87000269, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.33618164, + "step": 753, + "time_per_iteration": 2.7651891708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134874, + "balance_loss_mlp": 1.10116172, + "epoch": 0.14505579068872643, + "flos": 504405978624.0, + "grad_norm": 0.06446620792536047, + "language_loss": 0.80912805, + "learning_rate": 0.0009656911598532021, + "loss": 0.82047671, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.33740234, + "step": 754, + "time_per_iteration": 2.6575491428375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113793, + "balance_loss_mlp": 1.10345459, + "epoch": 0.14524817237399, + "flos": 486566452224.0, + "grad_norm": 0.0617560649750725, + "language_loss": 0.89835, + "learning_rate": 0.0009655776548064917, + "loss": 0.90972924, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.3449707, + "step": 755, + "time_per_iteration": 2.684224843978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133332, + "balance_loss_mlp": 1.100263, + "epoch": 0.14544055405925355, + "flos": 727857888768.0, + "grad_norm": 0.0723196770544797, + "language_loss": 0.88265425, + "learning_rate": 0.0009654639690065054, + "loss": 0.89398754, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.33081055, + "step": 756, + "time_per_iteration": 2.8975589275360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133271, + "balance_loss_mlp": 1.10063124, + "epoch": 0.14563293574451713, + "flos": 593359271424.0, + "grad_norm": 0.0666179485403068, + "language_loss": 0.87639153, + "learning_rate": 0.00096535010249738, + "loss": 0.88772416, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.32641602, + "step": 757, + "time_per_iteration": 2.7852935791015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118669, + "balance_loss_mlp": 1.08555305, + "epoch": 0.1458253174297807, + "flos": 560192924160.0, + "grad_norm": 0.06671579144124269, + "language_loss": 0.82458985, + "learning_rate": 0.0009652360553233224, + "loss": 0.83577645, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.33129883, + "step": 758, + "time_per_iteration": 2.790372610092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231318, + "balance_loss_mlp": 1.20690441, + "epoch": 0.14601769911504425, + "flos": 1557025984512.0, + "grad_norm": 0.06334391267713868, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.75005066, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.24414062, + "step": 759, + "time_per_iteration": 4.9441094398498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113265, + "balance_loss_mlp": 1.08062565, + "epoch": 0.1462100808003078, + "flos": 865922628096.0, + "grad_norm": 0.06716213865762054, + "language_loss": 0.81441242, + "learning_rate": 0.0009650074191575883, + "loss": 0.82554507, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.32641602, + "step": 760, + "time_per_iteration": 3.2887775897979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109667, + "balance_loss_mlp": 1.07664585, + "epoch": 0.14640246248557137, + "flos": 522673288704.0, + "grad_norm": 0.06510043774355635, + "language_loss": 0.85560381, + "learning_rate": 0.0009648928302546766, + "loss": 0.86670047, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.33032227, + "step": 761, + "time_per_iteration": 2.6996572017669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095513, + "balance_loss_mlp": 1.06308818, + "epoch": 0.14659484417083493, + "flos": 1030121805312.0, + "grad_norm": 0.06592560206527708, + "language_loss": 0.85148716, + "learning_rate": 0.0009647780608643613, + "loss": 0.86244226, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.32421875, + "step": 762, + "time_per_iteration": 3.3860111236572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101334, + "balance_loss_mlp": 1.06843269, + "epoch": 0.1467872258560985, + "flos": 500426629632.0, + "grad_norm": 0.08422515931666542, + "language_loss": 0.87252343, + "learning_rate": 0.0009646631110312001, + "loss": 0.88353688, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.32910156, + "step": 763, + "time_per_iteration": 2.62996506690979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097579, + "balance_loss_mlp": 1.06455803, + "epoch": 0.14697960754136205, + "flos": 547514201088.0, + "grad_norm": 0.05843071383105212, + "language_loss": 0.88439989, + "learning_rate": 0.0009645479807998203, + "loss": 0.89537567, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.33032227, + "step": 764, + "time_per_iteration": 2.7762649059295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091998, + "balance_loss_mlp": 1.059955, + "epoch": 0.14717198922662564, + "flos": 517586678784.0, + "grad_norm": 0.06085607876830046, + "language_loss": 0.92027354, + "learning_rate": 0.0009644326702149196, + "loss": 0.93119353, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.3203125, + "step": 765, + "time_per_iteration": 2.7927489280700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093686, + "balance_loss_mlp": 1.0607841, + "epoch": 0.1473643709118892, + "flos": 731661147648.0, + "grad_norm": 0.07854715386493856, + "language_loss": 0.84577298, + "learning_rate": 0.0009643171793212653, + "loss": 0.85670984, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.32910156, + "step": 766, + "time_per_iteration": 3.1133480072021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092266, + "balance_loss_mlp": 1.05976951, + "epoch": 0.14755675259715276, + "flos": 620257554432.0, + "grad_norm": 0.102413583922894, + "language_loss": 0.89411926, + "learning_rate": 0.0009642015081636952, + "loss": 0.90504193, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.32495117, + "step": 767, + "time_per_iteration": 2.715090274810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098654, + "balance_loss_mlp": 1.06658697, + "epoch": 0.14774913428241632, + "flos": 451981513728.0, + "grad_norm": 0.07135930824346515, + "language_loss": 0.8782866, + "learning_rate": 0.0009640856567871166, + "loss": 0.88927317, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.32055664, + "step": 768, + "time_per_iteration": 2.550196409225464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105008, + "balance_loss_mlp": 1.07258272, + "epoch": 0.14794151596767988, + "flos": 836881196544.0, + "grad_norm": 0.05799185647214189, + "language_loss": 0.8870768, + "learning_rate": 0.0009639696252365072, + "loss": 0.8981269, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.32421875, + "step": 769, + "time_per_iteration": 3.0786449909210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100829, + "balance_loss_mlp": 1.06869006, + "epoch": 0.14813389765294344, + "flos": 685765204992.0, + "grad_norm": 0.05886019056348146, + "language_loss": 0.81861567, + "learning_rate": 0.0009638534135569144, + "loss": 0.82962394, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.32128906, + "step": 770, + "time_per_iteration": 2.9026055335998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109058, + "balance_loss_mlp": 1.07641852, + "epoch": 0.148326279338207, + "flos": 509625008640.0, + "grad_norm": 0.061687073411883335, + "language_loss": 0.89819336, + "learning_rate": 0.0009637370217934554, + "loss": 0.909284, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.32641602, + "step": 771, + "time_per_iteration": 2.651155471801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102766, + "balance_loss_mlp": 1.07062733, + "epoch": 0.14851866102347056, + "flos": 587869608960.0, + "grad_norm": 0.06890537390791286, + "language_loss": 0.82949096, + "learning_rate": 0.0009636204499913175, + "loss": 0.84051859, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.32128906, + "step": 772, + "time_per_iteration": 2.8484935760498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109887, + "balance_loss_mlp": 1.06713676, + "epoch": 0.14871104270873411, + "flos": 690722366976.0, + "grad_norm": 0.05724303399039588, + "language_loss": 0.88008785, + "learning_rate": 0.0009635036981957581, + "loss": 0.89107656, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.31713867, + "step": 773, + "time_per_iteration": 2.875896453857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098837, + "balance_loss_mlp": 1.06586373, + "epoch": 0.1489034243939977, + "flos": 654803205120.0, + "grad_norm": 0.06792329386178385, + "language_loss": 0.90737289, + "learning_rate": 0.0009633867664521043, + "loss": 0.91836131, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.32983398, + "step": 774, + "time_per_iteration": 2.8590240478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105117, + "balance_loss_mlp": 1.07202482, + "epoch": 0.14909580607926126, + "flos": 475595891712.0, + "grad_norm": 0.07543072164382301, + "language_loss": 0.86562771, + "learning_rate": 0.0009632696548057527, + "loss": 0.87667894, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.33105469, + "step": 775, + "time_per_iteration": 2.598287343978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103717, + "balance_loss_mlp": 1.07136405, + "epoch": 0.14928818776452482, + "flos": 610789953024.0, + "grad_norm": 0.06953515395492163, + "language_loss": 0.8490293, + "learning_rate": 0.0009631523633021704, + "loss": 0.86006653, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.32348633, + "step": 776, + "time_per_iteration": 2.842017650604248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097387, + "balance_loss_mlp": 1.0640794, + "epoch": 0.14948056944978838, + "flos": 561487859712.0, + "grad_norm": 0.0785359858255581, + "language_loss": 0.87875742, + "learning_rate": 0.0009630348919868936, + "loss": 0.88973129, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.33325195, + "step": 777, + "time_per_iteration": 2.693345308303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096515, + "balance_loss_mlp": 1.06244552, + "epoch": 0.14967295113505194, + "flos": 448972623360.0, + "grad_norm": 0.0986803150049228, + "language_loss": 0.81203282, + "learning_rate": 0.0009629172409055293, + "loss": 0.82299805, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.34106445, + "step": 778, + "time_per_iteration": 2.50610613822937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100631, + "balance_loss_mlp": 1.06780052, + "epoch": 0.1498653328203155, + "flos": 571000541184.0, + "grad_norm": 0.06451123510709528, + "language_loss": 0.872877, + "learning_rate": 0.0009627994101037531, + "loss": 0.88388336, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.32836914, + "step": 779, + "time_per_iteration": 2.735919713973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093349, + "balance_loss_mlp": 1.06016171, + "epoch": 0.15005771450557906, + "flos": 630918194688.0, + "grad_norm": 0.06921626087658436, + "language_loss": 0.89007759, + "learning_rate": 0.0009626813996273114, + "loss": 0.90101105, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.33203125, + "step": 780, + "time_per_iteration": 2.8758528232574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089076, + "balance_loss_mlp": 1.05646062, + "epoch": 0.15025009619084262, + "flos": 577633780224.0, + "grad_norm": 0.07846674622794232, + "language_loss": 0.88800216, + "learning_rate": 0.0009625632095220198, + "loss": 0.89889288, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.32617188, + "step": 781, + "time_per_iteration": 2.822981357574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091834, + "balance_loss_mlp": 1.05874181, + "epoch": 0.1504424778761062, + "flos": 483646311936.0, + "grad_norm": 0.06496680151927305, + "language_loss": 0.86870086, + "learning_rate": 0.0009624448398337637, + "loss": 0.87961924, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.33105469, + "step": 782, + "time_per_iteration": 2.5370984077453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093814, + "balance_loss_mlp": 1.06022096, + "epoch": 0.15063485956136977, + "flos": 762167812608.0, + "grad_norm": 0.05765358341264215, + "language_loss": 0.89159006, + "learning_rate": 0.0009623262906084984, + "loss": 0.90252817, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.33618164, + "step": 783, + "time_per_iteration": 3.005157709121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099941, + "balance_loss_mlp": 1.06773031, + "epoch": 0.15082724124663333, + "flos": 497369687040.0, + "grad_norm": 0.06003141928684199, + "language_loss": 0.90186155, + "learning_rate": 0.0009622075618922486, + "loss": 0.91286093, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.32202148, + "step": 784, + "time_per_iteration": 2.660804510116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093154, + "balance_loss_mlp": 1.06142032, + "epoch": 0.15101962293189689, + "flos": 509476621824.0, + "grad_norm": 0.06057287359381707, + "language_loss": 0.86789852, + "learning_rate": 0.0009620886537311091, + "loss": 0.87883008, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.31713867, + "step": 785, + "time_per_iteration": 2.6273694038391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095369, + "balance_loss_mlp": 1.06210947, + "epoch": 0.15121200461716044, + "flos": 457520638464.0, + "grad_norm": 0.08138425523138582, + "language_loss": 0.84774673, + "learning_rate": 0.000961969566171244, + "loss": 0.85870039, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.33276367, + "step": 786, + "time_per_iteration": 2.5942111015319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095064, + "balance_loss_mlp": 1.06223416, + "epoch": 0.151404386302424, + "flos": 537729477120.0, + "grad_norm": 0.07863928657369654, + "language_loss": 0.90186292, + "learning_rate": 0.0009618502992588873, + "loss": 0.9128136, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.32836914, + "step": 787, + "time_per_iteration": 2.619929790496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091478, + "balance_loss_mlp": 1.05955386, + "epoch": 0.15159676798768756, + "flos": 687858891264.0, + "grad_norm": 0.0744293727729202, + "language_loss": 0.88114512, + "learning_rate": 0.0009617308530403424, + "loss": 0.89205992, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.3190918, + "step": 788, + "time_per_iteration": 2.9888041019439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093086, + "balance_loss_mlp": 1.0604943, + "epoch": 0.15178914967295112, + "flos": 545042193408.0, + "grad_norm": 0.06582928588586826, + "language_loss": 0.87262332, + "learning_rate": 0.0009616112275619825, + "loss": 0.8835541, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.32592773, + "step": 789, + "time_per_iteration": 2.7160654067993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099005, + "balance_loss_mlp": 1.0666275, + "epoch": 0.1519815313582147, + "flos": 511510671360.0, + "grad_norm": 0.05890477263154721, + "language_loss": 0.83453441, + "learning_rate": 0.0009614914228702503, + "loss": 0.84552449, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.32373047, + "step": 790, + "time_per_iteration": 2.67269229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106158, + "balance_loss_mlp": 1.07342279, + "epoch": 0.15217391304347827, + "flos": 683747122176.0, + "grad_norm": 0.05177473030839046, + "language_loss": 0.88909948, + "learning_rate": 0.0009613714390116581, + "loss": 0.90016103, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.32739258, + "step": 791, + "time_per_iteration": 2.978431224822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104946, + "balance_loss_mlp": 1.07304585, + "epoch": 0.15236629472874183, + "flos": 643873342464.0, + "grad_norm": 0.07017768347884551, + "language_loss": 0.8558737, + "learning_rate": 0.0009612512760327879, + "loss": 0.86692309, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.31884766, + "step": 792, + "time_per_iteration": 2.854128837585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108697, + "balance_loss_mlp": 1.07562804, + "epoch": 0.1525586764140054, + "flos": 412654791168.0, + "grad_norm": 0.06359759833531073, + "language_loss": 0.84205759, + "learning_rate": 0.0009611309339802909, + "loss": 0.85314453, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.33081055, + "step": 793, + "time_per_iteration": 2.46451997756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108319, + "balance_loss_mlp": 1.07510698, + "epoch": 0.15275105809926895, + "flos": 802444644864.0, + "grad_norm": 0.051071876240168755, + "language_loss": 0.84049302, + "learning_rate": 0.0009610104129008881, + "loss": 0.85157621, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.33227539, + "step": 794, + "time_per_iteration": 3.111494541168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011005, + "balance_loss_mlp": 1.06836164, + "epoch": 0.1529434397845325, + "flos": 612143115264.0, + "grad_norm": 0.06279651541206067, + "language_loss": 0.88408649, + "learning_rate": 0.0009608897128413701, + "loss": 0.89509147, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.32128906, + "step": 795, + "time_per_iteration": 2.7248153686523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103807, + "balance_loss_mlp": 1.07121563, + "epoch": 0.15313582146979607, + "flos": 614941009920.0, + "grad_norm": 0.04889604688954522, + "language_loss": 0.85449052, + "learning_rate": 0.0009607688338485965, + "loss": 0.86552852, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.32592773, + "step": 796, + "time_per_iteration": 2.8646762371063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100645, + "balance_loss_mlp": 1.06731439, + "epoch": 0.15332820315505963, + "flos": 793256440320.0, + "grad_norm": 0.057433682914461805, + "language_loss": 0.90353924, + "learning_rate": 0.0009606477759694969, + "loss": 0.91454566, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.33349609, + "step": 797, + "time_per_iteration": 3.0346486568450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108023, + "balance_loss_mlp": 1.0744772, + "epoch": 0.1535205848403232, + "flos": 549945510912.0, + "grad_norm": 0.08021572729531513, + "language_loss": 0.87206727, + "learning_rate": 0.0009605265392510703, + "loss": 0.88314748, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.33544922, + "step": 798, + "time_per_iteration": 2.6084530353546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097421, + "balance_loss_mlp": 1.065521, + "epoch": 0.15371296652558677, + "flos": 535691045376.0, + "grad_norm": 0.06650858832922667, + "language_loss": 0.91961598, + "learning_rate": 0.0009604051237403846, + "loss": 0.93059021, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.31884766, + "step": 799, + "time_per_iteration": 2.629930019378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111802, + "balance_loss_mlp": 1.07951975, + "epoch": 0.15390534821085033, + "flos": 395002939392.0, + "grad_norm": 0.12724142526344331, + "language_loss": 0.85673767, + "learning_rate": 0.0009602835294845776, + "loss": 0.86785567, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.32275391, + "step": 800, + "time_per_iteration": 2.4388976097106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116786, + "balance_loss_mlp": 1.08374119, + "epoch": 0.1540977298961139, + "flos": 535587738624.0, + "grad_norm": 0.06962057985754792, + "language_loss": 0.9036696, + "learning_rate": 0.0009601617565308565, + "loss": 0.91483742, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.33056641, + "step": 801, + "time_per_iteration": 2.6220815181732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112115, + "balance_loss_mlp": 1.08829629, + "epoch": 0.15429011158137745, + "flos": 723388147200.0, + "grad_norm": 0.07662224573984003, + "language_loss": 0.86584908, + "learning_rate": 0.0009600398049264977, + "loss": 0.87706065, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.32861328, + "step": 802, + "time_per_iteration": 2.9767894744873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122446, + "balance_loss_mlp": 1.08870947, + "epoch": 0.154482493266641, + "flos": 620209502208.0, + "grad_norm": 0.07007784052810237, + "language_loss": 0.91261709, + "learning_rate": 0.0009599176747188469, + "loss": 0.9238416, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.33764648, + "step": 803, + "time_per_iteration": 2.8329989910125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105402, + "balance_loss_mlp": 1.07242846, + "epoch": 0.15467487495190457, + "flos": 525351909888.0, + "grad_norm": 0.06284855896117353, + "language_loss": 0.82565022, + "learning_rate": 0.0009597953659553196, + "loss": 0.83670425, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.32983398, + "step": 804, + "time_per_iteration": 2.6918182373046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101021, + "balance_loss_mlp": 1.06814265, + "epoch": 0.15486725663716813, + "flos": 527473299456.0, + "grad_norm": 0.06479523616705579, + "language_loss": 0.88566583, + "learning_rate": 0.0009596728786833997, + "loss": 0.89667606, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.32885742, + "step": 805, + "time_per_iteration": 2.609287977218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110234, + "balance_loss_mlp": 1.06829393, + "epoch": 0.1550596383224317, + "flos": 1048118482944.0, + "grad_norm": 0.07111390229237131, + "language_loss": 0.89488924, + "learning_rate": 0.0009595502129506415, + "loss": 0.90591264, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.34082031, + "step": 806, + "time_per_iteration": 3.403404951095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096653, + "balance_loss_mlp": 1.0634892, + "epoch": 0.15525202000769528, + "flos": 613438050816.0, + "grad_norm": 0.08216570532607727, + "language_loss": 0.82236785, + "learning_rate": 0.0009594273688046678, + "loss": 0.83333433, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.33178711, + "step": 807, + "time_per_iteration": 2.7215962409973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093065, + "balance_loss_mlp": 1.05968678, + "epoch": 0.15544440169295884, + "flos": 532805810688.0, + "grad_norm": 0.06904253720821768, + "language_loss": 0.85279024, + "learning_rate": 0.000959304346293171, + "loss": 0.86372089, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.33398438, + "step": 808, + "time_per_iteration": 2.6801698207855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098465, + "balance_loss_mlp": 1.06661189, + "epoch": 0.1556367833782224, + "flos": 644433546240.0, + "grad_norm": 0.09111957868284204, + "language_loss": 0.87858826, + "learning_rate": 0.0009591811454639125, + "loss": 0.88957286, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.31835938, + "step": 809, + "time_per_iteration": 2.7565882205963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094758, + "balance_loss_mlp": 1.06187963, + "epoch": 0.15582916506348596, + "flos": 543540644352.0, + "grad_norm": 0.06649225570292959, + "language_loss": 0.87746191, + "learning_rate": 0.0009590577663647234, + "loss": 0.8884095, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.32885742, + "step": 810, + "time_per_iteration": 2.7346410751342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106143, + "balance_loss_mlp": 1.07233548, + "epoch": 0.15602154674874952, + "flos": 579740613120.0, + "grad_norm": 0.0619187082363415, + "language_loss": 0.85968214, + "learning_rate": 0.0009589342090435036, + "loss": 0.87074351, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.33837891, + "step": 811, + "time_per_iteration": 2.771869421005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114592, + "balance_loss_mlp": 1.08226287, + "epoch": 0.15621392843401308, + "flos": 534982454784.0, + "grad_norm": 0.07419416671079432, + "language_loss": 0.87060148, + "learning_rate": 0.0009588104735482223, + "loss": 0.88174742, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.32324219, + "step": 812, + "time_per_iteration": 2.6792666912078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122998, + "balance_loss_mlp": 1.09007227, + "epoch": 0.15640631011927664, + "flos": 550635162624.0, + "grad_norm": 0.08530784328603107, + "language_loss": 0.83981705, + "learning_rate": 0.0009586865599269177, + "loss": 0.85104704, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.3293457, + "step": 813, + "time_per_iteration": 2.6273813247680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122852, + "balance_loss_mlp": 1.09109521, + "epoch": 0.1565986918045402, + "flos": 637190641152.0, + "grad_norm": 0.09596754940168085, + "language_loss": 0.88191104, + "learning_rate": 0.0009585624682276977, + "loss": 0.8931396, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.31738281, + "step": 814, + "time_per_iteration": 2.7389183044433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114804, + "balance_loss_mlp": 1.08361948, + "epoch": 0.15679107348980378, + "flos": 490569122304.0, + "grad_norm": 0.07403121037751308, + "language_loss": 0.87196732, + "learning_rate": 0.0009584381984987386, + "loss": 0.88311541, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.31152344, + "step": 815, + "time_per_iteration": 2.588653802871704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118789, + "balance_loss_mlp": 1.0867933, + "epoch": 0.15698345517506734, + "flos": 529689231360.0, + "grad_norm": 0.05796420471157715, + "language_loss": 0.89563668, + "learning_rate": 0.0009583137507882864, + "loss": 0.90682459, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.31982422, + "step": 816, + "time_per_iteration": 2.6771223545074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120947, + "balance_loss_mlp": 1.08945227, + "epoch": 0.1571758368603309, + "flos": 545779897344.0, + "grad_norm": 0.06695321751464198, + "language_loss": 0.80875123, + "learning_rate": 0.000958189125144656, + "loss": 0.81996059, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.31469727, + "step": 817, + "time_per_iteration": 2.648407220840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142778, + "balance_loss_mlp": 1.11125922, + "epoch": 0.15736821854559446, + "flos": 565377048576.0, + "grad_norm": 0.07474790639920047, + "language_loss": 0.87800574, + "learning_rate": 0.0009580643216162313, + "loss": 0.8894335, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.31494141, + "step": 818, + "time_per_iteration": 2.663799285888672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140784, + "balance_loss_mlp": 1.10940814, + "epoch": 0.15756060023085802, + "flos": 500707436544.0, + "grad_norm": 0.10531827445817923, + "language_loss": 0.79636216, + "learning_rate": 0.0009579393402514652, + "loss": 0.80777001, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.31347656, + "step": 819, + "time_per_iteration": 2.5795977115631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128459, + "balance_loss_mlp": 1.09617746, + "epoch": 0.15775298191612158, + "flos": 519014034432.0, + "grad_norm": 0.06561760213255555, + "language_loss": 0.90222132, + "learning_rate": 0.0009578141810988801, + "loss": 0.91350597, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.32275391, + "step": 820, + "time_per_iteration": 2.6019015312194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120432, + "balance_loss_mlp": 1.08807814, + "epoch": 0.15794536360138514, + "flos": 465891153408.0, + "grad_norm": 0.07003821866302876, + "language_loss": 0.90498698, + "learning_rate": 0.0009576888442070668, + "loss": 0.91619134, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.32348633, + "step": 821, + "time_per_iteration": 2.5933666229248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109361, + "balance_loss_mlp": 1.07614923, + "epoch": 0.1581377452866487, + "flos": 516911583744.0, + "grad_norm": 0.06959801001512317, + "language_loss": 0.92461467, + "learning_rate": 0.0009575633296246854, + "loss": 0.93570817, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.33227539, + "step": 822, + "time_per_iteration": 2.584195375442505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104383, + "balance_loss_mlp": 1.07198191, + "epoch": 0.15833012697191226, + "flos": 549522109440.0, + "grad_norm": 0.0738821286657961, + "language_loss": 0.82797432, + "learning_rate": 0.0009574376374004652, + "loss": 0.83901811, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.32397461, + "step": 823, + "time_per_iteration": 2.6445696353912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099566, + "balance_loss_mlp": 1.0669024, + "epoch": 0.15852250865717585, + "flos": 487206641664.0, + "grad_norm": 0.07930768625104477, + "language_loss": 0.8015238, + "learning_rate": 0.000957311767583204, + "loss": 0.81251943, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.32666016, + "step": 824, + "time_per_iteration": 2.590190887451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284074, + "balance_loss_mlp": 1.26194882, + "epoch": 0.1587148903424394, + "flos": 1309041672192.0, + "grad_norm": 0.06857459467376774, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83355665, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.22167969, + "step": 825, + "time_per_iteration": 4.729644060134888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091191, + "balance_loss_mlp": 1.05766964, + "epoch": 0.15890727202770297, + "flos": 466634649600.0, + "grad_norm": 0.10530356830759573, + "language_loss": 0.91383988, + "learning_rate": 0.0009570594953650961, + "loss": 0.92475176, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.33544922, + "step": 826, + "time_per_iteration": 2.5222439765930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099421, + "balance_loss_mlp": 1.06580353, + "epoch": 0.15909965371296653, + "flos": 776733608448.0, + "grad_norm": 0.07312615216486826, + "language_loss": 0.80215907, + "learning_rate": 0.00095693309306219, + "loss": 0.81315327, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.33642578, + "step": 827, + "time_per_iteration": 3.104602098464966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091547, + "balance_loss_mlp": 1.0577873, + "epoch": 0.1592920353982301, + "flos": 1077852538368.0, + "grad_norm": 0.06629059991756085, + "language_loss": 0.87921345, + "learning_rate": 0.0009568065133621244, + "loss": 0.89012897, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.33789062, + "step": 828, + "time_per_iteration": 3.349937915802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088059, + "balance_loss_mlp": 1.05324984, + "epoch": 0.15948441708349365, + "flos": 725307305472.0, + "grad_norm": 0.06785059542129762, + "language_loss": 0.84638405, + "learning_rate": 0.0009566797563140422, + "loss": 0.85726464, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.34863281, + "step": 829, + "time_per_iteration": 2.883561849594116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096047, + "balance_loss_mlp": 1.06085658, + "epoch": 0.1596767987687572, + "flos": 578447087616.0, + "grad_norm": 0.06369088806732512, + "language_loss": 0.87693489, + "learning_rate": 0.0009565528219671547, + "loss": 0.88789535, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.35229492, + "step": 830, + "time_per_iteration": 2.929800510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098337, + "balance_loss_mlp": 1.06412435, + "epoch": 0.15986918045402077, + "flos": 528728947200.0, + "grad_norm": 0.06081537703934319, + "language_loss": 0.84958434, + "learning_rate": 0.0009564257103707418, + "loss": 0.86056769, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.3425293, + "step": 831, + "time_per_iteration": 2.631542444229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105374, + "balance_loss_mlp": 1.0715903, + "epoch": 0.16006156213928435, + "flos": 574313559552.0, + "grad_norm": 0.06950481232518824, + "language_loss": 0.91362834, + "learning_rate": 0.0009562984215741533, + "loss": 0.92468208, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.33789062, + "step": 832, + "time_per_iteration": 2.669194459915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093997, + "balance_loss_mlp": 1.05973649, + "epoch": 0.1602539438245479, + "flos": 515258675712.0, + "grad_norm": 0.06093058452920847, + "language_loss": 0.82276815, + "learning_rate": 0.0009561709556268065, + "loss": 0.83370817, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.34301758, + "step": 833, + "time_per_iteration": 2.747171401977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096504, + "balance_loss_mlp": 1.06298196, + "epoch": 0.16044632550981147, + "flos": 620730418176.0, + "grad_norm": 0.09598386402958035, + "language_loss": 0.93858409, + "learning_rate": 0.0009560433125781884, + "loss": 0.9495492, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.33544922, + "step": 834, + "time_per_iteration": 2.7381722927093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090718, + "balance_loss_mlp": 1.05645716, + "epoch": 0.16063870719507503, + "flos": 560817146880.0, + "grad_norm": 0.06748577773497036, + "language_loss": 0.92278147, + "learning_rate": 0.0009559154924778544, + "loss": 0.93368864, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.34301758, + "step": 835, + "time_per_iteration": 2.7790255546569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079826, + "balance_loss_mlp": 1.04625726, + "epoch": 0.1608310888803386, + "flos": 804778440192.0, + "grad_norm": 0.07378429569225692, + "language_loss": 0.85029173, + "learning_rate": 0.0009557874953754284, + "loss": 0.86109, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.33569336, + "step": 836, + "time_per_iteration": 3.0223195552825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082807, + "balance_loss_mlp": 1.04883218, + "epoch": 0.16102347056560215, + "flos": 600311195136.0, + "grad_norm": 0.08025480036652383, + "language_loss": 0.83386606, + "learning_rate": 0.0009556593213206038, + "loss": 0.84469414, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.34008789, + "step": 837, + "time_per_iteration": 2.7436904907226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086966, + "balance_loss_mlp": 1.05318165, + "epoch": 0.1612158522508657, + "flos": 553235208192.0, + "grad_norm": 0.0690426934286745, + "language_loss": 0.87355983, + "learning_rate": 0.0009555309703631414, + "loss": 0.88442945, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.33813477, + "step": 838, + "time_per_iteration": 2.6828954219818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097306, + "balance_loss_mlp": 1.06364167, + "epoch": 0.16140823393612927, + "flos": 555701423616.0, + "grad_norm": 0.07092577785176474, + "language_loss": 0.87526888, + "learning_rate": 0.0009554024425528722, + "loss": 0.88624191, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.33691406, + "step": 839, + "time_per_iteration": 2.6739652156829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110874, + "balance_loss_mlp": 1.07797241, + "epoch": 0.16160061562139286, + "flos": 543613427712.0, + "grad_norm": 0.09046955561085915, + "language_loss": 0.88719451, + "learning_rate": 0.0009552737379396948, + "loss": 0.89830327, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.32910156, + "step": 840, + "time_per_iteration": 2.63122820854187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110692, + "balance_loss_mlp": 1.07757533, + "epoch": 0.16179299730665642, + "flos": 603590717952.0, + "grad_norm": 0.06735134703819705, + "language_loss": 0.88063818, + "learning_rate": 0.0009551448565735767, + "loss": 0.89174509, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.33129883, + "step": 841, + "time_per_iteration": 2.741941452026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121097, + "balance_loss_mlp": 1.08790874, + "epoch": 0.16198537899191998, + "flos": 786821050368.0, + "grad_norm": 0.06426805463858033, + "language_loss": 0.84472924, + "learning_rate": 0.0009550157985045543, + "loss": 0.85594022, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.33203125, + "step": 842, + "time_per_iteration": 3.045841693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102839, + "balance_loss_mlp": 1.07041371, + "epoch": 0.16217776067718354, + "flos": 519550917120.0, + "grad_norm": 0.06545460719380305, + "language_loss": 0.89229876, + "learning_rate": 0.0009548865637827321, + "loss": 0.90332717, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.32421875, + "step": 843, + "time_per_iteration": 2.6820054054260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100372, + "balance_loss_mlp": 1.06701708, + "epoch": 0.1623701423624471, + "flos": 505015644672.0, + "grad_norm": 0.09211303705947127, + "language_loss": 0.89927554, + "learning_rate": 0.0009547571524582838, + "loss": 0.91027921, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.33374023, + "step": 844, + "time_per_iteration": 2.592280149459839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097994, + "balance_loss_mlp": 1.06587958, + "epoch": 0.16256252404771065, + "flos": 496940493312.0, + "grad_norm": 0.07125004392928289, + "language_loss": 0.91891497, + "learning_rate": 0.0009546275645814512, + "loss": 0.92989492, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.32104492, + "step": 845, + "time_per_iteration": 2.6273765563964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097571, + "balance_loss_mlp": 1.06531262, + "epoch": 0.16275490573297421, + "flos": 502110061056.0, + "grad_norm": 0.07293740056217544, + "language_loss": 0.89635444, + "learning_rate": 0.0009544978002025446, + "loss": 0.90733016, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.32250977, + "step": 846, + "time_per_iteration": 2.5906271934509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091646, + "balance_loss_mlp": 1.05821955, + "epoch": 0.16294728741823777, + "flos": 506952179712.0, + "grad_norm": 0.052168896342380144, + "language_loss": 0.86807543, + "learning_rate": 0.0009543678593719434, + "loss": 0.8789919, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.33447266, + "step": 847, + "time_per_iteration": 2.747469186782837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098148, + "balance_loss_mlp": 1.06510353, + "epoch": 0.16313966910350133, + "flos": 509418395136.0, + "grad_norm": 0.05056297173362441, + "language_loss": 0.87167078, + "learning_rate": 0.0009542377421400945, + "loss": 0.88265228, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.33056641, + "step": 848, + "time_per_iteration": 2.7777974605560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102364, + "balance_loss_mlp": 1.06950974, + "epoch": 0.16333205078876492, + "flos": 543712352256.0, + "grad_norm": 0.06627324615029867, + "language_loss": 0.83542728, + "learning_rate": 0.0009541074485575145, + "loss": 0.84645092, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.32861328, + "step": 849, + "time_per_iteration": 2.7575085163116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105099, + "balance_loss_mlp": 1.07288873, + "epoch": 0.16352443247402848, + "flos": 507477477888.0, + "grad_norm": 0.05751037996071174, + "language_loss": 0.9190414, + "learning_rate": 0.0009539769786747874, + "loss": 0.93009233, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.32202148, + "step": 850, + "time_per_iteration": 2.6389074325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109592, + "balance_loss_mlp": 1.06261301, + "epoch": 0.16371681415929204, + "flos": 541851420672.0, + "grad_norm": 0.07235435681682932, + "language_loss": 0.81106341, + "learning_rate": 0.0009538463325425665, + "loss": 0.82202262, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.33325195, + "step": 851, + "time_per_iteration": 2.7013468742370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109998, + "balance_loss_mlp": 1.06695926, + "epoch": 0.1639091958445556, + "flos": 520501026816.0, + "grad_norm": 0.07286475265539226, + "language_loss": 0.86075503, + "learning_rate": 0.0009537155102115728, + "loss": 0.87175477, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.33032227, + "step": 852, + "time_per_iteration": 2.5927765369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089138, + "balance_loss_mlp": 1.05668926, + "epoch": 0.16410157752981916, + "flos": 547149026304.0, + "grad_norm": 0.07079739805294577, + "language_loss": 0.83340597, + "learning_rate": 0.0009535845117325961, + "loss": 0.84429741, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.32446289, + "step": 853, + "time_per_iteration": 2.6400251388549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090734, + "balance_loss_mlp": 1.05780828, + "epoch": 0.16429395921508272, + "flos": 582561828864.0, + "grad_norm": 0.055390341552487656, + "language_loss": 0.93137228, + "learning_rate": 0.0009534533371564946, + "loss": 0.9422797, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.3293457, + "step": 854, + "time_per_iteration": 2.794569492340088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097604, + "balance_loss_mlp": 1.06424975, + "epoch": 0.16448634090034628, + "flos": 530678628864.0, + "grad_norm": 0.07789269087805807, + "language_loss": 0.88390946, + "learning_rate": 0.0009533219865341949, + "loss": 0.89488548, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.33374023, + "step": 855, + "time_per_iteration": 2.5882935523986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110169, + "balance_loss_mlp": 1.07721937, + "epoch": 0.16467872258560984, + "flos": 491623948800.0, + "grad_norm": 0.07176827599451206, + "language_loss": 0.85993397, + "learning_rate": 0.0009531904599166916, + "loss": 0.87103564, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.32958984, + "step": 856, + "time_per_iteration": 2.6384060382843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108589, + "balance_loss_mlp": 1.07585454, + "epoch": 0.16487110427087343, + "flos": 506015216640.0, + "grad_norm": 0.08966352124388614, + "language_loss": 0.84823519, + "learning_rate": 0.0009530587573550478, + "loss": 0.85932112, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.32739258, + "step": 857, + "time_per_iteration": 2.6009740829467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139199, + "balance_loss_mlp": 1.11554801, + "epoch": 0.16506348595613698, + "flos": 1432006553088.0, + "grad_norm": 0.0480168233011906, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75458586, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.23632812, + "step": 858, + "time_per_iteration": 5.006503105163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108022, + "balance_loss_mlp": 1.07712269, + "epoch": 0.16525586764140054, + "flos": 476890827264.0, + "grad_norm": 0.08332018813054971, + "language_loss": 0.89907712, + "learning_rate": 0.0009527948246039337, + "loss": 0.91015732, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.30859375, + "step": 859, + "time_per_iteration": 2.5502097606658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113676, + "balance_loss_mlp": 1.08313441, + "epoch": 0.1654482493266641, + "flos": 880737297408.0, + "grad_norm": 0.06488618871597049, + "language_loss": 0.87213862, + "learning_rate": 0.000952662594516931, + "loss": 0.88327539, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.30493164, + "step": 860, + "time_per_iteration": 3.091632604598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112348, + "balance_loss_mlp": 1.08059049, + "epoch": 0.16564063101192766, + "flos": 626527028736.0, + "grad_norm": 0.18119016536128274, + "language_loss": 0.86193782, + "learning_rate": 0.0009525301886907234, + "loss": 0.8730613, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.31738281, + "step": 861, + "time_per_iteration": 2.8586955070495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115106, + "balance_loss_mlp": 1.08372974, + "epoch": 0.16583301269719122, + "flos": 561250722816.0, + "grad_norm": 0.06494583254435107, + "language_loss": 0.87565315, + "learning_rate": 0.0009523976071767155, + "loss": 0.88680422, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.31347656, + "step": 862, + "time_per_iteration": 2.6474006175994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113562, + "balance_loss_mlp": 1.08228135, + "epoch": 0.16602539438245478, + "flos": 567510022656.0, + "grad_norm": 0.05844730537287504, + "language_loss": 0.87850058, + "learning_rate": 0.00095226485002638, + "loss": 0.88963622, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.3125, + "step": 863, + "time_per_iteration": 2.7738211154937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101227, + "balance_loss_mlp": 1.06894565, + "epoch": 0.16621777606771834, + "flos": 574589984256.0, + "grad_norm": 0.05720313452307963, + "language_loss": 0.88969022, + "learning_rate": 0.0009521319172912576, + "loss": 0.90070248, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.32275391, + "step": 864, + "time_per_iteration": 2.762932538986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108698, + "balance_loss_mlp": 1.07624936, + "epoch": 0.16641015775298193, + "flos": 514292599296.0, + "grad_norm": 0.0631928299213439, + "language_loss": 0.94547617, + "learning_rate": 0.0009519988090229579, + "loss": 0.95656317, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.32446289, + "step": 865, + "time_per_iteration": 2.672088384628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105777, + "balance_loss_mlp": 1.07332826, + "epoch": 0.1666025394382455, + "flos": 621395338752.0, + "grad_norm": 0.06928181027356142, + "language_loss": 0.87572587, + "learning_rate": 0.0009518655252731576, + "loss": 0.8867836, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.32446289, + "step": 866, + "time_per_iteration": 2.754418134689331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104109, + "balance_loss_mlp": 1.07049167, + "epoch": 0.16679492112350905, + "flos": 548528329728.0, + "grad_norm": 0.059497633162238536, + "language_loss": 0.90014684, + "learning_rate": 0.0009517320660936022, + "loss": 0.91118789, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.33642578, + "step": 867, + "time_per_iteration": 2.73579478263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103888, + "balance_loss_mlp": 1.07117677, + "epoch": 0.1669873028087726, + "flos": 665379477504.0, + "grad_norm": 0.06138762269806642, + "language_loss": 0.82812411, + "learning_rate": 0.0009515984315361051, + "loss": 0.83916301, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.32714844, + "step": 868, + "time_per_iteration": 2.7929019927978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102635, + "balance_loss_mlp": 1.07016206, + "epoch": 0.16717968449403617, + "flos": 538305647616.0, + "grad_norm": 0.07711570113555911, + "language_loss": 0.8657794, + "learning_rate": 0.000951464621652548, + "loss": 0.87680572, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.32470703, + "step": 869, + "time_per_iteration": 2.6135518550872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105976, + "balance_loss_mlp": 1.07381344, + "epoch": 0.16737206617929973, + "flos": 529833235968.0, + "grad_norm": 0.07032317085354448, + "language_loss": 0.78791183, + "learning_rate": 0.0009513306364948804, + "loss": 0.79897159, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.3215332, + "step": 870, + "time_per_iteration": 2.7745420932769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101249, + "balance_loss_mlp": 1.06949186, + "epoch": 0.1675644478645633, + "flos": 480529732608.0, + "grad_norm": 0.0706094790942469, + "language_loss": 0.88557035, + "learning_rate": 0.0009511964761151197, + "loss": 0.89658284, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.31738281, + "step": 871, + "time_per_iteration": 2.570082902908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112728, + "balance_loss_mlp": 1.08147156, + "epoch": 0.16775682954982685, + "flos": 494311334400.0, + "grad_norm": 0.06741449701936619, + "language_loss": 0.90011156, + "learning_rate": 0.0009510621405653521, + "loss": 0.91123885, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.31225586, + "step": 872, + "time_per_iteration": 2.5378525257110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098432, + "balance_loss_mlp": 1.06860542, + "epoch": 0.1679492112350904, + "flos": 751694846976.0, + "grad_norm": 0.07031527693840728, + "language_loss": 0.8401826, + "learning_rate": 0.0009509276298977309, + "loss": 0.85116696, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.29760742, + "step": 873, + "time_per_iteration": 2.9614696502685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102853, + "balance_loss_mlp": 1.07121444, + "epoch": 0.168141592920354, + "flos": 1135413075456.0, + "grad_norm": 0.07037881289732177, + "language_loss": 0.8146044, + "learning_rate": 0.0009507929441644778, + "loss": 0.82563293, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.31616211, + "step": 874, + "time_per_iteration": 3.5029537677764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105025, + "balance_loss_mlp": 1.07403064, + "epoch": 0.16833397460561755, + "flos": 632114205696.0, + "grad_norm": 0.07204378854359271, + "language_loss": 0.8568964, + "learning_rate": 0.0009506580834178826, + "loss": 0.86794662, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.30957031, + "step": 875, + "time_per_iteration": 2.738445281982422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105989, + "balance_loss_mlp": 1.07420754, + "epoch": 0.1685263562908811, + "flos": 541171943424.0, + "grad_norm": 0.06279104396907492, + "language_loss": 0.91300583, + "learning_rate": 0.0009505230477103028, + "loss": 0.92406577, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.31762695, + "step": 876, + "time_per_iteration": 2.7304844856262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121558, + "balance_loss_mlp": 1.0900147, + "epoch": 0.16871873797614467, + "flos": 619036812288.0, + "grad_norm": 0.07749651336428325, + "language_loss": 0.81126654, + "learning_rate": 0.0009503878370941641, + "loss": 0.82248211, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.31518555, + "step": 877, + "time_per_iteration": 2.7332048416137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121651, + "balance_loss_mlp": 1.09063232, + "epoch": 0.16891111966140823, + "flos": 606067107840.0, + "grad_norm": 0.08158970109830238, + "language_loss": 0.88660848, + "learning_rate": 0.0009502524516219595, + "loss": 0.897825, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.30981445, + "step": 878, + "time_per_iteration": 2.810194730758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120277, + "balance_loss_mlp": 1.08942604, + "epoch": 0.1691035013466718, + "flos": 552058136064.0, + "grad_norm": 0.08439254905993104, + "language_loss": 0.89592326, + "learning_rate": 0.0009501168913462506, + "loss": 0.90712607, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.30810547, + "step": 879, + "time_per_iteration": 2.6550278663635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181395, + "balance_loss_mlp": 1.15822113, + "epoch": 0.16929588303193535, + "flos": 1475544121344.0, + "grad_norm": 0.05511344701971209, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80303323, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.23144531, + "step": 880, + "time_per_iteration": 4.798918962478638 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120202, + "balance_loss_mlp": 1.08894515, + "epoch": 0.1694882647171989, + "flos": 925850456064.0, + "grad_norm": 0.05479331137197536, + "language_loss": 0.85038209, + "learning_rate": 0.0009498452465949042, + "loss": 0.86158419, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.31225586, + "step": 881, + "time_per_iteration": 3.2795042991638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114024, + "balance_loss_mlp": 1.08295763, + "epoch": 0.1696806464024625, + "flos": 545829359616.0, + "grad_norm": 0.06005284109203957, + "language_loss": 0.91010857, + "learning_rate": 0.0009497091622247285, + "loss": 0.92124879, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.31030273, + "step": 882, + "time_per_iteration": 2.741497755050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114536, + "balance_loss_mlp": 1.0833751, + "epoch": 0.16987302808772606, + "flos": 528970466304.0, + "grad_norm": 0.08668021784836823, + "language_loss": 0.9325586, + "learning_rate": 0.0009495729032619723, + "loss": 0.94370389, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.3112793, + "step": 883, + "time_per_iteration": 2.6621923446655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102424, + "balance_loss_mlp": 1.07035685, + "epoch": 0.17006540977298962, + "flos": 754855096320.0, + "grad_norm": 0.06301404020698688, + "language_loss": 0.84119958, + "learning_rate": 0.0009494364697595354, + "loss": 0.85222387, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.32055664, + "step": 884, + "time_per_iteration": 2.8904953002929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102364, + "balance_loss_mlp": 1.07022548, + "epoch": 0.17025779145825318, + "flos": 558532813824.0, + "grad_norm": 0.06367673921209963, + "language_loss": 0.89062482, + "learning_rate": 0.0009492998617703867, + "loss": 0.9016484, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.32128906, + "step": 885, + "time_per_iteration": 2.65053129196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088714, + "balance_loss_mlp": 1.05779076, + "epoch": 0.17045017314351674, + "flos": 511963186176.0, + "grad_norm": 0.06771442044112419, + "language_loss": 0.87296236, + "learning_rate": 0.0009491630793475619, + "loss": 0.88384956, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.30908203, + "step": 886, + "time_per_iteration": 2.601238965988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095605, + "balance_loss_mlp": 1.06346607, + "epoch": 0.1706425548287803, + "flos": 508674898944.0, + "grad_norm": 0.064396115452368, + "language_loss": 0.85120332, + "learning_rate": 0.0009490261225441643, + "loss": 0.86215937, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.32128906, + "step": 887, + "time_per_iteration": 2.865694999694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089924, + "balance_loss_mlp": 1.05797613, + "epoch": 0.17083493651404386, + "flos": 717016776192.0, + "grad_norm": 0.06834327453619109, + "language_loss": 0.90091348, + "learning_rate": 0.0009488889914133656, + "loss": 0.91181278, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.31933594, + "step": 888, + "time_per_iteration": 3.0129144191741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092844, + "balance_loss_mlp": 1.06077635, + "epoch": 0.17102731819930742, + "flos": 558852908544.0, + "grad_norm": 0.06591248507341309, + "language_loss": 0.88667148, + "learning_rate": 0.0009487516860084047, + "loss": 0.89759994, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.32055664, + "step": 889, + "time_per_iteration": 2.738736867904663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087561, + "balance_loss_mlp": 1.05644727, + "epoch": 0.17121969988457098, + "flos": 494542679040.0, + "grad_norm": 0.07350534216298948, + "language_loss": 0.88845301, + "learning_rate": 0.0009486142063825884, + "loss": 0.89932865, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.31079102, + "step": 890, + "time_per_iteration": 2.5697011947631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117181, + "balance_loss_mlp": 1.15197396, + "epoch": 0.17141208156983456, + "flos": 1548088063488.0, + "grad_norm": 0.0550236747402086, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73598027, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.19824219, + "step": 891, + "time_per_iteration": 4.955617904663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092709, + "balance_loss_mlp": 1.06119013, + "epoch": 0.17160446325509812, + "flos": 619282713600.0, + "grad_norm": 0.06911805131577382, + "language_loss": 0.9061746, + "learning_rate": 0.0009483387246819542, + "loss": 0.91710162, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.31494141, + "step": 892, + "time_per_iteration": 2.725799798965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121181, + "balance_loss_mlp": 1.10153532, + "epoch": 0.17179684494036168, + "flos": 1381026972672.0, + "grad_norm": 0.032113973586073014, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83406758, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.19628906, + "step": 893, + "time_per_iteration": 4.664165735244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089705, + "balance_loss_mlp": 1.05813849, + "epoch": 0.17198922662562524, + "flos": 492386383872.0, + "grad_norm": 0.0574582553480054, + "language_loss": 0.89272118, + "learning_rate": 0.0009480625467392688, + "loss": 0.90361822, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.31542969, + "step": 894, + "time_per_iteration": 2.637554883956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109509, + "balance_loss_mlp": 1.08910024, + "epoch": 0.1721816083108888, + "flos": 1457529914880.0, + "grad_norm": 0.027611634873128267, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79104185, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.20410156, + "step": 895, + "time_per_iteration": 4.76848030090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088509, + "balance_loss_mlp": 1.05822968, + "epoch": 0.17237398999615236, + "flos": 527853030912.0, + "grad_norm": 0.05350045539937067, + "language_loss": 0.87532026, + "learning_rate": 0.0009477856729834196, + "loss": 0.88620532, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.30249023, + "step": 896, + "time_per_iteration": 2.7219061851501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093646, + "balance_loss_mlp": 1.06267512, + "epoch": 0.17256637168141592, + "flos": 603644562432.0, + "grad_norm": 0.06021872133739316, + "language_loss": 0.89942896, + "learning_rate": 0.0009476469753098809, + "loss": 0.9103654, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.30932617, + "step": 897, + "time_per_iteration": 2.6990017890930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109486, + "balance_loss_mlp": 1.06398487, + "epoch": 0.17275875336667948, + "flos": 509437334016.0, + "grad_norm": 0.072864012804074, + "language_loss": 0.86893761, + "learning_rate": 0.0009475081038443738, + "loss": 0.87988615, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.30834961, + "step": 898, + "time_per_iteration": 2.5972931385040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091914, + "balance_loss_mlp": 1.06030011, + "epoch": 0.17295113505194307, + "flos": 664951693824.0, + "grad_norm": 0.07073516416365672, + "language_loss": 0.85445154, + "learning_rate": 0.0009473690586408124, + "loss": 0.86537069, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.31591797, + "step": 899, + "time_per_iteration": 2.821336507797241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085642, + "balance_loss_mlp": 1.05421829, + "epoch": 0.17314351673720663, + "flos": 555125253120.0, + "grad_norm": 0.061416888012907525, + "language_loss": 0.86083823, + "learning_rate": 0.0009472298397531792, + "loss": 0.87169468, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.31396484, + "step": 900, + "time_per_iteration": 2.7345612049102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090461, + "balance_loss_mlp": 1.058918, + "epoch": 0.17333589842247019, + "flos": 503361326592.0, + "grad_norm": 0.060849230911096945, + "language_loss": 0.86217213, + "learning_rate": 0.0009470904472355235, + "loss": 0.87307668, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.31518555, + "step": 901, + "time_per_iteration": 2.637425661087036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089284, + "balance_loss_mlp": 1.05755067, + "epoch": 0.17352828010773375, + "flos": 555924003840.0, + "grad_norm": 0.07830588235472731, + "language_loss": 0.79847336, + "learning_rate": 0.0009469508811419626, + "loss": 0.80936623, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.31713867, + "step": 902, + "time_per_iteration": 2.70833683013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149006, + "balance_loss_mlp": 1.12678576, + "epoch": 0.1737206617929973, + "flos": 1553711556096.0, + "grad_norm": 0.05917050619752012, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72762835, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.22265625, + "step": 903, + "time_per_iteration": 4.776138782501221 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088102, + "balance_loss_mlp": 1.05562961, + "epoch": 0.17391304347826086, + "flos": 516390667776.0, + "grad_norm": 0.07262085456902109, + "language_loss": 0.83503735, + "learning_rate": 0.0009466712284439292, + "loss": 0.84591836, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.32470703, + "step": 904, + "time_per_iteration": 2.7372140884399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086515, + "balance_loss_mlp": 1.05385172, + "epoch": 0.17410542516352442, + "flos": 540773273088.0, + "grad_norm": 0.09192064511302059, + "language_loss": 0.88356638, + "learning_rate": 0.0009465311419480276, + "loss": 0.89443153, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.32666016, + "step": 905, + "time_per_iteration": 2.677032470703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109277, + "balance_loss_mlp": 1.06036901, + "epoch": 0.17429780684878798, + "flos": 623542869504.0, + "grad_norm": 0.07898220644020008, + "language_loss": 0.88434756, + "learning_rate": 0.0009463908820933622, + "loss": 0.89527524, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.32397461, + "step": 906, + "time_per_iteration": 2.8139841556549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097047, + "balance_loss_mlp": 1.06505144, + "epoch": 0.17449018853405157, + "flos": 575368386048.0, + "grad_norm": 0.0868003192310251, + "language_loss": 0.82122958, + "learning_rate": 0.0009462504489343868, + "loss": 0.83220005, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.31982422, + "step": 907, + "time_per_iteration": 2.8445968627929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103323, + "balance_loss_mlp": 1.07106495, + "epoch": 0.17468257021931513, + "flos": 533499844608.0, + "grad_norm": 0.09920963499058721, + "language_loss": 0.88653374, + "learning_rate": 0.0009461098425256222, + "loss": 0.89756691, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.32250977, + "step": 908, + "time_per_iteration": 2.5947494506835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109512, + "balance_loss_mlp": 1.07784963, + "epoch": 0.1748749519045787, + "flos": 540496848384.0, + "grad_norm": 0.09355765751058653, + "language_loss": 0.86340624, + "learning_rate": 0.0009459690629216567, + "loss": 0.87450135, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.31640625, + "step": 909, + "time_per_iteration": 2.621044874191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112453, + "balance_loss_mlp": 1.08155417, + "epoch": 0.17506733358984225, + "flos": 498373641216.0, + "grad_norm": 0.07034154505215827, + "language_loss": 0.8701601, + "learning_rate": 0.0009458281101771457, + "loss": 0.88128459, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.30859375, + "step": 910, + "time_per_iteration": 2.674091100692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115198, + "balance_loss_mlp": 1.08508539, + "epoch": 0.1752597152751058, + "flos": 622621873152.0, + "grad_norm": 0.09036058743894539, + "language_loss": 0.82642829, + "learning_rate": 0.0009456869843468122, + "loss": 0.83758032, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.30053711, + "step": 911, + "time_per_iteration": 2.830397605895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105257, + "balance_loss_mlp": 1.07378554, + "epoch": 0.17545209696036937, + "flos": 520717814784.0, + "grad_norm": 0.0879185530474863, + "language_loss": 0.78465313, + "learning_rate": 0.0009455456854854459, + "loss": 0.79570568, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.31445312, + "step": 912, + "time_per_iteration": 2.621293067932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102771, + "balance_loss_mlp": 1.07196748, + "epoch": 0.17564447864563293, + "flos": 461750270976.0, + "grad_norm": 0.0647038307980506, + "language_loss": 0.8401655, + "learning_rate": 0.0009454042136479039, + "loss": 0.85119313, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.30786133, + "step": 913, + "time_per_iteration": 2.5675978660583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095649, + "balance_loss_mlp": 1.0655843, + "epoch": 0.1758368603308965, + "flos": 480416251392.0, + "grad_norm": 0.06520052548040499, + "language_loss": 0.82717437, + "learning_rate": 0.0009452625688891103, + "loss": 0.83813089, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.30004883, + "step": 914, + "time_per_iteration": 2.5443828105926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156407, + "balance_loss_mlp": 1.13332844, + "epoch": 0.17602924201616005, + "flos": 1478160133632.0, + "grad_norm": 0.06121421634548094, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79891145, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.23046875, + "step": 915, + "time_per_iteration": 4.5826005935668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117447, + "balance_loss_mlp": 1.08642912, + "epoch": 0.17622162370142364, + "flos": 602010593280.0, + "grad_norm": 0.07309570223890104, + "language_loss": 0.93135887, + "learning_rate": 0.0009449787608278015, + "loss": 0.94253331, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.30981445, + "step": 916, + "time_per_iteration": 2.7787418365478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120355, + "balance_loss_mlp": 1.08926511, + "epoch": 0.1764140053866872, + "flos": 442473214464.0, + "grad_norm": 0.10226900865330964, + "language_loss": 0.92397296, + "learning_rate": 0.0009448365976354704, + "loss": 0.93517655, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.31054688, + "step": 917, + "time_per_iteration": 2.5531399250030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124705, + "balance_loss_mlp": 1.09247112, + "epoch": 0.17660638707195075, + "flos": 500362610688.0, + "grad_norm": 0.07454694115091837, + "language_loss": 0.89785659, + "learning_rate": 0.0009446942617422558, + "loss": 0.90910363, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.32226562, + "step": 918, + "time_per_iteration": 2.583489418029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123972, + "balance_loss_mlp": 1.09250093, + "epoch": 0.17679876875721431, + "flos": 538621360128.0, + "grad_norm": 0.06638545773718021, + "language_loss": 0.85658622, + "learning_rate": 0.0009445517532034176, + "loss": 0.86782598, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.31445312, + "step": 919, + "time_per_iteration": 2.6830263137817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122334, + "balance_loss_mlp": 1.09107733, + "epoch": 0.17699115044247787, + "flos": 497477376000.0, + "grad_norm": 0.09547651267352689, + "language_loss": 0.88907313, + "learning_rate": 0.0009444090720742824, + "loss": 0.90029645, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.31225586, + "step": 920, + "time_per_iteration": 2.5984437465667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123289, + "balance_loss_mlp": 1.09181738, + "epoch": 0.17718353212774143, + "flos": 662444780544.0, + "grad_norm": 0.10483808909193337, + "language_loss": 0.87128365, + "learning_rate": 0.0009442662184102439, + "loss": 0.8825165, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.31445312, + "step": 921, + "time_per_iteration": 2.772568941116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097725, + "balance_loss_mlp": 1.06737399, + "epoch": 0.177375913813005, + "flos": 582340658688.0, + "grad_norm": 0.057071439682559955, + "language_loss": 0.87210095, + "learning_rate": 0.000944123192266763, + "loss": 0.88307822, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.30297852, + "step": 922, + "time_per_iteration": 2.8091742992401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122306, + "balance_loss_mlp": 1.09004784, + "epoch": 0.17756829549826855, + "flos": 552285098496.0, + "grad_norm": 0.07267069192247201, + "language_loss": 0.83557594, + "learning_rate": 0.0009439799936993671, + "loss": 0.84679902, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.32250977, + "step": 923, + "time_per_iteration": 2.7226145267486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147891, + "balance_loss_mlp": 1.11494136, + "epoch": 0.17776067718353214, + "flos": 556060806144.0, + "grad_norm": 0.14883746036090706, + "language_loss": 0.88219315, + "learning_rate": 0.0009438366227636511, + "loss": 0.89367205, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.32958984, + "step": 924, + "time_per_iteration": 2.6409950256347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121758, + "balance_loss_mlp": 1.08950043, + "epoch": 0.1779530588687957, + "flos": 658161303552.0, + "grad_norm": 0.07347120708699749, + "language_loss": 0.85914218, + "learning_rate": 0.0009436930795152763, + "loss": 0.87035978, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.32250977, + "step": 925, + "time_per_iteration": 2.811595916748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107553, + "balance_loss_mlp": 1.07419825, + "epoch": 0.17814544055405926, + "flos": 644187644928.0, + "grad_norm": 0.07224950530739313, + "language_loss": 0.86246336, + "learning_rate": 0.0009435493640099713, + "loss": 0.87353885, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.33374023, + "step": 926, + "time_per_iteration": 2.775090456008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098588, + "balance_loss_mlp": 1.06513751, + "epoch": 0.17833782223932282, + "flos": 460672123392.0, + "grad_norm": 0.06608942550370576, + "language_loss": 0.83981788, + "learning_rate": 0.0009434054763035314, + "loss": 0.85080379, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.3347168, + "step": 927, + "time_per_iteration": 2.621683359146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089168, + "balance_loss_mlp": 1.05559874, + "epoch": 0.17853020392458638, + "flos": 759212766720.0, + "grad_norm": 0.06566794669431841, + "language_loss": 0.85671836, + "learning_rate": 0.0009432614164518185, + "loss": 0.86761004, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.33569336, + "step": 928, + "time_per_iteration": 3.011759042739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108639, + "balance_loss_mlp": 1.05320191, + "epoch": 0.17872258560984994, + "flos": 782320785408.0, + "grad_norm": 0.06622036101375141, + "language_loss": 0.84125841, + "learning_rate": 0.000943117184510762, + "loss": 0.85212231, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.33203125, + "step": 929, + "time_per_iteration": 2.9782960414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166041, + "balance_loss_mlp": 1.14010072, + "epoch": 0.1789149672951135, + "flos": 1459095482880.0, + "grad_norm": 0.044814265222739694, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79956007, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.25976562, + "step": 930, + "time_per_iteration": 5.011061906814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086461, + "balance_loss_mlp": 1.0529635, + "epoch": 0.17910734898037706, + "flos": 503598463488.0, + "grad_norm": 0.09835801245739735, + "language_loss": 0.88482547, + "learning_rate": 0.0009428282045846674, + "loss": 0.89569014, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.33520508, + "step": 931, + "time_per_iteration": 2.700901508331299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084257, + "balance_loss_mlp": 1.04899526, + "epoch": 0.17929973066564064, + "flos": 745895264256.0, + "grad_norm": 0.0790312068568768, + "language_loss": 0.88828444, + "learning_rate": 0.0009426834567118214, + "loss": 0.89912701, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.3527832, + "step": 932, + "time_per_iteration": 3.0847127437591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108852, + "balance_loss_mlp": 1.05557072, + "epoch": 0.1794921123509042, + "flos": 712875893760.0, + "grad_norm": 0.05851377965258845, + "language_loss": 0.80669105, + "learning_rate": 0.0009425385369740155, + "loss": 0.81757629, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.32958984, + "step": 933, + "time_per_iteration": 3.0405056476593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089624, + "balance_loss_mlp": 1.05517268, + "epoch": 0.17968449403616776, + "flos": 632838763008.0, + "grad_norm": 0.08098153489662575, + "language_loss": 0.86808264, + "learning_rate": 0.0009423934454275125, + "loss": 0.87897891, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.34472656, + "step": 934, + "time_per_iteration": 2.832589626312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090759, + "balance_loss_mlp": 1.05683184, + "epoch": 0.17987687572143132, + "flos": 536060602368.0, + "grad_norm": 0.0889712704970151, + "language_loss": 0.91607213, + "learning_rate": 0.0009422481821286418, + "loss": 0.92697972, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.33935547, + "step": 935, + "time_per_iteration": 2.739004611968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099753, + "balance_loss_mlp": 1.06589735, + "epoch": 0.18006925740669488, + "flos": 537818227200.0, + "grad_norm": 0.11621731552094582, + "language_loss": 0.87764728, + "learning_rate": 0.0009421027471337998, + "loss": 0.88864481, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.33886719, + "step": 936, + "time_per_iteration": 2.663978099822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093554, + "balance_loss_mlp": 1.06100953, + "epoch": 0.18026163909195844, + "flos": 539255757312.0, + "grad_norm": 0.08193839025260119, + "language_loss": 0.8197844, + "learning_rate": 0.0009419571404994493, + "loss": 0.83071995, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.32543945, + "step": 937, + "time_per_iteration": 2.680880308151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087323, + "balance_loss_mlp": 1.05427766, + "epoch": 0.180454020777222, + "flos": 500382959616.0, + "grad_norm": 0.08083617156557357, + "language_loss": 0.90250957, + "learning_rate": 0.00094181136228212, + "loss": 0.91338283, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.33056641, + "step": 938, + "time_per_iteration": 2.635734796524048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083703, + "balance_loss_mlp": 1.05134988, + "epoch": 0.18064640246248556, + "flos": 498689353728.0, + "grad_norm": 0.0738614516115471, + "language_loss": 0.85650909, + "learning_rate": 0.0009416654125384077, + "loss": 0.86734617, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.32348633, + "step": 939, + "time_per_iteration": 2.713120460510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092435, + "balance_loss_mlp": 1.06744874, + "epoch": 0.18083878414774912, + "flos": 1518572358144.0, + "grad_norm": 0.04310930319536216, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80864811, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.25, + "step": 940, + "time_per_iteration": 4.928712606430054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084719, + "balance_loss_mlp": 1.05372477, + "epoch": 0.1810311658330127, + "flos": 727006703616.0, + "grad_norm": 0.06379600043785322, + "language_loss": 0.83724225, + "learning_rate": 0.000941372998698552, + "loss": 0.84808946, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.30957031, + "step": 941, + "time_per_iteration": 2.9594616889953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092193, + "balance_loss_mlp": 1.0600785, + "epoch": 0.18122354751827627, + "flos": 564643726848.0, + "grad_norm": 0.07993905082854055, + "language_loss": 0.81844771, + "learning_rate": 0.0009412265347159336, + "loss": 0.82936954, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.32104492, + "step": 942, + "time_per_iteration": 2.705883741378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089966, + "balance_loss_mlp": 1.05847049, + "epoch": 0.18141592920353983, + "flos": 519024208896.0, + "grad_norm": 0.08204750484488939, + "language_loss": 0.84816301, + "learning_rate": 0.0009410798994339829, + "loss": 0.85906267, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.31469727, + "step": 943, + "time_per_iteration": 2.606898546218872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086355, + "balance_loss_mlp": 1.0538584, + "epoch": 0.1816083108888034, + "flos": 512219261952.0, + "grad_norm": 0.06564936273566103, + "language_loss": 0.88176167, + "learning_rate": 0.000940933092909628, + "loss": 0.89262521, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.32495117, + "step": 944, + "time_per_iteration": 2.568862199783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089679, + "balance_loss_mlp": 1.058375, + "epoch": 0.18180069257406695, + "flos": 492144864768.0, + "grad_norm": 0.06967818448900699, + "language_loss": 0.83546078, + "learning_rate": 0.0009407861151998649, + "loss": 0.84635758, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.31274414, + "step": 945, + "time_per_iteration": 2.5752463340759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108574, + "balance_loss_mlp": 1.05338621, + "epoch": 0.1819930742593305, + "flos": 569891870208.0, + "grad_norm": 0.07045774982796042, + "language_loss": 0.86168265, + "learning_rate": 0.0009406389663617552, + "loss": 0.87254012, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.32348633, + "step": 946, + "time_per_iteration": 2.6582529544830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084969, + "balance_loss_mlp": 1.05299747, + "epoch": 0.18218545594459407, + "flos": 605693168640.0, + "grad_norm": 0.08074656744529311, + "language_loss": 0.8540619, + "learning_rate": 0.000940491646452427, + "loss": 0.86491156, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.31958008, + "step": 947, + "time_per_iteration": 2.7117488384246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080922, + "balance_loss_mlp": 1.04914129, + "epoch": 0.18237783762985763, + "flos": 548419230720.0, + "grad_norm": 0.0614528539730692, + "language_loss": 0.90478814, + "learning_rate": 0.000940344155529075, + "loss": 0.91559744, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.31762695, + "step": 948, + "time_per_iteration": 2.675457239151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086239, + "balance_loss_mlp": 1.05472016, + "epoch": 0.1825702193151212, + "flos": 450509078016.0, + "grad_norm": 0.06480396750006864, + "language_loss": 0.8689037, + "learning_rate": 0.0009401964936489605, + "loss": 0.87976611, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.31494141, + "step": 949, + "time_per_iteration": 2.5517518520355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086362, + "balance_loss_mlp": 1.05446136, + "epoch": 0.18276260100038477, + "flos": 588962313216.0, + "grad_norm": 0.07386346522147075, + "language_loss": 0.84915626, + "learning_rate": 0.0009400486608694108, + "loss": 0.86001992, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.31884766, + "step": 950, + "time_per_iteration": 2.744371175765991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089338, + "balance_loss_mlp": 1.05769992, + "epoch": 0.18295498268564833, + "flos": 786988376064.0, + "grad_norm": 0.07193745080732644, + "language_loss": 0.86961377, + "learning_rate": 0.0009399006572478195, + "loss": 0.88050711, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.31616211, + "step": 951, + "time_per_iteration": 3.0956904888153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108625, + "balance_loss_mlp": 1.05427814, + "epoch": 0.1831473643709119, + "flos": 577878271488.0, + "grad_norm": 0.06892976413128309, + "language_loss": 0.90901303, + "learning_rate": 0.0009397524828416468, + "loss": 0.9198755, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.31958008, + "step": 952, + "time_per_iteration": 2.7130446434020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093503, + "balance_loss_mlp": 1.06179333, + "epoch": 0.18333974605617545, + "flos": 566622521856.0, + "grad_norm": 0.06752223069443862, + "language_loss": 0.96249408, + "learning_rate": 0.0009396041377084192, + "loss": 0.97342908, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.31689453, + "step": 953, + "time_per_iteration": 2.66972279548645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101927, + "balance_loss_mlp": 1.07043195, + "epoch": 0.183532127741439, + "flos": 526725421056.0, + "grad_norm": 0.07502219242723109, + "language_loss": 0.87290752, + "learning_rate": 0.0009394556219057295, + "loss": 0.88392681, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.31469727, + "step": 954, + "time_per_iteration": 2.659264326095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109814, + "balance_loss_mlp": 1.07810426, + "epoch": 0.18372450942670257, + "flos": 594259918848.0, + "grad_norm": 0.08651848853121004, + "language_loss": 0.8329587, + "learning_rate": 0.0009393069354912362, + "loss": 0.84405684, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.31689453, + "step": 955, + "time_per_iteration": 2.77437686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111165, + "balance_loss_mlp": 1.080266, + "epoch": 0.18391689111196613, + "flos": 644720145408.0, + "grad_norm": 0.07817657388257933, + "language_loss": 0.82119787, + "learning_rate": 0.0009391580785226649, + "loss": 0.83230954, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.30859375, + "step": 956, + "time_per_iteration": 2.867492437362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094598, + "balance_loss_mlp": 1.06903911, + "epoch": 0.18410927279722972, + "flos": 1456246563840.0, + "grad_norm": 0.05003344342080426, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.8043505, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.25585938, + "step": 957, + "time_per_iteration": 4.762399196624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108233, + "balance_loss_mlp": 1.07757246, + "epoch": 0.18430165448249328, + "flos": 658437728256.0, + "grad_norm": 0.06311489935861506, + "language_loss": 0.86409998, + "learning_rate": 0.0009388598531545196, + "loss": 0.87518233, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.30615234, + "step": 958, + "time_per_iteration": 2.8768551349639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102291, + "balance_loss_mlp": 1.07160664, + "epoch": 0.18449403616775684, + "flos": 517679811072.0, + "grad_norm": 0.07254101069499316, + "language_loss": 0.85046387, + "learning_rate": 0.000938710484870727, + "loss": 0.86148679, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.30639648, + "step": 959, + "time_per_iteration": 2.569592237472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123147, + "balance_loss_mlp": 1.09262919, + "epoch": 0.1846864178530204, + "flos": 552481537536.0, + "grad_norm": 0.07612110690317586, + "language_loss": 0.85695219, + "learning_rate": 0.0009385609462644189, + "loss": 0.86818361, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.3046875, + "step": 960, + "time_per_iteration": 2.6880924701690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127108, + "balance_loss_mlp": 1.09682918, + "epoch": 0.18487879953828396, + "flos": 465930441216.0, + "grad_norm": 0.08874671943740564, + "language_loss": 0.85532272, + "learning_rate": 0.0009384112373936514, + "loss": 0.86659384, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.30249023, + "step": 961, + "time_per_iteration": 2.6328110694885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117939, + "balance_loss_mlp": 1.08651531, + "epoch": 0.18507118122354752, + "flos": 648200489472.0, + "grad_norm": 0.0643111022382676, + "language_loss": 0.91187119, + "learning_rate": 0.0009382613583165467, + "loss": 0.92305064, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.31396484, + "step": 962, + "time_per_iteration": 2.7885348796844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116435, + "balance_loss_mlp": 1.08522642, + "epoch": 0.18526356290881107, + "flos": 626486330880.0, + "grad_norm": 0.08357757161984174, + "language_loss": 0.89136612, + "learning_rate": 0.0009381113090912928, + "loss": 0.90253055, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.31176758, + "step": 963, + "time_per_iteration": 2.7291858196258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109665, + "balance_loss_mlp": 1.07812214, + "epoch": 0.18545594459407463, + "flos": 432497843712.0, + "grad_norm": 0.08435952646587867, + "language_loss": 0.89444733, + "learning_rate": 0.000937961089776144, + "loss": 0.90554392, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.31518555, + "step": 964, + "time_per_iteration": 2.5736470222473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102378, + "balance_loss_mlp": 1.07090628, + "epoch": 0.1856483262793382, + "flos": 748720862208.0, + "grad_norm": 0.0989838613647617, + "language_loss": 0.82349026, + "learning_rate": 0.0009378107004294208, + "loss": 0.83451402, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.31445312, + "step": 965, + "time_per_iteration": 2.980569362640381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112627, + "balance_loss_mlp": 1.07982063, + "epoch": 0.18584070796460178, + "flos": 530058788352.0, + "grad_norm": 0.07592153009574268, + "language_loss": 0.91147316, + "learning_rate": 0.0009376601411095096, + "loss": 0.92259943, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.328125, + "step": 966, + "time_per_iteration": 2.6635591983795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136435, + "balance_loss_mlp": 1.10331881, + "epoch": 0.18603308964986534, + "flos": 482863527936.0, + "grad_norm": 0.16243248674453353, + "language_loss": 0.86357069, + "learning_rate": 0.0009375094118748622, + "loss": 0.87493503, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.33129883, + "step": 967, + "time_per_iteration": 2.522481679916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157231, + "balance_loss_mlp": 1.12368488, + "epoch": 0.1862254713351289, + "flos": 800976591360.0, + "grad_norm": 0.09362045292578998, + "language_loss": 0.90268016, + "learning_rate": 0.0009373585127839976, + "loss": 0.9142524, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.33544922, + "step": 968, + "time_per_iteration": 2.97210693359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152937, + "balance_loss_mlp": 1.1203692, + "epoch": 0.18641785302039246, + "flos": 478082456064.0, + "grad_norm": 0.0858654394488603, + "language_loss": 0.90605009, + "learning_rate": 0.0009372074438954994, + "loss": 0.91757941, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.32568359, + "step": 969, + "time_per_iteration": 2.541006088256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143226, + "balance_loss_mlp": 1.11030006, + "epoch": 0.18661023470565602, + "flos": 388695587328.0, + "grad_norm": 0.08996217866854661, + "language_loss": 0.91142356, + "learning_rate": 0.0009370562052680181, + "loss": 0.92285585, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.3293457, + "step": 970, + "time_per_iteration": 2.4985642433166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113885, + "balance_loss_mlp": 1.0805068, + "epoch": 0.18680261639091958, + "flos": 564402207744.0, + "grad_norm": 0.07707645065684006, + "language_loss": 0.88999593, + "learning_rate": 0.0009369047969602695, + "loss": 0.90113479, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.33398438, + "step": 971, + "time_per_iteration": 2.7079591751098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109357, + "balance_loss_mlp": 1.05985761, + "epoch": 0.18699499807618314, + "flos": 479018009088.0, + "grad_norm": 0.28998936625974164, + "language_loss": 0.86178541, + "learning_rate": 0.0009367532190310357, + "loss": 0.87272114, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.33740234, + "step": 972, + "time_per_iteration": 2.5647881031036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090596, + "balance_loss_mlp": 1.05535769, + "epoch": 0.1871873797614467, + "flos": 553022802432.0, + "grad_norm": 0.12045660132436305, + "language_loss": 0.89086068, + "learning_rate": 0.0009366014715391644, + "loss": 0.90176666, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.3527832, + "step": 973, + "time_per_iteration": 2.670271396636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098789, + "balance_loss_mlp": 1.06316936, + "epoch": 0.18737976144671029, + "flos": 552526617600.0, + "grad_norm": 0.06161121065256625, + "language_loss": 0.83607596, + "learning_rate": 0.0009364495545435693, + "loss": 0.84706378, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.35644531, + "step": 974, + "time_per_iteration": 2.7562968730926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115355, + "balance_loss_mlp": 1.08068919, + "epoch": 0.18757214313197385, + "flos": 502002372096.0, + "grad_norm": 0.0775906753320085, + "language_loss": 0.88572645, + "learning_rate": 0.0009362974681032297, + "loss": 0.89688003, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.34692383, + "step": 975, + "time_per_iteration": 2.618015766143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115715, + "balance_loss_mlp": 1.08102489, + "epoch": 0.1877645248172374, + "flos": 674691337728.0, + "grad_norm": 0.0743374582836454, + "language_loss": 0.87880743, + "learning_rate": 0.0009361452122771907, + "loss": 0.88996458, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.34716797, + "step": 976, + "time_per_iteration": 2.8973281383514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111997, + "balance_loss_mlp": 1.07754576, + "epoch": 0.18795690650250096, + "flos": 404771696640.0, + "grad_norm": 0.09294234225416288, + "language_loss": 0.83035111, + "learning_rate": 0.0009359927871245635, + "loss": 0.84147108, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.34472656, + "step": 977, + "time_per_iteration": 2.4868054389953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113361, + "balance_loss_mlp": 1.079983, + "epoch": 0.18814928818776452, + "flos": 637599485952.0, + "grad_norm": 0.08516170058225998, + "language_loss": 0.86584175, + "learning_rate": 0.0009358401927045246, + "loss": 0.87697542, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.33398438, + "step": 978, + "time_per_iteration": 2.8482747077941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110398, + "balance_loss_mlp": 1.07160234, + "epoch": 0.18834166987302808, + "flos": 1137825446400.0, + "grad_norm": 0.09204359799181126, + "language_loss": 0.88258326, + "learning_rate": 0.0009356874290763166, + "loss": 0.89362299, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.32373047, + "step": 979, + "time_per_iteration": 3.4733643531799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097908, + "balance_loss_mlp": 1.06529236, + "epoch": 0.18853405155829164, + "flos": 504538398720.0, + "grad_norm": 0.0915662715535259, + "language_loss": 0.88419032, + "learning_rate": 0.0009355344962992474, + "loss": 0.89516938, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.32617188, + "step": 980, + "time_per_iteration": 2.650907039642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098514, + "balance_loss_mlp": 1.06606519, + "epoch": 0.1887264332435552, + "flos": 607879987200.0, + "grad_norm": 0.13079327807375027, + "language_loss": 0.87520993, + "learning_rate": 0.0009353813944326908, + "loss": 0.88619506, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.32446289, + "step": 981, + "time_per_iteration": 2.937286138534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090956, + "balance_loss_mlp": 1.05845952, + "epoch": 0.1889188149288188, + "flos": 552264749568.0, + "grad_norm": 0.0755425770798311, + "language_loss": 0.82502437, + "learning_rate": 0.0009352281235360863, + "loss": 0.83593392, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.32495117, + "step": 982, + "time_per_iteration": 2.6979949474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096297, + "balance_loss_mlp": 1.06380093, + "epoch": 0.18911119661408235, + "flos": 418332128256.0, + "grad_norm": 0.0751009418062393, + "language_loss": 0.8470037, + "learning_rate": 0.0009350746836689389, + "loss": 0.85796672, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.32495117, + "step": 983, + "time_per_iteration": 2.538175582885742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131236, + "balance_loss_mlp": 1.10624993, + "epoch": 0.1893035782993459, + "flos": 1481141320704.0, + "grad_norm": 0.036870034223354546, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82570457, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.24902344, + "step": 984, + "time_per_iteration": 4.979044198989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097045, + "balance_loss_mlp": 1.0640955, + "epoch": 0.18949595998460947, + "flos": 508220974080.0, + "grad_norm": 0.0642225711410905, + "language_loss": 0.82250404, + "learning_rate": 0.0009347672972613634, + "loss": 0.83347452, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.3293457, + "step": 985, + "time_per_iteration": 2.593069553375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086897, + "balance_loss_mlp": 1.05339909, + "epoch": 0.18968834166987303, + "flos": 530812459008.0, + "grad_norm": 0.0802805585104316, + "language_loss": 0.85205728, + "learning_rate": 0.0009346133508402735, + "loss": 0.86292624, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.33520508, + "step": 986, + "time_per_iteration": 2.68485426902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095707, + "balance_loss_mlp": 1.06216192, + "epoch": 0.1898807233551366, + "flos": 499515807744.0, + "grad_norm": 0.09481546728284458, + "language_loss": 0.84014487, + "learning_rate": 0.0009344592356873166, + "loss": 0.85110188, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.33544922, + "step": 987, + "time_per_iteration": 2.6432511806488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105999, + "balance_loss_mlp": 1.07068968, + "epoch": 0.19007310504040015, + "flos": 601936399872.0, + "grad_norm": 0.06245857415063817, + "language_loss": 0.78166318, + "learning_rate": 0.0009343049518623255, + "loss": 0.79272318, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.35327148, + "step": 988, + "time_per_iteration": 2.7121620178222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120949, + "balance_loss_mlp": 1.085639, + "epoch": 0.1902654867256637, + "flos": 601374786048.0, + "grad_norm": 0.05952536728335112, + "language_loss": 0.83312774, + "learning_rate": 0.0009341504994251985, + "loss": 0.84433722, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.35327148, + "step": 989, + "time_per_iteration": 2.852208375930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107971, + "balance_loss_mlp": 1.05224383, + "epoch": 0.19045786841092727, + "flos": 1574925147648.0, + "grad_norm": 0.03692041129742979, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74600208, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.27539062, + "step": 990, + "time_per_iteration": 4.994582414627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137443, + "balance_loss_mlp": 1.09991539, + "epoch": 0.19065025009619085, + "flos": 681280906752.0, + "grad_norm": 0.056855766240422066, + "language_loss": 0.81516898, + "learning_rate": 0.0009338410889544574, + "loss": 0.82654339, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.37524414, + "step": 991, + "time_per_iteration": 3.017310380935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011468, + "balance_loss_mlp": 1.10831964, + "epoch": 0.1908426317814544, + "flos": 601971305472.0, + "grad_norm": 0.07195285392178245, + "language_loss": 0.87761319, + "learning_rate": 0.000933686131040967, + "loss": 0.88908118, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.38427734, + "step": 992, + "time_per_iteration": 2.8278160095214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144526, + "balance_loss_mlp": 1.10726154, + "epoch": 0.19103501346671797, + "flos": 586027616256.0, + "grad_norm": 0.07034922378143431, + "language_loss": 0.90235877, + "learning_rate": 0.0009335310047555883, + "loss": 0.91380405, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.37255859, + "step": 993, + "time_per_iteration": 2.8100597858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114172, + "balance_loss_mlp": 1.1050992, + "epoch": 0.19122739515198153, + "flos": 545494708224.0, + "grad_norm": 0.06860817272021875, + "language_loss": 0.88542485, + "learning_rate": 0.0009333757101585467, + "loss": 0.896842, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.36621094, + "step": 994, + "time_per_iteration": 2.6616106033325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131089, + "balance_loss_mlp": 1.0961132, + "epoch": 0.1914197768372451, + "flos": 521171739648.0, + "grad_norm": 0.0687364291234037, + "language_loss": 0.9324351, + "learning_rate": 0.0009332202473101329, + "loss": 0.94374597, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.3503418, + "step": 995, + "time_per_iteration": 2.7158730030059814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128597, + "balance_loss_mlp": 1.09400272, + "epoch": 0.19161215852250865, + "flos": 610961660928.0, + "grad_norm": 0.07471533178048465, + "language_loss": 0.82843316, + "learning_rate": 0.0009330646162707028, + "loss": 0.83971918, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.34619141, + "step": 996, + "time_per_iteration": 2.7293272018432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111164, + "balance_loss_mlp": 1.07730889, + "epoch": 0.1918045402077722, + "flos": 846281806848.0, + "grad_norm": 0.05994533952598048, + "language_loss": 0.84315574, + "learning_rate": 0.0009329088171006779, + "loss": 0.85426736, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.33886719, + "step": 997, + "time_per_iteration": 3.140655517578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110059, + "balance_loss_mlp": 1.07672858, + "epoch": 0.19199692189303577, + "flos": 465699096576.0, + "grad_norm": 0.06034276327327584, + "language_loss": 0.85438752, + "learning_rate": 0.0009327528498605446, + "loss": 0.86548805, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.33349609, + "step": 998, + "time_per_iteration": 2.5440673828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111119, + "balance_loss_mlp": 1.0778836, + "epoch": 0.19218930357829936, + "flos": 531318818304.0, + "grad_norm": 0.07596013514481052, + "language_loss": 0.89179873, + "learning_rate": 0.0009325967146108548, + "loss": 0.90290987, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.33251953, + "step": 999, + "time_per_iteration": 2.658561944961548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110413, + "balance_loss_mlp": 1.07856011, + "epoch": 0.19238168526356292, + "flos": 601350054912.0, + "grad_norm": 0.07750808981236326, + "language_loss": 0.8717553, + "learning_rate": 0.0009324404114122258, + "loss": 0.88285947, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.31835938, + "step": 1000, + "time_per_iteration": 2.7275264263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107767, + "balance_loss_mlp": 1.07667685, + "epoch": 0.19257406694882648, + "flos": 571690192896.0, + "grad_norm": 0.11937061799335263, + "language_loss": 0.86227536, + "learning_rate": 0.0009322839403253397, + "loss": 0.873353, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.31054688, + "step": 1001, + "time_per_iteration": 2.788405656814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110663, + "balance_loss_mlp": 1.0798831, + "epoch": 0.19276644863409004, + "flos": 801478568448.0, + "grad_norm": 0.07054171225662055, + "language_loss": 0.84055525, + "learning_rate": 0.0009321273014109439, + "loss": 0.85166192, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.30737305, + "step": 1002, + "time_per_iteration": 2.942535877227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110952, + "balance_loss_mlp": 1.0799818, + "epoch": 0.1929588303193536, + "flos": 563024314368.0, + "grad_norm": 0.057550289991663166, + "language_loss": 0.84200853, + "learning_rate": 0.0009319704947298513, + "loss": 0.85311806, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.30932617, + "step": 1003, + "time_per_iteration": 2.919499158859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110832, + "balance_loss_mlp": 1.07933664, + "epoch": 0.19315121200461716, + "flos": 626550349824.0, + "grad_norm": 0.07245253176429253, + "language_loss": 0.88662004, + "learning_rate": 0.0009318135203429393, + "loss": 0.89772838, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.31469727, + "step": 1004, + "time_per_iteration": 2.7168095111846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118727, + "balance_loss_mlp": 1.08770871, + "epoch": 0.19334359368988072, + "flos": 517169069568.0, + "grad_norm": 0.17670411464250102, + "language_loss": 0.8771624, + "learning_rate": 0.0009316563783111511, + "loss": 0.88834965, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.30981445, + "step": 1005, + "time_per_iteration": 2.7140395641326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116793, + "balance_loss_mlp": 1.08501196, + "epoch": 0.19353597537514428, + "flos": 693751606272.0, + "grad_norm": 0.08689807004334223, + "language_loss": 0.81857723, + "learning_rate": 0.0009314990686954943, + "loss": 0.82974517, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.31762695, + "step": 1006, + "time_per_iteration": 2.904555320739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106968, + "balance_loss_mlp": 1.07482958, + "epoch": 0.19372835706040784, + "flos": 1209665180160.0, + "grad_norm": 0.05703714693088015, + "language_loss": 0.80953801, + "learning_rate": 0.000931341591557042, + "loss": 0.82060766, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.32128906, + "step": 1007, + "time_per_iteration": 3.6937167644500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092779, + "balance_loss_mlp": 1.06078339, + "epoch": 0.19392073874567142, + "flos": 520368606720.0, + "grad_norm": 0.08309123344760973, + "language_loss": 0.87180555, + "learning_rate": 0.0009311839469569325, + "loss": 0.88273335, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.31982422, + "step": 1008, + "time_per_iteration": 2.6189959049224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099869, + "balance_loss_mlp": 1.06746829, + "epoch": 0.19411312043093498, + "flos": 588543293952.0, + "grad_norm": 0.10100018073420348, + "language_loss": 0.8730033, + "learning_rate": 0.0009310261349563687, + "loss": 0.88400197, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.32397461, + "step": 1009, + "time_per_iteration": 2.6890206336975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108302, + "balance_loss_mlp": 1.07516217, + "epoch": 0.19430550211619854, + "flos": 579085867008.0, + "grad_norm": 0.08933629042911205, + "language_loss": 0.85340321, + "learning_rate": 0.0009308681556166186, + "loss": 0.86448622, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.33154297, + "step": 1010, + "time_per_iteration": 2.824448585510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098531, + "balance_loss_mlp": 1.06546259, + "epoch": 0.1944978838014621, + "flos": 620848281600.0, + "grad_norm": 0.16096270434238172, + "language_loss": 0.87149101, + "learning_rate": 0.0009307100089990152, + "loss": 0.88247633, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.33081055, + "step": 1011, + "time_per_iteration": 2.74092173576355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105966, + "balance_loss_mlp": 1.07070398, + "epoch": 0.19469026548672566, + "flos": 598440089088.0, + "grad_norm": 0.08074644620093238, + "language_loss": 0.83646113, + "learning_rate": 0.0009305516951649568, + "loss": 0.84752083, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.3527832, + "step": 1012, + "time_per_iteration": 2.7069194316864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110286, + "balance_loss_mlp": 1.06726432, + "epoch": 0.19488264717198922, + "flos": 551890810368.0, + "grad_norm": 0.06954368088501534, + "language_loss": 0.86469871, + "learning_rate": 0.0009303932141759057, + "loss": 0.8757273, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.35595703, + "step": 1013, + "time_per_iteration": 2.7547597885131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109212, + "balance_loss_mlp": 1.07352042, + "epoch": 0.19507502885725278, + "flos": 665842166784.0, + "grad_norm": 0.08663105683367789, + "language_loss": 0.83731425, + "learning_rate": 0.0009302345660933902, + "loss": 0.84840637, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.35742188, + "step": 1014, + "time_per_iteration": 2.789421319961548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120247, + "balance_loss_mlp": 1.0850327, + "epoch": 0.19526741054251634, + "flos": 670771625472.0, + "grad_norm": 0.07248055996229082, + "language_loss": 0.85224003, + "learning_rate": 0.0009300757509790026, + "loss": 0.86344242, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.35229492, + "step": 1015, + "time_per_iteration": 2.8293235301971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138983, + "balance_loss_mlp": 1.10412574, + "epoch": 0.19545979222777993, + "flos": 446983653888.0, + "grad_norm": 0.08486300836715333, + "language_loss": 0.90133542, + "learning_rate": 0.0009299167688944005, + "loss": 0.91272521, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.34912109, + "step": 1016, + "time_per_iteration": 2.5042884349823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130604, + "balance_loss_mlp": 1.09453082, + "epoch": 0.1956521739130435, + "flos": 568813722624.0, + "grad_norm": 0.08182270058547457, + "language_loss": 0.86074531, + "learning_rate": 0.0009297576199013063, + "loss": 0.87205136, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.36108398, + "step": 1017, + "time_per_iteration": 2.678986072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01400492, + "balance_loss_mlp": 1.36921108, + "epoch": 0.19584455559830705, + "flos": 1454969157120.0, + "grad_norm": 0.11724614930420041, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74402618, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.3125, + "step": 1018, + "time_per_iteration": 4.915104627609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214569, + "balance_loss_mlp": 1.18538666, + "epoch": 0.1960369372835706, + "flos": 1590320369664.0, + "grad_norm": 0.08011150215373515, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.8064087, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.29101562, + "step": 1019, + "time_per_iteration": 5.440853834152222 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100562, + "balance_loss_mlp": 1.06565762, + "epoch": 0.19622931896883417, + "flos": 615709237248.0, + "grad_norm": 0.05949147024105531, + "language_loss": 0.86637676, + "learning_rate": 0.0009292791720892659, + "loss": 0.8773824, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.34960938, + "step": 1020, + "time_per_iteration": 2.8909873962402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110075, + "balance_loss_mlp": 1.06534433, + "epoch": 0.19642170065409773, + "flos": 465950790144.0, + "grad_norm": 0.08017401986968183, + "language_loss": 0.8851831, + "learning_rate": 0.0009291193560807218, + "loss": 0.89619064, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.35424805, + "step": 1021, + "time_per_iteration": 2.5876846313476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108676, + "balance_loss_mlp": 1.07329464, + "epoch": 0.19661408233936128, + "flos": 515040477696.0, + "grad_norm": 0.061421548763730266, + "language_loss": 0.86832839, + "learning_rate": 0.0009289593734732688, + "loss": 0.87941515, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.35400391, + "step": 1022, + "time_per_iteration": 2.5790083408355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116018, + "balance_loss_mlp": 1.08097017, + "epoch": 0.19680646402462484, + "flos": 392427624960.0, + "grad_norm": 0.06446420344630455, + "language_loss": 0.93862659, + "learning_rate": 0.0009287992243290175, + "loss": 0.94978678, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.35083008, + "step": 1023, + "time_per_iteration": 2.474393844604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126064, + "balance_loss_mlp": 1.09120703, + "epoch": 0.19699884570988843, + "flos": 626122566144.0, + "grad_norm": 0.06850198630338038, + "language_loss": 0.90312016, + "learning_rate": 0.0009286389087101435, + "loss": 0.91438079, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.34887695, + "step": 1024, + "time_per_iteration": 2.835756540298462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143856, + "balance_loss_mlp": 1.10885596, + "epoch": 0.197191227395152, + "flos": 557710742016.0, + "grad_norm": 0.06824019021489727, + "language_loss": 0.88388735, + "learning_rate": 0.0009284784266788864, + "loss": 0.8953259, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.3503418, + "step": 1025, + "time_per_iteration": 2.702479839324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144786, + "balance_loss_mlp": 1.11033428, + "epoch": 0.19738360908041555, + "flos": 664681061376.0, + "grad_norm": 0.08832519553576638, + "language_loss": 0.92221844, + "learning_rate": 0.0009283177782975512, + "loss": 0.93366635, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.34472656, + "step": 1026, + "time_per_iteration": 2.9851789474487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132889, + "balance_loss_mlp": 1.09850955, + "epoch": 0.1975759907656791, + "flos": 522244094976.0, + "grad_norm": 0.07134152927872167, + "language_loss": 0.87642545, + "learning_rate": 0.000928156963628507, + "loss": 0.88775432, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.34423828, + "step": 1027, + "time_per_iteration": 2.61114239692688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131535, + "balance_loss_mlp": 1.09686899, + "epoch": 0.19776837245094267, + "flos": 462233309184.0, + "grad_norm": 0.0723355054215018, + "language_loss": 0.88370252, + "learning_rate": 0.0009279959827341877, + "loss": 0.8950178, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.34692383, + "step": 1028, + "time_per_iteration": 2.7794618606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118297, + "balance_loss_mlp": 1.08248627, + "epoch": 0.19796075413620623, + "flos": 502809887232.0, + "grad_norm": 0.08314527790784168, + "language_loss": 0.87832725, + "learning_rate": 0.0009278348356770915, + "loss": 0.88951027, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.3581543, + "step": 1029, + "time_per_iteration": 2.5507349967956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111156, + "balance_loss_mlp": 1.07565451, + "epoch": 0.1981531358214698, + "flos": 507281038848.0, + "grad_norm": 0.08630189211983, + "language_loss": 0.85379845, + "learning_rate": 0.0009276735225197814, + "loss": 0.864914, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.359375, + "step": 1030, + "time_per_iteration": 2.597379207611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102081, + "balance_loss_mlp": 1.06650949, + "epoch": 0.19834551750673335, + "flos": 531275148288.0, + "grad_norm": 0.0907652175310469, + "language_loss": 0.85545719, + "learning_rate": 0.0009275120433248847, + "loss": 0.86647797, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.35571289, + "step": 1031, + "time_per_iteration": 2.687185287475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110602, + "balance_loss_mlp": 1.07545948, + "epoch": 0.1985378991919969, + "flos": 775147691520.0, + "grad_norm": 0.07461022440082729, + "language_loss": 0.85621846, + "learning_rate": 0.0009273503981550931, + "loss": 0.86732447, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.35205078, + "step": 1032, + "time_per_iteration": 3.0693371295928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101575, + "balance_loss_mlp": 1.06586027, + "epoch": 0.1987302808772605, + "flos": 434063411712.0, + "grad_norm": 0.15106160662845974, + "language_loss": 0.86904788, + "learning_rate": 0.0009271885870731626, + "loss": 0.88006359, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.35717773, + "step": 1033, + "time_per_iteration": 2.506413459777832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111392, + "balance_loss_mlp": 1.07536733, + "epoch": 0.19892266256252406, + "flos": 553342897152.0, + "grad_norm": 0.08761306204685197, + "language_loss": 0.88616383, + "learning_rate": 0.0009270266101419143, + "loss": 0.89727777, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.36035156, + "step": 1034, + "time_per_iteration": 2.6036899089813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098535, + "balance_loss_mlp": 1.06425047, + "epoch": 0.19911504424778761, + "flos": 549596302848.0, + "grad_norm": 0.06384965023316368, + "language_loss": 0.84987146, + "learning_rate": 0.0009268644674242328, + "loss": 0.86085683, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.34301758, + "step": 1035, + "time_per_iteration": 2.7015764713287354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113397, + "balance_loss_mlp": 1.07806361, + "epoch": 0.19930742593305117, + "flos": 518024636928.0, + "grad_norm": 0.07882877348480413, + "language_loss": 0.80515361, + "learning_rate": 0.0009267021589830678, + "loss": 0.81628758, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.35327148, + "step": 1036, + "time_per_iteration": 2.643951892852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01709033, + "balance_loss_mlp": 1.66611803, + "epoch": 0.19949980761831473, + "flos": 1508516849664.0, + "grad_norm": 0.11391778300632174, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.79336113, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.4296875, + "step": 1037, + "time_per_iteration": 4.949443101882935 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103323, + "balance_loss_mlp": 1.0683465, + "epoch": 0.1996921893035783, + "flos": 697803738624.0, + "grad_norm": 0.08774205983796875, + "language_loss": 0.92838657, + "learning_rate": 0.000926377045182406, + "loss": 0.93941981, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.35009766, + "step": 1038, + "time_per_iteration": 2.9512856006622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112905, + "balance_loss_mlp": 1.07821524, + "epoch": 0.19988457098884185, + "flos": 726682226688.0, + "grad_norm": 0.06255968137292814, + "language_loss": 0.87761998, + "learning_rate": 0.0009262142399491296, + "loss": 0.888749, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.34716797, + "step": 1039, + "time_per_iteration": 3.0552709102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112668, + "balance_loss_mlp": 1.09187126, + "epoch": 0.2000769526741054, + "flos": 560275881984.0, + "grad_norm": 0.06862779420362043, + "language_loss": 0.87532222, + "learning_rate": 0.0009260512692448105, + "loss": 0.88658899, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.34863281, + "step": 1040, + "time_per_iteration": 2.6962392330169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140867, + "balance_loss_mlp": 1.10615349, + "epoch": 0.200269334359369, + "flos": 571758594048.0, + "grad_norm": 0.07166596959521815, + "language_loss": 0.84091032, + "learning_rate": 0.000925888133132719, + "loss": 0.852319, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.34741211, + "step": 1041, + "time_per_iteration": 2.791015148162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01724521, + "balance_loss_mlp": 1.67225933, + "epoch": 0.20046171604463256, + "flos": 1485362340864.0, + "grad_norm": 0.16089622263247963, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.8133496, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.5234375, + "step": 1042, + "time_per_iteration": 4.978717565536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116458, + "balance_loss_mlp": 1.08169639, + "epoch": 0.20065409772989612, + "flos": 496266808320.0, + "grad_norm": 0.06766738281342395, + "language_loss": 0.80769098, + "learning_rate": 0.0009255613649386244, + "loss": 0.81885552, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.34790039, + "step": 1043, + "time_per_iteration": 2.6604766845703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122505, + "balance_loss_mlp": 1.08709943, + "epoch": 0.20084647941515968, + "flos": 579094631424.0, + "grad_norm": 0.07361728486384381, + "language_loss": 0.78999138, + "learning_rate": 0.0009253977329834838, + "loss": 0.80121642, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.35449219, + "step": 1044, + "time_per_iteration": 2.7036681175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108398, + "balance_loss_mlp": 1.07227719, + "epoch": 0.20103886110042324, + "flos": 641775273984.0, + "grad_norm": 0.08623717161971375, + "language_loss": 0.86596096, + "learning_rate": 0.0009252339358742965, + "loss": 0.87704492, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.36108398, + "step": 1045, + "time_per_iteration": 2.874620199203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118791, + "balance_loss_mlp": 1.08369565, + "epoch": 0.2012312427856868, + "flos": 441720953856.0, + "grad_norm": 0.06963930913543727, + "language_loss": 0.82984746, + "learning_rate": 0.000925069973674654, + "loss": 0.84103537, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.35107422, + "step": 1046, + "time_per_iteration": 2.628878116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106078, + "balance_loss_mlp": 1.07017231, + "epoch": 0.20142362447095036, + "flos": 554135855616.0, + "grad_norm": 0.07870556033127275, + "language_loss": 0.88610631, + "learning_rate": 0.000924905846448212, + "loss": 0.89716709, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.35913086, + "step": 1047, + "time_per_iteration": 2.747220754623413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109682, + "balance_loss_mlp": 1.0750165, + "epoch": 0.20161600615621392, + "flos": 669988841472.0, + "grad_norm": 0.10747792176710873, + "language_loss": 0.85372317, + "learning_rate": 0.0009247415542586906, + "loss": 0.86482, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.34667969, + "step": 1048, + "time_per_iteration": 2.8556973934173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119311, + "balance_loss_mlp": 1.08285666, + "epoch": 0.2018083878414775, + "flos": 572788689408.0, + "grad_norm": 0.2214820598260846, + "language_loss": 0.83177209, + "learning_rate": 0.0009245770971698735, + "loss": 0.84296525, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.36450195, + "step": 1049, + "time_per_iteration": 2.9050869941711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132964, + "balance_loss_mlp": 1.09798741, + "epoch": 0.20200076952674106, + "flos": 425624495616.0, + "grad_norm": 0.08175342307012821, + "language_loss": 0.88327754, + "learning_rate": 0.0009244124752456087, + "loss": 0.89460719, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.34985352, + "step": 1050, + "time_per_iteration": 2.5141613483428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151097, + "balance_loss_mlp": 1.11557305, + "epoch": 0.20219315121200462, + "flos": 536326852608.0, + "grad_norm": 0.06393011823673703, + "language_loss": 0.85371649, + "learning_rate": 0.0009242476885498081, + "loss": 0.86522746, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.35522461, + "step": 1051, + "time_per_iteration": 2.727687358856201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176333, + "balance_loss_mlp": 1.14171457, + "epoch": 0.20238553289726818, + "flos": 477634323456.0, + "grad_norm": 0.09914193731013146, + "language_loss": 0.80802011, + "learning_rate": 0.0009240827371464474, + "loss": 0.81978351, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.34643555, + "step": 1052, + "time_per_iteration": 2.552121877670288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191475, + "balance_loss_mlp": 1.15521157, + "epoch": 0.20257791458253174, + "flos": 1151611430400.0, + "grad_norm": 0.1023503287046967, + "language_loss": 0.83863074, + "learning_rate": 0.0009239176210995666, + "loss": 0.85054547, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.36230469, + "step": 1053, + "time_per_iteration": 3.47882342338562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190284, + "balance_loss_mlp": 1.15561819, + "epoch": 0.2027702962677953, + "flos": 666606011904.0, + "grad_norm": 0.09115683042396579, + "language_loss": 0.93677175, + "learning_rate": 0.0009237523404732695, + "loss": 0.94867456, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.34692383, + "step": 1054, + "time_per_iteration": 2.8701720237731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173476, + "balance_loss_mlp": 1.13838029, + "epoch": 0.20296267795305886, + "flos": 641011428864.0, + "grad_norm": 0.10782024136876088, + "language_loss": 0.8421399, + "learning_rate": 0.0009235868953317235, + "loss": 0.85387468, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.3515625, + "step": 1055, + "time_per_iteration": 2.8210723400115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161281, + "balance_loss_mlp": 1.12682986, + "epoch": 0.20315505963832242, + "flos": 930187777536.0, + "grad_norm": 0.07346272336072437, + "language_loss": 0.85227096, + "learning_rate": 0.0009234212857391602, + "loss": 0.86388373, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.3449707, + "step": 1056, + "time_per_iteration": 3.2212936878204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153084, + "balance_loss_mlp": 1.11727369, + "epoch": 0.20334744132358598, + "flos": 561818128896.0, + "grad_norm": 0.054845505201833546, + "language_loss": 0.89240777, + "learning_rate": 0.000923255511759875, + "loss": 0.90393853, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.3581543, + "step": 1057, + "time_per_iteration": 2.834444522857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156175, + "balance_loss_mlp": 1.12146115, + "epoch": 0.20353982300884957, + "flos": 643902455808.0, + "grad_norm": 0.10969304378799022, + "language_loss": 0.84913409, + "learning_rate": 0.000923089573458227, + "loss": 0.86069584, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.34716797, + "step": 1058, + "time_per_iteration": 2.8832740783691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115137, + "balance_loss_mlp": 1.1168946, + "epoch": 0.20373220469411313, + "flos": 651101690880.0, + "grad_norm": 0.24205150411640483, + "language_loss": 0.83790255, + "learning_rate": 0.0009229234708986392, + "loss": 0.84941626, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.3449707, + "step": 1059, + "time_per_iteration": 2.8837289810180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01633401, + "balance_loss_mlp": 1.57885134, + "epoch": 0.2039245863793767, + "flos": 1436939136000.0, + "grad_norm": 0.08953482343612705, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.83300292, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.546875, + "step": 1060, + "time_per_iteration": 4.667459011077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158699, + "balance_loss_mlp": 1.1247009, + "epoch": 0.20411696806464025, + "flos": 596678082048.0, + "grad_norm": 0.0736942782322193, + "language_loss": 0.84963936, + "learning_rate": 0.0009225907732636548, + "loss": 0.86122632, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.34033203, + "step": 1061, + "time_per_iteration": 2.7532095909118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164545, + "balance_loss_mlp": 1.12954497, + "epoch": 0.2043093497499038, + "flos": 573530775552.0, + "grad_norm": 0.09512005659435491, + "language_loss": 0.8641578, + "learning_rate": 0.0009224241783174227, + "loss": 0.87580323, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.35009766, + "step": 1062, + "time_per_iteration": 2.683047294616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147761, + "balance_loss_mlp": 1.11347604, + "epoch": 0.20450173143516737, + "flos": 630061217280.0, + "grad_norm": 0.07955707081408017, + "language_loss": 0.85456479, + "learning_rate": 0.0009222574193715802, + "loss": 0.86604244, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.34326172, + "step": 1063, + "time_per_iteration": 2.8293161392211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139319, + "balance_loss_mlp": 1.10474837, + "epoch": 0.20469411312043093, + "flos": 573718450176.0, + "grad_norm": 0.08617592440024102, + "language_loss": 0.85715151, + "learning_rate": 0.000922090496490869, + "loss": 0.8685447, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.34619141, + "step": 1064, + "time_per_iteration": 2.749298334121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124198, + "balance_loss_mlp": 1.08865011, + "epoch": 0.20488649480569449, + "flos": 636748300800.0, + "grad_norm": 0.06572729358097257, + "language_loss": 0.89767212, + "learning_rate": 0.0009219234097400937, + "loss": 0.90891409, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.35595703, + "step": 1065, + "time_per_iteration": 2.8508355617523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107138, + "balance_loss_mlp": 1.07175696, + "epoch": 0.20507887649095807, + "flos": 975383894016.0, + "grad_norm": 0.05918330788086957, + "language_loss": 0.82970631, + "learning_rate": 0.0009217561591841237, + "loss": 0.8407777, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.35400391, + "step": 1066, + "time_per_iteration": 3.3216452598571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102073, + "balance_loss_mlp": 1.06566656, + "epoch": 0.20527125817622163, + "flos": 485940819456.0, + "grad_norm": 0.09526156176010836, + "language_loss": 0.81088316, + "learning_rate": 0.0009215887448878913, + "loss": 0.82190394, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.36401367, + "step": 1067, + "time_per_iteration": 2.596022129058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099777, + "balance_loss_mlp": 1.06191611, + "epoch": 0.2054636398614852, + "flos": 526921860096.0, + "grad_norm": 0.072135210200994, + "language_loss": 0.84963661, + "learning_rate": 0.0009214211669163922, + "loss": 0.86063439, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.37841797, + "step": 1068, + "time_per_iteration": 4.440082311630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096187, + "balance_loss_mlp": 1.05923223, + "epoch": 0.20565602154674875, + "flos": 557898416640.0, + "grad_norm": 0.07010547570027807, + "language_loss": 0.93398243, + "learning_rate": 0.0009212534253346862, + "loss": 0.94494426, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.36938477, + "step": 1069, + "time_per_iteration": 2.699843406677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096083, + "balance_loss_mlp": 1.05912852, + "epoch": 0.2058484032320123, + "flos": 503976784896.0, + "grad_norm": 0.07799270520419531, + "language_loss": 0.83685625, + "learning_rate": 0.0009210855202078964, + "loss": 0.84781706, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.36962891, + "step": 1070, + "time_per_iteration": 2.5999720096588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010932, + "balance_loss_mlp": 1.05810475, + "epoch": 0.20604078491727587, + "flos": 432950358528.0, + "grad_norm": 0.0723710550133871, + "language_loss": 0.86933672, + "learning_rate": 0.0009209174516012091, + "loss": 0.88026869, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.35131836, + "step": 1071, + "time_per_iteration": 2.503551483154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092567, + "balance_loss_mlp": 1.05794883, + "epoch": 0.20623316660253943, + "flos": 608421252096.0, + "grad_norm": 0.05962541016594441, + "language_loss": 0.88928151, + "learning_rate": 0.0009207492195798747, + "loss": 0.90020716, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.34667969, + "step": 1072, + "time_per_iteration": 2.8607378005981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094226, + "balance_loss_mlp": 1.05972731, + "epoch": 0.206425548287803, + "flos": 480184906752.0, + "grad_norm": 0.06398863953592046, + "language_loss": 0.84846818, + "learning_rate": 0.0009205808242092061, + "loss": 0.85941041, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.34521484, + "step": 1073, + "time_per_iteration": 2.644134044647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095063, + "balance_loss_mlp": 1.06080186, + "epoch": 0.20661792997306658, + "flos": 949007937024.0, + "grad_norm": 0.06666861242543158, + "language_loss": 0.82488537, + "learning_rate": 0.0009204122655545808, + "loss": 0.83583593, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.34277344, + "step": 1074, + "time_per_iteration": 3.3254919052124023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110011, + "balance_loss_mlp": 1.07582152, + "epoch": 0.20681031165833014, + "flos": 603206604288.0, + "grad_norm": 0.0719401545163873, + "language_loss": 0.81125832, + "learning_rate": 0.0009202435436814388, + "loss": 0.82235849, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.34228516, + "step": 1075, + "time_per_iteration": 2.704252243041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105303, + "balance_loss_mlp": 1.0707798, + "epoch": 0.2070026933435937, + "flos": 708665200128.0, + "grad_norm": 0.06775779875999222, + "language_loss": 0.89715004, + "learning_rate": 0.0009200746586552836, + "loss": 0.90820301, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.34545898, + "step": 1076, + "time_per_iteration": 2.897177219390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102531, + "balance_loss_mlp": 1.06869972, + "epoch": 0.20719507502885726, + "flos": 829456409088.0, + "grad_norm": 0.12065235325240355, + "language_loss": 0.83624744, + "learning_rate": 0.0009199056105416825, + "loss": 0.84727275, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.33862305, + "step": 1077, + "time_per_iteration": 3.0771028995513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106086, + "balance_loss_mlp": 1.07218289, + "epoch": 0.20738745671412082, + "flos": 637993774080.0, + "grad_norm": 0.06486814220319007, + "language_loss": 0.8622663, + "learning_rate": 0.0009197363994062654, + "loss": 0.8733272, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.33935547, + "step": 1078, + "time_per_iteration": 2.807009696960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112785, + "balance_loss_mlp": 1.07914448, + "epoch": 0.20757983839938438, + "flos": 685258845696.0, + "grad_norm": 0.06985523034062016, + "language_loss": 0.84313667, + "learning_rate": 0.0009195670253147262, + "loss": 0.85426456, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.33642578, + "step": 1079, + "time_per_iteration": 2.9738564491271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114515, + "balance_loss_mlp": 1.0817802, + "epoch": 0.20777222008464794, + "flos": 519024208896.0, + "grad_norm": 0.09202653272357895, + "language_loss": 0.81912923, + "learning_rate": 0.0009193974883328216, + "loss": 0.8302744, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.32739258, + "step": 1080, + "time_per_iteration": 2.639878511428833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121501, + "balance_loss_mlp": 1.08721614, + "epoch": 0.2079646017699115, + "flos": 511136732160.0, + "grad_norm": 0.059797822691547486, + "language_loss": 0.86745334, + "learning_rate": 0.0009192277885263718, + "loss": 0.87866837, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.34326172, + "step": 1081, + "time_per_iteration": 4.060026407241821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120473, + "balance_loss_mlp": 1.08671248, + "epoch": 0.20815698345517505, + "flos": 931409929728.0, + "grad_norm": 0.0682125291941454, + "language_loss": 0.86169523, + "learning_rate": 0.0009190579259612602, + "loss": 0.87289995, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.33789062, + "step": 1082, + "time_per_iteration": 3.2795815467834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134326, + "balance_loss_mlp": 1.10132933, + "epoch": 0.20834936514043864, + "flos": 632114205696.0, + "grad_norm": 0.06852391956291448, + "language_loss": 0.86675245, + "learning_rate": 0.000918887900703433, + "loss": 0.87809569, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.33007812, + "step": 1083, + "time_per_iteration": 2.813777208328247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137242, + "balance_loss_mlp": 1.1025995, + "epoch": 0.2085417468257022, + "flos": 394170693120.0, + "grad_norm": 0.07184608102087402, + "language_loss": 0.90139276, + "learning_rate": 0.0009187177128188999, + "loss": 0.91276515, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.34667969, + "step": 1084, + "time_per_iteration": 2.4950854778289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01361857, + "balance_loss_mlp": 1.30883229, + "epoch": 0.20873412851096576, + "flos": 1401387969024.0, + "grad_norm": 0.057507491560350586, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78518397, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.53125, + "step": 1085, + "time_per_iteration": 4.9323132038116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116843, + "balance_loss_mlp": 1.08279717, + "epoch": 0.20892651019622932, + "flos": 447599112192.0, + "grad_norm": 0.0734883897044225, + "language_loss": 0.85634506, + "learning_rate": 0.000918376849434071, + "loss": 0.86751348, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.34057617, + "step": 1086, + "time_per_iteration": 2.504467487335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110856, + "balance_loss_mlp": 1.07680964, + "epoch": 0.20911889188149288, + "flos": 492863629824.0, + "grad_norm": 0.07305298195252904, + "language_loss": 0.90630972, + "learning_rate": 0.0009182061740661098, + "loss": 0.91741836, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.34057617, + "step": 1087, + "time_per_iteration": 2.5760254859924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111346, + "balance_loss_mlp": 1.0785315, + "epoch": 0.20931127356675644, + "flos": 840928946688.0, + "grad_norm": 0.05349746945174757, + "language_loss": 0.84760422, + "learning_rate": 0.0009180353363361127, + "loss": 0.85873878, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.34912109, + "step": 1088, + "time_per_iteration": 3.0988333225250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111767, + "balance_loss_mlp": 1.07593286, + "epoch": 0.20950365525202, + "flos": 756796013568.0, + "grad_norm": 0.0658577902216117, + "language_loss": 0.81715566, + "learning_rate": 0.0009178643363104044, + "loss": 0.82827336, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.35864258, + "step": 1089, + "time_per_iteration": 3.1410629749298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106557, + "balance_loss_mlp": 1.07155704, + "epoch": 0.20969603693728356, + "flos": 472301812224.0, + "grad_norm": 0.10460691940838339, + "language_loss": 0.90569937, + "learning_rate": 0.0009176931740553735, + "loss": 0.91676497, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.35009766, + "step": 1090, + "time_per_iteration": 2.529330253601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112911, + "balance_loss_mlp": 1.07698107, + "epoch": 0.20988841862254715, + "flos": 976507121664.0, + "grad_norm": 0.07113631656774884, + "language_loss": 0.82557011, + "learning_rate": 0.0009175218496374708, + "loss": 0.83669925, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.359375, + "step": 1091, + "time_per_iteration": 3.347742795944214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110472, + "balance_loss_mlp": 1.07356465, + "epoch": 0.2100808003078107, + "flos": 1092697731072.0, + "grad_norm": 0.08284412758413852, + "language_loss": 0.85813856, + "learning_rate": 0.0009173503631232103, + "loss": 0.86924326, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.36914062, + "step": 1092, + "time_per_iteration": 3.378859758377075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103445, + "balance_loss_mlp": 1.06684804, + "epoch": 0.21027318199307427, + "flos": 1012567468032.0, + "grad_norm": 0.09413161778101656, + "language_loss": 0.81595004, + "learning_rate": 0.0009171787145791691, + "loss": 0.82698447, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.36621094, + "step": 1093, + "time_per_iteration": 3.215574026107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099602, + "balance_loss_mlp": 1.06214595, + "epoch": 0.21046556367833782, + "flos": 521141216256.0, + "grad_norm": 0.0806437411167059, + "language_loss": 0.80327773, + "learning_rate": 0.000917006904071987, + "loss": 0.81427377, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.37451172, + "step": 1094, + "time_per_iteration": 2.6117537021636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100105, + "balance_loss_mlp": 1.06377053, + "epoch": 0.21065794536360138, + "flos": 603437948928.0, + "grad_norm": 0.08991830585001004, + "language_loss": 0.87576157, + "learning_rate": 0.0009168349316683669, + "loss": 0.88676262, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.36352539, + "step": 1095, + "time_per_iteration": 2.740950345993042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101819, + "balance_loss_mlp": 1.06650949, + "epoch": 0.21085032704886494, + "flos": 603045070848.0, + "grad_norm": 0.06267137937039592, + "language_loss": 0.8218863, + "learning_rate": 0.0009166627974350741, + "loss": 0.83290446, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.35327148, + "step": 1096, + "time_per_iteration": 2.887326240539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098665, + "balance_loss_mlp": 1.06206763, + "epoch": 0.2110427087341285, + "flos": 637382697984.0, + "grad_norm": 0.07019696164219995, + "language_loss": 0.89238816, + "learning_rate": 0.0009164905014389373, + "loss": 0.90337479, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.3659668, + "step": 1097, + "time_per_iteration": 2.7609455585479736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105326, + "balance_loss_mlp": 1.06908655, + "epoch": 0.21123509041939206, + "flos": 522667496448.0, + "grad_norm": 0.06528725154368942, + "language_loss": 0.8638711, + "learning_rate": 0.0009163180437468476, + "loss": 0.87492442, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.36254883, + "step": 1098, + "time_per_iteration": 2.5998973846435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096402, + "balance_loss_mlp": 1.06009042, + "epoch": 0.21142747210465565, + "flos": 450938271744.0, + "grad_norm": 0.06547964129234486, + "language_loss": 0.85908926, + "learning_rate": 0.000916145424425759, + "loss": 0.87005323, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.36303711, + "step": 1099, + "time_per_iteration": 2.6804425716400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101989, + "balance_loss_mlp": 1.06601155, + "epoch": 0.2116198537899192, + "flos": 875813630976.0, + "grad_norm": 0.08063804967749887, + "language_loss": 0.90475744, + "learning_rate": 0.0009159726435426885, + "loss": 0.91577733, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.35986328, + "step": 1100, + "time_per_iteration": 3.1017394065856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100921, + "balance_loss_mlp": 1.06499124, + "epoch": 0.21181223547518277, + "flos": 523410992640.0, + "grad_norm": 0.08023517310436831, + "language_loss": 0.90250683, + "learning_rate": 0.0009157997011647154, + "loss": 0.9135161, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.359375, + "step": 1101, + "time_per_iteration": 2.5878560543060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096622, + "balance_loss_mlp": 1.06045425, + "epoch": 0.21200461716044633, + "flos": 572014669824.0, + "grad_norm": 0.05508329212621071, + "language_loss": 0.86001104, + "learning_rate": 0.0009156265973589817, + "loss": 0.87097728, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.36206055, + "step": 1102, + "time_per_iteration": 2.7933261394500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097006, + "balance_loss_mlp": 1.06121981, + "epoch": 0.2121969988457099, + "flos": 544869075456.0, + "grad_norm": 0.06583201442001711, + "language_loss": 0.89802408, + "learning_rate": 0.0009154533321926926, + "loss": 0.90899414, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.35791016, + "step": 1103, + "time_per_iteration": 2.647494316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096343, + "balance_loss_mlp": 1.0598892, + "epoch": 0.21238938053097345, + "flos": 843489704448.0, + "grad_norm": 0.06603869229078199, + "language_loss": 0.87027407, + "learning_rate": 0.0009152799057331156, + "loss": 0.88123751, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.36499023, + "step": 1104, + "time_per_iteration": 3.1623916625976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097231, + "balance_loss_mlp": 1.06134939, + "epoch": 0.212581762216237, + "flos": 445984081920.0, + "grad_norm": 0.07161611233178561, + "language_loss": 0.90831178, + "learning_rate": 0.0009151063180475805, + "loss": 0.91928405, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.35913086, + "step": 1105, + "time_per_iteration": 2.5515594482421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099591, + "balance_loss_mlp": 1.06516361, + "epoch": 0.21277414390150057, + "flos": 514129655808.0, + "grad_norm": 0.08899576142412509, + "language_loss": 0.83941323, + "learning_rate": 0.0009149325692034803, + "loss": 0.85040915, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.34472656, + "step": 1106, + "time_per_iteration": 2.561875343322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300575, + "balance_loss_mlp": 1.25708735, + "epoch": 0.21296652558676413, + "flos": 1484790552576.0, + "grad_norm": 0.05662804479307553, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80504, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.43554688, + "step": 1107, + "time_per_iteration": 4.880220174789429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104038, + "balance_loss_mlp": 1.06870413, + "epoch": 0.21315890727202771, + "flos": 845689669632.0, + "grad_norm": 0.06711298172071122, + "language_loss": 0.87037283, + "learning_rate": 0.0009145845883094678, + "loss": 0.88141322, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.35375977, + "step": 1108, + "time_per_iteration": 3.0598409175872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105103, + "balance_loss_mlp": 1.06931639, + "epoch": 0.21335128895729127, + "flos": 629086376448.0, + "grad_norm": 0.06803775359788228, + "language_loss": 0.8464098, + "learning_rate": 0.000914410356394654, + "loss": 0.85746086, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.35839844, + "step": 1109, + "time_per_iteration": 2.776258945465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103906, + "balance_loss_mlp": 1.06799972, + "epoch": 0.21354367064255483, + "flos": 710649787392.0, + "grad_norm": 0.052025780444459935, + "language_loss": 0.84733951, + "learning_rate": 0.0009142359635914709, + "loss": 0.85837853, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.35913086, + "step": 1110, + "time_per_iteration": 3.057307243347168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096278, + "balance_loss_mlp": 1.05996692, + "epoch": 0.2137360523278184, + "flos": 455950688256.0, + "grad_norm": 0.10914443694781037, + "language_loss": 0.84286684, + "learning_rate": 0.0009140614099676245, + "loss": 0.85382962, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.36328125, + "step": 1111, + "time_per_iteration": 2.6110692024230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087884, + "balance_loss_mlp": 1.0517633, + "epoch": 0.21392843401308195, + "flos": 665749034496.0, + "grad_norm": 0.09545242357915729, + "language_loss": 0.82540983, + "learning_rate": 0.0009138866955908821, + "loss": 0.83628869, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.36132812, + "step": 1112, + "time_per_iteration": 2.870765209197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100291, + "balance_loss_mlp": 1.06445658, + "epoch": 0.2141208156983455, + "flos": 748656843264.0, + "grad_norm": 0.06321568237144509, + "language_loss": 0.8048408, + "learning_rate": 0.0009137118205290738, + "loss": 0.8158437, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.35864258, + "step": 1113, + "time_per_iteration": 4.381570100784302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097039, + "balance_loss_mlp": 1.06091869, + "epoch": 0.21431319738360907, + "flos": 418898124288.0, + "grad_norm": 0.06328361159326604, + "language_loss": 0.89779603, + "learning_rate": 0.0009135367848500924, + "loss": 0.90876651, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.36157227, + "step": 1114, + "time_per_iteration": 2.511164665222168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096629, + "balance_loss_mlp": 1.06034184, + "epoch": 0.21450557906887263, + "flos": 608849035776.0, + "grad_norm": 0.08987717155463379, + "language_loss": 0.86417669, + "learning_rate": 0.0009133615886218927, + "loss": 0.87514299, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.36303711, + "step": 1115, + "time_per_iteration": 2.7101125717163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089806, + "balance_loss_mlp": 1.05337584, + "epoch": 0.21469796075413622, + "flos": 561649393152.0, + "grad_norm": 0.07119429557645003, + "language_loss": 0.87869287, + "learning_rate": 0.0009131862319124917, + "loss": 0.88959092, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.36425781, + "step": 1116, + "time_per_iteration": 2.6387155055999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092992, + "balance_loss_mlp": 1.05648971, + "epoch": 0.21489034243939978, + "flos": 594363225600.0, + "grad_norm": 0.06965010238630005, + "language_loss": 0.83447617, + "learning_rate": 0.0009130107147899691, + "loss": 0.84540606, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.36499023, + "step": 1117, + "time_per_iteration": 2.723092794418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094347, + "balance_loss_mlp": 1.05805993, + "epoch": 0.21508272412466334, + "flos": 441661317120.0, + "grad_norm": 0.055087901571477416, + "language_loss": 0.84983969, + "learning_rate": 0.0009128350373224665, + "loss": 0.8607831, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.36352539, + "step": 1118, + "time_per_iteration": 2.5449509620666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178954, + "balance_loss_mlp": 1.14500344, + "epoch": 0.2152751058099269, + "flos": 1495397348352.0, + "grad_norm": 0.021865185871831474, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82635385, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.33984375, + "step": 1119, + "time_per_iteration": 4.641271114349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103175, + "balance_loss_mlp": 1.06648207, + "epoch": 0.21546748749519046, + "flos": 493759895040.0, + "grad_norm": 0.07523243301623007, + "language_loss": 0.85678464, + "learning_rate": 0.0009124832016254005, + "loss": 0.86781639, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.36694336, + "step": 1120, + "time_per_iteration": 2.655371904373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109795, + "balance_loss_mlp": 1.06163859, + "epoch": 0.21565986918045402, + "flos": 634241387520.0, + "grad_norm": 0.07092227494936269, + "language_loss": 0.87677884, + "learning_rate": 0.0009123070435324316, + "loss": 0.88775837, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.36352539, + "step": 1121, + "time_per_iteration": 2.777632236480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166186, + "balance_loss_mlp": 1.13337982, + "epoch": 0.21585225086571758, + "flos": 1582502704128.0, + "grad_norm": 0.01899876446696313, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.7904197, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.328125, + "step": 1122, + "time_per_iteration": 4.966520547866821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089218, + "balance_loss_mlp": 1.0522635, + "epoch": 0.21604463255098114, + "flos": 683799556608.0, + "grad_norm": 0.060329223802114536, + "language_loss": 0.86415493, + "learning_rate": 0.0009119542471995752, + "loss": 0.87504709, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.36938477, + "step": 1123, + "time_per_iteration": 2.8373889923095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090311, + "balance_loss_mlp": 1.05438125, + "epoch": 0.2162370142362447, + "flos": 780660675072.0, + "grad_norm": 0.06176848453484022, + "language_loss": 0.81323773, + "learning_rate": 0.0009117776090966554, + "loss": 0.82414079, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.359375, + "step": 1124, + "time_per_iteration": 2.999127149581909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087355, + "balance_loss_mlp": 1.0507102, + "epoch": 0.21642939592150828, + "flos": 1001745294336.0, + "grad_norm": 0.07470238986110685, + "language_loss": 0.86757743, + "learning_rate": 0.0009116008111274899, + "loss": 0.87845105, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.36669922, + "step": 1125, + "time_per_iteration": 3.3534371852874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160744, + "balance_loss_mlp": 1.13022673, + "epoch": 0.21662177760677184, + "flos": 1481867440128.0, + "grad_norm": 0.021433456679081614, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80267668, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.3046875, + "step": 1126, + "time_per_iteration": 4.8522608280181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086571, + "balance_loss_mlp": 1.04975939, + "epoch": 0.2168141592920354, + "flos": 887030092800.0, + "grad_norm": 0.07895568764354688, + "language_loss": 0.85050654, + "learning_rate": 0.0009112467358650396, + "loss": 0.86137229, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.36816406, + "step": 1127, + "time_per_iteration": 3.157684803009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090796, + "balance_loss_mlp": 1.05472374, + "epoch": 0.21700654097729896, + "flos": 545682382848.0, + "grad_norm": 0.05660039583272807, + "language_loss": 0.86175025, + "learning_rate": 0.0009110694587092192, + "loss": 0.87265825, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.36108398, + "step": 1128, + "time_per_iteration": 2.755575656890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088206, + "balance_loss_mlp": 1.052562, + "epoch": 0.21719892266256252, + "flos": 509270008320.0, + "grad_norm": 0.077592311143443, + "language_loss": 0.81304091, + "learning_rate": 0.0009108920219620815, + "loss": 0.82392299, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.35693359, + "step": 1129, + "time_per_iteration": 2.639261484146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091101, + "balance_loss_mlp": 1.05548096, + "epoch": 0.21739130434782608, + "flos": 543150738432.0, + "grad_norm": 0.06998872933736075, + "language_loss": 0.8949976, + "learning_rate": 0.0009107144256925133, + "loss": 0.90590858, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.35620117, + "step": 1130, + "time_per_iteration": 2.685058832168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096157, + "balance_loss_mlp": 1.0606091, + "epoch": 0.21758368603308964, + "flos": 616564804608.0, + "grad_norm": 0.08228743876345572, + "language_loss": 0.81527102, + "learning_rate": 0.0009105366699694638, + "loss": 0.82623267, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.35546875, + "step": 1131, + "time_per_iteration": 2.726532220840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087405, + "balance_loss_mlp": 1.0526911, + "epoch": 0.2177760677183532, + "flos": 634813175808.0, + "grad_norm": 0.05363867293402688, + "language_loss": 0.81731898, + "learning_rate": 0.0009103587548619439, + "loss": 0.82819301, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.34741211, + "step": 1132, + "time_per_iteration": 2.856782913208008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096099, + "balance_loss_mlp": 1.05978799, + "epoch": 0.2179684494036168, + "flos": 532181587968.0, + "grad_norm": 0.0659512575968049, + "language_loss": 0.85836411, + "learning_rate": 0.0009101806804390261, + "loss": 0.8693251, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.36328125, + "step": 1133, + "time_per_iteration": 2.789860725402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093043, + "balance_loss_mlp": 1.056494, + "epoch": 0.21816083108888035, + "flos": 474980433408.0, + "grad_norm": 0.06887538910693401, + "language_loss": 0.90261114, + "learning_rate": 0.0009100024467698453, + "loss": 0.91354156, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.3659668, + "step": 1134, + "time_per_iteration": 2.6074166297912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095464, + "balance_loss_mlp": 1.05786586, + "epoch": 0.2183532127741439, + "flos": 577198794240.0, + "grad_norm": 0.07516267041517319, + "language_loss": 0.82424915, + "learning_rate": 0.0009098240539235981, + "loss": 0.83520383, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.37573242, + "step": 1135, + "time_per_iteration": 2.6695401668548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095721, + "balance_loss_mlp": 1.05809808, + "epoch": 0.21854559445940747, + "flos": 593832135168.0, + "grad_norm": 0.07818229339121877, + "language_loss": 0.87811279, + "learning_rate": 0.0009096455019695423, + "loss": 0.88907003, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.3762207, + "step": 1136, + "time_per_iteration": 4.259606838226318 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088969, + "balance_loss_mlp": 1.05180001, + "epoch": 0.21873797614467103, + "flos": 408464446464.0, + "grad_norm": 0.07138569527580692, + "language_loss": 0.89539087, + "learning_rate": 0.000909466790976998, + "loss": 0.90628058, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.37182617, + "step": 1137, + "time_per_iteration": 2.4586610794067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086709, + "balance_loss_mlp": 1.0483948, + "epoch": 0.21893035782993459, + "flos": 893824865280.0, + "grad_norm": 0.07428895088203294, + "language_loss": 0.82083362, + "learning_rate": 0.0009092879210153473, + "loss": 0.83170068, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.38305664, + "step": 1138, + "time_per_iteration": 3.097928524017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087285, + "balance_loss_mlp": 1.04944801, + "epoch": 0.21912273951519814, + "flos": 467392702464.0, + "grad_norm": 0.07001266476470332, + "language_loss": 0.88581419, + "learning_rate": 0.0009091088921540333, + "loss": 0.89668703, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.37817383, + "step": 1139, + "time_per_iteration": 2.5904369354248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138075, + "balance_loss_mlp": 1.11270714, + "epoch": 0.2193151212004617, + "flos": 1531262665728.0, + "grad_norm": 0.032290681216211516, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76646751, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.25390625, + "step": 1140, + "time_per_iteration": 4.913591623306274 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090399, + "balance_loss_mlp": 1.05353999, + "epoch": 0.2195075028857253, + "flos": 590901820416.0, + "grad_norm": 0.1397659602768512, + "language_loss": 0.84288347, + "learning_rate": 0.0009087503580104985, + "loss": 0.85378748, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.36865234, + "step": 1141, + "time_per_iteration": 2.6825575828552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103171, + "balance_loss_mlp": 1.06602514, + "epoch": 0.21969988457098885, + "flos": 636033917952.0, + "grad_norm": 0.0722566511462073, + "language_loss": 0.79141879, + "learning_rate": 0.0009085708528674728, + "loss": 0.80245048, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.37133789, + "step": 1142, + "time_per_iteration": 2.8078551292419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104306, + "balance_loss_mlp": 1.06551528, + "epoch": 0.2198922662562524, + "flos": 911974311936.0, + "grad_norm": 0.06720954872782575, + "language_loss": 0.8638975, + "learning_rate": 0.0009083911891031745, + "loss": 0.87494051, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.38793945, + "step": 1143, + "time_per_iteration": 3.1356892585754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110101, + "balance_loss_mlp": 1.07328963, + "epoch": 0.22008464794151597, + "flos": 822603409920.0, + "grad_norm": 0.08162422903338651, + "language_loss": 0.91253042, + "learning_rate": 0.0009082113667873553, + "loss": 0.92363143, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.3684082, + "step": 1144, + "time_per_iteration": 3.1446304321289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112165, + "balance_loss_mlp": 1.07387483, + "epoch": 0.22027702962677953, + "flos": 459416475648.0, + "grad_norm": 0.0676762249982335, + "language_loss": 0.90471655, + "learning_rate": 0.0009080313859898283, + "loss": 0.91583818, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.38256836, + "step": 1145, + "time_per_iteration": 2.5298025608062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110814, + "balance_loss_mlp": 1.07082736, + "epoch": 0.2204694113120431, + "flos": 530998723584.0, + "grad_norm": 0.13336101787368373, + "language_loss": 0.91929018, + "learning_rate": 0.0009078512467804684, + "loss": 0.93037164, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.37304688, + "step": 1146, + "time_per_iteration": 2.6156158447265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105973, + "balance_loss_mlp": 1.06882787, + "epoch": 0.22066179299730665, + "flos": 522382307328.0, + "grad_norm": 0.06165136945539885, + "language_loss": 0.89993024, + "learning_rate": 0.0009076709492292119, + "loss": 0.91098994, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.37133789, + "step": 1147, + "time_per_iteration": 2.617534875869751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095299, + "balance_loss_mlp": 1.06032324, + "epoch": 0.2208541746825702, + "flos": 546188742144.0, + "grad_norm": 0.11177878536303132, + "language_loss": 0.88637269, + "learning_rate": 0.0009074904934060562, + "loss": 0.89732569, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.34985352, + "step": 1148, + "time_per_iteration": 2.6782190799713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086783, + "balance_loss_mlp": 1.05237889, + "epoch": 0.22104655636783377, + "flos": 708404742144.0, + "grad_norm": 0.0637571078176039, + "language_loss": 0.84905714, + "learning_rate": 0.0009073098793810607, + "loss": 0.85992491, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.34423828, + "step": 1149, + "time_per_iteration": 2.956638813018799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085311, + "balance_loss_mlp": 1.04969168, + "epoch": 0.22123893805309736, + "flos": 584594468352.0, + "grad_norm": 0.07731387173425769, + "language_loss": 0.8803097, + "learning_rate": 0.000907129107224346, + "loss": 0.89116287, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.35595703, + "step": 1150, + "time_per_iteration": 2.724456548690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081381, + "balance_loss_mlp": 1.04623771, + "epoch": 0.22143131973836092, + "flos": 492002270208.0, + "grad_norm": 0.0527541061714234, + "language_loss": 0.88156152, + "learning_rate": 0.0009069481770060939, + "loss": 0.89237529, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.35180664, + "step": 1151, + "time_per_iteration": 2.6539950370788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084376, + "balance_loss_mlp": 1.04811299, + "epoch": 0.22162370142362448, + "flos": 1079227459584.0, + "grad_norm": 0.06610336138884995, + "language_loss": 0.83768857, + "learning_rate": 0.000906767088796548, + "loss": 0.84853232, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.36279297, + "step": 1152, + "time_per_iteration": 3.4304041862487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087423, + "balance_loss_mlp": 1.05147004, + "epoch": 0.22181608310888803, + "flos": 492258345984.0, + "grad_norm": 0.06692160227790218, + "language_loss": 0.87012255, + "learning_rate": 0.0009065858426660127, + "loss": 0.88099682, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.35986328, + "step": 1153, + "time_per_iteration": 2.639326333999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089801, + "balance_loss_mlp": 1.05480099, + "epoch": 0.2220084647941516, + "flos": 723687892992.0, + "grad_norm": 0.07963844060104928, + "language_loss": 0.84658396, + "learning_rate": 0.0009064044386848543, + "loss": 0.85748196, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.3503418, + "step": 1154, + "time_per_iteration": 2.904387950897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094705, + "balance_loss_mlp": 1.05992007, + "epoch": 0.22220084647941515, + "flos": 488988997632.0, + "grad_norm": 0.07985092329826342, + "language_loss": 0.88786525, + "learning_rate": 0.0009062228769234997, + "loss": 0.89881229, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.34838867, + "step": 1155, + "time_per_iteration": 2.547041416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095087, + "balance_loss_mlp": 1.05977738, + "epoch": 0.2223932281646787, + "flos": 536025696768.0, + "grad_norm": 0.067267193175655, + "language_loss": 0.80872244, + "learning_rate": 0.0009060411574524376, + "loss": 0.81967336, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.35327148, + "step": 1156, + "time_per_iteration": 2.647101402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100356, + "balance_loss_mlp": 1.06561852, + "epoch": 0.22258560984994227, + "flos": 931034580480.0, + "grad_norm": 0.07018019580992392, + "language_loss": 0.87947989, + "learning_rate": 0.0009058592803422178, + "loss": 0.8904835, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.34765625, + "step": 1157, + "time_per_iteration": 3.161827564239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087783, + "balance_loss_mlp": 1.05688405, + "epoch": 0.22277799153520586, + "flos": 1198998443520.0, + "grad_norm": 0.0269537140509509, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79798073, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.30859375, + "step": 1158, + "time_per_iteration": 4.827271223068237 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100633, + "balance_loss_mlp": 1.06608617, + "epoch": 0.22297037322046942, + "flos": 501052262400.0, + "grad_norm": 0.10870396219255896, + "language_loss": 0.89957273, + "learning_rate": 0.00090549505348681, + "loss": 0.91057909, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.34594727, + "step": 1159, + "time_per_iteration": 2.5724213123321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115899, + "balance_loss_mlp": 1.08144796, + "epoch": 0.22316275490573298, + "flos": 752413612032.0, + "grad_norm": 0.06607938149323832, + "language_loss": 0.83976638, + "learning_rate": 0.0009053127038830275, + "loss": 0.85092539, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.3449707, + "step": 1160, + "time_per_iteration": 2.979442834854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108838, + "balance_loss_mlp": 1.07538772, + "epoch": 0.22335513659099654, + "flos": 514553057280.0, + "grad_norm": 0.07010640296313479, + "language_loss": 0.86946774, + "learning_rate": 0.000905130196922898, + "loss": 0.88055611, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.3347168, + "step": 1161, + "time_per_iteration": 2.582780361175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114166, + "balance_loss_mlp": 1.0797379, + "epoch": 0.2235475182762601, + "flos": 484286501376.0, + "grad_norm": 0.056850955952103474, + "language_loss": 0.86954904, + "learning_rate": 0.0009049475326772769, + "loss": 0.88069069, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.34472656, + "step": 1162, + "time_per_iteration": 2.572434902191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116085, + "balance_loss_mlp": 1.08270645, + "epoch": 0.22373989996152366, + "flos": 469698794496.0, + "grad_norm": 0.07142312953148652, + "language_loss": 0.82233834, + "learning_rate": 0.0009047647112170811, + "loss": 0.83349919, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.33398438, + "step": 1163, + "time_per_iteration": 2.7467033863067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104962, + "balance_loss_mlp": 1.07115388, + "epoch": 0.22393228164678722, + "flos": 1270512594432.0, + "grad_norm": 0.07009650422776509, + "language_loss": 0.87291974, + "learning_rate": 0.0009045817326132876, + "loss": 0.88396937, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.33837891, + "step": 1164, + "time_per_iteration": 3.6699986457824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096597, + "balance_loss_mlp": 1.06150198, + "epoch": 0.22412466333205078, + "flos": 596052449280.0, + "grad_norm": 0.07687995911666942, + "language_loss": 0.8312459, + "learning_rate": 0.0009043985969369357, + "loss": 0.84221184, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.35131836, + "step": 1165, + "time_per_iteration": 2.8716225624084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099545, + "balance_loss_mlp": 1.06461644, + "epoch": 0.22431704501731436, + "flos": 608136062976.0, + "grad_norm": 0.062241931717823204, + "language_loss": 0.84419966, + "learning_rate": 0.0009042153042591245, + "loss": 0.85519511, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.34960938, + "step": 1166, + "time_per_iteration": 2.8038439750671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094194, + "balance_loss_mlp": 1.05971861, + "epoch": 0.22450942670257792, + "flos": 906203842560.0, + "grad_norm": 0.05754676867835885, + "language_loss": 0.85229421, + "learning_rate": 0.0009040318546510146, + "loss": 0.86323619, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.3449707, + "step": 1167, + "time_per_iteration": 3.166391372680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101578, + "balance_loss_mlp": 1.06672144, + "epoch": 0.22470180838784148, + "flos": 565032222720.0, + "grad_norm": 0.06328547350255756, + "language_loss": 0.84822267, + "learning_rate": 0.0009038482481838275, + "loss": 0.85923845, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.34887695, + "step": 1168, + "time_per_iteration": 2.6582534313201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092575, + "balance_loss_mlp": 1.05726552, + "epoch": 0.22489419007310504, + "flos": 834109443072.0, + "grad_norm": 0.05398415615287821, + "language_loss": 0.8685748, + "learning_rate": 0.0009036644849288455, + "loss": 0.87950051, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.35327148, + "step": 1169, + "time_per_iteration": 3.131391763687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102043, + "balance_loss_mlp": 1.06735337, + "epoch": 0.2250865717583686, + "flos": 580788237312.0, + "grad_norm": 0.06156740204868492, + "language_loss": 0.85189641, + "learning_rate": 0.0009034805649574118, + "loss": 0.86291689, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.34716797, + "step": 1170, + "time_per_iteration": 2.662177801132202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093313, + "balance_loss_mlp": 1.05991113, + "epoch": 0.22527895344363216, + "flos": 600091435008.0, + "grad_norm": 0.07489985201842045, + "language_loss": 0.85256809, + "learning_rate": 0.0009032964883409308, + "loss": 0.86350119, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.33422852, + "step": 1171, + "time_per_iteration": 2.872305393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102332, + "balance_loss_mlp": 0.9971894, + "epoch": 0.22547133512889572, + "flos": 1440009073152.0, + "grad_norm": 0.01784679187957182, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74073857, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.26171875, + "step": 1172, + "time_per_iteration": 4.968618154525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090705, + "balance_loss_mlp": 1.05649197, + "epoch": 0.22566371681415928, + "flos": 490377065472.0, + "grad_norm": 0.05674331384718379, + "language_loss": 0.87210125, + "learning_rate": 0.0009029278654587462, + "loss": 0.88300836, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.3425293, + "step": 1173, + "time_per_iteration": 2.5812408924102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085168, + "balance_loss_mlp": 1.05043077, + "epoch": 0.22585609849942284, + "flos": 604334214144.0, + "grad_norm": 0.06970392839419266, + "language_loss": 0.82089472, + "learning_rate": 0.0009027433193361548, + "loss": 0.83174634, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.34765625, + "step": 1174, + "time_per_iteration": 2.7284860610961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090288, + "balance_loss_mlp": 1.0550499, + "epoch": 0.22604848018468643, + "flos": 635280247296.0, + "grad_norm": 0.05615396633220104, + "language_loss": 0.86867499, + "learning_rate": 0.00090255861685474, + "loss": 0.87957788, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.3527832, + "step": 1175, + "time_per_iteration": 2.7265548706054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085885, + "balance_loss_mlp": 1.05040812, + "epoch": 0.22624086186995, + "flos": 479633467392.0, + "grad_norm": 0.06159717434172949, + "language_loss": 0.91109395, + "learning_rate": 0.0009023737580862095, + "loss": 0.92195278, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.35473633, + "step": 1176, + "time_per_iteration": 2.5320050716400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089039, + "balance_loss_mlp": 1.05468273, + "epoch": 0.22643324355521355, + "flos": 495566982144.0, + "grad_norm": 0.05820331342721636, + "language_loss": 0.82901466, + "learning_rate": 0.0009021887431023321, + "loss": 0.83990508, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.34399414, + "step": 1177, + "time_per_iteration": 2.619271755218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094278, + "balance_loss_mlp": 1.05939722, + "epoch": 0.2266256252404771, + "flos": 561271071744.0, + "grad_norm": 0.05650773027793175, + "language_loss": 0.86773884, + "learning_rate": 0.0009020035719749369, + "loss": 0.8786816, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.34912109, + "step": 1178, + "time_per_iteration": 2.7209300994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010885, + "balance_loss_mlp": 1.05536032, + "epoch": 0.22681800692574067, + "flos": 579353527296.0, + "grad_norm": 0.07505314575513819, + "language_loss": 0.77450001, + "learning_rate": 0.0009018182447759136, + "loss": 0.78538495, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.33154297, + "step": 1179, + "time_per_iteration": 2.957627534866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092549, + "balance_loss_mlp": 1.05793107, + "epoch": 0.22701038861100423, + "flos": 739842577920.0, + "grad_norm": 0.0724719412784609, + "language_loss": 0.79327267, + "learning_rate": 0.0009016327615772126, + "loss": 0.80419827, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.34619141, + "step": 1180, + "time_per_iteration": 2.9636237621307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098683, + "balance_loss_mlp": 1.06425512, + "epoch": 0.2272027702962678, + "flos": 576996562944.0, + "grad_norm": 0.06868963719018656, + "language_loss": 0.87725425, + "learning_rate": 0.0009014471224508451, + "loss": 0.88824105, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.34448242, + "step": 1181, + "time_per_iteration": 2.6756978034973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101065, + "balance_loss_mlp": 1.06725717, + "epoch": 0.22739515198153135, + "flos": 544012098048.0, + "grad_norm": 0.08625014316755293, + "language_loss": 0.8279528, + "learning_rate": 0.0009012613274688823, + "loss": 0.83896345, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.33837891, + "step": 1182, + "time_per_iteration": 2.679690361022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106597, + "balance_loss_mlp": 1.0716213, + "epoch": 0.22758753366679493, + "flos": 439932805632.0, + "grad_norm": 0.07160666852762332, + "language_loss": 0.87420428, + "learning_rate": 0.0009010753767034565, + "loss": 0.8852703, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.35009766, + "step": 1183, + "time_per_iteration": 2.56422758102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110957, + "balance_loss_mlp": 1.07514668, + "epoch": 0.2277799153520585, + "flos": 729104772096.0, + "grad_norm": 0.07593119142071596, + "language_loss": 0.7905606, + "learning_rate": 0.0009008892702267599, + "loss": 0.80167019, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.35839844, + "step": 1184, + "time_per_iteration": 2.96954345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138099, + "balance_loss_mlp": 1.10255075, + "epoch": 0.22797229703732205, + "flos": 526641053184.0, + "grad_norm": 0.08993468677273868, + "language_loss": 0.88719535, + "learning_rate": 0.0009007030081110457, + "loss": 0.89857626, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.35571289, + "step": 1185, + "time_per_iteration": 2.639239549636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124853, + "balance_loss_mlp": 1.08923352, + "epoch": 0.2281646787225856, + "flos": 535159954944.0, + "grad_norm": 0.08461110053036625, + "language_loss": 0.84618473, + "learning_rate": 0.000900516590428627, + "loss": 0.85743326, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.35668945, + "step": 1186, + "time_per_iteration": 2.6506764888763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120731, + "balance_loss_mlp": 1.08637488, + "epoch": 0.22835706040784917, + "flos": 541107924480.0, + "grad_norm": 0.07299458038970587, + "language_loss": 0.89267749, + "learning_rate": 0.0009003300172518778, + "loss": 0.90388483, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.34399414, + "step": 1187, + "time_per_iteration": 2.6919267177581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107415, + "balance_loss_mlp": 1.07291603, + "epoch": 0.22854944209311273, + "flos": 790297012224.0, + "grad_norm": 0.06786881834878318, + "language_loss": 0.83963048, + "learning_rate": 0.0009001432886532321, + "loss": 0.85070467, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.34521484, + "step": 1188, + "time_per_iteration": 2.9668681621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103209, + "balance_loss_mlp": 1.07002091, + "epoch": 0.2287418237783763, + "flos": 469047020544.0, + "grad_norm": 0.06096375157572686, + "language_loss": 0.86560941, + "learning_rate": 0.0008999564047051843, + "loss": 0.87664151, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.33203125, + "step": 1189, + "time_per_iteration": 2.520157814025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103848, + "balance_loss_mlp": 1.07070816, + "epoch": 0.22893420546363985, + "flos": 467786990592.0, + "grad_norm": 0.07257222459915597, + "language_loss": 0.84934878, + "learning_rate": 0.0008997693654802894, + "loss": 0.86038733, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.33154297, + "step": 1190, + "time_per_iteration": 2.6376004219055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117207, + "balance_loss_mlp": 1.08375657, + "epoch": 0.22912658714890344, + "flos": 625974179328.0, + "grad_norm": 0.056681488577390256, + "language_loss": 0.86392069, + "learning_rate": 0.0008995821710511625, + "loss": 0.87509274, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.3347168, + "step": 1191, + "time_per_iteration": 2.727444887161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116667, + "balance_loss_mlp": 1.08369398, + "epoch": 0.229318968834167, + "flos": 502785156096.0, + "grad_norm": 0.06323137320540088, + "language_loss": 0.85004956, + "learning_rate": 0.0008993948214904786, + "loss": 0.86121625, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.32983398, + "step": 1192, + "time_per_iteration": 2.5774295330047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086531, + "balance_loss_mlp": 1.06097257, + "epoch": 0.22951135051943056, + "flos": 1374108544512.0, + "grad_norm": 0.030992800338245956, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79508746, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.25585938, + "step": 1193, + "time_per_iteration": 4.854384422302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122219, + "balance_loss_mlp": 1.08934152, + "epoch": 0.22970373220469412, + "flos": 644045050368.0, + "grad_norm": 0.06852039575110529, + "language_loss": 0.7808823, + "learning_rate": 0.0008990196572654427, + "loss": 0.79210448, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.32861328, + "step": 1194, + "time_per_iteration": 2.873081684112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112553, + "balance_loss_mlp": 1.07943714, + "epoch": 0.22989611388995768, + "flos": 499945001472.0, + "grad_norm": 0.05701230798072306, + "language_loss": 0.87415946, + "learning_rate": 0.0008988318427467426, + "loss": 0.88528502, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.33105469, + "step": 1195, + "time_per_iteration": 2.702685832977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109796, + "balance_loss_mlp": 1.06522477, + "epoch": 0.23008849557522124, + "flos": 1096071796224.0, + "grad_norm": 0.06940657308766013, + "language_loss": 0.85968834, + "learning_rate": 0.0008986438733877887, + "loss": 0.87066793, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.32739258, + "step": 1196, + "time_per_iteration": 3.4571969509124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096888, + "balance_loss_mlp": 1.06482017, + "epoch": 0.2302808772604848, + "flos": 683313546240.0, + "grad_norm": 0.04726997036122248, + "language_loss": 0.83756924, + "learning_rate": 0.0008984557492615576, + "loss": 0.8485381, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.32055664, + "step": 1197, + "time_per_iteration": 2.9306819438934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090156, + "balance_loss_mlp": 1.05718327, + "epoch": 0.23047325894574835, + "flos": 528664928256.0, + "grad_norm": 0.05994921168989351, + "language_loss": 0.89349306, + "learning_rate": 0.0008982674704410854, + "loss": 0.90439463, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.32983398, + "step": 1198, + "time_per_iteration": 2.706496238708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089604, + "balance_loss_mlp": 1.05648804, + "epoch": 0.23066564063101191, + "flos": 682427455488.0, + "grad_norm": 0.06548245075345789, + "language_loss": 0.7739616, + "learning_rate": 0.0008980790369994682, + "loss": 0.78485769, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.33129883, + "step": 1199, + "time_per_iteration": 2.962169647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109754, + "balance_loss_mlp": 1.06375623, + "epoch": 0.2308580223162755, + "flos": 558247624704.0, + "grad_norm": 0.06722903582933262, + "language_loss": 0.86851013, + "learning_rate": 0.000897890449009863, + "loss": 0.87948549, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.33813477, + "step": 1200, + "time_per_iteration": 2.6820433139801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092921, + "balance_loss_mlp": 1.05877972, + "epoch": 0.23105040400153906, + "flos": 555406060032.0, + "grad_norm": 0.051980143810921, + "language_loss": 0.89933294, + "learning_rate": 0.0008977017065454853, + "loss": 0.91026211, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.34179688, + "step": 1201, + "time_per_iteration": 2.6699435710906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098988, + "balance_loss_mlp": 1.0640595, + "epoch": 0.23124278568680262, + "flos": 704474855424.0, + "grad_norm": 0.0699249838794834, + "language_loss": 0.80333388, + "learning_rate": 0.0008975128096796121, + "loss": 0.81432372, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.34936523, + "step": 1202, + "time_per_iteration": 2.891552448272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096208, + "balance_loss_mlp": 1.0627346, + "epoch": 0.23143516737206618, + "flos": 612469002240.0, + "grad_norm": 0.08096245126913681, + "language_loss": 0.85447264, + "learning_rate": 0.0008973237584855794, + "loss": 0.86543471, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.33496094, + "step": 1203, + "time_per_iteration": 2.897143840789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109386, + "balance_loss_mlp": 1.06007552, + "epoch": 0.23162754905732974, + "flos": 389030238720.0, + "grad_norm": 0.07003086272099243, + "language_loss": 0.82261837, + "learning_rate": 0.0008971345530367832, + "loss": 0.83355689, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.33789062, + "step": 1204, + "time_per_iteration": 2.4648683071136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090052, + "balance_loss_mlp": 1.05619669, + "epoch": 0.2318199307425933, + "flos": 667481928192.0, + "grad_norm": 0.0706025487590865, + "language_loss": 0.84670615, + "learning_rate": 0.0008969451934066799, + "loss": 0.85760665, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.33862305, + "step": 1205, + "time_per_iteration": 2.7628865242004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096032, + "balance_loss_mlp": 1.06274843, + "epoch": 0.23201231242785686, + "flos": 666093860352.0, + "grad_norm": 0.07866862210425928, + "language_loss": 0.79702371, + "learning_rate": 0.0008967556796687854, + "loss": 0.80798399, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.33276367, + "step": 1206, + "time_per_iteration": 2.8876569271087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099743, + "balance_loss_mlp": 1.06746101, + "epoch": 0.23220469411312042, + "flos": 748498281984.0, + "grad_norm": 0.05955020850576899, + "language_loss": 0.83383894, + "learning_rate": 0.0008965660118966752, + "loss": 0.84483635, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.32275391, + "step": 1207, + "time_per_iteration": 2.8915722370147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092945, + "balance_loss_mlp": 1.06087792, + "epoch": 0.232397075798384, + "flos": 666763163136.0, + "grad_norm": 0.05733195861059391, + "language_loss": 0.89860612, + "learning_rate": 0.0008963761901639851, + "loss": 0.90953553, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.32055664, + "step": 1208, + "time_per_iteration": 2.839872121810913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100551, + "balance_loss_mlp": 1.06843603, + "epoch": 0.23258945748364757, + "flos": 609937357824.0, + "grad_norm": 0.0677808606719883, + "language_loss": 0.83122128, + "learning_rate": 0.0008961862145444103, + "loss": 0.84222686, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.32104492, + "step": 1209, + "time_per_iteration": 2.723395824432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109998, + "balance_loss_mlp": 1.07726288, + "epoch": 0.23278183916891113, + "flos": 489397842432.0, + "grad_norm": 0.06757554355714504, + "language_loss": 0.8539983, + "learning_rate": 0.0008959960851117059, + "loss": 0.86509824, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.32739258, + "step": 1210, + "time_per_iteration": 2.5843160152435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112763, + "balance_loss_mlp": 1.08055305, + "epoch": 0.23297422085417469, + "flos": 511314232320.0, + "grad_norm": 0.06719057665627333, + "language_loss": 0.83744979, + "learning_rate": 0.0008958058019396868, + "loss": 0.84857744, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.32202148, + "step": 1211, + "time_per_iteration": 2.790137529373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110465, + "balance_loss_mlp": 1.07865953, + "epoch": 0.23316660253943824, + "flos": 546145072128.0, + "grad_norm": 0.061561154104104274, + "language_loss": 0.86634141, + "learning_rate": 0.0008956153651022274, + "loss": 0.877446, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.31787109, + "step": 1212, + "time_per_iteration": 2.6943769454956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107151, + "balance_loss_mlp": 1.07506013, + "epoch": 0.2333589842247018, + "flos": 509998947840.0, + "grad_norm": 0.056352889191353187, + "language_loss": 0.84060359, + "learning_rate": 0.0008954247746732618, + "loss": 0.85167515, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.32080078, + "step": 1213, + "time_per_iteration": 2.635540723800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105894, + "balance_loss_mlp": 1.07504261, + "epoch": 0.23355136590996536, + "flos": 662834686464.0, + "grad_norm": 0.059598265922157306, + "language_loss": 0.90450746, + "learning_rate": 0.0008952340307267837, + "loss": 0.91556644, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.30810547, + "step": 1214, + "time_per_iteration": 2.8842196464538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098908, + "balance_loss_mlp": 1.06817579, + "epoch": 0.23374374759522892, + "flos": 508206417408.0, + "grad_norm": 0.059513387141436946, + "language_loss": 0.83485198, + "learning_rate": 0.0008950431333368468, + "loss": 0.84584105, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.30688477, + "step": 1215, + "time_per_iteration": 2.606269121170044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098575, + "balance_loss_mlp": 1.06662679, + "epoch": 0.2339361292804925, + "flos": 1293964028928.0, + "grad_norm": 0.05495395288746111, + "language_loss": 0.84313607, + "learning_rate": 0.0008948520825775634, + "loss": 0.85412186, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.31933594, + "step": 1216, + "time_per_iteration": 3.6454994678497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099032, + "balance_loss_mlp": 1.06782317, + "epoch": 0.23412851096575607, + "flos": 705617021952.0, + "grad_norm": 0.06066187191945671, + "language_loss": 0.83935732, + "learning_rate": 0.0008946608785231067, + "loss": 0.85034764, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.31176758, + "step": 1217, + "time_per_iteration": 2.9157872200012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098088, + "balance_loss_mlp": 1.06599677, + "epoch": 0.23432089265101963, + "flos": 438036968448.0, + "grad_norm": 0.058216777953853424, + "language_loss": 0.84654021, + "learning_rate": 0.0008944695212477084, + "loss": 0.85752106, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.32080078, + "step": 1218, + "time_per_iteration": 2.473067045211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103602, + "balance_loss_mlp": 1.07158232, + "epoch": 0.2345132743362832, + "flos": 480697058304.0, + "grad_norm": 0.06075167680795146, + "language_loss": 0.86133409, + "learning_rate": 0.0008942780108256599, + "loss": 0.87237012, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.32006836, + "step": 1219, + "time_per_iteration": 2.581594705581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100646, + "balance_loss_mlp": 1.06819737, + "epoch": 0.23470565602154675, + "flos": 411231817728.0, + "grad_norm": 0.07971641299609675, + "language_loss": 0.86269408, + "learning_rate": 0.0008940863473313121, + "loss": 0.87370056, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.32446289, + "step": 1220, + "time_per_iteration": 2.453798532485962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108448, + "balance_loss_mlp": 1.0764761, + "epoch": 0.2348980377068103, + "flos": 545189170176.0, + "grad_norm": 0.07248436265958902, + "language_loss": 0.87226778, + "learning_rate": 0.0008938945308390756, + "loss": 0.88335222, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.31958008, + "step": 1221, + "time_per_iteration": 2.6299164295196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092799, + "balance_loss_mlp": 1.06099391, + "epoch": 0.23509041939207387, + "flos": 575465900544.0, + "grad_norm": 0.0746326386118845, + "language_loss": 0.86801684, + "learning_rate": 0.00089370256142342, + "loss": 0.87894481, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.31787109, + "step": 1222, + "time_per_iteration": 2.7373716831207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099884, + "balance_loss_mlp": 1.0675782, + "epoch": 0.23528280107733743, + "flos": 588568025088.0, + "grad_norm": 0.06792905088784162, + "language_loss": 0.84961808, + "learning_rate": 0.0008935104391588746, + "loss": 0.86061692, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.32299805, + "step": 1223, + "time_per_iteration": 2.786801338195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101357, + "balance_loss_mlp": 1.06850326, + "epoch": 0.235475182762601, + "flos": 822948235776.0, + "grad_norm": 0.053660170998325075, + "language_loss": 0.8281433, + "learning_rate": 0.0008933181641200276, + "loss": 0.83915687, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.32861328, + "step": 1224, + "time_per_iteration": 3.1502432823181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102432, + "balance_loss_mlp": 1.06948209, + "epoch": 0.23566756444786457, + "flos": 679865287680.0, + "grad_norm": 0.06465671729424353, + "language_loss": 0.85675979, + "learning_rate": 0.0008931257363815271, + "loss": 0.86778408, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.32958984, + "step": 1225, + "time_per_iteration": 2.9370880126953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110561, + "balance_loss_mlp": 1.07370961, + "epoch": 0.23585994613312813, + "flos": 701481931776.0, + "grad_norm": 0.07282820073226746, + "language_loss": 0.89753437, + "learning_rate": 0.0008929331560180798, + "loss": 0.9085905, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.31884766, + "step": 1226, + "time_per_iteration": 2.977869749069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122954, + "balance_loss_mlp": 1.09045768, + "epoch": 0.2360523278183917, + "flos": 523923144192.0, + "grad_norm": 0.053569811561680475, + "language_loss": 0.90818799, + "learning_rate": 0.0008927404231044525, + "loss": 0.91941756, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.32495117, + "step": 1227, + "time_per_iteration": 2.683979034423828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111641, + "balance_loss_mlp": 1.07909656, + "epoch": 0.23624470950365525, + "flos": 524027860992.0, + "grad_norm": 0.06109587035495086, + "language_loss": 0.81612283, + "learning_rate": 0.0008925475377154703, + "loss": 0.82723922, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.32543945, + "step": 1228, + "time_per_iteration": 2.734614610671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119771, + "balance_loss_mlp": 1.08577275, + "epoch": 0.2364370911889188, + "flos": 596525313024.0, + "grad_norm": 0.06451716518904643, + "language_loss": 0.82344091, + "learning_rate": 0.0008923544999260183, + "loss": 0.83463866, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.34033203, + "step": 1229, + "time_per_iteration": 2.740309000015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108587, + "balance_loss_mlp": 1.07561386, + "epoch": 0.23662947287418237, + "flos": 756519588864.0, + "grad_norm": 0.0665465772726836, + "language_loss": 0.91460836, + "learning_rate": 0.00089216130981104, + "loss": 0.92569423, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.32983398, + "step": 1230, + "time_per_iteration": 3.1343088150024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103991, + "balance_loss_mlp": 1.07120848, + "epoch": 0.23682185455944593, + "flos": 545907935232.0, + "grad_norm": 0.061759964990198334, + "language_loss": 0.81970417, + "learning_rate": 0.000891967967445539, + "loss": 0.83074409, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.32788086, + "step": 1231, + "time_per_iteration": 2.67669677734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100144, + "balance_loss_mlp": 1.06829166, + "epoch": 0.2370142362447095, + "flos": 661977709056.0, + "grad_norm": 0.04660382532121484, + "language_loss": 0.88927996, + "learning_rate": 0.0008917744729045772, + "loss": 0.90028143, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.31835938, + "step": 1232, + "time_per_iteration": 2.87488055229187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098328, + "balance_loss_mlp": 1.06695223, + "epoch": 0.23720661792997308, + "flos": 683361598464.0, + "grad_norm": 0.054845027384176535, + "language_loss": 0.83439517, + "learning_rate": 0.0008915808262632757, + "loss": 0.84537846, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.31347656, + "step": 1233, + "time_per_iteration": 2.884615659713745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111142, + "balance_loss_mlp": 1.0800519, + "epoch": 0.23739899961523664, + "flos": 558631738368.0, + "grad_norm": 0.058607558308664987, + "language_loss": 0.93242431, + "learning_rate": 0.0008913870275968148, + "loss": 0.94353569, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.31054688, + "step": 1234, + "time_per_iteration": 2.7355458736419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110869, + "balance_loss_mlp": 1.07740974, + "epoch": 0.2375913813005002, + "flos": 889144128000.0, + "grad_norm": 0.0661901036623414, + "language_loss": 0.87537754, + "learning_rate": 0.0008911930769804342, + "loss": 0.88646448, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.3125, + "step": 1235, + "time_per_iteration": 3.247985363006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115104, + "balance_loss_mlp": 1.08396649, + "epoch": 0.23778376298576376, + "flos": 640810607616.0, + "grad_norm": 0.053926277509791044, + "language_loss": 0.90842855, + "learning_rate": 0.0008909989744894318, + "loss": 0.91957957, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.31103516, + "step": 1236, + "time_per_iteration": 2.8457424640655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116546, + "balance_loss_mlp": 1.08598089, + "epoch": 0.23797614467102732, + "flos": 616540073472.0, + "grad_norm": 0.07410834458794652, + "language_loss": 0.81166267, + "learning_rate": 0.0008908047201991649, + "loss": 0.82282805, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.30517578, + "step": 1237, + "time_per_iteration": 2.743232011795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103348, + "balance_loss_mlp": 1.07218719, + "epoch": 0.23816852635629088, + "flos": 623941539840.0, + "grad_norm": 0.0897055957170317, + "language_loss": 0.8615526, + "learning_rate": 0.0008906103141850502, + "loss": 0.87258613, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.3112793, + "step": 1238, + "time_per_iteration": 2.8931751251220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103231, + "balance_loss_mlp": 1.07164085, + "epoch": 0.23836090804155444, + "flos": 521180504064.0, + "grad_norm": 0.0595559706342315, + "language_loss": 0.87583494, + "learning_rate": 0.0008904157565225621, + "loss": 0.88686728, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.31567383, + "step": 1239, + "time_per_iteration": 2.681567430496216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096601, + "balance_loss_mlp": 1.06546402, + "epoch": 0.238553289726818, + "flos": 1153527616512.0, + "grad_norm": 0.07926394914951292, + "language_loss": 0.81636947, + "learning_rate": 0.000890221047287235, + "loss": 0.82733548, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.31103516, + "step": 1240, + "time_per_iteration": 3.5042829513549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096214, + "balance_loss_mlp": 1.06450391, + "epoch": 0.23874567141208156, + "flos": 499600175616.0, + "grad_norm": 0.06383986480013222, + "language_loss": 0.90398014, + "learning_rate": 0.0008900261865546615, + "loss": 0.91494226, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.31689453, + "step": 1241, + "time_per_iteration": 2.656243324279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098327, + "balance_loss_mlp": 1.06533027, + "epoch": 0.23893805309734514, + "flos": 556657325568.0, + "grad_norm": 0.07463092576288201, + "language_loss": 0.84907639, + "learning_rate": 0.0008898311744004936, + "loss": 0.86005968, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.33007812, + "step": 1242, + "time_per_iteration": 2.7337045669555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089408, + "balance_loss_mlp": 1.05583906, + "epoch": 0.2391304347826087, + "flos": 549009957888.0, + "grad_norm": 0.057670085451747476, + "language_loss": 0.86718595, + "learning_rate": 0.0008896360109004414, + "loss": 0.87808001, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.3359375, + "step": 1243, + "time_per_iteration": 2.6334750652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090579, + "balance_loss_mlp": 1.05667567, + "epoch": 0.23932281646787226, + "flos": 515794148352.0, + "grad_norm": 0.055695642571784755, + "language_loss": 0.84363699, + "learning_rate": 0.0008894406961302742, + "loss": 0.85454273, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.33935547, + "step": 1244, + "time_per_iteration": 2.612278699874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092282, + "balance_loss_mlp": 1.05840266, + "epoch": 0.23951519815313582, + "flos": 743353445376.0, + "grad_norm": 0.053835846346086756, + "language_loss": 0.83682489, + "learning_rate": 0.0008892452301658201, + "loss": 0.84774774, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.33911133, + "step": 1245, + "time_per_iteration": 2.999476432800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095498, + "balance_loss_mlp": 1.06169045, + "epoch": 0.23970757983839938, + "flos": 553855048704.0, + "grad_norm": 0.07830491582761978, + "language_loss": 0.83242297, + "learning_rate": 0.0008890496130829653, + "loss": 0.84337801, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.33837891, + "step": 1246, + "time_per_iteration": 2.6750991344451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093391, + "balance_loss_mlp": 1.05913019, + "epoch": 0.23989996152366294, + "flos": 480416251392.0, + "grad_norm": 0.06104300334873528, + "language_loss": 0.85340333, + "learning_rate": 0.0008888538449576555, + "loss": 0.86433721, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.34301758, + "step": 1247, + "time_per_iteration": 2.5646800994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095388, + "balance_loss_mlp": 1.06131816, + "epoch": 0.2400923432089265, + "flos": 485069285376.0, + "grad_norm": 0.05789610317969602, + "language_loss": 0.82348001, + "learning_rate": 0.0008886579258658944, + "loss": 0.83443391, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.34082031, + "step": 1248, + "time_per_iteration": 2.562016487121582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087072, + "balance_loss_mlp": 1.05283499, + "epoch": 0.24028472489419006, + "flos": 623247505920.0, + "grad_norm": 0.05381401206887855, + "language_loss": 0.84731787, + "learning_rate": 0.0008884618558837446, + "loss": 0.85818857, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.34277344, + "step": 1249, + "time_per_iteration": 2.8163750171661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093014, + "balance_loss_mlp": 1.05927801, + "epoch": 0.24047710657945365, + "flos": 601302002688.0, + "grad_norm": 0.06053052424994898, + "language_loss": 0.86413568, + "learning_rate": 0.0008882656350873273, + "loss": 0.8750658, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.33764648, + "step": 1250, + "time_per_iteration": 2.844723701477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088368, + "balance_loss_mlp": 1.05546594, + "epoch": 0.2406694882647172, + "flos": 841199579136.0, + "grad_norm": 0.06849099956300345, + "language_loss": 0.87088066, + "learning_rate": 0.0008880692635528219, + "loss": 0.88176429, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.32910156, + "step": 1251, + "time_per_iteration": 3.0528526306152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108089, + "balance_loss_mlp": 1.048823, + "epoch": 0.24086186994998077, + "flos": 526789440000.0, + "grad_norm": 0.06290905233547327, + "language_loss": 0.88876319, + "learning_rate": 0.0008878727413564669, + "loss": 0.89957213, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.32055664, + "step": 1252, + "time_per_iteration": 2.758507251739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077691, + "balance_loss_mlp": 1.05194211, + "epoch": 0.24105425163524433, + "flos": 1337464673280.0, + "grad_norm": 0.04466256972049361, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81213295, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.2578125, + "step": 1253, + "time_per_iteration": 4.847649097442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088304, + "balance_loss_mlp": 1.05616474, + "epoch": 0.24124663332050789, + "flos": 613822164480.0, + "grad_norm": 0.059681429897919615, + "language_loss": 0.78408957, + "learning_rate": 0.0008874792452834528, + "loss": 0.79497254, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.32128906, + "step": 1254, + "time_per_iteration": 2.754746198654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091821, + "balance_loss_mlp": 1.06061172, + "epoch": 0.24143901500577145, + "flos": 575278225920.0, + "grad_norm": 0.07362958371245172, + "language_loss": 0.87187612, + "learning_rate": 0.0008872822715595626, + "loss": 0.88279426, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.31176758, + "step": 1255, + "time_per_iteration": 2.662929058074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109326, + "balance_loss_mlp": 1.06200314, + "epoch": 0.241631396691035, + "flos": 494941349376.0, + "grad_norm": 0.08064600620778418, + "language_loss": 0.86789644, + "learning_rate": 0.0008870851474793598, + "loss": 0.87882906, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.31225586, + "step": 1256, + "time_per_iteration": 2.550830841064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096341, + "balance_loss_mlp": 1.06434524, + "epoch": 0.24182377837629856, + "flos": 635891323392.0, + "grad_norm": 0.05836545436632832, + "language_loss": 0.89218223, + "learning_rate": 0.0008868878731193752, + "loss": 0.90314561, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.31982422, + "step": 1257, + "time_per_iteration": 2.850184440612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095001, + "balance_loss_mlp": 1.06400657, + "epoch": 0.24201616006156215, + "flos": 514938580992.0, + "grad_norm": 0.05536217997614851, + "language_loss": 0.89056414, + "learning_rate": 0.0008866904485561973, + "loss": 0.90151417, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.30957031, + "step": 1258, + "time_per_iteration": 2.7176461219787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107248, + "balance_loss_mlp": 1.0765636, + "epoch": 0.2422085417468257, + "flos": 614837703168.0, + "grad_norm": 0.0620425495695956, + "language_loss": 0.82697642, + "learning_rate": 0.000886492873866473, + "loss": 0.83804893, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.30639648, + "step": 1259, + "time_per_iteration": 2.881246328353882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106717, + "balance_loss_mlp": 1.07631803, + "epoch": 0.24240092343208927, + "flos": 585515464704.0, + "grad_norm": 0.0764912621319216, + "language_loss": 0.84458697, + "learning_rate": 0.000886295149126908, + "loss": 0.85565412, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.3034668, + "step": 1260, + "time_per_iteration": 2.711789846420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102095, + "balance_loss_mlp": 1.07148254, + "epoch": 0.24259330511735283, + "flos": 761930675712.0, + "grad_norm": 0.05050860424869067, + "language_loss": 0.85437667, + "learning_rate": 0.0008860972744142655, + "loss": 0.86539763, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.30566406, + "step": 1261, + "time_per_iteration": 2.924192190170288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101503, + "balance_loss_mlp": 1.07146263, + "epoch": 0.2427856868026164, + "flos": 626566316544.0, + "grad_norm": 0.05198228858732316, + "language_loss": 0.81767958, + "learning_rate": 0.0008858992498053671, + "loss": 0.82869458, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.30004883, + "step": 1262, + "time_per_iteration": 2.8300395011901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069733, + "balance_loss_mlp": 1.04455626, + "epoch": 0.24297806848787995, + "flos": 1510840470528.0, + "grad_norm": 0.04093384265265131, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77658486, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.25195312, + "step": 1263, + "time_per_iteration": 4.837641716003418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103166, + "balance_loss_mlp": 1.07217157, + "epoch": 0.2431704501731435, + "flos": 541669538304.0, + "grad_norm": 0.05948216339756903, + "language_loss": 0.83247912, + "learning_rate": 0.0008855027512063817, + "loss": 0.84351087, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.30957031, + "step": 1264, + "time_per_iteration": 2.7277276515960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102812, + "balance_loss_mlp": 1.07191277, + "epoch": 0.24336283185840707, + "flos": 523588492800.0, + "grad_norm": 0.06194442365761257, + "language_loss": 0.8589493, + "learning_rate": 0.0008853042773702292, + "loss": 0.86997747, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.30859375, + "step": 1265, + "time_per_iteration": 2.7305567264556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103281, + "balance_loss_mlp": 1.07197642, + "epoch": 0.24355521354367063, + "flos": 536839004160.0, + "grad_norm": 0.0568893751116151, + "language_loss": 0.87145638, + "learning_rate": 0.0008851056539456896, + "loss": 0.88248914, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.31274414, + "step": 1266, + "time_per_iteration": 2.6886072158813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109953, + "balance_loss_mlp": 1.06767774, + "epoch": 0.24374759522893422, + "flos": 930050975232.0, + "grad_norm": 0.06669847345827673, + "language_loss": 0.81623918, + "learning_rate": 0.0008849068810098755, + "loss": 0.82723451, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.31835938, + "step": 1267, + "time_per_iteration": 3.302135705947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092019, + "balance_loss_mlp": 1.06049967, + "epoch": 0.24393997691419778, + "flos": 427564002816.0, + "grad_norm": 0.06302829877877653, + "language_loss": 0.82764143, + "learning_rate": 0.0008847079586399575, + "loss": 0.83856159, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.31494141, + "step": 1268, + "time_per_iteration": 2.469602584838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088761, + "balance_loss_mlp": 1.05755162, + "epoch": 0.24413235859946134, + "flos": 578582479872.0, + "grad_norm": 0.062034835544456234, + "language_loss": 0.85665154, + "learning_rate": 0.0008845088869131641, + "loss": 0.86753917, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.31176758, + "step": 1269, + "time_per_iteration": 2.6822941303253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090407, + "balance_loss_mlp": 1.05864954, + "epoch": 0.2443247402847249, + "flos": 529600481280.0, + "grad_norm": 0.06778965234687388, + "language_loss": 0.88905638, + "learning_rate": 0.0008843096659067818, + "loss": 0.8999604, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.31738281, + "step": 1270, + "time_per_iteration": 2.594064235687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087142, + "balance_loss_mlp": 1.05555153, + "epoch": 0.24451712196998845, + "flos": 695996651520.0, + "grad_norm": 0.05697237066827103, + "language_loss": 0.85987377, + "learning_rate": 0.000884110295698155, + "loss": 0.87074518, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.31567383, + "step": 1271, + "time_per_iteration": 2.974696636199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083849, + "balance_loss_mlp": 1.0512805, + "epoch": 0.24470950365525201, + "flos": 529575750144.0, + "grad_norm": 0.06068289501227115, + "language_loss": 0.85902673, + "learning_rate": 0.0008839107763646861, + "loss": 0.86986518, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.32568359, + "step": 1272, + "time_per_iteration": 2.607771158218384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085636, + "balance_loss_mlp": 1.0507555, + "epoch": 0.24490188534051557, + "flos": 491091448320.0, + "grad_norm": 0.061464799303267155, + "language_loss": 0.9008882, + "learning_rate": 0.0008837111079838353, + "loss": 0.91174459, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.34912109, + "step": 1273, + "time_per_iteration": 2.708512306213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107918, + "balance_loss_mlp": 1.0463264, + "epoch": 0.24509426702577913, + "flos": 473916842496.0, + "grad_norm": 0.06335862765515422, + "language_loss": 0.89847112, + "learning_rate": 0.000883511290633121, + "loss": 0.9092629, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.32861328, + "step": 1274, + "time_per_iteration": 2.5415730476379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077904, + "balance_loss_mlp": 1.04423904, + "epoch": 0.24528664871104272, + "flos": 550329624576.0, + "grad_norm": 0.04937694398035677, + "language_loss": 0.92408085, + "learning_rate": 0.000883311324390119, + "loss": 0.93485993, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.33691406, + "step": 1275, + "time_per_iteration": 2.734423875808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108035, + "balance_loss_mlp": 1.0457077, + "epoch": 0.24547903039630628, + "flos": 825546871296.0, + "grad_norm": 0.07292672859625873, + "language_loss": 0.80929816, + "learning_rate": 0.0008831112093324629, + "loss": 0.82010162, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.34667969, + "step": 1276, + "time_per_iteration": 3.0507287979125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076455, + "balance_loss_mlp": 1.04209912, + "epoch": 0.24567141208156984, + "flos": 591325221888.0, + "grad_norm": 0.0707858001482728, + "language_loss": 0.88982868, + "learning_rate": 0.0008829109455378444, + "loss": 0.90059322, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.34375, + "step": 1277, + "time_per_iteration": 2.6684513092041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076963, + "balance_loss_mlp": 1.04284549, + "epoch": 0.2458637937668334, + "flos": 547611715584.0, + "grad_norm": 0.05561589900472309, + "language_loss": 0.86233819, + "learning_rate": 0.000882710533084013, + "loss": 0.87310779, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.34155273, + "step": 1278, + "time_per_iteration": 2.623353958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074564, + "balance_loss_mlp": 1.04013681, + "epoch": 0.24605617545209696, + "flos": 515641379328.0, + "grad_norm": 0.04936271772538766, + "language_loss": 0.89139968, + "learning_rate": 0.0008825099720487755, + "loss": 0.90214527, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.34448242, + "step": 1279, + "time_per_iteration": 2.6549813747406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069233, + "balance_loss_mlp": 1.04853857, + "epoch": 0.24624855713736052, + "flos": 1510953951744.0, + "grad_norm": 0.028817901818472227, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76330376, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.20703125, + "step": 1280, + "time_per_iteration": 4.85357141494751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066002, + "balance_loss_mlp": 1.04521215, + "epoch": 0.24644093882262408, + "flos": 1526826419712.0, + "grad_norm": 0.026145975527968417, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79010111, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.20800781, + "step": 1281, + "time_per_iteration": 4.780989408493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083115, + "balance_loss_mlp": 1.04983163, + "epoch": 0.24663332050788764, + "flos": 658811667456.0, + "grad_norm": 0.06975718656823436, + "language_loss": 0.89050984, + "learning_rate": 0.0008819073982335619, + "loss": 0.90134096, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.33300781, + "step": 1282, + "time_per_iteration": 2.8345205783843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085728, + "balance_loss_mlp": 1.05361331, + "epoch": 0.24682570219315123, + "flos": 541510977024.0, + "grad_norm": 0.062337694406813374, + "language_loss": 0.84269708, + "learning_rate": 0.0008817062436519235, + "loss": 0.85355437, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.32104492, + "step": 1283, + "time_per_iteration": 2.6846866607666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089504, + "balance_loss_mlp": 1.05612516, + "epoch": 0.24701808387841478, + "flos": 440455131648.0, + "grad_norm": 0.06365108043104846, + "language_loss": 0.89943874, + "learning_rate": 0.0008815049408787788, + "loss": 0.91033375, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.33398438, + "step": 1284, + "time_per_iteration": 2.5116872787475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081689, + "balance_loss_mlp": 1.04916823, + "epoch": 0.24721046556367834, + "flos": 467826278400.0, + "grad_norm": 0.059551230096427064, + "language_loss": 0.85302055, + "learning_rate": 0.0008813034899922805, + "loss": 0.86383736, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.32519531, + "step": 1285, + "time_per_iteration": 2.5286993980407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080955, + "balance_loss_mlp": 1.04931688, + "epoch": 0.2474028472489419, + "flos": 504183398400.0, + "grad_norm": 0.06660544793665324, + "language_loss": 0.89506048, + "learning_rate": 0.0008811018910706387, + "loss": 0.90586996, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.31616211, + "step": 1286, + "time_per_iteration": 2.552616834640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079345, + "balance_loss_mlp": 1.04756403, + "epoch": 0.24759522893420546, + "flos": 479707660800.0, + "grad_norm": 0.07038813341767636, + "language_loss": 0.81879961, + "learning_rate": 0.0008809001441921211, + "loss": 0.82959306, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.31762695, + "step": 1287, + "time_per_iteration": 2.704249143600464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082412, + "balance_loss_mlp": 1.05132163, + "epoch": 0.24778761061946902, + "flos": 533446000128.0, + "grad_norm": 0.054805193397824324, + "language_loss": 0.85345185, + "learning_rate": 0.0008806982494350528, + "loss": 0.86427593, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.31054688, + "step": 1288, + "time_per_iteration": 2.65993070602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084976, + "balance_loss_mlp": 1.05359983, + "epoch": 0.24797999230473258, + "flos": 559513446912.0, + "grad_norm": 0.05430799794632807, + "language_loss": 0.90285796, + "learning_rate": 0.0008804962068778161, + "loss": 0.91370773, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.31347656, + "step": 1289, + "time_per_iteration": 2.8633711338043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086217, + "balance_loss_mlp": 1.05515075, + "epoch": 0.24817237398999614, + "flos": 623912426496.0, + "grad_norm": 0.06485439157304855, + "language_loss": 0.81069577, + "learning_rate": 0.0008802940165988511, + "loss": 0.82155788, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.31030273, + "step": 1290, + "time_per_iteration": 2.877063274383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084315, + "balance_loss_mlp": 1.05341625, + "epoch": 0.2483647556752597, + "flos": 611981581824.0, + "grad_norm": 0.058113292585204916, + "language_loss": 0.88358063, + "learning_rate": 0.000880091678676655, + "loss": 0.89442384, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.30859375, + "step": 1291, + "time_per_iteration": 2.800182342529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088307, + "balance_loss_mlp": 1.05814719, + "epoch": 0.2485571373605233, + "flos": 583270419456.0, + "grad_norm": 0.05744202885681841, + "language_loss": 0.88709044, + "learning_rate": 0.0008798891931897821, + "loss": 0.89797354, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.30126953, + "step": 1292, + "time_per_iteration": 2.8186981678009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090758, + "balance_loss_mlp": 1.06009781, + "epoch": 0.24874951904578685, + "flos": 494503391232.0, + "grad_norm": 0.06335011869227863, + "language_loss": 0.84085584, + "learning_rate": 0.0008796865602168447, + "loss": 0.85176343, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.30615234, + "step": 1293, + "time_per_iteration": 2.5642354488372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093183, + "balance_loss_mlp": 1.06218874, + "epoch": 0.2489419007310504, + "flos": 455925957120.0, + "grad_norm": 0.055204532335327836, + "language_loss": 0.88449144, + "learning_rate": 0.0008794837798365115, + "loss": 0.89542329, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.30957031, + "step": 1294, + "time_per_iteration": 2.640967607498169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103465, + "balance_loss_mlp": 1.07256651, + "epoch": 0.24913428241631397, + "flos": 485198733312.0, + "grad_norm": 0.05342912575045942, + "language_loss": 0.88282919, + "learning_rate": 0.0008792808521275089, + "loss": 0.8938638, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.30859375, + "step": 1295, + "time_per_iteration": 2.743216037750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106969, + "balance_loss_mlp": 1.07638037, + "epoch": 0.24932666410157753, + "flos": 518654651904.0, + "grad_norm": 0.05542201073335728, + "language_loss": 0.87427896, + "learning_rate": 0.0008790777771686206, + "loss": 0.88534868, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.30541992, + "step": 1296, + "time_per_iteration": 2.5764553546905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109723, + "balance_loss_mlp": 1.07934809, + "epoch": 0.2495190457868411, + "flos": 472365831168.0, + "grad_norm": 0.061211557913471215, + "language_loss": 0.85332036, + "learning_rate": 0.0008788745550386872, + "loss": 0.86441755, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.30322266, + "step": 1297, + "time_per_iteration": 2.635064125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111335, + "balance_loss_mlp": 1.08226037, + "epoch": 0.24971142747210465, + "flos": 745559202816.0, + "grad_norm": 0.055423812451341224, + "language_loss": 0.79893327, + "learning_rate": 0.0008786711858166063, + "loss": 0.81006682, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.31054688, + "step": 1298, + "time_per_iteration": 3.002070903778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113917, + "balance_loss_mlp": 1.08387578, + "epoch": 0.2499038091573682, + "flos": 749222839296.0, + "grad_norm": 0.06342841372026603, + "language_loss": 0.8358891, + "learning_rate": 0.0008784676695813332, + "loss": 0.84702826, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.29980469, + "step": 1299, + "time_per_iteration": 2.941793918609619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116177, + "balance_loss_mlp": 1.08573055, + "epoch": 0.2500961908426318, + "flos": 744741513216.0, + "grad_norm": 0.05313888632052142, + "language_loss": 0.84205985, + "learning_rate": 0.0008782640064118796, + "loss": 0.85322165, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.30395508, + "step": 1300, + "time_per_iteration": 2.9038445949554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113921, + "balance_loss_mlp": 1.11441469, + "epoch": 0.2502885725278953, + "flos": 1416652180992.0, + "grad_norm": 0.03742785755303804, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77323961, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.24804688, + "step": 1301, + "time_per_iteration": 4.97193169593811 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108076, + "balance_loss_mlp": 1.0781548, + "epoch": 0.2504809542131589, + "flos": 514961902080.0, + "grad_norm": 0.06725713094725487, + "language_loss": 0.86707664, + "learning_rate": 0.0008778562395867648, + "loss": 0.87815738, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.29882812, + "step": 1302, + "time_per_iteration": 2.6434335708618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109494, + "balance_loss_mlp": 1.064852, + "epoch": 0.25067333589842244, + "flos": 525562905600.0, + "grad_norm": 0.0573305289073435, + "language_loss": 0.83713615, + "learning_rate": 0.0008776521360894127, + "loss": 0.84808552, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.30029297, + "step": 1303, + "time_per_iteration": 2.664281129837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087429, + "balance_loss_mlp": 1.06206167, + "epoch": 0.25086571758368603, + "flos": 1473085108224.0, + "grad_norm": 0.030879512397293623, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80049491, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.25390625, + "step": 1304, + "time_per_iteration": 4.7838218212127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096771, + "balance_loss_mlp": 1.06682515, + "epoch": 0.2510580992689496, + "flos": 528128045568.0, + "grad_norm": 0.05889583885024225, + "language_loss": 0.90380585, + "learning_rate": 0.0008772434893213186, + "loss": 0.91477358, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.29882812, + "step": 1305, + "time_per_iteration": 2.619591236114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092992, + "balance_loss_mlp": 1.06228364, + "epoch": 0.25125048095421315, + "flos": 517192390656.0, + "grad_norm": 0.05643683756415757, + "language_loss": 0.84055364, + "learning_rate": 0.0008770389462092276, + "loss": 0.85148358, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.30664062, + "step": 1306, + "time_per_iteration": 2.646378517150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090271, + "balance_loss_mlp": 1.05860949, + "epoch": 0.25144286263947674, + "flos": 620160039936.0, + "grad_norm": 0.07421628365380602, + "language_loss": 0.86343837, + "learning_rate": 0.0008768342567176357, + "loss": 0.87434107, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.31640625, + "step": 1307, + "time_per_iteration": 2.807349681854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089217, + "balance_loss_mlp": 1.0562675, + "epoch": 0.25163524432474027, + "flos": 503534444544.0, + "grad_norm": 0.06024308313144323, + "language_loss": 0.90521109, + "learning_rate": 0.0008766294209260107, + "loss": 0.91610324, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.32958984, + "step": 1308, + "time_per_iteration": 2.652209758758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087002, + "balance_loss_mlp": 1.05510211, + "epoch": 0.25182762601000386, + "flos": 508821875712.0, + "grad_norm": 0.07044022402077256, + "language_loss": 0.90948963, + "learning_rate": 0.0008764244389138767, + "loss": 0.92035961, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.31884766, + "step": 1309, + "time_per_iteration": 2.583214044570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086884, + "balance_loss_mlp": 1.05386305, + "epoch": 0.2520200076952674, + "flos": 633596815872.0, + "grad_norm": 0.07007920023055086, + "language_loss": 0.82157373, + "learning_rate": 0.000876219310760815, + "loss": 0.83244258, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.33032227, + "step": 1310, + "time_per_iteration": 2.8652145862579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010922, + "balance_loss_mlp": 1.05956042, + "epoch": 0.252212389380531, + "flos": 494385527808.0, + "grad_norm": 0.05921747328918915, + "language_loss": 0.81032491, + "learning_rate": 0.0008760140365464631, + "loss": 0.82124686, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.32641602, + "step": 1311, + "time_per_iteration": 2.620199203491211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090848, + "balance_loss_mlp": 1.05799365, + "epoch": 0.2524047710657945, + "flos": 490298489856.0, + "grad_norm": 0.06933033432447253, + "language_loss": 0.87204492, + "learning_rate": 0.0008758086163505156, + "loss": 0.88295335, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.32861328, + "step": 1312, + "time_per_iteration": 2.5809056758880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085438, + "balance_loss_mlp": 1.05253649, + "epoch": 0.2525971527510581, + "flos": 647136898560.0, + "grad_norm": 0.05785086559723577, + "language_loss": 0.89221275, + "learning_rate": 0.0008756030502527239, + "loss": 0.90306717, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.32910156, + "step": 1313, + "time_per_iteration": 2.8305885791778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084056, + "balance_loss_mlp": 1.05201209, + "epoch": 0.2527895344363217, + "flos": 568991222784.0, + "grad_norm": 0.05540107069612798, + "language_loss": 0.90540659, + "learning_rate": 0.0008753973383328954, + "loss": 0.91624713, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.3203125, + "step": 1314, + "time_per_iteration": 2.8095338344573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084266, + "balance_loss_mlp": 1.0518887, + "epoch": 0.2529819161215852, + "flos": 513795004416.0, + "grad_norm": 0.06960735937341114, + "language_loss": 0.83534479, + "learning_rate": 0.0008751914806708952, + "loss": 0.84618747, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.32373047, + "step": 1315, + "time_per_iteration": 2.6356046199798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084811, + "balance_loss_mlp": 1.05357838, + "epoch": 0.2531742978068488, + "flos": 530979784704.0, + "grad_norm": 0.05966295966929829, + "language_loss": 0.82178831, + "learning_rate": 0.0008749854773466439, + "loss": 0.83263648, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.31201172, + "step": 1316, + "time_per_iteration": 2.673590898513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083614, + "balance_loss_mlp": 1.05199969, + "epoch": 0.25336667949211233, + "flos": 596362369536.0, + "grad_norm": 0.060440864571565875, + "language_loss": 0.84378719, + "learning_rate": 0.0008747793284401192, + "loss": 0.85462332, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.31591797, + "step": 1317, + "time_per_iteration": 2.672581195831299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078628, + "balance_loss_mlp": 1.04701352, + "epoch": 0.2535590611773759, + "flos": 601764691968.0, + "grad_norm": 0.06760844062466466, + "language_loss": 0.85858786, + "learning_rate": 0.0008745730340313551, + "loss": 0.8693741, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.31591797, + "step": 1318, + "time_per_iteration": 2.7483184337615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088603, + "balance_loss_mlp": 1.05775118, + "epoch": 0.25375144286263945, + "flos": 495079561728.0, + "grad_norm": 0.06356165501521222, + "language_loss": 0.84280074, + "learning_rate": 0.0008743665942004422, + "loss": 0.85368681, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.30834961, + "step": 1319, + "time_per_iteration": 2.659477472305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094218, + "balance_loss_mlp": 1.06362879, + "epoch": 0.25394382454790304, + "flos": 512219261952.0, + "grad_norm": 0.06511177952096096, + "language_loss": 0.92719352, + "learning_rate": 0.0008741600090275277, + "loss": 0.93813574, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.30541992, + "step": 1320, + "time_per_iteration": 2.6192221641540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088577, + "balance_loss_mlp": 1.05758274, + "epoch": 0.25413620623316663, + "flos": 958586047488.0, + "grad_norm": 0.06459884228420558, + "language_loss": 0.84290528, + "learning_rate": 0.0008739532785928151, + "loss": 0.853791, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.30957031, + "step": 1321, + "time_per_iteration": 3.438142776489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166929, + "balance_loss_mlp": 1.14528096, + "epoch": 0.25432858791843016, + "flos": 1576445635584.0, + "grad_norm": 0.062216562760273944, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.7606051, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.21679688, + "step": 1322, + "time_per_iteration": 4.881207466125488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109523, + "balance_loss_mlp": 1.06502271, + "epoch": 0.25452096960369375, + "flos": 583530877440.0, + "grad_norm": 0.0660267567978659, + "language_loss": 0.8296389, + "learning_rate": 0.0008735393822590908, + "loss": 0.84059119, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.30151367, + "step": 1323, + "time_per_iteration": 2.7254581451416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097272, + "balance_loss_mlp": 1.06723142, + "epoch": 0.2547133512889573, + "flos": 508344629760.0, + "grad_norm": 0.07409821223339019, + "language_loss": 0.87412238, + "learning_rate": 0.0008733322165207681, + "loss": 0.88509512, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.30029297, + "step": 1324, + "time_per_iteration": 2.6910648345947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102689, + "balance_loss_mlp": 1.07295775, + "epoch": 0.25490573297422087, + "flos": 782266940928.0, + "grad_norm": 0.06686348955430095, + "language_loss": 0.83012944, + "learning_rate": 0.0008731249058420247, + "loss": 0.84115636, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.29663086, + "step": 1325, + "time_per_iteration": 3.0301432609558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105708, + "balance_loss_mlp": 1.07499993, + "epoch": 0.2550981146594844, + "flos": 509610451968.0, + "grad_norm": 0.057218587703981125, + "language_loss": 0.90547103, + "learning_rate": 0.0008729174503033459, + "loss": 0.91652811, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.30664062, + "step": 1326, + "time_per_iteration": 2.668544292449951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107604, + "balance_loss_mlp": 1.07706285, + "epoch": 0.255290496344748, + "flos": 676360212480.0, + "grad_norm": 0.08872727493885958, + "language_loss": 0.82430828, + "learning_rate": 0.0008727098499852728, + "loss": 0.83538437, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.30493164, + "step": 1327, + "time_per_iteration": 2.8206427097320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102439, + "balance_loss_mlp": 1.07175469, + "epoch": 0.2554828780300115, + "flos": 537524273664.0, + "grad_norm": 0.05995612334517853, + "language_loss": 0.8945381, + "learning_rate": 0.0008725021049684034, + "loss": 0.90556252, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.30639648, + "step": 1328, + "time_per_iteration": 2.7788021564483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110018, + "balance_loss_mlp": 1.06906641, + "epoch": 0.2556752597152751, + "flos": 823828534272.0, + "grad_norm": 0.07693053452424695, + "language_loss": 0.82675111, + "learning_rate": 0.000872294215333391, + "loss": 0.83775294, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.31079102, + "step": 1329, + "time_per_iteration": 3.208423137664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089607, + "balance_loss_mlp": 1.05820751, + "epoch": 0.2558676414005387, + "flos": 570517502976.0, + "grad_norm": 0.05833009001407562, + "language_loss": 0.83099753, + "learning_rate": 0.0008720861811609457, + "loss": 0.84189361, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.3137207, + "step": 1330, + "time_per_iteration": 2.723451614379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082701, + "balance_loss_mlp": 1.05122948, + "epoch": 0.2560600230858022, + "flos": 486419475456.0, + "grad_norm": 0.06841234134213905, + "language_loss": 0.83759737, + "learning_rate": 0.0008718780025318338, + "loss": 0.84842432, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.31445312, + "step": 1331, + "time_per_iteration": 2.7594637870788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084055, + "balance_loss_mlp": 1.05244088, + "epoch": 0.2562524047710658, + "flos": 512874008064.0, + "grad_norm": 0.059488371229756976, + "language_loss": 0.83890998, + "learning_rate": 0.0008716696795268771, + "loss": 0.84975058, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.31591797, + "step": 1332, + "time_per_iteration": 2.719435453414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086447, + "balance_loss_mlp": 1.05516648, + "epoch": 0.25644478645632934, + "flos": 634498873344.0, + "grad_norm": 0.09040651922247907, + "language_loss": 0.85621184, + "learning_rate": 0.0008714612122269538, + "loss": 0.86707628, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.3125, + "step": 1333, + "time_per_iteration": 2.846071481704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087221, + "balance_loss_mlp": 1.05517721, + "epoch": 0.25663716814159293, + "flos": 436353537024.0, + "grad_norm": 0.06079891504044088, + "language_loss": 0.8881824, + "learning_rate": 0.0008712526007129982, + "loss": 0.89905459, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.3203125, + "step": 1334, + "time_per_iteration": 2.5539238452911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084143, + "balance_loss_mlp": 1.05226636, + "epoch": 0.25682954982685646, + "flos": 497892013056.0, + "grad_norm": 0.06135189476637687, + "language_loss": 0.90600282, + "learning_rate": 0.0008710438450660003, + "loss": 0.91684425, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.31835938, + "step": 1335, + "time_per_iteration": 2.6957638263702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081701, + "balance_loss_mlp": 1.04984844, + "epoch": 0.25702193151212005, + "flos": 457471176192.0, + "grad_norm": 0.09152684925001835, + "language_loss": 0.86861122, + "learning_rate": 0.0008708349453670064, + "loss": 0.87942821, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.31835938, + "step": 1336, + "time_per_iteration": 2.569918632507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080209, + "balance_loss_mlp": 1.04854655, + "epoch": 0.2572143131973836, + "flos": 598002130944.0, + "grad_norm": 0.055029840901202824, + "language_loss": 0.91123867, + "learning_rate": 0.0008706259016971185, + "loss": 0.92204076, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.31640625, + "step": 1337, + "time_per_iteration": 2.7755186557769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077567, + "balance_loss_mlp": 1.04554725, + "epoch": 0.25740669488264717, + "flos": 698004559872.0, + "grad_norm": 0.08019888390454845, + "language_loss": 0.82668757, + "learning_rate": 0.0008704167141374944, + "loss": 0.83746326, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.32006836, + "step": 1338, + "time_per_iteration": 2.8559322357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073148, + "balance_loss_mlp": 1.04184318, + "epoch": 0.25759907656791076, + "flos": 502130409984.0, + "grad_norm": 0.06412343972447931, + "language_loss": 0.88389909, + "learning_rate": 0.0008702073827693482, + "loss": 0.89463055, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.31274414, + "step": 1339, + "time_per_iteration": 2.725090265274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077981, + "balance_loss_mlp": 1.04662943, + "epoch": 0.2577914582531743, + "flos": 773541425664.0, + "grad_norm": 0.06471871877048396, + "language_loss": 0.88798392, + "learning_rate": 0.0008699979076739494, + "loss": 0.89876378, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.31323242, + "step": 1340, + "time_per_iteration": 2.9663493633270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074686, + "balance_loss_mlp": 1.04354882, + "epoch": 0.2579838399384379, + "flos": 459431032320.0, + "grad_norm": 0.0844279622703065, + "language_loss": 0.88438749, + "learning_rate": 0.0008697882889326234, + "loss": 0.89513433, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.31103516, + "step": 1341, + "time_per_iteration": 2.5622262954711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081353, + "balance_loss_mlp": 1.05047798, + "epoch": 0.2581762216237014, + "flos": 568917029376.0, + "grad_norm": 0.07114901487039385, + "language_loss": 0.86560714, + "learning_rate": 0.0008695785266267515, + "loss": 0.87642074, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.30834961, + "step": 1342, + "time_per_iteration": 2.7169957160949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083185, + "balance_loss_mlp": 1.05309629, + "epoch": 0.258368603308965, + "flos": 603906430464.0, + "grad_norm": 0.06303738321086937, + "language_loss": 0.82804394, + "learning_rate": 0.0008693686208377704, + "loss": 0.83887577, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.30053711, + "step": 1343, + "time_per_iteration": 2.8591935634613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090607, + "balance_loss_mlp": 1.06142426, + "epoch": 0.2585609849942285, + "flos": 491204929536.0, + "grad_norm": 0.06465186244058573, + "language_loss": 0.88812125, + "learning_rate": 0.0008691585716471733, + "loss": 0.89902723, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.29150391, + "step": 1344, + "time_per_iteration": 2.6713430881500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099449, + "balance_loss_mlp": 1.07119632, + "epoch": 0.2587533666794921, + "flos": 640455607296.0, + "grad_norm": 0.0588719911399204, + "language_loss": 0.85261089, + "learning_rate": 0.0008689483791365079, + "loss": 0.86360538, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.28271484, + "step": 1345, + "time_per_iteration": 2.820528030395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112565, + "balance_loss_mlp": 1.08457518, + "epoch": 0.2589457483647557, + "flos": 576564397056.0, + "grad_norm": 0.06280839806958106, + "language_loss": 0.89176255, + "learning_rate": 0.0008687380433873786, + "loss": 0.90288818, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.28027344, + "step": 1346, + "time_per_iteration": 2.8161351680755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122151, + "balance_loss_mlp": 1.09442306, + "epoch": 0.25913813005001923, + "flos": 535164337152.0, + "grad_norm": 0.09019918884346267, + "language_loss": 0.82469404, + "learning_rate": 0.0008685275644814448, + "loss": 0.83591551, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.27734375, + "step": 1347, + "time_per_iteration": 2.693267822265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122406, + "balance_loss_mlp": 1.09403384, + "epoch": 0.2593305117352828, + "flos": 720713908224.0, + "grad_norm": 0.0763626786758855, + "language_loss": 0.83996952, + "learning_rate": 0.0008683169425004216, + "loss": 0.85119361, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.28393555, + "step": 1348, + "time_per_iteration": 2.9267332553863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104312, + "balance_loss_mlp": 1.07582057, + "epoch": 0.25952289342054635, + "flos": 709782635520.0, + "grad_norm": 0.0999879699530973, + "language_loss": 0.82942533, + "learning_rate": 0.0008681061775260799, + "loss": 0.84046841, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.28491211, + "step": 1349, + "time_per_iteration": 2.8389806747436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104623, + "balance_loss_mlp": 1.0761795, + "epoch": 0.25971527510580994, + "flos": 455688820224.0, + "grad_norm": 0.06848449496170159, + "language_loss": 0.9182089, + "learning_rate": 0.0008678952696402458, + "loss": 0.92925513, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.28442383, + "step": 1350, + "time_per_iteration": 2.520573377609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091244, + "balance_loss_mlp": 1.06270587, + "epoch": 0.25990765679107347, + "flos": 612223100928.0, + "grad_norm": 0.06363942150358032, + "language_loss": 0.86753285, + "learning_rate": 0.000867684218924801, + "loss": 0.87844533, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.28564453, + "step": 1351, + "time_per_iteration": 2.9015109539031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094999, + "balance_loss_mlp": 1.07382762, + "epoch": 0.26010003847633706, + "flos": 1537105766400.0, + "grad_norm": 0.03643594447100183, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80042088, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.21191406, + "step": 1352, + "time_per_iteration": 4.897913217544556 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089345, + "balance_loss_mlp": 1.05987692, + "epoch": 0.2602924201616006, + "flos": 715947393024.0, + "grad_norm": 0.05004222260192376, + "language_loss": 0.8488791, + "learning_rate": 0.0008672616893328834, + "loss": 0.85977256, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.29394531, + "step": 1353, + "time_per_iteration": 2.930330991744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089197, + "balance_loss_mlp": 1.05925155, + "epoch": 0.2604848018468642, + "flos": 643241917440.0, + "grad_norm": 0.06508424080641521, + "language_loss": 0.90170342, + "learning_rate": 0.0008670502106204512, + "loss": 0.91259539, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.29882812, + "step": 1354, + "time_per_iteration": 2.8581433296203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088042, + "balance_loss_mlp": 1.05821621, + "epoch": 0.26067718353212777, + "flos": 516783545856.0, + "grad_norm": 0.07357469643966064, + "language_loss": 0.81904948, + "learning_rate": 0.0008668385894064892, + "loss": 0.82992983, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.2980957, + "step": 1355, + "time_per_iteration": 2.6258199214935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086225, + "balance_loss_mlp": 1.05565977, + "epoch": 0.2608695652173913, + "flos": 822361890816.0, + "grad_norm": 0.05598612189883674, + "language_loss": 0.88435078, + "learning_rate": 0.0008666268257731562, + "loss": 0.89521307, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.30517578, + "step": 1356, + "time_per_iteration": 3.0935704708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096188, + "balance_loss_mlp": 1.06557548, + "epoch": 0.2610619469026549, + "flos": 1007451744768.0, + "grad_norm": 0.05877228431721195, + "language_loss": 0.85582316, + "learning_rate": 0.0008664149198026662, + "loss": 0.86678505, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.3059082, + "step": 1357, + "time_per_iteration": 3.3150172233581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093826, + "balance_loss_mlp": 1.06407189, + "epoch": 0.2612543285879184, + "flos": 536523291648.0, + "grad_norm": 0.08010917030088013, + "language_loss": 0.88609982, + "learning_rate": 0.0008662028715772883, + "loss": 0.8970381, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.29736328, + "step": 1358, + "time_per_iteration": 2.652510166168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117948, + "balance_loss_mlp": 1.08781219, + "epoch": 0.261446710273182, + "flos": 519166803456.0, + "grad_norm": 0.068011575409632, + "language_loss": 0.8599565, + "learning_rate": 0.0008659906811793467, + "loss": 0.87113595, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.30078125, + "step": 1359, + "time_per_iteration": 2.6895272731781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120144, + "balance_loss_mlp": 1.08917356, + "epoch": 0.26163909195844554, + "flos": 582975055872.0, + "grad_norm": 0.06541737550876531, + "language_loss": 0.89626461, + "learning_rate": 0.0008657783486912215, + "loss": 0.90746599, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.30932617, + "step": 1360, + "time_per_iteration": 2.762763738632202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112864, + "balance_loss_mlp": 1.09752679, + "epoch": 0.2618314736437091, + "flos": 958362057216.0, + "grad_norm": 0.08393806981558949, + "language_loss": 0.89884281, + "learning_rate": 0.0008655658741953472, + "loss": 0.91012919, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.31079102, + "step": 1361, + "time_per_iteration": 3.2099156379699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108189, + "balance_loss_mlp": 1.07740927, + "epoch": 0.26202385532897265, + "flos": 574530347520.0, + "grad_norm": 0.05266132623937494, + "language_loss": 0.88221049, + "learning_rate": 0.0008653532577742136, + "loss": 0.89329231, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.30761719, + "step": 1362, + "time_per_iteration": 2.6699323654174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097872, + "balance_loss_mlp": 1.06756878, + "epoch": 0.26221623701423624, + "flos": 445240585728.0, + "grad_norm": 0.06436829867728516, + "language_loss": 0.86740243, + "learning_rate": 0.0008651404995103659, + "loss": 0.87838113, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.30273438, + "step": 1363, + "time_per_iteration": 2.5310258865356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094148, + "balance_loss_mlp": 1.06286716, + "epoch": 0.26240861869949983, + "flos": 535459700736.0, + "grad_norm": 0.05795299669830668, + "language_loss": 0.8642996, + "learning_rate": 0.0008649275994864041, + "loss": 0.87524116, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.3125, + "step": 1364, + "time_per_iteration": 2.675330638885498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110185, + "balance_loss_mlp": 1.07066512, + "epoch": 0.26260100038476336, + "flos": 564940500480.0, + "grad_norm": 0.05147405231292679, + "language_loss": 0.83778602, + "learning_rate": 0.0008647145577849834, + "loss": 0.84880447, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.31152344, + "step": 1365, + "time_per_iteration": 2.817330837249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099582, + "balance_loss_mlp": 1.06913614, + "epoch": 0.26279338207002695, + "flos": 612745426944.0, + "grad_norm": 0.05119291352940178, + "language_loss": 0.82886052, + "learning_rate": 0.0008645013744888139, + "loss": 0.83985633, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.30395508, + "step": 1366, + "time_per_iteration": 2.9056894779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093325, + "balance_loss_mlp": 1.06318903, + "epoch": 0.2629857637552905, + "flos": 522555425280.0, + "grad_norm": 0.08887633390516779, + "language_loss": 0.8772788, + "learning_rate": 0.0008642880496806607, + "loss": 0.88821203, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.30102539, + "step": 1367, + "time_per_iteration": 2.8175759315490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094726, + "balance_loss_mlp": 1.0635649, + "epoch": 0.26317814544055407, + "flos": 534273864192.0, + "grad_norm": 0.0720053964715196, + "language_loss": 0.84128964, + "learning_rate": 0.0008640745834433437, + "loss": 0.85223687, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.3112793, + "step": 1368, + "time_per_iteration": 2.7703893184661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085473, + "balance_loss_mlp": 1.05559897, + "epoch": 0.2633705271258176, + "flos": 555235762176.0, + "grad_norm": 0.058958451803685384, + "language_loss": 0.86905044, + "learning_rate": 0.000863860975859738, + "loss": 0.87990516, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.29833984, + "step": 1369, + "time_per_iteration": 2.913543224334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093496, + "balance_loss_mlp": 1.06309724, + "epoch": 0.2635629088110812, + "flos": 552136711680.0, + "grad_norm": 0.07885033776141591, + "language_loss": 0.87845421, + "learning_rate": 0.0008636472270127733, + "loss": 0.8893891, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.3034668, + "step": 1370, + "time_per_iteration": 2.6615941524505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093443, + "balance_loss_mlp": 1.06368852, + "epoch": 0.2637552904963448, + "flos": 455752839168.0, + "grad_norm": 0.06686078076555955, + "language_loss": 0.90047085, + "learning_rate": 0.0008634333369854345, + "loss": 0.91140521, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.29736328, + "step": 1371, + "time_per_iteration": 2.611501932144165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109652, + "balance_loss_mlp": 1.06666958, + "epoch": 0.2639476721816083, + "flos": 612847323648.0, + "grad_norm": 0.05135890593758564, + "language_loss": 0.87519878, + "learning_rate": 0.0008632193058607608, + "loss": 0.88616395, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.29833984, + "step": 1372, + "time_per_iteration": 2.7420408725738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096239, + "balance_loss_mlp": 1.06681848, + "epoch": 0.2641400538668719, + "flos": 571645112832.0, + "grad_norm": 0.07070265457366111, + "language_loss": 0.80896008, + "learning_rate": 0.0008630051337218466, + "loss": 0.81992251, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.29394531, + "step": 1373, + "time_per_iteration": 2.694157123565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097092, + "balance_loss_mlp": 1.06762338, + "epoch": 0.2643324355521354, + "flos": 581979866112.0, + "grad_norm": 0.06318549857397857, + "language_loss": 0.8188293, + "learning_rate": 0.0008627908206518409, + "loss": 0.82980019, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.29418945, + "step": 1374, + "time_per_iteration": 2.703380823135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023426, + "balance_loss_mlp": 1.00330341, + "epoch": 0.264524817237399, + "flos": 1543845284352.0, + "grad_norm": 0.017765090827900253, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76174676, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.20117188, + "step": 1375, + "time_per_iteration": 4.995063781738281 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092603, + "balance_loss_mlp": 1.06237197, + "epoch": 0.26471719892266254, + "flos": 517783117824.0, + "grad_norm": 0.0561933760173491, + "language_loss": 0.9114545, + "learning_rate": 0.0008623617720514241, + "loss": 0.92238057, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.30224609, + "step": 1376, + "time_per_iteration": 2.666578769683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093572, + "balance_loss_mlp": 1.06276798, + "epoch": 0.26490958060792613, + "flos": 516936314880.0, + "grad_norm": 0.06268473823371516, + "language_loss": 0.84907627, + "learning_rate": 0.0008621470366875848, + "loss": 0.86001205, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.30761719, + "step": 1377, + "time_per_iteration": 2.576968193054199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087995, + "balance_loss_mlp": 1.05661869, + "epoch": 0.26510196229318966, + "flos": 596298350592.0, + "grad_norm": 0.05801174228437736, + "language_loss": 0.87514544, + "learning_rate": 0.0008619321607257966, + "loss": 0.88602537, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.31347656, + "step": 1378, + "time_per_iteration": 2.6873912811279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083836, + "balance_loss_mlp": 1.05396187, + "epoch": 0.26529434397845325, + "flos": 685488780288.0, + "grad_norm": 0.06612008054140536, + "language_loss": 0.81601393, + "learning_rate": 0.000861717144249482, + "loss": 0.82685226, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.2980957, + "step": 1379, + "time_per_iteration": 2.861531972885132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082319, + "balance_loss_mlp": 1.05220687, + "epoch": 0.26548672566371684, + "flos": 424127328768.0, + "grad_norm": 0.06041061044303736, + "language_loss": 0.89415485, + "learning_rate": 0.0008615019873421175, + "loss": 0.90497804, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.30053711, + "step": 1380, + "time_per_iteration": 2.4596564769744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080185, + "balance_loss_mlp": 1.04973865, + "epoch": 0.26567910734898037, + "flos": 489619012608.0, + "grad_norm": 0.12029414194163875, + "language_loss": 0.85435975, + "learning_rate": 0.0008612866900872349, + "loss": 0.86516166, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.30395508, + "step": 1381, + "time_per_iteration": 2.5492422580718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078246, + "balance_loss_mlp": 1.0483005, + "epoch": 0.26587148903424396, + "flos": 533947977216.0, + "grad_norm": 0.06111803920627532, + "language_loss": 0.87957448, + "learning_rate": 0.0008610712525684197, + "loss": 0.89035696, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.29882812, + "step": 1382, + "time_per_iteration": 2.632847309112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083959, + "balance_loss_mlp": 1.05356061, + "epoch": 0.2660638707195075, + "flos": 1017067732992.0, + "grad_norm": 0.07781171288722535, + "language_loss": 0.84130585, + "learning_rate": 0.0008608556748693121, + "loss": 0.85214543, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.3034668, + "step": 1383, + "time_per_iteration": 3.246919631958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108693, + "balance_loss_mlp": 1.05522013, + "epoch": 0.2662562524047711, + "flos": 523712148480.0, + "grad_norm": 0.052993237489823604, + "language_loss": 0.85963714, + "learning_rate": 0.000860639957073607, + "loss": 0.87050641, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.31689453, + "step": 1384, + "time_per_iteration": 2.7504889965057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086729, + "balance_loss_mlp": 1.05537665, + "epoch": 0.2664486340900346, + "flos": 552107598336.0, + "grad_norm": 0.06878538642870029, + "language_loss": 0.87610686, + "learning_rate": 0.0008604240992650534, + "loss": 0.88697416, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.31347656, + "step": 1385, + "time_per_iteration": 2.6546881198883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082661, + "balance_loss_mlp": 1.05135679, + "epoch": 0.2666410157752982, + "flos": 469895233536.0, + "grad_norm": 0.05853696199287041, + "language_loss": 0.89197159, + "learning_rate": 0.0008602081015274545, + "loss": 0.90279818, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.31274414, + "step": 1386, + "time_per_iteration": 2.7526328563690186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091919, + "balance_loss_mlp": 1.06061459, + "epoch": 0.2668333974605617, + "flos": 569645968896.0, + "grad_norm": 0.05264786586341277, + "language_loss": 0.83147365, + "learning_rate": 0.0008599919639446684, + "loss": 0.8423928, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.31274414, + "step": 1387, + "time_per_iteration": 2.6775026321411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093309, + "balance_loss_mlp": 1.06126583, + "epoch": 0.2670257791458253, + "flos": 398755325952.0, + "grad_norm": 0.06747698326814106, + "language_loss": 0.79790741, + "learning_rate": 0.000859775686600607, + "loss": 0.80884051, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.3203125, + "step": 1388, + "time_per_iteration": 2.535508871078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090634, + "balance_loss_mlp": 1.05921042, + "epoch": 0.2672181608310889, + "flos": 515587534848.0, + "grad_norm": 0.06336986871451572, + "language_loss": 0.84764999, + "learning_rate": 0.0008595592695792367, + "loss": 0.85855639, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.31396484, + "step": 1389, + "time_per_iteration": 2.6549055576324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097148, + "balance_loss_mlp": 1.06593931, + "epoch": 0.26741054251635243, + "flos": 507270864384.0, + "grad_norm": 0.055901377362424544, + "language_loss": 0.90619266, + "learning_rate": 0.0008593427129645778, + "loss": 0.91716409, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.31176758, + "step": 1390, + "time_per_iteration": 2.6070477962493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096463, + "balance_loss_mlp": 1.06542134, + "epoch": 0.267602924201616, + "flos": 576357783552.0, + "grad_norm": 0.06788313950064188, + "language_loss": 0.85213327, + "learning_rate": 0.0008591260168407052, + "loss": 0.86309791, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.31005859, + "step": 1391, + "time_per_iteration": 2.794921398162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092372, + "balance_loss_mlp": 1.05963671, + "epoch": 0.26779530588687955, + "flos": 523731087360.0, + "grad_norm": 0.052723370404498295, + "language_loss": 0.82993329, + "learning_rate": 0.0008589091812917479, + "loss": 0.84085703, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.32739258, + "step": 1392, + "time_per_iteration": 2.634734869003296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088674, + "balance_loss_mlp": 1.05727446, + "epoch": 0.26798768757214314, + "flos": 556508938752.0, + "grad_norm": 0.06846284491975779, + "language_loss": 0.85420829, + "learning_rate": 0.0008586922064018887, + "loss": 0.86509502, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.3137207, + "step": 1393, + "time_per_iteration": 2.662095308303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108591, + "balance_loss_mlp": 1.05408156, + "epoch": 0.2681800692574067, + "flos": 930246004224.0, + "grad_norm": 0.07721778370466406, + "language_loss": 0.89049023, + "learning_rate": 0.0008584750922553651, + "loss": 0.90134937, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.31811523, + "step": 1394, + "time_per_iteration": 3.15010666847229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082121, + "balance_loss_mlp": 1.05053067, + "epoch": 0.26837245094267026, + "flos": 700771931136.0, + "grad_norm": 0.054821616219537066, + "language_loss": 0.83275163, + "learning_rate": 0.0008582578389364677, + "loss": 0.8435728, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.31567383, + "step": 1395, + "time_per_iteration": 2.9199917316436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086932, + "balance_loss_mlp": 1.05469775, + "epoch": 0.26856483262793385, + "flos": 592892199936.0, + "grad_norm": 0.049938668546041676, + "language_loss": 0.91772366, + "learning_rate": 0.0008580404465295422, + "loss": 0.92859298, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.32226562, + "step": 1396, + "time_per_iteration": 2.8488125801086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079266, + "balance_loss_mlp": 1.04891562, + "epoch": 0.2687572143131974, + "flos": 713943866880.0, + "grad_norm": 0.06204428603549851, + "language_loss": 0.87966394, + "learning_rate": 0.0008578229151189876, + "loss": 0.89045662, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.30297852, + "step": 1397, + "time_per_iteration": 2.92258620262146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081241, + "balance_loss_mlp": 1.04867268, + "epoch": 0.26894959599846097, + "flos": 467481452544.0, + "grad_norm": 0.06429333021146523, + "language_loss": 0.81249309, + "learning_rate": 0.0008576052447892573, + "loss": 0.82330555, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.32568359, + "step": 1398, + "time_per_iteration": 2.551042318344116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083897, + "balance_loss_mlp": 1.05163908, + "epoch": 0.2691419776837245, + "flos": 468470850048.0, + "grad_norm": 0.0671833421183549, + "language_loss": 0.86040235, + "learning_rate": 0.000857387435624858, + "loss": 0.87124133, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.32250977, + "step": 1399, + "time_per_iteration": 2.5816056728363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086843, + "balance_loss_mlp": 1.05382252, + "epoch": 0.2693343593689881, + "flos": 937244418048.0, + "grad_norm": 0.05003222473195782, + "language_loss": 0.87953913, + "learning_rate": 0.0008571694877103513, + "loss": 0.89040762, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.33032227, + "step": 1400, + "time_per_iteration": 3.256469488143921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108756, + "balance_loss_mlp": 1.05542135, + "epoch": 0.2695267410542516, + "flos": 577303511040.0, + "grad_norm": 0.056643414184275494, + "language_loss": 0.87665725, + "learning_rate": 0.0008569514011303515, + "loss": 0.88753277, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.32128906, + "step": 1401, + "time_per_iteration": 2.782273054122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084611, + "balance_loss_mlp": 1.05275857, + "epoch": 0.2697191227395152, + "flos": 556539462144.0, + "grad_norm": 0.06127144796082157, + "language_loss": 0.8767277, + "learning_rate": 0.0008567331759695277, + "loss": 0.88757378, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.31835938, + "step": 1402, + "time_per_iteration": 2.696514368057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084545, + "balance_loss_mlp": 1.05178595, + "epoch": 0.26991150442477874, + "flos": 529024310784.0, + "grad_norm": 0.07491599518741582, + "language_loss": 0.86524475, + "learning_rate": 0.0008565148123126023, + "loss": 0.87609023, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.32763672, + "step": 1403, + "time_per_iteration": 2.6686785221099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088194, + "balance_loss_mlp": 1.05510116, + "epoch": 0.2701038861100423, + "flos": 531737837568.0, + "grad_norm": 0.050644669708274456, + "language_loss": 0.8574301, + "learning_rate": 0.0008562963102443516, + "loss": 0.86831206, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.33105469, + "step": 1404, + "time_per_iteration": 2.693836212158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085916, + "balance_loss_mlp": 1.05232334, + "epoch": 0.2702962677953059, + "flos": 734908737024.0, + "grad_norm": 0.06951419199959312, + "language_loss": 0.84958577, + "learning_rate": 0.0008560776698496056, + "loss": 0.8604449, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.33618164, + "step": 1405, + "time_per_iteration": 2.892805814743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084222, + "balance_loss_mlp": 1.05093896, + "epoch": 0.27048864948056944, + "flos": 574453181952.0, + "grad_norm": 0.07287556066439085, + "language_loss": 0.85794389, + "learning_rate": 0.0008558588912132481, + "loss": 0.8687861, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.33300781, + "step": 1406, + "time_per_iteration": 2.821922540664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098005, + "balance_loss_mlp": 1.07587957, + "epoch": 0.27068103116583303, + "flos": 1423091953152.0, + "grad_norm": 0.044578698770804955, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77556992, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.22167969, + "step": 1407, + "time_per_iteration": 4.952622652053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082949, + "balance_loss_mlp": 1.05016637, + "epoch": 0.27087341285109656, + "flos": 531742219776.0, + "grad_norm": 0.05991157104862915, + "language_loss": 0.82959783, + "learning_rate": 0.0008554209195555016, + "loss": 0.84042734, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.32788086, + "step": 1408, + "time_per_iteration": 2.6937575340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085788, + "balance_loss_mlp": 1.05403042, + "epoch": 0.27106579453636015, + "flos": 581108332032.0, + "grad_norm": 0.06960051295953752, + "language_loss": 0.88047969, + "learning_rate": 0.0008552017267041483, + "loss": 0.89133757, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.31738281, + "step": 1409, + "time_per_iteration": 2.7926084995269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093218, + "balance_loss_mlp": 1.06134176, + "epoch": 0.2712581762216237, + "flos": 506533160448.0, + "grad_norm": 0.07424010893339522, + "language_loss": 0.8324914, + "learning_rate": 0.0008549823959512549, + "loss": 0.8434236, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.31860352, + "step": 1410, + "time_per_iteration": 2.660325050354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098331, + "balance_loss_mlp": 1.06724083, + "epoch": 0.27145055790688727, + "flos": 997019476992.0, + "grad_norm": 0.062062202361739795, + "language_loss": 0.86755967, + "learning_rate": 0.0008547629273819728, + "loss": 0.87854296, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.31054688, + "step": 1411, + "time_per_iteration": 3.3994545936584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098737, + "balance_loss_mlp": 1.06736147, + "epoch": 0.2716429395921508, + "flos": 546420086784.0, + "grad_norm": 0.06335672358829844, + "language_loss": 0.83453959, + "learning_rate": 0.0008545433210815074, + "loss": 0.84552693, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.31347656, + "step": 1412, + "time_per_iteration": 2.644434690475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102874, + "balance_loss_mlp": 1.07123613, + "epoch": 0.2718353212774144, + "flos": 572954605056.0, + "grad_norm": 0.06340025797507488, + "language_loss": 0.87345338, + "learning_rate": 0.0008543235771351176, + "loss": 0.88448215, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.31616211, + "step": 1413, + "time_per_iteration": 2.7854721546173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098411, + "balance_loss_mlp": 1.0675596, + "epoch": 0.272027702962678, + "flos": 643986823680.0, + "grad_norm": 0.05399278560092938, + "language_loss": 0.84545946, + "learning_rate": 0.0008541036956281154, + "loss": 0.85644352, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.30834961, + "step": 1414, + "time_per_iteration": 2.8788704872131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091962, + "balance_loss_mlp": 1.06056201, + "epoch": 0.2722200846479415, + "flos": 653410755072.0, + "grad_norm": 0.07883268546047513, + "language_loss": 0.81883514, + "learning_rate": 0.0008538836766458665, + "loss": 0.82975471, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.3137207, + "step": 1415, + "time_per_iteration": 2.8526153564453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087599, + "balance_loss_mlp": 1.05732012, + "epoch": 0.2724124663332051, + "flos": 579346324992.0, + "grad_norm": 0.060849568603238105, + "language_loss": 0.84889638, + "learning_rate": 0.0008536635202737897, + "loss": 0.85977244, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.30224609, + "step": 1416, + "time_per_iteration": 2.837353467941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089772, + "balance_loss_mlp": 1.05903983, + "epoch": 0.2726048480184686, + "flos": 537178037760.0, + "grad_norm": 0.07898075745209039, + "language_loss": 0.82057679, + "learning_rate": 0.0008534432265973573, + "loss": 0.83147448, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.30688477, + "step": 1417, + "time_per_iteration": 2.5948355197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091815, + "balance_loss_mlp": 1.05891299, + "epoch": 0.2727972297037322, + "flos": 995360776704.0, + "grad_norm": 0.06605458024108496, + "language_loss": 0.87714171, + "learning_rate": 0.000853222795702095, + "loss": 0.88805991, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.32910156, + "step": 1418, + "time_per_iteration": 3.4183547496795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109188, + "balance_loss_mlp": 1.05842948, + "epoch": 0.27298961138899575, + "flos": 605924513280.0, + "grad_norm": 0.04642939327926388, + "language_loss": 0.83471483, + "learning_rate": 0.0008530022276735813, + "loss": 0.84563363, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.33447266, + "step": 1419, + "time_per_iteration": 2.711695432662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086506, + "balance_loss_mlp": 1.05293703, + "epoch": 0.27318199307425933, + "flos": 529059216384.0, + "grad_norm": 0.05938997521105461, + "language_loss": 0.85724676, + "learning_rate": 0.0008527815225974489, + "loss": 0.86811179, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.3359375, + "step": 1420, + "time_per_iteration": 2.648448944091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086523, + "balance_loss_mlp": 1.05407453, + "epoch": 0.2733743747595229, + "flos": 408809272320.0, + "grad_norm": 0.07492898694353861, + "language_loss": 0.87982917, + "learning_rate": 0.0008525606805593829, + "loss": 0.89069438, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.32446289, + "step": 1421, + "time_per_iteration": 2.4182560443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082421, + "balance_loss_mlp": 1.04997277, + "epoch": 0.27356675644478645, + "flos": 515976030720.0, + "grad_norm": 0.06962089633364145, + "language_loss": 0.82760686, + "learning_rate": 0.0008523397016451213, + "loss": 0.83843112, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.32446289, + "step": 1422, + "time_per_iteration": 2.587892532348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083477, + "balance_loss_mlp": 1.05021799, + "epoch": 0.27375913813005004, + "flos": 1051914539520.0, + "grad_norm": 0.053513553181154576, + "language_loss": 0.8711561, + "learning_rate": 0.0008521185859404564, + "loss": 0.88199091, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.33276367, + "step": 1423, + "time_per_iteration": 3.372192859649658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108477, + "balance_loss_mlp": 1.0513202, + "epoch": 0.27395151981531357, + "flos": 624507535872.0, + "grad_norm": 0.059986100163812936, + "language_loss": 0.89238524, + "learning_rate": 0.0008518973335312326, + "loss": 0.90323293, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.33447266, + "step": 1424, + "time_per_iteration": 2.791482448577881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082662, + "balance_loss_mlp": 1.04921198, + "epoch": 0.27414390150057716, + "flos": 550112836608.0, + "grad_norm": 0.06956472940992567, + "language_loss": 0.8333236, + "learning_rate": 0.0008516759445033477, + "loss": 0.84415025, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.3347168, + "step": 1425, + "time_per_iteration": 2.6889073848724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082757, + "balance_loss_mlp": 1.05088091, + "epoch": 0.2743362831858407, + "flos": 539596200960.0, + "grad_norm": 0.0615305422895171, + "language_loss": 0.84459686, + "learning_rate": 0.0008514544189427526, + "loss": 0.85542446, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.31860352, + "step": 1426, + "time_per_iteration": 2.797384738922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094312, + "balance_loss_mlp": 1.06143463, + "epoch": 0.2745286648711043, + "flos": 468352986624.0, + "grad_norm": 0.061840511174045036, + "language_loss": 0.86558306, + "learning_rate": 0.0008512327569354511, + "loss": 0.87652624, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.32885742, + "step": 1427, + "time_per_iteration": 2.533623695373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096344, + "balance_loss_mlp": 1.06418157, + "epoch": 0.2747210465563678, + "flos": 472617524736.0, + "grad_norm": 0.06551541099381472, + "language_loss": 0.83328068, + "learning_rate": 0.0008510109585675001, + "loss": 0.84424412, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.3215332, + "step": 1428, + "time_per_iteration": 2.623915672302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125702, + "balance_loss_mlp": 1.10653293, + "epoch": 0.2749134282416314, + "flos": 1314345070080.0, + "grad_norm": 0.06717437310459566, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82279044, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.19140625, + "step": 1429, + "time_per_iteration": 4.737167596817017 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096832, + "balance_loss_mlp": 1.06517005, + "epoch": 0.275105809926895, + "flos": 970445670912.0, + "grad_norm": 0.06718416370196487, + "language_loss": 0.80457842, + "learning_rate": 0.0008505669530941415, + "loss": 0.81554675, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.31640625, + "step": 1430, + "time_per_iteration": 3.380617141723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103498, + "balance_loss_mlp": 1.07169294, + "epoch": 0.2752981916121585, + "flos": 527089185792.0, + "grad_norm": 0.06498994038544256, + "language_loss": 0.83560073, + "learning_rate": 0.000850344746161112, + "loss": 0.8466357, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.31787109, + "step": 1431, + "time_per_iteration": 2.5917775630950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110093, + "balance_loss_mlp": 1.06883883, + "epoch": 0.2754905732974221, + "flos": 453487444992.0, + "grad_norm": 0.06649249705457211, + "language_loss": 0.87664711, + "learning_rate": 0.0008501224032121894, + "loss": 0.88765645, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.32080078, + "step": 1432, + "time_per_iteration": 2.493826150894165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101828, + "balance_loss_mlp": 1.06906962, + "epoch": 0.27568295498268564, + "flos": 497216918016.0, + "grad_norm": 0.06530156063230687, + "language_loss": 0.8172394, + "learning_rate": 0.0008498999243336946, + "loss": 0.82825768, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.32763672, + "step": 1433, + "time_per_iteration": 2.625955581665039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104254, + "balance_loss_mlp": 1.07275844, + "epoch": 0.2758753366679492, + "flos": 607890161664.0, + "grad_norm": 0.056445052388478564, + "language_loss": 0.87110436, + "learning_rate": 0.0008496773096120021, + "loss": 0.88214689, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.31469727, + "step": 1434, + "time_per_iteration": 2.8644402027130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093048, + "balance_loss_mlp": 1.06169593, + "epoch": 0.27606771835321275, + "flos": 739803290112.0, + "grad_norm": 0.07767765628739494, + "language_loss": 0.84306771, + "learning_rate": 0.0008494545591335381, + "loss": 0.85399818, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.31323242, + "step": 1435, + "time_per_iteration": 2.9069130420684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094657, + "balance_loss_mlp": 1.06366265, + "epoch": 0.27626010003847634, + "flos": 554279860224.0, + "grad_norm": 0.04344696113506711, + "language_loss": 0.86938953, + "learning_rate": 0.0008492316729847823, + "loss": 0.88033605, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.30957031, + "step": 1436, + "time_per_iteration": 2.844926595687866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091812, + "balance_loss_mlp": 1.06050754, + "epoch": 0.2764524817237399, + "flos": 542270439936.0, + "grad_norm": 0.055139322891005815, + "language_loss": 0.79749823, + "learning_rate": 0.0008490086512522664, + "loss": 0.80841637, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.31274414, + "step": 1437, + "time_per_iteration": 2.722158670425415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092682, + "balance_loss_mlp": 1.06121063, + "epoch": 0.27664486340900346, + "flos": 406027344384.0, + "grad_norm": 0.06334111858493886, + "language_loss": 0.90728873, + "learning_rate": 0.0008487854940225755, + "loss": 0.91821557, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.31445312, + "step": 1438, + "time_per_iteration": 2.43622088432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091549, + "balance_loss_mlp": 1.05991077, + "epoch": 0.27683724509426705, + "flos": 521884712448.0, + "grad_norm": 0.05907133214000555, + "language_loss": 0.89962572, + "learning_rate": 0.0008485622013823466, + "loss": 0.91054124, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.31616211, + "step": 1439, + "time_per_iteration": 2.61773943901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093806, + "balance_loss_mlp": 1.06154847, + "epoch": 0.2770296267795306, + "flos": 535085761536.0, + "grad_norm": 0.06492331678063241, + "language_loss": 0.82635379, + "learning_rate": 0.00084833877341827, + "loss": 0.83729184, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.32250977, + "step": 1440, + "time_per_iteration": 2.625870704650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092721, + "balance_loss_mlp": 1.06139278, + "epoch": 0.27722200846479417, + "flos": 487747906560.0, + "grad_norm": 0.06674971698169922, + "language_loss": 0.80478823, + "learning_rate": 0.000848115210217088, + "loss": 0.81571543, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.31298828, + "step": 1441, + "time_per_iteration": 2.577364444732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086558, + "balance_loss_mlp": 1.05410933, + "epoch": 0.2774143901500577, + "flos": 618012509184.0, + "grad_norm": 0.055312199129178424, + "language_loss": 0.81684244, + "learning_rate": 0.0008478915118655952, + "loss": 0.82770801, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.32446289, + "step": 1442, + "time_per_iteration": 2.714303493499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089692, + "balance_loss_mlp": 1.05710077, + "epoch": 0.2776067718353213, + "flos": 513563659776.0, + "grad_norm": 0.049794988647852687, + "language_loss": 0.86386287, + "learning_rate": 0.0008476676784506393, + "loss": 0.87475979, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.32592773, + "step": 1443, + "time_per_iteration": 2.68835711479187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087996, + "balance_loss_mlp": 1.05664372, + "epoch": 0.2777991535205848, + "flos": 1003985957376.0, + "grad_norm": 0.05900532389488003, + "language_loss": 0.82031631, + "learning_rate": 0.0008474437100591201, + "loss": 0.83119631, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.31323242, + "step": 1444, + "time_per_iteration": 3.3359997272491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084677, + "balance_loss_mlp": 1.05160809, + "epoch": 0.2779915352058484, + "flos": 550005147648.0, + "grad_norm": 0.054436577911169556, + "language_loss": 0.85231566, + "learning_rate": 0.0008472196067779898, + "loss": 0.86316246, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.33081055, + "step": 1445, + "time_per_iteration": 2.7946455478668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080884, + "balance_loss_mlp": 1.04850721, + "epoch": 0.278183916891112, + "flos": 873444930048.0, + "grad_norm": 0.08667298623079295, + "language_loss": 0.85239732, + "learning_rate": 0.0008469953686942531, + "loss": 0.86320615, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.32373047, + "step": 1446, + "time_per_iteration": 3.0761613845825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081651, + "balance_loss_mlp": 1.04927349, + "epoch": 0.2783762985763755, + "flos": 623782978560.0, + "grad_norm": 0.07591437330096602, + "language_loss": 0.8283245, + "learning_rate": 0.0008467709958949668, + "loss": 0.83914101, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.32373047, + "step": 1447, + "time_per_iteration": 2.7922093868255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082166, + "balance_loss_mlp": 1.0504328, + "epoch": 0.2785686802616391, + "flos": 581571021312.0, + "grad_norm": 0.0636917665663464, + "language_loss": 0.86192262, + "learning_rate": 0.0008465464884672403, + "loss": 0.8727442, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.31713867, + "step": 1448, + "time_per_iteration": 2.679574966430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108413, + "balance_loss_mlp": 1.05211091, + "epoch": 0.27876106194690264, + "flos": 587032980480.0, + "grad_norm": 0.06494062959974968, + "language_loss": 0.85664314, + "learning_rate": 0.0008463218464982348, + "loss": 0.86748445, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.32006836, + "step": 1449, + "time_per_iteration": 2.8746044635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086598, + "balance_loss_mlp": 1.05524611, + "epoch": 0.27895344363216623, + "flos": 875621574144.0, + "grad_norm": 0.05859002353759583, + "language_loss": 0.87554371, + "learning_rate": 0.0008460970700751645, + "loss": 0.88640976, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.31323242, + "step": 1450, + "time_per_iteration": 3.0630292892456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085303, + "balance_loss_mlp": 1.05447531, + "epoch": 0.27914582531742976, + "flos": 603630005760.0, + "grad_norm": 0.06644970008868617, + "language_loss": 0.8732717, + "learning_rate": 0.000845872159285295, + "loss": 0.8841247, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.30786133, + "step": 1451, + "time_per_iteration": 2.7334539890289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149095, + "balance_loss_mlp": 1.13173842, + "epoch": 0.27933820700269335, + "flos": 1496892953088.0, + "grad_norm": 0.04059568749878616, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78915942, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.17382812, + "step": 1452, + "time_per_iteration": 4.913143634796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087672, + "balance_loss_mlp": 1.05617714, + "epoch": 0.2795305886879569, + "flos": 1031445854208.0, + "grad_norm": 0.05755695164820471, + "language_loss": 0.86085773, + "learning_rate": 0.0008454219349544836, + "loss": 0.87173438, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.31469727, + "step": 1453, + "time_per_iteration": 3.3649299144744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086718, + "balance_loss_mlp": 1.05569983, + "epoch": 0.27972297037322047, + "flos": 606766934016.0, + "grad_norm": 0.059728326526783365, + "language_loss": 0.8137995, + "learning_rate": 0.000845196621588334, + "loss": 0.82466674, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.30981445, + "step": 1454, + "time_per_iteration": 2.7774734497070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082042, + "balance_loss_mlp": 1.05095196, + "epoch": 0.27991535205848406, + "flos": 630085948416.0, + "grad_norm": 0.0559695634724148, + "language_loss": 0.76184201, + "learning_rate": 0.0008449711742049706, + "loss": 0.77266252, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.31054688, + "step": 1455, + "time_per_iteration": 2.75393009185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107969, + "balance_loss_mlp": 1.04814696, + "epoch": 0.2801077337437476, + "flos": 549034689024.0, + "grad_norm": 0.06397369460964857, + "language_loss": 0.83309555, + "learning_rate": 0.0008447455928919196, + "loss": 0.84389246, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.31518555, + "step": 1456, + "time_per_iteration": 2.6542584896087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082481, + "balance_loss_mlp": 1.05177259, + "epoch": 0.2803001154290112, + "flos": 486516989952.0, + "grad_norm": 0.06274060179370718, + "language_loss": 0.86886203, + "learning_rate": 0.0008445198777367595, + "loss": 0.87968683, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.30664062, + "step": 1457, + "time_per_iteration": 2.6488282680511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089589, + "balance_loss_mlp": 1.05883336, + "epoch": 0.2804924971142747, + "flos": 521820693504.0, + "grad_norm": 0.06557026121847803, + "language_loss": 0.8106361, + "learning_rate": 0.0008442940288271208, + "loss": 0.82153201, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.30712891, + "step": 1458, + "time_per_iteration": 2.67258882522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096326, + "balance_loss_mlp": 1.06454456, + "epoch": 0.2806848787995383, + "flos": 527410690560.0, + "grad_norm": 0.07361561415976156, + "language_loss": 0.86939961, + "learning_rate": 0.0008440680462506856, + "loss": 0.88036287, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.31762695, + "step": 1459, + "time_per_iteration": 2.7335550785064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105041, + "balance_loss_mlp": 1.07354569, + "epoch": 0.2808772604848018, + "flos": 485246785536.0, + "grad_norm": 0.05419081251366802, + "language_loss": 0.86197531, + "learning_rate": 0.0008438419300951883, + "loss": 0.87302566, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.31469727, + "step": 1460, + "time_per_iteration": 2.6306796073913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106375, + "balance_loss_mlp": 1.07459426, + "epoch": 0.2810696421700654, + "flos": 617840801280.0, + "grad_norm": 0.08520166677325354, + "language_loss": 0.8634038, + "learning_rate": 0.0008436156804484148, + "loss": 0.87446761, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.31762695, + "step": 1461, + "time_per_iteration": 2.761599063873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110218, + "balance_loss_mlp": 1.0698266, + "epoch": 0.28126202385532895, + "flos": 454521922560.0, + "grad_norm": 0.06649626079325978, + "language_loss": 0.88025403, + "learning_rate": 0.0008433892973982031, + "loss": 0.89127588, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.32348633, + "step": 1462, + "time_per_iteration": 2.572810173034668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110576, + "balance_loss_mlp": 1.07333505, + "epoch": 0.28145440554059253, + "flos": 530447284224.0, + "grad_norm": 0.06397092621415032, + "language_loss": 0.85030043, + "learning_rate": 0.0008431627810324431, + "loss": 0.86135799, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.32421875, + "step": 1463, + "time_per_iteration": 2.6855740547180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109382, + "balance_loss_mlp": 1.0774579, + "epoch": 0.2816467872258561, + "flos": 451996070400.0, + "grad_norm": 0.06457367310459801, + "language_loss": 0.81006026, + "learning_rate": 0.000842936131439076, + "loss": 0.82115412, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.3190918, + "step": 1464, + "time_per_iteration": 2.5868756771087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102286, + "balance_loss_mlp": 1.07188725, + "epoch": 0.28183916891111965, + "flos": 472464755712.0, + "grad_norm": 0.06483114531916107, + "language_loss": 0.87564301, + "learning_rate": 0.0008427093487060951, + "loss": 0.88666582, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.3034668, + "step": 1465, + "time_per_iteration": 2.6775078773498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104224, + "balance_loss_mlp": 1.07294393, + "epoch": 0.28203155059638324, + "flos": 556770806784.0, + "grad_norm": 0.05163652452488039, + "language_loss": 0.84608126, + "learning_rate": 0.000842482432921545, + "loss": 0.85712349, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.3125, + "step": 1466, + "time_per_iteration": 2.844379186630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090816, + "balance_loss_mlp": 1.05955911, + "epoch": 0.28222393228164677, + "flos": 416756385792.0, + "grad_norm": 0.05726454257462379, + "language_loss": 0.86823475, + "learning_rate": 0.0008422553841735225, + "loss": 0.87914288, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.31225586, + "step": 1467, + "time_per_iteration": 2.4838902950286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087334, + "balance_loss_mlp": 1.05624461, + "epoch": 0.28241631396691036, + "flos": 604629577728.0, + "grad_norm": 0.07863392491108157, + "language_loss": 0.8442952, + "learning_rate": 0.0008420282025501757, + "loss": 0.85516858, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.31054688, + "step": 1468, + "time_per_iteration": 2.7528913021087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108248, + "balance_loss_mlp": 1.05169988, + "epoch": 0.2826086956521739, + "flos": 572698529280.0, + "grad_norm": 0.056003117579575636, + "language_loss": 0.852718, + "learning_rate": 0.0008418008881397043, + "loss": 0.86354285, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.30737305, + "step": 1469, + "time_per_iteration": 2.6801319122314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078886, + "balance_loss_mlp": 1.0479157, + "epoch": 0.2828010773374375, + "flos": 842367886848.0, + "grad_norm": 0.04937894089719141, + "language_loss": 0.82587177, + "learning_rate": 0.0008415734410303595, + "loss": 0.83666062, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.30932617, + "step": 1470, + "time_per_iteration": 3.1880481243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076388, + "balance_loss_mlp": 1.04551327, + "epoch": 0.28299345902270107, + "flos": 542402860032.0, + "grad_norm": 0.053571151454841835, + "language_loss": 0.90790403, + "learning_rate": 0.0008413458613104444, + "loss": 0.91866791, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.30834961, + "step": 1471, + "time_per_iteration": 2.6801347732543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107956, + "balance_loss_mlp": 1.04832768, + "epoch": 0.2831858407079646, + "flos": 571320635904.0, + "grad_norm": 0.054274543729309115, + "language_loss": 0.82964969, + "learning_rate": 0.0008411181490683129, + "loss": 0.84044528, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.31201172, + "step": 1472, + "time_per_iteration": 2.732304096221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107702, + "balance_loss_mlp": 1.04619205, + "epoch": 0.2833782223932282, + "flos": 763491861504.0, + "grad_norm": 0.05901735675502878, + "language_loss": 0.82318664, + "learning_rate": 0.0008408903043923707, + "loss": 0.83395684, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.30786133, + "step": 1473, + "time_per_iteration": 3.0503528118133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079988, + "balance_loss_mlp": 1.04906487, + "epoch": 0.2835706040784917, + "flos": 538793068032.0, + "grad_norm": 0.06313039437285956, + "language_loss": 0.81015414, + "learning_rate": 0.0008406623273710754, + "loss": 0.82095402, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.30883789, + "step": 1474, + "time_per_iteration": 2.606189727783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081508, + "balance_loss_mlp": 1.05008459, + "epoch": 0.2837629857637553, + "flos": 530329420800.0, + "grad_norm": 0.06295911479055617, + "language_loss": 0.82597101, + "learning_rate": 0.0008404342180929351, + "loss": 0.83678609, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.31396484, + "step": 1475, + "time_per_iteration": 2.620607614517212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073574, + "balance_loss_mlp": 1.04222226, + "epoch": 0.28395536744901884, + "flos": 539763526656.0, + "grad_norm": 0.06425181584365489, + "language_loss": 0.81938702, + "learning_rate": 0.00084020597664651, + "loss": 0.83012277, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.31323242, + "step": 1476, + "time_per_iteration": 2.7725043296813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083628, + "balance_loss_mlp": 1.05232406, + "epoch": 0.2841477491342824, + "flos": 573344510976.0, + "grad_norm": 0.06074887859321084, + "language_loss": 0.83907133, + "learning_rate": 0.0008399776031204111, + "loss": 0.84990764, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.31274414, + "step": 1477, + "time_per_iteration": 2.7300467491149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092258, + "balance_loss_mlp": 1.06081057, + "epoch": 0.28434013081954596, + "flos": 571802264064.0, + "grad_norm": 0.05838491012274946, + "language_loss": 0.80185568, + "learning_rate": 0.0008397490976033009, + "loss": 0.81277823, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.31420898, + "step": 1478, + "time_per_iteration": 2.650667905807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080543, + "balance_loss_mlp": 1.062042, + "epoch": 0.28453251250480954, + "flos": 1552554832896.0, + "grad_norm": 0.03640521186287318, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78960192, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.18457031, + "step": 1479, + "time_per_iteration": 4.764774322509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108592, + "balance_loss_mlp": 1.07654858, + "epoch": 0.28472489419007313, + "flos": 748720862208.0, + "grad_norm": 0.05702144306517339, + "language_loss": 0.85150903, + "learning_rate": 0.0008392916909509525, + "loss": 0.86259496, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.3203125, + "step": 1480, + "time_per_iteration": 3.0437960624694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104226, + "balance_loss_mlp": 1.07289815, + "epoch": 0.28491727587533666, + "flos": 489914376192.0, + "grad_norm": 0.06780557774925215, + "language_loss": 0.84802043, + "learning_rate": 0.0008390627899932954, + "loss": 0.85906273, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.31298828, + "step": 1481, + "time_per_iteration": 2.596781015396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100766, + "balance_loss_mlp": 1.0693903, + "epoch": 0.28510965756060025, + "flos": 728671196160.0, + "grad_norm": 0.07875184362779108, + "language_loss": 0.88996881, + "learning_rate": 0.000838833757399789, + "loss": 0.90097642, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.31347656, + "step": 1482, + "time_per_iteration": 2.94795560836792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084024, + "balance_loss_mlp": 1.05274367, + "epoch": 0.2853020392458638, + "flos": 551300083200.0, + "grad_norm": 0.07597770471398792, + "language_loss": 0.80484587, + "learning_rate": 0.0008386045932593515, + "loss": 0.81568611, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.3125, + "step": 1483, + "time_per_iteration": 2.6795289516448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079675, + "balance_loss_mlp": 1.0484184, + "epoch": 0.28549442093112737, + "flos": 754456425984.0, + "grad_norm": 0.05859914190414705, + "language_loss": 0.86136287, + "learning_rate": 0.0008383752976609525, + "loss": 0.8721596, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.31225586, + "step": 1484, + "time_per_iteration": 2.900468349456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080474, + "balance_loss_mlp": 1.04878783, + "epoch": 0.2856868026163909, + "flos": 538311439872.0, + "grad_norm": 0.0559282187978278, + "language_loss": 0.80215633, + "learning_rate": 0.0008381458706936123, + "loss": 0.81296104, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.31665039, + "step": 1485, + "time_per_iteration": 2.6815216541290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081909, + "balance_loss_mlp": 1.05031872, + "epoch": 0.2858791843016545, + "flos": 583487207424.0, + "grad_norm": 0.06658109550051822, + "language_loss": 0.87213105, + "learning_rate": 0.0008379163124464025, + "loss": 0.88295019, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.31567383, + "step": 1486, + "time_per_iteration": 2.7246947288513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098145, + "balance_loss_mlp": 1.06572032, + "epoch": 0.286071565986918, + "flos": 644503357440.0, + "grad_norm": 0.06266105362217729, + "language_loss": 0.76595891, + "learning_rate": 0.0008376866230084452, + "loss": 0.77694035, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.32421875, + "step": 1487, + "time_per_iteration": 2.8626444339752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102537, + "balance_loss_mlp": 1.07006407, + "epoch": 0.2862639476721816, + "flos": 491120561664.0, + "grad_norm": 0.07368717199594518, + "language_loss": 0.86109662, + "learning_rate": 0.000837456802468914, + "loss": 0.87212193, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.32470703, + "step": 1488, + "time_per_iteration": 2.5964457988739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109506, + "balance_loss_mlp": 1.07736683, + "epoch": 0.2864563293574452, + "flos": 521363796480.0, + "grad_norm": 0.0834333673185767, + "language_loss": 0.85148358, + "learning_rate": 0.0008372268509170331, + "loss": 0.86257863, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.32128906, + "step": 1489, + "time_per_iteration": 2.690129518508911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109667, + "balance_loss_mlp": 1.06500769, + "epoch": 0.2866487110427087, + "flos": 546834723840.0, + "grad_norm": 0.06354137393554884, + "language_loss": 0.84668255, + "learning_rate": 0.0008369967684420779, + "loss": 0.85764927, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.31640625, + "step": 1490, + "time_per_iteration": 2.71195912361145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084028, + "balance_loss_mlp": 1.0523901, + "epoch": 0.2868410927279723, + "flos": 481977437184.0, + "grad_norm": 0.054809792311278624, + "language_loss": 0.84395373, + "learning_rate": 0.0008367665551333736, + "loss": 0.85479403, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.31616211, + "step": 1491, + "time_per_iteration": 2.604795217514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083682, + "balance_loss_mlp": 1.05223465, + "epoch": 0.28703347441323585, + "flos": 724578365952.0, + "grad_norm": 0.06594588712207736, + "language_loss": 0.85254663, + "learning_rate": 0.0008365362110802977, + "loss": 0.86338341, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.31420898, + "step": 1492, + "time_per_iteration": 2.8853299617767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086881, + "balance_loss_mlp": 1.05619645, + "epoch": 0.28722585609849943, + "flos": 634670581248.0, + "grad_norm": 0.057648204576232445, + "language_loss": 0.82509673, + "learning_rate": 0.0008363057363722773, + "loss": 0.83596557, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.30664062, + "step": 1493, + "time_per_iteration": 2.8410117626190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088416, + "balance_loss_mlp": 1.05916238, + "epoch": 0.28741823778376296, + "flos": 509974216704.0, + "grad_norm": 0.06315135639172008, + "language_loss": 0.8381595, + "learning_rate": 0.0008360751310987906, + "loss": 0.84904373, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.29199219, + "step": 1494, + "time_per_iteration": 2.6032519340515137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088448, + "balance_loss_mlp": 1.05821633, + "epoch": 0.28761061946902655, + "flos": 603458297856.0, + "grad_norm": 0.0504042487563093, + "language_loss": 0.85491359, + "learning_rate": 0.0008358443953493666, + "loss": 0.865798, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.30175781, + "step": 1495, + "time_per_iteration": 2.859473943710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095118, + "balance_loss_mlp": 1.06586444, + "epoch": 0.28780300115429014, + "flos": 406977454080.0, + "grad_norm": 0.05765908021852543, + "language_loss": 0.87930727, + "learning_rate": 0.0008356135292135851, + "loss": 0.89025843, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.29223633, + "step": 1496, + "time_per_iteration": 2.5534088611602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092831, + "balance_loss_mlp": 1.06357718, + "epoch": 0.28799538283955367, + "flos": 374726310912.0, + "grad_norm": 0.06886872222290924, + "language_loss": 0.91869086, + "learning_rate": 0.0008353825327810758, + "loss": 0.92961913, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.29223633, + "step": 1497, + "time_per_iteration": 2.4516804218292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109953, + "balance_loss_mlp": 1.0700376, + "epoch": 0.28818776452481726, + "flos": 591645316608.0, + "grad_norm": 0.06787386534843613, + "language_loss": 0.81638563, + "learning_rate": 0.00083515140614152, + "loss": 0.8273809, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.29467773, + "step": 1498, + "time_per_iteration": 2.6799356937408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100433, + "balance_loss_mlp": 1.07136989, + "epoch": 0.2883801462100808, + "flos": 534819511296.0, + "grad_norm": 0.07094138317708479, + "language_loss": 0.861467, + "learning_rate": 0.0008349201493846485, + "loss": 0.87247133, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.2902832, + "step": 1499, + "time_per_iteration": 2.6408841609954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101106, + "balance_loss_mlp": 1.07190013, + "epoch": 0.2885725278953444, + "flos": 479850255360.0, + "grad_norm": 0.05864167405563355, + "language_loss": 0.88756049, + "learning_rate": 0.0008346887626002432, + "loss": 0.89857149, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.29174805, + "step": 1500, + "time_per_iteration": 2.527707099914551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102169, + "balance_loss_mlp": 1.07277215, + "epoch": 0.2887649095806079, + "flos": 463798877184.0, + "grad_norm": 0.05528939811548228, + "language_loss": 0.8596012, + "learning_rate": 0.000834457245878137, + "loss": 0.87062287, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.29345703, + "step": 1501, + "time_per_iteration": 2.6287105083465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097625, + "balance_loss_mlp": 1.0678941, + "epoch": 0.2889572912658715, + "flos": 930631527936.0, + "grad_norm": 0.05829487367290223, + "language_loss": 0.81370407, + "learning_rate": 0.000834225599308212, + "loss": 0.82468033, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.296875, + "step": 1502, + "time_per_iteration": 3.2405459880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097665, + "balance_loss_mlp": 1.06762409, + "epoch": 0.28914967295113503, + "flos": 569848200192.0, + "grad_norm": 0.0632270740356206, + "language_loss": 0.85299563, + "learning_rate": 0.0008339938229804016, + "loss": 0.86397231, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.30029297, + "step": 1503, + "time_per_iteration": 2.736917495727539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238462, + "balance_loss_mlp": 1.22091448, + "epoch": 0.2893420546363986, + "flos": 1485803119104.0, + "grad_norm": 0.0713987899259734, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76673281, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.17578125, + "step": 1504, + "time_per_iteration": 4.942230701446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085898, + "balance_loss_mlp": 1.0553329, + "epoch": 0.2895344363216622, + "flos": 469938903552.0, + "grad_norm": 0.06317842242163065, + "language_loss": 0.83872586, + "learning_rate": 0.0008335298814111094, + "loss": 0.84958482, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.30517578, + "step": 1505, + "time_per_iteration": 2.552032232284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082854, + "balance_loss_mlp": 1.05138254, + "epoch": 0.28972681800692573, + "flos": 647909508096.0, + "grad_norm": 0.05888591645587949, + "language_loss": 0.87955916, + "learning_rate": 0.0008332977163497455, + "loss": 0.89038765, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.31445312, + "step": 1506, + "time_per_iteration": 2.792531728744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080802, + "balance_loss_mlp": 1.0491637, + "epoch": 0.2899191996921893, + "flos": 571955033088.0, + "grad_norm": 0.058262801056698586, + "language_loss": 0.83412617, + "learning_rate": 0.0008330654218907325, + "loss": 0.84493423, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.31616211, + "step": 1507, + "time_per_iteration": 2.67161226272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082791, + "balance_loss_mlp": 1.05151033, + "epoch": 0.29011158137745285, + "flos": 661037773824.0, + "grad_norm": 0.053562219876337476, + "language_loss": 0.8135345, + "learning_rate": 0.0008328329981242548, + "loss": 0.8243624, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.3125, + "step": 1508, + "time_per_iteration": 2.8886146545410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082272, + "balance_loss_mlp": 1.05006218, + "epoch": 0.29030396306271644, + "flos": 535933974528.0, + "grad_norm": 0.059525688681207785, + "language_loss": 0.87796283, + "learning_rate": 0.0008326004451405475, + "loss": 0.88878554, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.32202148, + "step": 1509, + "time_per_iteration": 2.7613890171051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081166, + "balance_loss_mlp": 1.04919386, + "epoch": 0.29049634474798, + "flos": 511707110400.0, + "grad_norm": 0.06566805569484924, + "language_loss": 0.82636976, + "learning_rate": 0.0008323677630298957, + "loss": 0.83718145, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.31958008, + "step": 1510, + "time_per_iteration": 2.5723018646240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082627, + "balance_loss_mlp": 1.0500108, + "epoch": 0.29068872643324356, + "flos": 613454017536.0, + "grad_norm": 0.0587639353811087, + "language_loss": 0.84588593, + "learning_rate": 0.0008321349518826345, + "loss": 0.85671222, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.32617188, + "step": 1511, + "time_per_iteration": 2.7943453788757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085904, + "balance_loss_mlp": 1.05417013, + "epoch": 0.2908811081185071, + "flos": 546164011008.0, + "grad_norm": 0.07149106056529789, + "language_loss": 0.94572604, + "learning_rate": 0.0008319020117891491, + "loss": 0.95658505, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.31713867, + "step": 1512, + "time_per_iteration": 2.6216046810150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083214, + "balance_loss_mlp": 1.05095613, + "epoch": 0.2910734898037707, + "flos": 604516096512.0, + "grad_norm": 0.062137158428294176, + "language_loss": 0.87139338, + "learning_rate": 0.0008316689428398751, + "loss": 0.88222551, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.32250977, + "step": 1513, + "time_per_iteration": 2.7016332149505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082979, + "balance_loss_mlp": 1.05217493, + "epoch": 0.29126587148903427, + "flos": 574383370752.0, + "grad_norm": 0.048438835392173675, + "language_loss": 0.88380623, + "learning_rate": 0.0008314357451252979, + "loss": 0.89463598, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.30761719, + "step": 1514, + "time_per_iteration": 2.7707033157348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108486, + "balance_loss_mlp": 1.05329311, + "epoch": 0.2914582531742978, + "flos": 570802692096.0, + "grad_norm": 0.17247024929444854, + "language_loss": 0.87881547, + "learning_rate": 0.0008312024187359527, + "loss": 0.88966405, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.31542969, + "step": 1515, + "time_per_iteration": 2.6432881355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071467, + "balance_loss_mlp": 1.04083025, + "epoch": 0.2916506348595614, + "flos": 730523363328.0, + "grad_norm": 0.05532389066983382, + "language_loss": 0.86925149, + "learning_rate": 0.000830968963762425, + "loss": 0.8799662, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.3059082, + "step": 1516, + "time_per_iteration": 3.024911403656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070719, + "balance_loss_mlp": 1.03955793, + "epoch": 0.2918430165448249, + "flos": 510220118016.0, + "grad_norm": 0.06371457252332635, + "language_loss": 0.83926201, + "learning_rate": 0.0008307353802953497, + "loss": 0.84996927, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.3112793, + "step": 1517, + "time_per_iteration": 2.6853716373443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072896, + "balance_loss_mlp": 1.04202044, + "epoch": 0.2920353982300885, + "flos": 630096122880.0, + "grad_norm": 0.04882989118503786, + "language_loss": 0.86122108, + "learning_rate": 0.0008305016684254125, + "loss": 0.87195003, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.30859375, + "step": 1518, + "time_per_iteration": 2.799062728881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077466, + "balance_loss_mlp": 1.04589891, + "epoch": 0.29222777991535204, + "flos": 501411644928.0, + "grad_norm": 0.06769299348115199, + "language_loss": 0.86794329, + "learning_rate": 0.0008302678282433479, + "loss": 0.87871796, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.31542969, + "step": 1519, + "time_per_iteration": 2.607813835144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079374, + "balance_loss_mlp": 1.0473547, + "epoch": 0.2924201616006156, + "flos": 486522782208.0, + "grad_norm": 0.06836141022194388, + "language_loss": 0.84857148, + "learning_rate": 0.0008300338598399411, + "loss": 0.85936522, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.32006836, + "step": 1520, + "time_per_iteration": 2.6339783668518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079776, + "balance_loss_mlp": 1.04677844, + "epoch": 0.2926125432858792, + "flos": 476211350016.0, + "grad_norm": 0.07756319993269217, + "language_loss": 0.94405806, + "learning_rate": 0.0008297997633060263, + "loss": 0.9548558, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.33007812, + "step": 1521, + "time_per_iteration": 2.534118175506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072717, + "balance_loss_mlp": 1.03991103, + "epoch": 0.29280492497114274, + "flos": 676379151360.0, + "grad_norm": 0.05829817081366362, + "language_loss": 0.85078239, + "learning_rate": 0.0008295655387324883, + "loss": 0.86150956, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.328125, + "step": 1522, + "time_per_iteration": 2.8296775817871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072427, + "balance_loss_mlp": 1.04031241, + "epoch": 0.29299730665640633, + "flos": 458175384576.0, + "grad_norm": 0.07682732219120929, + "language_loss": 0.8501184, + "learning_rate": 0.0008293311862102609, + "loss": 0.8608427, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.32104492, + "step": 1523, + "time_per_iteration": 2.5440309047698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077196, + "balance_loss_mlp": 1.044366, + "epoch": 0.29318968834166986, + "flos": 446343464448.0, + "grad_norm": 0.0685602534850527, + "language_loss": 0.88674849, + "learning_rate": 0.0008290967058303275, + "loss": 0.89752042, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.32836914, + "step": 1524, + "time_per_iteration": 2.47611403465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107333, + "balance_loss_mlp": 1.04138136, + "epoch": 0.29338207002693345, + "flos": 450085676544.0, + "grad_norm": 0.06274350285183052, + "language_loss": 0.86149156, + "learning_rate": 0.0008288620976837219, + "loss": 0.87222481, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.31933594, + "step": 1525, + "time_per_iteration": 2.497141122817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076595, + "balance_loss_mlp": 1.04409802, + "epoch": 0.293574451712197, + "flos": 502027103232.0, + "grad_norm": 0.056882926132582716, + "language_loss": 0.82547259, + "learning_rate": 0.000828627361861527, + "loss": 0.8362385, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.32495117, + "step": 1526, + "time_per_iteration": 2.567631959915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074146, + "balance_loss_mlp": 1.04157782, + "epoch": 0.29376683339746057, + "flos": 696158184960.0, + "grad_norm": 0.06286177552115993, + "language_loss": 0.84273493, + "learning_rate": 0.0008283924984548752, + "loss": 0.85347635, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.32568359, + "step": 1527, + "time_per_iteration": 2.8300318717956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075655, + "balance_loss_mlp": 1.04270601, + "epoch": 0.2939592150827241, + "flos": 478353088512.0, + "grad_norm": 0.05246647038375997, + "language_loss": 0.84726572, + "learning_rate": 0.0008281575075549485, + "loss": 0.85802233, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.32958984, + "step": 1528, + "time_per_iteration": 2.574363946914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144512, + "balance_loss_mlp": 1.12400758, + "epoch": 0.2941515967679877, + "flos": 1484482042368.0, + "grad_norm": 0.05743835109314035, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78497207, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.20507812, + "step": 1529, + "time_per_iteration": 4.712693452835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085379, + "balance_loss_mlp": 1.05316901, + "epoch": 0.2943439784532513, + "flos": 673848916992.0, + "grad_norm": 0.06778682509264199, + "language_loss": 0.90275097, + "learning_rate": 0.0008276871436402469, + "loss": 0.9136048, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.32202148, + "step": 1530, + "time_per_iteration": 2.839327812194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098938, + "balance_loss_mlp": 1.06801534, + "epoch": 0.2945363601385148, + "flos": 576031896576.0, + "grad_norm": 0.05712547612295055, + "language_loss": 0.87684029, + "learning_rate": 0.000827451770808083, + "loss": 0.88782966, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.30908203, + "step": 1531, + "time_per_iteration": 2.6601221561431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101215, + "balance_loss_mlp": 1.06921971, + "epoch": 0.2947287418237784, + "flos": 480416251392.0, + "grad_norm": 0.06660356736231628, + "language_loss": 0.82939392, + "learning_rate": 0.0008272162708478674, + "loss": 0.84040606, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.31982422, + "step": 1532, + "time_per_iteration": 2.5689916610717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093792, + "balance_loss_mlp": 1.06234503, + "epoch": 0.2949211235090419, + "flos": 557917355520.0, + "grad_norm": 0.09954158315547566, + "language_loss": 0.86026615, + "learning_rate": 0.000826980643851029, + "loss": 0.87120402, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.31420898, + "step": 1533, + "time_per_iteration": 2.668490409851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096886, + "balance_loss_mlp": 1.06560588, + "epoch": 0.2951135051943055, + "flos": 483646311936.0, + "grad_norm": 0.06068587162994625, + "language_loss": 0.84473491, + "learning_rate": 0.0008267448899090464, + "loss": 0.85570371, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.3125, + "step": 1534, + "time_per_iteration": 2.5667166709899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111174, + "balance_loss_mlp": 1.08053756, + "epoch": 0.29530588687956905, + "flos": 550015322112.0, + "grad_norm": 0.07629507960375684, + "language_loss": 0.80660546, + "learning_rate": 0.0008265090091134473, + "loss": 0.81771713, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.3059082, + "step": 1535, + "time_per_iteration": 2.8708250522613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108767, + "balance_loss_mlp": 1.07793915, + "epoch": 0.29549826856483263, + "flos": 672731481600.0, + "grad_norm": 0.06117244877185189, + "language_loss": 0.80140841, + "learning_rate": 0.0008262730015558088, + "loss": 0.81249607, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.30786133, + "step": 1536, + "time_per_iteration": 2.872954845428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100004, + "balance_loss_mlp": 1.06960511, + "epoch": 0.29569065025009617, + "flos": 764300786688.0, + "grad_norm": 0.058742702923310866, + "language_loss": 0.82196116, + "learning_rate": 0.0008260368673277574, + "loss": 0.8329612, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.3034668, + "step": 1537, + "time_per_iteration": 3.1321218013763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099555, + "balance_loss_mlp": 1.06963336, + "epoch": 0.29588303193535975, + "flos": 543398049792.0, + "grad_norm": 0.0781542924594719, + "language_loss": 0.83699298, + "learning_rate": 0.0008258006065209682, + "loss": 0.84798855, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.29882812, + "step": 1538, + "time_per_iteration": 2.7713711261749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108634, + "balance_loss_mlp": 1.0791415, + "epoch": 0.29607541362062334, + "flos": 596648968704.0, + "grad_norm": 0.060396297474130736, + "language_loss": 0.80198979, + "learning_rate": 0.0008255642192271657, + "loss": 0.81307614, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.29443359, + "step": 1539, + "time_per_iteration": 2.770426034927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104575, + "balance_loss_mlp": 1.07525003, + "epoch": 0.29626779530588687, + "flos": 609588149760.0, + "grad_norm": 0.061957869610313854, + "language_loss": 0.8370012, + "learning_rate": 0.0008253277055381241, + "loss": 0.8480469, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.29296875, + "step": 1540, + "time_per_iteration": 2.818236827850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101049, + "balance_loss_mlp": 1.07196212, + "epoch": 0.29646017699115046, + "flos": 867050237952.0, + "grad_norm": 0.0808235318545815, + "language_loss": 0.85973728, + "learning_rate": 0.0008250910655456658, + "loss": 0.8707478, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.29052734, + "step": 1541, + "time_per_iteration": 3.122596502304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097236, + "balance_loss_mlp": 1.06888783, + "epoch": 0.296652558676414, + "flos": 495616444416.0, + "grad_norm": 0.06915250684599016, + "language_loss": 0.83763367, + "learning_rate": 0.0008248542993416625, + "loss": 0.84860599, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.28369141, + "step": 1542, + "time_per_iteration": 2.5910961627960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093651, + "balance_loss_mlp": 1.06408739, + "epoch": 0.2968449403616776, + "flos": 571275555840.0, + "grad_norm": 0.05605218699384054, + "language_loss": 0.8378318, + "learning_rate": 0.0008246174070180352, + "loss": 0.84876835, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.29516602, + "step": 1543, + "time_per_iteration": 2.6633899211883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010919, + "balance_loss_mlp": 1.06312323, + "epoch": 0.2970373220469411, + "flos": 793799115264.0, + "grad_norm": 0.07006000939384768, + "language_loss": 0.83787405, + "learning_rate": 0.0008243803886667537, + "loss": 0.84879309, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.28759766, + "step": 1544, + "time_per_iteration": 3.114450216293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092222, + "balance_loss_mlp": 1.0623486, + "epoch": 0.2972297037322047, + "flos": 660736617984.0, + "grad_norm": 0.06063612617340172, + "language_loss": 0.78866625, + "learning_rate": 0.0008241432443798364, + "loss": 0.79958844, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.2980957, + "step": 1545, + "time_per_iteration": 2.830487012863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095453, + "balance_loss_mlp": 1.06491208, + "epoch": 0.29742208541746823, + "flos": 596849789952.0, + "grad_norm": 0.05072672460675934, + "language_loss": 0.85210156, + "learning_rate": 0.0008239059742493512, + "loss": 0.86305606, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.30493164, + "step": 1546, + "time_per_iteration": 2.7311577796936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096869, + "balance_loss_mlp": 1.06654167, + "epoch": 0.2976144671027318, + "flos": 769519816704.0, + "grad_norm": 0.06216195389248957, + "language_loss": 0.87149853, + "learning_rate": 0.0008236685783674142, + "loss": 0.88246721, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.30273438, + "step": 1547, + "time_per_iteration": 3.122184991836548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195158, + "balance_loss_mlp": 1.17408168, + "epoch": 0.2978068487879954, + "flos": 1483980065280.0, + "grad_norm": 0.0711099730375168, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77416348, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.2109375, + "step": 1548, + "time_per_iteration": 4.884527683258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112807, + "balance_loss_mlp": 1.08190823, + "epoch": 0.29799923047325894, + "flos": 475079357952.0, + "grad_norm": 0.0721948840315393, + "language_loss": 0.82155961, + "learning_rate": 0.0008231934097178955, + "loss": 0.83268768, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.30859375, + "step": 1549, + "time_per_iteration": 2.6486034393310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099898, + "balance_loss_mlp": 1.06845081, + "epoch": 0.2981916121585225, + "flos": 759464460288.0, + "grad_norm": 0.06744191732210313, + "language_loss": 0.85654205, + "learning_rate": 0.0008229556371347903, + "loss": 0.86754102, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.31420898, + "step": 1550, + "time_per_iteration": 2.973072052001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096299, + "balance_loss_mlp": 1.06530416, + "epoch": 0.29838399384378606, + "flos": 874642351104.0, + "grad_norm": 0.063776129703287, + "language_loss": 0.79039407, + "learning_rate": 0.0008227177391691874, + "loss": 0.80135703, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.30957031, + "step": 1551, + "time_per_iteration": 3.121493339538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091, + "balance_loss_mlp": 1.05948138, + "epoch": 0.29857637552904964, + "flos": 579389995008.0, + "grad_norm": 0.06994546641795159, + "language_loss": 0.89363164, + "learning_rate": 0.0008224797159134463, + "loss": 0.90454161, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.31494141, + "step": 1552, + "time_per_iteration": 2.714345932006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085005, + "balance_loss_mlp": 1.05272293, + "epoch": 0.2987687572143132, + "flos": 836048950272.0, + "grad_norm": 0.0687696840960861, + "language_loss": 0.83498526, + "learning_rate": 0.0008222415674599765, + "loss": 0.84583527, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.32275391, + "step": 1553, + "time_per_iteration": 3.0709471702575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108737, + "balance_loss_mlp": 1.05482578, + "epoch": 0.29896113889957676, + "flos": 566800022016.0, + "grad_norm": 0.05942841135237563, + "language_loss": 0.83069479, + "learning_rate": 0.0008220032939012349, + "loss": 0.84156853, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.32543945, + "step": 1554, + "time_per_iteration": 2.6579041481018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084574, + "balance_loss_mlp": 1.05069458, + "epoch": 0.29915352058484035, + "flos": 498370669056.0, + "grad_norm": 0.05066559322117623, + "language_loss": 0.87862611, + "learning_rate": 0.0008217648953297277, + "loss": 0.88947189, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.33886719, + "step": 1555, + "time_per_iteration": 2.854501962661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080889, + "balance_loss_mlp": 1.04836845, + "epoch": 0.2993459022701039, + "flos": 591837373440.0, + "grad_norm": 0.06306800858294438, + "language_loss": 0.78177649, + "learning_rate": 0.0008215263718380095, + "loss": 0.79258537, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.32519531, + "step": 1556, + "time_per_iteration": 2.679813861846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072521, + "balance_loss_mlp": 1.03988135, + "epoch": 0.29953828395536747, + "flos": 572107802112.0, + "grad_norm": 0.05857921257987888, + "language_loss": 0.84453404, + "learning_rate": 0.0008212877235186833, + "loss": 0.8552593, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.32641602, + "step": 1557, + "time_per_iteration": 2.674333333969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074496, + "balance_loss_mlp": 1.0575211, + "epoch": 0.299730665640631, + "flos": 1503855051264.0, + "grad_norm": 0.03849586533955073, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78812063, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.16992188, + "step": 1558, + "time_per_iteration": 4.915595531463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073624, + "balance_loss_mlp": 1.04193807, + "epoch": 0.2999230473258946, + "flos": 513538928640.0, + "grad_norm": 0.06731849387550101, + "language_loss": 0.80882478, + "learning_rate": 0.0008208100527678611, + "loss": 0.81956106, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.31665039, + "step": 1559, + "time_per_iteration": 2.584726572036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107407, + "balance_loss_mlp": 1.04162097, + "epoch": 0.3001154290111581, + "flos": 834128381952.0, + "grad_norm": 0.07382200765663921, + "language_loss": 0.78279877, + "learning_rate": 0.0008205710305218135, + "loss": 0.79353946, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.32446289, + "step": 1560, + "time_per_iteration": 3.0383710861206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074163, + "balance_loss_mlp": 1.04302561, + "epoch": 0.3003078106964217, + "flos": 556485617664.0, + "grad_norm": 0.058207727477831525, + "language_loss": 0.89512408, + "learning_rate": 0.0008203318838190541, + "loss": 0.90586567, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.31103516, + "step": 1561, + "time_per_iteration": 2.76627516746521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077695, + "balance_loss_mlp": 1.04662895, + "epoch": 0.30050019238168524, + "flos": 525897556992.0, + "grad_norm": 0.06168132254821995, + "language_loss": 0.85111785, + "learning_rate": 0.0008200926127524281, + "loss": 0.86189479, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.31030273, + "step": 1562, + "time_per_iteration": 2.6629600524902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077501, + "balance_loss_mlp": 1.04641104, + "epoch": 0.3006925740669488, + "flos": 577582907904.0, + "grad_norm": 0.05613480590592382, + "language_loss": 0.82944739, + "learning_rate": 0.0008198532174148289, + "loss": 0.84022236, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.31054688, + "step": 1563, + "time_per_iteration": 2.7358763217926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059198, + "balance_loss_mlp": 1.042413, + "epoch": 0.3008849557522124, + "flos": 1489408528896.0, + "grad_norm": 0.031593282863211954, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81745368, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.16796875, + "step": 1564, + "time_per_iteration": 4.9148335456848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082495, + "balance_loss_mlp": 1.05264509, + "epoch": 0.30107733743747594, + "flos": 509565371904.0, + "grad_norm": 0.06408713771925002, + "language_loss": 0.88499033, + "learning_rate": 0.0008193740542985244, + "loss": 0.89581525, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.2980957, + "step": 1565, + "time_per_iteration": 2.6895992755889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010784, + "balance_loss_mlp": 1.04955089, + "epoch": 0.30126971912273953, + "flos": 587425858560.0, + "grad_norm": 0.05458149708053591, + "language_loss": 0.86310005, + "learning_rate": 0.0008191342867058467, + "loss": 0.87388408, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.28833008, + "step": 1566, + "time_per_iteration": 2.7972991466522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087074, + "balance_loss_mlp": 1.05708098, + "epoch": 0.30146210080800306, + "flos": 601822918656.0, + "grad_norm": 0.07332398387540356, + "language_loss": 0.8337127, + "learning_rate": 0.0008188943952142509, + "loss": 0.84458339, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.29931641, + "step": 1567, + "time_per_iteration": 2.7908260822296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090521, + "balance_loss_mlp": 1.06203008, + "epoch": 0.30165448249326665, + "flos": 917424686592.0, + "grad_norm": 0.06528974392408285, + "language_loss": 0.82496703, + "learning_rate": 0.0008186543799168711, + "loss": 0.83587217, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.28491211, + "step": 1568, + "time_per_iteration": 3.1478142738342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090151, + "balance_loss_mlp": 1.06170726, + "epoch": 0.3018468641785302, + "flos": 776953368576.0, + "grad_norm": 0.05489125757590388, + "language_loss": 0.87973905, + "learning_rate": 0.0008184142409068892, + "loss": 0.89064056, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.28466797, + "step": 1569, + "time_per_iteration": 3.0216779708862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085926, + "balance_loss_mlp": 1.05767381, + "epoch": 0.30203924586379377, + "flos": 522101500416.0, + "grad_norm": 0.055531787765466835, + "language_loss": 0.86334872, + "learning_rate": 0.000818173978277536, + "loss": 0.87420803, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.2824707, + "step": 1570, + "time_per_iteration": 2.679858922958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092107, + "balance_loss_mlp": 1.06378245, + "epoch": 0.3022316275490573, + "flos": 524288318976.0, + "grad_norm": 0.07890485552513911, + "language_loss": 0.83764422, + "learning_rate": 0.000817933592122089, + "loss": 0.84856522, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.28344727, + "step": 1571, + "time_per_iteration": 2.7156453132629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097909, + "balance_loss_mlp": 1.06936991, + "epoch": 0.3024240092343209, + "flos": 479672755200.0, + "grad_norm": 0.06172775968750255, + "language_loss": 0.83209121, + "learning_rate": 0.0008176930825338749, + "loss": 0.84307027, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.28564453, + "step": 1572, + "time_per_iteration": 2.6125760078430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092858, + "balance_loss_mlp": 1.06474876, + "epoch": 0.3026163909195845, + "flos": 686901579264.0, + "grad_norm": 0.07609523017386281, + "language_loss": 0.88406599, + "learning_rate": 0.0008174524496062679, + "loss": 0.8949945, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.28100586, + "step": 1573, + "time_per_iteration": 2.9266738891601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093192, + "balance_loss_mlp": 1.06472516, + "epoch": 0.302808772604848, + "flos": 542654553600.0, + "grad_norm": 0.061281594343297996, + "language_loss": 0.85176635, + "learning_rate": 0.0008172116934326894, + "loss": 0.86269826, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.28466797, + "step": 1574, + "time_per_iteration": 2.78182315826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093702, + "balance_loss_mlp": 1.06499696, + "epoch": 0.3030011542901116, + "flos": 474852395520.0, + "grad_norm": 0.061003462460527645, + "language_loss": 0.87581599, + "learning_rate": 0.0008169708141066097, + "loss": 0.88675308, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.28686523, + "step": 1575, + "time_per_iteration": 2.579521894454956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095615, + "balance_loss_mlp": 1.06631374, + "epoch": 0.30319353597537513, + "flos": 481233940992.0, + "grad_norm": 0.06494361929352876, + "language_loss": 0.90285015, + "learning_rate": 0.0008167298117215465, + "loss": 0.91380632, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.29272461, + "step": 1576, + "time_per_iteration": 2.576373815536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109664, + "balance_loss_mlp": 1.06729078, + "epoch": 0.3033859176606387, + "flos": 704455916544.0, + "grad_norm": 0.06029453435911351, + "language_loss": 0.87511861, + "learning_rate": 0.0008164886863710649, + "loss": 0.88608503, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.29296875, + "step": 1577, + "time_per_iteration": 2.913679599761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097606, + "balance_loss_mlp": 1.06847095, + "epoch": 0.30357829934590225, + "flos": 764344456704.0, + "grad_norm": 0.06219192746352704, + "language_loss": 0.86087388, + "learning_rate": 0.0008162474381487783, + "loss": 0.87184995, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.29101562, + "step": 1578, + "time_per_iteration": 3.0120038986206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089575, + "balance_loss_mlp": 1.05979693, + "epoch": 0.30377068103116583, + "flos": 532082663424.0, + "grad_norm": 0.07133259007734825, + "language_loss": 0.84352636, + "learning_rate": 0.0008160060671483475, + "loss": 0.85442215, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.29711914, + "step": 1579, + "time_per_iteration": 2.6448450088500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087505, + "balance_loss_mlp": 1.05729711, + "epoch": 0.3039630627164294, + "flos": 509934928896.0, + "grad_norm": 0.06969729270721756, + "language_loss": 0.83291966, + "learning_rate": 0.0008157645734634809, + "loss": 0.8437947, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.30200195, + "step": 1580, + "time_per_iteration": 2.623994827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219684, + "balance_loss_mlp": 1.20118308, + "epoch": 0.30415544440169295, + "flos": 1505206803456.0, + "grad_norm": 0.06785469110901753, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78116179, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.18457031, + "step": 1581, + "time_per_iteration": 4.945984840393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134498, + "balance_loss_mlp": 1.11723626, + "epoch": 0.30434782608695654, + "flos": 1457976637440.0, + "grad_norm": 0.04727039603147748, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74348998, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.17285156, + "step": 1582, + "time_per_iteration": 4.907581567764282 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094198, + "balance_loss_mlp": 1.06482506, + "epoch": 0.3045402077722201, + "flos": 482312088576.0, + "grad_norm": 0.06103997784231323, + "language_loss": 0.83613545, + "learning_rate": 0.000815039357240067, + "loss": 0.84707743, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.29345703, + "step": 1583, + "time_per_iteration": 2.6569504737854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098856, + "balance_loss_mlp": 1.07053173, + "epoch": 0.30473258945748366, + "flos": 543220549632.0, + "grad_norm": 0.05926881191118497, + "language_loss": 0.85445809, + "learning_rate": 0.0008147973737554952, + "loss": 0.86544669, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.28344727, + "step": 1584, + "time_per_iteration": 2.8048319816589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105359, + "balance_loss_mlp": 1.07682085, + "epoch": 0.3049249711427472, + "flos": 566789847552.0, + "grad_norm": 0.06192456547731419, + "language_loss": 0.85451925, + "learning_rate": 0.000814555268055744, + "loss": 0.86557281, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.28540039, + "step": 1585, + "time_per_iteration": 2.6496644020080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111676, + "balance_loss_mlp": 1.08265996, + "epoch": 0.3051173528280108, + "flos": 527970894336.0, + "grad_norm": 0.06812003210241727, + "language_loss": 0.87046736, + "learning_rate": 0.0008143130402348073, + "loss": 0.88158417, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.28979492, + "step": 1586, + "time_per_iteration": 2.6643214225769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105818, + "balance_loss_mlp": 1.07644498, + "epoch": 0.3053097345132743, + "flos": 586097427456.0, + "grad_norm": 0.055468457342214825, + "language_loss": 0.79345113, + "learning_rate": 0.0008140706903867265, + "loss": 0.80450928, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.29345703, + "step": 1587, + "time_per_iteration": 2.793938159942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095768, + "balance_loss_mlp": 1.06610858, + "epoch": 0.3055021161985379, + "flos": 606810604032.0, + "grad_norm": 0.06572122415162869, + "language_loss": 0.90151691, + "learning_rate": 0.0008138282186055897, + "loss": 0.91247463, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.29614258, + "step": 1588, + "time_per_iteration": 2.7083215713500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093617, + "balance_loss_mlp": 1.06414866, + "epoch": 0.3056944978838015, + "flos": 573594794496.0, + "grad_norm": 0.07456080522357873, + "language_loss": 0.82026887, + "learning_rate": 0.0008135856249855331, + "loss": 0.83120513, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.29467773, + "step": 1589, + "time_per_iteration": 2.6640753746032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108636, + "balance_loss_mlp": 1.05720115, + "epoch": 0.305886879569065, + "flos": 633640485888.0, + "grad_norm": 0.06169186885540492, + "language_loss": 0.89804673, + "learning_rate": 0.0008133429096207398, + "loss": 0.90891039, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.29125977, + "step": 1590, + "time_per_iteration": 2.7599587440490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180768, + "balance_loss_mlp": 1.16407835, + "epoch": 0.3060792612543286, + "flos": 1368227414016.0, + "grad_norm": 0.058161185258212886, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76493025, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.16699219, + "step": 1591, + "time_per_iteration": 4.928807973861694 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092058, + "balance_loss_mlp": 1.06149244, + "epoch": 0.30627164293959214, + "flos": 518290887168.0, + "grad_norm": 0.05378358074526122, + "language_loss": 0.86363673, + "learning_rate": 0.0008128571140339123, + "loss": 0.87455726, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.30517578, + "step": 1592, + "time_per_iteration": 2.6374073028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093229, + "balance_loss_mlp": 1.06182945, + "epoch": 0.3064640246248557, + "flos": 455354168832.0, + "grad_norm": 0.059608258439458016, + "language_loss": 0.87261879, + "learning_rate": 0.0008126140340004805, + "loss": 0.88355112, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.3137207, + "step": 1593, + "time_per_iteration": 2.5177900791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106947, + "balance_loss_mlp": 1.07528496, + "epoch": 0.30665640631011926, + "flos": 849718480896.0, + "grad_norm": 0.05384575425533411, + "language_loss": 0.82083076, + "learning_rate": 0.0008123708325995172, + "loss": 0.83190024, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.31640625, + "step": 1594, + "time_per_iteration": 3.230646848678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106567, + "balance_loss_mlp": 1.07466626, + "epoch": 0.30684878799538284, + "flos": 757996406784.0, + "grad_norm": 0.05828956025392548, + "language_loss": 0.79435146, + "learning_rate": 0.0008121275099254414, + "loss": 0.80541706, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.31884766, + "step": 1595, + "time_per_iteration": 2.902198553085327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100784, + "balance_loss_mlp": 1.07000458, + "epoch": 0.3070411696806464, + "flos": 517320428544.0, + "grad_norm": 0.0810481792888773, + "language_loss": 0.87996, + "learning_rate": 0.0008118840660727194, + "loss": 0.89096785, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.30761719, + "step": 1596, + "time_per_iteration": 2.6448442935943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086051, + "balance_loss_mlp": 1.05465174, + "epoch": 0.30723355136590996, + "flos": 843883992576.0, + "grad_norm": 0.06221817840069264, + "language_loss": 0.87278962, + "learning_rate": 0.0008116405011358644, + "loss": 0.88365012, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.3137207, + "step": 1597, + "time_per_iteration": 3.1513490676879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084783, + "balance_loss_mlp": 1.05455184, + "epoch": 0.30742593305117355, + "flos": 465905710080.0, + "grad_norm": 0.05780846158028219, + "language_loss": 0.79670262, + "learning_rate": 0.0008113968152094369, + "loss": 0.80755049, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.30175781, + "step": 1598, + "time_per_iteration": 2.5093207359313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081718, + "balance_loss_mlp": 1.05160582, + "epoch": 0.3076183147364371, + "flos": 686286120960.0, + "grad_norm": 0.05742950260468591, + "language_loss": 0.822034, + "learning_rate": 0.0008111530083880438, + "loss": 0.83285123, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.30078125, + "step": 1599, + "time_per_iteration": 2.9002020359039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083164, + "balance_loss_mlp": 1.05333805, + "epoch": 0.30781069642170067, + "flos": 613729032192.0, + "grad_norm": 0.066825138462863, + "language_loss": 0.86253393, + "learning_rate": 0.0008109090807663399, + "loss": 0.87336552, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.29760742, + "step": 1600, + "time_per_iteration": 2.8091297149658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078593, + "balance_loss_mlp": 1.04921985, + "epoch": 0.3080030781069642, + "flos": 590021521920.0, + "grad_norm": 0.05248494232095894, + "language_loss": 0.88362008, + "learning_rate": 0.0008106650324390257, + "loss": 0.89440602, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.29370117, + "step": 1601, + "time_per_iteration": 2.8476614952087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080904, + "balance_loss_mlp": 1.05072021, + "epoch": 0.3081954597922278, + "flos": 562353601536.0, + "grad_norm": 0.06836714374526962, + "language_loss": 0.81128752, + "learning_rate": 0.0008104208635008493, + "loss": 0.82209659, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.30151367, + "step": 1602, + "time_per_iteration": 2.6952836513519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108624, + "balance_loss_mlp": 1.05665243, + "epoch": 0.3083878414774913, + "flos": 447599112192.0, + "grad_norm": 0.06376665529861299, + "language_loss": 0.81538713, + "learning_rate": 0.0008101765740466058, + "loss": 0.82624954, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.29541016, + "step": 1603, + "time_per_iteration": 2.4948389530181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080977, + "balance_loss_mlp": 1.05098414, + "epoch": 0.3085802231627549, + "flos": 493297205760.0, + "grad_norm": 0.06931980864978393, + "language_loss": 0.84338289, + "learning_rate": 0.0008099321641711364, + "loss": 0.85419261, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.29931641, + "step": 1604, + "time_per_iteration": 2.707308769226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093892, + "balance_loss_mlp": 1.06249225, + "epoch": 0.3087726048480185, + "flos": 487437986304.0, + "grad_norm": 0.060864651717696075, + "language_loss": 0.83160985, + "learning_rate": 0.0008096876339693295, + "loss": 0.84254879, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.3137207, + "step": 1605, + "time_per_iteration": 2.731968402862549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094701, + "balance_loss_mlp": 1.06353974, + "epoch": 0.308964986533282, + "flos": 730265877504.0, + "grad_norm": 0.06509347225319946, + "language_loss": 0.8101337, + "learning_rate": 0.0008094429835361206, + "loss": 0.8210808, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.3112793, + "step": 1606, + "time_per_iteration": 2.9290759563446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089997, + "balance_loss_mlp": 1.05914617, + "epoch": 0.3091573682185456, + "flos": 605131554816.0, + "grad_norm": 0.057098253953708926, + "language_loss": 0.8565855, + "learning_rate": 0.0008091982129664908, + "loss": 0.86748546, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.30810547, + "step": 1607, + "time_per_iteration": 2.698822021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087412, + "balance_loss_mlp": 1.05558348, + "epoch": 0.30934974990380915, + "flos": 460081396224.0, + "grad_norm": 0.06809183454795278, + "language_loss": 0.82921505, + "learning_rate": 0.0008089533223554687, + "loss": 0.8400892, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.31811523, + "step": 1608, + "time_per_iteration": 2.7226502895355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108171, + "balance_loss_mlp": 1.05116844, + "epoch": 0.30954213158907273, + "flos": 553142075904.0, + "grad_norm": 0.05457453553086006, + "language_loss": 0.85192972, + "learning_rate": 0.0008087083117981294, + "loss": 0.86274683, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.30493164, + "step": 1609, + "time_per_iteration": 2.8990776538848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079, + "balance_loss_mlp": 1.04733825, + "epoch": 0.30973451327433627, + "flos": 552776901120.0, + "grad_norm": 0.05682891267097286, + "language_loss": 0.87723553, + "learning_rate": 0.0008084631813895943, + "loss": 0.88802552, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.31665039, + "step": 1610, + "time_per_iteration": 2.8217973709106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077424, + "balance_loss_mlp": 1.04538095, + "epoch": 0.30992689495959985, + "flos": 565430893056.0, + "grad_norm": 0.06653230383850259, + "language_loss": 0.83695799, + "learning_rate": 0.0008082179312250315, + "loss": 0.84773219, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.3203125, + "step": 1611, + "time_per_iteration": 2.6502630710601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157933, + "balance_loss_mlp": 1.13905036, + "epoch": 0.3101192766448634, + "flos": 1441621131264.0, + "grad_norm": 0.03907624866068961, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.81013775, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.18847656, + "step": 1612, + "time_per_iteration": 4.846347332000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142611, + "balance_loss_mlp": 1.12401426, + "epoch": 0.31031165833012697, + "flos": 1531086575616.0, + "grad_norm": 0.03590336133433786, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77771938, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.18554688, + "step": 1613, + "time_per_iteration": 5.076608896255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085959, + "balance_loss_mlp": 1.05432057, + "epoch": 0.31050404001539056, + "flos": 991534196736.0, + "grad_norm": 0.06574200684353006, + "language_loss": 0.81847739, + "learning_rate": 0.0008074814631475545, + "loss": 0.829337, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.31616211, + "step": 1614, + "time_per_iteration": 3.354888916015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086834, + "balance_loss_mlp": 1.05552983, + "epoch": 0.3106964217006541, + "flos": 445748355072.0, + "grad_norm": 0.058665683967318874, + "language_loss": 0.79078931, + "learning_rate": 0.0008072357349114907, + "loss": 0.80165768, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.31274414, + "step": 1615, + "time_per_iteration": 2.66959810256958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085653, + "balance_loss_mlp": 1.05427742, + "epoch": 0.3108888033859177, + "flos": 510259405824.0, + "grad_norm": 0.07028059658598983, + "language_loss": 0.88604105, + "learning_rate": 0.0008069898873959363, + "loss": 0.89689755, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.31347656, + "step": 1616, + "time_per_iteration": 2.652873992919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081821, + "balance_loss_mlp": 1.04932451, + "epoch": 0.3110811850711812, + "flos": 520471913472.0, + "grad_norm": 0.0549356144381418, + "language_loss": 0.85724425, + "learning_rate": 0.0008067439206963375, + "loss": 0.86806244, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.32495117, + "step": 1617, + "time_per_iteration": 2.651966094970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078025, + "balance_loss_mlp": 1.04707837, + "epoch": 0.3112735667564448, + "flos": 686085299712.0, + "grad_norm": 0.06196009796144799, + "language_loss": 0.86023569, + "learning_rate": 0.0008064978349081873, + "loss": 0.87101597, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.30908203, + "step": 1618, + "time_per_iteration": 2.9655556678771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076741, + "balance_loss_mlp": 1.04403007, + "epoch": 0.31146594844170833, + "flos": 532786871808.0, + "grad_norm": 0.05286958899784421, + "language_loss": 0.86531937, + "learning_rate": 0.0008062516301270245, + "loss": 0.87608671, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.32714844, + "step": 1619, + "time_per_iteration": 2.6688730716705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077635, + "balance_loss_mlp": 1.04668832, + "epoch": 0.3116583301269719, + "flos": 679187220480.0, + "grad_norm": 0.04767982292239376, + "language_loss": 0.88103712, + "learning_rate": 0.0008060053064484343, + "loss": 0.89181346, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.30908203, + "step": 1620, + "time_per_iteration": 2.9296655654907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078439, + "balance_loss_mlp": 1.04794526, + "epoch": 0.31185071181223545, + "flos": 585855908352.0, + "grad_norm": 0.062218975842766755, + "language_loss": 0.85253787, + "learning_rate": 0.0008057588639680482, + "loss": 0.86332226, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.3046875, + "step": 1621, + "time_per_iteration": 2.7567451000213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077048, + "balance_loss_mlp": 1.04686427, + "epoch": 0.31204309349749904, + "flos": 725090517504.0, + "grad_norm": 0.06694670244497776, + "language_loss": 0.82797694, + "learning_rate": 0.0008055123027815434, + "loss": 0.83874738, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.30151367, + "step": 1622, + "time_per_iteration": 2.9208602905273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077079, + "balance_loss_mlp": 1.04610825, + "epoch": 0.3122354751827626, + "flos": 576558604800.0, + "grad_norm": 0.1782498685509151, + "language_loss": 0.84590065, + "learning_rate": 0.0008052656229846436, + "loss": 0.85667145, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.30932617, + "step": 1623, + "time_per_iteration": 2.7155866622924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073968, + "balance_loss_mlp": 1.04328322, + "epoch": 0.31242785686802615, + "flos": 575672514048.0, + "grad_norm": 0.060959339396114136, + "language_loss": 0.90353578, + "learning_rate": 0.0008050188246731182, + "loss": 0.91427553, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.30664062, + "step": 1624, + "time_per_iteration": 2.6797330379486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076074, + "balance_loss_mlp": 1.04412627, + "epoch": 0.31262023855328974, + "flos": 736490271744.0, + "grad_norm": 0.055606567643031936, + "language_loss": 0.81689882, + "learning_rate": 0.0008047719079427834, + "loss": 0.82765961, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.31933594, + "step": 1625, + "time_per_iteration": 3.0065042972564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130441, + "balance_loss_mlp": 1.11031902, + "epoch": 0.3128126202385533, + "flos": 1558395113472.0, + "grad_norm": 0.04475298972307083, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75482148, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.20117188, + "step": 1626, + "time_per_iteration": 4.811471462249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079382, + "balance_loss_mlp": 1.04688525, + "epoch": 0.31300500192381686, + "flos": 514666538496.0, + "grad_norm": 0.07327685166102689, + "language_loss": 0.86126161, + "learning_rate": 0.0008042777196091757, + "loss": 0.87205535, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.32495117, + "step": 1627, + "time_per_iteration": 2.673499584197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108474, + "balance_loss_mlp": 1.05241048, + "epoch": 0.3131973836090804, + "flos": 526370420736.0, + "grad_norm": 0.055253724304277024, + "language_loss": 0.81718934, + "learning_rate": 0.0008040304481977643, + "loss": 0.82803679, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.32324219, + "step": 1628, + "time_per_iteration": 2.655608654022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088465, + "balance_loss_mlp": 1.0556109, + "epoch": 0.313389765294344, + "flos": 822473961984.0, + "grad_norm": 0.07469207399290811, + "language_loss": 0.86699098, + "learning_rate": 0.0008037830587512649, + "loss": 0.87787557, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.32861328, + "step": 1629, + "time_per_iteration": 3.092052459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108569, + "balance_loss_mlp": 1.0538609, + "epoch": 0.31358214697960757, + "flos": 393604697088.0, + "grad_norm": 0.05491200172004239, + "language_loss": 0.78946573, + "learning_rate": 0.0008035355513657224, + "loss": 0.80032265, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.31811523, + "step": 1630, + "time_per_iteration": 2.539320468902588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082653, + "balance_loss_mlp": 1.05111051, + "epoch": 0.3137745286648711, + "flos": 571611617280.0, + "grad_norm": 0.05139869194515267, + "language_loss": 0.92925692, + "learning_rate": 0.0008032879261372279, + "loss": 0.94008344, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.31518555, + "step": 1631, + "time_per_iteration": 2.779520034790039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107671, + "balance_loss_mlp": 1.05868566, + "epoch": 0.3139669103501347, + "flos": 1497614690304.0, + "grad_norm": 0.031013784922197977, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80712551, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.18066406, + "step": 1632, + "time_per_iteration": 5.371822357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078996, + "balance_loss_mlp": 1.04828787, + "epoch": 0.3141592920353982, + "flos": 525090041856.0, + "grad_norm": 0.055553714952817974, + "language_loss": 0.87074977, + "learning_rate": 0.0008027923225359748, + "loss": 0.8815397, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.30688477, + "step": 1633, + "time_per_iteration": 2.6381123065948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078973, + "balance_loss_mlp": 1.04797852, + "epoch": 0.3143516737206618, + "flos": 592989714432.0, + "grad_norm": 0.05859649155609266, + "language_loss": 0.88228178, + "learning_rate": 0.0008025443443556267, + "loss": 0.89307147, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.30957031, + "step": 1634, + "time_per_iteration": 2.7031404972076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078155, + "balance_loss_mlp": 1.04785156, + "epoch": 0.31454405540592534, + "flos": 648034573824.0, + "grad_norm": 0.052081770011180493, + "language_loss": 0.88152099, + "learning_rate": 0.000802296248717147, + "loss": 0.89230251, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.30273438, + "step": 1635, + "time_per_iteration": 2.9598543643951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080703, + "balance_loss_mlp": 1.05080533, + "epoch": 0.3147364370911889, + "flos": 642543501312.0, + "grad_norm": 0.066530556652877, + "language_loss": 0.78616363, + "learning_rate": 0.0008020480357168554, + "loss": 0.79697067, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.29833984, + "step": 1636, + "time_per_iteration": 2.797565221786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010823, + "balance_loss_mlp": 1.05261683, + "epoch": 0.31492881877645246, + "flos": 471607778304.0, + "grad_norm": 0.1046412191682548, + "language_loss": 0.87883365, + "learning_rate": 0.0008017997054511165, + "loss": 0.88965666, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.29638672, + "step": 1637, + "time_per_iteration": 2.559032440185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078208, + "balance_loss_mlp": 1.04733276, + "epoch": 0.31512120046171604, + "flos": 629135838720.0, + "grad_norm": 0.05513941849331592, + "language_loss": 0.85624552, + "learning_rate": 0.0008015512580163407, + "loss": 0.86702752, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.30834961, + "step": 1638, + "time_per_iteration": 2.779050827026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074987, + "balance_loss_mlp": 1.04363525, + "epoch": 0.31531358214697963, + "flos": 703460726784.0, + "grad_norm": 0.05557291013478606, + "language_loss": 0.81019449, + "learning_rate": 0.0008013026935089838, + "loss": 0.82094443, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.31323242, + "step": 1639, + "time_per_iteration": 2.89715838432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076962, + "balance_loss_mlp": 1.04701638, + "epoch": 0.31550596383224316, + "flos": 572275127808.0, + "grad_norm": 0.06613944709877946, + "language_loss": 0.8358075, + "learning_rate": 0.0008010540120255472, + "loss": 0.84657711, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.29882812, + "step": 1640, + "time_per_iteration": 2.651386260986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077047, + "balance_loss_mlp": 1.0463388, + "epoch": 0.31569834551750675, + "flos": 658047822336.0, + "grad_norm": 0.07317243700129339, + "language_loss": 0.86339968, + "learning_rate": 0.0008008052136625774, + "loss": 0.87417012, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.30688477, + "step": 1641, + "time_per_iteration": 2.7859702110290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077184, + "balance_loss_mlp": 1.04642797, + "epoch": 0.3158907272027703, + "flos": 566002681344.0, + "grad_norm": 0.05078324108170858, + "language_loss": 0.86915755, + "learning_rate": 0.0008005562985166666, + "loss": 0.87992936, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.30712891, + "step": 1642, + "time_per_iteration": 2.770359516143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078554, + "balance_loss_mlp": 1.04775047, + "epoch": 0.31608310888803387, + "flos": 536622216192.0, + "grad_norm": 0.048579646337906, + "language_loss": 0.85256124, + "learning_rate": 0.0008003072666844524, + "loss": 0.86334682, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.30761719, + "step": 1643, + "time_per_iteration": 2.6892380714416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081754, + "balance_loss_mlp": 1.05076003, + "epoch": 0.3162754905732974, + "flos": 486428239872.0, + "grad_norm": 0.06943709441331726, + "language_loss": 0.82542813, + "learning_rate": 0.0008000581182626173, + "loss": 0.83624566, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.30981445, + "step": 1644, + "time_per_iteration": 2.550408124923706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085079, + "balance_loss_mlp": 1.05496669, + "epoch": 0.316467872258561, + "flos": 529792538112.0, + "grad_norm": 0.05777646040930187, + "language_loss": 0.86256635, + "learning_rate": 0.0007998088533478894, + "loss": 0.87341708, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.30053711, + "step": 1645, + "time_per_iteration": 2.646522283554077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081027, + "balance_loss_mlp": 1.05019915, + "epoch": 0.3166602539438245, + "flos": 443197771776.0, + "grad_norm": 0.07748310873558778, + "language_loss": 0.84388101, + "learning_rate": 0.000799559472037042, + "loss": 0.85469127, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.30786133, + "step": 1646, + "time_per_iteration": 2.562495708465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081594, + "balance_loss_mlp": 1.05112433, + "epoch": 0.3168526356290881, + "flos": 645513103872.0, + "grad_norm": 0.0644603274178606, + "language_loss": 0.87469906, + "learning_rate": 0.0007993099744268932, + "loss": 0.88551497, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.30419922, + "step": 1647, + "time_per_iteration": 2.905468225479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074972, + "balance_loss_mlp": 1.04414475, + "epoch": 0.3170450173143517, + "flos": 585889403904.0, + "grad_norm": 0.06139744482341488, + "language_loss": 0.87846816, + "learning_rate": 0.000799060360614307, + "loss": 0.88921791, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.30786133, + "step": 1648, + "time_per_iteration": 2.6811182498931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083311, + "balance_loss_mlp": 1.05250716, + "epoch": 0.3172373989996152, + "flos": 826763231232.0, + "grad_norm": 0.05150264807756507, + "language_loss": 0.83281147, + "learning_rate": 0.0007988106306961917, + "loss": 0.84364462, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.30761719, + "step": 1649, + "time_per_iteration": 3.132918119430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078108, + "balance_loss_mlp": 1.04840076, + "epoch": 0.3174297806848788, + "flos": 527153204736.0, + "grad_norm": 0.0787550229152594, + "language_loss": 0.84213352, + "learning_rate": 0.0007985607847695014, + "loss": 0.85291457, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.29663086, + "step": 1650, + "time_per_iteration": 2.690056085586548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078554, + "balance_loss_mlp": 1.04784608, + "epoch": 0.31762216237014235, + "flos": 712855544832.0, + "grad_norm": 0.0566788479410698, + "language_loss": 0.82883936, + "learning_rate": 0.0007983108229312345, + "loss": 0.83962488, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.30664062, + "step": 1651, + "time_per_iteration": 2.918217182159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077737, + "balance_loss_mlp": 1.04679036, + "epoch": 0.31781454405540593, + "flos": 483567736320.0, + "grad_norm": 0.0674507609019882, + "language_loss": 0.86496019, + "learning_rate": 0.0007980607452784351, + "loss": 0.87573761, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.30908203, + "step": 1652, + "time_per_iteration": 2.5508391857147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081941, + "balance_loss_mlp": 1.052019, + "epoch": 0.31800692574066947, + "flos": 548483249664.0, + "grad_norm": 0.06063063486045483, + "language_loss": 0.90349394, + "learning_rate": 0.0007978105519081919, + "loss": 0.91431332, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.29858398, + "step": 1653, + "time_per_iteration": 2.6819515228271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079168, + "balance_loss_mlp": 1.04910302, + "epoch": 0.31819930742593305, + "flos": 516640951296.0, + "grad_norm": 0.0738675373878511, + "language_loss": 0.87538201, + "learning_rate": 0.0007975602429176385, + "loss": 0.88617373, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.30004883, + "step": 1654, + "time_per_iteration": 2.586261034011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084486, + "balance_loss_mlp": 1.05356312, + "epoch": 0.31839168911119664, + "flos": 455748456960.0, + "grad_norm": 0.051475836139836105, + "language_loss": 0.81585073, + "learning_rate": 0.0007973098184039536, + "loss": 0.82669556, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.30883789, + "step": 1655, + "time_per_iteration": 2.66395902633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083198, + "balance_loss_mlp": 1.05291927, + "epoch": 0.3185840707964602, + "flos": 625719513600.0, + "grad_norm": 0.059751712008043044, + "language_loss": 0.86801946, + "learning_rate": 0.0007970592784643602, + "loss": 0.87885141, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.30224609, + "step": 1656, + "time_per_iteration": 2.9186086654663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087926, + "balance_loss_mlp": 1.05855238, + "epoch": 0.31877645248172376, + "flos": 567213249024.0, + "grad_norm": 0.07875703275612048, + "language_loss": 0.85285407, + "learning_rate": 0.0007968086231961272, + "loss": 0.86373335, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.29321289, + "step": 1657, + "time_per_iteration": 2.6505343914031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089245, + "balance_loss_mlp": 1.05941832, + "epoch": 0.3189688341669873, + "flos": 489338205696.0, + "grad_norm": 0.08653253817480935, + "language_loss": 0.8381049, + "learning_rate": 0.0007965578526965671, + "loss": 0.84899735, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.29785156, + "step": 1658, + "time_per_iteration": 2.5884180068969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089397, + "balance_loss_mlp": 1.05995274, + "epoch": 0.3191612158522509, + "flos": 575948938752.0, + "grad_norm": 0.05523051502884026, + "language_loss": 0.86312473, + "learning_rate": 0.0007963069670630377, + "loss": 0.87401861, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.29394531, + "step": 1659, + "time_per_iteration": 2.750601291656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089678, + "balance_loss_mlp": 1.05997133, + "epoch": 0.3193535975375144, + "flos": 537867689472.0, + "grad_norm": 0.06732717892338919, + "language_loss": 0.8810066, + "learning_rate": 0.0007960559663929416, + "loss": 0.89190334, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.29663086, + "step": 1660, + "time_per_iteration": 2.6370737552642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096524, + "balance_loss_mlp": 1.06633985, + "epoch": 0.319545979222778, + "flos": 733954245120.0, + "grad_norm": 0.0532651376254825, + "language_loss": 0.87495023, + "learning_rate": 0.0007958048507837259, + "loss": 0.88591546, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.30151367, + "step": 1661, + "time_per_iteration": 2.942779779434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093392, + "balance_loss_mlp": 1.06316066, + "epoch": 0.31973836090804153, + "flos": 764136433152.0, + "grad_norm": 0.07710421129836972, + "language_loss": 0.87092876, + "learning_rate": 0.0007955536203328822, + "loss": 0.8818627, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.30175781, + "step": 1662, + "time_per_iteration": 2.8991520404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100595, + "balance_loss_mlp": 1.07072091, + "epoch": 0.3199307425933051, + "flos": 560252560896.0, + "grad_norm": 0.05380031942726595, + "language_loss": 0.8344577, + "learning_rate": 0.0007953022751379469, + "loss": 0.84546363, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.2980957, + "step": 1663, + "time_per_iteration": 2.795117139816284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102553, + "balance_loss_mlp": 1.07239294, + "epoch": 0.3201231242785687, + "flos": 751019751936.0, + "grad_norm": 0.0657811186180598, + "language_loss": 0.81884921, + "learning_rate": 0.000795050815296501, + "loss": 0.82987475, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.30151367, + "step": 1664, + "time_per_iteration": 2.969935894012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099283, + "balance_loss_mlp": 1.06890798, + "epoch": 0.32031550596383224, + "flos": 496157709312.0, + "grad_norm": 0.058736361347452894, + "language_loss": 0.93026185, + "learning_rate": 0.0007947992409061695, + "loss": 0.94125462, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.30322266, + "step": 1665, + "time_per_iteration": 2.585144281387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092624, + "balance_loss_mlp": 1.06182027, + "epoch": 0.3205078876490958, + "flos": 731294562816.0, + "grad_norm": 0.05523611327933496, + "language_loss": 0.8654207, + "learning_rate": 0.0007945475520646226, + "loss": 0.87634689, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.30761719, + "step": 1666, + "time_per_iteration": 2.9349849224090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092223, + "balance_loss_mlp": 1.06249237, + "epoch": 0.32070026933435936, + "flos": 549177283584.0, + "grad_norm": 0.05521997897435197, + "language_loss": 0.84546125, + "learning_rate": 0.0007942957488695743, + "loss": 0.85638344, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.296875, + "step": 1667, + "time_per_iteration": 2.6538572311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083745, + "balance_loss_mlp": 1.0539664, + "epoch": 0.32089265101962294, + "flos": 744949536768.0, + "grad_norm": 0.05331163349230756, + "language_loss": 0.81038171, + "learning_rate": 0.0007940438314187833, + "loss": 0.82121915, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.29760742, + "step": 1668, + "time_per_iteration": 3.009927988052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108075, + "balance_loss_mlp": 1.05016077, + "epoch": 0.3210850327048865, + "flos": 493937395200.0, + "grad_norm": 0.06087879277496283, + "language_loss": 0.80221838, + "learning_rate": 0.0007937917998100529, + "loss": 0.81302583, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.30541992, + "step": 1669, + "time_per_iteration": 2.5703017711639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072786, + "balance_loss_mlp": 1.0426501, + "epoch": 0.32127741439015006, + "flos": 530383265280.0, + "grad_norm": 0.07064769089672658, + "language_loss": 0.78527176, + "learning_rate": 0.0007935396541412302, + "loss": 0.79599965, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.30102539, + "step": 1670, + "time_per_iteration": 2.625499725341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081422, + "balance_loss_mlp": 1.05099988, + "epoch": 0.3214697960754136, + "flos": 500948955648.0, + "grad_norm": 0.0720065018777928, + "language_loss": 0.8546167, + "learning_rate": 0.0007932873945102068, + "loss": 0.86543095, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.30395508, + "step": 1671, + "time_per_iteration": 2.6188762187957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074685, + "balance_loss_mlp": 1.05713737, + "epoch": 0.3216621777606772, + "flos": 1382579394048.0, + "grad_norm": 0.027722134190714592, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76836461, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.17578125, + "step": 1672, + "time_per_iteration": 4.9278037548065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081072, + "balance_loss_mlp": 1.05057812, + "epoch": 0.32185455944594077, + "flos": 571260999168.0, + "grad_norm": 0.053011814820585035, + "language_loss": 0.86121267, + "learning_rate": 0.0007927825337533461, + "loss": 0.87202334, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.3046875, + "step": 1673, + "time_per_iteration": 2.6787123680114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075926, + "balance_loss_mlp": 1.0452652, + "epoch": 0.3220469411312043, + "flos": 543652715520.0, + "grad_norm": 0.06681709765508774, + "language_loss": 0.84770656, + "learning_rate": 0.0007925299328235131, + "loss": 0.85846579, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.30615234, + "step": 1674, + "time_per_iteration": 2.638434410095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080022, + "balance_loss_mlp": 1.04890847, + "epoch": 0.3222393228164679, + "flos": 490884834816.0, + "grad_norm": 0.06949369164102485, + "language_loss": 0.84795958, + "learning_rate": 0.000792277218323488, + "loss": 0.85875976, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.31103516, + "step": 1675, + "time_per_iteration": 2.5852880477905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077792, + "balance_loss_mlp": 1.04653537, + "epoch": 0.3224317045017314, + "flos": 490145720832.0, + "grad_norm": 0.06490362841252771, + "language_loss": 0.84737194, + "learning_rate": 0.0007920243903513833, + "loss": 0.85814989, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.31225586, + "step": 1676, + "time_per_iteration": 2.558058261871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083649, + "balance_loss_mlp": 1.0523684, + "epoch": 0.322624086186995, + "flos": 575505188352.0, + "grad_norm": 0.0667244817356676, + "language_loss": 0.83645618, + "learning_rate": 0.0007917714490053556, + "loss": 0.84729266, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.3125, + "step": 1677, + "time_per_iteration": 2.6619315147399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082413, + "balance_loss_mlp": 1.05046487, + "epoch": 0.32281646787225854, + "flos": 628974305280.0, + "grad_norm": 0.05833648566333407, + "language_loss": 0.85744321, + "learning_rate": 0.0007915183943836055, + "loss": 0.8682673, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.31933594, + "step": 1678, + "time_per_iteration": 2.8658525943756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079268, + "balance_loss_mlp": 1.04729617, + "epoch": 0.3230088495575221, + "flos": 781036024320.0, + "grad_norm": 0.06725353636254193, + "language_loss": 0.84315777, + "learning_rate": 0.0007912652265843773, + "loss": 0.8539505, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.31958008, + "step": 1679, + "time_per_iteration": 3.0343000888824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108303, + "balance_loss_mlp": 1.05019951, + "epoch": 0.3232012312427857, + "flos": 535839432192.0, + "grad_norm": 0.062193961969532426, + "language_loss": 0.81564045, + "learning_rate": 0.0007910119457059597, + "loss": 0.82647079, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.32836914, + "step": 1680, + "time_per_iteration": 2.6963257789611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085423, + "balance_loss_mlp": 1.05333161, + "epoch": 0.32339361292804925, + "flos": 704515553280.0, + "grad_norm": 0.0682304205879652, + "language_loss": 0.80304003, + "learning_rate": 0.0007907585518466849, + "loss": 0.81389421, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.32080078, + "step": 1681, + "time_per_iteration": 2.969540596008301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081665, + "balance_loss_mlp": 1.05026531, + "epoch": 0.32358599461331283, + "flos": 452099377152.0, + "grad_norm": 0.06175447283803796, + "language_loss": 0.89361274, + "learning_rate": 0.000790505045104929, + "loss": 0.90442938, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.3137207, + "step": 1682, + "time_per_iteration": 2.5148813724517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082889, + "balance_loss_mlp": 1.05108356, + "epoch": 0.32377837629857636, + "flos": 600597794304.0, + "grad_norm": 0.061424377243362256, + "language_loss": 0.87097234, + "learning_rate": 0.0007902514255791125, + "loss": 0.88180125, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.31787109, + "step": 1683, + "time_per_iteration": 2.7773754596710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078151, + "balance_loss_mlp": 1.04696608, + "epoch": 0.32397075798383995, + "flos": 807180636672.0, + "grad_norm": 0.06766194852988328, + "language_loss": 0.87911332, + "learning_rate": 0.0007899976933676986, + "loss": 0.88989484, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.31176758, + "step": 1684, + "time_per_iteration": 2.9700520038604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078008, + "balance_loss_mlp": 1.04589295, + "epoch": 0.3241631396691035, + "flos": 601414073856.0, + "grad_norm": 0.061649412189834635, + "language_loss": 0.87300712, + "learning_rate": 0.0007897438485691955, + "loss": 0.88378721, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.32104492, + "step": 1685, + "time_per_iteration": 2.6798696517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077233, + "balance_loss_mlp": 1.04483223, + "epoch": 0.32435552135436707, + "flos": 473980861440.0, + "grad_norm": 0.06379930216662907, + "language_loss": 0.823452, + "learning_rate": 0.0007894898912821542, + "loss": 0.83422434, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.32397461, + "step": 1686, + "time_per_iteration": 2.5478906631469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071757, + "balance_loss_mlp": 1.03978539, + "epoch": 0.3245479030396306, + "flos": 537824019456.0, + "grad_norm": 0.05321818652056826, + "language_loss": 0.86522776, + "learning_rate": 0.0007892358216051695, + "loss": 0.87594533, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.31958008, + "step": 1687, + "time_per_iteration": 2.735633134841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075777, + "balance_loss_mlp": 1.04251742, + "epoch": 0.3247402847248942, + "flos": 547394927616.0, + "grad_norm": 0.0608133700269358, + "language_loss": 0.91922832, + "learning_rate": 0.0007889816396368803, + "loss": 0.92998612, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.33276367, + "step": 1688, + "time_per_iteration": 2.6234939098358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077878, + "balance_loss_mlp": 1.04497576, + "epoch": 0.3249326664101578, + "flos": 377941814784.0, + "grad_norm": 0.0630363811740232, + "language_loss": 0.85370868, + "learning_rate": 0.0007887273454759687, + "loss": 0.86448747, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.32910156, + "step": 1689, + "time_per_iteration": 2.4698379039764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074583, + "balance_loss_mlp": 1.04184794, + "epoch": 0.3251250480954213, + "flos": 527818125312.0, + "grad_norm": 0.06604183912716106, + "language_loss": 0.82445431, + "learning_rate": 0.0007884729392211603, + "loss": 0.83520007, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.32739258, + "step": 1690, + "time_per_iteration": 2.6488864421844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081602, + "balance_loss_mlp": 1.04920113, + "epoch": 0.3253174297806849, + "flos": 449435312640.0, + "grad_norm": 0.06849578130600678, + "language_loss": 0.85280114, + "learning_rate": 0.0007882184209712245, + "loss": 0.86361718, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.32397461, + "step": 1691, + "time_per_iteration": 2.5209100246429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080531, + "balance_loss_mlp": 1.04874992, + "epoch": 0.32550981146594843, + "flos": 703855014912.0, + "grad_norm": 0.06225581397596747, + "language_loss": 0.8573736, + "learning_rate": 0.000787963790824974, + "loss": 0.8681789, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.31762695, + "step": 1692, + "time_per_iteration": 2.9696617126464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092322, + "balance_loss_mlp": 1.06054115, + "epoch": 0.325702193151212, + "flos": 392491643904.0, + "grad_norm": 0.0857009989212748, + "language_loss": 0.89660913, + "learning_rate": 0.0007877090488812651, + "loss": 0.90753233, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.31762695, + "step": 1693, + "time_per_iteration": 2.431861639022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086739, + "balance_loss_mlp": 1.05553031, + "epoch": 0.32589457483647555, + "flos": 577223525376.0, + "grad_norm": 0.07076453254267401, + "language_loss": 0.8368417, + "learning_rate": 0.0007874541952389973, + "loss": 0.84770912, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.31176758, + "step": 1694, + "time_per_iteration": 2.647468328475952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084594, + "balance_loss_mlp": 1.05293202, + "epoch": 0.32608695652173914, + "flos": 498092834304.0, + "grad_norm": 0.060562687008333366, + "language_loss": 0.86582285, + "learning_rate": 0.0007871992299971136, + "loss": 0.87666881, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.31640625, + "step": 1695, + "time_per_iteration": 2.553171396255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092239, + "balance_loss_mlp": 1.0608871, + "epoch": 0.32627933820700267, + "flos": 590858150400.0, + "grad_norm": 0.05969457295977618, + "language_loss": 0.84301764, + "learning_rate": 0.0007869441532546001, + "loss": 0.85394001, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.31323242, + "step": 1696, + "time_per_iteration": 2.752049446105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093733, + "balance_loss_mlp": 1.06247652, + "epoch": 0.32647171989226625, + "flos": 608790809088.0, + "grad_norm": 0.05927141137383595, + "language_loss": 0.79686946, + "learning_rate": 0.0007866889651104867, + "loss": 0.80780673, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.31225586, + "step": 1697, + "time_per_iteration": 2.7691686153411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109533, + "balance_loss_mlp": 1.06388259, + "epoch": 0.32666410157752984, + "flos": 476896619520.0, + "grad_norm": 0.0715366482234757, + "language_loss": 0.83218181, + "learning_rate": 0.000786433665663846, + "loss": 0.84313512, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.31420898, + "step": 1698, + "time_per_iteration": 2.717372179031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098821, + "balance_loss_mlp": 1.06816053, + "epoch": 0.3268564832627934, + "flos": 718060018176.0, + "grad_norm": 0.05645489658390659, + "language_loss": 0.86431837, + "learning_rate": 0.0007861782550137942, + "loss": 0.87530661, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.30615234, + "step": 1699, + "time_per_iteration": 2.9035465717315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103768, + "balance_loss_mlp": 1.07394195, + "epoch": 0.32704886494805696, + "flos": 768469372416.0, + "grad_norm": 0.11170286971508382, + "language_loss": 0.85853553, + "learning_rate": 0.0007859227332594901, + "loss": 0.86957312, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.29785156, + "step": 1700, + "time_per_iteration": 2.9302797317504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093978, + "balance_loss_mlp": 1.06508183, + "epoch": 0.3272412466333205, + "flos": 849540980736.0, + "grad_norm": 0.07200471053268022, + "language_loss": 0.84801477, + "learning_rate": 0.0007856671005001365, + "loss": 0.85895455, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.28881836, + "step": 1701, + "time_per_iteration": 3.1760013103485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090985, + "balance_loss_mlp": 1.06225514, + "epoch": 0.3274336283185841, + "flos": 831224208384.0, + "grad_norm": 0.07453437515979243, + "language_loss": 0.81870627, + "learning_rate": 0.0007854113568349787, + "loss": 0.82961613, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.28686523, + "step": 1702, + "time_per_iteration": 3.1038365364074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087868, + "balance_loss_mlp": 1.05770779, + "epoch": 0.3276260100038476, + "flos": 691721938944.0, + "grad_norm": 0.07528598974040544, + "language_loss": 0.80317354, + "learning_rate": 0.0007851555023633052, + "loss": 0.81405228, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.30102539, + "step": 1703, + "time_per_iteration": 2.847515106201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085228, + "balance_loss_mlp": 1.0558784, + "epoch": 0.3278183916891112, + "flos": 435831211008.0, + "grad_norm": 0.08040178147570827, + "language_loss": 0.82301831, + "learning_rate": 0.0007848995371844474, + "loss": 0.83387053, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.29296875, + "step": 1704, + "time_per_iteration": 2.5442426204681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098029, + "balance_loss_mlp": 1.06872725, + "epoch": 0.3280107733743748, + "flos": 460883119104.0, + "grad_norm": 0.06101842979524802, + "language_loss": 0.80441558, + "learning_rate": 0.0007846434613977801, + "loss": 0.81539583, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.29296875, + "step": 1705, + "time_per_iteration": 2.5023465156555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091561, + "balance_loss_mlp": 1.06242633, + "epoch": 0.3282031550596383, + "flos": 679018484736.0, + "grad_norm": 0.07007502801083235, + "language_loss": 0.78621399, + "learning_rate": 0.0007843872751027203, + "loss": 0.79712963, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.29125977, + "step": 1706, + "time_per_iteration": 2.790001392364502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094895, + "balance_loss_mlp": 1.06549811, + "epoch": 0.3283955367449019, + "flos": 544821023232.0, + "grad_norm": 0.05836443006497643, + "language_loss": 0.87259293, + "learning_rate": 0.0007841309783987287, + "loss": 0.88354194, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.29345703, + "step": 1707, + "time_per_iteration": 2.7478153705596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097713, + "balance_loss_mlp": 1.0684588, + "epoch": 0.32858791843016544, + "flos": 481017153024.0, + "grad_norm": 0.05888352709782848, + "language_loss": 0.89055538, + "learning_rate": 0.0007838745713853084, + "loss": 0.90153247, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.29199219, + "step": 1708, + "time_per_iteration": 2.588653802871704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088275, + "balance_loss_mlp": 1.05925906, + "epoch": 0.328780300115429, + "flos": 566529389568.0, + "grad_norm": 0.06397878577513526, + "language_loss": 0.8386358, + "learning_rate": 0.0007836180541620053, + "loss": 0.8495186, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.29003906, + "step": 1709, + "time_per_iteration": 2.7023067474365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091144, + "balance_loss_mlp": 1.06191421, + "epoch": 0.32897268180069256, + "flos": 475787948544.0, + "grad_norm": 0.05521592697878337, + "language_loss": 0.86435962, + "learning_rate": 0.0007833614268284082, + "loss": 0.87527102, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.29199219, + "step": 1710, + "time_per_iteration": 2.538080930709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090653, + "balance_loss_mlp": 1.0721513, + "epoch": 0.32916506348595614, + "flos": 1576517008896.0, + "grad_norm": 0.029520146980468998, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75200427, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.18457031, + "step": 1711, + "time_per_iteration": 4.909448862075806 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089192, + "balance_loss_mlp": 1.05965161, + "epoch": 0.3293574451712197, + "flos": 482646739968.0, + "grad_norm": 0.07803051984240059, + "language_loss": 0.78501904, + "learning_rate": 0.0007828478422289016, + "loss": 0.79591095, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.29492188, + "step": 1712, + "time_per_iteration": 2.5883195400238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092173, + "balance_loss_mlp": 1.06210816, + "epoch": 0.32954982685648326, + "flos": 622266872832.0, + "grad_norm": 0.05953292046858541, + "language_loss": 0.88987601, + "learning_rate": 0.0007825908851623833, + "loss": 0.90079772, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.30004883, + "step": 1713, + "time_per_iteration": 2.7441718578338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089127, + "balance_loss_mlp": 1.05973005, + "epoch": 0.32974220854174685, + "flos": 544697367552.0, + "grad_norm": 0.06609176393308323, + "language_loss": 0.8478905, + "learning_rate": 0.0007823338183843533, + "loss": 0.85878181, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.29394531, + "step": 1714, + "time_per_iteration": 2.6771602630615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092243, + "balance_loss_mlp": 1.06291747, + "epoch": 0.3299345902270104, + "flos": 981740708352.0, + "grad_norm": 0.10875146541446083, + "language_loss": 0.80569458, + "learning_rate": 0.0007820766419946141, + "loss": 0.81661701, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.29321289, + "step": 1715, + "time_per_iteration": 3.3068225383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108798, + "balance_loss_mlp": 1.07052732, + "epoch": 0.33012697191227397, + "flos": 1402857432576.0, + "grad_norm": 0.03503617860008252, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80760461, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.17480469, + "step": 1716, + "time_per_iteration": 5.048320293426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091106, + "balance_loss_mlp": 1.06201911, + "epoch": 0.3303193535975375, + "flos": 504897781248.0, + "grad_norm": 0.06576145610663801, + "language_loss": 0.76379126, + "learning_rate": 0.0007815619607794288, + "loss": 0.77470231, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.29052734, + "step": 1717, + "time_per_iteration": 2.6151187419891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094733, + "balance_loss_mlp": 1.06440604, + "epoch": 0.3305117352828011, + "flos": 937602390528.0, + "grad_norm": 0.08930544150493325, + "language_loss": 0.82491159, + "learning_rate": 0.0007813044561538001, + "loss": 0.835859, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.30273438, + "step": 1718, + "time_per_iteration": 3.1329195499420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089209, + "balance_loss_mlp": 1.05928707, + "epoch": 0.3307041169680646, + "flos": 721176597504.0, + "grad_norm": 0.06440748712139703, + "language_loss": 0.88832355, + "learning_rate": 0.0007810468423160958, + "loss": 0.8992157, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.29882812, + "step": 1719, + "time_per_iteration": 2.8785343170166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091515, + "balance_loss_mlp": 1.06195092, + "epoch": 0.3308964986533282, + "flos": 583315499520.0, + "grad_norm": 0.05842798757545397, + "language_loss": 0.81825691, + "learning_rate": 0.0007807891193663306, + "loss": 0.82917207, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.29492188, + "step": 1720, + "time_per_iteration": 2.775949478149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108911, + "balance_loss_mlp": 1.05956948, + "epoch": 0.33108888033859174, + "flos": 473340672000.0, + "grad_norm": 0.1056737351826848, + "language_loss": 0.82154363, + "learning_rate": 0.0007805312874045614, + "loss": 0.83243477, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.29516602, + "step": 1721, + "time_per_iteration": 2.528573513031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089843, + "balance_loss_mlp": 1.06054103, + "epoch": 0.3312812620238553, + "flos": 385913659392.0, + "grad_norm": 0.06879892565652022, + "language_loss": 0.86894739, + "learning_rate": 0.0007802733465308874, + "loss": 0.87984586, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.29272461, + "step": 1722, + "time_per_iteration": 2.4575133323669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087083, + "balance_loss_mlp": 1.05811512, + "epoch": 0.3314736437091189, + "flos": 494292395520.0, + "grad_norm": 0.06801648197756033, + "language_loss": 0.84311831, + "learning_rate": 0.0007800152968454501, + "loss": 0.85398912, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.28930664, + "step": 1723, + "time_per_iteration": 2.729114294052124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091782, + "balance_loss_mlp": 1.06300533, + "epoch": 0.33166602539438245, + "flos": 653346736128.0, + "grad_norm": 0.049597969001903774, + "language_loss": 0.90648681, + "learning_rate": 0.0007797571384484334, + "loss": 0.91740465, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.28759766, + "step": 1724, + "time_per_iteration": 2.8813512325286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084172, + "balance_loss_mlp": 1.05463219, + "epoch": 0.33185840707964603, + "flos": 520550489088.0, + "grad_norm": 0.060917196813517045, + "language_loss": 0.91917408, + "learning_rate": 0.0007794988714400633, + "loss": 0.9300158, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.29516602, + "step": 1725, + "time_per_iteration": 2.6094837188720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088265, + "balance_loss_mlp": 1.05896294, + "epoch": 0.33205078876490957, + "flos": 436712919552.0, + "grad_norm": 0.06883363868640566, + "language_loss": 0.85331756, + "learning_rate": 0.0007792404959206079, + "loss": 0.86420023, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.29272461, + "step": 1726, + "time_per_iteration": 2.4982993602752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083027, + "balance_loss_mlp": 1.05396366, + "epoch": 0.33224317045017315, + "flos": 768400971264.0, + "grad_norm": 0.0595205364190525, + "language_loss": 0.81498575, + "learning_rate": 0.0007789820119903774, + "loss": 0.82581604, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.29052734, + "step": 1727, + "time_per_iteration": 2.9797775745391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059791, + "balance_loss_mlp": 1.04043114, + "epoch": 0.3324355521354367, + "flos": 1465656090624.0, + "grad_norm": 0.028746370774938412, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79552454, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.19335938, + "step": 1728, + "time_per_iteration": 4.892562627792358 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090227, + "balance_loss_mlp": 1.05982828, + "epoch": 0.3326279338207003, + "flos": 496415195136.0, + "grad_norm": 0.10868743625457102, + "language_loss": 0.83712173, + "learning_rate": 0.0007784647192990428, + "loss": 0.84802401, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.3034668, + "step": 1729, + "time_per_iteration": 2.721163749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093021, + "balance_loss_mlp": 1.06283677, + "epoch": 0.33282031550596386, + "flos": 635600342016.0, + "grad_norm": 0.06834187729314575, + "language_loss": 0.80591226, + "learning_rate": 0.0007782059107387696, + "loss": 0.81684244, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.30151367, + "step": 1730, + "time_per_iteration": 2.8358583450317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097893, + "balance_loss_mlp": 1.06768548, + "epoch": 0.3330126971912274, + "flos": 689210643456.0, + "grad_norm": 0.06518025115488765, + "language_loss": 0.88646144, + "learning_rate": 0.0007779469941693826, + "loss": 0.89744031, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.30175781, + "step": 1731, + "time_per_iteration": 2.8069489002227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105874, + "balance_loss_mlp": 1.0744741, + "epoch": 0.333205078876491, + "flos": 566184563712.0, + "grad_norm": 0.0738487456517703, + "language_loss": 0.76712036, + "learning_rate": 0.0007776879696914029, + "loss": 0.77817911, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.3137207, + "step": 1732, + "time_per_iteration": 2.8068690299987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116621, + "balance_loss_mlp": 1.08479202, + "epoch": 0.3333974605617545, + "flos": 640618550784.0, + "grad_norm": 0.06155067702851775, + "language_loss": 0.88390094, + "learning_rate": 0.000777428837405392, + "loss": 0.89506716, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.31811523, + "step": 1733, + "time_per_iteration": 2.8412673473358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107208, + "balance_loss_mlp": 1.07530773, + "epoch": 0.3335898422470181, + "flos": 461597501952.0, + "grad_norm": 0.0682339524169846, + "language_loss": 0.86804128, + "learning_rate": 0.0007771695974119544, + "loss": 0.87911332, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.31884766, + "step": 1734, + "time_per_iteration": 2.512354612350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103901, + "balance_loss_mlp": 1.07159579, + "epoch": 0.33378222393228163, + "flos": 852504791040.0, + "grad_norm": 0.0845052703087739, + "language_loss": 0.75201118, + "learning_rate": 0.0007769102498117359, + "loss": 0.7630502, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.32299805, + "step": 1735, + "time_per_iteration": 3.107100248336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090989, + "balance_loss_mlp": 1.05777764, + "epoch": 0.3339746056175452, + "flos": 954256080384.0, + "grad_norm": 0.061332510780765306, + "language_loss": 0.79977, + "learning_rate": 0.000776650794705424, + "loss": 0.81067985, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.33227539, + "step": 1736, + "time_per_iteration": 3.259875535964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092848, + "balance_loss_mlp": 1.06116199, + "epoch": 0.33416698730280875, + "flos": 544559155200.0, + "grad_norm": 0.05236613872795896, + "language_loss": 0.82229674, + "learning_rate": 0.0007763912321937483, + "loss": 0.83322519, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.31665039, + "step": 1737, + "time_per_iteration": 2.704059600830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088373, + "balance_loss_mlp": 1.05506587, + "epoch": 0.33435936898807234, + "flos": 1013652817920.0, + "grad_norm": 0.07890071498287932, + "language_loss": 0.82297349, + "learning_rate": 0.0007761315623774799, + "loss": 0.83385718, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.33325195, + "step": 1738, + "time_per_iteration": 3.399148464202881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089424, + "balance_loss_mlp": 1.0574522, + "epoch": 0.3345517506733359, + "flos": 614935217664.0, + "grad_norm": 0.09967891290955513, + "language_loss": 0.87632757, + "learning_rate": 0.0007758717853574313, + "loss": 0.88722181, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.31958008, + "step": 1739, + "time_per_iteration": 2.772089958190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103829, + "balance_loss_mlp": 1.0729773, + "epoch": 0.33474413235859946, + "flos": 494350622208.0, + "grad_norm": 0.06672668023604937, + "language_loss": 0.90074134, + "learning_rate": 0.0007756119012344571, + "loss": 0.91177964, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.30810547, + "step": 1740, + "time_per_iteration": 2.5482232570648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108279, + "balance_loss_mlp": 1.07707, + "epoch": 0.33493651404386304, + "flos": 628105743360.0, + "grad_norm": 0.07840140242610649, + "language_loss": 0.84438574, + "learning_rate": 0.0007753519101094535, + "loss": 0.85546857, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.31176758, + "step": 1741, + "time_per_iteration": 2.749004602432251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102131, + "balance_loss_mlp": 1.07173228, + "epoch": 0.3351288957291266, + "flos": 513474909696.0, + "grad_norm": 0.07002932741488781, + "language_loss": 0.86241812, + "learning_rate": 0.0007750918120833575, + "loss": 0.87343943, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.3034668, + "step": 1742, + "time_per_iteration": 2.600731611251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110577, + "balance_loss_mlp": 1.0753479, + "epoch": 0.33532127741439016, + "flos": 647008860672.0, + "grad_norm": 0.07258867640739639, + "language_loss": 0.87368989, + "learning_rate": 0.0007748316072571485, + "loss": 0.88474762, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.30395508, + "step": 1743, + "time_per_iteration": 2.7698371410369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109732, + "balance_loss_mlp": 1.07902408, + "epoch": 0.3355136590996537, + "flos": 768134721024.0, + "grad_norm": 0.05763877458348602, + "language_loss": 0.79041934, + "learning_rate": 0.0007745712957318467, + "loss": 0.80151671, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.30664062, + "step": 1744, + "time_per_iteration": 2.967310667037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104284, + "balance_loss_mlp": 1.07412386, + "epoch": 0.3357060407849173, + "flos": 595259490816.0, + "grad_norm": 0.052786515694630796, + "language_loss": 0.86410165, + "learning_rate": 0.0007743108776085141, + "loss": 0.87514448, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.30102539, + "step": 1745, + "time_per_iteration": 2.771803855895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102511, + "balance_loss_mlp": 1.07049131, + "epoch": 0.3358984224701808, + "flos": 598288730112.0, + "grad_norm": 0.06089020802257528, + "language_loss": 0.82798052, + "learning_rate": 0.0007740503529882543, + "loss": 0.83900565, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.32006836, + "step": 1746, + "time_per_iteration": 2.805392026901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095402, + "balance_loss_mlp": 1.064551, + "epoch": 0.3360908041554444, + "flos": 578055771648.0, + "grad_norm": 0.0569869068698716, + "language_loss": 0.90718448, + "learning_rate": 0.0007737897219722114, + "loss": 0.9181385, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.30810547, + "step": 1747, + "time_per_iteration": 2.699065923690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090863, + "balance_loss_mlp": 1.05970204, + "epoch": 0.336283185840708, + "flos": 513332315136.0, + "grad_norm": 0.07943976371979472, + "language_loss": 0.80688596, + "learning_rate": 0.0007735289846615716, + "loss": 0.81779456, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.31152344, + "step": 1748, + "time_per_iteration": 2.6637260913848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094297, + "balance_loss_mlp": 1.06356478, + "epoch": 0.3364755675259715, + "flos": 524716102656.0, + "grad_norm": 0.06884386609789231, + "language_loss": 0.81979561, + "learning_rate": 0.0007732681411575621, + "loss": 0.83073854, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.30712891, + "step": 1749, + "time_per_iteration": 2.673060417175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087083, + "balance_loss_mlp": 1.0555166, + "epoch": 0.3366679492112351, + "flos": 554594162688.0, + "grad_norm": 0.052237930998467595, + "language_loss": 0.87234819, + "learning_rate": 0.0007730071915614514, + "loss": 0.88321906, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.31542969, + "step": 1750, + "time_per_iteration": 2.707857370376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089836, + "balance_loss_mlp": 1.05896115, + "epoch": 0.33686033089649864, + "flos": 427051851264.0, + "grad_norm": 0.08336153438972979, + "language_loss": 0.88963622, + "learning_rate": 0.0007727461359745489, + "loss": 0.90053463, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.30859375, + "step": 1751, + "time_per_iteration": 2.482837438583374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093668, + "balance_loss_mlp": 1.06307864, + "epoch": 0.3370527125817622, + "flos": 541452750336.0, + "grad_norm": 0.05330176149069141, + "language_loss": 0.86016554, + "learning_rate": 0.0007724849744982056, + "loss": 0.87110221, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.30541992, + "step": 1752, + "time_per_iteration": 2.690420389175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097033, + "balance_loss_mlp": 1.06668198, + "epoch": 0.33724509426702576, + "flos": 541836864000.0, + "grad_norm": 0.0643678921459399, + "language_loss": 0.81981385, + "learning_rate": 0.0007722237072338131, + "loss": 0.8307842, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.30322266, + "step": 1753, + "time_per_iteration": 2.7154347896575928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097395, + "balance_loss_mlp": 1.06694901, + "epoch": 0.33743747595228935, + "flos": 472557888000.0, + "grad_norm": 0.07107791288081117, + "language_loss": 0.85213387, + "learning_rate": 0.0007719623342828046, + "loss": 0.8631078, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.30419922, + "step": 1754, + "time_per_iteration": 2.5009355545043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109586, + "balance_loss_mlp": 1.06426978, + "epoch": 0.33762985763755293, + "flos": 469564964352.0, + "grad_norm": 0.06326183968549627, + "language_loss": 0.84134084, + "learning_rate": 0.000771700855746654, + "loss": 0.85229945, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.31567383, + "step": 1755, + "time_per_iteration": 2.63323712348938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082281, + "balance_loss_mlp": 1.05071473, + "epoch": 0.33782223932281646, + "flos": 492002270208.0, + "grad_norm": 0.06130822269954804, + "language_loss": 0.88395244, + "learning_rate": 0.0007714392717268763, + "loss": 0.89477527, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.31542969, + "step": 1756, + "time_per_iteration": 2.6147336959838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083496, + "balance_loss_mlp": 1.05219221, + "epoch": 0.33801462100808005, + "flos": 464827562496.0, + "grad_norm": 0.05731341996908033, + "language_loss": 0.86388242, + "learning_rate": 0.0007711775823250273, + "loss": 0.87471741, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.31298828, + "step": 1757, + "time_per_iteration": 2.5304934978485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085861, + "balance_loss_mlp": 1.05455685, + "epoch": 0.3382070026933436, + "flos": 795319603200.0, + "grad_norm": 0.061357664780502266, + "language_loss": 0.83481395, + "learning_rate": 0.0007709157876427039, + "loss": 0.84567261, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.31274414, + "step": 1758, + "time_per_iteration": 3.1116981506347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074204, + "balance_loss_mlp": 1.04189849, + "epoch": 0.33839938437860717, + "flos": 508181686272.0, + "grad_norm": 0.0592835704233285, + "language_loss": 0.85574573, + "learning_rate": 0.0007706538877815439, + "loss": 0.86648774, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.32299805, + "step": 1759, + "time_per_iteration": 2.635298728942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077747, + "balance_loss_mlp": 1.04730105, + "epoch": 0.3385917660638707, + "flos": 483986755584.0, + "grad_norm": 0.04672826561746397, + "language_loss": 0.83449262, + "learning_rate": 0.0007703918828432259, + "loss": 0.84527004, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.30419922, + "step": 1760, + "time_per_iteration": 2.664783477783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071091, + "balance_loss_mlp": 1.04023945, + "epoch": 0.3387841477491343, + "flos": 545071306752.0, + "grad_norm": 0.061026274734732225, + "language_loss": 0.88914752, + "learning_rate": 0.000770129772929469, + "loss": 0.89985847, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.30810547, + "step": 1761, + "time_per_iteration": 2.7082738876342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070304, + "balance_loss_mlp": 1.03914273, + "epoch": 0.3389765294343978, + "flos": 719487373824.0, + "grad_norm": 0.058866792995701266, + "language_loss": 0.88234216, + "learning_rate": 0.0007698675581420334, + "loss": 0.89304519, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.3112793, + "step": 1762, + "time_per_iteration": 2.9119746685028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107099, + "balance_loss_mlp": 1.03966177, + "epoch": 0.3391689111196614, + "flos": 699596269056.0, + "grad_norm": 0.06738514708484569, + "language_loss": 0.78819811, + "learning_rate": 0.0007696052385827199, + "loss": 0.79890805, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.31298828, + "step": 1763, + "time_per_iteration": 2.9451980590820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107403, + "balance_loss_mlp": 1.04172421, + "epoch": 0.339361292804925, + "flos": 626806425600.0, + "grad_norm": 0.0719800357998311, + "language_loss": 0.78192145, + "learning_rate": 0.00076934281435337, + "loss": 0.79266179, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.32299805, + "step": 1764, + "time_per_iteration": 2.8267600536346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071904, + "balance_loss_mlp": 1.03931201, + "epoch": 0.33955367449018853, + "flos": 609302960640.0, + "grad_norm": 0.06414673033674093, + "language_loss": 0.85701221, + "learning_rate": 0.0007690802855558658, + "loss": 0.86773127, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.32592773, + "step": 1765, + "time_per_iteration": 2.8825321197509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060485, + "balance_loss_mlp": 1.04322386, + "epoch": 0.3397460561754521, + "flos": 1452494177280.0, + "grad_norm": 0.027152559638010845, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.7743544, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.17285156, + "step": 1766, + "time_per_iteration": 4.890359401702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078911, + "balance_loss_mlp": 1.04684353, + "epoch": 0.33993843786071565, + "flos": 487068429312.0, + "grad_norm": 0.06170687350837257, + "language_loss": 0.89089799, + "learning_rate": 0.0007685549146641262, + "loss": 0.90168703, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.32055664, + "step": 1767, + "time_per_iteration": 2.539238691329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077091, + "balance_loss_mlp": 1.04557216, + "epoch": 0.34013081954597923, + "flos": 417115768320.0, + "grad_norm": 0.05571629344022593, + "language_loss": 0.8822673, + "learning_rate": 0.0007682920727738579, + "loss": 0.89303821, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.31494141, + "step": 1768, + "time_per_iteration": 2.512801170349121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080619, + "balance_loss_mlp": 1.04931498, + "epoch": 0.34032320123124277, + "flos": 437293472256.0, + "grad_norm": 0.06175400371418068, + "language_loss": 0.8474735, + "learning_rate": 0.000768029126723369, + "loss": 0.85827971, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.31274414, + "step": 1769, + "time_per_iteration": 2.5238869190216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075433, + "balance_loss_mlp": 1.04515338, + "epoch": 0.34051558291650635, + "flos": 457353312768.0, + "grad_norm": 0.06596681609056877, + "language_loss": 0.81544566, + "learning_rate": 0.0007677660766147447, + "loss": 0.82620001, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.30224609, + "step": 1770, + "time_per_iteration": 2.5212690830230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037514, + "balance_loss_mlp": 1.02063394, + "epoch": 0.3407079646017699, + "flos": 1558029938688.0, + "grad_norm": 0.014856007486746849, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73508459, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.16894531, + "step": 1771, + "time_per_iteration": 4.967731475830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081508, + "balance_loss_mlp": 1.05113387, + "epoch": 0.3409003462870335, + "flos": 492312190464.0, + "grad_norm": 0.075322249241395, + "language_loss": 0.79792535, + "learning_rate": 0.0007672396646316306, + "loss": 0.8087405, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.30322266, + "step": 1772, + "time_per_iteration": 2.524365186691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084918, + "balance_loss_mlp": 1.05451918, + "epoch": 0.34109272797229706, + "flos": 808145303040.0, + "grad_norm": 0.05910937608565349, + "language_loss": 0.80291271, + "learning_rate": 0.000766976302961512, + "loss": 0.81376183, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.30371094, + "step": 1773, + "time_per_iteration": 3.002929925918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086798, + "balance_loss_mlp": 1.0563519, + "epoch": 0.3412851096575606, + "flos": 469903997952.0, + "grad_norm": 0.0625889066862488, + "language_loss": 0.81081951, + "learning_rate": 0.0007667128376420003, + "loss": 0.82168746, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.30395508, + "step": 1774, + "time_per_iteration": 2.5821964740753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083599, + "balance_loss_mlp": 1.05336761, + "epoch": 0.3414774913428242, + "flos": 595402085376.0, + "grad_norm": 0.06267075227744807, + "language_loss": 0.84329379, + "learning_rate": 0.0007664492687753817, + "loss": 0.85412979, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.30175781, + "step": 1775, + "time_per_iteration": 2.7457377910614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077211, + "balance_loss_mlp": 1.04769528, + "epoch": 0.3416698730280877, + "flos": 527202667008.0, + "grad_norm": 0.054581176728495925, + "language_loss": 0.81518859, + "learning_rate": 0.000766185596463983, + "loss": 0.8259607, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.29516602, + "step": 1776, + "time_per_iteration": 2.655543804168701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078997, + "balance_loss_mlp": 1.04993343, + "epoch": 0.3418622547133513, + "flos": 874272794112.0, + "grad_norm": 0.06969464274274284, + "language_loss": 0.76725864, + "learning_rate": 0.0007659218208101706, + "loss": 0.77804863, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.29003906, + "step": 1777, + "time_per_iteration": 3.1378567218780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093158, + "balance_loss_mlp": 1.06411862, + "epoch": 0.34205463639861483, + "flos": 603462680064.0, + "grad_norm": 0.0529989301900612, + "language_loss": 0.84699291, + "learning_rate": 0.0007656579419163515, + "loss": 0.85792446, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.29052734, + "step": 1778, + "time_per_iteration": 2.8120994567871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091459, + "balance_loss_mlp": 1.06239629, + "epoch": 0.3422470180838784, + "flos": 463547183616.0, + "grad_norm": 0.06282493199141514, + "language_loss": 0.76994503, + "learning_rate": 0.0007653939598849724, + "loss": 0.78085959, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.2902832, + "step": 1779, + "time_per_iteration": 2.5995492935180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087203, + "balance_loss_mlp": 1.07051396, + "epoch": 0.34243939976914195, + "flos": 1585584377856.0, + "grad_norm": 0.04507156484415478, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83967406, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.16699219, + "step": 1780, + "time_per_iteration": 4.9175097942352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102047, + "balance_loss_mlp": 1.07186341, + "epoch": 0.34263178145440554, + "flos": 872662146048.0, + "grad_norm": 0.05745476314946865, + "language_loss": 0.79740059, + "learning_rate": 0.000764865686819522, + "loss": 0.80842102, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.30151367, + "step": 1781, + "time_per_iteration": 3.1022064685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097852, + "balance_loss_mlp": 1.06907511, + "epoch": 0.3428241631396691, + "flos": 506630674944.0, + "grad_norm": 0.061017866945560745, + "language_loss": 0.85627258, + "learning_rate": 0.0007646013959905449, + "loss": 0.8672511, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.28759766, + "step": 1782, + "time_per_iteration": 2.625312566757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090603, + "balance_loss_mlp": 1.06030035, + "epoch": 0.34301654482493266, + "flos": 879669324288.0, + "grad_norm": 0.05493462983431466, + "language_loss": 0.80768538, + "learning_rate": 0.0007643370024341949, + "loss": 0.81859136, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.30249023, + "step": 1783, + "time_per_iteration": 3.1206953525543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092054, + "balance_loss_mlp": 1.06284761, + "epoch": 0.34320892651019624, + "flos": 431537559552.0, + "grad_norm": 0.04934338548004703, + "language_loss": 0.8289808, + "learning_rate": 0.0007640725062531195, + "loss": 0.83990133, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.29174805, + "step": 1784, + "time_per_iteration": 2.518277645111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092006, + "balance_loss_mlp": 1.06165504, + "epoch": 0.3434013081954598, + "flos": 463404589056.0, + "grad_norm": 0.061838155255473454, + "language_loss": 0.8616311, + "learning_rate": 0.0007638079075500047, + "loss": 0.8725512, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.30297852, + "step": 1785, + "time_per_iteration": 2.566340684890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056366, + "balance_loss_mlp": 1.04101145, + "epoch": 0.34359368988072336, + "flos": 1556499276288.0, + "grad_norm": 0.03141321768780463, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76237035, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.15332031, + "step": 1786, + "time_per_iteration": 4.984891891479492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081906, + "balance_loss_mlp": 1.05088782, + "epoch": 0.3437860715659869, + "flos": 495267236352.0, + "grad_norm": 0.0502662811310507, + "language_loss": 0.83153242, + "learning_rate": 0.0007632784029886026, + "loss": 0.84235144, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.30981445, + "step": 1787, + "time_per_iteration": 2.6574935913085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078963, + "balance_loss_mlp": 1.04832625, + "epoch": 0.3439784532512505, + "flos": 717942154752.0, + "grad_norm": 0.058652751735253, + "language_loss": 0.85391539, + "learning_rate": 0.0007630134973358873, + "loss": 0.86470503, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.3059082, + "step": 1788, + "time_per_iteration": 2.920311450958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088088, + "balance_loss_mlp": 1.05702209, + "epoch": 0.34417083493651407, + "flos": 565598218752.0, + "grad_norm": 0.05633660644162356, + "language_loss": 0.86888337, + "learning_rate": 0.0007627484895722763, + "loss": 0.87976426, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.31030273, + "step": 1789, + "time_per_iteration": 2.648061513900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082316, + "balance_loss_mlp": 1.05268025, + "epoch": 0.3443632166217776, + "flos": 795988905984.0, + "grad_norm": 0.08125120447961011, + "language_loss": 0.79987907, + "learning_rate": 0.0007624833798006552, + "loss": 0.8107022, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.29614258, + "step": 1790, + "time_per_iteration": 3.083303689956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082489, + "balance_loss_mlp": 1.05249596, + "epoch": 0.3445555983070412, + "flos": 569045067264.0, + "grad_norm": 0.06337905919609309, + "language_loss": 0.83924425, + "learning_rate": 0.0007622181681239483, + "loss": 0.85006905, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.29931641, + "step": 1791, + "time_per_iteration": 2.6613235473632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078031, + "balance_loss_mlp": 1.04677427, + "epoch": 0.3447479799923047, + "flos": 568524151296.0, + "grad_norm": 0.05139164694864183, + "language_loss": 0.84563744, + "learning_rate": 0.0007619528546451202, + "loss": 0.85641772, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.31225586, + "step": 1792, + "time_per_iteration": 2.7847092151641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082474, + "balance_loss_mlp": 1.05183685, + "epoch": 0.3449403616775683, + "flos": 967323299328.0, + "grad_norm": 0.060391852587241154, + "language_loss": 0.8357141, + "learning_rate": 0.0007616874394671745, + "loss": 0.84653878, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.3059082, + "step": 1793, + "time_per_iteration": 3.3427343368530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087726, + "balance_loss_mlp": 1.05632687, + "epoch": 0.34513274336283184, + "flos": 568340858880.0, + "grad_norm": 0.07229882199780847, + "language_loss": 0.85033429, + "learning_rate": 0.0007614219226931547, + "loss": 0.86121154, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.3137207, + "step": 1794, + "time_per_iteration": 2.6797611713409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090025, + "balance_loss_mlp": 1.05931664, + "epoch": 0.3453251250480954, + "flos": 460715793408.0, + "grad_norm": 0.057715322830613675, + "language_loss": 0.84206641, + "learning_rate": 0.0007611563044261435, + "loss": 0.85296667, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.30664062, + "step": 1795, + "time_per_iteration": 2.5259828567504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086711, + "balance_loss_mlp": 1.05543017, + "epoch": 0.34551750673335896, + "flos": 415397431296.0, + "grad_norm": 0.06328741897936851, + "language_loss": 0.86560625, + "learning_rate": 0.0007608905847692631, + "loss": 0.87647337, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.3125, + "step": 1796, + "time_per_iteration": 2.472182035446167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081946, + "balance_loss_mlp": 1.05014098, + "epoch": 0.34570988841862255, + "flos": 587540749824.0, + "grad_norm": 0.053847624873276365, + "language_loss": 0.86582637, + "learning_rate": 0.0007606247638256749, + "loss": 0.8766458, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.31787109, + "step": 1797, + "time_per_iteration": 2.842547655105591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147955, + "balance_loss_mlp": 1.13145602, + "epoch": 0.34590227010388613, + "flos": 1566835439616.0, + "grad_norm": 0.06482996241123744, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79318249, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.16503906, + "step": 1798, + "time_per_iteration": 4.918993949890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075567, + "balance_loss_mlp": 1.06011796, + "epoch": 0.34609465178914967, + "flos": 1536950177280.0, + "grad_norm": 0.04230684388330953, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80402768, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.15429688, + "step": 1799, + "time_per_iteration": 4.791706323623657 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077617, + "balance_loss_mlp": 1.04724216, + "epoch": 0.34628703347441325, + "flos": 609075998208.0, + "grad_norm": 0.06124115711212235, + "language_loss": 0.85762143, + "learning_rate": 0.0007598266943068686, + "loss": 0.86839759, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.30322266, + "step": 1800, + "time_per_iteration": 2.743213415145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083873, + "balance_loss_mlp": 1.05266404, + "epoch": 0.3464794151596768, + "flos": 473084596224.0, + "grad_norm": 0.13184352245004016, + "language_loss": 0.83900499, + "learning_rate": 0.0007595604692488507, + "loss": 0.84984374, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.31176758, + "step": 1801, + "time_per_iteration": 2.542065382003784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082571, + "balance_loss_mlp": 1.05105186, + "epoch": 0.34667179684494037, + "flos": 605397805056.0, + "grad_norm": 0.0617697315453188, + "language_loss": 0.82875979, + "learning_rate": 0.0007592941434205215, + "loss": 0.83958554, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.31494141, + "step": 1802, + "time_per_iteration": 2.803941488265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077417, + "balance_loss_mlp": 1.06292093, + "epoch": 0.3468641785302039, + "flos": 1564053511680.0, + "grad_norm": 0.03209988868756776, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74648476, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.14453125, + "step": 1803, + "time_per_iteration": 5.115894794464111 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073735, + "balance_loss_mlp": 1.04176331, + "epoch": 0.3470565602154675, + "flos": 906902258688.0, + "grad_norm": 0.057797440709038125, + "language_loss": 0.7980904, + "learning_rate": 0.0007587611898665566, + "loss": 0.80882776, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.31958008, + "step": 1804, + "time_per_iteration": 3.0783464908599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080437, + "balance_loss_mlp": 1.04958522, + "epoch": 0.347248941900731, + "flos": 638613614592.0, + "grad_norm": 0.052922401600576395, + "language_loss": 0.8228178, + "learning_rate": 0.0007584945623478315, + "loss": 0.83362216, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.30810547, + "step": 1805, + "time_per_iteration": 2.8341996669769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107388, + "balance_loss_mlp": 1.04178858, + "epoch": 0.3474413235859946, + "flos": 847009336320.0, + "grad_norm": 0.05986711270473425, + "language_loss": 0.81165981, + "learning_rate": 0.000758227834472617, + "loss": 0.82239866, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.32080078, + "step": 1806, + "time_per_iteration": 3.0486085414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082567, + "balance_loss_mlp": 1.04971278, + "epoch": 0.3476337052712582, + "flos": 515395478016.0, + "grad_norm": 0.06433807190471491, + "language_loss": 0.77163357, + "learning_rate": 0.0007579610063444664, + "loss": 0.78245926, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.32861328, + "step": 1807, + "time_per_iteration": 2.7597365379333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073013, + "balance_loss_mlp": 1.04068375, + "epoch": 0.34782608695652173, + "flos": 913161558528.0, + "grad_norm": 0.06573509148212295, + "language_loss": 0.8740322, + "learning_rate": 0.0007576940780669712, + "loss": 0.88476229, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.32324219, + "step": 1808, + "time_per_iteration": 3.2193737030029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073839, + "balance_loss_mlp": 1.04060304, + "epoch": 0.3480184686417853, + "flos": 773374099968.0, + "grad_norm": 0.07068655640298144, + "language_loss": 0.84018815, + "learning_rate": 0.0007574270497437624, + "loss": 0.85092652, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.33251953, + "step": 1809, + "time_per_iteration": 2.958071708679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074122, + "balance_loss_mlp": 1.04255509, + "epoch": 0.34821085032704885, + "flos": 576549840384.0, + "grad_norm": 0.05267537563651592, + "language_loss": 0.88190216, + "learning_rate": 0.000757159921478509, + "loss": 0.89264333, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.31542969, + "step": 1810, + "time_per_iteration": 2.743820905685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011251, + "balance_loss_mlp": 1.10993648, + "epoch": 0.34840323201231244, + "flos": 1524176911872.0, + "grad_norm": 0.032772528197798495, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75575733, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.15136719, + "step": 1811, + "time_per_iteration": 4.734825372695923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077463, + "balance_loss_mlp": 1.04713607, + "epoch": 0.34859561369757597, + "flos": 508910625792.0, + "grad_norm": 0.06138203683055377, + "language_loss": 0.87334222, + "learning_rate": 0.0007566253655367423, + "loss": 0.88411689, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.30273438, + "step": 1812, + "time_per_iteration": 2.5963358879089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081319, + "balance_loss_mlp": 1.04946637, + "epoch": 0.34878799538283956, + "flos": 548390117376.0, + "grad_norm": 0.05073723218815133, + "language_loss": 0.89626348, + "learning_rate": 0.000756357938067762, + "loss": 0.90707672, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.31835938, + "step": 1813, + "time_per_iteration": 2.6791560649871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088512, + "balance_loss_mlp": 1.05615854, + "epoch": 0.34898037706810314, + "flos": 983251021824.0, + "grad_norm": 0.07107132576327291, + "language_loss": 0.82739902, + "learning_rate": 0.0007560904110718033, + "loss": 0.83828408, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.32324219, + "step": 1814, + "time_per_iteration": 3.251187801361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084533, + "balance_loss_mlp": 1.05244136, + "epoch": 0.3491727587533667, + "flos": 681298435584.0, + "grad_norm": 0.056660731031110724, + "language_loss": 0.83390886, + "learning_rate": 0.0007558227846527297, + "loss": 0.84475422, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.32080078, + "step": 1815, + "time_per_iteration": 2.852786064147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086181, + "balance_loss_mlp": 1.05358887, + "epoch": 0.34936514043863026, + "flos": 393811310592.0, + "grad_norm": 0.06752757018776132, + "language_loss": 0.83192128, + "learning_rate": 0.0007555550589144429, + "loss": 0.84278309, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.32592773, + "step": 1816, + "time_per_iteration": 2.4226694107055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108673, + "balance_loss_mlp": 1.05568814, + "epoch": 0.3495575221238938, + "flos": 461120256000.0, + "grad_norm": 0.05637535729014081, + "language_loss": 0.84440207, + "learning_rate": 0.000755287233960883, + "loss": 0.85526937, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.31005859, + "step": 1817, + "time_per_iteration": 2.556528329849243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081988, + "balance_loss_mlp": 1.04963493, + "epoch": 0.3497499038091574, + "flos": 723859600896.0, + "grad_norm": 0.06861190177202381, + "language_loss": 0.77555025, + "learning_rate": 0.0007550193098960292, + "loss": 0.7863701, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.32348633, + "step": 1818, + "time_per_iteration": 2.9168636798858643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081065, + "balance_loss_mlp": 1.04902124, + "epoch": 0.3499422854944209, + "flos": 827364132864.0, + "grad_norm": 0.04890635253674866, + "language_loss": 0.85897982, + "learning_rate": 0.0007547512868238988, + "loss": 0.86979043, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.3203125, + "step": 1819, + "time_per_iteration": 3.147949695587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086999, + "balance_loss_mlp": 1.05583739, + "epoch": 0.3501346671796845, + "flos": 493214247936.0, + "grad_norm": 0.07359678742691168, + "language_loss": 0.83527619, + "learning_rate": 0.0007544831648485473, + "loss": 0.84614623, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.3112793, + "step": 1820, + "time_per_iteration": 2.683906078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084002, + "balance_loss_mlp": 1.05272126, + "epoch": 0.35032704886494803, + "flos": 578479173120.0, + "grad_norm": 0.07119738396785501, + "language_loss": 0.81087327, + "learning_rate": 0.0007542149440740694, + "loss": 0.82171333, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.3125, + "step": 1821, + "time_per_iteration": 2.738029718399048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107983, + "balance_loss_mlp": 1.04850197, + "epoch": 0.3505194305502116, + "flos": 584383472640.0, + "grad_norm": 0.07229829340096756, + "language_loss": 0.8569001, + "learning_rate": 0.000753946624604597, + "loss": 0.86769843, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.31298828, + "step": 1822, + "time_per_iteration": 2.7263731956481934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079169, + "balance_loss_mlp": 1.04795969, + "epoch": 0.3507118122354752, + "flos": 526705072128.0, + "grad_norm": 0.05660966900473529, + "language_loss": 0.87968546, + "learning_rate": 0.0007536782065443015, + "loss": 0.89047718, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.31176758, + "step": 1823, + "time_per_iteration": 2.64158034324646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108426, + "balance_loss_mlp": 1.05386138, + "epoch": 0.35090419392073874, + "flos": 511269152256.0, + "grad_norm": 0.06227259781784348, + "language_loss": 0.74483079, + "learning_rate": 0.0007534096899973919, + "loss": 0.75567335, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.3034668, + "step": 1824, + "time_per_iteration": 2.609548807144165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079276, + "balance_loss_mlp": 1.04804349, + "epoch": 0.3510965756060023, + "flos": 563728522752.0, + "grad_norm": 0.05520550621954613, + "language_loss": 0.82636261, + "learning_rate": 0.0007531410750681154, + "loss": 0.83715534, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.31201172, + "step": 1825, + "time_per_iteration": 2.7306325435638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094474, + "balance_loss_mlp": 1.06352782, + "epoch": 0.35128895729126586, + "flos": 1020107146752.0, + "grad_norm": 0.04890512262044313, + "language_loss": 0.86351258, + "learning_rate": 0.0007528723618607575, + "loss": 0.8744573, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.30908203, + "step": 1826, + "time_per_iteration": 3.4343338012695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088582, + "balance_loss_mlp": 1.05782557, + "epoch": 0.35148133897652944, + "flos": 587972915712.0, + "grad_norm": 0.05382597898667073, + "language_loss": 0.82364488, + "learning_rate": 0.0007526035504796422, + "loss": 0.83453071, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.30737305, + "step": 1827, + "time_per_iteration": 2.7783889770507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088781, + "balance_loss_mlp": 1.05721426, + "epoch": 0.351673720661793, + "flos": 495054830592.0, + "grad_norm": 0.07196751046410012, + "language_loss": 0.86701363, + "learning_rate": 0.0007523346410291312, + "loss": 0.87790149, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.31542969, + "step": 1828, + "time_per_iteration": 2.7925262451171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096578, + "balance_loss_mlp": 1.06434393, + "epoch": 0.35186610234705656, + "flos": 762339520512.0, + "grad_norm": 0.05953464089235074, + "language_loss": 0.84491026, + "learning_rate": 0.0007520656336136245, + "loss": 0.85587609, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.32226562, + "step": 1829, + "time_per_iteration": 2.9498770236968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095972, + "balance_loss_mlp": 1.0648104, + "epoch": 0.3520584840323201, + "flos": 625822820352.0, + "grad_norm": 0.05500553487662277, + "language_loss": 0.87983966, + "learning_rate": 0.0007517965283375599, + "loss": 0.89079928, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.3112793, + "step": 1830, + "time_per_iteration": 2.838120698928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097926, + "balance_loss_mlp": 1.06566763, + "epoch": 0.3522508657175837, + "flos": 537124193280.0, + "grad_norm": 0.053691241766720514, + "language_loss": 0.89336729, + "learning_rate": 0.0007515273253054132, + "loss": 0.90434659, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.32250977, + "step": 1831, + "time_per_iteration": 2.6600866317749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092956, + "balance_loss_mlp": 1.06191444, + "epoch": 0.35244324740284727, + "flos": 567105560064.0, + "grad_norm": 0.05928754583625919, + "language_loss": 0.82674569, + "learning_rate": 0.0007512580246216988, + "loss": 0.83767527, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.31005859, + "step": 1832, + "time_per_iteration": 2.7806639671325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089641, + "balance_loss_mlp": 1.05752611, + "epoch": 0.3526356290881108, + "flos": 512809989120.0, + "grad_norm": 0.0631616677310412, + "language_loss": 0.84810489, + "learning_rate": 0.000750988626390968, + "loss": 0.85900134, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.32104492, + "step": 1833, + "time_per_iteration": 2.5985615253448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087885, + "balance_loss_mlp": 1.0560801, + "epoch": 0.3528280107733744, + "flos": 595496627712.0, + "grad_norm": 0.053730319302775706, + "language_loss": 0.84857321, + "learning_rate": 0.0007507191307178108, + "loss": 0.85945207, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.31787109, + "step": 1834, + "time_per_iteration": 2.822472095489502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088823, + "balance_loss_mlp": 1.05785227, + "epoch": 0.3530203924586379, + "flos": 550969814016.0, + "grad_norm": 0.07238185360826516, + "language_loss": 0.74172056, + "learning_rate": 0.0007504495377068543, + "loss": 0.75260878, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.30932617, + "step": 1835, + "time_per_iteration": 2.758622884750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094119, + "balance_loss_mlp": 1.06250441, + "epoch": 0.3532127741439015, + "flos": 652662876672.0, + "grad_norm": 0.06860617015764896, + "language_loss": 0.81217551, + "learning_rate": 0.0007501798474627642, + "loss": 0.82311678, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.31591797, + "step": 1836, + "time_per_iteration": 2.932610034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095464, + "balance_loss_mlp": 1.06568563, + "epoch": 0.35340515582916504, + "flos": 722452594176.0, + "grad_norm": 0.06442397939494823, + "language_loss": 0.83527768, + "learning_rate": 0.0007499100600902433, + "loss": 0.8462323, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.29736328, + "step": 1837, + "time_per_iteration": 3.0089991092681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089306, + "balance_loss_mlp": 1.05845428, + "epoch": 0.35359753751442863, + "flos": 594619301376.0, + "grad_norm": 0.06893251529793973, + "language_loss": 0.83798671, + "learning_rate": 0.0007496401756940324, + "loss": 0.84887969, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.30810547, + "step": 1838, + "time_per_iteration": 2.6746418476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090814, + "balance_loss_mlp": 1.06029606, + "epoch": 0.3537899191996922, + "flos": 632384838144.0, + "grad_norm": 0.06403380726847299, + "language_loss": 0.82561135, + "learning_rate": 0.0007493701943789098, + "loss": 0.83651948, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.3046875, + "step": 1839, + "time_per_iteration": 2.7678062915802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092399, + "balance_loss_mlp": 1.06307316, + "epoch": 0.35398230088495575, + "flos": 506118523392.0, + "grad_norm": 0.057234368489623245, + "language_loss": 0.82641804, + "learning_rate": 0.000749100116249692, + "loss": 0.83734202, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.29272461, + "step": 1840, + "time_per_iteration": 2.6124982833862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091953, + "balance_loss_mlp": 1.0616498, + "epoch": 0.35417468257021933, + "flos": 507783015936.0, + "grad_norm": 0.09225915028059628, + "language_loss": 0.86273944, + "learning_rate": 0.0007488299414112321, + "loss": 0.87365901, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.30249023, + "step": 1841, + "time_per_iteration": 2.615434169769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087223, + "balance_loss_mlp": 1.05737281, + "epoch": 0.35436706425548287, + "flos": 656133046272.0, + "grad_norm": 0.0557731038759208, + "language_loss": 0.77796137, + "learning_rate": 0.0007485596699684215, + "loss": 0.78883362, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.2980957, + "step": 1842, + "time_per_iteration": 2.83414626121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087281, + "balance_loss_mlp": 1.05561948, + "epoch": 0.35455944594074645, + "flos": 652322433024.0, + "grad_norm": 0.04938820360777142, + "language_loss": 0.85113978, + "learning_rate": 0.000748289302026189, + "loss": 0.86201257, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.31640625, + "step": 1843, + "time_per_iteration": 2.8805251121520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084484, + "balance_loss_mlp": 1.05403841, + "epoch": 0.35475182762601, + "flos": 848240252928.0, + "grad_norm": 0.06499404847276229, + "language_loss": 0.85830677, + "learning_rate": 0.0007480188376895004, + "loss": 0.86915159, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.30395508, + "step": 1844, + "time_per_iteration": 3.0965142250061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062747, + "balance_loss_mlp": 1.04624832, + "epoch": 0.3549442093112736, + "flos": 1520644133376.0, + "grad_norm": 0.026974392702602535, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74874085, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.16503906, + "step": 1845, + "time_per_iteration": 5.003226280212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088616, + "balance_loss_mlp": 1.05738342, + "epoch": 0.3551365909965371, + "flos": 651087134208.0, + "grad_norm": 0.11496133406812095, + "language_loss": 0.78570682, + "learning_rate": 0.0007474776202528074, + "loss": 0.79659295, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.31201172, + "step": 1846, + "time_per_iteration": 2.9579098224639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089072, + "balance_loss_mlp": 1.05736208, + "epoch": 0.3553289726818007, + "flos": 897094213632.0, + "grad_norm": 0.06294098896241457, + "language_loss": 0.81369591, + "learning_rate": 0.000747206867362922, + "loss": 0.82458663, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.31689453, + "step": 1847, + "time_per_iteration": 3.0886905193328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109789, + "balance_loss_mlp": 1.06656218, + "epoch": 0.3555213543670643, + "flos": 688181958144.0, + "grad_norm": 0.060378794046525276, + "language_loss": 0.83593512, + "learning_rate": 0.0007469360184988194, + "loss": 0.84691405, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.31298828, + "step": 1848, + "time_per_iteration": 2.861438512802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109845, + "balance_loss_mlp": 1.06724131, + "epoch": 0.3557137360523278, + "flos": 538305647616.0, + "grad_norm": 0.06250375704468988, + "language_loss": 0.86663848, + "learning_rate": 0.0007466650737656518, + "loss": 0.87762296, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.31176758, + "step": 1849, + "time_per_iteration": 2.620384454727173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098996, + "balance_loss_mlp": 1.06754851, + "epoch": 0.3559061177375914, + "flos": 402039230976.0, + "grad_norm": 0.05619364173691644, + "language_loss": 0.90150386, + "learning_rate": 0.0007463940332686098, + "loss": 0.91249382, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.31420898, + "step": 1850, + "time_per_iteration": 2.499337911605835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097613, + "balance_loss_mlp": 1.06711888, + "epoch": 0.35609849942285493, + "flos": 696238170624.0, + "grad_norm": 0.05220134930851383, + "language_loss": 0.8454684, + "learning_rate": 0.0007461228971129205, + "loss": 0.85644454, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.30444336, + "step": 1851, + "time_per_iteration": 2.91583251953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090798, + "balance_loss_mlp": 1.06049538, + "epoch": 0.3562908811081185, + "flos": 568660953600.0, + "grad_norm": 0.06507053577711389, + "language_loss": 0.85374135, + "learning_rate": 0.0007458516654038483, + "loss": 0.8646493, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.30297852, + "step": 1852, + "time_per_iteration": 2.710845947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093158, + "balance_loss_mlp": 1.06221175, + "epoch": 0.35648326279338205, + "flos": 682081219584.0, + "grad_norm": 0.055267605083424515, + "language_loss": 0.86826843, + "learning_rate": 0.0007455803382466946, + "loss": 0.87919998, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.30908203, + "step": 1853, + "time_per_iteration": 2.8157601356506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089896, + "balance_loss_mlp": 1.05894923, + "epoch": 0.35667564447864564, + "flos": 628840475136.0, + "grad_norm": 0.06143674576014299, + "language_loss": 0.87150055, + "learning_rate": 0.0007453089157467979, + "loss": 0.8823995, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.30908203, + "step": 1854, + "time_per_iteration": 2.7985024452209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101035, + "balance_loss_mlp": 1.06946826, + "epoch": 0.35686802616390917, + "flos": 813685837824.0, + "grad_norm": 0.06203911404438901, + "language_loss": 0.82222199, + "learning_rate": 0.0007450373980095341, + "loss": 0.83323234, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.31542969, + "step": 1855, + "time_per_iteration": 3.0960283279418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101415, + "balance_loss_mlp": 1.07108843, + "epoch": 0.35706040784917276, + "flos": 525922288128.0, + "grad_norm": 0.05169641299516589, + "language_loss": 0.86845142, + "learning_rate": 0.0007447657851403155, + "loss": 0.87946558, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.30322266, + "step": 1856, + "time_per_iteration": 2.6420810222625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106839, + "balance_loss_mlp": 1.07689333, + "epoch": 0.35725278953443634, + "flos": 511698345984.0, + "grad_norm": 0.07027910399075639, + "language_loss": 0.78771162, + "learning_rate": 0.0007444940772445915, + "loss": 0.79878008, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.29907227, + "step": 1857, + "time_per_iteration": 2.748770236968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109389, + "balance_loss_mlp": 1.06420684, + "epoch": 0.3574451712196999, + "flos": 487162971648.0, + "grad_norm": 0.057407361829253975, + "language_loss": 0.80228555, + "learning_rate": 0.0007442222744278484, + "loss": 0.81322443, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.29663086, + "step": 1858, + "time_per_iteration": 2.652111530303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094475, + "balance_loss_mlp": 1.06410074, + "epoch": 0.35763755290496346, + "flos": 550384879104.0, + "grad_norm": 0.045384089682170406, + "language_loss": 0.8399753, + "learning_rate": 0.0007439503767956099, + "loss": 0.85092002, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.30371094, + "step": 1859, + "time_per_iteration": 2.703261375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044367, + "balance_loss_mlp": 1.03111064, + "epoch": 0.357829934590227, + "flos": 1503300791808.0, + "grad_norm": 0.02493030642290896, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80715972, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.1328125, + "step": 1860, + "time_per_iteration": 4.983760833740234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092897, + "balance_loss_mlp": 1.06242704, + "epoch": 0.3580223162754906, + "flos": 568410670080.0, + "grad_norm": 0.05045998946960442, + "language_loss": 0.85959804, + "learning_rate": 0.000743406297506922, + "loss": 0.87052703, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.30419922, + "step": 1861, + "time_per_iteration": 2.740078926086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090008, + "balance_loss_mlp": 1.05956221, + "epoch": 0.3582146979607541, + "flos": 626153089536.0, + "grad_norm": 0.05968554082553822, + "language_loss": 0.8392486, + "learning_rate": 0.0007431341160617031, + "loss": 0.85014868, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.30395508, + "step": 1862, + "time_per_iteration": 2.8886373043060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076671, + "balance_loss_mlp": 1.04631984, + "epoch": 0.3584070796460177, + "flos": 507010406400.0, + "grad_norm": 0.053643840261235066, + "language_loss": 0.88015211, + "learning_rate": 0.0007428618402234491, + "loss": 0.89091879, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.30297852, + "step": 1863, + "time_per_iteration": 2.687030553817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074194, + "balance_loss_mlp": 1.04334283, + "epoch": 0.3585994613312813, + "flos": 606190763520.0, + "grad_norm": 0.062332671108041963, + "language_loss": 0.80358481, + "learning_rate": 0.0007425894700978668, + "loss": 0.81432676, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.30810547, + "step": 1864, + "time_per_iteration": 2.7334656715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072556, + "balance_loss_mlp": 1.04101336, + "epoch": 0.3587918430165448, + "flos": 1412338484736.0, + "grad_norm": 0.050645747658019255, + "language_loss": 0.79510379, + "learning_rate": 0.0007423170057906996, + "loss": 0.80582935, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.31542969, + "step": 1865, + "time_per_iteration": 3.8669073581695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076041, + "balance_loss_mlp": 1.04452205, + "epoch": 0.3589842247018084, + "flos": 478313800704.0, + "grad_norm": 0.06345597879427126, + "language_loss": 0.86289865, + "learning_rate": 0.0007420444474077275, + "loss": 0.87365907, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.31518555, + "step": 1866, + "time_per_iteration": 2.5648367404937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080689, + "balance_loss_mlp": 1.04878831, + "epoch": 0.35917660638707194, + "flos": 504464205312.0, + "grad_norm": 0.058480526362169126, + "language_loss": 0.89744091, + "learning_rate": 0.0007417717950547671, + "loss": 0.90824777, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.31884766, + "step": 1867, + "time_per_iteration": 2.5665245056152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074714, + "balance_loss_mlp": 1.0600276, + "epoch": 0.3593689880723355, + "flos": 1491294191616.0, + "grad_norm": 0.04131149216661822, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77071321, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.14648438, + "step": 1868, + "time_per_iteration": 4.900072813034058 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091662, + "balance_loss_mlp": 1.06035757, + "epoch": 0.35956136975759906, + "flos": 528369564672.0, + "grad_norm": 0.04948067344873762, + "language_loss": 0.84714514, + "learning_rate": 0.0007412262088623299, + "loss": 0.85806173, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.31274414, + "step": 1869, + "time_per_iteration": 2.72872257232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109305, + "balance_loss_mlp": 1.06255615, + "epoch": 0.35975375144286265, + "flos": 534647803392.0, + "grad_norm": 0.0631690153505957, + "language_loss": 0.79514921, + "learning_rate": 0.0007409532752346684, + "loss": 0.80607969, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.30444336, + "step": 1870, + "time_per_iteration": 2.646813154220581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084384, + "balance_loss_mlp": 1.05436683, + "epoch": 0.3599461331281262, + "flos": 504695549952.0, + "grad_norm": 0.05200384527654752, + "language_loss": 0.88430232, + "learning_rate": 0.0007406802480606491, + "loss": 0.89514613, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.29956055, + "step": 1871, + "time_per_iteration": 2.6335039138793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088571, + "balance_loss_mlp": 1.05819631, + "epoch": 0.36013851481338977, + "flos": 511283708928.0, + "grad_norm": 0.058340376963862656, + "language_loss": 0.90469301, + "learning_rate": 0.0007404071274462707, + "loss": 0.91557872, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.3034668, + "step": 1872, + "time_per_iteration": 2.579155206680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088392, + "balance_loss_mlp": 1.05911398, + "epoch": 0.36033089649865335, + "flos": 547330908672.0, + "grad_norm": 0.06288764850432389, + "language_loss": 0.83945811, + "learning_rate": 0.0007401339134975682, + "loss": 0.85034204, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.29272461, + "step": 1873, + "time_per_iteration": 2.6590254306793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089736, + "balance_loss_mlp": 1.06024313, + "epoch": 0.3605232781839169, + "flos": 458416903680.0, + "grad_norm": 0.07025897777145818, + "language_loss": 0.84501064, + "learning_rate": 0.0007398606063206122, + "loss": 0.85590804, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.29467773, + "step": 1874, + "time_per_iteration": 2.6330654621124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084755, + "balance_loss_mlp": 1.05545354, + "epoch": 0.36071565986918047, + "flos": 509309296128.0, + "grad_norm": 0.05525815693458704, + "language_loss": 0.78668261, + "learning_rate": 0.0007395872060215101, + "loss": 0.79753017, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.29296875, + "step": 1875, + "time_per_iteration": 2.665205955505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087119, + "balance_loss_mlp": 1.05853248, + "epoch": 0.360908041554444, + "flos": 558931484160.0, + "grad_norm": 0.05566722247490556, + "language_loss": 0.88191175, + "learning_rate": 0.0007393137127064056, + "loss": 0.89278299, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.28588867, + "step": 1876, + "time_per_iteration": 2.67520809173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083262, + "balance_loss_mlp": 1.05479455, + "epoch": 0.3611004232397076, + "flos": 523588492800.0, + "grad_norm": 0.05183280051917729, + "language_loss": 0.84175742, + "learning_rate": 0.0007390401264814779, + "loss": 0.85258996, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.28491211, + "step": 1877, + "time_per_iteration": 2.621708631515503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083828, + "balance_loss_mlp": 1.05559897, + "epoch": 0.3612928049249711, + "flos": 540728193024.0, + "grad_norm": 0.059598774698536174, + "language_loss": 0.84762645, + "learning_rate": 0.0007387664474529427, + "loss": 0.85846466, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.28222656, + "step": 1878, + "time_per_iteration": 2.64604115486145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085745, + "balance_loss_mlp": 1.0567776, + "epoch": 0.3614851866102347, + "flos": 552289480704.0, + "grad_norm": 0.05278661870548292, + "language_loss": 0.90893793, + "learning_rate": 0.0007384926757270518, + "loss": 0.91979533, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.28955078, + "step": 1879, + "time_per_iteration": 2.63849139213562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094605, + "balance_loss_mlp": 1.0652554, + "epoch": 0.36167756829549824, + "flos": 771734338560.0, + "grad_norm": 0.05095981973878578, + "language_loss": 0.79965544, + "learning_rate": 0.0007382188114100924, + "loss": 0.81060153, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.29296875, + "step": 1880, + "time_per_iteration": 2.967137098312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096998, + "balance_loss_mlp": 1.06731534, + "epoch": 0.36186994998076183, + "flos": 711560609280.0, + "grad_norm": 0.0523610100033388, + "language_loss": 0.81541228, + "learning_rate": 0.0007379448546083884, + "loss": 0.82638228, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.29663086, + "step": 1881, + "time_per_iteration": 2.935075283050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089574, + "balance_loss_mlp": 1.06036723, + "epoch": 0.3620623316660254, + "flos": 747209138688.0, + "grad_norm": 0.056326792126263736, + "language_loss": 0.88131809, + "learning_rate": 0.0007376708054282992, + "loss": 0.89221382, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.29174805, + "step": 1882, + "time_per_iteration": 2.9548256397247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080549, + "balance_loss_mlp": 1.05074644, + "epoch": 0.36225471335128895, + "flos": 482312088576.0, + "grad_norm": 0.053377968629185854, + "language_loss": 0.8395232, + "learning_rate": 0.0007373966639762201, + "loss": 0.85032874, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.29785156, + "step": 1883, + "time_per_iteration": 2.5978147983551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079871, + "balance_loss_mlp": 1.05085516, + "epoch": 0.36244709503655254, + "flos": 506655406080.0, + "grad_norm": 0.055969169447774005, + "language_loss": 0.88542271, + "learning_rate": 0.0007371224303585822, + "loss": 0.8962214, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.29003906, + "step": 1884, + "time_per_iteration": 2.573521614074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122192, + "balance_loss_mlp": 1.10817313, + "epoch": 0.36263947672181607, + "flos": 1393302643200.0, + "grad_norm": 0.05390094690370155, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81479263, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.140625, + "step": 1885, + "time_per_iteration": 4.762617826461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077599, + "balance_loss_mlp": 1.04722452, + "epoch": 0.36283185840707965, + "flos": 652991735808.0, + "grad_norm": 0.05279204841925659, + "language_loss": 0.8277564, + "learning_rate": 0.0007365736870525335, + "loss": 0.83853239, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.30322266, + "step": 1886, + "time_per_iteration": 2.8206799030303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071958, + "balance_loss_mlp": 1.04182231, + "epoch": 0.3630242400923432, + "flos": 488619440640.0, + "grad_norm": 0.0631822735743998, + "language_loss": 0.82252121, + "learning_rate": 0.000736299177577164, + "loss": 0.83324087, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.30102539, + "step": 1887, + "time_per_iteration": 2.5644423961639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075611, + "balance_loss_mlp": 1.04516482, + "epoch": 0.3632166217776068, + "flos": 516892644864.0, + "grad_norm": 0.06952119877485304, + "language_loss": 0.83928037, + "learning_rate": 0.0007360245763623174, + "loss": 0.8500365, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.30395508, + "step": 1888, + "time_per_iteration": 2.68868088722229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076643, + "balance_loss_mlp": 1.04614949, + "epoch": 0.36340900346287036, + "flos": 645881250816.0, + "grad_norm": 0.05500458280543127, + "language_loss": 0.89759338, + "learning_rate": 0.0007357498835146039, + "loss": 0.90835977, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.30444336, + "step": 1889, + "time_per_iteration": 2.841135263442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078037, + "balance_loss_mlp": 1.04716182, + "epoch": 0.3636013851481339, + "flos": 553057708032.0, + "grad_norm": 0.05518095134274227, + "language_loss": 0.86945391, + "learning_rate": 0.0007354750991406684, + "loss": 0.8802343, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.30834961, + "step": 1890, + "time_per_iteration": 2.6954762935638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079646, + "balance_loss_mlp": 1.04810333, + "epoch": 0.3637937668333975, + "flos": 546395355648.0, + "grad_norm": 0.060964398763012274, + "language_loss": 0.80524838, + "learning_rate": 0.0007352002233471919, + "loss": 0.81604487, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.31518555, + "step": 1891, + "time_per_iteration": 2.6167404651641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079464, + "balance_loss_mlp": 1.04973292, + "epoch": 0.363986148518661, + "flos": 537838576128.0, + "grad_norm": 0.06807309201777603, + "language_loss": 0.79092562, + "learning_rate": 0.0007349252562408906, + "loss": 0.80172026, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.296875, + "step": 1892, + "time_per_iteration": 2.6944479942321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091379, + "balance_loss_mlp": 1.06071806, + "epoch": 0.3641785302039246, + "flos": 659895607296.0, + "grad_norm": 0.05563142804906438, + "language_loss": 0.81399196, + "learning_rate": 0.0007346501979285158, + "loss": 0.82490575, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.30615234, + "step": 1893, + "time_per_iteration": 2.8852903842926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074867, + "balance_loss_mlp": 1.06208813, + "epoch": 0.36437091188918813, + "flos": 1467911158272.0, + "grad_norm": 0.02944776437417564, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.8161397, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.12792969, + "step": 1894, + "time_per_iteration": 4.784174680709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114227, + "balance_loss_mlp": 1.0819447, + "epoch": 0.3645632935744517, + "flos": 597012733440.0, + "grad_norm": 0.051755500006301046, + "language_loss": 0.8558799, + "learning_rate": 0.0007340998081127308, + "loss": 0.86702216, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.32275391, + "step": 1895, + "time_per_iteration": 2.807494878768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121943, + "balance_loss_mlp": 1.09023345, + "epoch": 0.36475567525971525, + "flos": 599214108672.0, + "grad_norm": 0.06567695066031824, + "language_loss": 0.90748346, + "learning_rate": 0.0007338244768230007, + "loss": 0.9187029, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.31689453, + "step": 1896, + "time_per_iteration": 2.7678794860839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118221, + "balance_loss_mlp": 1.08694077, + "epoch": 0.36494805694497884, + "flos": 798047686656.0, + "grad_norm": 0.07782470610585689, + "language_loss": 0.8913762, + "learning_rate": 0.0007335490547545578, + "loss": 0.90255845, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.3125, + "step": 1897, + "time_per_iteration": 3.0801138877868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112607, + "balance_loss_mlp": 1.0822562, + "epoch": 0.3651404386302424, + "flos": 637023315456.0, + "grad_norm": 0.05264242736204855, + "language_loss": 0.82653165, + "learning_rate": 0.0007332735420143308, + "loss": 0.83765769, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.30297852, + "step": 1898, + "time_per_iteration": 2.7581489086151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094572, + "balance_loss_mlp": 1.06338716, + "epoch": 0.36533282031550596, + "flos": 491337349632.0, + "grad_norm": 0.06387883695900265, + "language_loss": 0.8681283, + "learning_rate": 0.0007329979387092826, + "loss": 0.87907398, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.31152344, + "step": 1899, + "time_per_iteration": 2.586489677429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090283, + "balance_loss_mlp": 1.05964673, + "epoch": 0.36552520200076954, + "flos": 855587874816.0, + "grad_norm": 0.054083416077733606, + "language_loss": 0.83626556, + "learning_rate": 0.0007327222449464124, + "loss": 0.84716845, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.3059082, + "step": 1900, + "time_per_iteration": 3.2495076656341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010843, + "balance_loss_mlp": 1.0518986, + "epoch": 0.3657175836860331, + "flos": 483449872896.0, + "grad_norm": 0.05500564094416643, + "language_loss": 0.88598847, + "learning_rate": 0.0007324464608327538, + "loss": 0.89683151, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.32397461, + "step": 1901, + "time_per_iteration": 2.617971897125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079363, + "balance_loss_mlp": 1.04786777, + "epoch": 0.36590996537129666, + "flos": 434561006592.0, + "grad_norm": 0.0538418205513684, + "language_loss": 0.88291639, + "learning_rate": 0.0007321705864753758, + "loss": 0.89371002, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.31469727, + "step": 1902, + "time_per_iteration": 2.69343638420105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074418, + "balance_loss_mlp": 1.04294717, + "epoch": 0.3661023470565602, + "flos": 711880704000.0, + "grad_norm": 0.056477009868628435, + "language_loss": 0.84098166, + "learning_rate": 0.0007318946219813823, + "loss": 0.85172582, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.31469727, + "step": 1903, + "time_per_iteration": 3.010847568511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074916, + "balance_loss_mlp": 1.04232407, + "epoch": 0.3662947287418238, + "flos": 564495340032.0, + "grad_norm": 0.05768945263904951, + "language_loss": 0.89714533, + "learning_rate": 0.000731618567457912, + "loss": 0.90789449, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.32592773, + "step": 1904, + "time_per_iteration": 2.6410703659057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076588, + "balance_loss_mlp": 1.0440681, + "epoch": 0.3664871104270873, + "flos": 789391982592.0, + "grad_norm": 0.05570087619571841, + "language_loss": 0.86445332, + "learning_rate": 0.000731342423012139, + "loss": 0.87521917, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.32519531, + "step": 1905, + "time_per_iteration": 3.054703712463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074863, + "balance_loss_mlp": 1.04312992, + "epoch": 0.3666794921123509, + "flos": 752202616320.0, + "grad_norm": 0.05663901457074664, + "language_loss": 0.82393479, + "learning_rate": 0.0007310661887512722, + "loss": 0.83468342, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.31713867, + "step": 1906, + "time_per_iteration": 3.0096654891967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076944, + "balance_loss_mlp": 1.04532969, + "epoch": 0.3668718737976145, + "flos": 523264015872.0, + "grad_norm": 0.07427377535541638, + "language_loss": 0.8207258, + "learning_rate": 0.0007307898647825549, + "loss": 0.83149529, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.31591797, + "step": 1907, + "time_per_iteration": 2.67525315284729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075733, + "balance_loss_mlp": 1.04347432, + "epoch": 0.367064255482878, + "flos": 571698957312.0, + "grad_norm": 0.07021562329929035, + "language_loss": 0.89152002, + "learning_rate": 0.0007305134512132659, + "loss": 0.90227735, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.32250977, + "step": 1908, + "time_per_iteration": 2.6483352184295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079044, + "balance_loss_mlp": 1.0476923, + "epoch": 0.3672566371681416, + "flos": 446880347136.0, + "grad_norm": 0.07878350898766671, + "language_loss": 0.83255082, + "learning_rate": 0.0007302369481507183, + "loss": 0.84334129, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.31323242, + "step": 1909, + "time_per_iteration": 2.5106606483459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108859, + "balance_loss_mlp": 1.09207463, + "epoch": 0.36744901885340514, + "flos": 1539275208192.0, + "grad_norm": 0.039316944601114644, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.8107062, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.16796875, + "step": 1910, + "time_per_iteration": 4.845642566680908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073802, + "balance_loss_mlp": 1.04287899, + "epoch": 0.36764140053866873, + "flos": 563417192448.0, + "grad_norm": 0.05282525969479425, + "language_loss": 0.8551507, + "learning_rate": 0.000729683673975274, + "loss": 0.86588871, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.30883789, + "step": 1911, + "time_per_iteration": 2.643991470336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077837, + "balance_loss_mlp": 1.04648542, + "epoch": 0.36783378222393226, + "flos": 1216168971264.0, + "grad_norm": 0.06579029503933971, + "language_loss": 0.83071077, + "learning_rate": 0.0007294069030771774, + "loss": 0.84148908, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.31323242, + "step": 1912, + "time_per_iteration": 3.6745028495788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081127, + "balance_loss_mlp": 1.05053759, + "epoch": 0.36802616390919585, + "flos": 498476947968.0, + "grad_norm": 0.055639286508135585, + "language_loss": 0.90529931, + "learning_rate": 0.0007291300431154224, + "loss": 0.91611063, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.30541992, + "step": 1913, + "time_per_iteration": 2.6364145278930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020102, + "balance_loss_mlp": 1.00503433, + "epoch": 0.36821854559445943, + "flos": 1581281961984.0, + "grad_norm": 0.014819520409209537, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71409839, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.15039062, + "step": 1914, + "time_per_iteration": 4.986552000045776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089166, + "balance_loss_mlp": 1.05895889, + "epoch": 0.36841092727972297, + "flos": 835261784064.0, + "grad_norm": 0.07166131614104637, + "language_loss": 0.80129957, + "learning_rate": 0.0007285760564309179, + "loss": 0.81219125, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.30151367, + "step": 1915, + "time_per_iteration": 3.105180025100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082922, + "balance_loss_mlp": 1.05362058, + "epoch": 0.36860330896498655, + "flos": 689517591552.0, + "grad_norm": 0.07315246202889085, + "language_loss": 0.85023272, + "learning_rate": 0.0007282989299232448, + "loss": 0.86106199, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.29272461, + "step": 1916, + "time_per_iteration": 3.0501549243927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085882, + "balance_loss_mlp": 1.05710506, + "epoch": 0.3687956906502501, + "flos": 553919067648.0, + "grad_norm": 0.0682472178493412, + "language_loss": 0.83468378, + "learning_rate": 0.0007280217147820668, + "loss": 0.84554267, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.28735352, + "step": 1917, + "time_per_iteration": 2.61570143699646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096949, + "balance_loss_mlp": 1.06836295, + "epoch": 0.3689880723355137, + "flos": 576426184704.0, + "grad_norm": 0.06368361877082852, + "language_loss": 0.79183483, + "learning_rate": 0.0007277444111150079, + "loss": 0.80280429, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.28613281, + "step": 1918, + "time_per_iteration": 2.7004950046539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108995, + "balance_loss_mlp": 1.06124449, + "epoch": 0.3691804540207772, + "flos": 528615465984.0, + "grad_norm": 0.07280537378335762, + "language_loss": 0.84052753, + "learning_rate": 0.0007274670190297272, + "loss": 0.85142708, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.28710938, + "step": 1919, + "time_per_iteration": 2.598128080368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098824, + "balance_loss_mlp": 1.06902122, + "epoch": 0.3693728357060408, + "flos": 560729806848.0, + "grad_norm": 0.05243134255501039, + "language_loss": 0.82081646, + "learning_rate": 0.0007271895386339179, + "loss": 0.83180475, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.29736328, + "step": 1920, + "time_per_iteration": 2.7843308448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093148, + "balance_loss_mlp": 1.06360769, + "epoch": 0.3695652173913043, + "flos": 579488919552.0, + "grad_norm": 0.058714378397154585, + "language_loss": 0.83102447, + "learning_rate": 0.0007269119700353073, + "loss": 0.8419559, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.29492188, + "step": 1921, + "time_per_iteration": 2.7665817737579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089369, + "balance_loss_mlp": 1.06052053, + "epoch": 0.3697575990765679, + "flos": 512629516800.0, + "grad_norm": 0.04695414461356542, + "language_loss": 0.84780574, + "learning_rate": 0.0007266343133416571, + "loss": 0.85869944, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.28833008, + "step": 1922, + "time_per_iteration": 2.779585361480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065569, + "balance_loss_mlp": 1.05011928, + "epoch": 0.3699499807618315, + "flos": 1569826953216.0, + "grad_norm": 0.04139595668748732, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78182483, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.15429688, + "step": 1923, + "time_per_iteration": 4.841213703155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085527, + "balance_loss_mlp": 1.05591547, + "epoch": 0.37014236244709503, + "flos": 497093262336.0, + "grad_norm": 0.07673769099321799, + "language_loss": 0.84293365, + "learning_rate": 0.0007260787361004556, + "loss": 0.85378897, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.2956543, + "step": 1924, + "time_per_iteration": 2.5501017570495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023058, + "balance_loss_mlp": 1.00875258, + "epoch": 0.3703347441323586, + "flos": 1443562048512.0, + "grad_norm": 0.01226438472350035, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74784565, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.14257812, + "step": 1925, + "time_per_iteration": 4.9058191776275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079083, + "balance_loss_mlp": 1.05040073, + "epoch": 0.37052712581762215, + "flos": 563324060160.0, + "grad_norm": 0.0733591012555623, + "language_loss": 0.87266588, + "learning_rate": 0.0007255228077730903, + "loss": 0.88345671, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.28686523, + "step": 1926, + "time_per_iteration": 2.6776785850524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080805, + "balance_loss_mlp": 1.05281413, + "epoch": 0.37071950750288574, + "flos": 925706451456.0, + "grad_norm": 0.05143591599053885, + "language_loss": 0.81313562, + "learning_rate": 0.0007252447122218632, + "loss": 0.82394373, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.2800293, + "step": 1927, + "time_per_iteration": 3.1710472106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077781, + "balance_loss_mlp": 1.04907489, + "epoch": 0.37091188918814927, + "flos": 418090609152.0, + "grad_norm": 0.07597924069729044, + "language_loss": 0.88653511, + "learning_rate": 0.0007249665292228834, + "loss": 0.89731288, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.28686523, + "step": 1928, + "time_per_iteration": 2.580092191696167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108352, + "balance_loss_mlp": 1.0547905, + "epoch": 0.37110427087341286, + "flos": 462941899776.0, + "grad_norm": 0.05796370091963761, + "language_loss": 0.8379482, + "learning_rate": 0.000724688258884151, + "loss": 0.84878337, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.28710938, + "step": 1929, + "time_per_iteration": 2.6322267055511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086107, + "balance_loss_mlp": 1.05740142, + "epoch": 0.3712966525586764, + "flos": 849303843840.0, + "grad_norm": 0.049384577339976525, + "language_loss": 0.86327779, + "learning_rate": 0.0007244099013137002, + "loss": 0.87413883, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.28710938, + "step": 1930, + "time_per_iteration": 3.09224009513855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087908, + "balance_loss_mlp": 1.05951214, + "epoch": 0.37148903424394, + "flos": 925555092480.0, + "grad_norm": 0.06129670734370297, + "language_loss": 0.88767004, + "learning_rate": 0.0007241314566195993, + "loss": 0.89854914, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.28393555, + "step": 1931, + "time_per_iteration": 3.238381862640381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094186, + "balance_loss_mlp": 1.06531322, + "epoch": 0.37168141592920356, + "flos": 519565473792.0, + "grad_norm": 0.05545779345638414, + "language_loss": 0.85434037, + "learning_rate": 0.0007238529249099496, + "loss": 0.86528224, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.28833008, + "step": 1932, + "time_per_iteration": 2.632279872894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159138, + "balance_loss_mlp": 1.1475507, + "epoch": 0.3718737976144671, + "flos": 1445107267584.0, + "grad_norm": 0.054961579821259376, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.79016018, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.11572266, + "step": 1933, + "time_per_iteration": 4.920037746429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098131, + "balance_loss_mlp": 1.06902027, + "epoch": 0.3720661792997307, + "flos": 759218558976.0, + "grad_norm": 0.06411393233522368, + "language_loss": 0.80432916, + "learning_rate": 0.000723295600876581, + "loss": 0.81531054, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.29101562, + "step": 1934, + "time_per_iteration": 3.060438632965088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093478, + "balance_loss_mlp": 1.06510615, + "epoch": 0.3722585609849942, + "flos": 516686031360.0, + "grad_norm": 0.054125512250282885, + "language_loss": 0.87856102, + "learning_rate": 0.0007230168087692344, + "loss": 0.88949579, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.28393555, + "step": 1935, + "time_per_iteration": 2.655176877975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095042, + "balance_loss_mlp": 1.06607461, + "epoch": 0.3724509426702578, + "flos": 782114171904.0, + "grad_norm": 0.053712544631880174, + "language_loss": 0.82501912, + "learning_rate": 0.0007227379300790839, + "loss": 0.83596957, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.28955078, + "step": 1936, + "time_per_iteration": 3.05722713470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086223, + "balance_loss_mlp": 1.05668318, + "epoch": 0.37264332435552133, + "flos": 391502246400.0, + "grad_norm": 0.05452705072121448, + "language_loss": 0.85148442, + "learning_rate": 0.0007224589649143997, + "loss": 0.86234665, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.29492188, + "step": 1937, + "time_per_iteration": 2.593818187713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089067, + "balance_loss_mlp": 1.06021869, + "epoch": 0.3728357060407849, + "flos": 542599299072.0, + "grad_norm": 0.08689315573767935, + "language_loss": 0.80660325, + "learning_rate": 0.0007221799133834861, + "loss": 0.81749392, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.28833008, + "step": 1938, + "time_per_iteration": 2.6238772869110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087089, + "balance_loss_mlp": 1.05869377, + "epoch": 0.3730280877260485, + "flos": 433344646656.0, + "grad_norm": 0.06550449761554421, + "language_loss": 0.81904262, + "learning_rate": 0.00072190077559468, + "loss": 0.8299135, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.28417969, + "step": 1939, + "time_per_iteration": 2.5338878631591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086155, + "balance_loss_mlp": 1.05649543, + "epoch": 0.37322046941131204, + "flos": 531230068224.0, + "grad_norm": 0.05171807924061888, + "language_loss": 0.89000612, + "learning_rate": 0.0007216215516563527, + "loss": 0.90086764, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.29589844, + "step": 1940, + "time_per_iteration": 2.717912435531616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083323, + "balance_loss_mlp": 1.05449796, + "epoch": 0.3734128510965756, + "flos": 531294087168.0, + "grad_norm": 0.06398735943962416, + "language_loss": 0.83462608, + "learning_rate": 0.0007213422416769083, + "loss": 0.84545934, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.28808594, + "step": 1941, + "time_per_iteration": 2.6354072093963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107949, + "balance_loss_mlp": 1.0511179, + "epoch": 0.37360523278183916, + "flos": 500195284992.0, + "grad_norm": 0.05310409823342424, + "language_loss": 0.75118601, + "learning_rate": 0.0007210628457647849, + "loss": 0.76198089, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.28369141, + "step": 1942, + "time_per_iteration": 2.573251724243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080746, + "balance_loss_mlp": 1.05118251, + "epoch": 0.37379761446710275, + "flos": 547652413440.0, + "grad_norm": 0.05561530112530558, + "language_loss": 0.78689432, + "learning_rate": 0.000720783364028453, + "loss": 0.79770184, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.29516602, + "step": 1943, + "time_per_iteration": 2.782897472381592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078848, + "balance_loss_mlp": 1.04935515, + "epoch": 0.3739899961523663, + "flos": 475517316096.0, + "grad_norm": 0.05583674557333592, + "language_loss": 0.87426305, + "learning_rate": 0.0007205037965764177, + "loss": 0.88505149, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.29467773, + "step": 1944, + "time_per_iteration": 2.577195167541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076729, + "balance_loss_mlp": 1.04740369, + "epoch": 0.37418237783762986, + "flos": 611626581504.0, + "grad_norm": 0.05970518460248593, + "language_loss": 0.8568424, + "learning_rate": 0.0007202241435172161, + "loss": 0.86760962, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.29296875, + "step": 1945, + "time_per_iteration": 2.7356040477752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077347, + "balance_loss_mlp": 1.04849827, + "epoch": 0.3743747595228934, + "flos": 765953694720.0, + "grad_norm": 0.057784843601785166, + "language_loss": 0.88219595, + "learning_rate": 0.0007199444049594198, + "loss": 0.89296943, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.28833008, + "step": 1946, + "time_per_iteration": 2.997744560241699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075997, + "balance_loss_mlp": 1.04681468, + "epoch": 0.374567141208157, + "flos": 524120993280.0, + "grad_norm": 0.05996621635377081, + "language_loss": 0.83343232, + "learning_rate": 0.0007196645810116322, + "loss": 0.84419227, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.29150391, + "step": 1947, + "time_per_iteration": 2.6596434116363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071198, + "balance_loss_mlp": 1.04308891, + "epoch": 0.37475952289342057, + "flos": 681067090944.0, + "grad_norm": 0.07792528533349045, + "language_loss": 0.8387686, + "learning_rate": 0.0007193846717824912, + "loss": 0.84948057, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.28149414, + "step": 1948, + "time_per_iteration": 2.87357759475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067752, + "balance_loss_mlp": 1.04031014, + "epoch": 0.3749519045786841, + "flos": 460061047296.0, + "grad_norm": 0.06284621907245236, + "language_loss": 0.88014293, + "learning_rate": 0.0007191046773806669, + "loss": 0.89082038, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.27514648, + "step": 1949, + "time_per_iteration": 2.616118907928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073776, + "balance_loss_mlp": 1.04473686, + "epoch": 0.3751442862639477, + "flos": 954471458304.0, + "grad_norm": 0.06080214721481266, + "language_loss": 0.83072305, + "learning_rate": 0.0007188245979148631, + "loss": 0.84146082, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.29003906, + "step": 1950, + "time_per_iteration": 3.212918281555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080402, + "balance_loss_mlp": 1.05164886, + "epoch": 0.3753366679492112, + "flos": 527483473920.0, + "grad_norm": 0.06034460157863772, + "language_loss": 0.87560785, + "learning_rate": 0.0007185444334938157, + "loss": 0.88641185, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.28735352, + "step": 1951, + "time_per_iteration": 2.6847927570343018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074706, + "balance_loss_mlp": 1.04635811, + "epoch": 0.3755290496344748, + "flos": 521535504384.0, + "grad_norm": 0.07362347851216991, + "language_loss": 0.85023165, + "learning_rate": 0.0007182641842262947, + "loss": 0.86097872, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.28320312, + "step": 1952, + "time_per_iteration": 2.6011481285095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080682, + "balance_loss_mlp": 1.05252457, + "epoch": 0.37572143131973834, + "flos": 620810403840.0, + "grad_norm": 0.05143100601063952, + "language_loss": 0.77525514, + "learning_rate": 0.0007179838502211022, + "loss": 0.78606194, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.28198242, + "step": 1953, + "time_per_iteration": 2.8322203159332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082839, + "balance_loss_mlp": 1.05487227, + "epoch": 0.37591381300500193, + "flos": 770635842048.0, + "grad_norm": 0.06528688845841664, + "language_loss": 0.86487108, + "learning_rate": 0.0007177034315870738, + "loss": 0.87569952, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.27978516, + "step": 1954, + "time_per_iteration": 2.9551377296447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077022, + "balance_loss_mlp": 1.04896057, + "epoch": 0.37610619469026546, + "flos": 520191106560.0, + "grad_norm": 0.059767476828271, + "language_loss": 0.90968794, + "learning_rate": 0.0007174229284330773, + "loss": 0.9204582, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.28076172, + "step": 1955, + "time_per_iteration": 2.5916919708251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076033, + "balance_loss_mlp": 1.0481143, + "epoch": 0.37629857637552905, + "flos": 598524456960.0, + "grad_norm": 0.06317358450106399, + "language_loss": 0.87043428, + "learning_rate": 0.0007171423408680141, + "loss": 0.88119459, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.27954102, + "step": 1956, + "time_per_iteration": 2.8243377208709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072634, + "balance_loss_mlp": 1.04352272, + "epoch": 0.37649095806079264, + "flos": 564687396864.0, + "grad_norm": 0.057758823731725896, + "language_loss": 0.89565909, + "learning_rate": 0.0007168616690008176, + "loss": 0.90638542, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.29125977, + "step": 1957, + "time_per_iteration": 2.6314306259155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074053, + "balance_loss_mlp": 1.04572916, + "epoch": 0.37668333974605617, + "flos": 592196755968.0, + "grad_norm": 0.055146864479517985, + "language_loss": 0.86279052, + "learning_rate": 0.0007165809129404545, + "loss": 0.87353098, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.28320312, + "step": 1958, + "time_per_iteration": 2.7625439167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074993, + "balance_loss_mlp": 1.044595, + "epoch": 0.37687572143131975, + "flos": 419257506816.0, + "grad_norm": 0.06141204693847206, + "language_loss": 0.85977095, + "learning_rate": 0.0007163000727959239, + "loss": 0.87052089, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.30371094, + "step": 1959, + "time_per_iteration": 2.473407506942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061387, + "balance_loss_mlp": 1.04622388, + "epoch": 0.3770681031165833, + "flos": 1356484243968.0, + "grad_norm": 0.02935416999593297, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79020452, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.15136719, + "step": 1960, + "time_per_iteration": 4.8784215450286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079277, + "balance_loss_mlp": 1.04973722, + "epoch": 0.3772604848018469, + "flos": 644592107520.0, + "grad_norm": 0.05722982355969982, + "language_loss": 0.84446192, + "learning_rate": 0.00071573814069052, + "loss": 0.85525477, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.29541016, + "step": 1961, + "time_per_iteration": 2.929955244064331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078902, + "balance_loss_mlp": 1.05031538, + "epoch": 0.3774528664871104, + "flos": 901265619456.0, + "grad_norm": 0.053564242831421076, + "language_loss": 0.88053226, + "learning_rate": 0.0007154570489478081, + "loss": 0.8913213, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.28540039, + "step": 1962, + "time_per_iteration": 3.1691505908966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079242, + "balance_loss_mlp": 1.05001187, + "epoch": 0.377645248172374, + "flos": 787717315584.0, + "grad_norm": 0.05213464978332433, + "language_loss": 0.86570239, + "learning_rate": 0.0007151758735572514, + "loss": 0.87649477, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.29174805, + "step": 1963, + "time_per_iteration": 2.9893381595611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080371, + "balance_loss_mlp": 1.05190408, + "epoch": 0.3778376298576376, + "flos": 586417522176.0, + "grad_norm": 0.06256473208381459, + "language_loss": 0.80730724, + "learning_rate": 0.0007148946146280119, + "loss": 0.81811094, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.28442383, + "step": 1964, + "time_per_iteration": 2.8270015716552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015118, + "balance_loss_mlp": 1.00214851, + "epoch": 0.3780300115429011, + "flos": 1396014759936.0, + "grad_norm": 0.01808471901321765, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73207271, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.12988281, + "step": 1965, + "time_per_iteration": 4.895836353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018206, + "balance_loss_mlp": 1.00561714, + "epoch": 0.3782223932281647, + "flos": 1356935348736.0, + "grad_norm": 0.021930840707602553, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76360154, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.12597656, + "step": 1966, + "time_per_iteration": 5.0023956298828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091314, + "balance_loss_mlp": 1.06358576, + "epoch": 0.37841477491342823, + "flos": 703811344896.0, + "grad_norm": 0.04479252262380658, + "language_loss": 0.83477217, + "learning_rate": 0.0007140503377003022, + "loss": 0.84568524, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.27734375, + "step": 1967, + "time_per_iteration": 3.0142691135406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097939, + "balance_loss_mlp": 1.07011509, + "epoch": 0.3786071565986918, + "flos": 528856985088.0, + "grad_norm": 0.049620821678558774, + "language_loss": 0.8500334, + "learning_rate": 0.000713768745708599, + "loss": 0.86101276, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.27856445, + "step": 1968, + "time_per_iteration": 2.6556408405303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109518, + "balance_loss_mlp": 1.06807137, + "epoch": 0.37879953828395535, + "flos": 992872802304.0, + "grad_norm": 0.05249502952466034, + "language_loss": 0.7739228, + "learning_rate": 0.0007134870707245085, + "loss": 0.78487462, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.27148438, + "step": 1969, + "time_per_iteration": 3.2944319248199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097317, + "balance_loss_mlp": 1.0706377, + "epoch": 0.37899191996921894, + "flos": 626358292992.0, + "grad_norm": 0.06611086672726225, + "language_loss": 0.84358507, + "learning_rate": 0.0007132053128573864, + "loss": 0.85455823, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.26733398, + "step": 1970, + "time_per_iteration": 2.745910167694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101147, + "balance_loss_mlp": 1.07422984, + "epoch": 0.37918430165448247, + "flos": 686005314048.0, + "grad_norm": 0.07389156257299019, + "language_loss": 0.83986598, + "learning_rate": 0.0007129234722166211, + "loss": 0.8508774, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.26977539, + "step": 1971, + "time_per_iteration": 2.8552701473236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095612, + "balance_loss_mlp": 1.06881404, + "epoch": 0.37937668333974606, + "flos": 475374721536.0, + "grad_norm": 0.0464186232668544, + "language_loss": 0.90731955, + "learning_rate": 0.0007126415489116328, + "loss": 0.91827571, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.26818848, + "step": 1972, + "time_per_iteration": 2.6738507747650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089531, + "balance_loss_mlp": 1.06185079, + "epoch": 0.37956906502500964, + "flos": 707271340032.0, + "grad_norm": 0.05397666452651625, + "language_loss": 0.81034803, + "learning_rate": 0.0007123595430518736, + "loss": 0.82124341, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.27685547, + "step": 1973, + "time_per_iteration": 2.8551318645477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089653, + "balance_loss_mlp": 1.06225908, + "epoch": 0.3797614467102732, + "flos": 426421836288.0, + "grad_norm": 0.07183677804285386, + "language_loss": 0.86159599, + "learning_rate": 0.0007120774547468282, + "loss": 0.87249249, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.27416992, + "step": 1974, + "time_per_iteration": 2.5466248989105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091836, + "balance_loss_mlp": 1.06477594, + "epoch": 0.37995382839553676, + "flos": 481588941312.0, + "grad_norm": 0.057862181788604236, + "language_loss": 0.81643212, + "learning_rate": 0.0007117952841060128, + "loss": 0.82735044, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.27099609, + "step": 1975, + "time_per_iteration": 2.6863863468170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010857, + "balance_loss_mlp": 1.05813885, + "epoch": 0.3801462100808003, + "flos": 560286056448.0, + "grad_norm": 0.06251241790432795, + "language_loss": 0.83861643, + "learning_rate": 0.0007115130312389756, + "loss": 0.84947342, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.27587891, + "step": 1976, + "time_per_iteration": 2.6821115016937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088536, + "balance_loss_mlp": 1.0602119, + "epoch": 0.3803385917660639, + "flos": 464699524608.0, + "grad_norm": 0.063889045898505, + "language_loss": 0.79037011, + "learning_rate": 0.0007112306962552973, + "loss": 0.80125546, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.28320312, + "step": 1977, + "time_per_iteration": 2.5958874225616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086598, + "balance_loss_mlp": 1.05877423, + "epoch": 0.3805309734513274, + "flos": 521614080000.0, + "grad_norm": 0.055122671956433805, + "language_loss": 0.85178941, + "learning_rate": 0.0007109482792645896, + "loss": 0.8626554, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.27832031, + "step": 1978, + "time_per_iteration": 2.706073760986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081892, + "balance_loss_mlp": 1.05363917, + "epoch": 0.380723355136591, + "flos": 591128782848.0, + "grad_norm": 0.06407360303991923, + "language_loss": 0.83617824, + "learning_rate": 0.0007106657803764969, + "loss": 0.84699714, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.2824707, + "step": 1979, + "time_per_iteration": 2.7429239749908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078619, + "balance_loss_mlp": 1.05022287, + "epoch": 0.38091573682185453, + "flos": 622394910720.0, + "grad_norm": 0.07177583644367627, + "language_loss": 0.8165133, + "learning_rate": 0.0007103831997006948, + "loss": 0.82729954, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.28393555, + "step": 1980, + "time_per_iteration": 2.7360527515411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072489, + "balance_loss_mlp": 1.04361689, + "epoch": 0.3811081185071181, + "flos": 568716208128.0, + "grad_norm": 0.06360208542685557, + "language_loss": 0.85186386, + "learning_rate": 0.0007101005373468908, + "loss": 0.86258882, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.28833008, + "step": 1981, + "time_per_iteration": 2.925529718399048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066146, + "balance_loss_mlp": 1.03775024, + "epoch": 0.3813005001923817, + "flos": 584550798336.0, + "grad_norm": 0.051682910059599525, + "language_loss": 0.86574209, + "learning_rate": 0.0007098177934248242, + "loss": 0.87640351, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.28369141, + "step": 1982, + "time_per_iteration": 2.7813186645507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066502, + "balance_loss_mlp": 1.03770101, + "epoch": 0.38149288187764524, + "flos": 621287649792.0, + "grad_norm": 0.06153978169673806, + "language_loss": 0.85434651, + "learning_rate": 0.0007095349680442661, + "loss": 0.86501151, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.2878418, + "step": 1983, + "time_per_iteration": 2.878678321838379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069354, + "balance_loss_mlp": 1.04062414, + "epoch": 0.3816852635629088, + "flos": 570414196224.0, + "grad_norm": 0.05550499316869274, + "language_loss": 0.78828371, + "learning_rate": 0.0007092520613150188, + "loss": 0.79897726, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.28710938, + "step": 1984, + "time_per_iteration": 2.667602300643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069998, + "balance_loss_mlp": 1.04057729, + "epoch": 0.38187764524817236, + "flos": 565313029632.0, + "grad_norm": 0.04940974411679134, + "language_loss": 0.81105816, + "learning_rate": 0.0007089690733469165, + "loss": 0.82175809, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.29394531, + "step": 1985, + "time_per_iteration": 2.7445921897888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077693, + "balance_loss_mlp": 1.04924965, + "epoch": 0.38207002693343595, + "flos": 630932751360.0, + "grad_norm": 0.0710841944315155, + "language_loss": 0.82154202, + "learning_rate": 0.000708686004249825, + "loss": 0.8323189, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.28442383, + "step": 1986, + "time_per_iteration": 2.803262948989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075438, + "balance_loss_mlp": 1.0459218, + "epoch": 0.3822624086186995, + "flos": 548507980800.0, + "grad_norm": 0.053095768122865476, + "language_loss": 0.91283715, + "learning_rate": 0.0007084028541336413, + "loss": 0.92359161, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.29467773, + "step": 1987, + "time_per_iteration": 2.693894147872925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075901, + "balance_loss_mlp": 1.04807711, + "epoch": 0.38245479030396307, + "flos": 613571880960.0, + "grad_norm": 0.04978295407195845, + "language_loss": 0.86100876, + "learning_rate": 0.0007081196231082942, + "loss": 0.87176782, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.27807617, + "step": 1988, + "time_per_iteration": 2.8127198219299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079083, + "balance_loss_mlp": 1.05097318, + "epoch": 0.38264717198922665, + "flos": 667787466240.0, + "grad_norm": 0.05417702481979702, + "language_loss": 0.80060172, + "learning_rate": 0.0007078363112837436, + "loss": 0.81139255, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.28125, + "step": 1989, + "time_per_iteration": 2.8839027881622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077016, + "balance_loss_mlp": 1.04866838, + "epoch": 0.3828395536744902, + "flos": 454521922560.0, + "grad_norm": 0.05590772319077314, + "language_loss": 0.84895635, + "learning_rate": 0.000707552918769981, + "loss": 0.85972643, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.28344727, + "step": 1990, + "time_per_iteration": 2.4921815395355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075886, + "balance_loss_mlp": 1.04837239, + "epoch": 0.3830319353597538, + "flos": 499191330816.0, + "grad_norm": 0.05219115858491499, + "language_loss": 0.8389315, + "learning_rate": 0.000707269445677029, + "loss": 0.84969032, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.27563477, + "step": 1991, + "time_per_iteration": 2.716742753982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080405, + "balance_loss_mlp": 1.05205727, + "epoch": 0.3832243170450173, + "flos": 743787021312.0, + "grad_norm": 0.061454112768806295, + "language_loss": 0.85369635, + "learning_rate": 0.0007069858921149416, + "loss": 0.8645004, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.28344727, + "step": 1992, + "time_per_iteration": 2.953749418258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077015, + "balance_loss_mlp": 1.04919195, + "epoch": 0.3834166987302809, + "flos": 577937908224.0, + "grad_norm": 0.04324001999537677, + "language_loss": 0.86024761, + "learning_rate": 0.0007067022581938043, + "loss": 0.87101781, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.27880859, + "step": 1993, + "time_per_iteration": 2.818094491958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072064, + "balance_loss_mlp": 1.04502726, + "epoch": 0.3836090804155444, + "flos": 536194432512.0, + "grad_norm": 0.06003802076808944, + "language_loss": 0.83055973, + "learning_rate": 0.0007064185440237334, + "loss": 0.84128034, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.27075195, + "step": 1994, + "time_per_iteration": 2.7304775714874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078237, + "balance_loss_mlp": 1.05043745, + "epoch": 0.383801462100808, + "flos": 601587191808.0, + "grad_norm": 0.054248337050939024, + "language_loss": 0.84367561, + "learning_rate": 0.0007061347497148764, + "loss": 0.85445797, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.27807617, + "step": 1995, + "time_per_iteration": 2.747483015060425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074409, + "balance_loss_mlp": 1.04706264, + "epoch": 0.38399384378607154, + "flos": 572427896832.0, + "grad_norm": 0.06054830939074019, + "language_loss": 0.86660719, + "learning_rate": 0.0007058508753774122, + "loss": 0.87735128, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.27392578, + "step": 1996, + "time_per_iteration": 2.6960108280181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078362, + "balance_loss_mlp": 1.05165958, + "epoch": 0.38418622547133513, + "flos": 536513117184.0, + "grad_norm": 0.05196412840141252, + "language_loss": 0.86974967, + "learning_rate": 0.0007055669211215505, + "loss": 0.88053334, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.26733398, + "step": 1997, + "time_per_iteration": 2.6327381134033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076337, + "balance_loss_mlp": 1.04775071, + "epoch": 0.3843786071565987, + "flos": 572673798144.0, + "grad_norm": 0.06669720231739994, + "language_loss": 0.77213579, + "learning_rate": 0.0007052828870575322, + "loss": 0.78289914, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.28588867, + "step": 1998, + "time_per_iteration": 2.6813313961029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085238, + "balance_loss_mlp": 1.05808222, + "epoch": 0.38457098884186225, + "flos": 728361275904.0, + "grad_norm": 0.053007093293579055, + "language_loss": 0.8636111, + "learning_rate": 0.0007049987732956291, + "loss": 0.87446344, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.27197266, + "step": 1999, + "time_per_iteration": 2.9743165969848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071346, + "balance_loss_mlp": 1.04323626, + "epoch": 0.38476337052712584, + "flos": 583123442688.0, + "grad_norm": 0.046114011394728885, + "language_loss": 0.82846403, + "learning_rate": 0.0007047145799461439, + "loss": 0.83917749, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.28149414, + "step": 2000, + "time_per_iteration": 2.85295033454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077125, + "balance_loss_mlp": 1.0488013, + "epoch": 0.38495575221238937, + "flos": 552787075584.0, + "grad_norm": 0.06118237782788499, + "language_loss": 0.8185212, + "learning_rate": 0.00070443030711941, + "loss": 0.82929248, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.28295898, + "step": 2001, + "time_per_iteration": 2.7602195739746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078319, + "balance_loss_mlp": 1.04918385, + "epoch": 0.38514813389765296, + "flos": 654173190144.0, + "grad_norm": 0.06801983854699947, + "language_loss": 0.82348108, + "learning_rate": 0.0007041459549257924, + "loss": 0.83426422, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.29101562, + "step": 2002, + "time_per_iteration": 2.8562166690826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074316, + "balance_loss_mlp": 1.04565787, + "epoch": 0.3853405155829165, + "flos": 867715158528.0, + "grad_norm": 0.07124544558687326, + "language_loss": 0.7826004, + "learning_rate": 0.0007038615234756859, + "loss": 0.79334354, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.28662109, + "step": 2003, + "time_per_iteration": 3.1888484954833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071637, + "balance_loss_mlp": 1.0429796, + "epoch": 0.3855328972681801, + "flos": 546164011008.0, + "grad_norm": 0.060193135665447615, + "language_loss": 0.83578098, + "learning_rate": 0.000703577012879517, + "loss": 0.8464973, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.28662109, + "step": 2004, + "time_per_iteration": 2.6438684463500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069967, + "balance_loss_mlp": 1.04185688, + "epoch": 0.3857252789534436, + "flos": 533819939328.0, + "grad_norm": 0.05830751128665357, + "language_loss": 0.8852784, + "learning_rate": 0.0007032924232477423, + "loss": 0.89597809, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.28149414, + "step": 2005, + "time_per_iteration": 2.6632285118103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071337, + "balance_loss_mlp": 1.04253602, + "epoch": 0.3859176606387072, + "flos": 491514849792.0, + "grad_norm": 0.05522600702951118, + "language_loss": 0.8025552, + "learning_rate": 0.0007030077546908493, + "loss": 0.81326854, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.28808594, + "step": 2006, + "time_per_iteration": 2.6748647689819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107831, + "balance_loss_mlp": 1.06600749, + "epoch": 0.3861100423239708, + "flos": 1486278955008.0, + "grad_norm": 0.04192005891791234, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84142971, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.12255859, + "step": 2007, + "time_per_iteration": 4.758062124252319 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084632, + "balance_loss_mlp": 1.05614078, + "epoch": 0.3863024240092343, + "flos": 473493441024.0, + "grad_norm": 0.06495221526254255, + "language_loss": 0.79320729, + "learning_rate": 0.0007024381812438117, + "loss": 0.80405354, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.28515625, + "step": 2008, + "time_per_iteration": 2.557239532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095356, + "balance_loss_mlp": 1.06607771, + "epoch": 0.3864948056944979, + "flos": 716258723328.0, + "grad_norm": 0.09570560546772983, + "language_loss": 0.83017313, + "learning_rate": 0.0007021532765747951, + "loss": 0.84112668, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.29248047, + "step": 2009, + "time_per_iteration": 2.984100580215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088616, + "balance_loss_mlp": 1.06031561, + "epoch": 0.38668718737976143, + "flos": 727302067200.0, + "grad_norm": 0.05400711762269546, + "language_loss": 0.78963518, + "learning_rate": 0.0007018682934229162, + "loss": 0.80052131, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.28295898, + "step": 2010, + "time_per_iteration": 2.9302892684936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080883, + "balance_loss_mlp": 1.05220175, + "epoch": 0.386879569065025, + "flos": 525218079744.0, + "grad_norm": 0.05212566321061033, + "language_loss": 0.82523775, + "learning_rate": 0.0007015832318988152, + "loss": 0.83604658, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.28662109, + "step": 2011, + "time_per_iteration": 2.65934157371521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102736, + "balance_loss_mlp": 1.0158205, + "epoch": 0.38707195075028855, + "flos": 1527036005376.0, + "grad_norm": 0.016832038405886617, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74917436, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.11523438, + "step": 2012, + "time_per_iteration": 4.964378595352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076687, + "balance_loss_mlp": 1.04776716, + "epoch": 0.38726433243555214, + "flos": 557045821440.0, + "grad_norm": 0.05730560331399072, + "language_loss": 0.83868068, + "learning_rate": 0.0007010128741766604, + "loss": 0.84944755, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.28857422, + "step": 2013, + "time_per_iteration": 2.7196977138519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069593, + "balance_loss_mlp": 1.04005277, + "epoch": 0.38745671412081567, + "flos": 553431647232.0, + "grad_norm": 0.0608937159393576, + "language_loss": 0.843593, + "learning_rate": 0.0007007275782000391, + "loss": 0.85428894, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.29492188, + "step": 2014, + "time_per_iteration": 2.635704517364502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072342, + "balance_loss_mlp": 1.04351759, + "epoch": 0.38764909580607926, + "flos": 458175384576.0, + "grad_norm": 0.061731808628827385, + "language_loss": 0.84906852, + "learning_rate": 0.0007004422042940605, + "loss": 0.85979199, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.2878418, + "step": 2015, + "time_per_iteration": 2.500502824783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072405, + "balance_loss_mlp": 1.04246008, + "epoch": 0.38784147749134285, + "flos": 521973462528.0, + "grad_norm": 0.06410146749924231, + "language_loss": 0.89413089, + "learning_rate": 0.0007001567525695169, + "loss": 0.90485489, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.29931641, + "step": 2016, + "time_per_iteration": 2.6305129528045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072622, + "balance_loss_mlp": 1.04410672, + "epoch": 0.3880338591766064, + "flos": 665696600064.0, + "grad_norm": 0.057933083917186774, + "language_loss": 0.83612067, + "learning_rate": 0.0006998712231372303, + "loss": 0.84684694, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.28491211, + "step": 2017, + "time_per_iteration": 3.0175724029541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070577, + "balance_loss_mlp": 1.04141831, + "epoch": 0.38822624086186996, + "flos": 593660427264.0, + "grad_norm": 0.04866320553491467, + "language_loss": 0.86211008, + "learning_rate": 0.0006995856161080532, + "loss": 0.87281585, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.29101562, + "step": 2018, + "time_per_iteration": 2.879014015197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071313, + "balance_loss_mlp": 1.04193974, + "epoch": 0.3884186225471335, + "flos": 612256596480.0, + "grad_norm": 0.05910223086818918, + "language_loss": 0.81994784, + "learning_rate": 0.0006992999315928679, + "loss": 0.83066106, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.29345703, + "step": 2019, + "time_per_iteration": 2.794605255126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078638, + "balance_loss_mlp": 1.04826391, + "epoch": 0.3886110042323971, + "flos": 606737820672.0, + "grad_norm": 0.0551019421553566, + "language_loss": 0.86098075, + "learning_rate": 0.0006990141697025871, + "loss": 0.8717671, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.3034668, + "step": 2020, + "time_per_iteration": 2.808492422103882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056045, + "balance_loss_mlp": 1.04388523, + "epoch": 0.3888033859176606, + "flos": 1527289108992.0, + "grad_norm": 0.03291843471702338, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77415681, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.12158203, + "step": 2021, + "time_per_iteration": 4.747381687164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070034, + "balance_loss_mlp": 1.04109025, + "epoch": 0.3889957676029242, + "flos": 692145340416.0, + "grad_norm": 0.0700535467402408, + "language_loss": 0.82436341, + "learning_rate": 0.0006984424142405392, + "loss": 0.83506376, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.28930664, + "step": 2022, + "time_per_iteration": 2.8081154823303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070367, + "balance_loss_mlp": 1.04144704, + "epoch": 0.3891881492881878, + "flos": 514937170944.0, + "grad_norm": 0.06604387927811756, + "language_loss": 0.81889653, + "learning_rate": 0.0006981564208907474, + "loss": 0.82960021, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.2890625, + "step": 2023, + "time_per_iteration": 2.615868091583252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067731, + "balance_loss_mlp": 1.03947854, + "epoch": 0.3893805309734513, + "flos": 628770663936.0, + "grad_norm": 0.05337785231387105, + "language_loss": 0.90169919, + "learning_rate": 0.0006978703506098102, + "loss": 0.91237652, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.2824707, + "step": 2024, + "time_per_iteration": 2.7487242221832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070747, + "balance_loss_mlp": 1.04292357, + "epoch": 0.3895729126587149, + "flos": 543894234624.0, + "grad_norm": 0.05102180718564601, + "language_loss": 0.87631416, + "learning_rate": 0.00069758420350879, + "loss": 0.88702166, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.27832031, + "step": 2025, + "time_per_iteration": 2.6278607845306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066065, + "balance_loss_mlp": 1.03802657, + "epoch": 0.38976529434397844, + "flos": 617987778048.0, + "grad_norm": 0.05496821729843788, + "language_loss": 0.85941356, + "learning_rate": 0.000697297979698779, + "loss": 0.87007421, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.28051758, + "step": 2026, + "time_per_iteration": 2.773711919784546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072256, + "balance_loss_mlp": 1.0449574, + "epoch": 0.38995767602924203, + "flos": 834518287872.0, + "grad_norm": 0.054849440695872026, + "language_loss": 0.83735013, + "learning_rate": 0.0006970116792908992, + "loss": 0.84807271, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.27368164, + "step": 2027, + "time_per_iteration": 3.1274263858795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071715, + "balance_loss_mlp": 1.04348612, + "epoch": 0.39015005771450556, + "flos": 541343651328.0, + "grad_norm": 0.0501662810644282, + "language_loss": 0.80959415, + "learning_rate": 0.000696725302396302, + "loss": 0.82031131, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.28222656, + "step": 2028, + "time_per_iteration": 2.653289318084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078388, + "balance_loss_mlp": 1.050946, + "epoch": 0.39034243939976915, + "flos": 1007102536704.0, + "grad_norm": 0.053195529027894116, + "language_loss": 0.85790342, + "learning_rate": 0.0006964388491261692, + "loss": 0.86868727, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.2746582, + "step": 2029, + "time_per_iteration": 3.2972047328948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082882, + "balance_loss_mlp": 1.0550828, + "epoch": 0.3905348210850327, + "flos": 678723121152.0, + "grad_norm": 0.06114884672927749, + "language_loss": 0.87352717, + "learning_rate": 0.0006961523195917114, + "loss": 0.88435602, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.27832031, + "step": 2030, + "time_per_iteration": 2.8415944576263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083514, + "balance_loss_mlp": 1.0548079, + "epoch": 0.39072720277029627, + "flos": 548606905344.0, + "grad_norm": 0.056999957489140544, + "language_loss": 0.78065526, + "learning_rate": 0.0006958657139041696, + "loss": 0.79149044, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.28686523, + "step": 2031, + "time_per_iteration": 2.750596761703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027529, + "balance_loss_mlp": 1.01660919, + "epoch": 0.39091958445555985, + "flos": 1546912401408.0, + "grad_norm": 0.015090316928766313, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77740502, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.109375, + "step": 2032, + "time_per_iteration": 4.916932106018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080774, + "balance_loss_mlp": 1.05371356, + "epoch": 0.3911119661408234, + "flos": 503741058048.0, + "grad_norm": 0.058882626995900515, + "language_loss": 0.77978921, + "learning_rate": 0.0006952922745149434, + "loss": 0.7905969, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.27099609, + "step": 2033, + "time_per_iteration": 2.6288254261016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076329, + "balance_loss_mlp": 1.04802871, + "epoch": 0.391304347826087, + "flos": 556967245824.0, + "grad_norm": 0.059683993490508125, + "language_loss": 0.8774389, + "learning_rate": 0.000695005441035888, + "loss": 0.88820225, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.28295898, + "step": 2034, + "time_per_iteration": 2.6451032161712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021075, + "balance_loss_mlp": 1.01001287, + "epoch": 0.3914967295113505, + "flos": 1499309858304.0, + "grad_norm": 0.012767183735830537, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74744511, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.11083984, + "step": 2035, + "time_per_iteration": 4.875540018081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081101, + "balance_loss_mlp": 1.05346835, + "epoch": 0.3916891111966141, + "flos": 706715518464.0, + "grad_norm": 0.05871453648610719, + "language_loss": 0.8120997, + "learning_rate": 0.0006944315470656863, + "loss": 0.82291067, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.27685547, + "step": 2036, + "time_per_iteration": 2.9991486072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079422, + "balance_loss_mlp": 1.05193281, + "epoch": 0.3918814928818776, + "flos": 556085537280.0, + "grad_norm": 0.05954449002694624, + "language_loss": 0.90806162, + "learning_rate": 0.000694144486797345, + "loss": 0.91885585, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.27539062, + "step": 2037, + "time_per_iteration": 2.652540445327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016452, + "balance_loss_mlp": 1.00543678, + "epoch": 0.3920738745671412, + "flos": 1537845032448.0, + "grad_norm": 0.010331538207496795, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80536884, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.11035156, + "step": 2038, + "time_per_iteration": 4.696615695953369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077334, + "balance_loss_mlp": 1.04920101, + "epoch": 0.39226625625240474, + "flos": 498594811392.0, + "grad_norm": 0.05886678367995608, + "language_loss": 0.89078939, + "learning_rate": 0.0006935701402514156, + "loss": 0.90156269, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.28149414, + "step": 2039, + "time_per_iteration": 2.555340051651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013561, + "balance_loss_mlp": 1.00254571, + "epoch": 0.39245863793766833, + "flos": 1346465203200.0, + "grad_norm": 0.009976601144167605, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74048454, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.11035156, + "step": 2040, + "time_per_iteration": 4.91499400138855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.04941869, + "epoch": 0.3926510196229319, + "flos": 1345614474240.0, + "grad_norm": 0.0656092448350418, + "language_loss": 0.84421289, + "learning_rate": 0.0006929954931031422, + "loss": 0.8549906, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.28344727, + "step": 2041, + "time_per_iteration": 3.729060649871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079221, + "balance_loss_mlp": 1.0521127, + "epoch": 0.39284340130819545, + "flos": 499333925376.0, + "grad_norm": 0.05672023255092622, + "language_loss": 0.88579351, + "learning_rate": 0.0006927080570819805, + "loss": 0.8965857, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.27148438, + "step": 2042, + "time_per_iteration": 2.5964105129241943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082321, + "balance_loss_mlp": 1.05557048, + "epoch": 0.39303578299345904, + "flos": 520077625344.0, + "grad_norm": 0.07129276434353096, + "language_loss": 0.81115568, + "learning_rate": 0.0006924205462449161, + "loss": 0.82197881, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.26806641, + "step": 2043, + "time_per_iteration": 2.585873603820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080679, + "balance_loss_mlp": 1.0537734, + "epoch": 0.39322816467872257, + "flos": 907529301504.0, + "grad_norm": 0.07610386660927036, + "language_loss": 0.8177464, + "learning_rate": 0.0006921329607035702, + "loss": 0.8285532, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.26940918, + "step": 2044, + "time_per_iteration": 3.238981246948242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087504, + "balance_loss_mlp": 1.0611347, + "epoch": 0.39342054636398616, + "flos": 517330603008.0, + "grad_norm": 0.0570655681013956, + "language_loss": 0.87757248, + "learning_rate": 0.0006918453005695938, + "loss": 0.88844752, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.26416016, + "step": 2045, + "time_per_iteration": 2.6602108478546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091027, + "balance_loss_mlp": 1.06491971, + "epoch": 0.3936129280492497, + "flos": 547646621184.0, + "grad_norm": 0.055879562404771856, + "language_loss": 0.84307766, + "learning_rate": 0.0006915575659546662, + "loss": 0.85398793, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.26147461, + "step": 2046, + "time_per_iteration": 2.6592600345611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091389, + "balance_loss_mlp": 1.06476951, + "epoch": 0.3938053097345133, + "flos": 525858269184.0, + "grad_norm": 0.06494345942268129, + "language_loss": 0.80426449, + "learning_rate": 0.0006912697569704959, + "loss": 0.81517833, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.26623535, + "step": 2047, + "time_per_iteration": 2.613070011138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080678, + "balance_loss_mlp": 1.0539515, + "epoch": 0.39399769141977686, + "flos": 471390990336.0, + "grad_norm": 0.06871552578761372, + "language_loss": 0.86815077, + "learning_rate": 0.0006909818737288205, + "loss": 0.87895757, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.26745605, + "step": 2048, + "time_per_iteration": 2.5862643718719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086213, + "balance_loss_mlp": 1.05919969, + "epoch": 0.3941900731050404, + "flos": 501490220544.0, + "grad_norm": 0.055462609864315775, + "language_loss": 0.80754077, + "learning_rate": 0.000690693916341406, + "loss": 0.81840289, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.27075195, + "step": 2049, + "time_per_iteration": 2.668114185333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010802, + "balance_loss_mlp": 1.0532347, + "epoch": 0.394382454790304, + "flos": 580577241600.0, + "grad_norm": 0.05123788091691057, + "language_loss": 0.8241666, + "learning_rate": 0.0006904058849200475, + "loss": 0.83496863, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.27001953, + "step": 2050, + "time_per_iteration": 2.7161009311676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084281, + "balance_loss_mlp": 1.05679107, + "epoch": 0.3945748364755675, + "flos": 513563659776.0, + "grad_norm": 0.06391064418382593, + "language_loss": 0.84741384, + "learning_rate": 0.0006901177795765683, + "loss": 0.8582567, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.27514648, + "step": 2051, + "time_per_iteration": 2.6012356281280518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082278, + "balance_loss_mlp": 1.05540872, + "epoch": 0.3947672181608311, + "flos": 593683748352.0, + "grad_norm": 0.059538956745971455, + "language_loss": 0.8114661, + "learning_rate": 0.0006898296004228213, + "loss": 0.82228893, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.26879883, + "step": 2052, + "time_per_iteration": 2.739016056060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091682, + "balance_loss_mlp": 1.07909358, + "epoch": 0.39495959984609463, + "flos": 1546829443584.0, + "grad_norm": 0.0435951911950544, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79218423, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.12597656, + "step": 2053, + "time_per_iteration": 4.853093385696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107698, + "balance_loss_mlp": 1.0498004, + "epoch": 0.3951519815313582, + "flos": 496271190528.0, + "grad_norm": 0.061585922129253, + "language_loss": 0.79790258, + "learning_rate": 0.0006892530211320763, + "loss": 0.80867237, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.2722168, + "step": 2054, + "time_per_iteration": 2.695810317993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077935, + "balance_loss_mlp": 1.05135143, + "epoch": 0.39534436321662175, + "flos": 530934704640.0, + "grad_norm": 0.06739666157176663, + "language_loss": 0.83483803, + "learning_rate": 0.000688964621218926, + "loss": 0.84561741, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.26611328, + "step": 2055, + "time_per_iteration": 2.5957767963409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070974, + "balance_loss_mlp": 1.04496288, + "epoch": 0.39553674490188534, + "flos": 702224017920.0, + "grad_norm": 0.05900978816729325, + "language_loss": 0.79760778, + "learning_rate": 0.0006886761479432037, + "loss": 0.80831754, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.26037598, + "step": 2056, + "time_per_iteration": 2.823195457458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075075, + "balance_loss_mlp": 1.0479672, + "epoch": 0.3957291265871489, + "flos": 409552768512.0, + "grad_norm": 0.06325658180551426, + "language_loss": 0.84495139, + "learning_rate": 0.0006883876014169045, + "loss": 0.85570216, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.27148438, + "step": 2057, + "time_per_iteration": 2.504899263381958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_mlp": 1.05080771, + "epoch": 0.39592150827241246, + "flos": 618204566016.0, + "grad_norm": 0.05952155235087993, + "language_loss": 0.90666497, + "learning_rate": 0.000688098981752052, + "loss": 0.91744673, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.27441406, + "step": 2058, + "time_per_iteration": 2.705845832824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079753, + "balance_loss_mlp": 1.05207229, + "epoch": 0.39611388995767605, + "flos": 820986969600.0, + "grad_norm": 0.057037005783434964, + "language_loss": 0.80068249, + "learning_rate": 0.0006878102890606982, + "loss": 0.81147999, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.27709961, + "step": 2059, + "time_per_iteration": 3.086745500564575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108134, + "balance_loss_mlp": 1.0542556, + "epoch": 0.3963062716429396, + "flos": 491977539072.0, + "grad_norm": 0.07822530462482143, + "language_loss": 0.80866635, + "learning_rate": 0.0006875215234549239, + "loss": 0.8194797, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.27124023, + "step": 2060, + "time_per_iteration": 2.5814599990844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080083, + "balance_loss_mlp": 1.05221188, + "epoch": 0.39649865332820317, + "flos": 584466430464.0, + "grad_norm": 0.06673254145899743, + "language_loss": 0.85142004, + "learning_rate": 0.0006872326850468376, + "loss": 0.86222088, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.27880859, + "step": 2061, + "time_per_iteration": 2.6693742275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081472, + "balance_loss_mlp": 1.05343366, + "epoch": 0.3966910350134667, + "flos": 458328153600.0, + "grad_norm": 0.06184749895138045, + "language_loss": 0.78875667, + "learning_rate": 0.0006869437739485762, + "loss": 0.79957139, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.28051758, + "step": 2062, + "time_per_iteration": 2.612020969390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108316, + "balance_loss_mlp": 1.05493176, + "epoch": 0.3968834166987303, + "flos": 508388299776.0, + "grad_norm": 0.07174128592683177, + "language_loss": 0.92295337, + "learning_rate": 0.0006866547902723053, + "loss": 0.93378496, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.2824707, + "step": 2063, + "time_per_iteration": 2.676013469696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108135, + "balance_loss_mlp": 1.05300224, + "epoch": 0.3970757983839938, + "flos": 572349321216.0, + "grad_norm": 0.05898261192449876, + "language_loss": 0.80094039, + "learning_rate": 0.000686365734130218, + "loss": 0.81175387, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.28369141, + "step": 2064, + "time_per_iteration": 2.7021024227142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071448, + "balance_loss_mlp": 1.0426228, + "epoch": 0.3972681800692574, + "flos": 481391092224.0, + "grad_norm": 0.09101918864834832, + "language_loss": 0.83948302, + "learning_rate": 0.000686076605634536, + "loss": 0.85019755, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.28808594, + "step": 2065, + "time_per_iteration": 2.6558356285095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068247, + "balance_loss_mlp": 1.03963661, + "epoch": 0.397460561754521, + "flos": 487683887616.0, + "grad_norm": 0.05840936356543045, + "language_loss": 0.83999312, + "learning_rate": 0.0006857874048975088, + "loss": 0.85067558, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.28613281, + "step": 2066, + "time_per_iteration": 2.556900978088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068316, + "balance_loss_mlp": 1.04027796, + "epoch": 0.3976529434397845, + "flos": 421768802304.0, + "grad_norm": 0.07585091480167282, + "language_loss": 0.87176585, + "learning_rate": 0.0006854981320314142, + "loss": 0.88244903, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.28027344, + "step": 2067, + "time_per_iteration": 2.445798635482788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107237, + "balance_loss_mlp": 1.04426003, + "epoch": 0.3978453251250481, + "flos": 545331764736.0, + "grad_norm": 0.08763476788371415, + "language_loss": 0.86982906, + "learning_rate": 0.0006852087871485579, + "loss": 0.88055265, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.28125, + "step": 2068, + "time_per_iteration": 2.6390161514282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076434, + "balance_loss_mlp": 1.04861069, + "epoch": 0.39803770681031164, + "flos": 650548841472.0, + "grad_norm": 0.065510260101048, + "language_loss": 0.82088625, + "learning_rate": 0.0006849193703612735, + "loss": 0.83165061, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.27856445, + "step": 2069, + "time_per_iteration": 2.763023614883423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071, + "balance_loss_mlp": 1.04346275, + "epoch": 0.39823008849557523, + "flos": 739734888960.0, + "grad_norm": 0.058439166966186944, + "language_loss": 0.77565378, + "learning_rate": 0.0006846298817819225, + "loss": 0.78636372, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.27563477, + "step": 2070, + "time_per_iteration": 2.948054790496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070331, + "balance_loss_mlp": 1.04296088, + "epoch": 0.39842247018083876, + "flos": 384825337344.0, + "grad_norm": 0.06370866866163034, + "language_loss": 0.80921137, + "learning_rate": 0.0006843403215228945, + "loss": 0.8199147, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.27392578, + "step": 2071, + "time_per_iteration": 2.440274953842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075017, + "balance_loss_mlp": 1.04771829, + "epoch": 0.39861485186610235, + "flos": 533431443456.0, + "grad_norm": 0.05754797735781241, + "language_loss": 0.80491692, + "learning_rate": 0.0006840506896966065, + "loss": 0.81566709, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.2734375, + "step": 2072, + "time_per_iteration": 2.7141849994659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076402, + "balance_loss_mlp": 1.04874492, + "epoch": 0.39880723355136594, + "flos": 642834482688.0, + "grad_norm": 0.06436648215160112, + "language_loss": 0.82351565, + "learning_rate": 0.0006837609864155038, + "loss": 0.83427966, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.27685547, + "step": 2073, + "time_per_iteration": 2.8728160858154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107952, + "balance_loss_mlp": 1.05267441, + "epoch": 0.39899961523662947, + "flos": 515587534848.0, + "grad_norm": 0.06075069456973031, + "language_loss": 0.83255166, + "learning_rate": 0.0006834712117920592, + "loss": 0.84334683, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.26855469, + "step": 2074, + "time_per_iteration": 2.6078460216522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081959, + "balance_loss_mlp": 1.05458879, + "epoch": 0.39919199692189306, + "flos": 464148085248.0, + "grad_norm": 0.08105254072349301, + "language_loss": 0.85028476, + "learning_rate": 0.0006831813659387729, + "loss": 0.86110437, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.27416992, + "step": 2075, + "time_per_iteration": 2.5435502529144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080066, + "balance_loss_mlp": 1.05236197, + "epoch": 0.3993843786071566, + "flos": 531382837248.0, + "grad_norm": 0.05543733258884828, + "language_loss": 0.84105802, + "learning_rate": 0.0006828914489681733, + "loss": 0.85185862, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.27758789, + "step": 2076, + "time_per_iteration": 2.716728687286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079901, + "balance_loss_mlp": 1.05186319, + "epoch": 0.3995767602924202, + "flos": 503701770240.0, + "grad_norm": 0.05894989539880716, + "language_loss": 0.8515023, + "learning_rate": 0.0006826014609928162, + "loss": 0.86230129, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.28027344, + "step": 2077, + "time_per_iteration": 2.740797996520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036252, + "balance_loss_mlp": 1.02490366, + "epoch": 0.3997691419776837, + "flos": 1453780500480.0, + "grad_norm": 0.025465037646940157, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84235638, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.11328125, + "step": 2078, + "time_per_iteration": 4.832703590393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080719, + "balance_loss_mlp": 1.05287147, + "epoch": 0.3999615236629473, + "flos": 530418170880.0, + "grad_norm": 0.11662193334808049, + "language_loss": 0.8017869, + "learning_rate": 0.0006820212724781896, + "loss": 0.81259406, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.27880859, + "step": 2079, + "time_per_iteration": 2.6742663383483887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076717, + "balance_loss_mlp": 1.0488224, + "epoch": 0.4001539053482108, + "flos": 694823961600.0, + "grad_norm": 0.08177152300224107, + "language_loss": 0.83806193, + "learning_rate": 0.0006817310721641694, + "loss": 0.84882903, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.27905273, + "step": 2080, + "time_per_iteration": 2.8349008560180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076923, + "balance_loss_mlp": 1.04929078, + "epoch": 0.4003462870334744, + "flos": 520102356480.0, + "grad_norm": 0.06565277329590896, + "language_loss": 0.84214735, + "learning_rate": 0.00068144080129589, + "loss": 0.8529166, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.27685547, + "step": 2081, + "time_per_iteration": 2.6278159618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084354, + "balance_loss_mlp": 1.05710232, + "epoch": 0.400538668718738, + "flos": 492272902656.0, + "grad_norm": 0.05776018351639151, + "language_loss": 0.82856774, + "learning_rate": 0.0006811504599860441, + "loss": 0.83941126, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.27294922, + "step": 2082, + "time_per_iteration": 2.569265365600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088899, + "balance_loss_mlp": 1.06140924, + "epoch": 0.40073105040400153, + "flos": 490083111936.0, + "grad_norm": 0.07401045054208001, + "language_loss": 0.85797036, + "learning_rate": 0.0006808600483473526, + "loss": 0.86885935, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.27490234, + "step": 2083, + "time_per_iteration": 2.8923354148864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079743, + "balance_loss_mlp": 1.05170512, + "epoch": 0.4009234320892651, + "flos": 562088761344.0, + "grad_norm": 0.06499053200862517, + "language_loss": 0.86023808, + "learning_rate": 0.0006805695664925629, + "loss": 0.87103558, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.28027344, + "step": 2084, + "time_per_iteration": 2.8025314807891846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082967, + "balance_loss_mlp": 1.05461943, + "epoch": 0.40111581377452865, + "flos": 425786029056.0, + "grad_norm": 0.06817943175075042, + "language_loss": 0.8386181, + "learning_rate": 0.0006802790145344506, + "loss": 0.84944773, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.28344727, + "step": 2085, + "time_per_iteration": 2.5035839080810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075393, + "balance_loss_mlp": 1.04725957, + "epoch": 0.40130819545979224, + "flos": 612148907520.0, + "grad_norm": 0.06401081868364573, + "language_loss": 0.87169802, + "learning_rate": 0.0006799883925858176, + "loss": 0.88245201, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.28125, + "step": 2086, + "time_per_iteration": 2.8827152252197266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088527, + "balance_loss_mlp": 1.05989313, + "epoch": 0.40150057714505577, + "flos": 523179648000.0, + "grad_norm": 0.06559731004413262, + "language_loss": 0.85316324, + "learning_rate": 0.0006796977007594933, + "loss": 0.86404848, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.28637695, + "step": 2087, + "time_per_iteration": 2.5959601402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094266, + "balance_loss_mlp": 1.06553721, + "epoch": 0.40169295883031936, + "flos": 561143033856.0, + "grad_norm": 0.12268552055269868, + "language_loss": 0.86342102, + "learning_rate": 0.0006794069391683345, + "loss": 0.87436372, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.28710938, + "step": 2088, + "time_per_iteration": 2.7393155097961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089464, + "balance_loss_mlp": 1.06087732, + "epoch": 0.4018853405155829, + "flos": 518743401984.0, + "grad_norm": 0.0717880154934153, + "language_loss": 0.80560589, + "learning_rate": 0.0006791161079252248, + "loss": 0.81650054, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.28588867, + "step": 2089, + "time_per_iteration": 2.608919858932495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098549, + "balance_loss_mlp": 1.06879497, + "epoch": 0.4020777222008465, + "flos": 525957193728.0, + "grad_norm": 0.06954460778471602, + "language_loss": 0.8248291, + "learning_rate": 0.0006788252071430747, + "loss": 0.83581454, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.29711914, + "step": 2090, + "time_per_iteration": 2.682352304458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103196, + "balance_loss_mlp": 1.07429934, + "epoch": 0.40227010388611006, + "flos": 525494504448.0, + "grad_norm": 0.07587120880411238, + "language_loss": 0.8680824, + "learning_rate": 0.0006785342369348222, + "loss": 0.87911433, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.28857422, + "step": 2091, + "time_per_iteration": 2.7333736419677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104599, + "balance_loss_mlp": 1.07579792, + "epoch": 0.4024624855713736, + "flos": 432074442240.0, + "grad_norm": 0.07069251800195664, + "language_loss": 0.7977879, + "learning_rate": 0.0006782431974134316, + "loss": 0.8088339, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.2878418, + "step": 2092, + "time_per_iteration": 2.541607141494751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105121, + "balance_loss_mlp": 1.0768441, + "epoch": 0.4026548672566372, + "flos": 766304312832.0, + "grad_norm": 0.05426777537327344, + "language_loss": 0.89421535, + "learning_rate": 0.0006779520886918949, + "loss": 0.90526658, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.2824707, + "step": 2093, + "time_per_iteration": 3.035090684890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102418, + "balance_loss_mlp": 1.07378376, + "epoch": 0.4028472489419007, + "flos": 642636633600.0, + "grad_norm": 0.07593649947233896, + "language_loss": 0.81461406, + "learning_rate": 0.0006776609108832301, + "loss": 0.82563823, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.28637695, + "step": 2094, + "time_per_iteration": 2.8035519123077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102, + "balance_loss_mlp": 1.07398582, + "epoch": 0.4030396306271643, + "flos": 491593425408.0, + "grad_norm": 0.07164022458424311, + "language_loss": 0.85034972, + "learning_rate": 0.0006773696641004828, + "loss": 0.86136973, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.28027344, + "step": 2095, + "time_per_iteration": 2.6036603450775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100016, + "balance_loss_mlp": 1.07147717, + "epoch": 0.40323201231242783, + "flos": 901363133952.0, + "grad_norm": 0.07309254376996902, + "language_loss": 0.77576917, + "learning_rate": 0.0006770783484567247, + "loss": 0.78676933, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.28515625, + "step": 2096, + "time_per_iteration": 3.1005897521972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093493, + "balance_loss_mlp": 1.06557441, + "epoch": 0.4034243939976914, + "flos": 570267219456.0, + "grad_norm": 0.04872529153034484, + "language_loss": 0.86118937, + "learning_rate": 0.000676786964065055, + "loss": 0.87212431, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.27978516, + "step": 2097, + "time_per_iteration": 2.78965163230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093986, + "balance_loss_mlp": 1.06680584, + "epoch": 0.403616775682955, + "flos": 507206845440.0, + "grad_norm": 0.06867709967223685, + "language_loss": 0.78839391, + "learning_rate": 0.0006764955110385986, + "loss": 0.79933375, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.2722168, + "step": 2098, + "time_per_iteration": 2.7579219341278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090863, + "balance_loss_mlp": 1.06361151, + "epoch": 0.40380915736821854, + "flos": 519127515648.0, + "grad_norm": 0.0577520756279271, + "language_loss": 0.80600876, + "learning_rate": 0.0006762039894905083, + "loss": 0.81691736, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.27294922, + "step": 2099, + "time_per_iteration": 2.632434129714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083488, + "balance_loss_mlp": 1.05595064, + "epoch": 0.40400153905348213, + "flos": 441686048256.0, + "grad_norm": 0.06925599284799831, + "language_loss": 0.80233157, + "learning_rate": 0.000675912399533962, + "loss": 0.8131665, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.27563477, + "step": 2100, + "time_per_iteration": 2.521758556365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086411, + "balance_loss_mlp": 1.05947018, + "epoch": 0.40419392073874566, + "flos": 771961300992.0, + "grad_norm": 0.05734073179456058, + "language_loss": 0.84850854, + "learning_rate": 0.0006756207412821656, + "loss": 0.85937262, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.26977539, + "step": 2101, + "time_per_iteration": 3.043041944503784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079398, + "balance_loss_mlp": 1.05245721, + "epoch": 0.40438630242400925, + "flos": 766215562752.0, + "grad_norm": 0.07220576126006613, + "language_loss": 0.80240154, + "learning_rate": 0.0006753290148483505, + "loss": 0.81319559, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.27001953, + "step": 2102, + "time_per_iteration": 3.0245606899261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085045, + "balance_loss_mlp": 1.05726886, + "epoch": 0.4045786841092728, + "flos": 415013317632.0, + "grad_norm": 0.06170005058098184, + "language_loss": 0.78875476, + "learning_rate": 0.0006750372203457752, + "loss": 0.79960519, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.27832031, + "step": 2103, + "time_per_iteration": 2.484698534011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078758, + "balance_loss_mlp": 1.05131626, + "epoch": 0.40477106579453637, + "flos": 538941454848.0, + "grad_norm": 0.05090920908511917, + "language_loss": 0.86534655, + "learning_rate": 0.0006747453578877242, + "loss": 0.87613416, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.27490234, + "step": 2104, + "time_per_iteration": 2.69670033454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081019, + "balance_loss_mlp": 1.05281401, + "epoch": 0.4049634474797999, + "flos": 826358768640.0, + "grad_norm": 0.06546748387286302, + "language_loss": 0.8289392, + "learning_rate": 0.0006744534275875085, + "loss": 0.83974934, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.28222656, + "step": 2105, + "time_per_iteration": 2.9919168949127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083118, + "balance_loss_mlp": 1.05620074, + "epoch": 0.4051558291650635, + "flos": 572417722368.0, + "grad_norm": 0.0635527467859112, + "language_loss": 0.8582921, + "learning_rate": 0.0006741614295584657, + "loss": 0.86912322, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.26977539, + "step": 2106, + "time_per_iteration": 2.6488401889801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107849, + "balance_loss_mlp": 1.05073833, + "epoch": 0.4053482108503271, + "flos": 731541874176.0, + "grad_norm": 0.057690605181557136, + "language_loss": 0.78413224, + "learning_rate": 0.0006738693639139595, + "loss": 0.79491717, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.27807617, + "step": 2107, + "time_per_iteration": 2.9652647972106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078123, + "balance_loss_mlp": 1.05015635, + "epoch": 0.4055405925355906, + "flos": 1212588292608.0, + "grad_norm": 0.05945372540383898, + "language_loss": 0.77655667, + "learning_rate": 0.0006735772307673796, + "loss": 0.78733784, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.27978516, + "step": 2108, + "time_per_iteration": 3.5789337158203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079955, + "balance_loss_mlp": 1.05222702, + "epoch": 0.4057329742208542, + "flos": 715553104896.0, + "grad_norm": 0.05752735064114104, + "language_loss": 0.83347392, + "learning_rate": 0.0006732850302321421, + "loss": 0.84427351, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.27783203, + "step": 2109, + "time_per_iteration": 2.869591236114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078846, + "balance_loss_mlp": 1.051476, + "epoch": 0.4059253559061177, + "flos": 564623377920.0, + "grad_norm": 0.06455621073123653, + "language_loss": 0.84327263, + "learning_rate": 0.00067299276242169, + "loss": 0.85406113, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.27441406, + "step": 2110, + "time_per_iteration": 2.673659563064575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082258, + "balance_loss_mlp": 1.07071877, + "epoch": 0.4061177375913813, + "flos": 1592886919680.0, + "grad_norm": 0.036236061846660186, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75464427, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.11523438, + "step": 2111, + "time_per_iteration": 4.886230230331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082274, + "balance_loss_mlp": 1.05490351, + "epoch": 0.40631011927664484, + "flos": 615122892288.0, + "grad_norm": 0.05646906793429633, + "language_loss": 0.77664089, + "learning_rate": 0.0006724080254290395, + "loss": 0.78746361, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.27416992, + "step": 2112, + "time_per_iteration": 2.8506221771240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076539, + "balance_loss_mlp": 1.04847741, + "epoch": 0.40650250096190843, + "flos": 557390647296.0, + "grad_norm": 0.06356712121797842, + "language_loss": 0.89422435, + "learning_rate": 0.0006721155564738566, + "loss": 0.90498972, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.28100586, + "step": 2113, + "time_per_iteration": 2.673015832901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037996, + "balance_loss_mlp": 1.02626586, + "epoch": 0.40669488264717196, + "flos": 1579301756928.0, + "grad_norm": 0.019828324636468348, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79660642, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.1171875, + "step": 2114, + "time_per_iteration": 5.003857851028442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080097, + "balance_loss_mlp": 1.0521065, + "epoch": 0.40688726433243555, + "flos": 507398902272.0, + "grad_norm": 0.07124796283110259, + "language_loss": 0.85397822, + "learning_rate": 0.0006715304182135078, + "loss": 0.86477917, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.2800293, + "step": 2115, + "time_per_iteration": 2.641721248626709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108294, + "balance_loss_mlp": 1.05418694, + "epoch": 0.40707964601769914, + "flos": 588757109760.0, + "grad_norm": 0.08996962933736626, + "language_loss": 0.88862896, + "learning_rate": 0.0006712377491355127, + "loss": 0.89945835, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.28735352, + "step": 2116, + "time_per_iteration": 2.880159616470337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077208, + "balance_loss_mlp": 1.04857373, + "epoch": 0.40727202770296267, + "flos": 580134901248.0, + "grad_norm": 0.046629180459365246, + "language_loss": 0.81631374, + "learning_rate": 0.0006709450135771274, + "loss": 0.82708585, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.28637695, + "step": 2117, + "time_per_iteration": 2.9391822814941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107698, + "balance_loss_mlp": 1.04953849, + "epoch": 0.40746440938822626, + "flos": 503819633664.0, + "grad_norm": 0.05926883506924263, + "language_loss": 0.86382973, + "learning_rate": 0.0006706522116520023, + "loss": 0.87459958, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.27490234, + "step": 2118, + "time_per_iteration": 2.6404170989990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078957, + "balance_loss_mlp": 1.05072808, + "epoch": 0.4076567910734898, + "flos": 605323611648.0, + "grad_norm": 0.06371775766221305, + "language_loss": 0.82902479, + "learning_rate": 0.0006703593434738127, + "loss": 0.83981442, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.28222656, + "step": 2119, + "time_per_iteration": 2.6982903480529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080441, + "balance_loss_mlp": 1.05216455, + "epoch": 0.4078491727587534, + "flos": 479313372672.0, + "grad_norm": 0.05030428863920766, + "language_loss": 0.78137958, + "learning_rate": 0.0006700664091562604, + "loss": 0.792184, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.28271484, + "step": 2120, + "time_per_iteration": 2.5976343154907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081224, + "balance_loss_mlp": 1.05259037, + "epoch": 0.4080415544440169, + "flos": 510126985728.0, + "grad_norm": 0.05481620044617693, + "language_loss": 0.85151196, + "learning_rate": 0.0006697734088130725, + "loss": 0.86232412, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.28637695, + "step": 2121, + "time_per_iteration": 2.613192558288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085667, + "balance_loss_mlp": 1.05665159, + "epoch": 0.4082339361292805, + "flos": 734318009856.0, + "grad_norm": 0.0674188074849357, + "language_loss": 0.85445356, + "learning_rate": 0.0006694803425580018, + "loss": 0.86531019, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.28955078, + "step": 2122, + "time_per_iteration": 2.9808695316314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085803, + "balance_loss_mlp": 1.05585766, + "epoch": 0.4084263178145441, + "flos": 457239831552.0, + "grad_norm": 0.06189748292204317, + "language_loss": 0.8466748, + "learning_rate": 0.0006691872105048268, + "loss": 0.85753286, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.29907227, + "step": 2123, + "time_per_iteration": 2.5712099075317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089254, + "balance_loss_mlp": 1.05992901, + "epoch": 0.4086186994998076, + "flos": 562659139584.0, + "grad_norm": 0.06907127419859461, + "language_loss": 0.84616292, + "learning_rate": 0.0006688940127673513, + "loss": 0.85705543, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.29296875, + "step": 2124, + "time_per_iteration": 2.6865010261535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091737, + "balance_loss_mlp": 1.06181526, + "epoch": 0.4088110811850712, + "flos": 573364859904.0, + "grad_norm": 0.048409192362904495, + "language_loss": 0.85410631, + "learning_rate": 0.0006686007494594049, + "loss": 0.86502367, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.29882812, + "step": 2125, + "time_per_iteration": 2.8982856273651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090612, + "balance_loss_mlp": 1.06085694, + "epoch": 0.40900346287033473, + "flos": 456702948864.0, + "grad_norm": 0.07961338986962259, + "language_loss": 0.80014485, + "learning_rate": 0.0006683074206948425, + "loss": 0.81105095, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.29736328, + "step": 2126, + "time_per_iteration": 2.489884853363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086751, + "balance_loss_mlp": 1.05649602, + "epoch": 0.4091958445555983, + "flos": 617097305088.0, + "grad_norm": 0.06572114620312723, + "language_loss": 0.81335235, + "learning_rate": 0.0006680140265875443, + "loss": 0.82421982, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.30200195, + "step": 2127, + "time_per_iteration": 2.8000454902648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084283, + "balance_loss_mlp": 1.05512488, + "epoch": 0.40938822624086185, + "flos": 472159217664.0, + "grad_norm": 0.054748250322007024, + "language_loss": 0.95437354, + "learning_rate": 0.0006677205672514162, + "loss": 0.9652164, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.29125977, + "step": 2128, + "time_per_iteration": 2.6153228282928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086337, + "balance_loss_mlp": 1.05600977, + "epoch": 0.40958060792612544, + "flos": 569734718976.0, + "grad_norm": 0.05206451104952603, + "language_loss": 0.88892365, + "learning_rate": 0.000667427042800389, + "loss": 0.89978707, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.30273438, + "step": 2129, + "time_per_iteration": 2.772545337677002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080649, + "balance_loss_mlp": 1.0521338, + "epoch": 0.40977298961138897, + "flos": 609065823744.0, + "grad_norm": 0.06928662998118869, + "language_loss": 0.82843542, + "learning_rate": 0.0006671334533484192, + "loss": 0.83924192, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.28515625, + "step": 2130, + "time_per_iteration": 2.7501790523529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077969, + "balance_loss_mlp": 1.04938281, + "epoch": 0.40996537129665256, + "flos": 581463332352.0, + "grad_norm": 0.051614263088568736, + "language_loss": 0.83230782, + "learning_rate": 0.0006668397990094881, + "loss": 0.84308755, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.28613281, + "step": 2131, + "time_per_iteration": 2.7121975421905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083028, + "balance_loss_mlp": 1.05370235, + "epoch": 0.41015775298191615, + "flos": 516296125440.0, + "grad_norm": 0.05828514658280376, + "language_loss": 0.84553468, + "learning_rate": 0.0006665460798976027, + "loss": 0.85636497, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.29296875, + "step": 2132, + "time_per_iteration": 2.7074639797210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082859, + "balance_loss_mlp": 1.05532122, + "epoch": 0.4103501346671797, + "flos": 510083315712.0, + "grad_norm": 0.06450815869750301, + "language_loss": 0.81324267, + "learning_rate": 0.0006662522961267947, + "loss": 0.82407123, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.27563477, + "step": 2133, + "time_per_iteration": 2.676886796951294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084376, + "balance_loss_mlp": 1.05555081, + "epoch": 0.41054251635244327, + "flos": 549459500544.0, + "grad_norm": 0.04843791936563358, + "language_loss": 0.87077558, + "learning_rate": 0.0006659584478111211, + "loss": 0.88161933, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.28833008, + "step": 2134, + "time_per_iteration": 2.8004117012023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096427, + "balance_loss_mlp": 1.06910408, + "epoch": 0.4107348980377068, + "flos": 839549643264.0, + "grad_norm": 0.07835760686868988, + "language_loss": 0.82880664, + "learning_rate": 0.000665664535064664, + "loss": 0.83977091, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.2734375, + "step": 2135, + "time_per_iteration": 3.034134864807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100622, + "balance_loss_mlp": 1.07278681, + "epoch": 0.4109272797229704, + "flos": 503445694464.0, + "grad_norm": 0.05799734322971953, + "language_loss": 0.82382762, + "learning_rate": 0.0006653705580015303, + "loss": 0.8348338, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.27819824, + "step": 2136, + "time_per_iteration": 2.719423770904541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105373, + "balance_loss_mlp": 1.07747769, + "epoch": 0.4111196614082339, + "flos": 610533877248.0, + "grad_norm": 0.05212184008762054, + "language_loss": 0.863967, + "learning_rate": 0.0006650765167358523, + "loss": 0.87502074, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.27905273, + "step": 2137, + "time_per_iteration": 2.7973241806030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110879, + "balance_loss_mlp": 1.08089471, + "epoch": 0.4113120430934975, + "flos": 452931623424.0, + "grad_norm": 0.07588683613844963, + "language_loss": 0.89871359, + "learning_rate": 0.0006647824113817864, + "loss": 0.90980148, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.27929688, + "step": 2138, + "time_per_iteration": 2.520531177520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114294, + "balance_loss_mlp": 1.08768606, + "epoch": 0.41150442477876104, + "flos": 541324712448.0, + "grad_norm": 0.055552110514209885, + "language_loss": 0.81525648, + "learning_rate": 0.000664488242053515, + "loss": 0.82639945, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.26660156, + "step": 2139, + "time_per_iteration": 2.7204349040985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099437, + "balance_loss_mlp": 1.0722574, + "epoch": 0.4116968064640246, + "flos": 576017339904.0, + "grad_norm": 0.05646005524415558, + "language_loss": 0.83858913, + "learning_rate": 0.0006641940088652445, + "loss": 0.84958351, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.27246094, + "step": 2140, + "time_per_iteration": 2.748011827468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101916, + "balance_loss_mlp": 1.07521284, + "epoch": 0.4118891881492882, + "flos": 495857963520.0, + "grad_norm": 0.05970845599818087, + "language_loss": 0.81979877, + "learning_rate": 0.0006638997119312065, + "loss": 0.83081794, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.26757812, + "step": 2141, + "time_per_iteration": 2.723269462585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091619, + "balance_loss_mlp": 1.07826746, + "epoch": 0.41208156983455174, + "flos": 1537604923392.0, + "grad_norm": 0.04300629071925061, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76154923, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.13378906, + "step": 2142, + "time_per_iteration": 4.922248363494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089912, + "balance_loss_mlp": 1.06239891, + "epoch": 0.41227395151981533, + "flos": 584697775104.0, + "grad_norm": 0.06629114096949819, + "language_loss": 0.8462221, + "learning_rate": 0.000663310927282877, + "loss": 0.85712123, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.27563477, + "step": 2143, + "time_per_iteration": 2.8463313579559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109146, + "balance_loss_mlp": 1.06413746, + "epoch": 0.41246633320507886, + "flos": 442685620224.0, + "grad_norm": 0.05519054049820913, + "language_loss": 0.86099815, + "learning_rate": 0.000663016439797172, + "loss": 0.87191272, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.2734375, + "step": 2144, + "time_per_iteration": 2.611057996749878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086947, + "balance_loss_mlp": 1.05917096, + "epoch": 0.41265871489034245, + "flos": 579680976384.0, + "grad_norm": 0.07082455066013048, + "language_loss": 0.80582112, + "learning_rate": 0.0006627218890228724, + "loss": 0.81669062, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.27783203, + "step": 2145, + "time_per_iteration": 2.8047831058502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087273, + "balance_loss_mlp": 1.05859172, + "epoch": 0.412851096575606, + "flos": 760906372608.0, + "grad_norm": 0.08398112437337095, + "language_loss": 0.83330071, + "learning_rate": 0.0006624272750743326, + "loss": 0.84417343, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.28637695, + "step": 2146, + "time_per_iteration": 2.9890313148498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081748, + "balance_loss_mlp": 1.05299461, + "epoch": 0.41304347826086957, + "flos": 555062644224.0, + "grad_norm": 0.12117217429962603, + "language_loss": 0.82466137, + "learning_rate": 0.0006621325980659322, + "loss": 0.83547878, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.2878418, + "step": 2147, + "time_per_iteration": 2.7945189476013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108308, + "balance_loss_mlp": 1.05475557, + "epoch": 0.41323585994613315, + "flos": 665418765312.0, + "grad_norm": 0.05729870278054163, + "language_loss": 0.81810451, + "learning_rate": 0.000661837858112075, + "loss": 0.82893538, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.28320312, + "step": 2148, + "time_per_iteration": 2.833590030670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079327, + "balance_loss_mlp": 1.05102634, + "epoch": 0.4134282416313967, + "flos": 548429405184.0, + "grad_norm": 0.05837233957282785, + "language_loss": 0.88857764, + "learning_rate": 0.0006615430553271888, + "loss": 0.89937091, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.28344727, + "step": 2149, + "time_per_iteration": 2.75384521484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074308, + "balance_loss_mlp": 1.04603195, + "epoch": 0.4136206233166603, + "flos": 645951062016.0, + "grad_norm": 0.06498878822354702, + "language_loss": 0.85069597, + "learning_rate": 0.0006612481898257264, + "loss": 0.86143911, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.28295898, + "step": 2150, + "time_per_iteration": 2.8471391201019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077454, + "balance_loss_mlp": 1.04901028, + "epoch": 0.4138130050019238, + "flos": 517103640576.0, + "grad_norm": 0.06146250241107021, + "language_loss": 0.85024071, + "learning_rate": 0.000660953261722165, + "loss": 0.8610152, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.28442383, + "step": 2151, + "time_per_iteration": 2.650024175643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107559, + "balance_loss_mlp": 1.04643118, + "epoch": 0.4140053866871874, + "flos": 608977073664.0, + "grad_norm": 0.07635609550069686, + "language_loss": 0.82408941, + "learning_rate": 0.0006606582711310055, + "loss": 0.8348453, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.29150391, + "step": 2152, + "time_per_iteration": 2.707353353500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079486, + "balance_loss_mlp": 1.05068457, + "epoch": 0.4141977683724509, + "flos": 579493301760.0, + "grad_norm": 0.05643811624839042, + "language_loss": 0.83234471, + "learning_rate": 0.0006603632181667736, + "loss": 0.84313959, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.2878418, + "step": 2153, + "time_per_iteration": 2.6824803352355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034971, + "balance_loss_mlp": 1.02085698, + "epoch": 0.4143901500577145, + "flos": 1306598777856.0, + "grad_norm": 0.02554992861291058, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.79978293, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.14160156, + "step": 2154, + "time_per_iteration": 4.893488645553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075294, + "balance_loss_mlp": 1.04625416, + "epoch": 0.41458253174297804, + "flos": 459957740544.0, + "grad_norm": 0.06235301652291857, + "language_loss": 0.81530857, + "learning_rate": 0.0006597729255773153, + "loss": 0.82606155, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.2902832, + "step": 2155, + "time_per_iteration": 2.526531934738159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084546, + "balance_loss_mlp": 1.05519629, + "epoch": 0.41477491342824163, + "flos": 553096995840.0, + "grad_norm": 0.06680223734216864, + "language_loss": 0.82554018, + "learning_rate": 0.0006594776861812608, + "loss": 0.83638561, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.29321289, + "step": 2156, + "time_per_iteration": 2.669290065765381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083913, + "balance_loss_mlp": 1.05525446, + "epoch": 0.4149672951135052, + "flos": 697444356096.0, + "grad_norm": 0.05896575190253656, + "language_loss": 0.8669672, + "learning_rate": 0.0006591823848704776, + "loss": 0.87780631, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.28613281, + "step": 2157, + "time_per_iteration": 2.9277596473693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081796, + "balance_loss_mlp": 1.05273294, + "epoch": 0.41515967679876875, + "flos": 565480355328.0, + "grad_norm": 0.07853922010281017, + "language_loss": 0.81488264, + "learning_rate": 0.0006588870217596117, + "loss": 0.82570058, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.29003906, + "step": 2158, + "time_per_iteration": 2.72590970993042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107553, + "balance_loss_mlp": 1.04572749, + "epoch": 0.41535205848403234, + "flos": 500938781184.0, + "grad_norm": 0.06749140584983894, + "language_loss": 0.86219651, + "learning_rate": 0.0006585915969633334, + "loss": 0.87295187, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.29760742, + "step": 2159, + "time_per_iteration": 2.609668731689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068571, + "balance_loss_mlp": 1.03838706, + "epoch": 0.41554444016929587, + "flos": 607268911104.0, + "grad_norm": 0.0643598430263329, + "language_loss": 0.89336061, + "learning_rate": 0.0006582961105963366, + "loss": 0.90404636, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.30151367, + "step": 2160, + "time_per_iteration": 2.814122200012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074283, + "balance_loss_mlp": 1.04409909, + "epoch": 0.41573682185455946, + "flos": 528856985088.0, + "grad_norm": 0.0615363131016327, + "language_loss": 0.77864838, + "learning_rate": 0.0006580005627733395, + "loss": 0.78939116, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.30126953, + "step": 2161, + "time_per_iteration": 2.693002700805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067064, + "balance_loss_mlp": 1.03790569, + "epoch": 0.415929203539823, + "flos": 504686785536.0, + "grad_norm": 0.07091162327263066, + "language_loss": 0.81523043, + "learning_rate": 0.0006577049536090838, + "loss": 0.82590109, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.29125977, + "step": 2162, + "time_per_iteration": 2.6924569606781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010702, + "balance_loss_mlp": 1.04039741, + "epoch": 0.4161215852250866, + "flos": 582467286528.0, + "grad_norm": 0.07952336976051765, + "language_loss": 0.85617888, + "learning_rate": 0.000657409283218335, + "loss": 0.86688089, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.29760742, + "step": 2163, + "time_per_iteration": 2.663069486618042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070228, + "balance_loss_mlp": 1.04075933, + "epoch": 0.4163139669103501, + "flos": 490432320000.0, + "grad_norm": 0.06199265882265987, + "language_loss": 0.81197548, + "learning_rate": 0.0006571135517158829, + "loss": 0.82267773, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.29394531, + "step": 2164, + "time_per_iteration": 2.6750965118408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043486, + "balance_loss_mlp": 1.03042102, + "epoch": 0.4165063485956137, + "flos": 1287445377024.0, + "grad_norm": 0.030179808177232596, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77807546, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.13085938, + "step": 2165, + "time_per_iteration": 4.7519471645355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071906, + "balance_loss_mlp": 1.0417223, + "epoch": 0.4166987302808773, + "flos": 495015542784.0, + "grad_norm": 0.06526247046532782, + "language_loss": 0.83270538, + "learning_rate": 0.0006565219058351444, + "loss": 0.84342444, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.30151367, + "step": 2166, + "time_per_iteration": 2.5784192085266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070965, + "balance_loss_mlp": 1.04080534, + "epoch": 0.4168911119661408, + "flos": 463823608320.0, + "grad_norm": 0.06219532105294632, + "language_loss": 0.82938039, + "learning_rate": 0.0006562259916865553, + "loss": 0.84009004, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.30102539, + "step": 2167, + "time_per_iteration": 2.59431791305542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073926, + "balance_loss_mlp": 1.04369497, + "epoch": 0.4170834936514044, + "flos": 536499970560.0, + "grad_norm": 0.06573475594481314, + "language_loss": 0.7943427, + "learning_rate": 0.0006559300168856573, + "loss": 0.80508196, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.30175781, + "step": 2168, + "time_per_iteration": 2.727644443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070483, + "balance_loss_mlp": 1.04046655, + "epoch": 0.41727587533666793, + "flos": 550418374656.0, + "grad_norm": 0.17889612534981147, + "language_loss": 0.85705924, + "learning_rate": 0.0006556339815473577, + "loss": 0.86776412, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.29980469, + "step": 2169, + "time_per_iteration": 2.6300487518310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072561, + "balance_loss_mlp": 1.04366493, + "epoch": 0.4174682570219315, + "flos": 630795949056.0, + "grad_norm": 0.053042429294564375, + "language_loss": 0.86056256, + "learning_rate": 0.000655337885786588, + "loss": 0.87128818, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.2890625, + "step": 2170, + "time_per_iteration": 2.8887124061584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081102, + "balance_loss_mlp": 1.05139482, + "epoch": 0.41766063870719505, + "flos": 519501454848.0, + "grad_norm": 0.08227745310603136, + "language_loss": 0.84896123, + "learning_rate": 0.0006550417297183025, + "loss": 0.85977226, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.29663086, + "step": 2171, + "time_per_iteration": 2.6285011768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088317, + "balance_loss_mlp": 1.05894339, + "epoch": 0.41785302039245864, + "flos": 557656897536.0, + "grad_norm": 0.05761128029173598, + "language_loss": 0.81863701, + "learning_rate": 0.0006547455134574793, + "loss": 0.82952011, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.29321289, + "step": 2172, + "time_per_iteration": 2.7729623317718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089062, + "balance_loss_mlp": 1.06040442, + "epoch": 0.41804540207772223, + "flos": 788156683776.0, + "grad_norm": 0.06792239619892874, + "language_loss": 0.83893955, + "learning_rate": 0.0006544492371191198, + "loss": 0.84983015, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.28613281, + "step": 2173, + "time_per_iteration": 3.1256158351898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094435, + "balance_loss_mlp": 1.06477547, + "epoch": 0.41823778376298576, + "flos": 903944240640.0, + "grad_norm": 0.05504184984792058, + "language_loss": 0.83198339, + "learning_rate": 0.0006541529008182485, + "loss": 0.84292769, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.29638672, + "step": 2174, + "time_per_iteration": 3.207711696624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093664, + "balance_loss_mlp": 1.0648396, + "epoch": 0.41843016544824935, + "flos": 511308440064.0, + "grad_norm": 0.07199426026259947, + "language_loss": 0.87529659, + "learning_rate": 0.0006538565046699136, + "loss": 0.88623327, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.28808594, + "step": 2175, + "time_per_iteration": 2.5804800987243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090181, + "balance_loss_mlp": 1.06207108, + "epoch": 0.4186225471335129, + "flos": 652774947840.0, + "grad_norm": 0.06367136059390696, + "language_loss": 0.80982441, + "learning_rate": 0.0006535600487891862, + "loss": 0.82072628, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.28149414, + "step": 2176, + "time_per_iteration": 2.7804555892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087535, + "balance_loss_mlp": 1.05870986, + "epoch": 0.41881492881877647, + "flos": 568892298240.0, + "grad_norm": 0.05631892460787088, + "language_loss": 0.89099276, + "learning_rate": 0.0006532635332911603, + "loss": 0.9018681, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.28808594, + "step": 2177, + "time_per_iteration": 2.641392707824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083587, + "balance_loss_mlp": 1.05428553, + "epoch": 0.41900731050404, + "flos": 911478127104.0, + "grad_norm": 0.06086903625614387, + "language_loss": 0.80636132, + "learning_rate": 0.0006529669582909541, + "loss": 0.8171972, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.29296875, + "step": 2178, + "time_per_iteration": 3.2258243560791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079831, + "balance_loss_mlp": 1.0508393, + "epoch": 0.4191996921893036, + "flos": 535498988544.0, + "grad_norm": 0.06798611784395944, + "language_loss": 0.85681045, + "learning_rate": 0.0006526703239037077, + "loss": 0.86760873, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.28955078, + "step": 2179, + "time_per_iteration": 2.66808819770813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077242, + "balance_loss_mlp": 1.0480361, + "epoch": 0.4193920738745671, + "flos": 582363979776.0, + "grad_norm": 0.06231650691948033, + "language_loss": 0.86236274, + "learning_rate": 0.0006523736302445851, + "loss": 0.87313515, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.29174805, + "step": 2180, + "time_per_iteration": 2.768888473510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074827, + "balance_loss_mlp": 1.04490554, + "epoch": 0.4195844555598307, + "flos": 1335279720960.0, + "grad_norm": 0.05646655403971755, + "language_loss": 0.77122605, + "learning_rate": 0.0006520768774287728, + "loss": 0.78197432, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.29882812, + "step": 2181, + "time_per_iteration": 3.7851996421813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077657, + "balance_loss_mlp": 1.04899919, + "epoch": 0.4197768372450943, + "flos": 598480786944.0, + "grad_norm": 0.05195874321999793, + "language_loss": 0.85622293, + "learning_rate": 0.0006517800655714806, + "loss": 0.86699945, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.28686523, + "step": 2182, + "time_per_iteration": 2.8000948429107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083179, + "balance_loss_mlp": 1.05359161, + "epoch": 0.4199692189303578, + "flos": 734929085952.0, + "grad_norm": 0.06393427474455515, + "language_loss": 0.85246432, + "learning_rate": 0.0006514831947879407, + "loss": 0.86329615, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.2956543, + "step": 2183, + "time_per_iteration": 2.946345329284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090824, + "balance_loss_mlp": 1.06164193, + "epoch": 0.4201616006156214, + "flos": 749854264320.0, + "grad_norm": 0.05990675678964555, + "language_loss": 0.78013611, + "learning_rate": 0.0006511862651934091, + "loss": 0.79104435, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.29174805, + "step": 2184, + "time_per_iteration": 3.043314218521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087348, + "balance_loss_mlp": 1.05797458, + "epoch": 0.42035398230088494, + "flos": 546764912640.0, + "grad_norm": 0.05608517861748944, + "language_loss": 0.82263517, + "learning_rate": 0.0006508892769031638, + "loss": 0.83350861, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.29345703, + "step": 2185, + "time_per_iteration": 2.662071704864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090134, + "balance_loss_mlp": 1.06052232, + "epoch": 0.42054636398614853, + "flos": 616628823552.0, + "grad_norm": 0.07931700187887496, + "language_loss": 0.86476076, + "learning_rate": 0.000650592230032506, + "loss": 0.87566209, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.2956543, + "step": 2186, + "time_per_iteration": 2.758989095687866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094562, + "balance_loss_mlp": 1.06464052, + "epoch": 0.42073874567141206, + "flos": 640077285888.0, + "grad_norm": 0.06900651751722174, + "language_loss": 0.84912258, + "learning_rate": 0.0006502951246967595, + "loss": 0.8600682, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.29882812, + "step": 2187, + "time_per_iteration": 2.9305953979492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091948, + "balance_loss_mlp": 1.06274199, + "epoch": 0.42093112735667565, + "flos": 493524168192.0, + "grad_norm": 0.061550495040686125, + "language_loss": 0.86992055, + "learning_rate": 0.0006499979610112706, + "loss": 0.88084006, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.29150391, + "step": 2188, + "time_per_iteration": 2.6826889514923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091259, + "balance_loss_mlp": 1.06205249, + "epoch": 0.4211235090419392, + "flos": 542097321984.0, + "grad_norm": 0.05090003048385584, + "language_loss": 0.84021527, + "learning_rate": 0.000649700739091409, + "loss": 0.85112786, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.29125977, + "step": 2189, + "time_per_iteration": 2.7169277667999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058015, + "balance_loss_mlp": 1.04628468, + "epoch": 0.42131589072720277, + "flos": 1531342651392.0, + "grad_norm": 0.03212522571547254, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.74894285, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.1171875, + "step": 2190, + "time_per_iteration": 4.8044211864471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094227, + "balance_loss_mlp": 1.06645083, + "epoch": 0.42150827241246636, + "flos": 566583234048.0, + "grad_norm": 0.05853660814181512, + "language_loss": 0.85258055, + "learning_rate": 0.0006491061210101557, + "loss": 0.86352277, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.27832031, + "step": 2191, + "time_per_iteration": 2.6850759983062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093463, + "balance_loss_mlp": 1.06554449, + "epoch": 0.4217006540977299, + "flos": 707242226688.0, + "grad_norm": 0.05791259848064641, + "language_loss": 0.84111977, + "learning_rate": 0.0006488087250796157, + "loss": 0.85205436, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.27905273, + "step": 2192, + "time_per_iteration": 2.8877995014190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099215, + "balance_loss_mlp": 1.07148743, + "epoch": 0.4218930357829935, + "flos": 626975161344.0, + "grad_norm": 0.0649444731235166, + "language_loss": 0.81518376, + "learning_rate": 0.0006485112713764049, + "loss": 0.82617593, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.27734375, + "step": 2193, + "time_per_iteration": 2.910949468612671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102268, + "balance_loss_mlp": 1.07523096, + "epoch": 0.422085417468257, + "flos": 460110509568.0, + "grad_norm": 0.07813881123096035, + "language_loss": 0.83433115, + "learning_rate": 0.0006482137600160051, + "loss": 0.84535384, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.27075195, + "step": 2194, + "time_per_iteration": 2.5086262226104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096994, + "balance_loss_mlp": 1.06900394, + "epoch": 0.4222777991535206, + "flos": 473788804608.0, + "grad_norm": 0.07794223585413998, + "language_loss": 0.84987926, + "learning_rate": 0.0006479161911139206, + "loss": 0.86084926, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.2800293, + "step": 2195, + "time_per_iteration": 2.5875346660614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109264, + "balance_loss_mlp": 1.06493604, + "epoch": 0.4224701808387841, + "flos": 470647494144.0, + "grad_norm": 0.07304716613473786, + "language_loss": 0.85472345, + "learning_rate": 0.0006476185647856778, + "loss": 0.86564982, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.27734375, + "step": 2196, + "time_per_iteration": 2.5596694946289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083263, + "balance_loss_mlp": 1.05589223, + "epoch": 0.4226625625240477, + "flos": 677202633216.0, + "grad_norm": 0.0787732151202365, + "language_loss": 0.81599677, + "learning_rate": 0.0006473208811468255, + "loss": 0.82682943, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.27416992, + "step": 2197, + "time_per_iteration": 2.8756632804870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082675, + "balance_loss_mlp": 1.05518579, + "epoch": 0.4228549442093113, + "flos": 503268194304.0, + "grad_norm": 0.05582038208417147, + "language_loss": 0.84304923, + "learning_rate": 0.0006470231403129347, + "loss": 0.85387599, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.27490234, + "step": 2198, + "time_per_iteration": 2.6008548736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082097, + "balance_loss_mlp": 1.05444098, + "epoch": 0.42304732589457483, + "flos": 611543623680.0, + "grad_norm": 0.05486589756973033, + "language_loss": 0.81627637, + "learning_rate": 0.0006467253423995988, + "loss": 0.8270973, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.27685547, + "step": 2199, + "time_per_iteration": 2.8359298706054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085734, + "balance_loss_mlp": 1.05788624, + "epoch": 0.4232397075798384, + "flos": 515302345728.0, + "grad_norm": 0.06443704109820439, + "language_loss": 0.79415488, + "learning_rate": 0.000646427487522433, + "loss": 0.80501223, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.27880859, + "step": 2200, + "time_per_iteration": 2.6884772777557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089933, + "balance_loss_mlp": 1.06251502, + "epoch": 0.42343208926510195, + "flos": 589513752576.0, + "grad_norm": 0.06462007516901433, + "language_loss": 0.83460814, + "learning_rate": 0.0006461295757970749, + "loss": 0.8455075, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.27441406, + "step": 2201, + "time_per_iteration": 2.7960758209228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110052, + "balance_loss_mlp": 1.07140875, + "epoch": 0.42362447095036554, + "flos": 640342126080.0, + "grad_norm": 0.08363319364773283, + "language_loss": 0.81312859, + "learning_rate": 0.0006458316073391839, + "loss": 0.82413375, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.29101562, + "step": 2202, + "time_per_iteration": 2.853297472000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096557, + "balance_loss_mlp": 1.06830478, + "epoch": 0.42381685263562907, + "flos": 512421493248.0, + "grad_norm": 0.0711769658628502, + "language_loss": 0.87750852, + "learning_rate": 0.0006455335822644422, + "loss": 0.88847411, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.28271484, + "step": 2203, + "time_per_iteration": 2.6077048778533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110502, + "balance_loss_mlp": 1.07607579, + "epoch": 0.42400923432089266, + "flos": 546523393536.0, + "grad_norm": 0.061615225293076246, + "language_loss": 0.77729923, + "learning_rate": 0.0006452355006885527, + "loss": 0.78834939, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.28930664, + "step": 2204, + "time_per_iteration": 2.6517252922058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103628, + "balance_loss_mlp": 1.07442212, + "epoch": 0.4242016160061562, + "flos": 621872584704.0, + "grad_norm": 0.1220032897030914, + "language_loss": 0.86957574, + "learning_rate": 0.0006449373627272412, + "loss": 0.88061202, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.29199219, + "step": 2205, + "time_per_iteration": 2.7004148960113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093739, + "balance_loss_mlp": 1.06515288, + "epoch": 0.4243939976914198, + "flos": 571649495040.0, + "grad_norm": 0.07705045910796138, + "language_loss": 0.82556224, + "learning_rate": 0.0006446391684962553, + "loss": 0.83649963, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.28588867, + "step": 2206, + "time_per_iteration": 2.6505441665649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083745, + "balance_loss_mlp": 1.05558801, + "epoch": 0.42458637937668336, + "flos": 448509934080.0, + "grad_norm": 0.0589868983385633, + "language_loss": 0.82958955, + "learning_rate": 0.000644340918111364, + "loss": 0.84042698, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.28149414, + "step": 2207, + "time_per_iteration": 2.6410183906555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079008, + "balance_loss_mlp": 1.05011129, + "epoch": 0.4247787610619469, + "flos": 435176464896.0, + "grad_norm": 0.05680611388250626, + "language_loss": 0.84805965, + "learning_rate": 0.0006440426116883585, + "loss": 0.8588497, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.28857422, + "step": 2208, + "time_per_iteration": 2.5708625316619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074083, + "balance_loss_mlp": 1.04478097, + "epoch": 0.4249711427472105, + "flos": 495818675712.0, + "grad_norm": 0.06224422813064936, + "language_loss": 0.86093891, + "learning_rate": 0.0006437442493430519, + "loss": 0.87167978, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.29248047, + "step": 2209, + "time_per_iteration": 2.70894718170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074378, + "balance_loss_mlp": 1.04481411, + "epoch": 0.425163524432474, + "flos": 655498649088.0, + "grad_norm": 0.07482969618411565, + "language_loss": 0.86115217, + "learning_rate": 0.000643445831191278, + "loss": 0.87189603, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.29492188, + "step": 2210, + "time_per_iteration": 2.924381971359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076507, + "balance_loss_mlp": 1.0465858, + "epoch": 0.4253559061177376, + "flos": 650317496832.0, + "grad_norm": 0.07331466132736943, + "language_loss": 0.81421846, + "learning_rate": 0.0006431473573488937, + "loss": 0.82498354, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.29882812, + "step": 2211, + "time_per_iteration": 2.7787976264953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073299, + "balance_loss_mlp": 1.04380631, + "epoch": 0.42554828780300114, + "flos": 553894336512.0, + "grad_norm": 0.07883329281510759, + "language_loss": 0.84917492, + "learning_rate": 0.0006428488279317765, + "loss": 0.85990787, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.29443359, + "step": 2212, + "time_per_iteration": 2.6664369106292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070733, + "balance_loss_mlp": 1.04052496, + "epoch": 0.4257406694882647, + "flos": 514154386944.0, + "grad_norm": 0.06306745469338368, + "language_loss": 0.87706983, + "learning_rate": 0.0006425502430558259, + "loss": 0.88777709, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.30151367, + "step": 2213, + "time_per_iteration": 2.6229989528656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071107, + "balance_loss_mlp": 1.04106641, + "epoch": 0.42593305117352825, + "flos": 515380921344.0, + "grad_norm": 0.0655798606724697, + "language_loss": 0.84705913, + "learning_rate": 0.0006422516028369628, + "loss": 0.8577702, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.30004883, + "step": 2214, + "time_per_iteration": 2.69012451171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072564, + "balance_loss_mlp": 1.04197454, + "epoch": 0.42612543285879184, + "flos": 587766302208.0, + "grad_norm": 0.08051577462794157, + "language_loss": 0.83543354, + "learning_rate": 0.0006419529073911296, + "loss": 0.84615922, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.30541992, + "step": 2215, + "time_per_iteration": 2.873396873474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070818, + "balance_loss_mlp": 1.03987157, + "epoch": 0.42631781454405543, + "flos": 635153619456.0, + "grad_norm": 0.05918367623789858, + "language_loss": 0.85362011, + "learning_rate": 0.0006416541568342901, + "loss": 0.86432827, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.30908203, + "step": 2216, + "time_per_iteration": 2.870213508605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071511, + "balance_loss_mlp": 1.04161358, + "epoch": 0.42651019622931896, + "flos": 540891136512.0, + "grad_norm": 0.06028802274016953, + "language_loss": 0.8413707, + "learning_rate": 0.0006413553512824297, + "loss": 0.85208583, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.29858398, + "step": 2217, + "time_per_iteration": 2.7570102214813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066011, + "balance_loss_mlp": 1.03599358, + "epoch": 0.42670257791458255, + "flos": 557892624384.0, + "grad_norm": 0.06136950817587928, + "language_loss": 0.8441695, + "learning_rate": 0.0006410564908515549, + "loss": 0.85482961, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.29980469, + "step": 2218, + "time_per_iteration": 2.634636878967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072201, + "balance_loss_mlp": 1.04122996, + "epoch": 0.4268949595998461, + "flos": 621025781760.0, + "grad_norm": 0.05945328981992575, + "language_loss": 0.85267186, + "learning_rate": 0.0006407575756576935, + "loss": 0.8633939, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.30957031, + "step": 2219, + "time_per_iteration": 2.7264437675476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076309, + "balance_loss_mlp": 1.04512346, + "epoch": 0.42708734128510967, + "flos": 537646519296.0, + "grad_norm": 0.08352776642532155, + "language_loss": 0.87413085, + "learning_rate": 0.0006404586058168951, + "loss": 0.88489389, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.31152344, + "step": 2220, + "time_per_iteration": 2.740231513977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070252, + "balance_loss_mlp": 1.03906727, + "epoch": 0.4272797229703732, + "flos": 502617830400.0, + "grad_norm": 0.06337599132559579, + "language_loss": 0.86675316, + "learning_rate": 0.0006401595814452296, + "loss": 0.87745565, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.31152344, + "step": 2221, + "time_per_iteration": 2.595133066177368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010734, + "balance_loss_mlp": 1.04316878, + "epoch": 0.4274721046556368, + "flos": 492208883712.0, + "grad_norm": 0.05998559409639075, + "language_loss": 0.80837309, + "learning_rate": 0.000639860502658789, + "loss": 0.81910712, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.30224609, + "step": 2222, + "time_per_iteration": 2.6363143920898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078431, + "balance_loss_mlp": 1.04805684, + "epoch": 0.4276644863409004, + "flos": 568094957568.0, + "grad_norm": 0.051235249414951084, + "language_loss": 0.85047621, + "learning_rate": 0.0006395613695736853, + "loss": 0.86126053, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.3034668, + "step": 2223, + "time_per_iteration": 2.719651222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088807, + "balance_loss_mlp": 1.0574553, + "epoch": 0.4278568680261639, + "flos": 607155429888.0, + "grad_norm": 0.14370485886555942, + "language_loss": 0.82013905, + "learning_rate": 0.0006392621823060529, + "loss": 0.83102709, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.31347656, + "step": 2224, + "time_per_iteration": 2.707019805908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108025, + "balance_loss_mlp": 1.04968464, + "epoch": 0.4280492497114275, + "flos": 560265707520.0, + "grad_norm": 0.06727581417341866, + "language_loss": 0.84405053, + "learning_rate": 0.0006389629409720465, + "loss": 0.85485303, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.30541992, + "step": 2225, + "time_per_iteration": 2.6877145767211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074744, + "balance_loss_mlp": 1.04415512, + "epoch": 0.428241631396691, + "flos": 720334176768.0, + "grad_norm": 0.06967859590672425, + "language_loss": 0.88595277, + "learning_rate": 0.0006386636456878417, + "loss": 0.89670026, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.30566406, + "step": 2226, + "time_per_iteration": 2.87302827835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073672, + "balance_loss_mlp": 1.04344106, + "epoch": 0.4284340130819546, + "flos": 429243052032.0, + "grad_norm": 0.07126154474787791, + "language_loss": 0.92022073, + "learning_rate": 0.0006383642965696353, + "loss": 0.93095744, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.30175781, + "step": 2227, + "time_per_iteration": 2.4469897747039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075351, + "balance_loss_mlp": 1.04492915, + "epoch": 0.42862639476721814, + "flos": 524732069376.0, + "grad_norm": 0.06843530557124561, + "language_loss": 0.82703793, + "learning_rate": 0.000638064893733645, + "loss": 0.83779144, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.30371094, + "step": 2228, + "time_per_iteration": 2.7728607654571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071747, + "balance_loss_mlp": 1.04256451, + "epoch": 0.42881877645248173, + "flos": 465089430528.0, + "grad_norm": 0.058089035035371744, + "language_loss": 0.89580554, + "learning_rate": 0.000637765437296109, + "loss": 0.90652299, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.29199219, + "step": 2229, + "time_per_iteration": 2.634521484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072661, + "balance_loss_mlp": 1.04252505, + "epoch": 0.42901115813774526, + "flos": 560034362880.0, + "grad_norm": 0.07373798457938027, + "language_loss": 0.85480672, + "learning_rate": 0.000637465927373287, + "loss": 0.86553335, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.30126953, + "step": 2230, + "time_per_iteration": 2.6294057369232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082832, + "balance_loss_mlp": 1.05276728, + "epoch": 0.42920353982300885, + "flos": 561186703872.0, + "grad_norm": 0.08134114280474665, + "language_loss": 0.79152465, + "learning_rate": 0.000637166364081459, + "loss": 0.80235291, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.30004883, + "step": 2231, + "time_per_iteration": 2.651043176651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107691, + "balance_loss_mlp": 1.04837155, + "epoch": 0.42939592150827244, + "flos": 555982230528.0, + "grad_norm": 0.0656552791827552, + "language_loss": 0.83965945, + "learning_rate": 0.0006368667475369256, + "loss": 0.85042852, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.28515625, + "step": 2232, + "time_per_iteration": 2.749769687652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072336, + "balance_loss_mlp": 1.05898428, + "epoch": 0.42958830319353597, + "flos": 1520796902400.0, + "grad_norm": 0.038311067760931045, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79600114, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.13378906, + "step": 2233, + "time_per_iteration": 4.919846773147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010537, + "balance_loss_mlp": 1.04044378, + "epoch": 0.42978068487879956, + "flos": 1495052522496.0, + "grad_norm": 0.026216416348918452, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.79949123, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.1328125, + "step": 2234, + "time_per_iteration": 4.814115285873413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109183, + "balance_loss_mlp": 1.06281483, + "epoch": 0.4299730665640631, + "flos": 546725624832.0, + "grad_norm": 0.052673535005773216, + "language_loss": 0.85474288, + "learning_rate": 0.0006359675795504112, + "loss": 0.86566114, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.29003906, + "step": 2235, + "time_per_iteration": 2.7002832889556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097467, + "balance_loss_mlp": 1.07021558, + "epoch": 0.4301654482493267, + "flos": 1128839473152.0, + "grad_norm": 0.08125384058814748, + "language_loss": 0.74334383, + "learning_rate": 0.0006356677511584775, + "loss": 0.75431848, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.27294922, + "step": 2236, + "time_per_iteration": 3.472095012664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096497, + "balance_loss_mlp": 1.06938839, + "epoch": 0.4303578299345902, + "flos": 495502963200.0, + "grad_norm": 0.06719636161557083, + "language_loss": 0.85933757, + "learning_rate": 0.0006353678700956511, + "loss": 0.8703025, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.27148438, + "step": 2237, + "time_per_iteration": 2.6188535690307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089994, + "balance_loss_mlp": 1.06288612, + "epoch": 0.4305502116198538, + "flos": 615472100352.0, + "grad_norm": 0.09054713742221257, + "language_loss": 0.83597302, + "learning_rate": 0.0006350679364783569, + "loss": 0.84687304, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.27172852, + "step": 2238, + "time_per_iteration": 2.7403035163879395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093799, + "balance_loss_mlp": 1.0661664, + "epoch": 0.4307425933051173, + "flos": 558995503104.0, + "grad_norm": 0.06694912929746479, + "language_loss": 0.85728157, + "learning_rate": 0.0006347679504230393, + "loss": 0.86821961, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.27661133, + "step": 2239, + "time_per_iteration": 2.652348041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087161, + "balance_loss_mlp": 1.05974269, + "epoch": 0.4309349749903809, + "flos": 971755163136.0, + "grad_norm": 0.056527008755361936, + "language_loss": 0.75895661, + "learning_rate": 0.0006344679120461632, + "loss": 0.7698282, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.27416992, + "step": 2240, + "time_per_iteration": 3.334127187728882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091078, + "balance_loss_mlp": 1.06435084, + "epoch": 0.4311273566756445, + "flos": 541663746048.0, + "grad_norm": 0.1917370324350853, + "language_loss": 0.80061769, + "learning_rate": 0.0006341678214642134, + "loss": 0.81152856, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.26782227, + "step": 2241, + "time_per_iteration": 2.6100823879241943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087616, + "balance_loss_mlp": 1.06103277, + "epoch": 0.43131973836090803, + "flos": 761316627456.0, + "grad_norm": 0.06088249389193946, + "language_loss": 0.82893783, + "learning_rate": 0.0006338676787936963, + "loss": 0.83981395, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.26635742, + "step": 2242, + "time_per_iteration": 3.077916383743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109796, + "balance_loss_mlp": 1.07142353, + "epoch": 0.4315121200461716, + "flos": 554263893504.0, + "grad_norm": 0.060062439107852666, + "language_loss": 0.8377043, + "learning_rate": 0.0006335674841511367, + "loss": 0.84868383, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.26586914, + "step": 2243, + "time_per_iteration": 2.665205955505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066457, + "balance_loss_mlp": 1.05415499, + "epoch": 0.43170450173143515, + "flos": 1484499419136.0, + "grad_norm": 0.03077915513708162, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80247629, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.12255859, + "step": 2244, + "time_per_iteration": 5.000265121459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060995, + "balance_loss_mlp": 1.04878819, + "epoch": 0.43189688341669874, + "flos": 1472897433600.0, + "grad_norm": 0.03064763148494063, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.7842654, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.12207031, + "step": 2245, + "time_per_iteration": 4.9160850048065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093506, + "balance_loss_mlp": 1.06594431, + "epoch": 0.43208926510196227, + "flos": 492677365248.0, + "grad_norm": 0.06803490831657065, + "language_loss": 0.82597309, + "learning_rate": 0.0006326665895567652, + "loss": 0.83690816, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.2755127, + "step": 2246, + "time_per_iteration": 2.6449503898620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084672, + "balance_loss_mlp": 1.05649078, + "epoch": 0.43228164678722586, + "flos": 519969936384.0, + "grad_norm": 0.07553831830843152, + "language_loss": 0.87537026, + "learning_rate": 0.0006323661881916976, + "loss": 0.88621694, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.28173828, + "step": 2247, + "time_per_iteration": 2.699899911880493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088894, + "balance_loss_mlp": 1.05983043, + "epoch": 0.4324740284724894, + "flos": 795722655744.0, + "grad_norm": 0.05605692822142187, + "language_loss": 0.80999863, + "learning_rate": 0.0006320657354375179, + "loss": 0.82088757, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.2902832, + "step": 2248, + "time_per_iteration": 2.9737963676452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082946, + "balance_loss_mlp": 1.05374026, + "epoch": 0.432666410157753, + "flos": 481917800448.0, + "grad_norm": 0.1777496827938913, + "language_loss": 0.87151104, + "learning_rate": 0.0006317652314108726, + "loss": 0.88234049, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.29150391, + "step": 2249, + "time_per_iteration": 2.5640759468078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076296, + "balance_loss_mlp": 1.04782867, + "epoch": 0.43285879184301657, + "flos": 499963940352.0, + "grad_norm": 0.059764616303547735, + "language_loss": 0.91275859, + "learning_rate": 0.0006314646762284277, + "loss": 0.92352152, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.28442383, + "step": 2250, + "time_per_iteration": 2.6878976821899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056511, + "balance_loss_mlp": 1.04401791, + "epoch": 0.4330511735282801, + "flos": 1509615346176.0, + "grad_norm": 0.026928771485436313, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76482344, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.125, + "step": 2251, + "time_per_iteration": 4.839360475540161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079121, + "balance_loss_mlp": 1.04931927, + "epoch": 0.4332435552135437, + "flos": 699270382080.0, + "grad_norm": 0.05685438588579276, + "language_loss": 0.77368456, + "learning_rate": 0.0006308634128629022, + "loss": 0.78447574, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.29785156, + "step": 2252, + "time_per_iteration": 2.895348072052002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083422, + "balance_loss_mlp": 1.05426395, + "epoch": 0.4334359368988072, + "flos": 591995934720.0, + "grad_norm": 0.07214959985253801, + "language_loss": 0.87411779, + "learning_rate": 0.0006305627049132531, + "loss": 0.88495201, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.29125977, + "step": 2253, + "time_per_iteration": 2.8069100379943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083541, + "balance_loss_mlp": 1.05440617, + "epoch": 0.4336283185840708, + "flos": 842440670208.0, + "grad_norm": 0.059293193490882155, + "language_loss": 0.85926008, + "learning_rate": 0.0006302619462746662, + "loss": 0.87009549, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.29101562, + "step": 2254, + "time_per_iteration": 3.1606533527374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080071, + "balance_loss_mlp": 1.05193734, + "epoch": 0.43382070026933434, + "flos": 625974179328.0, + "grad_norm": 0.05505451724174187, + "language_loss": 0.89697909, + "learning_rate": 0.0006299611370639069, + "loss": 0.90777981, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.28149414, + "step": 2255, + "time_per_iteration": 2.734578847885132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082321, + "balance_loss_mlp": 1.05368638, + "epoch": 0.4340130819545979, + "flos": 590837801472.0, + "grad_norm": 0.06498253441528982, + "language_loss": 0.79077351, + "learning_rate": 0.0006296602773977593, + "loss": 0.80159676, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.28637695, + "step": 2256, + "time_per_iteration": 2.7210190296173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086863, + "balance_loss_mlp": 1.0577755, + "epoch": 0.4342054636398615, + "flos": 490624376832.0, + "grad_norm": 0.06552918038966793, + "language_loss": 0.87430996, + "learning_rate": 0.0006293593673930277, + "loss": 0.88517857, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.2902832, + "step": 2257, + "time_per_iteration": 2.6526098251342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087005, + "balance_loss_mlp": 1.05851448, + "epoch": 0.43439784532512504, + "flos": 698679654912.0, + "grad_norm": 0.06677812911461618, + "language_loss": 0.78416431, + "learning_rate": 0.0006290584071665358, + "loss": 0.79503441, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.28491211, + "step": 2258, + "time_per_iteration": 2.915259838104248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086079, + "balance_loss_mlp": 1.0575645, + "epoch": 0.43459022701038863, + "flos": 485581436928.0, + "grad_norm": 0.06990053073214272, + "language_loss": 0.81982124, + "learning_rate": 0.0006287573968351266, + "loss": 0.83068204, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.28515625, + "step": 2259, + "time_per_iteration": 2.5836570262908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082362, + "balance_loss_mlp": 1.05432403, + "epoch": 0.43478260869565216, + "flos": 642818515968.0, + "grad_norm": 0.06494033905479386, + "language_loss": 0.82220829, + "learning_rate": 0.0006284563365156626, + "loss": 0.83303189, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.28076172, + "step": 2260, + "time_per_iteration": 2.815223217010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084558, + "balance_loss_mlp": 1.05620956, + "epoch": 0.43497499038091575, + "flos": 425870396928.0, + "grad_norm": 0.07047722124208498, + "language_loss": 0.87564874, + "learning_rate": 0.0006281552263250261, + "loss": 0.88649434, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.28344727, + "step": 2261, + "time_per_iteration": 2.4715116024017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106204, + "balance_loss_mlp": 1.04964256, + "epoch": 0.4351673720661793, + "flos": 1537594748928.0, + "grad_norm": 0.023387556142435376, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81753576, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.12402344, + "step": 2262, + "time_per_iteration": 4.811767101287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084425, + "balance_loss_mlp": 1.05641103, + "epoch": 0.43535975375144287, + "flos": 748828551168.0, + "grad_norm": 0.062970719214795, + "language_loss": 0.81474411, + "learning_rate": 0.0006275528567978593, + "loss": 0.82558835, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.28051758, + "step": 2263, + "time_per_iteration": 2.9182233810424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096573, + "balance_loss_mlp": 1.06877375, + "epoch": 0.4355521354367064, + "flos": 860914593792.0, + "grad_norm": 0.06472545743832298, + "language_loss": 0.82352197, + "learning_rate": 0.0006272515976951898, + "loss": 0.83448768, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.27832031, + "step": 2264, + "time_per_iteration": 3.137770175933838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097325, + "balance_loss_mlp": 1.06852436, + "epoch": 0.43574451712197, + "flos": 734200146432.0, + "grad_norm": 0.055887733519337984, + "language_loss": 0.79332447, + "learning_rate": 0.0006269502891890687, + "loss": 0.8042978, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.28759766, + "step": 2265, + "time_per_iteration": 2.9932398796081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093111, + "balance_loss_mlp": 1.06526363, + "epoch": 0.4359368988072336, + "flos": 570296332800.0, + "grad_norm": 0.06217907852457908, + "language_loss": 0.87852293, + "learning_rate": 0.0006266489313964743, + "loss": 0.88945401, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.27880859, + "step": 2266, + "time_per_iteration": 2.720874547958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090878, + "balance_loss_mlp": 1.06338787, + "epoch": 0.4361292804924971, + "flos": 555244526592.0, + "grad_norm": 0.05517220152754215, + "language_loss": 0.85363281, + "learning_rate": 0.0006263475244344041, + "loss": 0.86454159, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.27514648, + "step": 2267, + "time_per_iteration": 2.8508987426757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089804, + "balance_loss_mlp": 1.06178975, + "epoch": 0.4363216621777607, + "flos": 557021090304.0, + "grad_norm": 0.061658084399303315, + "language_loss": 0.84817886, + "learning_rate": 0.0006260460684198746, + "loss": 0.85907692, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.28027344, + "step": 2268, + "time_per_iteration": 2.6972851753234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091639, + "balance_loss_mlp": 1.06395864, + "epoch": 0.4365140438630242, + "flos": 477979149312.0, + "grad_norm": 0.07163404822705746, + "language_loss": 0.84593827, + "learning_rate": 0.0006257445634699213, + "loss": 0.85685468, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.27734375, + "step": 2269, + "time_per_iteration": 2.562509298324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083836, + "balance_loss_mlp": 1.05565524, + "epoch": 0.4367064255482878, + "flos": 578646498816.0, + "grad_norm": 0.07106993063326117, + "language_loss": 0.82829607, + "learning_rate": 0.0006254430097015993, + "loss": 0.8391344, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.28222656, + "step": 2270, + "time_per_iteration": 2.6713523864746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054528, + "balance_loss_mlp": 1.04203498, + "epoch": 0.43689880723355135, + "flos": 1458117669888.0, + "grad_norm": 0.029151500829202304, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77533615, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.125, + "step": 2271, + "time_per_iteration": 4.761755466461182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086484, + "balance_loss_mlp": 1.05801725, + "epoch": 0.43709118891881493, + "flos": 667295663616.0, + "grad_norm": 0.05590316940209524, + "language_loss": 0.85155964, + "learning_rate": 0.0006248397561781609, + "loss": 0.86242455, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.28491211, + "step": 2272, + "time_per_iteration": 2.8541359901428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091334, + "balance_loss_mlp": 1.06246173, + "epoch": 0.43728357060407846, + "flos": 544612999680.0, + "grad_norm": 0.07335127222093174, + "language_loss": 0.8601104, + "learning_rate": 0.0006245380566572482, + "loss": 0.87102377, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.28857422, + "step": 2273, + "time_per_iteration": 2.6526312828063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090326, + "balance_loss_mlp": 1.06200182, + "epoch": 0.43747595228934205, + "flos": 746504930304.0, + "grad_norm": 0.06592567136619501, + "language_loss": 0.76039565, + "learning_rate": 0.0006242363087863744, + "loss": 0.77129889, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.28344727, + "step": 2274, + "time_per_iteration": 2.9512767791748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089474, + "balance_loss_mlp": 1.06129336, + "epoch": 0.43766833397460564, + "flos": 631060789248.0, + "grad_norm": 0.07045204489750885, + "language_loss": 0.86392975, + "learning_rate": 0.0006239345126826878, + "loss": 0.87482452, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.28198242, + "step": 2275, + "time_per_iteration": 2.818574905395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081719, + "balance_loss_mlp": 1.05236995, + "epoch": 0.43786071565986917, + "flos": 530709152256.0, + "grad_norm": 0.06271142699552738, + "language_loss": 0.8405596, + "learning_rate": 0.0006236326684633561, + "loss": 0.85137677, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.29296875, + "step": 2276, + "time_per_iteration": 2.8501060009002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088499, + "balance_loss_mlp": 1.05972195, + "epoch": 0.43805309734513276, + "flos": 538295473152.0, + "grad_norm": 0.08224081940065299, + "language_loss": 0.75057948, + "learning_rate": 0.0006233307762455658, + "loss": 0.76146448, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.28735352, + "step": 2277, + "time_per_iteration": 2.6692187786102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079787, + "balance_loss_mlp": 1.05098617, + "epoch": 0.4382454790303963, + "flos": 864188324352.0, + "grad_norm": 0.1351794781054828, + "language_loss": 0.83103114, + "learning_rate": 0.0006230288361465216, + "loss": 0.84182906, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.2878418, + "step": 2278, + "time_per_iteration": 3.0566518306732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081672, + "balance_loss_mlp": 1.05389631, + "epoch": 0.4384378607156599, + "flos": 765175292928.0, + "grad_norm": 0.0635725084076576, + "language_loss": 0.85047072, + "learning_rate": 0.0006227268482834473, + "loss": 0.86128747, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.27783203, + "step": 2279, + "time_per_iteration": 2.890195608139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086149, + "balance_loss_mlp": 1.05811095, + "epoch": 0.4386302424009234, + "flos": 668260329984.0, + "grad_norm": 0.06574285370830908, + "language_loss": 0.87371957, + "learning_rate": 0.000622424812773585, + "loss": 0.88458109, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.28076172, + "step": 2280, + "time_per_iteration": 2.820857524871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108464, + "balance_loss_mlp": 1.05698299, + "epoch": 0.438822624086187, + "flos": 484941247488.0, + "grad_norm": 0.08150674529849485, + "language_loss": 0.80050623, + "learning_rate": 0.000622122729734195, + "loss": 0.81135261, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.27685547, + "step": 2281, + "time_per_iteration": 2.5578882694244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090722, + "balance_loss_mlp": 1.06320858, + "epoch": 0.4390150057714506, + "flos": 498959986176.0, + "grad_norm": 0.05652917217777931, + "language_loss": 0.87423271, + "learning_rate": 0.0006218205992825566, + "loss": 0.88513994, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.27539062, + "step": 2282, + "time_per_iteration": 2.6367194652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087621, + "balance_loss_mlp": 1.05989254, + "epoch": 0.4392073874567141, + "flos": 557937704448.0, + "grad_norm": 0.06387466426791162, + "language_loss": 0.81580615, + "learning_rate": 0.0006215184215359671, + "loss": 0.82668239, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.27758789, + "step": 2283, + "time_per_iteration": 2.7550642490386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109022, + "balance_loss_mlp": 1.06254005, + "epoch": 0.4393997691419777, + "flos": 605028248064.0, + "grad_norm": 0.06853375358246538, + "language_loss": 0.86762869, + "learning_rate": 0.0006212161966117425, + "loss": 0.87853086, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.27709961, + "step": 2284, + "time_per_iteration": 2.7315139770507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093132, + "balance_loss_mlp": 1.06492722, + "epoch": 0.43959215082724123, + "flos": 803812363776.0, + "grad_norm": 0.06833018750237568, + "language_loss": 0.81347001, + "learning_rate": 0.0006209139246272164, + "loss": 0.82440132, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.28222656, + "step": 2285, + "time_per_iteration": 2.997727394104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085597, + "balance_loss_mlp": 1.0573678, + "epoch": 0.4397845325125048, + "flos": 487403080704.0, + "grad_norm": 0.0627571888999813, + "language_loss": 0.81454128, + "learning_rate": 0.0006206116056997421, + "loss": 0.82539719, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.28271484, + "step": 2286, + "time_per_iteration": 2.5523786544799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092851, + "balance_loss_mlp": 1.06512272, + "epoch": 0.43997691419776835, + "flos": 480569020416.0, + "grad_norm": 0.0569936252584843, + "language_loss": 0.82580131, + "learning_rate": 0.0006203092399466892, + "loss": 0.83672982, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.27783203, + "step": 2287, + "time_per_iteration": 2.5256903171539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080971, + "balance_loss_mlp": 1.05317175, + "epoch": 0.44016929588303194, + "flos": 482873702400.0, + "grad_norm": 0.052620788715243595, + "language_loss": 0.85130596, + "learning_rate": 0.0006200068274854473, + "loss": 0.86211562, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.27832031, + "step": 2288, + "time_per_iteration": 2.6666431427001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089786, + "balance_loss_mlp": 1.06108057, + "epoch": 0.4403616775682955, + "flos": 571562155008.0, + "grad_norm": 0.05493211856459023, + "language_loss": 0.85969126, + "learning_rate": 0.0006197043684334229, + "loss": 0.87058908, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.28686523, + "step": 2289, + "time_per_iteration": 2.7558815479278564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093604, + "balance_loss_mlp": 1.0652802, + "epoch": 0.44055405925355906, + "flos": 630563194368.0, + "grad_norm": 0.06713172204070075, + "language_loss": 0.7966578, + "learning_rate": 0.0006194018629080411, + "loss": 0.80759388, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.28344727, + "step": 2290, + "time_per_iteration": 2.7641310691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095567, + "balance_loss_mlp": 1.06721866, + "epoch": 0.44074644093882265, + "flos": 536523291648.0, + "grad_norm": 0.06308142018549157, + "language_loss": 0.81759441, + "learning_rate": 0.0006190993110267451, + "loss": 0.8285501, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.28393555, + "step": 2291, + "time_per_iteration": 2.759451389312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087327, + "balance_loss_mlp": 1.05959892, + "epoch": 0.4409388226240862, + "flos": 462995744256.0, + "grad_norm": 0.0663089643389441, + "language_loss": 0.84395695, + "learning_rate": 0.0006187967129069958, + "loss": 0.85483021, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.27758789, + "step": 2292, + "time_per_iteration": 2.5458216667175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108768, + "balance_loss_mlp": 1.06011844, + "epoch": 0.44113120430934977, + "flos": 565717492224.0, + "grad_norm": 0.05260179709926624, + "language_loss": 0.8707509, + "learning_rate": 0.0006184940686662722, + "loss": 0.88162768, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.27612305, + "step": 2293, + "time_per_iteration": 2.7694880962371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082669, + "balance_loss_mlp": 1.05494058, + "epoch": 0.4413235859946133, + "flos": 543313681920.0, + "grad_norm": 0.055518519655343164, + "language_loss": 0.90020764, + "learning_rate": 0.0006181913784220714, + "loss": 0.91103435, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.27758789, + "step": 2294, + "time_per_iteration": 2.6642205715179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047446, + "balance_loss_mlp": 1.03542924, + "epoch": 0.4415159676798769, + "flos": 1569016465920.0, + "grad_norm": 0.024577707308588242, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81601226, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.12011719, + "step": 2295, + "time_per_iteration": 4.874637842178345 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084239, + "balance_loss_mlp": 1.05665421, + "epoch": 0.4417083493651404, + "flos": 658423171584.0, + "grad_norm": 0.06513424306559527, + "language_loss": 0.79833972, + "learning_rate": 0.0006175858603933146, + "loss": 0.80918217, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.27612305, + "step": 2296, + "time_per_iteration": 2.9130241870880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084408, + "balance_loss_mlp": 1.05665636, + "epoch": 0.441900731050404, + "flos": 740119002624.0, + "grad_norm": 0.06251545633736988, + "language_loss": 0.80774343, + "learning_rate": 0.0006172830328438416, + "loss": 0.81858754, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.27783203, + "step": 2297, + "time_per_iteration": 2.953983783721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082645, + "balance_loss_mlp": 1.05460715, + "epoch": 0.44209311273566754, + "flos": 539153860608.0, + "grad_norm": 0.057534365085963636, + "language_loss": 0.86889625, + "learning_rate": 0.0006169801597610572, + "loss": 0.87972271, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.28051758, + "step": 2298, + "time_per_iteration": 2.7841529846191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087234, + "balance_loss_mlp": 1.05986333, + "epoch": 0.4422854944209311, + "flos": 621335702016.0, + "grad_norm": 0.0717755554401909, + "language_loss": 0.89631718, + "learning_rate": 0.0006166772412625469, + "loss": 0.90718955, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.27416992, + "step": 2299, + "time_per_iteration": 2.7750232219696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087463, + "balance_loss_mlp": 1.05983019, + "epoch": 0.4424778761061947, + "flos": 658516303872.0, + "grad_norm": 0.06473860012868299, + "language_loss": 0.81551421, + "learning_rate": 0.0006163742774659141, + "loss": 0.82638884, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.27661133, + "step": 2300, + "time_per_iteration": 2.8384482860565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092146, + "balance_loss_mlp": 1.06446528, + "epoch": 0.44267025779145824, + "flos": 568297188864.0, + "grad_norm": 0.0850959758091444, + "language_loss": 0.85627389, + "learning_rate": 0.0006160712684887801, + "loss": 0.86719531, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.27709961, + "step": 2301, + "time_per_iteration": 2.7603278160095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084767, + "balance_loss_mlp": 1.05813527, + "epoch": 0.44286263947672183, + "flos": 496469039616.0, + "grad_norm": 0.053898588417471735, + "language_loss": 0.81867981, + "learning_rate": 0.0006157682144487832, + "loss": 0.82952744, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.2668457, + "step": 2302, + "time_per_iteration": 2.7585275173187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090771, + "balance_loss_mlp": 1.06347191, + "epoch": 0.44305502116198536, + "flos": 609096347136.0, + "grad_norm": 0.05970343490953875, + "language_loss": 0.82821, + "learning_rate": 0.0006154651154635793, + "loss": 0.83911771, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.2734375, + "step": 2303, + "time_per_iteration": 4.252831697463989 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097367, + "balance_loss_mlp": 1.07040215, + "epoch": 0.44324740284724895, + "flos": 470558744064.0, + "grad_norm": 0.05697892496442649, + "language_loss": 0.8468399, + "learning_rate": 0.0006151619716508421, + "loss": 0.85781354, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.27026367, + "step": 2304, + "time_per_iteration": 2.5882937908172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102202, + "balance_loss_mlp": 1.07442617, + "epoch": 0.4434397845325125, + "flos": 578454441984.0, + "grad_norm": 0.06572201075979017, + "language_loss": 0.86751652, + "learning_rate": 0.0006148587831282625, + "loss": 0.87853855, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.27807617, + "step": 2305, + "time_per_iteration": 2.6605563163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052734, + "balance_loss_mlp": 1.04066956, + "epoch": 0.44363216621777607, + "flos": 1495765343232.0, + "grad_norm": 0.01894914693526954, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.802288, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.12060547, + "step": 2306, + "time_per_iteration": 4.910472631454468 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102108, + "balance_loss_mlp": 1.07342601, + "epoch": 0.44382454790303966, + "flos": 477082884096.0, + "grad_norm": 0.06457533715620843, + "language_loss": 0.87372738, + "learning_rate": 0.0006142522724244255, + "loss": 0.88474846, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.28686523, + "step": 2307, + "time_per_iteration": 2.5184578895568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047197, + "balance_loss_mlp": 1.03508484, + "epoch": 0.4440169295883032, + "flos": 1543321548288.0, + "grad_norm": 0.015440750347127817, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.7753191, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.12109375, + "step": 2308, + "time_per_iteration": 4.880531549453735 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104605, + "balance_loss_mlp": 1.07668638, + "epoch": 0.4442093112735668, + "flos": 590789749248.0, + "grad_norm": 0.0625118895390298, + "language_loss": 0.77304882, + "learning_rate": 0.000613645584293942, + "loss": 0.78409487, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.27954102, + "step": 2309, + "time_per_iteration": 2.888929605484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102667, + "balance_loss_mlp": 1.07522511, + "epoch": 0.4444016929588303, + "flos": 530009326080.0, + "grad_norm": 0.05626484670913178, + "language_loss": 0.82863319, + "learning_rate": 0.0006133421739881185, + "loss": 0.83965981, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.27441406, + "step": 2310, + "time_per_iteration": 2.6770823001861572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098373, + "balance_loss_mlp": 1.06966734, + "epoch": 0.4445940746440939, + "flos": 619947634176.0, + "grad_norm": 0.09114290921538859, + "language_loss": 0.82713985, + "learning_rate": 0.0006130387196789605, + "loss": 0.83812356, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.28686523, + "step": 2311, + "time_per_iteration": 2.7363758087158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110237, + "balance_loss_mlp": 1.07309198, + "epoch": 0.4447864563293574, + "flos": 628782248448.0, + "grad_norm": 0.05056880651601303, + "language_loss": 0.84359384, + "learning_rate": 0.0006127352214842795, + "loss": 0.85461748, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.29272461, + "step": 2312, + "time_per_iteration": 3.0277068614959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095618, + "balance_loss_mlp": 1.06688845, + "epoch": 0.444978838014621, + "flos": 650548841472.0, + "grad_norm": 0.06767648502511064, + "language_loss": 0.85424733, + "learning_rate": 0.0006124316795219041, + "loss": 0.8652035, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.28710938, + "step": 2313, + "time_per_iteration": 2.7824032306671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087884, + "balance_loss_mlp": 1.05996561, + "epoch": 0.44517121969988455, + "flos": 612153289728.0, + "grad_norm": 0.06031488841862457, + "language_loss": 0.8232829, + "learning_rate": 0.0006121280939096794, + "loss": 0.83416176, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.27905273, + "step": 2314, + "time_per_iteration": 2.7414164543151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087621, + "balance_loss_mlp": 1.05901051, + "epoch": 0.44536360138514813, + "flos": 488491402752.0, + "grad_norm": 0.056993316738708576, + "language_loss": 0.8765316, + "learning_rate": 0.000611824464765468, + "loss": 0.88740778, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.28613281, + "step": 2315, + "time_per_iteration": 2.5894503593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020326, + "balance_loss_mlp": 1.00830936, + "epoch": 0.4455559830704117, + "flos": 1515425255424.0, + "grad_norm": 0.018109298143921163, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79615265, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.12011719, + "step": 2316, + "time_per_iteration": 4.654959201812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081165, + "balance_loss_mlp": 1.05322254, + "epoch": 0.44574836475567525, + "flos": 615314949120.0, + "grad_norm": 0.05658516719934989, + "language_loss": 0.85440743, + "learning_rate": 0.000611217076352619, + "loss": 0.86521906, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.27978516, + "step": 2317, + "time_per_iteration": 2.8710198402404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086137, + "balance_loss_mlp": 1.05862343, + "epoch": 0.44594074644093884, + "flos": 506070471168.0, + "grad_norm": 0.062250172980488426, + "language_loss": 0.82876933, + "learning_rate": 0.0006109133173197905, + "loss": 0.8396306, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.27539062, + "step": 2318, + "time_per_iteration": 2.7298824787139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088304, + "balance_loss_mlp": 1.05986071, + "epoch": 0.44613312812620237, + "flos": 726647321088.0, + "grad_norm": 0.0706297628000491, + "language_loss": 0.85633492, + "learning_rate": 0.0006106095152265935, + "loss": 0.8672179, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.28466797, + "step": 2319, + "time_per_iteration": 2.8895695209503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108895, + "balance_loss_mlp": 1.06086433, + "epoch": 0.44632550981146596, + "flos": 635419869696.0, + "grad_norm": 0.04876785494191262, + "language_loss": 0.84747481, + "learning_rate": 0.0006103056701909739, + "loss": 0.85836434, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.28125, + "step": 2320, + "time_per_iteration": 2.9117228984832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108858, + "balance_loss_mlp": 1.05935025, + "epoch": 0.4465178914967295, + "flos": 826690447872.0, + "grad_norm": 0.06765559983355682, + "language_loss": 0.82841372, + "learning_rate": 0.0006100017823308956, + "loss": 0.8392995, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.29199219, + "step": 2321, + "time_per_iteration": 3.19189453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095794, + "balance_loss_mlp": 1.06618226, + "epoch": 0.4467102731819931, + "flos": 665532246528.0, + "grad_norm": 0.07493928757304909, + "language_loss": 0.796121, + "learning_rate": 0.0006096978517643377, + "loss": 0.80707896, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.29589844, + "step": 2322, + "time_per_iteration": 2.7803642749786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088319, + "balance_loss_mlp": 1.05825448, + "epoch": 0.4469026548672566, + "flos": 512692125696.0, + "grad_norm": 0.05979787162997368, + "language_loss": 0.83128643, + "learning_rate": 0.0006093938786092968, + "loss": 0.84216964, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.30029297, + "step": 2323, + "time_per_iteration": 2.6324985027313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084456, + "balance_loss_mlp": 1.05403399, + "epoch": 0.4470950365525202, + "flos": 683774825472.0, + "grad_norm": 0.0696967897289199, + "language_loss": 0.89752465, + "learning_rate": 0.0006090898629837857, + "loss": 0.90836924, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.30395508, + "step": 2324, + "time_per_iteration": 2.833986282348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081367, + "balance_loss_mlp": 1.05073011, + "epoch": 0.4472874182377838, + "flos": 627018831360.0, + "grad_norm": 0.05715713314103227, + "language_loss": 0.87296605, + "learning_rate": 0.0006087858050058337, + "loss": 0.88377976, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.3059082, + "step": 2325, + "time_per_iteration": 2.8220982551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082075, + "balance_loss_mlp": 1.05084252, + "epoch": 0.4474797999230473, + "flos": 546946795008.0, + "grad_norm": 0.06405768205874736, + "language_loss": 0.82704103, + "learning_rate": 0.0006084817047934866, + "loss": 0.83786178, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.31225586, + "step": 2326, + "time_per_iteration": 2.6844918727874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077775, + "balance_loss_mlp": 1.04635119, + "epoch": 0.4476721816083109, + "flos": 455585513472.0, + "grad_norm": 0.06718825176833507, + "language_loss": 0.89515507, + "learning_rate": 0.0006081775624648066, + "loss": 0.90593284, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.31396484, + "step": 2327, + "time_per_iteration": 2.5115904808044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080341, + "balance_loss_mlp": 1.04937041, + "epoch": 0.44786456329357444, + "flos": 481273228800.0, + "grad_norm": 0.06388622036462539, + "language_loss": 0.82659936, + "learning_rate": 0.0006078733781378721, + "loss": 0.83740276, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.30957031, + "step": 2328, + "time_per_iteration": 2.5578174591064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107055, + "balance_loss_mlp": 1.04003251, + "epoch": 0.448056944978838, + "flos": 551822409216.0, + "grad_norm": 0.05909371510774122, + "language_loss": 0.82426572, + "learning_rate": 0.0006075691519307781, + "loss": 0.83497119, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.3046875, + "step": 2329, + "time_per_iteration": 2.9271137714385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071371, + "balance_loss_mlp": 1.04025745, + "epoch": 0.44824932666410156, + "flos": 550571143680.0, + "grad_norm": 0.0899878860138525, + "language_loss": 0.81604564, + "learning_rate": 0.0006072648839616356, + "loss": 0.8267594, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.31103516, + "step": 2330, + "time_per_iteration": 2.642164945602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069213, + "balance_loss_mlp": 1.03805184, + "epoch": 0.44844170834936514, + "flos": 988161541632.0, + "grad_norm": 0.05660389796161562, + "language_loss": 0.82544589, + "learning_rate": 0.0006069605743485718, + "loss": 0.83613807, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.3112793, + "step": 2331, + "time_per_iteration": 3.3559155464172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107864, + "balance_loss_mlp": 1.04945791, + "epoch": 0.44863409003462873, + "flos": 591040032768.0, + "grad_norm": 0.06166347857347268, + "language_loss": 0.83528912, + "learning_rate": 0.0006066562232097303, + "loss": 0.84607553, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.29125977, + "step": 2332, + "time_per_iteration": 2.7531135082244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107678, + "balance_loss_mlp": 1.0468111, + "epoch": 0.44882647171989226, + "flos": 724313525760.0, + "grad_norm": 0.0526351904833897, + "language_loss": 0.86127633, + "learning_rate": 0.0006063518306632708, + "loss": 0.87204421, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.29907227, + "step": 2333, + "time_per_iteration": 2.957057476043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080344, + "balance_loss_mlp": 1.05044627, + "epoch": 0.44901885340515585, + "flos": 534662360064.0, + "grad_norm": 0.07121293699241546, + "language_loss": 0.82098341, + "learning_rate": 0.0006060473968273688, + "loss": 0.83178687, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.29882812, + "step": 2334, + "time_per_iteration": 2.687427043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050724, + "balance_loss_mlp": 1.03756309, + "epoch": 0.4492112350904194, + "flos": 1554456462336.0, + "grad_norm": 0.03308553204338399, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.78930265, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.13183594, + "step": 2335, + "time_per_iteration": 4.873494625091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027356, + "balance_loss_mlp": 1.01476717, + "epoch": 0.44940361677568297, + "flos": 1522525413888.0, + "grad_norm": 0.020404135430742085, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82032573, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.12597656, + "step": 2336, + "time_per_iteration": 4.8493242263793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091959, + "balance_loss_mlp": 1.06327689, + "epoch": 0.4495959984609465, + "flos": 382289310720.0, + "grad_norm": 0.08823378464345366, + "language_loss": 0.8815735, + "learning_rate": 0.0006051338487650047, + "loss": 0.89249313, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.28686523, + "step": 2337, + "time_per_iteration": 2.4994585514068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094323, + "balance_loss_mlp": 1.06595135, + "epoch": 0.4497883801462101, + "flos": 497630145024.0, + "grad_norm": 0.058014135330130424, + "language_loss": 0.82146972, + "learning_rate": 0.0006048292509534095, + "loss": 0.83241296, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.28344727, + "step": 2338, + "time_per_iteration": 2.6184592247009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099625, + "balance_loss_mlp": 1.07211113, + "epoch": 0.4499807618314736, + "flos": 614166990336.0, + "grad_norm": 0.056454767026620875, + "language_loss": 0.77617335, + "learning_rate": 0.0006045246124434895, + "loss": 0.78716958, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.27539062, + "step": 2339, + "time_per_iteration": 2.7225115299224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100795, + "balance_loss_mlp": 1.07309031, + "epoch": 0.4501731435167372, + "flos": 1005122331648.0, + "grad_norm": 0.09896135571333878, + "language_loss": 0.86173731, + "learning_rate": 0.0006042199333535162, + "loss": 0.87274528, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.27709961, + "step": 2340, + "time_per_iteration": 3.274585008621216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104864, + "balance_loss_mlp": 1.07768369, + "epoch": 0.4503655252020008, + "flos": 820519898112.0, + "grad_norm": 0.05749680267159243, + "language_loss": 0.84251344, + "learning_rate": 0.0006039152138017763, + "loss": 0.85356206, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.27246094, + "step": 2341, + "time_per_iteration": 3.060763359069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105096, + "balance_loss_mlp": 1.07796395, + "epoch": 0.4505579068872643, + "flos": 486113937408.0, + "grad_norm": 0.056134576893582644, + "language_loss": 0.83558077, + "learning_rate": 0.0006036104539065726, + "loss": 0.84663171, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.27172852, + "step": 2342, + "time_per_iteration": 2.7406816482543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108201, + "balance_loss_mlp": 1.08054459, + "epoch": 0.4507502885725279, + "flos": 884421282816.0, + "grad_norm": 0.061859527889038764, + "language_loss": 0.84472108, + "learning_rate": 0.000603305653786223, + "loss": 0.85580313, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.27685547, + "step": 2343, + "time_per_iteration": 3.197312355041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100913, + "balance_loss_mlp": 1.07354283, + "epoch": 0.45094267025779144, + "flos": 578070328320.0, + "grad_norm": 0.054371913691722666, + "language_loss": 0.83979696, + "learning_rate": 0.0006030008135590622, + "loss": 0.85080612, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.27416992, + "step": 2344, + "time_per_iteration": 2.7234296798706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097762, + "balance_loss_mlp": 1.07062995, + "epoch": 0.45113505194305503, + "flos": 525124947456.0, + "grad_norm": 0.05301123134364682, + "language_loss": 0.8020395, + "learning_rate": 0.0006026959333434387, + "loss": 0.81301707, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.27172852, + "step": 2345, + "time_per_iteration": 2.7582781314849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109947, + "balance_loss_mlp": 1.0720278, + "epoch": 0.45132743362831856, + "flos": 501791376384.0, + "grad_norm": 0.056237590740745906, + "language_loss": 0.77273649, + "learning_rate": 0.0006023910132577181, + "loss": 0.78373116, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.2746582, + "step": 2346, + "time_per_iteration": 2.658339262008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086046, + "balance_loss_mlp": 1.05915189, + "epoch": 0.45151981531358215, + "flos": 431690328576.0, + "grad_norm": 0.061957652789735564, + "language_loss": 0.84835315, + "learning_rate": 0.0006020860534202806, + "loss": 0.85921359, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.26953125, + "step": 2347, + "time_per_iteration": 2.5046098232269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010926, + "balance_loss_mlp": 1.06475294, + "epoch": 0.4517121969988457, + "flos": 711826859520.0, + "grad_norm": 0.05205934628014934, + "language_loss": 0.80817962, + "learning_rate": 0.0006017810539495224, + "loss": 0.81910563, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.27905273, + "step": 2348, + "time_per_iteration": 2.9269816875457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094292, + "balance_loss_mlp": 1.06642056, + "epoch": 0.45190457868410927, + "flos": 579197938176.0, + "grad_norm": 0.0701488599790333, + "language_loss": 0.82789373, + "learning_rate": 0.0006014760149638547, + "loss": 0.83883661, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.27880859, + "step": 2349, + "time_per_iteration": 2.725395441055298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085621, + "balance_loss_mlp": 1.05837011, + "epoch": 0.45209696036937286, + "flos": 482415395328.0, + "grad_norm": 0.05676126010630497, + "language_loss": 0.88258755, + "learning_rate": 0.000601170936581704, + "loss": 0.89344376, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.27270508, + "step": 2350, + "time_per_iteration": 2.5604915618896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088839, + "balance_loss_mlp": 1.06101537, + "epoch": 0.4522893420546364, + "flos": 539945409024.0, + "grad_norm": 0.07551987134141444, + "language_loss": 0.84626472, + "learning_rate": 0.0006008658189215121, + "loss": 0.85715318, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.27832031, + "step": 2351, + "time_per_iteration": 2.6299045085906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100125, + "balance_loss_mlp": 1.07158601, + "epoch": 0.4524817237399, + "flos": 496423959552.0, + "grad_norm": 0.07553479525673996, + "language_loss": 0.79898262, + "learning_rate": 0.0006005606621017366, + "loss": 0.80998385, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.28540039, + "step": 2352, + "time_per_iteration": 2.58725905418396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095957, + "balance_loss_mlp": 1.06732249, + "epoch": 0.4526741054251635, + "flos": 652229300736.0, + "grad_norm": 0.05769795994016392, + "language_loss": 0.8022939, + "learning_rate": 0.0006002554662408496, + "loss": 0.81325346, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.28637695, + "step": 2353, + "time_per_iteration": 2.9054527282714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089231, + "balance_loss_mlp": 1.06078792, + "epoch": 0.4528664871104271, + "flos": 570674654208.0, + "grad_norm": 0.07238968138349489, + "language_loss": 0.91292691, + "learning_rate": 0.0005999502314573388, + "loss": 0.92381918, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.28393555, + "step": 2354, + "time_per_iteration": 2.6389734745025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085728, + "balance_loss_mlp": 1.05656958, + "epoch": 0.45305886879569063, + "flos": 458480922624.0, + "grad_norm": 0.0719451372015111, + "language_loss": 0.86045247, + "learning_rate": 0.0005996449578697066, + "loss": 0.87130976, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.29174805, + "step": 2355, + "time_per_iteration": 2.6851072311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094315, + "balance_loss_mlp": 1.06634867, + "epoch": 0.4532512504809542, + "flos": 504922512384.0, + "grad_norm": 0.05612545408526447, + "language_loss": 0.81111002, + "learning_rate": 0.0005993396455964709, + "loss": 0.82205319, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.2800293, + "step": 2356, + "time_per_iteration": 2.6760780811309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095343, + "balance_loss_mlp": 1.06754375, + "epoch": 0.4534436321662178, + "flos": 581940578304.0, + "grad_norm": 0.05702970789361519, + "language_loss": 0.81782162, + "learning_rate": 0.0005990342947561647, + "loss": 0.82877505, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.27856445, + "step": 2357, + "time_per_iteration": 2.6935572624206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108513, + "balance_loss_mlp": 1.07949746, + "epoch": 0.45363601385148133, + "flos": 549458090496.0, + "grad_norm": 0.06168719534303639, + "language_loss": 0.77822679, + "learning_rate": 0.0005987289054673351, + "loss": 0.78931195, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.28979492, + "step": 2358, + "time_per_iteration": 2.6254196166992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191784, + "balance_loss_mlp": 1.18038785, + "epoch": 0.4538283955367449, + "flos": 1473754411008.0, + "grad_norm": 0.06020491976481073, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77767521, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.11376953, + "step": 2359, + "time_per_iteration": 4.803730010986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112502, + "balance_loss_mlp": 1.08300948, + "epoch": 0.45402077722200845, + "flos": 584441699328.0, + "grad_norm": 0.06904936924963041, + "language_loss": 0.90802431, + "learning_rate": 0.0005981180120183722, + "loss": 0.91914928, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.29443359, + "step": 2360, + "time_per_iteration": 2.672501564025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115124, + "balance_loss_mlp": 1.08560812, + "epoch": 0.45421315890727204, + "flos": 531462822912.0, + "grad_norm": 0.18994365983189826, + "language_loss": 0.85107553, + "learning_rate": 0.0005978125080954089, + "loss": 0.86222672, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.29492188, + "step": 2361, + "time_per_iteration": 2.7426631450653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111841, + "balance_loss_mlp": 1.0814904, + "epoch": 0.4544055405925356, + "flos": 784890307584.0, + "grad_norm": 0.07946717837388541, + "language_loss": 0.76933616, + "learning_rate": 0.000597506966198262, + "loss": 0.78045452, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.30297852, + "step": 2362, + "time_per_iteration": 2.9498252868652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113617, + "balance_loss_mlp": 1.08438706, + "epoch": 0.45459792227779916, + "flos": 517950443520.0, + "grad_norm": 0.08220053414262748, + "language_loss": 0.83964276, + "learning_rate": 0.0005972013864455536, + "loss": 0.85077894, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.29199219, + "step": 2363, + "time_per_iteration": 2.623084545135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112763, + "balance_loss_mlp": 1.0844152, + "epoch": 0.4547903039630627, + "flos": 537306075648.0, + "grad_norm": 0.07689777421943021, + "language_loss": 0.84891784, + "learning_rate": 0.0005968957689559203, + "loss": 0.86004549, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.28369141, + "step": 2364, + "time_per_iteration": 4.15172266960144 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103083, + "balance_loss_mlp": 1.07492638, + "epoch": 0.4549826856483263, + "flos": 528423409152.0, + "grad_norm": 0.0791653109712497, + "language_loss": 0.88481373, + "learning_rate": 0.0005965901138480131, + "loss": 0.89584458, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.28173828, + "step": 2365, + "time_per_iteration": 2.5800631046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097109, + "balance_loss_mlp": 1.06840384, + "epoch": 0.45517506733358987, + "flos": 520649413632.0, + "grad_norm": 0.06578783357270249, + "language_loss": 0.87197572, + "learning_rate": 0.0005962844212404982, + "loss": 0.88294685, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.28686523, + "step": 2366, + "time_per_iteration": 2.6940040588378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091654, + "balance_loss_mlp": 1.06344962, + "epoch": 0.4553674490188534, + "flos": 450814616064.0, + "grad_norm": 0.05998271622094208, + "language_loss": 0.86890531, + "learning_rate": 0.0005959786912520558, + "loss": 0.87982178, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.2824707, + "step": 2367, + "time_per_iteration": 2.65913987159729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096727, + "balance_loss_mlp": 1.06854558, + "epoch": 0.455559830704117, + "flos": 546308015616.0, + "grad_norm": 0.04792571197867491, + "language_loss": 0.83765805, + "learning_rate": 0.0005956729240013806, + "loss": 0.8486253, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.28173828, + "step": 2368, + "time_per_iteration": 2.8546009063720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108894, + "balance_loss_mlp": 1.08035553, + "epoch": 0.4557522123893805, + "flos": 583491589632.0, + "grad_norm": 0.054790339147135006, + "language_loss": 0.91898453, + "learning_rate": 0.0005953671196071824, + "loss": 0.93007344, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.28540039, + "step": 2369, + "time_per_iteration": 2.7034096717834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115288, + "balance_loss_mlp": 1.08767939, + "epoch": 0.4559445940746441, + "flos": 526149250560.0, + "grad_norm": 0.05736115779957956, + "language_loss": 0.79610699, + "learning_rate": 0.0005950612781881846, + "loss": 0.8072598, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.27636719, + "step": 2370, + "time_per_iteration": 2.707674264907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124856, + "balance_loss_mlp": 1.09662771, + "epoch": 0.45613697575990764, + "flos": 651810281472.0, + "grad_norm": 0.08139155344435882, + "language_loss": 0.75630575, + "learning_rate": 0.0005947553998631259, + "loss": 0.76755428, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.2824707, + "step": 2371, + "time_per_iteration": 2.8811731338500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125619, + "balance_loss_mlp": 1.09770048, + "epoch": 0.4563293574451712, + "flos": 866744699904.0, + "grad_norm": 0.07117752980456016, + "language_loss": 0.79090154, + "learning_rate": 0.000594449484750758, + "loss": 0.80215776, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.27905273, + "step": 2372, + "time_per_iteration": 3.1549901962280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116997, + "balance_loss_mlp": 1.08807683, + "epoch": 0.45652173913043476, + "flos": 497817819648.0, + "grad_norm": 0.061849801440599636, + "language_loss": 0.82697588, + "learning_rate": 0.0005941435329698484, + "loss": 0.83814585, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.2890625, + "step": 2373, + "time_per_iteration": 2.6593072414398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118584, + "balance_loss_mlp": 1.09054554, + "epoch": 0.45671412081569834, + "flos": 560581420032.0, + "grad_norm": 0.06278217801879041, + "language_loss": 0.83130741, + "learning_rate": 0.0005938375446391778, + "loss": 0.8424933, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.28051758, + "step": 2374, + "time_per_iteration": 2.7434608936309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124198, + "balance_loss_mlp": 1.09563541, + "epoch": 0.45690650250096193, + "flos": 502873906176.0, + "grad_norm": 0.06820583935841042, + "language_loss": 0.89043015, + "learning_rate": 0.0005935315198775415, + "loss": 0.90167212, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.28540039, + "step": 2375, + "time_per_iteration": 2.6057205200195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113332, + "balance_loss_mlp": 1.08558059, + "epoch": 0.45709888418622546, + "flos": 430473968640.0, + "grad_norm": 0.07601718344596131, + "language_loss": 0.87262166, + "learning_rate": 0.0005932254588037486, + "loss": 0.88375497, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.27783203, + "step": 2376, + "time_per_iteration": 2.4881751537323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103499, + "balance_loss_mlp": 1.07462692, + "epoch": 0.45729126587148905, + "flos": 525395579904.0, + "grad_norm": 0.07182864232109534, + "language_loss": 0.86405516, + "learning_rate": 0.000592919361536623, + "loss": 0.87509012, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.28857422, + "step": 2377, + "time_per_iteration": 2.6453545093536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101831, + "balance_loss_mlp": 1.07376885, + "epoch": 0.4574836475567526, + "flos": 637717349376.0, + "grad_norm": 0.06032083182665244, + "language_loss": 0.88920552, + "learning_rate": 0.0005926132281950017, + "loss": 0.90022385, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.28076172, + "step": 2378, + "time_per_iteration": 2.7356886863708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096599, + "balance_loss_mlp": 1.0672735, + "epoch": 0.45767602924201617, + "flos": 649288811520.0, + "grad_norm": 0.07556174313152972, + "language_loss": 0.8485238, + "learning_rate": 0.0005923070588977367, + "loss": 0.8594898, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.29248047, + "step": 2379, + "time_per_iteration": 2.812110185623169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095202, + "balance_loss_mlp": 1.0665921, + "epoch": 0.4578684109272797, + "flos": 746356543488.0, + "grad_norm": 0.0597594421207511, + "language_loss": 0.86065739, + "learning_rate": 0.0005920008537636931, + "loss": 0.87160945, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.28613281, + "step": 2380, + "time_per_iteration": 2.8955793380737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094751, + "balance_loss_mlp": 1.06518722, + "epoch": 0.4580607926125433, + "flos": 641155433472.0, + "grad_norm": 0.08202954174104495, + "language_loss": 0.86535549, + "learning_rate": 0.0005916946129117504, + "loss": 0.87630302, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.29516602, + "step": 2381, + "time_per_iteration": 2.8850152492523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089485, + "balance_loss_mlp": 1.05958724, + "epoch": 0.4582531742978069, + "flos": 801513474048.0, + "grad_norm": 0.06022733145419036, + "language_loss": 0.80483937, + "learning_rate": 0.0005913883364608017, + "loss": 0.81573421, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.29833984, + "step": 2382, + "time_per_iteration": 3.0977792739868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092347, + "balance_loss_mlp": 1.06225872, + "epoch": 0.4584455559830704, + "flos": 683991613440.0, + "grad_norm": 0.07912283694355432, + "language_loss": 0.88849449, + "learning_rate": 0.0005910820245297542, + "loss": 0.899418, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.30053711, + "step": 2383, + "time_per_iteration": 2.905977964401245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081098, + "balance_loss_mlp": 1.05055714, + "epoch": 0.458637937668334, + "flos": 517902391296.0, + "grad_norm": 0.06971122212551431, + "language_loss": 0.810808, + "learning_rate": 0.000590775677237529, + "loss": 0.82161897, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.30517578, + "step": 2384, + "time_per_iteration": 2.7233986854553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078055, + "balance_loss_mlp": 1.04810929, + "epoch": 0.4588303193535975, + "flos": 505242607104.0, + "grad_norm": 0.10145803635005178, + "language_loss": 0.79860461, + "learning_rate": 0.0005904692947030601, + "loss": 0.80938518, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.29882812, + "step": 2385, + "time_per_iteration": 2.602890729904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076489, + "balance_loss_mlp": 1.04647207, + "epoch": 0.4590227010388611, + "flos": 495655732224.0, + "grad_norm": 0.08299143875661358, + "language_loss": 0.89372921, + "learning_rate": 0.0005901628770452963, + "loss": 0.90449417, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.29956055, + "step": 2386, + "time_per_iteration": 2.56011700630188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075782, + "balance_loss_mlp": 1.04586029, + "epoch": 0.45921508272412465, + "flos": 493375781376.0, + "grad_norm": 0.05953614440228025, + "language_loss": 0.87499726, + "learning_rate": 0.000589856424383199, + "loss": 0.88575506, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.29882812, + "step": 2387, + "time_per_iteration": 2.622857093811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078237, + "balance_loss_mlp": 1.04762435, + "epoch": 0.45940746440938823, + "flos": 691096306176.0, + "grad_norm": 0.06461384040637212, + "language_loss": 0.8283028, + "learning_rate": 0.000589549936835744, + "loss": 0.83908516, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.30566406, + "step": 2388, + "time_per_iteration": 2.9280176162719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082083, + "balance_loss_mlp": 1.0514698, + "epoch": 0.45959984609465176, + "flos": 503489364480.0, + "grad_norm": 0.07025219360641571, + "language_loss": 0.79160953, + "learning_rate": 0.0005892434145219202, + "loss": 0.80243033, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.30566406, + "step": 2389, + "time_per_iteration": 2.632772207260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081464, + "balance_loss_mlp": 1.050946, + "epoch": 0.45979222777991535, + "flos": 676339863552.0, + "grad_norm": 0.060348492919292666, + "language_loss": 0.82535923, + "learning_rate": 0.0005889368575607303, + "loss": 0.83617389, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.3046875, + "step": 2390, + "time_per_iteration": 2.815487861633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094579, + "balance_loss_mlp": 1.06358492, + "epoch": 0.45998460946517894, + "flos": 777308368896.0, + "grad_norm": 0.05491617941274289, + "language_loss": 0.78348118, + "learning_rate": 0.00058863026607119, + "loss": 0.79442704, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.30957031, + "step": 2391, + "time_per_iteration": 3.0853166580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092193, + "balance_loss_mlp": 1.0620811, + "epoch": 0.46017699115044247, + "flos": 851073053184.0, + "grad_norm": 0.05825671270919626, + "language_loss": 0.79661655, + "learning_rate": 0.0005883236401723287, + "loss": 0.80753851, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.30078125, + "step": 2392, + "time_per_iteration": 3.1643104553222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096169, + "balance_loss_mlp": 1.06536531, + "epoch": 0.46036937283570606, + "flos": 575608495104.0, + "grad_norm": 0.06457998167472197, + "language_loss": 0.84046978, + "learning_rate": 0.0005880169799831893, + "loss": 0.85143149, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.30761719, + "step": 2393, + "time_per_iteration": 2.6935391426086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096173, + "balance_loss_mlp": 1.0654645, + "epoch": 0.4605617545209696, + "flos": 611553798144.0, + "grad_norm": 0.06354744392782355, + "language_loss": 0.81838334, + "learning_rate": 0.0005877102856228278, + "loss": 0.82934511, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.30664062, + "step": 2394, + "time_per_iteration": 2.8314805030822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097821, + "balance_loss_mlp": 1.06713629, + "epoch": 0.4607541362062332, + "flos": 532884386304.0, + "grad_norm": 0.0665210460005036, + "language_loss": 0.84696203, + "learning_rate": 0.0005874035572103133, + "loss": 0.8579402, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.30664062, + "step": 2395, + "time_per_iteration": 2.6893725395202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098408, + "balance_loss_mlp": 1.0673902, + "epoch": 0.4609465178914967, + "flos": 647023417344.0, + "grad_norm": 0.1082823786036068, + "language_loss": 0.82554322, + "learning_rate": 0.0005870967948647288, + "loss": 0.83652729, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.30981445, + "step": 2396, + "time_per_iteration": 2.7625200748443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191183, + "balance_loss_mlp": 1.1745894, + "epoch": 0.4611388995767603, + "flos": 1465487202816.0, + "grad_norm": 0.05861502253959749, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75499487, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.16601562, + "step": 2397, + "time_per_iteration": 5.363407850265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090965, + "balance_loss_mlp": 1.06028056, + "epoch": 0.46133128126202383, + "flos": 722772688896.0, + "grad_norm": 0.08876233940236913, + "language_loss": 0.85477209, + "learning_rate": 0.0005864831688507443, + "loss": 0.86568171, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.30639648, + "step": 2398, + "time_per_iteration": 2.9619805812835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081398, + "balance_loss_mlp": 1.05119061, + "epoch": 0.4615236629472874, + "flos": 547735371264.0, + "grad_norm": 0.06931834879873142, + "language_loss": 0.75342947, + "learning_rate": 0.0005861763054205754, + "loss": 0.76424348, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.30151367, + "step": 2399, + "time_per_iteration": 2.7531988620758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091818, + "balance_loss_mlp": 1.06213522, + "epoch": 0.461716044632551, + "flos": 601942192128.0, + "grad_norm": 0.05751461156756605, + "language_loss": 0.80467141, + "learning_rate": 0.0005858694085337976, + "loss": 0.81558955, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.29614258, + "step": 2400, + "time_per_iteration": 2.814182758331299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083104, + "balance_loss_mlp": 1.05246735, + "epoch": 0.46190842631781454, + "flos": 474236937216.0, + "grad_norm": 0.07664119673877032, + "language_loss": 0.8354007, + "learning_rate": 0.0005855624783095589, + "loss": 0.8462317, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.30615234, + "step": 2401, + "time_per_iteration": 2.57083797454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083538, + "balance_loss_mlp": 1.05414128, + "epoch": 0.4621008080030781, + "flos": 437254184448.0, + "grad_norm": 0.06712435829168825, + "language_loss": 0.85380065, + "learning_rate": 0.00058525551486702, + "loss": 0.864636, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.29370117, + "step": 2402, + "time_per_iteration": 2.554870843887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084683, + "balance_loss_mlp": 1.05476141, + "epoch": 0.46229318968834165, + "flos": 525203523072.0, + "grad_norm": 0.06447976336023753, + "language_loss": 0.80940902, + "learning_rate": 0.0005849485183253548, + "loss": 0.82025588, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.29882812, + "step": 2403, + "time_per_iteration": 2.6398868560791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108489, + "balance_loss_mlp": 1.05546916, + "epoch": 0.46248557137360524, + "flos": 439395922944.0, + "grad_norm": 0.07099246909711197, + "language_loss": 0.87546206, + "learning_rate": 0.0005846414888037501, + "loss": 0.88631094, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.29345703, + "step": 2404, + "time_per_iteration": 2.5056095123291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086728, + "balance_loss_mlp": 1.05725932, + "epoch": 0.4626779530588688, + "flos": 617318475264.0, + "grad_norm": 0.052798237228442416, + "language_loss": 0.82345319, + "learning_rate": 0.0005843344264214049, + "loss": 0.83432049, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.29443359, + "step": 2405, + "time_per_iteration": 2.7549078464508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091326, + "balance_loss_mlp": 1.06176221, + "epoch": 0.46287033474413236, + "flos": 669796784640.0, + "grad_norm": 0.05337180485738099, + "language_loss": 0.84920704, + "learning_rate": 0.0005840273312975317, + "loss": 0.8601203, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.29516602, + "step": 2406, + "time_per_iteration": 2.9058027267456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085122, + "balance_loss_mlp": 1.05577278, + "epoch": 0.46306271642939595, + "flos": 479992849920.0, + "grad_norm": 0.05333458165520064, + "language_loss": 0.89626014, + "learning_rate": 0.0005837202035513555, + "loss": 0.90711135, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.29345703, + "step": 2407, + "time_per_iteration": 2.5721802711486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094311, + "balance_loss_mlp": 1.06531978, + "epoch": 0.4632550981146595, + "flos": 580395359232.0, + "grad_norm": 0.0552743160267319, + "language_loss": 0.81124538, + "learning_rate": 0.0005834130433021136, + "loss": 0.8221885, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.28930664, + "step": 2408, + "time_per_iteration": 2.7402079105377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109161, + "balance_loss_mlp": 1.06166446, + "epoch": 0.46344747979992307, + "flos": 523701974016.0, + "grad_norm": 0.09526074365649402, + "language_loss": 0.73246038, + "learning_rate": 0.0005831058506690563, + "loss": 0.74337649, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.29931641, + "step": 2409, + "time_per_iteration": 2.6229617595672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088655, + "balance_loss_mlp": 1.05875707, + "epoch": 0.4636398614851866, + "flos": 746174661120.0, + "grad_norm": 0.061078353708003665, + "language_loss": 0.85864687, + "learning_rate": 0.0005827986257714464, + "loss": 0.86953342, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.29858398, + "step": 2410, + "time_per_iteration": 2.9352338314056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094131, + "balance_loss_mlp": 1.06404257, + "epoch": 0.4638322431704502, + "flos": 596273619456.0, + "grad_norm": 0.05695764594036898, + "language_loss": 0.88375425, + "learning_rate": 0.0005824913687285591, + "loss": 0.89469558, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.30078125, + "step": 2411, + "time_per_iteration": 2.6807737350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097526, + "balance_loss_mlp": 1.06698477, + "epoch": 0.4640246248557137, + "flos": 539172799488.0, + "grad_norm": 0.0643729084989199, + "language_loss": 0.81849819, + "learning_rate": 0.0005821840796596821, + "loss": 0.82947344, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.30493164, + "step": 2412, + "time_per_iteration": 2.663177967071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096211, + "balance_loss_mlp": 1.0657649, + "epoch": 0.4642170065409773, + "flos": 562330280448.0, + "grad_norm": 0.07601159389817994, + "language_loss": 0.80307502, + "learning_rate": 0.0005818767586841158, + "loss": 0.81403708, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.30419922, + "step": 2413, + "time_per_iteration": 2.7600111961364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092616, + "balance_loss_mlp": 1.06233692, + "epoch": 0.46440938822624084, + "flos": 530684421120.0, + "grad_norm": 0.059484167412089096, + "language_loss": 0.86110759, + "learning_rate": 0.0005815694059211726, + "loss": 0.87203372, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.30249023, + "step": 2414, + "time_per_iteration": 2.65578031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148176, + "balance_loss_mlp": 1.13263142, + "epoch": 0.4646017699115044, + "flos": 1525503780864.0, + "grad_norm": 0.0462911781552321, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.82021809, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.15527344, + "step": 2415, + "time_per_iteration": 4.8046934604644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115513, + "balance_loss_mlp": 1.10092187, + "epoch": 0.464794151596768, + "flos": 1539999765504.0, + "grad_norm": 0.038481348382240925, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78060573, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.14550781, + "step": 2416, + "time_per_iteration": 4.977246999740601 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085994, + "balance_loss_mlp": 1.05554748, + "epoch": 0.46498653328203154, + "flos": 501200649216.0, + "grad_norm": 0.07046148078843767, + "language_loss": 0.85802382, + "learning_rate": 0.0005806471581013931, + "loss": 0.86888373, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.30395508, + "step": 2417, + "time_per_iteration": 2.7680604457855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084029, + "balance_loss_mlp": 1.05363095, + "epoch": 0.46517891496729513, + "flos": 675856825344.0, + "grad_norm": 0.061868019756872866, + "language_loss": 0.78540701, + "learning_rate": 0.0005803396793823146, + "loss": 0.7962473, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.30371094, + "step": 2418, + "time_per_iteration": 2.818821430206299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081583, + "balance_loss_mlp": 1.05213845, + "epoch": 0.46537129665255866, + "flos": 585062949888.0, + "grad_norm": 0.08069009721002836, + "language_loss": 0.8594386, + "learning_rate": 0.0005800321694726065, + "loss": 0.8702544, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.29418945, + "step": 2419, + "time_per_iteration": 2.812563896179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085011, + "balance_loss_mlp": 1.05454159, + "epoch": 0.46556367833782225, + "flos": 587425858560.0, + "grad_norm": 0.061646313113324705, + "language_loss": 0.86883628, + "learning_rate": 0.0005797246284916545, + "loss": 0.87968636, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.30444336, + "step": 2420, + "time_per_iteration": 2.6945559978485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_mlp": 1.02332675, + "epoch": 0.4657560600230858, + "flos": 1484674099200.0, + "grad_norm": 0.024509703594541715, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78539675, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.11181641, + "step": 2421, + "time_per_iteration": 5.001375436782837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089527, + "balance_loss_mlp": 1.06036878, + "epoch": 0.46594844170834937, + "flos": 579961783296.0, + "grad_norm": 0.07023208249232396, + "language_loss": 0.8781141, + "learning_rate": 0.0005791094537936233, + "loss": 0.88900936, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.29150391, + "step": 2422, + "time_per_iteration": 2.703678846359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010888, + "balance_loss_mlp": 1.06028509, + "epoch": 0.4661408233936129, + "flos": 512322568704.0, + "grad_norm": 0.06283657209164231, + "language_loss": 0.817285, + "learning_rate": 0.0005788018203153762, + "loss": 0.82817304, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.28515625, + "step": 2423, + "time_per_iteration": 2.6398653984069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081237, + "balance_loss_mlp": 1.05255485, + "epoch": 0.4663332050788765, + "flos": 490839754752.0, + "grad_norm": 0.0646507393923986, + "language_loss": 0.85720015, + "learning_rate": 0.000578494156243549, + "loss": 0.86801249, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.28686523, + "step": 2424, + "time_per_iteration": 2.6061441898345947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086736, + "balance_loss_mlp": 1.05695724, + "epoch": 0.4665255867641401, + "flos": 512353092096.0, + "grad_norm": 0.05149395612804314, + "language_loss": 0.89174867, + "learning_rate": 0.0005781864616975878, + "loss": 0.90261602, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.29736328, + "step": 2425, + "time_per_iteration": 2.7073817253112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087687, + "balance_loss_mlp": 1.05917215, + "epoch": 0.4667179684494036, + "flos": 424590018048.0, + "grad_norm": 0.0742004751674347, + "language_loss": 0.84101117, + "learning_rate": 0.0005778787367969502, + "loss": 0.85188806, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.28515625, + "step": 2426, + "time_per_iteration": 2.643342971801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082589, + "balance_loss_mlp": 1.05374038, + "epoch": 0.4669103501346672, + "flos": 707640897024.0, + "grad_norm": 0.05195761556147334, + "language_loss": 0.80815637, + "learning_rate": 0.0005775709816611053, + "loss": 0.81898224, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.28857422, + "step": 2427, + "time_per_iteration": 3.0103423595428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085111, + "balance_loss_mlp": 1.05604792, + "epoch": 0.4671027318199307, + "flos": 554554874880.0, + "grad_norm": 0.05192902090033842, + "language_loss": 0.83742678, + "learning_rate": 0.0005772631964095346, + "loss": 0.84827781, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.29003906, + "step": 2428, + "time_per_iteration": 4.2191994190216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010894, + "balance_loss_mlp": 1.06107569, + "epoch": 0.4672951135051943, + "flos": 566839309824.0, + "grad_norm": 0.05894584384100732, + "language_loss": 0.85613596, + "learning_rate": 0.000576955381161731, + "loss": 0.86702996, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.28320312, + "step": 2429, + "time_per_iteration": 2.7035927772521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082394, + "balance_loss_mlp": 1.05297327, + "epoch": 0.46748749519045785, + "flos": 424294654464.0, + "grad_norm": 0.07711305585297333, + "language_loss": 0.8606714, + "learning_rate": 0.0005766475360371985, + "loss": 0.87149525, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.29394531, + "step": 2430, + "time_per_iteration": 2.5702948570251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092231, + "balance_loss_mlp": 1.06292963, + "epoch": 0.46767987687572143, + "flos": 538088859648.0, + "grad_norm": 0.08342834969675962, + "language_loss": 0.84959614, + "learning_rate": 0.0005763396611554536, + "loss": 0.86051846, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.29248047, + "step": 2431, + "time_per_iteration": 2.6236841678619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092277, + "balance_loss_mlp": 1.06383383, + "epoch": 0.467872258560985, + "flos": 823360052736.0, + "grad_norm": 0.06223220956170435, + "language_loss": 0.80269897, + "learning_rate": 0.0005760317566360237, + "loss": 0.81362176, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.28466797, + "step": 2432, + "time_per_iteration": 3.0205023288726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084277, + "balance_loss_mlp": 1.0559535, + "epoch": 0.46806464024624855, + "flos": 661366632960.0, + "grad_norm": 0.058294757950733474, + "language_loss": 0.85130137, + "learning_rate": 0.000575723822598448, + "loss": 0.86214417, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.28295898, + "step": 2433, + "time_per_iteration": 2.79516339302063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086726, + "balance_loss_mlp": 1.05866385, + "epoch": 0.46825702193151214, + "flos": 755362865664.0, + "grad_norm": 0.06256497191901454, + "language_loss": 0.81601393, + "learning_rate": 0.0005754158591622773, + "loss": 0.82688123, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.28076172, + "step": 2434, + "time_per_iteration": 2.963247537612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092504, + "balance_loss_mlp": 1.06365538, + "epoch": 0.4684494036167757, + "flos": 439164578304.0, + "grad_norm": 0.08333045297400817, + "language_loss": 0.8228929, + "learning_rate": 0.0005751078664470732, + "loss": 0.83381796, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.28833008, + "step": 2435, + "time_per_iteration": 2.537179470062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085014, + "balance_loss_mlp": 1.05688024, + "epoch": 0.46864178530203926, + "flos": 532446428160.0, + "grad_norm": 0.08080859282065189, + "language_loss": 0.85670036, + "learning_rate": 0.0005747998445724094, + "loss": 0.86755049, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.28125, + "step": 2436, + "time_per_iteration": 2.6276183128356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083485, + "balance_loss_mlp": 1.05466008, + "epoch": 0.4688341669873028, + "flos": 576328670208.0, + "grad_norm": 0.08810611044699188, + "language_loss": 0.89099967, + "learning_rate": 0.0005744917936578707, + "loss": 0.90183449, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.28808594, + "step": 2437, + "time_per_iteration": 2.784236431121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085755, + "balance_loss_mlp": 1.05690634, + "epoch": 0.4690265486725664, + "flos": 539296455168.0, + "grad_norm": 0.08777270325229546, + "language_loss": 0.83928555, + "learning_rate": 0.0005741837138230526, + "loss": 0.85014307, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.28808594, + "step": 2438, + "time_per_iteration": 2.7139840126037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078469, + "balance_loss_mlp": 1.05014467, + "epoch": 0.4692189303578299, + "flos": 770168770560.0, + "grad_norm": 0.053438427497709357, + "language_loss": 0.86270201, + "learning_rate": 0.0005738756051875627, + "loss": 0.87348676, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.28295898, + "step": 2439, + "time_per_iteration": 3.092337131500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074485, + "balance_loss_mlp": 1.04551697, + "epoch": 0.4694113120430935, + "flos": 571118404608.0, + "grad_norm": 0.056335724754341315, + "language_loss": 0.83459938, + "learning_rate": 0.0005735674678710192, + "loss": 0.84534419, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.28930664, + "step": 2440, + "time_per_iteration": 2.6729819774627686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107755, + "balance_loss_mlp": 1.0473665, + "epoch": 0.4696036937283571, + "flos": 748498281984.0, + "grad_norm": 0.06862136292067082, + "language_loss": 0.80992246, + "learning_rate": 0.0005732593019930517, + "loss": 0.82069802, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.30126953, + "step": 2441, + "time_per_iteration": 2.917332649230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078244, + "balance_loss_mlp": 1.04779828, + "epoch": 0.4697960754136206, + "flos": 493208455680.0, + "grad_norm": 0.06788307957029095, + "language_loss": 0.8767302, + "learning_rate": 0.0005729511076733008, + "loss": 0.88751262, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.30395508, + "step": 2442, + "time_per_iteration": 2.6602578163146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108041, + "balance_loss_mlp": 1.05003536, + "epoch": 0.4699884570988842, + "flos": 724809710592.0, + "grad_norm": 0.08414136163770505, + "language_loss": 0.84802854, + "learning_rate": 0.000572642885031418, + "loss": 0.85883266, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.30322266, + "step": 2443, + "time_per_iteration": 2.924572706222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075591, + "balance_loss_mlp": 1.04516852, + "epoch": 0.47018083878414774, + "flos": 555141219840.0, + "grad_norm": 0.055800438037163856, + "language_loss": 0.80518812, + "learning_rate": 0.0005723346341870662, + "loss": 0.81594402, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.30371094, + "step": 2444, + "time_per_iteration": 2.7203280925750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082669, + "balance_loss_mlp": 1.05217505, + "epoch": 0.4703732204694113, + "flos": 423846521856.0, + "grad_norm": 0.06929087535104682, + "language_loss": 0.86297798, + "learning_rate": 0.0005720263552599188, + "loss": 0.87380457, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.30444336, + "step": 2445, + "time_per_iteration": 2.469621419906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075882, + "balance_loss_mlp": 1.0456984, + "epoch": 0.47056560215467486, + "flos": 703179919872.0, + "grad_norm": 0.06843850090218344, + "language_loss": 0.79142129, + "learning_rate": 0.0005717180483696604, + "loss": 0.80218005, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.30151367, + "step": 2446, + "time_per_iteration": 2.9089763164520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072219, + "balance_loss_mlp": 1.04034209, + "epoch": 0.47075798383993844, + "flos": 554701851648.0, + "grad_norm": 0.07381367232784701, + "language_loss": 0.83118802, + "learning_rate": 0.0005714097136359862, + "loss": 0.84191024, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.31860352, + "step": 2447, + "time_per_iteration": 2.6346585750579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079268, + "balance_loss_mlp": 1.04817808, + "epoch": 0.470950365525202, + "flos": 564009329664.0, + "grad_norm": 0.06979677359463858, + "language_loss": 0.86324209, + "learning_rate": 0.0005711013511786027, + "loss": 0.87403476, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.31054688, + "step": 2448, + "time_per_iteration": 2.765740156173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073046, + "balance_loss_mlp": 1.0426712, + "epoch": 0.47114274721046556, + "flos": 534189496320.0, + "grad_norm": 0.048536468835106476, + "language_loss": 0.84014428, + "learning_rate": 0.0005707929611172263, + "loss": 0.85087478, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.3034668, + "step": 2449, + "time_per_iteration": 2.6891775131225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074493, + "balance_loss_mlp": 1.04349887, + "epoch": 0.47133512889572915, + "flos": 472877982720.0, + "grad_norm": 0.05569215031080998, + "language_loss": 0.83788037, + "learning_rate": 0.000570484543571585, + "loss": 0.84862536, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.30957031, + "step": 2450, + "time_per_iteration": 2.545646905899048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076975, + "balance_loss_mlp": 1.04743469, + "epoch": 0.4715275105809927, + "flos": 458776286208.0, + "grad_norm": 0.06210999897734131, + "language_loss": 0.82771122, + "learning_rate": 0.0005701760986614171, + "loss": 0.83848095, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.29492188, + "step": 2451, + "time_per_iteration": 2.5739784240722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080958, + "balance_loss_mlp": 1.05256283, + "epoch": 0.47171989226625627, + "flos": 421783358976.0, + "grad_norm": 0.06034093462601522, + "language_loss": 0.87343812, + "learning_rate": 0.0005698676265064714, + "loss": 0.88424772, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.28393555, + "step": 2452, + "time_per_iteration": 2.5456669330596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085544, + "balance_loss_mlp": 1.05612302, + "epoch": 0.4719122739515198, + "flos": 457200543744.0, + "grad_norm": 0.12010658803535784, + "language_loss": 0.88854802, + "learning_rate": 0.0005695591272265074, + "loss": 0.89940351, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.29370117, + "step": 2453, + "time_per_iteration": 2.53247332572937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086025, + "balance_loss_mlp": 1.05610394, + "epoch": 0.4721046556367834, + "flos": 514716000768.0, + "grad_norm": 0.06319040539886057, + "language_loss": 0.81670743, + "learning_rate": 0.0005692506009412954, + "loss": 0.8275677, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.29907227, + "step": 2454, + "time_per_iteration": 2.663959503173828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157874, + "balance_loss_mlp": 1.14423668, + "epoch": 0.4722970373220469, + "flos": 1571399723520.0, + "grad_norm": 0.046124065416459865, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78709137, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.13671875, + "step": 2455, + "time_per_iteration": 4.937524795532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085858, + "balance_loss_mlp": 1.05603182, + "epoch": 0.4724894190073105, + "flos": 585919927296.0, + "grad_norm": 0.07174058927835297, + "language_loss": 0.89622641, + "learning_rate": 0.0005686334678342593, + "loss": 0.907085, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.2980957, + "step": 2456, + "time_per_iteration": 2.9060487747192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077496, + "balance_loss_mlp": 1.04824257, + "epoch": 0.4726818006925741, + "flos": 867290347008.0, + "grad_norm": 0.07069871267474889, + "language_loss": 0.81667411, + "learning_rate": 0.0005683248612520274, + "loss": 0.82744908, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.29223633, + "step": 2457, + "time_per_iteration": 3.071544885635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084366, + "balance_loss_mlp": 1.05465865, + "epoch": 0.4728741823778376, + "flos": 752653721088.0, + "grad_norm": 0.07071545002601118, + "language_loss": 0.83683658, + "learning_rate": 0.0005680162281437321, + "loss": 0.84768021, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.296875, + "step": 2458, + "time_per_iteration": 2.931579113006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077685, + "balance_loss_mlp": 1.0476439, + "epoch": 0.4730665640631012, + "flos": 538301265408.0, + "grad_norm": 0.06018673388195985, + "language_loss": 0.84837544, + "learning_rate": 0.000567707568629195, + "loss": 0.85915226, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.30004883, + "step": 2459, + "time_per_iteration": 2.6860852241516113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079226, + "balance_loss_mlp": 1.04968619, + "epoch": 0.47325894574836475, + "flos": 491396986368.0, + "grad_norm": 0.053752412093893094, + "language_loss": 0.82513988, + "learning_rate": 0.0005673988828282486, + "loss": 0.83593214, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.29467773, + "step": 2460, + "time_per_iteration": 2.6679980754852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073438, + "balance_loss_mlp": 1.04320669, + "epoch": 0.47345132743362833, + "flos": 764117494272.0, + "grad_norm": 0.05735836881189746, + "language_loss": 0.80829632, + "learning_rate": 0.0005670901708607352, + "loss": 0.81903076, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.30175781, + "step": 2461, + "time_per_iteration": 2.962364673614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076898, + "balance_loss_mlp": 1.04635668, + "epoch": 0.47364370911889186, + "flos": 539925060096.0, + "grad_norm": 0.06660215000338995, + "language_loss": 0.84026098, + "learning_rate": 0.0005667814328465076, + "loss": 0.85102999, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.30493164, + "step": 2462, + "time_per_iteration": 2.6148030757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077856, + "balance_loss_mlp": 1.04824424, + "epoch": 0.47383609080415545, + "flos": 406002613248.0, + "grad_norm": 0.0820641824195461, + "language_loss": 0.81702316, + "learning_rate": 0.0005664726689054285, + "loss": 0.8278017, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.29541016, + "step": 2463, + "time_per_iteration": 2.46337628364563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079984, + "balance_loss_mlp": 1.04910851, + "epoch": 0.474028472489419, + "flos": 453237161472.0, + "grad_norm": 0.07270387927239072, + "language_loss": 0.81341946, + "learning_rate": 0.0005661638791573704, + "loss": 0.82421935, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.30859375, + "step": 2464, + "time_per_iteration": 2.712188720703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084787, + "balance_loss_mlp": 1.05453193, + "epoch": 0.47422085417468257, + "flos": 491923694592.0, + "grad_norm": 0.05714322793938323, + "language_loss": 0.87222457, + "learning_rate": 0.0005658550637222164, + "loss": 0.88307238, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.30224609, + "step": 2465, + "time_per_iteration": 2.63380765914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082927, + "balance_loss_mlp": 1.05298185, + "epoch": 0.47441323585994616, + "flos": 738537467904.0, + "grad_norm": 0.06339144108901118, + "language_loss": 0.82493532, + "learning_rate": 0.0005655462227198592, + "loss": 0.83576465, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.29907227, + "step": 2466, + "time_per_iteration": 2.910783290863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084848, + "balance_loss_mlp": 1.0547595, + "epoch": 0.4746056175452097, + "flos": 484439270400.0, + "grad_norm": 0.05460968765214119, + "language_loss": 0.83975738, + "learning_rate": 0.0005652373562702016, + "loss": 0.85060585, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.30053711, + "step": 2467, + "time_per_iteration": 2.6101505756378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081, + "balance_loss_mlp": 1.05072081, + "epoch": 0.4747979992304733, + "flos": 460814717952.0, + "grad_norm": 0.06618054462006194, + "language_loss": 0.88145614, + "learning_rate": 0.000564928464493156, + "loss": 0.89226621, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.30249023, + "step": 2468, + "time_per_iteration": 2.55812668800354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081635, + "balance_loss_mlp": 1.05247641, + "epoch": 0.4749903809157368, + "flos": 864070460928.0, + "grad_norm": 0.06741069565287812, + "language_loss": 0.81633413, + "learning_rate": 0.000564619547508645, + "loss": 0.82715052, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.29150391, + "step": 2469, + "time_per_iteration": 3.1341404914855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082878, + "balance_loss_mlp": 1.05252695, + "epoch": 0.4751827626010004, + "flos": 505296451584.0, + "grad_norm": 0.0651779420020333, + "language_loss": 0.83088791, + "learning_rate": 0.0005643106054366008, + "loss": 0.84171665, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.30297852, + "step": 2470, + "time_per_iteration": 2.610891342163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076033, + "balance_loss_mlp": 1.04666018, + "epoch": 0.47537514428626393, + "flos": 559123540992.0, + "grad_norm": 0.0714119485898344, + "language_loss": 0.79053152, + "learning_rate": 0.000564001638396965, + "loss": 0.80129188, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.29321289, + "step": 2471, + "time_per_iteration": 2.7754971981048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083604, + "balance_loss_mlp": 1.05430186, + "epoch": 0.4755675259715275, + "flos": 833907211776.0, + "grad_norm": 0.05565021284268994, + "language_loss": 0.8203246, + "learning_rate": 0.0005636926465096897, + "loss": 0.83116066, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.29248047, + "step": 2472, + "time_per_iteration": 3.028235912322998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079414, + "balance_loss_mlp": 1.05116105, + "epoch": 0.47575990765679105, + "flos": 507989629440.0, + "grad_norm": 0.06838176056824781, + "language_loss": 0.87627274, + "learning_rate": 0.0005633836298947363, + "loss": 0.8870669, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.28271484, + "step": 2473, + "time_per_iteration": 2.609142303466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.04901338, + "epoch": 0.47595228934205464, + "flos": 591566740992.0, + "grad_norm": 0.06111056533479294, + "language_loss": 0.70809621, + "learning_rate": 0.000563074588672075, + "loss": 0.71887386, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.28759766, + "step": 2474, + "time_per_iteration": 2.722593069076538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079389, + "balance_loss_mlp": 1.05080247, + "epoch": 0.4761446710273182, + "flos": 580340104704.0, + "grad_norm": 0.06296236889432077, + "language_loss": 0.85321903, + "learning_rate": 0.0005627655229616868, + "loss": 0.8640129, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.28540039, + "step": 2475, + "time_per_iteration": 2.711296558380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081174, + "balance_loss_mlp": 1.05141973, + "epoch": 0.47633705271258175, + "flos": 672597651456.0, + "grad_norm": 0.06122384611792148, + "language_loss": 0.89890903, + "learning_rate": 0.0005624564328835616, + "loss": 0.90972078, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.29736328, + "step": 2476, + "time_per_iteration": 2.796614408493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079016, + "balance_loss_mlp": 1.05069184, + "epoch": 0.47652943439784534, + "flos": 541580788224.0, + "grad_norm": 0.05962569805242902, + "language_loss": 0.84079456, + "learning_rate": 0.0005621473185576986, + "loss": 0.85158479, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.28344727, + "step": 2477, + "time_per_iteration": 2.7140815258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086205, + "balance_loss_mlp": 1.05709434, + "epoch": 0.4767218160831089, + "flos": 524563333632.0, + "grad_norm": 0.07093607725441804, + "language_loss": 0.87060082, + "learning_rate": 0.0005618381801041068, + "loss": 0.88146281, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.29077148, + "step": 2478, + "time_per_iteration": 2.596932888031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085469, + "balance_loss_mlp": 1.05638218, + "epoch": 0.47691419776837246, + "flos": 567789419520.0, + "grad_norm": 0.07057707739429774, + "language_loss": 0.83022285, + "learning_rate": 0.0005615290176428044, + "loss": 0.84107757, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.29052734, + "step": 2479, + "time_per_iteration": 2.6407430171966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108759, + "balance_loss_mlp": 1.05828834, + "epoch": 0.477106579453636, + "flos": 530659689984.0, + "grad_norm": 0.06449831218896054, + "language_loss": 0.85197705, + "learning_rate": 0.0005612198312938187, + "loss": 0.86285299, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.29296875, + "step": 2480, + "time_per_iteration": 2.7345011234283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108973, + "balance_loss_mlp": 1.06121504, + "epoch": 0.4772989611388996, + "flos": 593980521984.0, + "grad_norm": 0.060218704260060575, + "language_loss": 0.79185855, + "learning_rate": 0.0005609106211771868, + "loss": 0.80275583, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.28540039, + "step": 2481, + "time_per_iteration": 2.8754329681396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088266, + "balance_loss_mlp": 1.05908394, + "epoch": 0.4774913428241631, + "flos": 544352541696.0, + "grad_norm": 0.07327776648741448, + "language_loss": 0.89180911, + "learning_rate": 0.0005606013874129543, + "loss": 0.90269172, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.29199219, + "step": 2482, + "time_per_iteration": 2.7726404666900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090058, + "balance_loss_mlp": 1.06049454, + "epoch": 0.4776837245094267, + "flos": 539817371136.0, + "grad_norm": 0.06456332848164101, + "language_loss": 0.79976207, + "learning_rate": 0.0005602921301211768, + "loss": 0.81066263, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.29516602, + "step": 2483, + "time_per_iteration": 2.715306043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089436, + "balance_loss_mlp": 1.06132603, + "epoch": 0.4778761061946903, + "flos": 471543759360.0, + "grad_norm": 0.07998801300028703, + "language_loss": 0.82180744, + "learning_rate": 0.0005599828494219185, + "loss": 0.83270174, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.28100586, + "step": 2484, + "time_per_iteration": 2.5683019161224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086424, + "balance_loss_mlp": 1.05836201, + "epoch": 0.4780684878799538, + "flos": 725769994752.0, + "grad_norm": 0.06543459725570545, + "language_loss": 0.88914174, + "learning_rate": 0.0005596735454352527, + "loss": 0.90000606, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.28076172, + "step": 2485, + "time_per_iteration": 2.8615424633026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083119, + "balance_loss_mlp": 1.05531943, + "epoch": 0.4782608695652174, + "flos": 548665132032.0, + "grad_norm": 0.07228586186756063, + "language_loss": 0.85170126, + "learning_rate": 0.0005593642182812619, + "loss": 0.8625325, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.27856445, + "step": 2486, + "time_per_iteration": 2.6507115364074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084435, + "balance_loss_mlp": 1.0574224, + "epoch": 0.47845325125048094, + "flos": 829555333632.0, + "grad_norm": 0.06671866930909515, + "language_loss": 0.83972216, + "learning_rate": 0.0005590548680800378, + "loss": 0.85056645, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.27050781, + "step": 2487, + "time_per_iteration": 3.0963587760925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085422, + "balance_loss_mlp": 1.05755091, + "epoch": 0.4786456329357445, + "flos": 513889546752.0, + "grad_norm": 0.0627787894989405, + "language_loss": 0.7639966, + "learning_rate": 0.0005587454949516804, + "loss": 0.77485085, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.27880859, + "step": 2488, + "time_per_iteration": 2.704761266708374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085753, + "balance_loss_mlp": 1.05719018, + "epoch": 0.47883801462100806, + "flos": 564392033280.0, + "grad_norm": 0.07191070894190046, + "language_loss": 0.87996674, + "learning_rate": 0.0005584360990162993, + "loss": 0.89082426, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.28540039, + "step": 2489, + "time_per_iteration": 2.68680477142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108742, + "balance_loss_mlp": 1.05921531, + "epoch": 0.47903039630627164, + "flos": 579296862720.0, + "grad_norm": 0.052754850289178916, + "language_loss": 0.85114515, + "learning_rate": 0.0005581266803940124, + "loss": 0.86201936, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.28222656, + "step": 2490, + "time_per_iteration": 2.7187392711639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091263, + "balance_loss_mlp": 1.06322539, + "epoch": 0.47922277799153523, + "flos": 618667255296.0, + "grad_norm": 0.061347112520969346, + "language_loss": 0.87164974, + "learning_rate": 0.0005578172392049471, + "loss": 0.8825624, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.28051758, + "step": 2491, + "time_per_iteration": 2.7291457653045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089047, + "balance_loss_mlp": 1.06048441, + "epoch": 0.47941515967679876, + "flos": 639352728576.0, + "grad_norm": 0.07263845202824909, + "language_loss": 0.84244549, + "learning_rate": 0.0005575077755692386, + "loss": 0.85333598, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.28564453, + "step": 2492, + "time_per_iteration": 2.8026599884033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080078, + "balance_loss_mlp": 1.05246925, + "epoch": 0.47960754136206235, + "flos": 519561091584.0, + "grad_norm": 0.0504022340685432, + "language_loss": 0.85800493, + "learning_rate": 0.0005571982896070316, + "loss": 0.86880577, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.27612305, + "step": 2493, + "time_per_iteration": 2.655550003051758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080752, + "balance_loss_mlp": 1.05266619, + "epoch": 0.4797999230473259, + "flos": 474798551040.0, + "grad_norm": 0.11668407926682704, + "language_loss": 0.89753431, + "learning_rate": 0.0005568887814384792, + "loss": 0.90834183, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.28100586, + "step": 2494, + "time_per_iteration": 2.5966434478759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080843, + "balance_loss_mlp": 1.05337763, + "epoch": 0.47999230473258947, + "flos": 531766950912.0, + "grad_norm": 0.058142169565221447, + "language_loss": 0.87224984, + "learning_rate": 0.000556579251183743, + "loss": 0.88305831, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.27490234, + "step": 2495, + "time_per_iteration": 2.6536028385162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080101, + "balance_loss_mlp": 1.05089474, + "epoch": 0.480184686417853, + "flos": 601207460352.0, + "grad_norm": 0.06356237967295801, + "language_loss": 0.7994827, + "learning_rate": 0.0005562696989629936, + "loss": 0.81028366, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.29174805, + "step": 2496, + "time_per_iteration": 2.691530466079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082561, + "balance_loss_mlp": 1.05328333, + "epoch": 0.4803770681031166, + "flos": 527931606528.0, + "grad_norm": 0.07544069195311896, + "language_loss": 0.82662058, + "learning_rate": 0.0005559601248964095, + "loss": 0.83744615, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.29223633, + "step": 2497, + "time_per_iteration": 2.687108278274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078067, + "balance_loss_mlp": 1.04931426, + "epoch": 0.4805694497883801, + "flos": 510934500864.0, + "grad_norm": 0.07160134617119021, + "language_loss": 0.85915172, + "learning_rate": 0.0005556505291041783, + "loss": 0.86993241, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.28735352, + "step": 2498, + "time_per_iteration": 2.7002923488616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081443, + "balance_loss_mlp": 1.05264211, + "epoch": 0.4807618314736437, + "flos": 600027416064.0, + "grad_norm": 0.21407023754506424, + "language_loss": 0.84214193, + "learning_rate": 0.0005553409117064954, + "loss": 0.85295641, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.2878418, + "step": 2499, + "time_per_iteration": 2.877713203430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096264, + "balance_loss_mlp": 1.06824946, + "epoch": 0.4809542131589073, + "flos": 568700241408.0, + "grad_norm": 0.06103635462331165, + "language_loss": 0.84855151, + "learning_rate": 0.0005550312728235654, + "loss": 0.85951412, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.28051758, + "step": 2500, + "time_per_iteration": 2.716524362564087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094238, + "balance_loss_mlp": 1.06610465, + "epoch": 0.4811465948441708, + "flos": 575703037440.0, + "grad_norm": 0.07633647670380422, + "language_loss": 0.83599609, + "learning_rate": 0.0005547216125756003, + "loss": 0.84693843, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.28125, + "step": 2501, + "time_per_iteration": 2.8102688789367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097276, + "balance_loss_mlp": 1.06899917, + "epoch": 0.4813389765294344, + "flos": 823508439552.0, + "grad_norm": 0.05816521463755192, + "language_loss": 0.81801546, + "learning_rate": 0.0005544119310828211, + "loss": 0.82898819, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.28295898, + "step": 2502, + "time_per_iteration": 3.09083890914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110256, + "balance_loss_mlp": 1.08162141, + "epoch": 0.48153135821469795, + "flos": 635240959488.0, + "grad_norm": 0.07468975257849066, + "language_loss": 0.84463918, + "learning_rate": 0.0005541022284654568, + "loss": 0.85574174, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.28613281, + "step": 2503, + "time_per_iteration": 2.959812641143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105243, + "balance_loss_mlp": 1.07613182, + "epoch": 0.48172373989996153, + "flos": 503450076672.0, + "grad_norm": 0.06287004960739773, + "language_loss": 0.83878344, + "learning_rate": 0.0005537925048437446, + "loss": 0.84983587, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.29077148, + "step": 2504, + "time_per_iteration": 2.5965919494628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113897, + "balance_loss_mlp": 1.12542796, + "epoch": 0.48191612158522507, + "flos": 1531563821568.0, + "grad_norm": 0.039351692623908835, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76890433, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.13574219, + "step": 2505, + "time_per_iteration": 4.965132713317871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112409, + "balance_loss_mlp": 1.08420432, + "epoch": 0.48210850327048865, + "flos": 702078451200.0, + "grad_norm": 0.06703534425937603, + "language_loss": 0.88412756, + "learning_rate": 0.0005531729950682664, + "loss": 0.89525163, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.28198242, + "step": 2506, + "time_per_iteration": 3.032463550567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107907, + "balance_loss_mlp": 1.07936859, + "epoch": 0.4823008849557522, + "flos": 439548691968.0, + "grad_norm": 0.08139997578259908, + "language_loss": 0.84598732, + "learning_rate": 0.000552863209155015, + "loss": 0.85706639, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.28564453, + "step": 2507, + "time_per_iteration": 2.501650333404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101488, + "balance_loss_mlp": 1.07285357, + "epoch": 0.48249326664101577, + "flos": 471622334976.0, + "grad_norm": 0.06119014713123412, + "language_loss": 0.81909472, + "learning_rate": 0.0005525534027184461, + "loss": 0.83010966, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.28637695, + "step": 2508, + "time_per_iteration": 2.5787370204925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098365, + "balance_loss_mlp": 1.06942117, + "epoch": 0.48268564832627936, + "flos": 562954503168.0, + "grad_norm": 0.05313984540081721, + "language_loss": 0.82654703, + "learning_rate": 0.0005522435758788365, + "loss": 0.83753073, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.28930664, + "step": 2509, + "time_per_iteration": 2.7109761238098145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010953, + "balance_loss_mlp": 1.06730938, + "epoch": 0.4828780300115429, + "flos": 629298782208.0, + "grad_norm": 0.05877851050813853, + "language_loss": 0.80259538, + "learning_rate": 0.0005519337287564721, + "loss": 0.81354833, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.2800293, + "step": 2510, + "time_per_iteration": 2.8329310417175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109601, + "balance_loss_mlp": 1.06759048, + "epoch": 0.4830704116968065, + "flos": 631562766336.0, + "grad_norm": 0.060327319620096846, + "language_loss": 0.83688086, + "learning_rate": 0.000551623861471646, + "loss": 0.84784102, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.28417969, + "step": 2511, + "time_per_iteration": 2.7470946311950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100715, + "balance_loss_mlp": 1.08784056, + "epoch": 0.48326279338207, + "flos": 1568434503168.0, + "grad_norm": 0.03397215547055983, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79919541, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.12890625, + "step": 2512, + "time_per_iteration": 4.837340593338013 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095094, + "balance_loss_mlp": 1.06619751, + "epoch": 0.4834551750673336, + "flos": 508989201408.0, + "grad_norm": 0.059215268588021376, + "language_loss": 0.86540532, + "learning_rate": 0.0005510040668958211, + "loss": 0.87635624, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.2890625, + "step": 2513, + "time_per_iteration": 2.5706045627593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107614, + "balance_loss_mlp": 1.06364644, + "epoch": 0.48364755675259713, + "flos": 1527875453952.0, + "grad_norm": 0.0265804362292035, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78836721, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.12451172, + "step": 2514, + "time_per_iteration": 4.899883508682251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108816, + "balance_loss_mlp": 1.0589062, + "epoch": 0.4838399384378607, + "flos": 564726684672.0, + "grad_norm": 0.05909251781800444, + "language_loss": 0.83435559, + "learning_rate": 0.0005503841931138645, + "loss": 0.84523714, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.29272461, + "step": 2515, + "time_per_iteration": 2.665804386138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089973, + "balance_loss_mlp": 1.06112456, + "epoch": 0.4840323201231243, + "flos": 387479227392.0, + "grad_norm": 0.06787127022085944, + "language_loss": 0.81963372, + "learning_rate": 0.0005500742268214025, + "loss": 0.8305335, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.28833008, + "step": 2516, + "time_per_iteration": 2.5123801231384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084372, + "balance_loss_mlp": 1.05487967, + "epoch": 0.48422470180838784, + "flos": 630701406720.0, + "grad_norm": 0.05799188255481874, + "language_loss": 0.85305762, + "learning_rate": 0.0005497642410884014, + "loss": 0.86390138, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.29492188, + "step": 2517, + "time_per_iteration": 2.818969249725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107799, + "balance_loss_mlp": 1.04907012, + "epoch": 0.4844170834936514, + "flos": 498955603968.0, + "grad_norm": 0.0575391439282783, + "language_loss": 0.85093868, + "learning_rate": 0.0005494542360352085, + "loss": 0.8617186, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.28881836, + "step": 2518, + "time_per_iteration": 2.654691457748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081359, + "balance_loss_mlp": 1.05220056, + "epoch": 0.48460946517891496, + "flos": 550798106112.0, + "grad_norm": 0.06803778984218942, + "language_loss": 0.85824656, + "learning_rate": 0.0005491442117821783, + "loss": 0.86906004, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.29125977, + "step": 2519, + "time_per_iteration": 2.703547954559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081545, + "balance_loss_mlp": 1.0510273, + "epoch": 0.48480184686417854, + "flos": 529123235328.0, + "grad_norm": 0.12066852374350216, + "language_loss": 0.87487119, + "learning_rate": 0.0005488341684496732, + "loss": 0.88568664, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.3046875, + "step": 2520, + "time_per_iteration": 2.6539435386657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107692, + "balance_loss_mlp": 1.04757047, + "epoch": 0.4849942285494421, + "flos": 531630148608.0, + "grad_norm": 0.05745701253476237, + "language_loss": 0.91846752, + "learning_rate": 0.0005485241061580624, + "loss": 0.92923677, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.29296875, + "step": 2521, + "time_per_iteration": 2.775069236755371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080446, + "balance_loss_mlp": 1.04995275, + "epoch": 0.48518661023470566, + "flos": 722231424000.0, + "grad_norm": 0.05822253141450555, + "language_loss": 0.84573066, + "learning_rate": 0.0005482140250277228, + "loss": 0.8565352, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.3046875, + "step": 2522, + "time_per_iteration": 2.9740066528320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082604, + "balance_loss_mlp": 1.05306387, + "epoch": 0.4853789919199692, + "flos": 505843508736.0, + "grad_norm": 0.06368999588379491, + "language_loss": 0.87678063, + "learning_rate": 0.0005479039251790387, + "loss": 0.88760674, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.29492188, + "step": 2523, + "time_per_iteration": 2.6360013484954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086656, + "balance_loss_mlp": 1.05666256, + "epoch": 0.4855713736052328, + "flos": 660185178624.0, + "grad_norm": 0.060153636482772124, + "language_loss": 0.84925246, + "learning_rate": 0.0005475938067324014, + "loss": 0.8601191, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.29956055, + "step": 2524, + "time_per_iteration": 2.8053042888641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084682, + "balance_loss_mlp": 1.05542803, + "epoch": 0.48576375529049637, + "flos": 436727476224.0, + "grad_norm": 0.059684937302366806, + "language_loss": 0.83693206, + "learning_rate": 0.0005472836698082098, + "loss": 0.84777892, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.29199219, + "step": 2525, + "time_per_iteration": 2.513991355895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085056, + "balance_loss_mlp": 1.05587339, + "epoch": 0.4859561369757599, + "flos": 581424044544.0, + "grad_norm": 0.059033754749834536, + "language_loss": 0.84245414, + "learning_rate": 0.0005469735145268694, + "loss": 0.85330468, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.29174805, + "step": 2526, + "time_per_iteration": 2.758964776992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085929, + "balance_loss_mlp": 1.05712819, + "epoch": 0.4861485186610235, + "flos": 487723175424.0, + "grad_norm": 0.05692033512559974, + "language_loss": 0.80668163, + "learning_rate": 0.0005466633410087933, + "loss": 0.81754094, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.28808594, + "step": 2527, + "time_per_iteration": 2.7483773231506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_mlp": 1.01712215, + "epoch": 0.486340900346287, + "flos": 1556893564416.0, + "grad_norm": 0.02025241925229164, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78289819, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.11865234, + "step": 2528, + "time_per_iteration": 4.8671183586120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084286, + "balance_loss_mlp": 1.05558062, + "epoch": 0.4865332820315506, + "flos": 482760221184.0, + "grad_norm": 0.060917910127877034, + "language_loss": 0.88050807, + "learning_rate": 0.0005460429397441214, + "loss": 0.89135092, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.28662109, + "step": 2529, + "time_per_iteration": 2.5488078594207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083512, + "balance_loss_mlp": 1.05416238, + "epoch": 0.48672566371681414, + "flos": 535548450816.0, + "grad_norm": 0.06933582049293556, + "language_loss": 0.86551011, + "learning_rate": 0.0005457327122383866, + "loss": 0.87634516, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.29321289, + "step": 2530, + "time_per_iteration": 2.6199238300323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018983, + "balance_loss_mlp": 1.00711012, + "epoch": 0.4869180454020777, + "flos": 1411876901376.0, + "grad_norm": 0.01657901033031013, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75655472, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.11865234, + "step": 2531, + "time_per_iteration": 4.810813665390015 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086562, + "balance_loss_mlp": 1.05754662, + "epoch": 0.48711042708734126, + "flos": 572836741632.0, + "grad_norm": 0.0731565805542322, + "language_loss": 0.75476754, + "learning_rate": 0.0005451122040823244, + "loss": 0.76563311, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.28979492, + "step": 2532, + "time_per_iteration": 2.7834720611572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083988, + "balance_loss_mlp": 1.0543766, + "epoch": 0.48730280877260485, + "flos": 626231665152.0, + "grad_norm": 0.05844807259880667, + "language_loss": 0.7683785, + "learning_rate": 0.0005448019236728997, + "loss": 0.77921844, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.29589844, + "step": 2533, + "time_per_iteration": 2.9007680416107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108612, + "balance_loss_mlp": 1.05789077, + "epoch": 0.48749519045786843, + "flos": 512233818624.0, + "grad_norm": 0.06352012335970622, + "language_loss": 0.84519851, + "learning_rate": 0.0005444916258698255, + "loss": 0.85605973, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.2824707, + "step": 2534, + "time_per_iteration": 2.6479434967041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083901, + "balance_loss_mlp": 1.05450428, + "epoch": 0.48768757214313196, + "flos": 525149678592.0, + "grad_norm": 0.06527387606118956, + "language_loss": 0.85987055, + "learning_rate": 0.0005441813107935704, + "loss": 0.8707096, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.29370117, + "step": 2535, + "time_per_iteration": 2.657701253890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082228, + "balance_loss_mlp": 1.05359387, + "epoch": 0.48787995382839555, + "flos": 504784300032.0, + "grad_norm": 0.05960574003717953, + "language_loss": 0.85425317, + "learning_rate": 0.0005438709785646091, + "loss": 0.86507541, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.28637695, + "step": 2536, + "time_per_iteration": 2.5686872005462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081582, + "balance_loss_mlp": 1.05197084, + "epoch": 0.4880723355136591, + "flos": 574904286720.0, + "grad_norm": 0.0674154398441342, + "language_loss": 0.86857444, + "learning_rate": 0.0005435606293034234, + "loss": 0.87939024, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.29589844, + "step": 2537, + "time_per_iteration": 2.6792654991149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108176, + "balance_loss_mlp": 1.05334091, + "epoch": 0.48826471719892267, + "flos": 561172147200.0, + "grad_norm": 0.1079718501079392, + "language_loss": 0.85096419, + "learning_rate": 0.0005432502631305016, + "loss": 0.86178184, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.28417969, + "step": 2538, + "time_per_iteration": 2.6790173053741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082462, + "balance_loss_mlp": 1.05366075, + "epoch": 0.4884570988841862, + "flos": 725849980416.0, + "grad_norm": 0.270667674808598, + "language_loss": 0.83102262, + "learning_rate": 0.0005429398801663386, + "loss": 0.84184724, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.28808594, + "step": 2539, + "time_per_iteration": 2.9468812942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074127, + "balance_loss_mlp": 1.04453969, + "epoch": 0.4886494805694498, + "flos": 430794063360.0, + "grad_norm": 0.06499376102514318, + "language_loss": 0.82999051, + "learning_rate": 0.0005426294805314355, + "loss": 0.8407318, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.29541016, + "step": 2540, + "time_per_iteration": 4.142840623855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076947, + "balance_loss_mlp": 1.04685867, + "epoch": 0.4888418622547134, + "flos": 672673254912.0, + "grad_norm": 0.055782244803189183, + "language_loss": 0.80130786, + "learning_rate": 0.0005423190643463003, + "loss": 0.81207728, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.30053711, + "step": 2541, + "time_per_iteration": 2.972822427749634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107237, + "balance_loss_mlp": 1.04237723, + "epoch": 0.4890342439399769, + "flos": 541639014912.0, + "grad_norm": 0.07101662394817357, + "language_loss": 0.83088171, + "learning_rate": 0.0005420086317314473, + "loss": 0.84160542, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.29956055, + "step": 2542, + "time_per_iteration": 2.651425838470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072796, + "balance_loss_mlp": 1.04180098, + "epoch": 0.4892266256252405, + "flos": 590380904448.0, + "grad_norm": 0.06479627692425034, + "language_loss": 0.81022084, + "learning_rate": 0.0005416981828073971, + "loss": 0.82094878, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.30957031, + "step": 2543, + "time_per_iteration": 2.775273323059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111363, + "balance_loss_mlp": 1.09922981, + "epoch": 0.48941900731050403, + "flos": 1515460008960.0, + "grad_norm": 0.045109342737372694, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78228641, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.14355469, + "step": 2544, + "time_per_iteration": 4.819438219070435 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069232, + "balance_loss_mlp": 1.0383091, + "epoch": 0.4896113889957676, + "flos": 470327399424.0, + "grad_norm": 0.07868028775989613, + "language_loss": 0.85065794, + "learning_rate": 0.000541077236513819, + "loss": 0.86135024, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.30883789, + "step": 2545, + "time_per_iteration": 2.5191094875335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071718, + "balance_loss_mlp": 1.03981793, + "epoch": 0.48980377068103115, + "flos": 496310478336.0, + "grad_norm": 0.07130550478628667, + "language_loss": 0.82089663, + "learning_rate": 0.0005407667393853638, + "loss": 0.83161378, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.31884766, + "step": 2546, + "time_per_iteration": 2.617934465408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107245, + "balance_loss_mlp": 1.04043055, + "epoch": 0.48999615236629473, + "flos": 692539628544.0, + "grad_norm": 0.07826700951116618, + "language_loss": 0.8301416, + "learning_rate": 0.0005404562264298569, + "loss": 0.84086609, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.32006836, + "step": 2547, + "time_per_iteration": 2.8667449951171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072033, + "balance_loss_mlp": 1.03946531, + "epoch": 0.49018853405155827, + "flos": 541432401408.0, + "grad_norm": 0.06922547112322346, + "language_loss": 0.83528513, + "learning_rate": 0.0005401456977678498, + "loss": 0.8460055, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.32568359, + "step": 2548, + "time_per_iteration": 2.6317896842956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073611, + "balance_loss_mlp": 1.04216361, + "epoch": 0.49038091573682185, + "flos": 695304027648.0, + "grad_norm": 0.06685231557649787, + "language_loss": 0.77518535, + "learning_rate": 0.0005398351535199008, + "loss": 0.78592145, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.31420898, + "step": 2549, + "time_per_iteration": 3.0532455444335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077464, + "balance_loss_mlp": 1.046422, + "epoch": 0.49057329742208544, + "flos": 596614063104.0, + "grad_norm": 0.058433753989977806, + "language_loss": 0.83942944, + "learning_rate": 0.0005395245938065735, + "loss": 0.85020411, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.31030273, + "step": 2550, + "time_per_iteration": 2.788081169128418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082711, + "balance_loss_mlp": 1.0515734, + "epoch": 0.490765679107349, + "flos": 513154814976.0, + "grad_norm": 0.08029752654472934, + "language_loss": 0.83026552, + "learning_rate": 0.0005392140187484379, + "loss": 0.84109271, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.3112793, + "step": 2551, + "time_per_iteration": 2.619982957839966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076344, + "balance_loss_mlp": 1.04577839, + "epoch": 0.49095806079261256, + "flos": 629298782208.0, + "grad_norm": 0.05951944251734202, + "language_loss": 0.89720619, + "learning_rate": 0.0005389034284660701, + "loss": 0.90796959, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.30541992, + "step": 2552, + "time_per_iteration": 2.811321258544922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084609, + "balance_loss_mlp": 1.05349529, + "epoch": 0.4911504424778761, + "flos": 914938122240.0, + "grad_norm": 0.06813620439924545, + "language_loss": 0.82330388, + "learning_rate": 0.000538592823080052, + "loss": 0.83414996, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.31079102, + "step": 2553, + "time_per_iteration": 3.121729612350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084101, + "balance_loss_mlp": 1.05181932, + "epoch": 0.4913428241631397, + "flos": 438716445696.0, + "grad_norm": 0.10151417402847059, + "language_loss": 0.84795117, + "learning_rate": 0.000538282202710971, + "loss": 0.85879219, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.32275391, + "step": 2554, + "time_per_iteration": 2.5441434383392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089823, + "balance_loss_mlp": 1.05782735, + "epoch": 0.4915352058484032, + "flos": 635806955520.0, + "grad_norm": 0.08391436989004458, + "language_loss": 0.81955588, + "learning_rate": 0.000537971567479421, + "loss": 0.83045411, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.31982422, + "step": 2555, + "time_per_iteration": 2.742913246154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088539, + "balance_loss_mlp": 1.05578029, + "epoch": 0.4917275875336668, + "flos": 504272148480.0, + "grad_norm": 0.0678126955236607, + "language_loss": 0.87735516, + "learning_rate": 0.0005376609175060011, + "loss": 0.88824058, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.32763672, + "step": 2556, + "time_per_iteration": 2.5964388847351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088641, + "balance_loss_mlp": 1.05774164, + "epoch": 0.49191996921893033, + "flos": 654251765760.0, + "grad_norm": 0.06456480219532172, + "language_loss": 0.80659723, + "learning_rate": 0.0005373502529113162, + "loss": 0.81748366, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.30883789, + "step": 2557, + "time_per_iteration": 2.8043599128723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092017, + "balance_loss_mlp": 1.06009305, + "epoch": 0.4921123509041939, + "flos": 492101194752.0, + "grad_norm": 0.08818279105065703, + "language_loss": 0.81143486, + "learning_rate": 0.0005370395738159773, + "loss": 0.82235509, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.3190918, + "step": 2558, + "time_per_iteration": 2.6536951065063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086446, + "balance_loss_mlp": 1.05516589, + "epoch": 0.4923047325894575, + "flos": 545907935232.0, + "grad_norm": 0.0699028851556838, + "language_loss": 0.83194804, + "learning_rate": 0.0005367288803406003, + "loss": 0.84281248, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.3125, + "step": 2559, + "time_per_iteration": 2.6608238220214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092322, + "balance_loss_mlp": 1.06075501, + "epoch": 0.49249711427472104, + "flos": 596195043840.0, + "grad_norm": 0.05624800088650225, + "language_loss": 0.81485915, + "learning_rate": 0.0005364181726058073, + "loss": 0.82578236, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.31542969, + "step": 2560, + "time_per_iteration": 2.7245399951934814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108461, + "balance_loss_mlp": 1.05354452, + "epoch": 0.4926894959599846, + "flos": 497580682752.0, + "grad_norm": 0.0657433103973406, + "language_loss": 0.82255721, + "learning_rate": 0.0005361074507322261, + "loss": 0.83340329, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.31030273, + "step": 2561, + "time_per_iteration": 2.632309913635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085089, + "balance_loss_mlp": 1.05359399, + "epoch": 0.49288187764524816, + "flos": 535868545536.0, + "grad_norm": 0.06588348626271129, + "language_loss": 0.81683809, + "learning_rate": 0.000535796714840489, + "loss": 0.82768893, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.31494141, + "step": 2562, + "time_per_iteration": 2.6455063819885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107827, + "balance_loss_mlp": 1.04686987, + "epoch": 0.49307425933051174, + "flos": 641267504640.0, + "grad_norm": 0.07506734855649709, + "language_loss": 0.84067267, + "learning_rate": 0.0005354859650512348, + "loss": 0.85145533, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.3137207, + "step": 2563, + "time_per_iteration": 2.8065779209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075102, + "balance_loss_mlp": 1.04396451, + "epoch": 0.4932666410157753, + "flos": 516000761856.0, + "grad_norm": 0.06295276436461052, + "language_loss": 0.87103295, + "learning_rate": 0.0005351752014851074, + "loss": 0.88178396, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.31103516, + "step": 2564, + "time_per_iteration": 2.573575019836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078018, + "balance_loss_mlp": 1.04654717, + "epoch": 0.49345902270103886, + "flos": 601217634816.0, + "grad_norm": 0.06464744293940616, + "language_loss": 0.83104938, + "learning_rate": 0.0005348644242627553, + "loss": 0.84182954, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.31445312, + "step": 2565, + "time_per_iteration": 2.730455160140991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059118, + "balance_loss_mlp": 1.0458622, + "epoch": 0.49365140438630245, + "flos": 1492849585152.0, + "grad_norm": 0.030733727476311833, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76345742, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.1328125, + "step": 2566, + "time_per_iteration": 4.939255237579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083828, + "balance_loss_mlp": 1.05290508, + "epoch": 0.493843786071566, + "flos": 629303164416.0, + "grad_norm": 0.06048394989907295, + "language_loss": 0.81127739, + "learning_rate": 0.0005342428293320013, + "loss": 0.82211566, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.30908203, + "step": 2567, + "time_per_iteration": 2.7613086700439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079847, + "balance_loss_mlp": 1.04899621, + "epoch": 0.49403616775682957, + "flos": 617283569664.0, + "grad_norm": 0.0745931351859795, + "language_loss": 0.83762527, + "learning_rate": 0.0005339320118649238, + "loss": 0.84842372, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.30810547, + "step": 2568, + "time_per_iteration": 2.6934940814971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077697, + "balance_loss_mlp": 1.04763281, + "epoch": 0.4942285494420931, + "flos": 577357355520.0, + "grad_norm": 0.16404827309636982, + "language_loss": 0.86383307, + "learning_rate": 0.000533621181224271, + "loss": 0.87461007, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.30053711, + "step": 2569, + "time_per_iteration": 2.7757997512817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078612, + "balance_loss_mlp": 1.04737914, + "epoch": 0.4944209311273567, + "flos": 629899683840.0, + "grad_norm": 0.06859593656518678, + "language_loss": 0.81795698, + "learning_rate": 0.0005333103375307182, + "loss": 0.8287431, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.31201172, + "step": 2570, + "time_per_iteration": 2.8319950103759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074501, + "balance_loss_mlp": 1.043221, + "epoch": 0.4946133128126202, + "flos": 587337108480.0, + "grad_norm": 0.05293986738306163, + "language_loss": 0.86142224, + "learning_rate": 0.0005329994809049451, + "loss": 0.87216723, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.3125, + "step": 2571, + "time_per_iteration": 2.7592415809631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075993, + "balance_loss_mlp": 1.04540396, + "epoch": 0.4948056944978838, + "flos": 583437745152.0, + "grad_norm": 0.05076322771290774, + "language_loss": 0.87883997, + "learning_rate": 0.0005326886114676375, + "loss": 0.88959992, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.30541992, + "step": 2572, + "time_per_iteration": 2.9501779079437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077876, + "balance_loss_mlp": 1.0463568, + "epoch": 0.49499807618314734, + "flos": 481583149056.0, + "grad_norm": 0.06323365720535751, + "language_loss": 0.87792003, + "learning_rate": 0.0005323777293394854, + "loss": 0.8886987, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.31494141, + "step": 2573, + "time_per_iteration": 2.55361008644104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107249, + "balance_loss_mlp": 1.03975475, + "epoch": 0.4951904578684109, + "flos": 518714288640.0, + "grad_norm": 0.05535210432037286, + "language_loss": 0.81776071, + "learning_rate": 0.000532066834641184, + "loss": 0.82848555, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.32739258, + "step": 2574, + "time_per_iteration": 2.6631722450256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070737, + "balance_loss_mlp": 1.03900313, + "epoch": 0.4953828395536745, + "flos": 535238530560.0, + "grad_norm": 0.06817735062049093, + "language_loss": 0.8516283, + "learning_rate": 0.0005317559274934334, + "loss": 0.86233568, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.31713867, + "step": 2575, + "time_per_iteration": 2.7345352172851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072086, + "balance_loss_mlp": 1.03894639, + "epoch": 0.49557522123893805, + "flos": 528305545728.0, + "grad_norm": 0.05802348124776455, + "language_loss": 0.80394173, + "learning_rate": 0.0005314450080169382, + "loss": 0.81466264, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.33154297, + "step": 2576, + "time_per_iteration": 2.6343159675598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076196, + "balance_loss_mlp": 1.04391456, + "epoch": 0.49576760292420163, + "flos": 427780790784.0, + "grad_norm": 0.07974947058861337, + "language_loss": 0.80607754, + "learning_rate": 0.0005311340763324083, + "loss": 0.81683946, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.32275391, + "step": 2577, + "time_per_iteration": 2.557796001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078498, + "balance_loss_mlp": 1.04557252, + "epoch": 0.49595998460946517, + "flos": 564968203776.0, + "grad_norm": 0.05295897633494548, + "language_loss": 0.82240456, + "learning_rate": 0.0005308231325605578, + "loss": 0.83318955, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.32910156, + "step": 2578, + "time_per_iteration": 2.6799750328063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072444, + "balance_loss_mlp": 1.03992367, + "epoch": 0.49615236629472875, + "flos": 702161409024.0, + "grad_norm": 0.05054804003557779, + "language_loss": 0.7645728, + "learning_rate": 0.0005305121768221061, + "loss": 0.77529716, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.32519531, + "step": 2579, + "time_per_iteration": 3.074568748474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025075, + "balance_loss_mlp": 1.01057923, + "epoch": 0.4963447479799923, + "flos": 1440896573952.0, + "grad_norm": 0.02258142627415349, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76063395, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.14453125, + "step": 2580, + "time_per_iteration": 4.807044267654419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079853, + "balance_loss_mlp": 1.04749966, + "epoch": 0.49653712966525587, + "flos": 537370094592.0, + "grad_norm": 0.06889886772880317, + "language_loss": 0.9145242, + "learning_rate": 0.0005298902299282984, + "loss": 0.92532271, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.32348633, + "step": 2581, + "time_per_iteration": 2.6145668029785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077544, + "balance_loss_mlp": 1.04561996, + "epoch": 0.4967295113505194, + "flos": 607002660864.0, + "grad_norm": 0.06407878407439609, + "language_loss": 0.84137404, + "learning_rate": 0.0005295792390144033, + "loss": 0.85214949, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.3190918, + "step": 2582, + "time_per_iteration": 2.71272873878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083171, + "balance_loss_mlp": 1.05103219, + "epoch": 0.496921893035783, + "flos": 474340243968.0, + "grad_norm": 0.07436197165654145, + "language_loss": 0.83241105, + "learning_rate": 0.0005292682366168294, + "loss": 0.84324276, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.32128906, + "step": 2583, + "time_per_iteration": 2.5284125804901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082483, + "balance_loss_mlp": 1.05079746, + "epoch": 0.4971142747210466, + "flos": 597180059136.0, + "grad_norm": 0.07965760723765093, + "language_loss": 0.79750967, + "learning_rate": 0.0005289572228563181, + "loss": 0.80833459, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.31665039, + "step": 2584, + "time_per_iteration": 2.802370548248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080479, + "balance_loss_mlp": 1.04862666, + "epoch": 0.4973066564063101, + "flos": 599321797632.0, + "grad_norm": 0.06536047089469768, + "language_loss": 0.83144403, + "learning_rate": 0.000528646197853616, + "loss": 0.84224886, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.31835938, + "step": 2585, + "time_per_iteration": 2.7075467109680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076886, + "balance_loss_mlp": 1.04748917, + "epoch": 0.4974990380915737, + "flos": 649152009216.0, + "grad_norm": 0.11136041462628715, + "language_loss": 0.85364115, + "learning_rate": 0.0005283351617294735, + "loss": 0.86440998, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.29370117, + "step": 2586, + "time_per_iteration": 2.940826892852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027614, + "balance_loss_mlp": 1.0143584, + "epoch": 0.49769141977683723, + "flos": 1528490912256.0, + "grad_norm": 0.01813039431029953, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.7766428, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.1328125, + "step": 2587, + "time_per_iteration": 4.996971368789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082207, + "balance_loss_mlp": 1.05278599, + "epoch": 0.4978838014621008, + "flos": 536114446848.0, + "grad_norm": 0.05663819997496981, + "language_loss": 0.86729956, + "learning_rate": 0.0005277130565998916, + "loss": 0.87812161, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.29394531, + "step": 2588, + "time_per_iteration": 2.7356040477752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083607, + "balance_loss_mlp": 1.05401921, + "epoch": 0.49807618314736435, + "flos": 539335742976.0, + "grad_norm": 0.07264241635107661, + "language_loss": 0.82111955, + "learning_rate": 0.0005274019878359748, + "loss": 0.83195567, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.29541016, + "step": 2589, + "time_per_iteration": 2.7199792861938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081352, + "balance_loss_mlp": 1.05102515, + "epoch": 0.49826856483262794, + "flos": 542215185408.0, + "grad_norm": 0.07554474334702437, + "language_loss": 0.86675328, + "learning_rate": 0.0005270909084336628, + "loss": 0.87756681, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.30297852, + "step": 2590, + "time_per_iteration": 2.6305181980133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080877, + "balance_loss_mlp": 1.05045462, + "epoch": 0.4984609465178915, + "flos": 522062212608.0, + "grad_norm": 0.06751539177219479, + "language_loss": 0.89032745, + "learning_rate": 0.0005267798185137276, + "loss": 0.90113628, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.30371094, + "step": 2591, + "time_per_iteration": 2.608088254928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088235, + "balance_loss_mlp": 1.05743146, + "epoch": 0.49865332820315506, + "flos": 574255332864.0, + "grad_norm": 0.0633807963563003, + "language_loss": 0.8924402, + "learning_rate": 0.0005264687181969444, + "loss": 0.90332258, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.30786133, + "step": 2592, + "time_per_iteration": 2.729546308517456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088496, + "balance_loss_mlp": 1.05931377, + "epoch": 0.49884570988841864, + "flos": 1013207657472.0, + "grad_norm": 0.06112732681279078, + "language_loss": 0.75084651, + "learning_rate": 0.0005261576076040937, + "loss": 0.76173151, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.29199219, + "step": 2593, + "time_per_iteration": 3.265289783477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082947, + "balance_loss_mlp": 1.05281067, + "epoch": 0.4990380915736822, + "flos": 559315597824.0, + "grad_norm": 0.0783599565062882, + "language_loss": 0.84088343, + "learning_rate": 0.0005258464868559591, + "loss": 0.85171294, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.30078125, + "step": 2594, + "time_per_iteration": 2.657191514968872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080029, + "balance_loss_mlp": 1.04991674, + "epoch": 0.49923047325894576, + "flos": 498708292608.0, + "grad_norm": 0.0699675322535813, + "language_loss": 0.88836402, + "learning_rate": 0.0005255353560733284, + "loss": 0.89916426, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.30102539, + "step": 2595, + "time_per_iteration": 2.570439100265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010588, + "balance_loss_mlp": 1.04640186, + "epoch": 0.4994228549442093, + "flos": 1495851273216.0, + "grad_norm": 0.029272008197333242, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76637447, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.12353516, + "step": 2596, + "time_per_iteration": 4.808587074279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084167, + "balance_loss_mlp": 1.05476975, + "epoch": 0.4996152366294729, + "flos": 557090901504.0, + "grad_norm": 0.052965599041123274, + "language_loss": 0.83342099, + "learning_rate": 0.0005249130648877492, + "loss": 0.84426272, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.29370117, + "step": 2597, + "time_per_iteration": 2.7453384399414062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010849, + "balance_loss_mlp": 1.05524063, + "epoch": 0.4998076183147364, + "flos": 415372700160.0, + "grad_norm": 0.05960347084431116, + "language_loss": 0.84714389, + "learning_rate": 0.0005246019047263953, + "loss": 0.85799289, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.29614258, + "step": 2598, + "time_per_iteration": 2.488004684448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091385, + "balance_loss_mlp": 1.06220269, + "epoch": 0.5, + "flos": 467107513344.0, + "grad_norm": 0.06961248878544336, + "language_loss": 0.8223601, + "learning_rate": 0.0005242907350137353, + "loss": 0.83327389, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.29174805, + "step": 2599, + "time_per_iteration": 2.550495147705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092431, + "balance_loss_mlp": 1.06422567, + "epoch": 0.5001923816852636, + "flos": 482460475392.0, + "grad_norm": 0.06813860338073652, + "language_loss": 0.78928339, + "learning_rate": 0.0005239795558705754, + "loss": 0.80020773, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.28198242, + "step": 2600, + "time_per_iteration": 2.656519889831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094846, + "balance_loss_mlp": 1.06492448, + "epoch": 0.5003847633705272, + "flos": 533534750208.0, + "grad_norm": 0.05508549334218052, + "language_loss": 0.89073658, + "learning_rate": 0.0005236683674177264, + "loss": 0.90168506, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.29907227, + "step": 2601, + "time_per_iteration": 2.63960337638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098261, + "balance_loss_mlp": 1.06886423, + "epoch": 0.5005771450557907, + "flos": 737473876992.0, + "grad_norm": 0.06683201790232274, + "language_loss": 0.82384604, + "learning_rate": 0.0005233571697760021, + "loss": 0.83482862, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.29345703, + "step": 2602, + "time_per_iteration": 2.859165668487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097705, + "balance_loss_mlp": 1.06814075, + "epoch": 0.5007695267410542, + "flos": 778646974464.0, + "grad_norm": 0.06216601268510387, + "language_loss": 0.83124363, + "learning_rate": 0.0005230459630662203, + "loss": 0.84222066, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.29541016, + "step": 2603, + "time_per_iteration": 2.9592032432556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093592, + "balance_loss_mlp": 1.06479144, + "epoch": 0.5009619084263178, + "flos": 623192251392.0, + "grad_norm": 0.0707725537041266, + "language_loss": 0.81070089, + "learning_rate": 0.0005227347474092022, + "loss": 0.8216368, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.2878418, + "step": 2604, + "time_per_iteration": 2.7389962673187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094636, + "balance_loss_mlp": 1.06545365, + "epoch": 0.5011542901115814, + "flos": 530812459008.0, + "grad_norm": 0.05232832672790962, + "language_loss": 0.83514917, + "learning_rate": 0.0005224235229257724, + "loss": 0.84609556, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.29174805, + "step": 2605, + "time_per_iteration": 2.687992811203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088826, + "balance_loss_mlp": 1.05914283, + "epoch": 0.5013466717968449, + "flos": 527262303744.0, + "grad_norm": 0.056206575952308185, + "language_loss": 0.8630116, + "learning_rate": 0.0005221122897367589, + "loss": 0.87389988, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.29614258, + "step": 2606, + "time_per_iteration": 2.787410259246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088135, + "balance_loss_mlp": 1.05861855, + "epoch": 0.5015390534821085, + "flos": 565750987776.0, + "grad_norm": 0.07695466326694751, + "language_loss": 0.81035262, + "learning_rate": 0.0005218010479629932, + "loss": 0.82123399, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.29467773, + "step": 2607, + "time_per_iteration": 2.6562912464141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091646, + "balance_loss_mlp": 1.06177175, + "epoch": 0.5017314351673721, + "flos": 566430465024.0, + "grad_norm": 0.05799380231795743, + "language_loss": 0.81869501, + "learning_rate": 0.0005214897977253102, + "loss": 0.82961148, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.29833984, + "step": 2608, + "time_per_iteration": 2.6560218334198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084679, + "balance_loss_mlp": 1.05454254, + "epoch": 0.5019238168526357, + "flos": 522018542592.0, + "grad_norm": 0.06343008203006618, + "language_loss": 0.84223098, + "learning_rate": 0.0005211785391445473, + "loss": 0.85307777, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.30102539, + "step": 2609, + "time_per_iteration": 2.726686954498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081377, + "balance_loss_mlp": 1.05202734, + "epoch": 0.5021161985378992, + "flos": 641135084544.0, + "grad_norm": 0.06012661278609564, + "language_loss": 0.79186547, + "learning_rate": 0.0005208672723415467, + "loss": 0.80267924, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.29345703, + "step": 2610, + "time_per_iteration": 2.7944774627685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108238, + "balance_loss_mlp": 1.05212474, + "epoch": 0.5023085802231627, + "flos": 591000744960.0, + "grad_norm": 0.06559501481836318, + "language_loss": 0.79065204, + "learning_rate": 0.0005205559974371525, + "loss": 0.80147582, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.30224609, + "step": 2611, + "time_per_iteration": 2.7519257068634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081519, + "balance_loss_mlp": 1.05150175, + "epoch": 0.5025009619084263, + "flos": 472134486528.0, + "grad_norm": 0.05612255210767107, + "language_loss": 0.82192892, + "learning_rate": 0.0005202447145522123, + "loss": 0.83274412, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.29980469, + "step": 2612, + "time_per_iteration": 2.6770236492156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079077, + "balance_loss_mlp": 1.04965591, + "epoch": 0.5026933435936899, + "flos": 454906036224.0, + "grad_norm": 0.05250196134528315, + "language_loss": 0.79193181, + "learning_rate": 0.0005199334238075769, + "loss": 0.80272257, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.29370117, + "step": 2613, + "time_per_iteration": 2.5337562561035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107987, + "balance_loss_mlp": 1.04942441, + "epoch": 0.5028857252789535, + "flos": 491504675328.0, + "grad_norm": 0.0529792440436354, + "language_loss": 0.9204368, + "learning_rate": 0.0005196221253241, + "loss": 0.93123555, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.30419922, + "step": 2614, + "time_per_iteration": 2.564143657684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073952, + "balance_loss_mlp": 1.04276693, + "epoch": 0.503078106964217, + "flos": 625280145408.0, + "grad_norm": 0.06195019445138367, + "language_loss": 0.82918042, + "learning_rate": 0.0005193108192226383, + "loss": 0.83991992, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.31152344, + "step": 2615, + "time_per_iteration": 2.757087230682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080642, + "balance_loss_mlp": 1.04990983, + "epoch": 0.5032704886494805, + "flos": 578774536704.0, + "grad_norm": 0.05317989185447873, + "language_loss": 0.8697142, + "learning_rate": 0.000518999505624052, + "loss": 0.88052064, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.30712891, + "step": 2616, + "time_per_iteration": 2.7251224517822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078998, + "balance_loss_mlp": 1.04759884, + "epoch": 0.5034628703347441, + "flos": 471481150464.0, + "grad_norm": 0.059314577611761586, + "language_loss": 0.83379316, + "learning_rate": 0.000518688184649203, + "loss": 0.84458327, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.3137207, + "step": 2617, + "time_per_iteration": 2.809063673019409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107933, + "balance_loss_mlp": 1.04890776, + "epoch": 0.5036552520200077, + "flos": 489594281472.0, + "grad_norm": 0.08232681701976922, + "language_loss": 0.83759677, + "learning_rate": 0.0005183768564189577, + "loss": 0.8483901, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.30395508, + "step": 2618, + "time_per_iteration": 2.5442681312561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108616, + "balance_loss_mlp": 1.05502236, + "epoch": 0.5038476337052713, + "flos": 493991239680.0, + "grad_norm": 0.10233936422342303, + "language_loss": 0.81248713, + "learning_rate": 0.0005180655210541838, + "loss": 0.8233487, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.31103516, + "step": 2619, + "time_per_iteration": 2.5986533164978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107923, + "balance_loss_mlp": 1.04976153, + "epoch": 0.5040400153905348, + "flos": 600321369600.0, + "grad_norm": 0.10286286455085811, + "language_loss": 0.83096433, + "learning_rate": 0.0005177541786757527, + "loss": 0.84175664, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.29443359, + "step": 2620, + "time_per_iteration": 2.7542781829833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080594, + "balance_loss_mlp": 1.04971933, + "epoch": 0.5042323970757984, + "flos": 811178924544.0, + "grad_norm": 0.062363268760676084, + "language_loss": 0.82867718, + "learning_rate": 0.000517442829404538, + "loss": 0.83948314, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.30834961, + "step": 2621, + "time_per_iteration": 2.9758973121643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080161, + "balance_loss_mlp": 1.05000091, + "epoch": 0.504424778761062, + "flos": 626985335808.0, + "grad_norm": 0.06818258917584033, + "language_loss": 0.8721652, + "learning_rate": 0.0005171314733614166, + "loss": 0.88296676, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.30102539, + "step": 2622, + "time_per_iteration": 2.8933780193328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082583, + "balance_loss_mlp": 1.05235183, + "epoch": 0.5046171604463255, + "flos": 515651553792.0, + "grad_norm": 0.06917321427090362, + "language_loss": 0.78315443, + "learning_rate": 0.0005168201106672671, + "loss": 0.79398024, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.30200195, + "step": 2623, + "time_per_iteration": 2.763855457305908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081188, + "balance_loss_mlp": 1.05093241, + "epoch": 0.504809542131589, + "flos": 527576606208.0, + "grad_norm": 0.06294733427077812, + "language_loss": 0.84776348, + "learning_rate": 0.0005165087414429717, + "loss": 0.85857534, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.30200195, + "step": 2624, + "time_per_iteration": 2.6454148292541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080505, + "balance_loss_mlp": 1.04967785, + "epoch": 0.5050019238168526, + "flos": 553855048704.0, + "grad_norm": 0.07820570667172376, + "language_loss": 0.83597136, + "learning_rate": 0.0005161973658094144, + "loss": 0.84677643, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.30810547, + "step": 2625, + "time_per_iteration": 2.630192756652832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075312, + "balance_loss_mlp": 1.04562938, + "epoch": 0.5051943055021162, + "flos": 574486677504.0, + "grad_norm": 0.10754310805258371, + "language_loss": 0.8215518, + "learning_rate": 0.000515885983887482, + "loss": 0.83230495, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.29614258, + "step": 2626, + "time_per_iteration": 2.762484312057495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082022, + "balance_loss_mlp": 1.05179107, + "epoch": 0.5053866871873798, + "flos": 496438516224.0, + "grad_norm": 0.060931372363222436, + "language_loss": 0.84606075, + "learning_rate": 0.0005155745957980636, + "loss": 0.85688096, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.30175781, + "step": 2627, + "time_per_iteration": 2.597625494003296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075866, + "balance_loss_mlp": 1.04513431, + "epoch": 0.5055790688726434, + "flos": 501963084288.0, + "grad_norm": 0.060140239439456865, + "language_loss": 0.8829447, + "learning_rate": 0.000515263201662051, + "loss": 0.89370334, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.30688477, + "step": 2628, + "time_per_iteration": 2.676429510116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081664, + "balance_loss_mlp": 1.05162382, + "epoch": 0.5057714505579068, + "flos": 844844276736.0, + "grad_norm": 0.05201747216110034, + "language_loss": 0.82525623, + "learning_rate": 0.0005149518016003378, + "loss": 0.83607286, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.30004883, + "step": 2629, + "time_per_iteration": 3.1674108505249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078833, + "balance_loss_mlp": 1.04874492, + "epoch": 0.5059638322431704, + "flos": 497580682752.0, + "grad_norm": 0.12452297981638945, + "language_loss": 0.82290918, + "learning_rate": 0.0005146403957338206, + "loss": 0.83369756, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.30029297, + "step": 2630, + "time_per_iteration": 2.574908494949341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075266, + "balance_loss_mlp": 1.04415226, + "epoch": 0.506156213928434, + "flos": 617526498816.0, + "grad_norm": 0.054026792513587725, + "language_loss": 0.81795335, + "learning_rate": 0.0005143289841833975, + "loss": 0.82870597, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.31079102, + "step": 2631, + "time_per_iteration": 2.8753445148468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071822, + "balance_loss_mlp": 1.04044628, + "epoch": 0.5063485956136976, + "flos": 424624923648.0, + "grad_norm": 0.07665080268010696, + "language_loss": 0.82169271, + "learning_rate": 0.0005140175670699696, + "loss": 0.83241099, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.31347656, + "step": 2632, + "time_per_iteration": 2.606656551361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070677, + "balance_loss_mlp": 1.03989697, + "epoch": 0.5065409772989612, + "flos": 569641586688.0, + "grad_norm": 0.05365826465054309, + "language_loss": 0.82773447, + "learning_rate": 0.0005137061445144395, + "loss": 0.83844125, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.30737305, + "step": 2633, + "time_per_iteration": 2.908146619796753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107465, + "balance_loss_mlp": 1.0429641, + "epoch": 0.5067333589842247, + "flos": 628510205952.0, + "grad_norm": 0.06908817272508659, + "language_loss": 0.87031686, + "learning_rate": 0.000513394716637712, + "loss": 0.88106334, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.31665039, + "step": 2634, + "time_per_iteration": 2.804591417312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049781, + "balance_loss_mlp": 1.03547585, + "epoch": 0.5069257406694883, + "flos": 1447062741504.0, + "grad_norm": 0.027149993512400487, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80241489, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.14257812, + "step": 2635, + "time_per_iteration": 4.903238296508789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071488, + "balance_loss_mlp": 1.03977799, + "epoch": 0.5071181223547518, + "flos": 638530656768.0, + "grad_norm": 0.05829667092367474, + "language_loss": 0.80886006, + "learning_rate": 0.0005127718454042958, + "loss": 0.81957495, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.31689453, + "step": 2636, + "time_per_iteration": 2.81962513923645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076357, + "balance_loss_mlp": 1.04467094, + "epoch": 0.5073105040400154, + "flos": 713239658496.0, + "grad_norm": 0.06782185148260642, + "language_loss": 0.84239292, + "learning_rate": 0.0005124604022894269, + "loss": 0.85315657, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.31665039, + "step": 2637, + "time_per_iteration": 2.933143377304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023059, + "balance_loss_mlp": 1.00932586, + "epoch": 0.5075028857252789, + "flos": 1435658605056.0, + "grad_norm": 0.016037159370544805, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78211284, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.13769531, + "step": 2638, + "time_per_iteration": 4.81339168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080028, + "balance_loss_mlp": 1.04786575, + "epoch": 0.5076952674105425, + "flos": 570857946624.0, + "grad_norm": 0.058900205072543066, + "language_loss": 0.83262694, + "learning_rate": 0.0005118375016679325, + "loss": 0.84342724, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.3215332, + "step": 2639, + "time_per_iteration": 2.7476773262023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076278, + "balance_loss_mlp": 1.04490256, + "epoch": 0.5078876490958061, + "flos": 516463451136.0, + "grad_norm": 0.08436499818571505, + "language_loss": 0.80410182, + "learning_rate": 0.0005115260444031382, + "loss": 0.81486464, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.31347656, + "step": 2640, + "time_per_iteration": 2.579087734222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016776, + "balance_loss_mlp": 1.00361574, + "epoch": 0.5080800307810697, + "flos": 1583378620416.0, + "grad_norm": 0.010326775178219767, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79748595, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.13183594, + "step": 2641, + "time_per_iteration": 4.939114809036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077717, + "balance_loss_mlp": 1.04665077, + "epoch": 0.5082724124663333, + "flos": 484965978624.0, + "grad_norm": 0.06392423646026814, + "language_loss": 0.86441147, + "learning_rate": 0.0005109031165700483, + "loss": 0.87518859, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.31030273, + "step": 2642, + "time_per_iteration": 2.572248935699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078927, + "balance_loss_mlp": 1.04809904, + "epoch": 0.5084647941515967, + "flos": 681928450560.0, + "grad_norm": 0.08514760687851525, + "language_loss": 0.83290648, + "learning_rate": 0.0005105916462435945, + "loss": 0.84369576, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.30786133, + "step": 2643, + "time_per_iteration": 2.832653284072876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081246, + "balance_loss_mlp": 1.05089569, + "epoch": 0.5086571758368603, + "flos": 548468692992.0, + "grad_norm": 0.05584396132467612, + "language_loss": 0.85012162, + "learning_rate": 0.0005102801718050989, + "loss": 0.86093414, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.30322266, + "step": 2644, + "time_per_iteration": 2.6693568229675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077361, + "balance_loss_mlp": 1.04755831, + "epoch": 0.5088495575221239, + "flos": 563751843840.0, + "grad_norm": 0.07396400679887168, + "language_loss": 0.89154196, + "learning_rate": 0.0005099686933754867, + "loss": 0.9023155, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.29785156, + "step": 2645, + "time_per_iteration": 2.688992977142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080157, + "balance_loss_mlp": 1.05016422, + "epoch": 0.5090419392073875, + "flos": 551132757504.0, + "grad_norm": 0.06521042739972126, + "language_loss": 0.84349567, + "learning_rate": 0.0005096572110756845, + "loss": 0.85429722, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.29956055, + "step": 2646, + "time_per_iteration": 2.694018840789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080367, + "balance_loss_mlp": 1.05065989, + "epoch": 0.509234320892651, + "flos": 567504230400.0, + "grad_norm": 0.049776737751643374, + "language_loss": 0.85623205, + "learning_rate": 0.0005093457250266205, + "loss": 0.86703575, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.296875, + "step": 2647, + "time_per_iteration": 2.69240665435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085077, + "balance_loss_mlp": 1.05527472, + "epoch": 0.5094267025779146, + "flos": 582339248640.0, + "grad_norm": 0.0639130152108818, + "language_loss": 0.83146644, + "learning_rate": 0.000509034235349224, + "loss": 0.84231722, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.29760742, + "step": 2648, + "time_per_iteration": 2.69409441947937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084417, + "balance_loss_mlp": 1.05499578, + "epoch": 0.5096190842631781, + "flos": 591704953344.0, + "grad_norm": 0.07990516858852505, + "language_loss": 0.81340408, + "learning_rate": 0.0005087227421644266, + "loss": 0.82424831, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.29345703, + "step": 2649, + "time_per_iteration": 2.7338664531707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087876, + "balance_loss_mlp": 1.05795491, + "epoch": 0.5098114659484417, + "flos": 513307584000.0, + "grad_norm": 0.06481094949829869, + "language_loss": 0.86482179, + "learning_rate": 0.0005084112455931602, + "loss": 0.87570059, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.29907227, + "step": 2650, + "time_per_iteration": 2.5772013664245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085843, + "balance_loss_mlp": 1.05561161, + "epoch": 0.5100038476337053, + "flos": 484389808128.0, + "grad_norm": 0.060404574220966636, + "language_loss": 0.84966755, + "learning_rate": 0.0005080997457563586, + "loss": 0.86052603, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.30200195, + "step": 2651, + "time_per_iteration": 2.5539023876190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089212, + "balance_loss_mlp": 1.05895662, + "epoch": 0.5101962293189688, + "flos": 461366157312.0, + "grad_norm": 0.06895787175374923, + "language_loss": 0.79026747, + "learning_rate": 0.0005077882427749569, + "loss": 0.80115962, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.30224609, + "step": 2652, + "time_per_iteration": 2.5036137104034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093546, + "balance_loss_mlp": 1.06367242, + "epoch": 0.5103886110042324, + "flos": 586760937984.0, + "grad_norm": 0.06232251007114316, + "language_loss": 0.84676695, + "learning_rate": 0.0005074767367698913, + "loss": 0.85770237, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.29833984, + "step": 2653, + "time_per_iteration": 2.6879539489746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088747, + "balance_loss_mlp": 1.05875421, + "epoch": 0.510580992689496, + "flos": 844906885632.0, + "grad_norm": 0.07002300864013745, + "language_loss": 0.83262461, + "learning_rate": 0.0005071652278620988, + "loss": 0.84351206, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.29956055, + "step": 2654, + "time_per_iteration": 3.048330307006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093234, + "balance_loss_mlp": 1.06369376, + "epoch": 0.5107733743747596, + "flos": 658328629248.0, + "grad_norm": 0.077240918193036, + "language_loss": 0.83515394, + "learning_rate": 0.0005068537161725186, + "loss": 0.84608626, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.29492188, + "step": 2655, + "time_per_iteration": 2.7864887714385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088669, + "balance_loss_mlp": 1.05941546, + "epoch": 0.510965756060023, + "flos": 701426677248.0, + "grad_norm": 0.06396168128091786, + "language_loss": 0.84455109, + "learning_rate": 0.0005065422018220893, + "loss": 0.85543782, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.29223633, + "step": 2656, + "time_per_iteration": 2.8869853019714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095041, + "balance_loss_mlp": 1.0650475, + "epoch": 0.5111581377452866, + "flos": 559430489088.0, + "grad_norm": 0.0709037558233959, + "language_loss": 0.7998327, + "learning_rate": 0.0005062306849317521, + "loss": 0.81078309, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.29956055, + "step": 2657, + "time_per_iteration": 2.7980425357818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010852, + "balance_loss_mlp": 1.05484891, + "epoch": 0.5113505194305502, + "flos": 608745729024.0, + "grad_norm": 0.0652959904845647, + "language_loss": 0.83424717, + "learning_rate": 0.0005059191656224487, + "loss": 0.84509915, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.30297852, + "step": 2658, + "time_per_iteration": 2.735557794570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085406, + "balance_loss_mlp": 1.05488813, + "epoch": 0.5115429011158138, + "flos": 534214227456.0, + "grad_norm": 0.05645977889013881, + "language_loss": 0.89198554, + "learning_rate": 0.0005056076440151212, + "loss": 0.90283966, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.3046875, + "step": 2659, + "time_per_iteration": 2.651273012161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136875, + "balance_loss_mlp": 1.12314212, + "epoch": 0.5117352828010774, + "flos": 1361451580416.0, + "grad_norm": 0.05420368374393455, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77424991, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.13769531, + "step": 2660, + "time_per_iteration": 4.8447229862213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085456, + "balance_loss_mlp": 1.05689311, + "epoch": 0.5119276644863409, + "flos": 633444046848.0, + "grad_norm": 0.04523661755748661, + "language_loss": 0.87268543, + "learning_rate": 0.0005049845943901691, + "loss": 0.88354003, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.28515625, + "step": 2661, + "time_per_iteration": 2.855107307434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079963, + "balance_loss_mlp": 1.05092359, + "epoch": 0.5121200461716044, + "flos": 585304468992.0, + "grad_norm": 0.05522645200412479, + "language_loss": 0.86379933, + "learning_rate": 0.0005046730666144338, + "loss": 0.87459898, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.2902832, + "step": 2662, + "time_per_iteration": 2.841339349746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082682, + "balance_loss_mlp": 1.05390453, + "epoch": 0.512312427856868, + "flos": 1032081661440.0, + "grad_norm": 0.05374936854204756, + "language_loss": 0.87915027, + "learning_rate": 0.0005043615370244532, + "loss": 0.8899771, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.2878418, + "step": 2663, + "time_per_iteration": 3.364856004714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049781, + "balance_loss_mlp": 1.03728747, + "epoch": 0.5125048095421316, + "flos": 1537257277440.0, + "grad_norm": 0.022479341124125186, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79294169, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.125, + "step": 2664, + "time_per_iteration": 4.635313510894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080439, + "balance_loss_mlp": 1.05163848, + "epoch": 0.5126971912273951, + "flos": 590814480384.0, + "grad_norm": 0.04479435391735135, + "language_loss": 0.85200715, + "learning_rate": 0.0005037384728855425, + "loss": 0.86281157, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.28808594, + "step": 2665, + "time_per_iteration": 2.7995188236236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083297, + "balance_loss_mlp": 1.05356586, + "epoch": 0.5128895729126587, + "flos": 551393215488.0, + "grad_norm": 0.0801864670549744, + "language_loss": 0.84280151, + "learning_rate": 0.0005034269385785075, + "loss": 0.85363448, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.29711914, + "step": 2666, + "time_per_iteration": 2.673332929611206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090699, + "balance_loss_mlp": 1.0623982, + "epoch": 0.5130819545979223, + "flos": 481031709696.0, + "grad_norm": 0.06501156427369086, + "language_loss": 0.84454274, + "learning_rate": 0.0005031154029410168, + "loss": 0.85544968, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.28344727, + "step": 2667, + "time_per_iteration": 2.5442566871643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086564, + "balance_loss_mlp": 1.0577395, + "epoch": 0.5132743362831859, + "flos": 475556603904.0, + "grad_norm": 0.06480382372099369, + "language_loss": 0.86841118, + "learning_rate": 0.0005028038660940197, + "loss": 0.87927675, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.28808594, + "step": 2668, + "time_per_iteration": 2.62888765335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077032, + "balance_loss_mlp": 1.04832673, + "epoch": 0.5134667179684494, + "flos": 503559175680.0, + "grad_norm": 0.05084400085528349, + "language_loss": 0.84573722, + "learning_rate": 0.0005024923281584648, + "loss": 0.85650754, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.28662109, + "step": 2669, + "time_per_iteration": 2.6316568851470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092041, + "balance_loss_mlp": 1.06312072, + "epoch": 0.5136590996537129, + "flos": 503647925760.0, + "grad_norm": 0.05870793453685439, + "language_loss": 0.82656723, + "learning_rate": 0.0005021807892553026, + "loss": 0.83748764, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.28881836, + "step": 2670, + "time_per_iteration": 2.707345724105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093085, + "balance_loss_mlp": 1.06457078, + "epoch": 0.5138514813389765, + "flos": 624330035712.0, + "grad_norm": 0.08829821247143162, + "language_loss": 0.84517181, + "learning_rate": 0.0005018692495054828, + "loss": 0.85610259, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.28540039, + "step": 2671, + "time_per_iteration": 2.758309841156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092768, + "balance_loss_mlp": 1.06399131, + "epoch": 0.5140438630242401, + "flos": 583274801664.0, + "grad_norm": 0.05555500929459815, + "language_loss": 0.80821186, + "learning_rate": 0.0005015577090299561, + "loss": 0.8191396, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.28735352, + "step": 2672, + "time_per_iteration": 2.6883137226104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090029, + "balance_loss_mlp": 1.06125236, + "epoch": 0.5142362447095037, + "flos": 487683887616.0, + "grad_norm": 0.06705414985084517, + "language_loss": 0.86672199, + "learning_rate": 0.0005012461679496729, + "loss": 0.87762225, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.28759766, + "step": 2673, + "time_per_iteration": 2.5949177742004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092599, + "balance_loss_mlp": 1.0630827, + "epoch": 0.5144286263947672, + "flos": 526601765376.0, + "grad_norm": 0.06054107713253035, + "language_loss": 0.87204134, + "learning_rate": 0.0005009346263855848, + "loss": 0.88296735, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.29467773, + "step": 2674, + "time_per_iteration": 2.6084070205688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093368, + "balance_loss_mlp": 1.06401849, + "epoch": 0.5146210080800308, + "flos": 486252149760.0, + "grad_norm": 0.08912792131396882, + "language_loss": 0.83928424, + "learning_rate": 0.0005006230844586422, + "loss": 0.85021788, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.29345703, + "step": 2675, + "time_per_iteration": 2.8340024948120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094006, + "balance_loss_mlp": 1.06496692, + "epoch": 0.5148133897652943, + "flos": 515622440448.0, + "grad_norm": 0.06185145068902706, + "language_loss": 0.79025733, + "learning_rate": 0.0005003115422897968, + "loss": 0.80119741, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.29052734, + "step": 2676, + "time_per_iteration": 2.7350447177886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088176, + "balance_loss_mlp": 1.05780196, + "epoch": 0.5150057714505579, + "flos": 510963614208.0, + "grad_norm": 0.06610854708750855, + "language_loss": 0.86982405, + "learning_rate": 0.0005, + "loss": 0.88070583, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.30322266, + "step": 2677, + "time_per_iteration": 2.62941837310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082976, + "balance_loss_mlp": 1.0535078, + "epoch": 0.5151981531358215, + "flos": 910541164032.0, + "grad_norm": 0.05650592481949535, + "language_loss": 0.7918483, + "learning_rate": 0.0004996884577102033, + "loss": 0.80267811, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.29418945, + "step": 2678, + "time_per_iteration": 3.1128311157226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085723, + "balance_loss_mlp": 1.05577731, + "epoch": 0.515390534821085, + "flos": 471599013888.0, + "grad_norm": 0.05289591163695072, + "language_loss": 0.84550285, + "learning_rate": 0.000499376915541358, + "loss": 0.85636008, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.29907227, + "step": 2679, + "time_per_iteration": 2.709259271621704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082197, + "balance_loss_mlp": 1.0510838, + "epoch": 0.5155829165063486, + "flos": 649811137536.0, + "grad_norm": 0.05812477607611756, + "language_loss": 0.81116259, + "learning_rate": 0.0004990653736144155, + "loss": 0.82198453, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.31079102, + "step": 2680, + "time_per_iteration": 2.8433125019073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083796, + "balance_loss_mlp": 1.05318332, + "epoch": 0.5157752981916122, + "flos": 414038476800.0, + "grad_norm": 0.06443376303588658, + "language_loss": 0.8582924, + "learning_rate": 0.0004987538320503271, + "loss": 0.86913037, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.30566406, + "step": 2681, + "time_per_iteration": 2.492128372192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079646, + "balance_loss_mlp": 1.04860437, + "epoch": 0.5159676798768758, + "flos": 553569859584.0, + "grad_norm": 0.06119575969443392, + "language_loss": 0.83057904, + "learning_rate": 0.0004984422909700442, + "loss": 0.84137553, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.31005859, + "step": 2682, + "time_per_iteration": 2.6817965507507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107717, + "balance_loss_mlp": 1.04560328, + "epoch": 0.5161600615621393, + "flos": 586234229760.0, + "grad_norm": 0.06357079240733023, + "language_loss": 0.83849651, + "learning_rate": 0.0004981307504945173, + "loss": 0.84926826, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.31542969, + "step": 2683, + "time_per_iteration": 2.6884219646453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107835, + "balance_loss_mlp": 1.04764211, + "epoch": 0.5163524432474028, + "flos": 588568025088.0, + "grad_norm": 0.058627663819765745, + "language_loss": 0.89028186, + "learning_rate": 0.0004978192107446976, + "loss": 0.90106535, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.30664062, + "step": 2684, + "time_per_iteration": 2.7606394290924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074512, + "balance_loss_mlp": 1.04397011, + "epoch": 0.5165448249326664, + "flos": 503642133504.0, + "grad_norm": 0.05338243685455816, + "language_loss": 0.870161, + "learning_rate": 0.0004975076718415353, + "loss": 0.88090611, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.30493164, + "step": 2685, + "time_per_iteration": 2.594937562942505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081075, + "balance_loss_mlp": 1.04991364, + "epoch": 0.51673720661793, + "flos": 416539597824.0, + "grad_norm": 0.06078629774986462, + "language_loss": 0.90568233, + "learning_rate": 0.0004971961339059806, + "loss": 0.91649306, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.3112793, + "step": 2686, + "time_per_iteration": 2.4705729484558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075772, + "balance_loss_mlp": 1.04406273, + "epoch": 0.5169295883031936, + "flos": 598696164864.0, + "grad_norm": 0.067622669815522, + "language_loss": 0.83813852, + "learning_rate": 0.0004968845970589832, + "loss": 0.84889627, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.31689453, + "step": 2687, + "time_per_iteration": 2.6784517765045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108779, + "balance_loss_mlp": 1.05760634, + "epoch": 0.517121969988457, + "flos": 556543844352.0, + "grad_norm": 0.06982295057413529, + "language_loss": 0.84568465, + "learning_rate": 0.0004965730614214926, + "loss": 0.85656255, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.30151367, + "step": 2688, + "time_per_iteration": 2.628742218017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078435, + "balance_loss_mlp": 1.0470829, + "epoch": 0.5173143516737206, + "flos": 469214346240.0, + "grad_norm": 0.06558972316908819, + "language_loss": 0.85422957, + "learning_rate": 0.0004962615271144576, + "loss": 0.86501396, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.31323242, + "step": 2689, + "time_per_iteration": 2.5566818714141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079558, + "balance_loss_mlp": 1.04923093, + "epoch": 0.5175067333589842, + "flos": 719739067392.0, + "grad_norm": 0.32559574880762837, + "language_loss": 0.82639515, + "learning_rate": 0.0004959499942588264, + "loss": 0.83719069, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.30273438, + "step": 2690, + "time_per_iteration": 2.8994317054748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057487, + "balance_loss_mlp": 1.04442203, + "epoch": 0.5176991150442478, + "flos": 1465402834944.0, + "grad_norm": 0.028996752449645728, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79257512, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.13085938, + "step": 2691, + "time_per_iteration": 4.746784687042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109471, + "balance_loss_mlp": 1.07830977, + "epoch": 0.5178914967295114, + "flos": 612345346560.0, + "grad_norm": 0.12339515707636219, + "language_loss": 0.85558736, + "learning_rate": 0.0004953269333855661, + "loss": 0.86668211, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.3112793, + "step": 2692, + "time_per_iteration": 2.8191914558410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109212, + "balance_loss_mlp": 1.07991028, + "epoch": 0.5180838784147749, + "flos": 500663766528.0, + "grad_norm": 0.07785846219337349, + "language_loss": 0.84034789, + "learning_rate": 0.0004950154056098309, + "loss": 0.85143995, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.29272461, + "step": 2693, + "time_per_iteration": 2.686821222305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129818, + "balance_loss_mlp": 1.09963465, + "epoch": 0.5182762601000385, + "flos": 688531166208.0, + "grad_norm": 0.07144537100010277, + "language_loss": 0.83820134, + "learning_rate": 0.0004947038797692867, + "loss": 0.84949952, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.30126953, + "step": 2694, + "time_per_iteration": 2.8041090965270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128051, + "balance_loss_mlp": 1.09741426, + "epoch": 0.518468641785302, + "flos": 665315458560.0, + "grad_norm": 0.06183052783496024, + "language_loss": 0.77540803, + "learning_rate": 0.0004943923559848789, + "loss": 0.78668851, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.3059082, + "step": 2695, + "time_per_iteration": 2.797661781311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127895, + "balance_loss_mlp": 1.09756875, + "epoch": 0.5186610234705656, + "flos": 566440639488.0, + "grad_norm": 0.054443821670517534, + "language_loss": 0.90626478, + "learning_rate": 0.0004940808343775515, + "loss": 0.91754371, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.30297852, + "step": 2696, + "time_per_iteration": 2.708075523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126092, + "balance_loss_mlp": 1.09593177, + "epoch": 0.5188534051558291, + "flos": 428652324864.0, + "grad_norm": 0.08653085411735448, + "language_loss": 0.82187402, + "learning_rate": 0.0004937693150682479, + "loss": 0.83313495, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.30126953, + "step": 2697, + "time_per_iteration": 2.5607407093048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116261, + "balance_loss_mlp": 1.0861007, + "epoch": 0.5190457868410927, + "flos": 546085435392.0, + "grad_norm": 0.07683001308624603, + "language_loss": 0.76774538, + "learning_rate": 0.0004934577981779107, + "loss": 0.77890801, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.30175781, + "step": 2698, + "time_per_iteration": 2.730090618133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112238, + "balance_loss_mlp": 1.0813148, + "epoch": 0.5192381685263563, + "flos": 548321716224.0, + "grad_norm": 0.05605263998280499, + "language_loss": 0.81117129, + "learning_rate": 0.0004931462838274817, + "loss": 0.82229376, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.30883789, + "step": 2699, + "time_per_iteration": 2.847720146179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109944, + "balance_loss_mlp": 1.07957006, + "epoch": 0.5194305502116199, + "flos": 574993036800.0, + "grad_norm": 0.0574424557407856, + "language_loss": 0.84004086, + "learning_rate": 0.0004928347721379011, + "loss": 0.85114038, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.30322266, + "step": 2700, + "time_per_iteration": 2.6999762058258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102806, + "balance_loss_mlp": 1.07185948, + "epoch": 0.5196229318968835, + "flos": 434019741696.0, + "grad_norm": 0.05483286228362013, + "language_loss": 0.82044077, + "learning_rate": 0.0004925232632301089, + "loss": 0.83146882, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.30908203, + "step": 2701, + "time_per_iteration": 2.560593366622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098243, + "balance_loss_mlp": 1.06791615, + "epoch": 0.5198153135821469, + "flos": 558607007232.0, + "grad_norm": 0.06379159996009351, + "language_loss": 0.79575932, + "learning_rate": 0.0004922117572250431, + "loss": 0.80674177, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.30273438, + "step": 2702, + "time_per_iteration": 2.6621010303497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094553, + "balance_loss_mlp": 1.0648458, + "epoch": 0.5200076952674105, + "flos": 565397397504.0, + "grad_norm": 0.06234734694325623, + "language_loss": 0.80990833, + "learning_rate": 0.0004919002542436414, + "loss": 0.82085389, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.296875, + "step": 2703, + "time_per_iteration": 2.8583548069000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098109, + "balance_loss_mlp": 1.06806874, + "epoch": 0.5202000769526741, + "flos": 570916173312.0, + "grad_norm": 0.11086337696641164, + "language_loss": 0.81129456, + "learning_rate": 0.0004915887544068399, + "loss": 0.82227564, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.29980469, + "step": 2704, + "time_per_iteration": 2.6579208374023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097204, + "balance_loss_mlp": 1.06787837, + "epoch": 0.5203924586379377, + "flos": 693898583040.0, + "grad_norm": 0.06500287710368027, + "language_loss": 0.78155613, + "learning_rate": 0.0004912772578355736, + "loss": 0.79252815, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.29296875, + "step": 2705, + "time_per_iteration": 2.93152117729187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094522, + "balance_loss_mlp": 1.06395674, + "epoch": 0.5205848403232012, + "flos": 566215087104.0, + "grad_norm": 0.05937288472032104, + "language_loss": 0.82798421, + "learning_rate": 0.000490965764650776, + "loss": 0.83892947, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.30541992, + "step": 2706, + "time_per_iteration": 2.914069414138794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090504, + "balance_loss_mlp": 1.06048679, + "epoch": 0.5207772220084648, + "flos": 1213775539200.0, + "grad_norm": 0.08994605713309432, + "language_loss": 0.82582623, + "learning_rate": 0.0004906542749733798, + "loss": 0.83673131, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.29980469, + "step": 2707, + "time_per_iteration": 3.632612943649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086422, + "balance_loss_mlp": 1.05647707, + "epoch": 0.5209696036937284, + "flos": 592547374080.0, + "grad_norm": 0.05099864574791971, + "language_loss": 0.85112798, + "learning_rate": 0.0004903427889243156, + "loss": 0.86199224, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.29907227, + "step": 2708, + "time_per_iteration": 2.860605001449585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089192, + "balance_loss_mlp": 1.05898452, + "epoch": 0.5211619853789919, + "flos": 522623826432.0, + "grad_norm": 0.058285600596581014, + "language_loss": 0.85712206, + "learning_rate": 0.0004900313066245134, + "loss": 0.86801398, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.30151367, + "step": 2709, + "time_per_iteration": 2.6910862922668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078824, + "balance_loss_mlp": 1.04873538, + "epoch": 0.5213543670642555, + "flos": 502534872576.0, + "grad_norm": 0.06298998318770882, + "language_loss": 0.81023324, + "learning_rate": 0.0004897198281949012, + "loss": 0.8210215, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.30029297, + "step": 2710, + "time_per_iteration": 2.660783290863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085709, + "balance_loss_mlp": 1.0563364, + "epoch": 0.521546748749519, + "flos": 585682790400.0, + "grad_norm": 0.06559869836216795, + "language_loss": 0.77832824, + "learning_rate": 0.0004894083537564057, + "loss": 0.78918535, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.29345703, + "step": 2711, + "time_per_iteration": 2.7276909351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079715, + "balance_loss_mlp": 1.04965043, + "epoch": 0.5217391304347826, + "flos": 569833643520.0, + "grad_norm": 0.0684248274147048, + "language_loss": 0.80827081, + "learning_rate": 0.0004890968834299519, + "loss": 0.81906796, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.30029297, + "step": 2712, + "time_per_iteration": 2.738229751586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079667, + "balance_loss_mlp": 1.04974508, + "epoch": 0.5219315121200462, + "flos": 542501784576.0, + "grad_norm": 0.061787257592987296, + "language_loss": 0.78808606, + "learning_rate": 0.0004887854173364633, + "loss": 0.79888272, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.29882812, + "step": 2713, + "time_per_iteration": 2.734443426132202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074151, + "balance_loss_mlp": 1.04480171, + "epoch": 0.5221238938053098, + "flos": 550006557696.0, + "grad_norm": 0.05102910961180143, + "language_loss": 0.81491256, + "learning_rate": 0.0004884739555968617, + "loss": 0.82565403, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.29272461, + "step": 2714, + "time_per_iteration": 2.867036819458008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069714, + "balance_loss_mlp": 1.05559933, + "epoch": 0.5223162754905732, + "flos": 1354373028864.0, + "grad_norm": 0.021468860083039186, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80046767, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.14160156, + "step": 2715, + "time_per_iteration": 4.962530851364136 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107621, + "balance_loss_mlp": 1.04559731, + "epoch": 0.5225086571758368, + "flos": 567441621504.0, + "grad_norm": 0.06298546380073215, + "language_loss": 0.86646473, + "learning_rate": 0.0004878510456629992, + "loss": 0.87722689, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.30566406, + "step": 2716, + "time_per_iteration": 2.9603123664855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081784, + "balance_loss_mlp": 1.05110002, + "epoch": 0.5227010388611004, + "flos": 499914478080.0, + "grad_norm": 0.07025764068668285, + "language_loss": 0.85336471, + "learning_rate": 0.00048753959771057314, + "loss": 0.86418259, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.30639648, + "step": 2717, + "time_per_iteration": 2.632622480392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085269, + "balance_loss_mlp": 1.05389357, + "epoch": 0.522893420546364, + "flos": 597372115968.0, + "grad_norm": 0.05729998182106491, + "language_loss": 0.82715809, + "learning_rate": 0.0004872281545957044, + "loss": 0.83801079, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.31347656, + "step": 2718, + "time_per_iteration": 2.7305338382720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078735, + "balance_loss_mlp": 1.04726386, + "epoch": 0.5230858022316276, + "flos": 664278008832.0, + "grad_norm": 0.058019575066879846, + "language_loss": 0.86264348, + "learning_rate": 0.0004869167164393055, + "loss": 0.87343085, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.31445312, + "step": 2719, + "time_per_iteration": 2.9418067932128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075601, + "balance_loss_mlp": 1.04472566, + "epoch": 0.5232781839168911, + "flos": 603547047936.0, + "grad_norm": 0.0640312473735956, + "language_loss": 0.89536262, + "learning_rate": 0.00048660528336228793, + "loss": 0.90611863, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.30834961, + "step": 2720, + "time_per_iteration": 2.8314764499664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075602, + "balance_loss_mlp": 1.04506063, + "epoch": 0.5234705656021547, + "flos": 550438723584.0, + "grad_norm": 0.05104764752581424, + "language_loss": 0.89906192, + "learning_rate": 0.0004862938554855606, + "loss": 0.90981793, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.30517578, + "step": 2721, + "time_per_iteration": 2.7912685871124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077284, + "balance_loss_mlp": 1.04705238, + "epoch": 0.5236629472874182, + "flos": 504026247168.0, + "grad_norm": 0.09225462001304952, + "language_loss": 0.86140561, + "learning_rate": 0.0004859824329300304, + "loss": 0.87217844, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.30200195, + "step": 2722, + "time_per_iteration": 2.5850255489349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081058, + "balance_loss_mlp": 1.0504688, + "epoch": 0.5238553289726818, + "flos": 547394927616.0, + "grad_norm": 0.05217438950511115, + "language_loss": 0.83504456, + "learning_rate": 0.00048567101581660244, + "loss": 0.84585512, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.30541992, + "step": 2723, + "time_per_iteration": 2.6090264320373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077999, + "balance_loss_mlp": 1.04712343, + "epoch": 0.5240477106579453, + "flos": 531702931968.0, + "grad_norm": 0.07777816613104971, + "language_loss": 0.8713702, + "learning_rate": 0.00048535960426617956, + "loss": 0.88215029, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.30834961, + "step": 2724, + "time_per_iteration": 2.6143879890441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079989, + "balance_loss_mlp": 1.04966187, + "epoch": 0.5242400923432089, + "flos": 617653126656.0, + "grad_norm": 0.061907794652793086, + "language_loss": 0.81729943, + "learning_rate": 0.0004850481983996621, + "loss": 0.82809931, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.30273438, + "step": 2725, + "time_per_iteration": 2.7439112663269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081834, + "balance_loss_mlp": 1.05174541, + "epoch": 0.5244324740284725, + "flos": 416461022208.0, + "grad_norm": 0.06296520541747418, + "language_loss": 0.87762207, + "learning_rate": 0.0004847367983379492, + "loss": 0.88844043, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.30053711, + "step": 2726, + "time_per_iteration": 2.497286796569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080055, + "balance_loss_mlp": 1.05056226, + "epoch": 0.5246248557137361, + "flos": 626113801728.0, + "grad_norm": 0.09099502950257793, + "language_loss": 0.78826892, + "learning_rate": 0.00048442540420193643, + "loss": 0.79906946, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.29418945, + "step": 2727, + "time_per_iteration": 2.9191126823425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077698, + "balance_loss_mlp": 1.04751396, + "epoch": 0.5248172373989997, + "flos": 1247980746240.0, + "grad_norm": 0.061166777448516674, + "language_loss": 0.79150236, + "learning_rate": 0.0004841140161125182, + "loss": 0.80227935, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.30126953, + "step": 2728, + "time_per_iteration": 3.5845582485198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082892, + "balance_loss_mlp": 1.05306578, + "epoch": 0.5250096190842631, + "flos": 506616118272.0, + "grad_norm": 0.06421237850995067, + "language_loss": 0.84691751, + "learning_rate": 0.0004838026341905857, + "loss": 0.85774648, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.29785156, + "step": 2729, + "time_per_iteration": 2.75872540473938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080479, + "balance_loss_mlp": 1.05010509, + "epoch": 0.5252020007695267, + "flos": 611021297664.0, + "grad_norm": 0.051610102750965434, + "language_loss": 0.85352898, + "learning_rate": 0.00048349125855702844, + "loss": 0.86433375, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.30322266, + "step": 2730, + "time_per_iteration": 2.7679519653320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108307, + "balance_loss_mlp": 1.05322015, + "epoch": 0.5253943824547903, + "flos": 538970568192.0, + "grad_norm": 0.05904184367240025, + "language_loss": 0.81296933, + "learning_rate": 0.00048317988933273287, + "loss": 0.82380003, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.29785156, + "step": 2731, + "time_per_iteration": 2.7559163570404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079843, + "balance_loss_mlp": 1.0495404, + "epoch": 0.5255867641400539, + "flos": 697714988544.0, + "grad_norm": 0.06321650060381495, + "language_loss": 0.8227402, + "learning_rate": 0.00048286852663858367, + "loss": 0.83353865, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.30273438, + "step": 2732, + "time_per_iteration": 2.9430267810821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077146, + "balance_loss_mlp": 1.04710531, + "epoch": 0.5257791458253175, + "flos": 666975568896.0, + "grad_norm": 0.05929618739033729, + "language_loss": 0.84009433, + "learning_rate": 0.000482557170595462, + "loss": 0.85086572, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.30004883, + "step": 2733, + "time_per_iteration": 2.914397954940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083368, + "balance_loss_mlp": 1.05194473, + "epoch": 0.525971527510581, + "flos": 483375679488.0, + "grad_norm": 0.05379595829627383, + "language_loss": 0.87649244, + "learning_rate": 0.0004822458213242475, + "loss": 0.88732612, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.31396484, + "step": 2734, + "time_per_iteration": 2.533350944519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082101, + "balance_loss_mlp": 1.05215609, + "epoch": 0.5261639091958445, + "flos": 829559715840.0, + "grad_norm": 0.15308762813128413, + "language_loss": 0.85928154, + "learning_rate": 0.00048193447894581627, + "loss": 0.87010252, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.29882812, + "step": 2735, + "time_per_iteration": 3.0971109867095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081636, + "balance_loss_mlp": 1.05190539, + "epoch": 0.5263562908811081, + "flos": 520461739008.0, + "grad_norm": 0.059512944610192846, + "language_loss": 0.88020355, + "learning_rate": 0.00048162314358104243, + "loss": 0.89101994, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.296875, + "step": 2736, + "time_per_iteration": 2.619262456893921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081772, + "balance_loss_mlp": 1.05268502, + "epoch": 0.5265486725663717, + "flos": 574722404352.0, + "grad_norm": 0.05996263826740056, + "language_loss": 0.83247852, + "learning_rate": 0.0004813118153507969, + "loss": 0.84329623, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.29052734, + "step": 2737, + "time_per_iteration": 2.724499464035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079963, + "balance_loss_mlp": 1.06603909, + "epoch": 0.5267410542516352, + "flos": 1546439537664.0, + "grad_norm": 0.02099488410784391, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83527088, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.13964844, + "step": 2738, + "time_per_iteration": 4.7655651569366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109097, + "balance_loss_mlp": 1.06135821, + "epoch": 0.5269334359368988, + "flos": 929576701440.0, + "grad_norm": 0.054521404688675106, + "language_loss": 0.83406657, + "learning_rate": 0.00048068918077736163, + "loss": 0.84497625, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.29541016, + "step": 2739, + "time_per_iteration": 3.2117719650268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087742, + "balance_loss_mlp": 1.05820239, + "epoch": 0.5271258176221624, + "flos": 655079629824.0, + "grad_norm": 0.06027403163408104, + "language_loss": 0.81200749, + "learning_rate": 0.0004803778746759001, + "loss": 0.82288492, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.29492188, + "step": 2740, + "time_per_iteration": 2.883953809738159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085865, + "balance_loss_mlp": 1.05627775, + "epoch": 0.527318199307426, + "flos": 542781181440.0, + "grad_norm": 0.07072803117785999, + "language_loss": 0.81773007, + "learning_rate": 0.00048006657619242317, + "loss": 0.82858872, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.29541016, + "step": 2741, + "time_per_iteration": 2.6289987564086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108813, + "balance_loss_mlp": 1.05959105, + "epoch": 0.5275105809926895, + "flos": 447629635584.0, + "grad_norm": 0.07275993710061575, + "language_loss": 0.78293514, + "learning_rate": 0.00047975528544778775, + "loss": 0.79381645, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.28491211, + "step": 2742, + "time_per_iteration": 2.6370468139648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085684, + "balance_loss_mlp": 1.05685973, + "epoch": 0.527702962677953, + "flos": 578656673280.0, + "grad_norm": 0.08133754904485412, + "language_loss": 0.88532221, + "learning_rate": 0.00047944400256284754, + "loss": 0.89617908, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.28808594, + "step": 2743, + "time_per_iteration": 2.6988437175750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084011, + "balance_loss_mlp": 1.05504286, + "epoch": 0.5278953443632166, + "flos": 652465027584.0, + "grad_norm": 0.061354637447893066, + "language_loss": 0.8008759, + "learning_rate": 0.0004791327276584532, + "loss": 0.81171608, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.28930664, + "step": 2744, + "time_per_iteration": 2.843850612640381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092207, + "balance_loss_mlp": 1.0627383, + "epoch": 0.5280877260484802, + "flos": 513741159936.0, + "grad_norm": 0.06451817982099761, + "language_loss": 0.80512536, + "learning_rate": 0.00047882146085545264, + "loss": 0.81604743, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.29418945, + "step": 2745, + "time_per_iteration": 2.6313765048980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059727, + "balance_loss_mlp": 1.04713857, + "epoch": 0.5282801077337438, + "flos": 1444650370560.0, + "grad_norm": 0.01846816151842821, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76462114, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.12597656, + "step": 2746, + "time_per_iteration": 4.961829662322998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080481, + "balance_loss_mlp": 1.05105972, + "epoch": 0.5284724894190073, + "flos": 604580115456.0, + "grad_norm": 0.06475941859576588, + "language_loss": 0.79224515, + "learning_rate": 0.00047819895203700684, + "loss": 0.80304992, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.29394531, + "step": 2747, + "time_per_iteration": 2.727640151977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048677, + "balance_loss_mlp": 1.03618371, + "epoch": 0.5286648711042709, + "flos": 1494172224000.0, + "grad_norm": 0.01378573653182101, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76561111, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.125, + "step": 2748, + "time_per_iteration": 4.70350980758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074595, + "balance_loss_mlp": 1.04469705, + "epoch": 0.5288572527895344, + "flos": 597313889280.0, + "grad_norm": 0.06074589131451646, + "language_loss": 0.88260013, + "learning_rate": 0.0004775764770742277, + "loss": 0.89334607, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.29907227, + "step": 2749, + "time_per_iteration": 2.8722305297851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080903, + "balance_loss_mlp": 1.05064785, + "epoch": 0.529049634474798, + "flos": 557041439232.0, + "grad_norm": 0.1215004440050613, + "language_loss": 0.86453164, + "learning_rate": 0.00047726525259079777, + "loss": 0.8753407, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.30224609, + "step": 2750, + "time_per_iteration": 2.782618522644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082004, + "balance_loss_mlp": 1.05203521, + "epoch": 0.5292420161600616, + "flos": 580986086400.0, + "grad_norm": 0.07030365944612293, + "language_loss": 0.88707, + "learning_rate": 0.0004769540369337798, + "loss": 0.89789003, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.29931641, + "step": 2751, + "time_per_iteration": 2.7570507526397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078279, + "balance_loss_mlp": 1.04792809, + "epoch": 0.5294343978453251, + "flos": 607989086208.0, + "grad_norm": 0.06134745452443849, + "language_loss": 0.86018121, + "learning_rate": 0.00047664283022399794, + "loss": 0.87096399, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.3034668, + "step": 2752, + "time_per_iteration": 2.8683836460113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070772, + "balance_loss_mlp": 1.04101765, + "epoch": 0.5296267795305887, + "flos": 646226076672.0, + "grad_norm": 0.061305381303338104, + "language_loss": 0.80927074, + "learning_rate": 0.00047633163258227376, + "loss": 0.81997848, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.29711914, + "step": 2753, + "time_per_iteration": 2.889761447906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080468, + "balance_loss_mlp": 1.05040383, + "epoch": 0.5298191612158523, + "flos": 559482923520.0, + "grad_norm": 0.06040690928097006, + "language_loss": 0.85472161, + "learning_rate": 0.0004760204441294247, + "loss": 0.86552632, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.30004883, + "step": 2754, + "time_per_iteration": 2.7022712230682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078457, + "balance_loss_mlp": 1.04736757, + "epoch": 0.5300115429011159, + "flos": 513776065536.0, + "grad_norm": 0.08887078297019954, + "language_loss": 0.85966748, + "learning_rate": 0.00047570926498626486, + "loss": 0.87045205, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.31054688, + "step": 2755, + "time_per_iteration": 2.694779396057129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083154, + "balance_loss_mlp": 1.05130148, + "epoch": 0.5302039245863793, + "flos": 672475405824.0, + "grad_norm": 0.0527518505260492, + "language_loss": 0.8147307, + "learning_rate": 0.00047539809527360474, + "loss": 0.82556224, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.31835938, + "step": 2756, + "time_per_iteration": 2.8726418018341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086344, + "balance_loss_mlp": 1.05418181, + "epoch": 0.5303963062716429, + "flos": 730507396608.0, + "grad_norm": 0.05719732969355854, + "language_loss": 0.82233423, + "learning_rate": 0.0004750869351122511, + "loss": 0.83319771, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.32128906, + "step": 2757, + "time_per_iteration": 2.989522933959961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086301, + "balance_loss_mlp": 1.05397129, + "epoch": 0.5305886879569065, + "flos": 573156836352.0, + "grad_norm": 0.0731965335963944, + "language_loss": 0.81977046, + "learning_rate": 0.00047477578462300685, + "loss": 0.83063352, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.32324219, + "step": 2758, + "time_per_iteration": 2.7154197692871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108253, + "balance_loss_mlp": 1.05153537, + "epoch": 0.5307810696421701, + "flos": 694988315136.0, + "grad_norm": 0.05716072116198451, + "language_loss": 0.79401624, + "learning_rate": 0.0004744646439266718, + "loss": 0.80484152, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.30957031, + "step": 2759, + "time_per_iteration": 3.010188102722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087952, + "balance_loss_mlp": 1.05719638, + "epoch": 0.5309734513274337, + "flos": 648629683200.0, + "grad_norm": 0.06513852008932475, + "language_loss": 0.92120409, + "learning_rate": 0.000474153513144041, + "loss": 0.93208361, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.30712891, + "step": 2760, + "time_per_iteration": 2.9100866317749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090471, + "balance_loss_mlp": 1.05878544, + "epoch": 0.5311658330126972, + "flos": 604517506560.0, + "grad_norm": 0.05916855301127547, + "language_loss": 0.8678081, + "learning_rate": 0.00047384239239590633, + "loss": 0.87871277, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.31665039, + "step": 2761, + "time_per_iteration": 2.8746495246887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108692, + "balance_loss_mlp": 1.05516267, + "epoch": 0.5313582146979607, + "flos": 557995931136.0, + "grad_norm": 0.06020342742423831, + "language_loss": 0.88611233, + "learning_rate": 0.0004735312818030556, + "loss": 0.8969816, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.31738281, + "step": 2762, + "time_per_iteration": 2.670517921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092394, + "balance_loss_mlp": 1.06101847, + "epoch": 0.5315505963832243, + "flos": 508152572928.0, + "grad_norm": 0.05825845223399112, + "language_loss": 0.82783639, + "learning_rate": 0.0004732201814862727, + "loss": 0.83876032, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.31347656, + "step": 2763, + "time_per_iteration": 2.7706046104431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108792, + "balance_loss_mlp": 1.05740237, + "epoch": 0.5317429780684879, + "flos": 626132740608.0, + "grad_norm": 0.056446972258987926, + "language_loss": 0.81703943, + "learning_rate": 0.0004729090915663373, + "loss": 0.82791865, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.3046875, + "step": 2764, + "time_per_iteration": 2.8320751190185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088458, + "balance_loss_mlp": 1.0584892, + "epoch": 0.5319353597537514, + "flos": 476506713600.0, + "grad_norm": 0.06421691072563727, + "language_loss": 0.85022444, + "learning_rate": 0.00047259801216402534, + "loss": 0.86110902, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.29931641, + "step": 2765, + "time_per_iteration": 2.5070557594299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087661, + "balance_loss_mlp": 1.05735779, + "epoch": 0.532127741439015, + "flos": 501386913792.0, + "grad_norm": 0.06743519703895742, + "language_loss": 0.86185229, + "learning_rate": 0.00047228694340010845, + "loss": 0.87272882, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.30249023, + "step": 2766, + "time_per_iteration": 2.5665066242218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089224, + "balance_loss_mlp": 1.05918312, + "epoch": 0.5323201231242786, + "flos": 1164114063360.0, + "grad_norm": 0.057283919540088275, + "language_loss": 0.85907435, + "learning_rate": 0.0004719758853953544, + "loss": 0.86996663, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.29980469, + "step": 2767, + "time_per_iteration": 3.598590850830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093331, + "balance_loss_mlp": 1.06419635, + "epoch": 0.5325125048095422, + "flos": 378493254144.0, + "grad_norm": 0.07956086058885692, + "language_loss": 0.83881301, + "learning_rate": 0.00047166483827052645, + "loss": 0.84974635, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.29125977, + "step": 2768, + "time_per_iteration": 2.4224319458007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105739, + "balance_loss_mlp": 1.04441977, + "epoch": 0.5327048864948057, + "flos": 1540507534848.0, + "grad_norm": 0.033276153146473426, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78135878, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.12988281, + "step": 2769, + "time_per_iteration": 4.992494583129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089679, + "balance_loss_mlp": 1.05961394, + "epoch": 0.5328972681800692, + "flos": 910877225472.0, + "grad_norm": 0.06372002073291465, + "language_loss": 0.8365072, + "learning_rate": 0.000471042777143682, + "loss": 0.84740394, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.30029297, + "step": 2770, + "time_per_iteration": 3.214010715484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091808, + "balance_loss_mlp": 1.06255412, + "epoch": 0.5330896498653328, + "flos": 473660766720.0, + "grad_norm": 0.05770492360265134, + "language_loss": 0.79306901, + "learning_rate": 0.0004707317633831707, + "loss": 0.80398703, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.29223633, + "step": 2771, + "time_per_iteration": 2.5814082622528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090013, + "balance_loss_mlp": 1.06035328, + "epoch": 0.5332820315505964, + "flos": 501386913792.0, + "grad_norm": 0.06429055642690477, + "language_loss": 0.78255731, + "learning_rate": 0.00047042076098559673, + "loss": 0.79345745, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.29614258, + "step": 2772, + "time_per_iteration": 2.626574754714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096839, + "balance_loss_mlp": 1.06763303, + "epoch": 0.53347441323586, + "flos": 924043368960.0, + "grad_norm": 0.06567346515998468, + "language_loss": 0.73814428, + "learning_rate": 0.00047010977007170174, + "loss": 0.74911261, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.29150391, + "step": 2773, + "time_per_iteration": 3.2639098167419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089963, + "balance_loss_mlp": 1.06039929, + "epoch": 0.5336667949211235, + "flos": 574185521664.0, + "grad_norm": 0.06353427502994992, + "language_loss": 0.82705283, + "learning_rate": 0.00046979879076222334, + "loss": 0.83795249, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.29516602, + "step": 2774, + "time_per_iteration": 2.702021360397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094292, + "balance_loss_mlp": 1.0655148, + "epoch": 0.533859176606387, + "flos": 1064233880064.0, + "grad_norm": 0.051161955256212054, + "language_loss": 0.84535086, + "learning_rate": 0.0004694878231778939, + "loss": 0.8562938, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.28759766, + "step": 2775, + "time_per_iteration": 3.37555193901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094093, + "balance_loss_mlp": 1.06471944, + "epoch": 0.5340515582916506, + "flos": 746277967872.0, + "grad_norm": 0.05222814179658164, + "language_loss": 0.8401432, + "learning_rate": 0.0004691768674394423, + "loss": 0.85108411, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.29321289, + "step": 2776, + "time_per_iteration": 2.992685317993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024339, + "balance_loss_mlp": 1.01251328, + "epoch": 0.5342439399769142, + "flos": 1444905036288.0, + "grad_norm": 0.010305238226800423, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85508353, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.11816406, + "step": 2777, + "time_per_iteration": 4.753941059112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021329, + "balance_loss_mlp": 1.00950325, + "epoch": 0.5344363216621778, + "flos": 1426790495232.0, + "grad_norm": 0.008050007723784799, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77674866, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.11816406, + "step": 2778, + "time_per_iteration": 4.980912923812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091368, + "balance_loss_mlp": 1.0625428, + "epoch": 0.5346287033474413, + "flos": 527355436032.0, + "grad_norm": 0.05741424367086941, + "language_loss": 0.79571807, + "learning_rate": 0.00046824407250656676, + "loss": 0.80663168, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.28808594, + "step": 2779, + "time_per_iteration": 2.641680955886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109255, + "balance_loss_mlp": 1.06303382, + "epoch": 0.5348210850327049, + "flos": 510515481600.0, + "grad_norm": 0.05780417685778494, + "language_loss": 0.83320916, + "learning_rate": 0.0004679331653588161, + "loss": 0.84413469, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.29467773, + "step": 2780, + "time_per_iteration": 2.6292784214019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086907, + "balance_loss_mlp": 1.05741477, + "epoch": 0.5350134667179685, + "flos": 462429748224.0, + "grad_norm": 0.07200473336731207, + "language_loss": 0.8539027, + "learning_rate": 0.0004676222706605147, + "loss": 0.86477172, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.29467773, + "step": 2781, + "time_per_iteration": 2.633302927017212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082924, + "balance_loss_mlp": 1.05355036, + "epoch": 0.535205848403232, + "flos": 708566275584.0, + "grad_norm": 0.06052388593462891, + "language_loss": 0.85071301, + "learning_rate": 0.0004673113885323626, + "loss": 0.86154234, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.29321289, + "step": 2782, + "time_per_iteration": 2.8385848999023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108118, + "balance_loss_mlp": 1.05152082, + "epoch": 0.5353982300884956, + "flos": 893855388672.0, + "grad_norm": 0.04759682065371887, + "language_loss": 0.78464407, + "learning_rate": 0.00046700051909505494, + "loss": 0.79545587, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.29638672, + "step": 2783, + "time_per_iteration": 3.17055344581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087683, + "balance_loss_mlp": 1.05730867, + "epoch": 0.5355906117737591, + "flos": 535701219840.0, + "grad_norm": 0.06917760310735488, + "language_loss": 0.83446693, + "learning_rate": 0.000466689662469282, + "loss": 0.84534377, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.3034668, + "step": 2784, + "time_per_iteration": 2.6696882247924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080736, + "balance_loss_mlp": 1.05048084, + "epoch": 0.5357829934590227, + "flos": 868477593600.0, + "grad_norm": 0.0647182284961505, + "language_loss": 0.84010589, + "learning_rate": 0.00046637881877572917, + "loss": 0.85091329, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.30200195, + "step": 2785, + "time_per_iteration": 3.0897059440612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107764, + "balance_loss_mlp": 1.04783738, + "epoch": 0.5359753751442863, + "flos": 552999481344.0, + "grad_norm": 0.2060352755327757, + "language_loss": 0.84354532, + "learning_rate": 0.0004660679881350764, + "loss": 0.85432178, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.29736328, + "step": 2786, + "time_per_iteration": 2.763195753097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036965, + "balance_loss_mlp": 1.0236131, + "epoch": 0.5361677568295499, + "flos": 1479687823872.0, + "grad_norm": 0.018061436986608354, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76645112, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.13378906, + "step": 2787, + "time_per_iteration": 5.074235677719116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082567, + "balance_loss_mlp": 1.05223989, + "epoch": 0.5363601385148133, + "flos": 805910432256.0, + "grad_norm": 0.0731464482403051, + "language_loss": 0.77922016, + "learning_rate": 0.0004654463664951667, + "loss": 0.79004586, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.30273438, + "step": 2788, + "time_per_iteration": 2.9973762035369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086105, + "balance_loss_mlp": 1.05647016, + "epoch": 0.5365525202000769, + "flos": 507630246912.0, + "grad_norm": 0.06405642217776768, + "language_loss": 0.83215284, + "learning_rate": 0.0004651355757372447, + "loss": 0.84301388, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.2956543, + "step": 2789, + "time_per_iteration": 2.677021026611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089856, + "balance_loss_mlp": 1.05955315, + "epoch": 0.5367449018853405, + "flos": 528660546048.0, + "grad_norm": 0.05726084062519834, + "language_loss": 0.85958302, + "learning_rate": 0.00046482479851489274, + "loss": 0.87048161, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.30273438, + "step": 2790, + "time_per_iteration": 2.6652121543884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089898, + "balance_loss_mlp": 1.05933237, + "epoch": 0.5369372835706041, + "flos": 649614698496.0, + "grad_norm": 0.07271669587233448, + "language_loss": 0.77731752, + "learning_rate": 0.00046451403494876525, + "loss": 0.78821647, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.30541992, + "step": 2791, + "time_per_iteration": 2.897798776626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090037, + "balance_loss_mlp": 1.05882847, + "epoch": 0.5371296652558677, + "flos": 584205972480.0, + "grad_norm": 0.06591879115648011, + "language_loss": 0.84175646, + "learning_rate": 0.0004642032851595111, + "loss": 0.8526569, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.31176758, + "step": 2792, + "time_per_iteration": 2.758230209350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086262, + "balance_loss_mlp": 1.05543458, + "epoch": 0.5373220469411312, + "flos": 595570821120.0, + "grad_norm": 0.05973481987913333, + "language_loss": 0.84753001, + "learning_rate": 0.00046389254926777404, + "loss": 0.8583926, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.30810547, + "step": 2793, + "time_per_iteration": 2.7933902740478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086495, + "balance_loss_mlp": 1.05562031, + "epoch": 0.5375144286263948, + "flos": 1113965167104.0, + "grad_norm": 0.05136203618868989, + "language_loss": 0.7824527, + "learning_rate": 0.0004635818273941926, + "loss": 0.79331762, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.30859375, + "step": 2794, + "time_per_iteration": 3.564011335372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088501, + "balance_loss_mlp": 1.05786383, + "epoch": 0.5377068103116583, + "flos": 595319127552.0, + "grad_norm": 0.06685314707582615, + "language_loss": 0.81738025, + "learning_rate": 0.0004632711196593997, + "loss": 0.82826525, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.30639648, + "step": 2795, + "time_per_iteration": 2.7609026432037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089037, + "balance_loss_mlp": 1.05882931, + "epoch": 0.5378991919969219, + "flos": 883839320064.0, + "grad_norm": 0.06695327911218095, + "language_loss": 0.85338485, + "learning_rate": 0.00046296042618402297, + "loss": 0.86427522, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.30175781, + "step": 2796, + "time_per_iteration": 3.079580783843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083991, + "balance_loss_mlp": 1.05344939, + "epoch": 0.5380915736821854, + "flos": 710344249344.0, + "grad_norm": 0.05461778050704968, + "language_loss": 0.79521048, + "learning_rate": 0.0004626497470886839, + "loss": 0.80605042, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.30517578, + "step": 2797, + "time_per_iteration": 2.956915855407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086126, + "balance_loss_mlp": 1.0549171, + "epoch": 0.538283955367449, + "flos": 556721344512.0, + "grad_norm": 0.05348634251654363, + "language_loss": 0.81572765, + "learning_rate": 0.00046233908249399897, + "loss": 0.82658887, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.31176758, + "step": 2798, + "time_per_iteration": 2.7978243827819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087806, + "balance_loss_mlp": 1.05781281, + "epoch": 0.5384763370527126, + "flos": 513218833920.0, + "grad_norm": 0.07296004689367808, + "language_loss": 0.78106725, + "learning_rate": 0.00046202843252057905, + "loss": 0.79194534, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.29956055, + "step": 2799, + "time_per_iteration": 2.615086317062378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085619, + "balance_loss_mlp": 1.05522037, + "epoch": 0.5386687187379762, + "flos": 489490974720.0, + "grad_norm": 0.056459019467486986, + "language_loss": 0.83738667, + "learning_rate": 0.00046171779728902896, + "loss": 0.84824288, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.3034668, + "step": 2800, + "time_per_iteration": 2.613084077835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081602, + "balance_loss_mlp": 1.05025029, + "epoch": 0.5388611004232398, + "flos": 482415395328.0, + "grad_norm": 0.07411133953793157, + "language_loss": 0.86239338, + "learning_rate": 0.000461407176919948, + "loss": 0.87320936, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.31323242, + "step": 2801, + "time_per_iteration": 2.5331709384918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078309, + "balance_loss_mlp": 1.04838777, + "epoch": 0.5390534821085032, + "flos": 560709457920.0, + "grad_norm": 0.07244428600451569, + "language_loss": 0.85469061, + "learning_rate": 0.00046109657153392997, + "loss": 0.86547375, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.29858398, + "step": 2802, + "time_per_iteration": 2.7376809120178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081766, + "balance_loss_mlp": 1.05007982, + "epoch": 0.5392458637937668, + "flos": 488132020224.0, + "grad_norm": 0.06487466420670769, + "language_loss": 0.82949483, + "learning_rate": 0.0004607859812515622, + "loss": 0.84031248, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.31665039, + "step": 2803, + "time_per_iteration": 2.601752996444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078317, + "balance_loss_mlp": 1.0476799, + "epoch": 0.5394382454790304, + "flos": 511810417152.0, + "grad_norm": 0.06325281802882306, + "language_loss": 0.87643886, + "learning_rate": 0.00046047540619342667, + "loss": 0.88722193, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.3059082, + "step": 2804, + "time_per_iteration": 2.6036136150360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080625, + "balance_loss_mlp": 1.05056071, + "epoch": 0.539630627164294, + "flos": 567312173568.0, + "grad_norm": 0.0581751577303043, + "language_loss": 0.80008459, + "learning_rate": 0.00046016484648009933, + "loss": 0.81089091, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.30004883, + "step": 2805, + "time_per_iteration": 2.713219165802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080402, + "balance_loss_mlp": 1.05105305, + "epoch": 0.5398230088495575, + "flos": 526203095040.0, + "grad_norm": 0.057792621829283776, + "language_loss": 0.80917501, + "learning_rate": 0.0004598543022321501, + "loss": 0.81997907, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.29296875, + "step": 2806, + "time_per_iteration": 2.631939172744751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082616, + "balance_loss_mlp": 1.05281353, + "epoch": 0.5400153905348211, + "flos": 538493322240.0, + "grad_norm": 0.07612886672081497, + "language_loss": 0.79604518, + "learning_rate": 0.0004595437735701433, + "loss": 0.80687129, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.29736328, + "step": 2807, + "time_per_iteration": 2.701808214187622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081434, + "balance_loss_mlp": 1.0507021, + "epoch": 0.5402077722200846, + "flos": 513259531776.0, + "grad_norm": 0.07694205416949251, + "language_loss": 0.83500147, + "learning_rate": 0.00045923326061463623, + "loss": 0.84581584, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.30688477, + "step": 2808, + "time_per_iteration": 2.7844398021698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078771, + "balance_loss_mlp": 1.04725254, + "epoch": 0.5404001539053482, + "flos": 675932428800.0, + "grad_norm": 0.07660553916433042, + "language_loss": 0.81710881, + "learning_rate": 0.00045892276348618113, + "loss": 0.82789654, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.31494141, + "step": 2809, + "time_per_iteration": 2.982339859008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053757, + "balance_loss_mlp": 1.04088223, + "epoch": 0.5405925355906118, + "flos": 1553998155264.0, + "grad_norm": 0.023591100709610114, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.7931459, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.12890625, + "step": 2810, + "time_per_iteration": 5.077887296676636 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086772, + "balance_loss_mlp": 1.05580163, + "epoch": 0.5407849172758753, + "flos": 647004478464.0, + "grad_norm": 0.07053414384060859, + "language_loss": 0.80792511, + "learning_rate": 0.000458301817192603, + "loss": 0.81879282, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.30957031, + "step": 2811, + "time_per_iteration": 2.8369667530059814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038738, + "balance_loss_mlp": 1.02586305, + "epoch": 0.5409772989611389, + "flos": 1406641904640.0, + "grad_norm": 0.019629272648215536, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81880522, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.12890625, + "step": 2812, + "time_per_iteration": 4.8166663646698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079133, + "balance_loss_mlp": 1.04790044, + "epoch": 0.5411696806464025, + "flos": 554102360064.0, + "grad_norm": 0.05474211885389724, + "language_loss": 0.86781704, + "learning_rate": 0.00045768093565369983, + "loss": 0.87860835, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.31201172, + "step": 2813, + "time_per_iteration": 2.7311370372772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081245, + "balance_loss_mlp": 1.05077481, + "epoch": 0.5413620623316661, + "flos": 527853030912.0, + "grad_norm": 0.05950457911446913, + "language_loss": 0.8158434, + "learning_rate": 0.0004573705194685646, + "loss": 0.82665586, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.30444336, + "step": 2814, + "time_per_iteration": 2.733198404312134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081332, + "balance_loss_mlp": 1.0498848, + "epoch": 0.5415544440169295, + "flos": 598464820224.0, + "grad_norm": 0.06917969261153488, + "language_loss": 0.84880143, + "learning_rate": 0.00045706011983366157, + "loss": 0.85961473, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.31420898, + "step": 2815, + "time_per_iteration": 2.6939895153045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077713, + "balance_loss_mlp": 1.04683733, + "epoch": 0.5417468257021931, + "flos": 470519456256.0, + "grad_norm": 0.08149095023345422, + "language_loss": 0.82716835, + "learning_rate": 0.00045674973686949847, + "loss": 0.83794552, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.30834961, + "step": 2816, + "time_per_iteration": 2.532838821411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076826, + "balance_loss_mlp": 1.045784, + "epoch": 0.5419392073874567, + "flos": 680477773824.0, + "grad_norm": 0.06493873134640445, + "language_loss": 0.85336345, + "learning_rate": 0.0004564393706965766, + "loss": 0.86413169, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.31005859, + "step": 2817, + "time_per_iteration": 3.013608455657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077252, + "balance_loss_mlp": 1.04578137, + "epoch": 0.5421315890727203, + "flos": 462134384640.0, + "grad_norm": 0.06666383117391396, + "language_loss": 0.81068963, + "learning_rate": 0.00045612902143539116, + "loss": 0.82146215, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.31469727, + "step": 2818, + "time_per_iteration": 2.605372428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070647, + "balance_loss_mlp": 1.03998637, + "epoch": 0.5423239707579839, + "flos": 436727476224.0, + "grad_norm": 0.07813750406706815, + "language_loss": 0.81324685, + "learning_rate": 0.00045581868920642986, + "loss": 0.82395327, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.30615234, + "step": 2819, + "time_per_iteration": 2.4960100650787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077852, + "balance_loss_mlp": 1.04709649, + "epoch": 0.5425163524432474, + "flos": 458067695616.0, + "grad_norm": 0.07920473504276467, + "language_loss": 0.79243749, + "learning_rate": 0.00045550837413017457, + "loss": 0.80321598, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.30712891, + "step": 2820, + "time_per_iteration": 2.684987783432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072493, + "balance_loss_mlp": 1.04188037, + "epoch": 0.542708734128511, + "flos": 419267681280.0, + "grad_norm": 0.056801171387635116, + "language_loss": 0.85060829, + "learning_rate": 0.0004551980763271005, + "loss": 0.86133325, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.30566406, + "step": 2821, + "time_per_iteration": 2.6912834644317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075835, + "balance_loss_mlp": 1.04529333, + "epoch": 0.5429011158137745, + "flos": 678142568448.0, + "grad_norm": 0.05882616642734503, + "language_loss": 0.83789319, + "learning_rate": 0.0004548877959176756, + "loss": 0.84865159, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.30493164, + "step": 2822, + "time_per_iteration": 2.8441174030303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080776, + "balance_loss_mlp": 1.04985332, + "epoch": 0.5430934974990381, + "flos": 540664174080.0, + "grad_norm": 0.06945933761570218, + "language_loss": 0.86118329, + "learning_rate": 0.00045457753302236166, + "loss": 0.8719911, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.30908203, + "step": 2823, + "time_per_iteration": 2.6186442375183105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107393, + "balance_loss_mlp": 1.04312599, + "epoch": 0.5432858791843016, + "flos": 658175860224.0, + "grad_norm": 0.07165023342281863, + "language_loss": 0.87164384, + "learning_rate": 0.00045426728776161353, + "loss": 0.88238311, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.30761719, + "step": 2824, + "time_per_iteration": 2.7953178882598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081111, + "balance_loss_mlp": 1.05092704, + "epoch": 0.5434782608695652, + "flos": 531678200832.0, + "grad_norm": 0.05974352124313591, + "language_loss": 0.81803101, + "learning_rate": 0.00045395706025587863, + "loss": 0.8288421, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.30151367, + "step": 2825, + "time_per_iteration": 2.612980604171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076561, + "balance_loss_mlp": 1.04599547, + "epoch": 0.5436706425548288, + "flos": 608219020800.0, + "grad_norm": 0.07443979134593931, + "language_loss": 0.8264693, + "learning_rate": 0.00045364685062559843, + "loss": 0.83723497, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.30541992, + "step": 2826, + "time_per_iteration": 2.828479051589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076945, + "balance_loss_mlp": 1.04630804, + "epoch": 0.5438630242400924, + "flos": 705081549312.0, + "grad_norm": 0.061142502150282975, + "language_loss": 0.91168308, + "learning_rate": 0.0004533366589912067, + "loss": 0.92245257, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.30615234, + "step": 2827, + "time_per_iteration": 2.970296621322632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075368, + "balance_loss_mlp": 1.04599524, + "epoch": 0.544055405925356, + "flos": 856073885184.0, + "grad_norm": 0.07414497131093437, + "language_loss": 0.77502602, + "learning_rate": 0.0004530264854731306, + "loss": 0.78577971, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.29370117, + "step": 2828, + "time_per_iteration": 3.022944450378418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085089, + "balance_loss_mlp": 1.05521488, + "epoch": 0.5442477876106194, + "flos": 571483579392.0, + "grad_norm": 0.048879345895653556, + "language_loss": 0.84054667, + "learning_rate": 0.00045271633019179034, + "loss": 0.85139751, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.29833984, + "step": 2829, + "time_per_iteration": 2.7760679721832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086373, + "balance_loss_mlp": 1.05707121, + "epoch": 0.544440169295883, + "flos": 625246649856.0, + "grad_norm": 0.06402410848819869, + "language_loss": 0.87688053, + "learning_rate": 0.0004524061932675986, + "loss": 0.88774425, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.29248047, + "step": 2830, + "time_per_iteration": 2.830350637435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086958, + "balance_loss_mlp": 1.05691731, + "epoch": 0.5446325509811466, + "flos": 835896181248.0, + "grad_norm": 0.06453180665575306, + "language_loss": 0.86766136, + "learning_rate": 0.00045209607482096125, + "loss": 0.87853098, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.30029297, + "step": 2831, + "time_per_iteration": 3.0085608959198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082113, + "balance_loss_mlp": 1.05192947, + "epoch": 0.5448249326664102, + "flos": 483129778176.0, + "grad_norm": 0.06460698711812493, + "language_loss": 0.84066617, + "learning_rate": 0.0004517859749722772, + "loss": 0.85148734, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.30126953, + "step": 2832, + "time_per_iteration": 2.6471612453460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078289, + "balance_loss_mlp": 1.04803348, + "epoch": 0.5450173143516738, + "flos": 560799618048.0, + "grad_norm": 0.09569427913676506, + "language_loss": 0.78785688, + "learning_rate": 0.0004514758938419376, + "loss": 0.79863977, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.30200195, + "step": 2833, + "time_per_iteration": 2.8068594932556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038153, + "balance_loss_mlp": 1.02627981, + "epoch": 0.5452096960369373, + "flos": 1469632467456.0, + "grad_norm": 0.016706116470577157, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77958739, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.11865234, + "step": 2834, + "time_per_iteration": 4.907236814498901 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079515, + "balance_loss_mlp": 1.04871142, + "epoch": 0.5454020777222008, + "flos": 464827562496.0, + "grad_norm": 0.06561437539450005, + "language_loss": 0.83799005, + "learning_rate": 0.00045085578821782175, + "loss": 0.84878516, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.30761719, + "step": 2835, + "time_per_iteration": 2.538837194442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032792, + "balance_loss_mlp": 1.02082336, + "epoch": 0.5455944594074644, + "flos": 1468921056768.0, + "grad_norm": 0.016611239115941395, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77167535, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.11962891, + "step": 2836, + "time_per_iteration": 4.947264671325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107855, + "balance_loss_mlp": 1.04765117, + "epoch": 0.545786841092728, + "flos": 532900353024.0, + "grad_norm": 0.05618000101860937, + "language_loss": 0.8099249, + "learning_rate": 0.00045023575891159866, + "loss": 0.82071036, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.30859375, + "step": 2837, + "time_per_iteration": 2.7390823364257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025548, + "balance_loss_mlp": 1.01348448, + "epoch": 0.5459792227779915, + "flos": 1351633360896.0, + "grad_norm": 0.010465474292049673, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75789356, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.12060547, + "step": 2838, + "time_per_iteration": 4.913767576217651 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080178, + "balance_loss_mlp": 1.05025697, + "epoch": 0.5461716044632551, + "flos": 637584929280.0, + "grad_norm": 0.053509390521789255, + "language_loss": 0.78084177, + "learning_rate": 0.0004496158068861354, + "loss": 0.7916435, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.29882812, + "step": 2839, + "time_per_iteration": 2.816080331802368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085641, + "balance_loss_mlp": 1.05548143, + "epoch": 0.5463639861485187, + "flos": 602458725888.0, + "grad_norm": 0.05135655646470402, + "language_loss": 0.80302298, + "learning_rate": 0.00044930586015455207, + "loss": 0.81387937, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.30102539, + "step": 2840, + "time_per_iteration": 2.79626727104187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087336, + "balance_loss_mlp": 1.05717611, + "epoch": 0.5465563678337823, + "flos": 642208849920.0, + "grad_norm": 0.05566707414242676, + "language_loss": 0.89057064, + "learning_rate": 0.000448995933104179, + "loss": 0.90144402, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.30102539, + "step": 2841, + "time_per_iteration": 2.8602969646453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080566, + "balance_loss_mlp": 1.0502634, + "epoch": 0.5467487495190458, + "flos": 613852687872.0, + "grad_norm": 0.07080900039808569, + "language_loss": 0.80240697, + "learning_rate": 0.00044868602585534077, + "loss": 0.81321263, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.30297852, + "step": 2842, + "time_per_iteration": 2.9035747051239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078755, + "balance_loss_mlp": 1.04778409, + "epoch": 0.5469411312043093, + "flos": 460957312512.0, + "grad_norm": 0.061738359719804514, + "language_loss": 0.88582397, + "learning_rate": 0.0004483761385283541, + "loss": 0.89661151, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.30932617, + "step": 2843, + "time_per_iteration": 2.5193030834198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074267, + "balance_loss_mlp": 1.04448807, + "epoch": 0.5471335128895729, + "flos": 560930628096.0, + "grad_norm": 0.05447472334615201, + "language_loss": 0.81464523, + "learning_rate": 0.0004480662712435281, + "loss": 0.8253879, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.29736328, + "step": 2844, + "time_per_iteration": 2.731069326400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107206, + "balance_loss_mlp": 1.04254341, + "epoch": 0.5473258945748365, + "flos": 518437863936.0, + "grad_norm": 0.060615817798691185, + "language_loss": 0.8824929, + "learning_rate": 0.0004477564241211635, + "loss": 0.89321351, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.29467773, + "step": 2845, + "time_per_iteration": 2.5875682830810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079224, + "balance_loss_mlp": 1.04880142, + "epoch": 0.5475182762601001, + "flos": 433600722432.0, + "grad_norm": 0.0822753996114188, + "language_loss": 0.86914051, + "learning_rate": 0.0004474465972815541, + "loss": 0.87993276, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.30371094, + "step": 2846, + "time_per_iteration": 2.4777207374572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074275, + "balance_loss_mlp": 1.04406786, + "epoch": 0.5477106579453636, + "flos": 511308440064.0, + "grad_norm": 0.05432348028770475, + "language_loss": 0.87747157, + "learning_rate": 0.000447136790844985, + "loss": 0.88821435, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.30151367, + "step": 2847, + "time_per_iteration": 2.6856186389923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076945, + "balance_loss_mlp": 1.04623675, + "epoch": 0.5479030396306271, + "flos": 675606541824.0, + "grad_norm": 0.055626256163384374, + "language_loss": 0.81023288, + "learning_rate": 0.00044682700493173385, + "loss": 0.8210023, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.30664062, + "step": 2848, + "time_per_iteration": 2.8167617321014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082333, + "balance_loss_mlp": 1.05229259, + "epoch": 0.5480954213158907, + "flos": 875720498688.0, + "grad_norm": 0.06111415202222153, + "language_loss": 0.80075896, + "learning_rate": 0.00044651723966207004, + "loss": 0.81158233, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.29980469, + "step": 2849, + "time_per_iteration": 3.0959999561309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084207, + "balance_loss_mlp": 1.05435705, + "epoch": 0.5482878030011543, + "flos": 621715433472.0, + "grad_norm": 0.05903862339795778, + "language_loss": 0.78441715, + "learning_rate": 0.00044620749515625536, + "loss": 0.79525924, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.2980957, + "step": 2850, + "time_per_iteration": 2.7892706394195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108025, + "balance_loss_mlp": 1.05001831, + "epoch": 0.5484801846864179, + "flos": 496946285568.0, + "grad_norm": 0.0673362889441577, + "language_loss": 0.84918725, + "learning_rate": 0.00044589777153454334, + "loss": 0.85998976, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.30175781, + "step": 2851, + "time_per_iteration": 2.771003007888794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083219, + "balance_loss_mlp": 1.05241561, + "epoch": 0.5486725663716814, + "flos": 442202582016.0, + "grad_norm": 0.05413608872240749, + "language_loss": 0.83428276, + "learning_rate": 0.00044558806891717895, + "loss": 0.84511489, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.30761719, + "step": 2852, + "time_per_iteration": 2.499460220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088115, + "balance_loss_mlp": 1.0584085, + "epoch": 0.548864948056945, + "flos": 654867224064.0, + "grad_norm": 0.06786065051926819, + "language_loss": 0.79808474, + "learning_rate": 0.0004452783874243998, + "loss": 0.80896592, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.29663086, + "step": 2853, + "time_per_iteration": 2.8307228088378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084659, + "balance_loss_mlp": 1.05497599, + "epoch": 0.5490573297422086, + "flos": 545760958464.0, + "grad_norm": 0.06292410009946192, + "language_loss": 0.84795368, + "learning_rate": 0.00044496872717643475, + "loss": 0.85880023, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.29638672, + "step": 2854, + "time_per_iteration": 2.6626110076904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051353, + "balance_loss_mlp": 1.03819215, + "epoch": 0.5492497114274721, + "flos": 1589450245632.0, + "grad_norm": 0.03322747605543158, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.78140646, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.13183594, + "step": 2855, + "time_per_iteration": 4.957303285598755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083951, + "balance_loss_mlp": 1.05448246, + "epoch": 0.5494420931127356, + "flos": 750567237120.0, + "grad_norm": 0.04982994122271322, + "language_loss": 0.81768692, + "learning_rate": 0.0004443494708958217, + "loss": 0.82852638, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.29443359, + "step": 2856, + "time_per_iteration": 3.005343437194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088352, + "balance_loss_mlp": 1.0585736, + "epoch": 0.5496344747979992, + "flos": 625704956928.0, + "grad_norm": 0.04689474861444355, + "language_loss": 0.80522525, + "learning_rate": 0.0004440398751035906, + "loss": 0.8161087, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.29736328, + "step": 2857, + "time_per_iteration": 2.868595838546753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095367, + "balance_loss_mlp": 1.06659007, + "epoch": 0.5498268564832628, + "flos": 522859553280.0, + "grad_norm": 0.07030492887566664, + "language_loss": 0.83409548, + "learning_rate": 0.00044373030103700645, + "loss": 0.8450492, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.28759766, + "step": 2858, + "time_per_iteration": 2.5910122394561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094102, + "balance_loss_mlp": 1.06508696, + "epoch": 0.5500192381685264, + "flos": 604290544128.0, + "grad_norm": 0.06946154028242445, + "language_loss": 0.79413795, + "learning_rate": 0.000443420748816257, + "loss": 0.80507904, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.28979492, + "step": 2859, + "time_per_iteration": 2.825594663619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096344, + "balance_loss_mlp": 1.06706619, + "epoch": 0.55021161985379, + "flos": 520246361088.0, + "grad_norm": 0.06600867884275338, + "language_loss": 0.78576386, + "learning_rate": 0.0004431112185615208, + "loss": 0.79672724, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.29248047, + "step": 2860, + "time_per_iteration": 2.786670446395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090723, + "balance_loss_mlp": 1.06154037, + "epoch": 0.5504040015390534, + "flos": 489426955776.0, + "grad_norm": 0.06889565209263777, + "language_loss": 0.79788846, + "learning_rate": 0.00044280171039296845, + "loss": 0.80879569, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.29174805, + "step": 2861, + "time_per_iteration": 2.634674072265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090878, + "balance_loss_mlp": 1.0620054, + "epoch": 0.550596383224317, + "flos": 575519745024.0, + "grad_norm": 0.05438680375258401, + "language_loss": 0.88480103, + "learning_rate": 0.0004424922244307616, + "loss": 0.89570987, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.28857422, + "step": 2862, + "time_per_iteration": 2.6849331855773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093044, + "balance_loss_mlp": 1.06328964, + "epoch": 0.5507887649095806, + "flos": 642149213184.0, + "grad_norm": 0.06984640427248112, + "language_loss": 0.81865609, + "learning_rate": 0.00044218276079505315, + "loss": 0.82958651, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.29711914, + "step": 2863, + "time_per_iteration": 2.9186837673187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092581, + "balance_loss_mlp": 1.06289792, + "epoch": 0.5509811465948442, + "flos": 531589450752.0, + "grad_norm": 0.06524866768544495, + "language_loss": 0.74926496, + "learning_rate": 0.0004418733196059876, + "loss": 0.76019078, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.29663086, + "step": 2864, + "time_per_iteration": 2.74560546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084987, + "balance_loss_mlp": 1.05635333, + "epoch": 0.5511735282801077, + "flos": 654439440384.0, + "grad_norm": 0.056184402553186, + "language_loss": 0.79785758, + "learning_rate": 0.0004415639009837008, + "loss": 0.80870748, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.28637695, + "step": 2865, + "time_per_iteration": 2.81969952583313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087597, + "balance_loss_mlp": 1.05908251, + "epoch": 0.5513659099653713, + "flos": 529222159872.0, + "grad_norm": 0.061494004909324176, + "language_loss": 0.81620675, + "learning_rate": 0.00044125450504831955, + "loss": 0.82708275, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.28540039, + "step": 2866, + "time_per_iteration": 2.739954948425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085385, + "balance_loss_mlp": 1.05586863, + "epoch": 0.5515582916506349, + "flos": 554594162688.0, + "grad_norm": 0.07127737838687996, + "language_loss": 0.81880403, + "learning_rate": 0.0004409451319199622, + "loss": 0.82965791, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.29467773, + "step": 2867, + "time_per_iteration": 2.6776282787323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077009, + "balance_loss_mlp": 1.0484705, + "epoch": 0.5517506733358984, + "flos": 735067298304.0, + "grad_norm": 0.06535442843844029, + "language_loss": 0.84516299, + "learning_rate": 0.0004406357817187381, + "loss": 0.85593313, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.28540039, + "step": 2868, + "time_per_iteration": 3.002542495727539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081101, + "balance_loss_mlp": 1.05170417, + "epoch": 0.551943055021162, + "flos": 1114861432320.0, + "grad_norm": 0.05667738365358171, + "language_loss": 0.81411439, + "learning_rate": 0.0004403264545647474, + "loss": 0.82492542, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.29370117, + "step": 2869, + "time_per_iteration": 3.523195505142212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080839, + "balance_loss_mlp": 1.05196702, + "epoch": 0.5521354367064255, + "flos": 544092083712.0, + "grad_norm": 0.062383704003679354, + "language_loss": 0.8429901, + "learning_rate": 0.00044001715057808154, + "loss": 0.85379851, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.28808594, + "step": 2870, + "time_per_iteration": 2.759244680404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085176, + "balance_loss_mlp": 1.05496836, + "epoch": 0.5523278183916891, + "flos": 935889845760.0, + "grad_norm": 0.05408626919612749, + "language_loss": 0.81631571, + "learning_rate": 0.0004397078698788232, + "loss": 0.82716751, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.30175781, + "step": 2871, + "time_per_iteration": 3.2238638401031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_mlp": 1.0167197, + "epoch": 0.5525202000769527, + "flos": 1465117645824.0, + "grad_norm": 0.017765030651381717, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81471765, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.12695312, + "step": 2872, + "time_per_iteration": 4.941680431365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084518, + "balance_loss_mlp": 1.05442953, + "epoch": 0.5527125817622163, + "flos": 489554993664.0, + "grad_norm": 0.06021715836391359, + "language_loss": 0.77858603, + "learning_rate": 0.00043908937882281343, + "loss": 0.78943121, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.30029297, + "step": 2873, + "time_per_iteration": 2.6475777626037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079069, + "balance_loss_mlp": 1.04845667, + "epoch": 0.5529049634474797, + "flos": 634606562304.0, + "grad_norm": 0.05779342240658392, + "language_loss": 0.82503784, + "learning_rate": 0.0004387801687061814, + "loss": 0.83582854, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.30566406, + "step": 2874, + "time_per_iteration": 2.8554017543792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078914, + "balance_loss_mlp": 1.04963589, + "epoch": 0.5530973451327433, + "flos": 580986086400.0, + "grad_norm": 0.0636526113513214, + "language_loss": 0.80157411, + "learning_rate": 0.0004384709823571958, + "loss": 0.81236321, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.29223633, + "step": 2875, + "time_per_iteration": 2.749535322189331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076752, + "balance_loss_mlp": 1.04764128, + "epoch": 0.5532897268180069, + "flos": 1122030144000.0, + "grad_norm": 0.06015536663517987, + "language_loss": 0.82898968, + "learning_rate": 0.0004381618198958932, + "loss": 0.8397572, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.29052734, + "step": 2876, + "time_per_iteration": 3.518888235092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078853, + "balance_loss_mlp": 1.0494318, + "epoch": 0.5534821085032705, + "flos": 636965088768.0, + "grad_norm": 0.05611364502947972, + "language_loss": 0.83295852, + "learning_rate": 0.00043785268144230137, + "loss": 0.84374702, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.29418945, + "step": 2877, + "time_per_iteration": 2.8977479934692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078991, + "balance_loss_mlp": 1.04916453, + "epoch": 0.5536744901885341, + "flos": 570837597696.0, + "grad_norm": 0.07334940017367843, + "language_loss": 0.82020825, + "learning_rate": 0.00043754356711643837, + "loss": 0.83099812, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.29785156, + "step": 2878, + "time_per_iteration": 2.6804401874542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080304, + "balance_loss_mlp": 1.04964316, + "epoch": 0.5538668718737976, + "flos": 595418052096.0, + "grad_norm": 0.0625181232423103, + "language_loss": 0.84172422, + "learning_rate": 0.0004372344770383132, + "loss": 0.85252726, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.30615234, + "step": 2879, + "time_per_iteration": 2.80837345123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077442, + "balance_loss_mlp": 1.04766345, + "epoch": 0.5540592535590612, + "flos": 532324182528.0, + "grad_norm": 0.05711228581787917, + "language_loss": 0.82837629, + "learning_rate": 0.00043692541132792507, + "loss": 0.83915067, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.29736328, + "step": 2880, + "time_per_iteration": 2.7545833587646484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076783, + "balance_loss_mlp": 1.04738569, + "epoch": 0.5542516352443247, + "flos": 412398715392.0, + "grad_norm": 0.06446598855551679, + "language_loss": 0.83125883, + "learning_rate": 0.00043661637010526384, + "loss": 0.84202665, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.29370117, + "step": 2881, + "time_per_iteration": 2.4907724857330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072171, + "balance_loss_mlp": 1.04139102, + "epoch": 0.5544440169295883, + "flos": 547341083136.0, + "grad_norm": 0.05841414515956175, + "language_loss": 0.82957321, + "learning_rate": 0.00043630735349031025, + "loss": 0.8402949, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.30737305, + "step": 2882, + "time_per_iteration": 2.6922152042388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071624, + "balance_loss_mlp": 1.04101133, + "epoch": 0.5546363986148518, + "flos": 621518994432.0, + "grad_norm": 0.05422763519754927, + "language_loss": 0.81816816, + "learning_rate": 0.00043599836160303495, + "loss": 0.82888442, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.30566406, + "step": 2883, + "time_per_iteration": 2.861325979232788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069587, + "balance_loss_mlp": 1.03914118, + "epoch": 0.5548287803001154, + "flos": 704972450304.0, + "grad_norm": 0.05987077775612136, + "language_loss": 0.77311337, + "learning_rate": 0.0004356893945633995, + "loss": 0.78380919, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.30395508, + "step": 2884, + "time_per_iteration": 2.964421510696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070587, + "balance_loss_mlp": 1.03930664, + "epoch": 0.555021161985379, + "flos": 503952053760.0, + "grad_norm": 0.16390384373312603, + "language_loss": 0.81600153, + "learning_rate": 0.0004353804524913551, + "loss": 0.82670736, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.3125, + "step": 2885, + "time_per_iteration": 2.6043736934661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068449, + "balance_loss_mlp": 1.03721642, + "epoch": 0.5552135436706426, + "flos": 615782020608.0, + "grad_norm": 0.06199045057720987, + "language_loss": 0.81625175, + "learning_rate": 0.0004350715355068441, + "loss": 0.82693619, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.31225586, + "step": 2886, + "time_per_iteration": 2.7229857444763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072103, + "balance_loss_mlp": 1.04051256, + "epoch": 0.5554059253559062, + "flos": 463635933696.0, + "grad_norm": 0.06868325666686464, + "language_loss": 0.79814357, + "learning_rate": 0.00043476264372979847, + "loss": 0.80886459, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.31567383, + "step": 2887, + "time_per_iteration": 2.5191705226898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071885, + "balance_loss_mlp": 1.0417012, + "epoch": 0.5555983070411696, + "flos": 1561923813888.0, + "grad_norm": 0.07224884026335429, + "language_loss": 0.78504527, + "learning_rate": 0.0004344537772801408, + "loss": 0.79576409, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.30151367, + "step": 2888, + "time_per_iteration": 3.803917646408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032514, + "balance_loss_mlp": 1.02040219, + "epoch": 0.5557906887264332, + "flos": 1467093468672.0, + "grad_norm": 0.021049912274883148, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74454963, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.12109375, + "step": 2889, + "time_per_iteration": 4.967891216278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077464, + "balance_loss_mlp": 1.04613566, + "epoch": 0.5559830704116968, + "flos": 529575750144.0, + "grad_norm": 0.06601593716549485, + "language_loss": 0.83441556, + "learning_rate": 0.0004338361208426298, + "loss": 0.84519023, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.31298828, + "step": 2890, + "time_per_iteration": 2.6076786518096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077476, + "balance_loss_mlp": 1.0466727, + "epoch": 0.5561754520969604, + "flos": 650895077376.0, + "grad_norm": 0.05044338716051736, + "language_loss": 0.81248903, + "learning_rate": 0.00043352733109457164, + "loss": 0.82326382, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.30761719, + "step": 2891, + "time_per_iteration": 2.893113136291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081411, + "balance_loss_mlp": 1.05148911, + "epoch": 0.556367833782224, + "flos": 733968801792.0, + "grad_norm": 0.05185548617134015, + "language_loss": 0.84650671, + "learning_rate": 0.00043321856715349244, + "loss": 0.8573209, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.29907227, + "step": 2892, + "time_per_iteration": 2.9470455646514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080498, + "balance_loss_mlp": 1.05024242, + "epoch": 0.5565602154674875, + "flos": 672120405504.0, + "grad_norm": 0.060968656189677554, + "language_loss": 0.80153251, + "learning_rate": 0.00043290982913926466, + "loss": 0.81233752, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.30249023, + "step": 2893, + "time_per_iteration": 2.801114559173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082896, + "balance_loss_mlp": 1.05283189, + "epoch": 0.556752597152751, + "flos": 585911162880.0, + "grad_norm": 0.06077441603872835, + "language_loss": 0.83792776, + "learning_rate": 0.0004326011171717514, + "loss": 0.84875673, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.30004883, + "step": 2894, + "time_per_iteration": 2.889112710952759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077209, + "balance_loss_mlp": 1.04762125, + "epoch": 0.5569449788380146, + "flos": 437549548032.0, + "grad_norm": 0.06532751979042353, + "language_loss": 0.81112337, + "learning_rate": 0.0004322924313708051, + "loss": 0.82189548, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.29614258, + "step": 2895, + "time_per_iteration": 2.5237138271331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077612, + "balance_loss_mlp": 1.04895401, + "epoch": 0.5571373605232782, + "flos": 502002372096.0, + "grad_norm": 0.06395509577189365, + "language_loss": 0.84357458, + "learning_rate": 0.0004319837718562681, + "loss": 0.85435069, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.28686523, + "step": 2896, + "time_per_iteration": 2.6235451698303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081945, + "balance_loss_mlp": 1.05123627, + "epoch": 0.5573297422085417, + "flos": 577126010880.0, + "grad_norm": 0.07087835610959153, + "language_loss": 0.82998407, + "learning_rate": 0.0004316751387479726, + "loss": 0.8408035, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.30664062, + "step": 2897, + "time_per_iteration": 2.7460193634033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081079, + "balance_loss_mlp": 1.05115747, + "epoch": 0.5575221238938053, + "flos": 1343536754688.0, + "grad_norm": 0.06734561564060734, + "language_loss": 0.82601708, + "learning_rate": 0.0004313665321657409, + "loss": 0.83682787, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.29882812, + "step": 2898, + "time_per_iteration": 3.700585126876831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083979, + "balance_loss_mlp": 1.05393827, + "epoch": 0.5577145055790689, + "flos": 601680324096.0, + "grad_norm": 0.06408348461050545, + "language_loss": 0.79922706, + "learning_rate": 0.00043105795222938436, + "loss": 0.81006682, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.30004883, + "step": 2899, + "time_per_iteration": 2.785468816757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077879, + "balance_loss_mlp": 1.04776657, + "epoch": 0.5579068872643325, + "flos": 562353601536.0, + "grad_norm": 0.056878366734987945, + "language_loss": 0.78559703, + "learning_rate": 0.00043074939905870467, + "loss": 0.79637581, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.30078125, + "step": 2900, + "time_per_iteration": 2.6782429218292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081281, + "balance_loss_mlp": 1.05157411, + "epoch": 0.558099268949596, + "flos": 544292904960.0, + "grad_norm": 0.061480860141572814, + "language_loss": 0.806315, + "learning_rate": 0.0004304408727734927, + "loss": 0.81712782, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.296875, + "step": 2901, + "time_per_iteration": 2.6361851692199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089927, + "balance_loss_mlp": 1.05955291, + "epoch": 0.5582916506348595, + "flos": 552520825344.0, + "grad_norm": 0.045249909626423154, + "language_loss": 0.88812852, + "learning_rate": 0.0004301323734935288, + "loss": 0.89902782, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.3034668, + "step": 2902, + "time_per_iteration": 2.650801181793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085072, + "balance_loss_mlp": 1.05541265, + "epoch": 0.5584840323201231, + "flos": 543126007296.0, + "grad_norm": 0.061039385793722846, + "language_loss": 0.87144208, + "learning_rate": 0.000429823901338583, + "loss": 0.88229275, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.29638672, + "step": 2903, + "time_per_iteration": 2.603729486465454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108106, + "balance_loss_mlp": 1.05128181, + "epoch": 0.5586764140053867, + "flos": 815212118016.0, + "grad_norm": 0.060582508535745275, + "language_loss": 0.86712891, + "learning_rate": 0.00042951545642841513, + "loss": 0.87793946, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.29711914, + "step": 2904, + "time_per_iteration": 3.0844316482543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084583, + "balance_loss_mlp": 1.05437517, + "epoch": 0.5588687956906503, + "flos": 486196895232.0, + "grad_norm": 0.055991570648287706, + "language_loss": 0.86597067, + "learning_rate": 0.0004292070388827737, + "loss": 0.87681645, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.30175781, + "step": 2905, + "time_per_iteration": 2.561948537826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082655, + "balance_loss_mlp": 1.0526619, + "epoch": 0.5590611773759138, + "flos": 451809805824.0, + "grad_norm": 0.06056202554709599, + "language_loss": 0.80913132, + "learning_rate": 0.00042889864882139753, + "loss": 0.81995785, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.29956055, + "step": 2906, + "time_per_iteration": 2.584385871887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088672, + "balance_loss_mlp": 1.05913234, + "epoch": 0.5592535590611774, + "flos": 520693083648.0, + "grad_norm": 0.05654682862292604, + "language_loss": 0.81697655, + "learning_rate": 0.0004285902863640139, + "loss": 0.82786322, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.29516602, + "step": 2907, + "time_per_iteration": 2.598034620285034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084011, + "balance_loss_mlp": 1.05342221, + "epoch": 0.5594459407464409, + "flos": 552250192896.0, + "grad_norm": 0.05788374674587666, + "language_loss": 0.85753977, + "learning_rate": 0.00042828195163033966, + "loss": 0.86837995, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.30566406, + "step": 2908, + "time_per_iteration": 2.654411792755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081373, + "balance_loss_mlp": 1.05099869, + "epoch": 0.5596383224317045, + "flos": 484596421632.0, + "grad_norm": 0.05647224332708591, + "language_loss": 0.79214805, + "learning_rate": 0.0004279736447400812, + "loss": 0.80296183, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.30322266, + "step": 2909, + "time_per_iteration": 2.6054940223693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108459, + "balance_loss_mlp": 1.05421579, + "epoch": 0.5598307041169681, + "flos": 610976217600.0, + "grad_norm": 0.05245180641385236, + "language_loss": 0.78436708, + "learning_rate": 0.00042766536581293385, + "loss": 0.79521292, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.3034668, + "step": 2910, + "time_per_iteration": 2.735391139984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086034, + "balance_loss_mlp": 1.0553261, + "epoch": 0.5600230858022316, + "flos": 488585945088.0, + "grad_norm": 0.07209314448313818, + "language_loss": 0.79203892, + "learning_rate": 0.0004273571149685819, + "loss": 0.80289924, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.30664062, + "step": 2911, + "time_per_iteration": 2.7689387798309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081503, + "balance_loss_mlp": 1.05234432, + "epoch": 0.5602154674874952, + "flos": 598592858112.0, + "grad_norm": 0.05523073387542819, + "language_loss": 0.8391124, + "learning_rate": 0.00042704889232669937, + "loss": 0.84992743, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.29125977, + "step": 2912, + "time_per_iteration": 2.7328362464904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082045, + "balance_loss_mlp": 1.05288625, + "epoch": 0.5604078491727588, + "flos": 585697347072.0, + "grad_norm": 0.0608748772154565, + "language_loss": 0.85180819, + "learning_rate": 0.0004267406980069484, + "loss": 0.8626287, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.29150391, + "step": 2913, + "time_per_iteration": 2.6889522075653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083223, + "balance_loss_mlp": 1.05416012, + "epoch": 0.5606002308580224, + "flos": 540926042112.0, + "grad_norm": 0.0517518520900543, + "language_loss": 0.79621083, + "learning_rate": 0.0004264325321289808, + "loss": 0.80704308, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.2902832, + "step": 2914, + "time_per_iteration": 2.7854018211364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080994, + "balance_loss_mlp": 1.05145359, + "epoch": 0.5607926125432858, + "flos": 583654533120.0, + "grad_norm": 0.05874282962966631, + "language_loss": 0.86178029, + "learning_rate": 0.00042612439481243736, + "loss": 0.87259024, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.29516602, + "step": 2915, + "time_per_iteration": 2.7484261989593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082353, + "balance_loss_mlp": 1.05264628, + "epoch": 0.5609849942285494, + "flos": 627205095936.0, + "grad_norm": 0.06045457404054478, + "language_loss": 0.89827836, + "learning_rate": 0.00042581628617694735, + "loss": 0.90910184, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.29663086, + "step": 2916, + "time_per_iteration": 2.7450428009033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108385, + "balance_loss_mlp": 1.05376196, + "epoch": 0.561177375913813, + "flos": 588095161344.0, + "grad_norm": 0.06174360046329572, + "language_loss": 0.81716877, + "learning_rate": 0.0004255082063421296, + "loss": 0.82800722, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.30078125, + "step": 2917, + "time_per_iteration": 2.681556463241577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080705, + "balance_loss_mlp": 1.0505209, + "epoch": 0.5613697575990766, + "flos": 526774883328.0, + "grad_norm": 0.07215647610626674, + "language_loss": 0.85068524, + "learning_rate": 0.00042520015542759065, + "loss": 0.86149234, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.30151367, + "step": 2918, + "time_per_iteration": 2.838871717453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083881, + "balance_loss_mlp": 1.05379248, + "epoch": 0.5615621392843402, + "flos": 642351444480.0, + "grad_norm": 0.06380613116798055, + "language_loss": 0.88105166, + "learning_rate": 0.00042489213355292687, + "loss": 0.89189053, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.30053711, + "step": 2919, + "time_per_iteration": 2.882988214492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081698, + "balance_loss_mlp": 1.0521102, + "epoch": 0.5617545209696037, + "flos": 427524715008.0, + "grad_norm": 0.05903342570268675, + "language_loss": 0.80986512, + "learning_rate": 0.00042458414083772276, + "loss": 0.82068217, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.29541016, + "step": 2920, + "time_per_iteration": 2.520209550857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107915, + "balance_loss_mlp": 1.04829907, + "epoch": 0.5619469026548672, + "flos": 568140037632.0, + "grad_norm": 0.05182413981421792, + "language_loss": 0.85047603, + "learning_rate": 0.000424276177401552, + "loss": 0.86126757, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.30810547, + "step": 2921, + "time_per_iteration": 2.777956008911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075106, + "balance_loss_mlp": 1.04435039, + "epoch": 0.5621392843401308, + "flos": 504947243520.0, + "grad_norm": 0.05854064719302618, + "language_loss": 0.85700345, + "learning_rate": 0.0004239682433639763, + "loss": 0.86775458, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.30712891, + "step": 2922, + "time_per_iteration": 2.658231019973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074103, + "balance_loss_mlp": 1.04344249, + "epoch": 0.5623316660253944, + "flos": 516744258048.0, + "grad_norm": 0.07532891292065343, + "language_loss": 0.85277867, + "learning_rate": 0.0004236603388445467, + "loss": 0.86351973, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.30639648, + "step": 2923, + "time_per_iteration": 2.5820417404174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073675, + "balance_loss_mlp": 1.04346776, + "epoch": 0.5625240477106579, + "flos": 605732456448.0, + "grad_norm": 0.05777778027932593, + "language_loss": 0.82139969, + "learning_rate": 0.00042335246396280166, + "loss": 0.83213639, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.30151367, + "step": 2924, + "time_per_iteration": 2.7298922538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072793, + "balance_loss_mlp": 1.04198885, + "epoch": 0.5627164293959215, + "flos": 450203539968.0, + "grad_norm": 0.06950178029529624, + "language_loss": 0.90437222, + "learning_rate": 0.0004230446188382693, + "loss": 0.9151001, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.30761719, + "step": 2925, + "time_per_iteration": 2.533452033996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072327, + "balance_loss_mlp": 1.04133308, + "epoch": 0.5629088110811851, + "flos": 741734032896.0, + "grad_norm": 0.061159313769390204, + "language_loss": 0.80411077, + "learning_rate": 0.0004227368035904654, + "loss": 0.81483406, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.30957031, + "step": 2926, + "time_per_iteration": 2.953749895095825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070388, + "balance_loss_mlp": 1.04001379, + "epoch": 0.5631011927664487, + "flos": 496719323136.0, + "grad_norm": 0.05619049718209651, + "language_loss": 0.82702053, + "learning_rate": 0.00042242901833889474, + "loss": 0.83772445, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.30322266, + "step": 2927, + "time_per_iteration": 2.6141388416290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079835, + "balance_loss_mlp": 1.04977047, + "epoch": 0.5632935744517122, + "flos": 885774445056.0, + "grad_norm": 0.06403217415420936, + "language_loss": 0.86264247, + "learning_rate": 0.0004221212632030501, + "loss": 0.8734408, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.30004883, + "step": 2928, + "time_per_iteration": 3.0815889835357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079959, + "balance_loss_mlp": 1.04953694, + "epoch": 0.5634859561369757, + "flos": 604516096512.0, + "grad_norm": 0.0586888061552407, + "language_loss": 0.7995134, + "learning_rate": 0.0004218135383024124, + "loss": 0.81031299, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.30395508, + "step": 2929, + "time_per_iteration": 2.7041475772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074718, + "balance_loss_mlp": 1.04417634, + "epoch": 0.5636783378222393, + "flos": 453670737408.0, + "grad_norm": 0.06027811401713532, + "language_loss": 0.84979665, + "learning_rate": 0.0004215058437564511, + "loss": 0.86054391, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.30493164, + "step": 2930, + "time_per_iteration": 2.5627479553222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074654, + "balance_loss_mlp": 1.04427934, + "epoch": 0.5638707195075029, + "flos": 518206519296.0, + "grad_norm": 0.054381619158741505, + "language_loss": 0.8244099, + "learning_rate": 0.00042119817968462397, + "loss": 0.83515644, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.30322266, + "step": 2931, + "time_per_iteration": 2.5824992656707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076007, + "balance_loss_mlp": 1.04517913, + "epoch": 0.5640631011927665, + "flos": 564632142336.0, + "grad_norm": 0.06458971753482587, + "language_loss": 0.86743045, + "learning_rate": 0.0004208905462063766, + "loss": 0.87819058, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.30786133, + "step": 2932, + "time_per_iteration": 2.6889755725860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075474, + "balance_loss_mlp": 1.04447937, + "epoch": 0.56425548287803, + "flos": 516783545856.0, + "grad_norm": 0.05636003677155103, + "language_loss": 0.84317416, + "learning_rate": 0.00042058294344114315, + "loss": 0.85392892, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.30957031, + "step": 2933, + "time_per_iteration": 2.626492500305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073066, + "balance_loss_mlp": 1.0428108, + "epoch": 0.5644478645632935, + "flos": 853907415552.0, + "grad_norm": 0.05419859074132438, + "language_loss": 0.77552223, + "learning_rate": 0.0004202753715083456, + "loss": 0.78625292, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.30224609, + "step": 2934, + "time_per_iteration": 3.0855889320373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077905, + "balance_loss_mlp": 1.04767334, + "epoch": 0.5646402462485571, + "flos": 553175571456.0, + "grad_norm": 0.0600578906837947, + "language_loss": 0.81160748, + "learning_rate": 0.0004199678305273936, + "loss": 0.8223865, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.30200195, + "step": 2935, + "time_per_iteration": 2.680676221847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072428, + "balance_loss_mlp": 1.04176772, + "epoch": 0.5648326279338207, + "flos": 685661898240.0, + "grad_norm": 0.07403764487671594, + "language_loss": 0.81138289, + "learning_rate": 0.0004196603206176854, + "loss": 0.8221072, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.30615234, + "step": 2936, + "time_per_iteration": 2.930933713912964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084589, + "balance_loss_mlp": 1.05526328, + "epoch": 0.5650250096190843, + "flos": 802990291968.0, + "grad_norm": 0.06763515513860026, + "language_loss": 0.8344292, + "learning_rate": 0.000419352841898607, + "loss": 0.8452751, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.29272461, + "step": 2937, + "time_per_iteration": 2.983389377593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076783, + "balance_loss_mlp": 1.04714775, + "epoch": 0.5652173913043478, + "flos": 581787809280.0, + "grad_norm": 0.06159153322850295, + "language_loss": 0.77355075, + "learning_rate": 0.000419045394489532, + "loss": 0.78431857, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.29589844, + "step": 2938, + "time_per_iteration": 2.7125768661499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082739, + "balance_loss_mlp": 1.05229306, + "epoch": 0.5654097729896114, + "flos": 820269614592.0, + "grad_norm": 0.051986884313783496, + "language_loss": 0.76774859, + "learning_rate": 0.0004187379785098224, + "loss": 0.77857602, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.30395508, + "step": 2939, + "time_per_iteration": 3.127896547317505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078707, + "balance_loss_mlp": 1.04854691, + "epoch": 0.565602154674875, + "flos": 783826716672.0, + "grad_norm": 0.05965997721506439, + "language_loss": 0.83921504, + "learning_rate": 0.00041843059407882744, + "loss": 0.85000205, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.30126953, + "step": 2940, + "time_per_iteration": 2.97220778465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010812, + "balance_loss_mlp": 1.05113554, + "epoch": 0.5657945363601385, + "flos": 549418802688.0, + "grad_norm": 0.05367108270531433, + "language_loss": 0.82534146, + "learning_rate": 0.0004181232413158842, + "loss": 0.83615345, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.30004883, + "step": 2941, + "time_per_iteration": 2.642336368560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108333, + "balance_loss_mlp": 1.05405188, + "epoch": 0.5659869180454021, + "flos": 667826754048.0, + "grad_norm": 0.06412651995290534, + "language_loss": 0.82513189, + "learning_rate": 0.0004178159203403179, + "loss": 0.83596516, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.29272461, + "step": 2942, + "time_per_iteration": 2.856449842453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082217, + "balance_loss_mlp": 1.05260575, + "epoch": 0.5661792997306656, + "flos": 499707864576.0, + "grad_norm": 0.056771241115104176, + "language_loss": 0.81273901, + "learning_rate": 0.0004175086312714409, + "loss": 0.82356119, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.2956543, + "step": 2943, + "time_per_iteration": 2.62709903717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088098, + "balance_loss_mlp": 1.05898714, + "epoch": 0.5663716814159292, + "flos": 600922271232.0, + "grad_norm": 0.050224853353863855, + "language_loss": 0.83679438, + "learning_rate": 0.00041720137422855366, + "loss": 0.84767538, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.29052734, + "step": 2944, + "time_per_iteration": 2.730576515197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086164, + "balance_loss_mlp": 1.05710077, + "epoch": 0.5665640631011928, + "flos": 540728193024.0, + "grad_norm": 0.0578384318096137, + "language_loss": 0.78684467, + "learning_rate": 0.00041689414933094383, + "loss": 0.79770631, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.2902832, + "step": 2945, + "time_per_iteration": 2.6317861080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084966, + "balance_loss_mlp": 1.05483007, + "epoch": 0.5667564447864564, + "flos": 601655592960.0, + "grad_norm": 0.061631419209263724, + "language_loss": 0.80986917, + "learning_rate": 0.00041658695669788653, + "loss": 0.82071877, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.30102539, + "step": 2946, + "time_per_iteration": 2.766889810562134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083037, + "balance_loss_mlp": 1.05352092, + "epoch": 0.5669488264717198, + "flos": 659224894464.0, + "grad_norm": 0.08686938236765575, + "language_loss": 0.81373537, + "learning_rate": 0.00041627979644864453, + "loss": 0.82456571, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.29467773, + "step": 2947, + "time_per_iteration": 2.7937870025634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085685, + "balance_loss_mlp": 1.0563122, + "epoch": 0.5671412081569834, + "flos": 485158035456.0, + "grad_norm": 0.05686002455066826, + "language_loss": 0.81299067, + "learning_rate": 0.0004159726687024683, + "loss": 0.82384753, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.29345703, + "step": 2948, + "time_per_iteration": 2.636784791946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083527, + "balance_loss_mlp": 1.05417752, + "epoch": 0.567333589842247, + "flos": 729487475712.0, + "grad_norm": 0.057207156589959604, + "language_loss": 0.7857877, + "learning_rate": 0.00041566557357859506, + "loss": 0.79662293, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.29321289, + "step": 2949, + "time_per_iteration": 2.8607821464538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080781, + "balance_loss_mlp": 1.05131269, + "epoch": 0.5675259715275106, + "flos": 968471258112.0, + "grad_norm": 0.050618871180039625, + "language_loss": 0.79166919, + "learning_rate": 0.0004153585111962502, + "loss": 0.802477, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.29443359, + "step": 2950, + "time_per_iteration": 3.306715250015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108342, + "balance_loss_mlp": 1.05387974, + "epoch": 0.5677183532127742, + "flos": 564879453696.0, + "grad_norm": 0.08196542197504524, + "language_loss": 0.84189069, + "learning_rate": 0.0004150514816746453, + "loss": 0.85272491, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.29492188, + "step": 2951, + "time_per_iteration": 2.6732659339904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080966, + "balance_loss_mlp": 1.05190265, + "epoch": 0.5679107348980377, + "flos": 551432503296.0, + "grad_norm": 0.06474663434913709, + "language_loss": 0.85581088, + "learning_rate": 0.0004147444851329802, + "loss": 0.86662048, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.29003906, + "step": 2952, + "time_per_iteration": 2.647568941116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079758, + "balance_loss_mlp": 1.05081391, + "epoch": 0.5681031165833013, + "flos": 819115863552.0, + "grad_norm": 0.0574748240063073, + "language_loss": 0.85410154, + "learning_rate": 0.00041443752169044126, + "loss": 0.8648991, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.28955078, + "step": 2953, + "time_per_iteration": 3.018815040588379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081341, + "balance_loss_mlp": 1.05227828, + "epoch": 0.5682954982685648, + "flos": 617731702272.0, + "grad_norm": 0.05380576703697579, + "language_loss": 0.846789, + "learning_rate": 0.0004141305914662025, + "loss": 0.85760248, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.29052734, + "step": 2954, + "time_per_iteration": 2.7356324195861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088016, + "balance_loss_mlp": 1.05807066, + "epoch": 0.5684878799538284, + "flos": 647625729024.0, + "grad_norm": 0.05392421630137883, + "language_loss": 0.80538452, + "learning_rate": 0.0004138236945794246, + "loss": 0.81626463, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.29907227, + "step": 2955, + "time_per_iteration": 2.8904106616973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082907, + "balance_loss_mlp": 1.05439222, + "epoch": 0.5686802616390919, + "flos": 805615068672.0, + "grad_norm": 0.07320613099583566, + "language_loss": 0.83898306, + "learning_rate": 0.00041351683114925576, + "loss": 0.84981215, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.28491211, + "step": 2956, + "time_per_iteration": 3.0756330490112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085945, + "balance_loss_mlp": 1.05683398, + "epoch": 0.5688726433243555, + "flos": 546882776064.0, + "grad_norm": 0.05933823821942172, + "language_loss": 0.86556458, + "learning_rate": 0.0004132100012948308, + "loss": 0.87642407, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.29077148, + "step": 2957, + "time_per_iteration": 2.6803860664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085401, + "balance_loss_mlp": 1.05614674, + "epoch": 0.5690650250096191, + "flos": 486324933120.0, + "grad_norm": 0.06187903851247569, + "language_loss": 0.84050244, + "learning_rate": 0.00041290320513527145, + "loss": 0.85135645, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.29248047, + "step": 2958, + "time_per_iteration": 2.54225754737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084682, + "balance_loss_mlp": 1.05545211, + "epoch": 0.5692574066948827, + "flos": 577184237568.0, + "grad_norm": 0.04955077863713089, + "language_loss": 0.85089266, + "learning_rate": 0.0004125964427896867, + "loss": 0.86173952, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.29199219, + "step": 2959, + "time_per_iteration": 2.716848611831665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083431, + "balance_loss_mlp": 1.0530802, + "epoch": 0.5694497883801463, + "flos": 454005388800.0, + "grad_norm": 0.0635030186812047, + "language_loss": 0.79277623, + "learning_rate": 0.0004122897143771723, + "loss": 0.80361056, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.30297852, + "step": 2960, + "time_per_iteration": 2.53230357170105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086179, + "balance_loss_mlp": 1.05628169, + "epoch": 0.5696421700654097, + "flos": 559251578880.0, + "grad_norm": 0.052407613892641675, + "language_loss": 0.81192493, + "learning_rate": 0.0004119830200168109, + "loss": 0.82278675, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.29858398, + "step": 2961, + "time_per_iteration": 2.684126377105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083551, + "balance_loss_mlp": 1.05355775, + "epoch": 0.5698345517506733, + "flos": 465314982912.0, + "grad_norm": 0.06121192976286501, + "language_loss": 0.88053119, + "learning_rate": 0.0004116763598276714, + "loss": 0.89136672, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.29956055, + "step": 2962, + "time_per_iteration": 2.531313180923462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108181, + "balance_loss_mlp": 1.05138803, + "epoch": 0.5700269334359369, + "flos": 605645116416.0, + "grad_norm": 0.069996546899228, + "language_loss": 0.8081792, + "learning_rate": 0.00041136973392881017, + "loss": 0.81899732, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.30395508, + "step": 2963, + "time_per_iteration": 2.8093085289001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083477, + "balance_loss_mlp": 1.05357933, + "epoch": 0.5702193151212005, + "flos": 562423412736.0, + "grad_norm": 0.06390032386968057, + "language_loss": 0.8227576, + "learning_rate": 0.00041106314243926983, + "loss": 0.8335923, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.29858398, + "step": 2964, + "time_per_iteration": 2.740004062652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080188, + "balance_loss_mlp": 1.05062366, + "epoch": 0.570411696806464, + "flos": 522983208960.0, + "grad_norm": 0.060533570265575896, + "language_loss": 0.87250763, + "learning_rate": 0.0004107565854780798, + "loss": 0.88330954, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.29516602, + "step": 2965, + "time_per_iteration": 2.6749136447906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080245, + "balance_loss_mlp": 1.05111039, + "epoch": 0.5706040784917276, + "flos": 717911631360.0, + "grad_norm": 0.06664541213513904, + "language_loss": 0.80888879, + "learning_rate": 0.000410450063164256, + "loss": 0.81969118, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.29077148, + "step": 2966, + "time_per_iteration": 2.8448963165283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081067, + "balance_loss_mlp": 1.05081153, + "epoch": 0.5707964601769911, + "flos": 476467425792.0, + "grad_norm": 0.06804112412049489, + "language_loss": 0.82108605, + "learning_rate": 0.00041014357561680115, + "loss": 0.83189678, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.30200195, + "step": 2967, + "time_per_iteration": 2.5226550102233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084124, + "balance_loss_mlp": 1.0544889, + "epoch": 0.5709888418622547, + "flos": 579823570944.0, + "grad_norm": 0.059986306134107735, + "language_loss": 0.86107051, + "learning_rate": 0.0004098371229547039, + "loss": 0.87191176, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.29589844, + "step": 2968, + "time_per_iteration": 2.7232651710510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046193, + "balance_loss_mlp": 1.03398585, + "epoch": 0.5711812235475183, + "flos": 1579108290048.0, + "grad_norm": 0.025451731838023718, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81057, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.12207031, + "step": 2969, + "time_per_iteration": 4.785320997238159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082869, + "balance_loss_mlp": 1.05330527, + "epoch": 0.5713736052327818, + "flos": 468259854336.0, + "grad_norm": 0.07178133530641487, + "language_loss": 0.80500889, + "learning_rate": 0.00040922432276247107, + "loss": 0.81583756, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.29516602, + "step": 2970, + "time_per_iteration": 2.5877230167388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086085, + "balance_loss_mlp": 1.05635428, + "epoch": 0.5715659869180454, + "flos": 537390443520.0, + "grad_norm": 0.05561639186548029, + "language_loss": 0.84452176, + "learning_rate": 0.0004089179754702457, + "loss": 0.85538256, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.29663086, + "step": 2971, + "time_per_iteration": 2.759932041168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084469, + "balance_loss_mlp": 1.05469072, + "epoch": 0.571758368603309, + "flos": 655778045952.0, + "grad_norm": 0.05716809371830958, + "language_loss": 0.79499936, + "learning_rate": 0.00040861166353919843, + "loss": 0.80584407, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.29711914, + "step": 2972, + "time_per_iteration": 2.856147050857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080407, + "balance_loss_mlp": 1.05213094, + "epoch": 0.5719507502885726, + "flos": 667609966080.0, + "grad_norm": 0.054720530113361164, + "language_loss": 0.81279707, + "learning_rate": 0.00040830538708824983, + "loss": 0.82360113, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.28295898, + "step": 2973, + "time_per_iteration": 2.9099643230438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083682, + "balance_loss_mlp": 1.05414152, + "epoch": 0.572143131973836, + "flos": 476083312128.0, + "grad_norm": 0.059341772904328634, + "language_loss": 0.81557322, + "learning_rate": 0.000407999146236307, + "loss": 0.82641, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.29492188, + "step": 2974, + "time_per_iteration": 2.5506579875946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087372, + "balance_loss_mlp": 1.05807054, + "epoch": 0.5723355136590996, + "flos": 539255757312.0, + "grad_norm": 0.05823834072467256, + "language_loss": 0.8320694, + "learning_rate": 0.0004076929411022634, + "loss": 0.84294319, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.29248047, + "step": 2975, + "time_per_iteration": 2.6159043312072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081032, + "balance_loss_mlp": 1.05125356, + "epoch": 0.5725278953443632, + "flos": 823784864256.0, + "grad_norm": 0.059359253337435705, + "language_loss": 0.79102635, + "learning_rate": 0.0004073867718049982, + "loss": 0.80183673, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.29736328, + "step": 2976, + "time_per_iteration": 3.104320526123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087781, + "balance_loss_mlp": 1.05745435, + "epoch": 0.5727202770296268, + "flos": 587155226112.0, + "grad_norm": 0.06002278348442279, + "language_loss": 0.82387239, + "learning_rate": 0.00040708063846337704, + "loss": 0.83475018, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.30273438, + "step": 2977, + "time_per_iteration": 2.7141377925872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088437, + "balance_loss_mlp": 1.05906403, + "epoch": 0.5729126587148904, + "flos": 446723195904.0, + "grad_norm": 0.05629415234265891, + "language_loss": 0.81140733, + "learning_rate": 0.00040677454119625143, + "loss": 0.82229173, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.29321289, + "step": 2978, + "time_per_iteration": 2.5579118728637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079501, + "balance_loss_mlp": 1.04967451, + "epoch": 0.5731050404001539, + "flos": 519206091264.0, + "grad_norm": 0.06287623577372331, + "language_loss": 0.82978582, + "learning_rate": 0.0004064684801224587, + "loss": 0.84058082, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.2980957, + "step": 2979, + "time_per_iteration": 2.6184630393981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080607, + "balance_loss_mlp": 1.05047131, + "epoch": 0.5732974220854175, + "flos": 504528224256.0, + "grad_norm": 0.049858532305801305, + "language_loss": 0.80364764, + "learning_rate": 0.00040616245536082224, + "loss": 0.81445372, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.30078125, + "step": 2980, + "time_per_iteration": 2.605652093887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075472, + "balance_loss_mlp": 1.04602742, + "epoch": 0.573489803770681, + "flos": 592187991552.0, + "grad_norm": 0.05649585275193457, + "language_loss": 0.81399214, + "learning_rate": 0.00040585646703015165, + "loss": 0.82474685, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.29418945, + "step": 2981, + "time_per_iteration": 2.8440651893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081482, + "balance_loss_mlp": 1.05103636, + "epoch": 0.5736821854559446, + "flos": 489672857088.0, + "grad_norm": 0.0633133856450646, + "language_loss": 0.78068441, + "learning_rate": 0.0004055505152492419, + "loss": 0.79149926, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.30419922, + "step": 2982, + "time_per_iteration": 2.7125117778778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076312, + "balance_loss_mlp": 1.0467, + "epoch": 0.5738745671412081, + "flos": 457895987712.0, + "grad_norm": 0.057765721767923175, + "language_loss": 0.74208528, + "learning_rate": 0.00040524460013687425, + "loss": 0.75284839, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.29589844, + "step": 2983, + "time_per_iteration": 2.7232775688171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080701, + "balance_loss_mlp": 1.05151832, + "epoch": 0.5740669488264717, + "flos": 580012655616.0, + "grad_norm": 0.049591997410844156, + "language_loss": 0.81157619, + "learning_rate": 0.0004049387218118155, + "loss": 0.82238322, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.29199219, + "step": 2984, + "time_per_iteration": 2.956636428833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080147, + "balance_loss_mlp": 1.04934323, + "epoch": 0.5742593305117353, + "flos": 524155898880.0, + "grad_norm": 0.06847869877575175, + "language_loss": 0.84987867, + "learning_rate": 0.00040463288039281777, + "loss": 0.8606801, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.30761719, + "step": 2985, + "time_per_iteration": 2.7503554821014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013561, + "balance_loss_mlp": 1.00078201, + "epoch": 0.5744517121969989, + "flos": 1553033488896.0, + "grad_norm": 0.012095267017415088, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78889978, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.12792969, + "step": 2986, + "time_per_iteration": 5.030332565307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079255, + "balance_loss_mlp": 1.04981041, + "epoch": 0.5746440938822625, + "flos": 751600304640.0, + "grad_norm": 0.055809040190366505, + "language_loss": 0.82136881, + "learning_rate": 0.0004040213087479444, + "loss": 0.83216131, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.29443359, + "step": 2987, + "time_per_iteration": 2.926941156387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087088, + "balance_loss_mlp": 1.05816782, + "epoch": 0.5748364755675259, + "flos": 501618258432.0, + "grad_norm": 0.06868722002267488, + "language_loss": 0.85331053, + "learning_rate": 0.0004037155787595018, + "loss": 0.8641814, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.28857422, + "step": 2988, + "time_per_iteration": 2.561497211456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085606, + "balance_loss_mlp": 1.05599451, + "epoch": 0.5750288572527895, + "flos": 503757024768.0, + "grad_norm": 0.05119655910511677, + "language_loss": 0.80321741, + "learning_rate": 0.000403409886151987, + "loss": 0.81407344, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.29589844, + "step": 2989, + "time_per_iteration": 2.9114019870758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013296, + "balance_loss_mlp": 1.00061202, + "epoch": 0.5752212389380531, + "flos": 1540541030400.0, + "grad_norm": 0.008836939301122537, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83012402, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.12695312, + "step": 2990, + "time_per_iteration": 4.770756483078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013357, + "balance_loss_mlp": 1.00086439, + "epoch": 0.5754136206233167, + "flos": 1566499378176.0, + "grad_norm": 0.007697309180098509, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79211962, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.125, + "step": 2991, + "time_per_iteration": 4.786288499832153 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083775, + "balance_loss_mlp": 1.05537939, + "epoch": 0.5756060023085803, + "flos": 797806167552.0, + "grad_norm": 0.05348004588160335, + "language_loss": 0.76926208, + "learning_rate": 0.00040249303380173807, + "loss": 0.78009981, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.28369141, + "step": 2992, + "time_per_iteration": 3.0660438537597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010851, + "balance_loss_mlp": 1.05629849, + "epoch": 0.5757983839938438, + "flos": 587588802048.0, + "grad_norm": 0.06048493616630367, + "language_loss": 0.79311389, + "learning_rate": 0.00040218749190459126, + "loss": 0.80396485, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.28808594, + "step": 2993, + "time_per_iteration": 2.7251527309417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084541, + "balance_loss_mlp": 1.05514371, + "epoch": 0.5759907656791073, + "flos": 516576932352.0, + "grad_norm": 0.0697186971943442, + "language_loss": 0.82477212, + "learning_rate": 0.00040188198798162775, + "loss": 0.83561754, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.29370117, + "step": 2994, + "time_per_iteration": 2.6159136295318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081131, + "balance_loss_mlp": 1.05147123, + "epoch": 0.5761831473643709, + "flos": 586845305856.0, + "grad_norm": 0.057556686362034246, + "language_loss": 0.85848254, + "learning_rate": 0.000401576522151455, + "loss": 0.86929381, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.29614258, + "step": 2995, + "time_per_iteration": 2.811438798904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108636, + "balance_loss_mlp": 1.05775023, + "epoch": 0.5763755290496345, + "flos": 543619219968.0, + "grad_norm": 0.04540215088386673, + "language_loss": 0.82446247, + "learning_rate": 0.0004012710945326651, + "loss": 0.83532608, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.28613281, + "step": 2996, + "time_per_iteration": 2.778818368911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108663, + "balance_loss_mlp": 1.05790055, + "epoch": 0.576567910734898, + "flos": 625930509312.0, + "grad_norm": 0.049519109180824444, + "language_loss": 0.81129038, + "learning_rate": 0.0004009657052438355, + "loss": 0.82215673, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.28686523, + "step": 2997, + "time_per_iteration": 2.8787920475006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094954, + "balance_loss_mlp": 1.06612968, + "epoch": 0.5767602924201616, + "flos": 537985552896.0, + "grad_norm": 0.05906428447956742, + "language_loss": 0.85482752, + "learning_rate": 0.00040066035440352904, + "loss": 0.86577708, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.2878418, + "step": 2998, + "time_per_iteration": 2.634565591812134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045759, + "balance_loss_mlp": 1.03379035, + "epoch": 0.5769526741054252, + "flos": 1558969873920.0, + "grad_norm": 0.021537766013807906, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80338895, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.11962891, + "step": 2999, + "time_per_iteration": 4.964475393295288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090784, + "balance_loss_mlp": 1.06248331, + "epoch": 0.5771450557906888, + "flos": 467939759616.0, + "grad_norm": 0.06837432109358414, + "language_loss": 0.75964624, + "learning_rate": 0.00040004976854266145, + "loss": 0.77055407, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.28295898, + "step": 3000, + "time_per_iteration": 2.5489282608032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089485, + "balance_loss_mlp": 1.06006408, + "epoch": 0.5773374374759523, + "flos": 574288828416.0, + "grad_norm": 0.0545980885089623, + "language_loss": 0.81222647, + "learning_rate": 0.0003997445337591505, + "loss": 0.82312131, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.29370117, + "step": 3001, + "time_per_iteration": 2.6890947818756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108546, + "balance_loss_mlp": 1.05680251, + "epoch": 0.5775298191612158, + "flos": 528216795648.0, + "grad_norm": 0.06583721131765849, + "language_loss": 0.74093473, + "learning_rate": 0.0003994393378982635, + "loss": 0.75178933, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.28662109, + "step": 3002, + "time_per_iteration": 2.596644401550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043122, + "balance_loss_mlp": 1.03153443, + "epoch": 0.5777222008464794, + "flos": 1303178070528.0, + "grad_norm": 0.017943105040569007, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80581129, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.11572266, + "step": 3003, + "time_per_iteration": 4.826138257980347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085564, + "balance_loss_mlp": 1.05666792, + "epoch": 0.577914582531743, + "flos": 603344816640.0, + "grad_norm": 0.058273014851323426, + "language_loss": 0.87901747, + "learning_rate": 0.0003988290634182961, + "loss": 0.88987309, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.28881836, + "step": 3004, + "time_per_iteration": 2.7604172229766846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088001, + "balance_loss_mlp": 1.06015372, + "epoch": 0.5781069642170066, + "flos": 486537338880.0, + "grad_norm": 0.06327449394997672, + "language_loss": 0.80677181, + "learning_rate": 0.0003985239850361453, + "loss": 0.81765187, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.27856445, + "step": 3005, + "time_per_iteration": 2.5994105339050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091534, + "balance_loss_mlp": 1.06256592, + "epoch": 0.5782993459022701, + "flos": 506016626688.0, + "grad_norm": 0.057065414052448256, + "language_loss": 0.84621793, + "learning_rate": 0.0003982189460504777, + "loss": 0.85713327, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.28930664, + "step": 3006, + "time_per_iteration": 2.722778797149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091061, + "balance_loss_mlp": 1.06261778, + "epoch": 0.5784917275875336, + "flos": 601872380928.0, + "grad_norm": 0.0654169545720973, + "language_loss": 0.79183024, + "learning_rate": 0.00039791394657971935, + "loss": 0.80274087, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.28442383, + "step": 3007, + "time_per_iteration": 2.7318689823150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089506, + "balance_loss_mlp": 1.06056237, + "epoch": 0.5786841092727972, + "flos": 521279428608.0, + "grad_norm": 0.06429658550493057, + "language_loss": 0.84402883, + "learning_rate": 0.00039760898674228205, + "loss": 0.85492396, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.28930664, + "step": 3008, + "time_per_iteration": 2.6548941135406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087482, + "balance_loss_mlp": 1.05884826, + "epoch": 0.5788764909580608, + "flos": 767047809024.0, + "grad_norm": 0.0525681924040606, + "language_loss": 0.80782068, + "learning_rate": 0.0003973040666565613, + "loss": 0.81869543, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.28588867, + "step": 3009, + "time_per_iteration": 3.065049171447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087663, + "balance_loss_mlp": 1.05972004, + "epoch": 0.5790688726433244, + "flos": 598786324992.0, + "grad_norm": 0.058928126410829465, + "language_loss": 0.81879556, + "learning_rate": 0.000396999186440938, + "loss": 0.82967222, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.27954102, + "step": 3010, + "time_per_iteration": 2.860755205154419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086781, + "balance_loss_mlp": 1.05871928, + "epoch": 0.5792612543285879, + "flos": 522805708800.0, + "grad_norm": 0.06775550082118927, + "language_loss": 0.84739363, + "learning_rate": 0.000396694346213777, + "loss": 0.85826147, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.28076172, + "step": 3011, + "time_per_iteration": 2.591801643371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077556, + "balance_loss_mlp": 1.04815888, + "epoch": 0.5794536360138515, + "flos": 876178805760.0, + "grad_norm": 0.09075774540794283, + "language_loss": 0.83682388, + "learning_rate": 0.0003963895460934276, + "loss": 0.84759945, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.29370117, + "step": 3012, + "time_per_iteration": 3.1549274921417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080769, + "balance_loss_mlp": 1.05242133, + "epoch": 0.5796460176991151, + "flos": 401221541376.0, + "grad_norm": 0.07824771870324425, + "language_loss": 0.85031927, + "learning_rate": 0.00039608478619822376, + "loss": 0.86112702, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.28344727, + "step": 3013, + "time_per_iteration": 2.436859369277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108003, + "balance_loss_mlp": 1.05091906, + "epoch": 0.5798383993843786, + "flos": 618229297152.0, + "grad_norm": 0.07454312954276684, + "language_loss": 0.82720006, + "learning_rate": 0.00039578006664648394, + "loss": 0.83800036, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.29125977, + "step": 3014, + "time_per_iteration": 2.813934326171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082643, + "balance_loss_mlp": 1.05350864, + "epoch": 0.5800307810696421, + "flos": 843966950400.0, + "grad_norm": 0.07429538018047967, + "language_loss": 0.81169355, + "learning_rate": 0.0003954753875565105, + "loss": 0.82251996, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.29101562, + "step": 3015, + "time_per_iteration": 3.089141607284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075928, + "balance_loss_mlp": 1.04674578, + "epoch": 0.5802231627549057, + "flos": 569005779456.0, + "grad_norm": 0.053240000714227444, + "language_loss": 0.8237859, + "learning_rate": 0.00039517074904659057, + "loss": 0.8345452, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.29125977, + "step": 3016, + "time_per_iteration": 2.7315711975097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081141, + "balance_loss_mlp": 1.05217314, + "epoch": 0.5804155444401693, + "flos": 660160447488.0, + "grad_norm": 0.0618256833307492, + "language_loss": 0.84621388, + "learning_rate": 0.00039486615123499535, + "loss": 0.85702527, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.28955078, + "step": 3017, + "time_per_iteration": 2.870152235031128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082579, + "balance_loss_mlp": 1.05342066, + "epoch": 0.5806079261254329, + "flos": 513726603264.0, + "grad_norm": 0.06092979313789558, + "language_loss": 0.85065556, + "learning_rate": 0.00039456159423997996, + "loss": 0.86148143, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.29125977, + "step": 3018, + "time_per_iteration": 2.6494932174682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078264, + "balance_loss_mlp": 1.04867649, + "epoch": 0.5808003078106965, + "flos": 528379739136.0, + "grad_norm": 0.05170574080230249, + "language_loss": 0.89520943, + "learning_rate": 0.00039425707817978406, + "loss": 0.90599209, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.29541016, + "step": 3019, + "time_per_iteration": 2.690485715866089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078627, + "balance_loss_mlp": 1.04894376, + "epoch": 0.58099268949596, + "flos": 476787520512.0, + "grad_norm": 0.06031161665678942, + "language_loss": 0.83372945, + "learning_rate": 0.00039395260317263124, + "loss": 0.84451568, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.29663086, + "step": 3020, + "time_per_iteration": 2.677818775177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076598, + "balance_loss_mlp": 1.0466764, + "epoch": 0.5811850711812235, + "flos": 517340777472.0, + "grad_norm": 0.056782275650517425, + "language_loss": 0.84907949, + "learning_rate": 0.0003936481693367291, + "loss": 0.8598454, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.29882812, + "step": 3021, + "time_per_iteration": 2.647017002105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084791, + "balance_loss_mlp": 1.05491698, + "epoch": 0.5813774528664871, + "flos": 616122464256.0, + "grad_norm": 0.06733027879749674, + "language_loss": 0.87502337, + "learning_rate": 0.0003933437767902697, + "loss": 0.88587123, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.29833984, + "step": 3022, + "time_per_iteration": 2.825965166091919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085273, + "balance_loss_mlp": 1.05706787, + "epoch": 0.5815698345517507, + "flos": 567194310144.0, + "grad_norm": 0.07318564796931465, + "language_loss": 0.78165317, + "learning_rate": 0.00039303942565142825, + "loss": 0.79250592, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.28222656, + "step": 3023, + "time_per_iteration": 2.7315845489501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087422, + "balance_loss_mlp": 1.0569042, + "epoch": 0.5817622162370142, + "flos": 562886102016.0, + "grad_norm": 0.052544940996134284, + "language_loss": 0.76741624, + "learning_rate": 0.0003927351160383644, + "loss": 0.77829051, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.3046875, + "step": 3024, + "time_per_iteration": 2.789477825164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085705, + "balance_loss_mlp": 1.05609322, + "epoch": 0.5819545979222778, + "flos": 458982899712.0, + "grad_norm": 0.07634686348045291, + "language_loss": 0.77796662, + "learning_rate": 0.000392430848069222, + "loss": 0.78882366, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.29589844, + "step": 3025, + "time_per_iteration": 2.5446279048919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085632, + "balance_loss_mlp": 1.05549598, + "epoch": 0.5821469796075414, + "flos": 541215613440.0, + "grad_norm": 0.05528071963535831, + "language_loss": 0.82223105, + "learning_rate": 0.00039212662186212795, + "loss": 0.83308738, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.30078125, + "step": 3026, + "time_per_iteration": 2.60878849029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079586, + "balance_loss_mlp": 1.04883003, + "epoch": 0.582339361292805, + "flos": 551994117120.0, + "grad_norm": 0.05052748911564131, + "language_loss": 0.76906562, + "learning_rate": 0.0003918224375351934, + "loss": 0.77986145, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.30737305, + "step": 3027, + "time_per_iteration": 2.709887742996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083313, + "balance_loss_mlp": 1.05384469, + "epoch": 0.5825317429780685, + "flos": 496138770432.0, + "grad_norm": 0.05874903473435042, + "language_loss": 0.78473544, + "learning_rate": 0.0003915182952065135, + "loss": 0.79556859, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.29418945, + "step": 3028, + "time_per_iteration": 2.6885859966278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082208, + "balance_loss_mlp": 1.05250072, + "epoch": 0.582724124663332, + "flos": 563890056192.0, + "grad_norm": 0.06824855227929012, + "language_loss": 0.8751812, + "learning_rate": 0.0003912141949941664, + "loss": 0.88600326, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.296875, + "step": 3029, + "time_per_iteration": 2.7145774364471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087726, + "balance_loss_mlp": 1.05799532, + "epoch": 0.5829165063485956, + "flos": 491888788992.0, + "grad_norm": 0.07682913079591057, + "language_loss": 0.82808822, + "learning_rate": 0.0003909101370162143, + "loss": 0.83896548, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.29711914, + "step": 3030, + "time_per_iteration": 2.6085238456726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063086, + "balance_loss_mlp": 1.05116475, + "epoch": 0.5831088880338592, + "flos": 1528134501888.0, + "grad_norm": 0.03433679117263603, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73496974, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.11914062, + "step": 3031, + "time_per_iteration": 4.894438028335571 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080641, + "balance_loss_mlp": 1.05076766, + "epoch": 0.5833012697191228, + "flos": 617712763392.0, + "grad_norm": 0.0542485247275347, + "language_loss": 0.8270607, + "learning_rate": 0.0003903021482356622, + "loss": 0.83786714, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.29833984, + "step": 3032, + "time_per_iteration": 2.8060503005981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079071, + "balance_loss_mlp": 1.04924476, + "epoch": 0.5834936514043862, + "flos": 767578899456.0, + "grad_norm": 0.06913224268253564, + "language_loss": 0.8243112, + "learning_rate": 0.00038999821766910465, + "loss": 0.8351019, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.2980957, + "step": 3033, + "time_per_iteration": 3.013117551803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079849, + "balance_loss_mlp": 1.04992783, + "epoch": 0.5836860330896498, + "flos": 458136096768.0, + "grad_norm": 0.06539568057172108, + "language_loss": 0.85596031, + "learning_rate": 0.00038969432980902606, + "loss": 0.86675882, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.29907227, + "step": 3034, + "time_per_iteration": 2.602159261703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047537, + "balance_loss_mlp": 1.03642654, + "epoch": 0.5838784147749134, + "flos": 1360485504000.0, + "grad_norm": 0.02505289654727371, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.8083204, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.11132812, + "step": 3035, + "time_per_iteration": 4.8551225662231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086213, + "balance_loss_mlp": 1.05664897, + "epoch": 0.584070796460177, + "flos": 566942616576.0, + "grad_norm": 0.05971096981290547, + "language_loss": 0.82545829, + "learning_rate": 0.00038908668268020953, + "loss": 0.8363204, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.29516602, + "step": 3036, + "time_per_iteration": 2.6712634563446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084003, + "balance_loss_mlp": 1.05455875, + "epoch": 0.5842631781454406, + "flos": 611188623360.0, + "grad_norm": 0.06020630991976339, + "language_loss": 0.84750116, + "learning_rate": 0.00038878292364738097, + "loss": 0.85834116, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.29418945, + "step": 3037, + "time_per_iteration": 2.774688959121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087202, + "balance_loss_mlp": 1.05785298, + "epoch": 0.5844555598307041, + "flos": 463148513280.0, + "grad_norm": 0.06330434972052289, + "language_loss": 0.87235534, + "learning_rate": 0.0003884792077928508, + "loss": 0.88322735, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.29345703, + "step": 3038, + "time_per_iteration": 2.511212110519409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088952, + "balance_loss_mlp": 1.05957842, + "epoch": 0.5846479415159677, + "flos": 410005283328.0, + "grad_norm": 0.089824175631678, + "language_loss": 0.76556516, + "learning_rate": 0.0003881755352345322, + "loss": 0.77645469, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.29345703, + "step": 3039, + "time_per_iteration": 2.5297422409057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108977, + "balance_loss_mlp": 1.06039691, + "epoch": 0.5848403232012312, + "flos": 491056542720.0, + "grad_norm": 0.05409760120739159, + "language_loss": 0.8652333, + "learning_rate": 0.0003878719060903207, + "loss": 0.87613106, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.29345703, + "step": 3040, + "time_per_iteration": 2.5606369972229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108437, + "balance_loss_mlp": 1.05447245, + "epoch": 0.5850327048864948, + "flos": 584146335744.0, + "grad_norm": 0.07864155094531469, + "language_loss": 0.83092105, + "learning_rate": 0.0003875683204780961, + "loss": 0.84176469, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.29833984, + "step": 3041, + "time_per_iteration": 2.7069876194000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091497, + "balance_loss_mlp": 1.06128943, + "epoch": 0.5852250865717584, + "flos": 651253049856.0, + "grad_norm": 0.07084084705837652, + "language_loss": 0.85393965, + "learning_rate": 0.00038726477851572043, + "loss": 0.86485463, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.30175781, + "step": 3042, + "time_per_iteration": 2.785623788833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086169, + "balance_loss_mlp": 1.0566287, + "epoch": 0.5854174682570219, + "flos": 534332090880.0, + "grad_norm": 0.06883779110535396, + "language_loss": 0.80354905, + "learning_rate": 0.0003869612803210395, + "loss": 0.81441069, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.29541016, + "step": 3043, + "time_per_iteration": 2.635880708694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075998, + "balance_loss_mlp": 1.04643369, + "epoch": 0.5856098499422855, + "flos": 509501352960.0, + "grad_norm": 0.0705585022393511, + "language_loss": 0.83492166, + "learning_rate": 0.0003866578260118817, + "loss": 0.84568161, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.29541016, + "step": 3044, + "time_per_iteration": 2.58337664604187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074571, + "balance_loss_mlp": 1.04491138, + "epoch": 0.5858022316275491, + "flos": 593619729408.0, + "grad_norm": 0.06598081480709424, + "language_loss": 0.83220106, + "learning_rate": 0.0003863544157060581, + "loss": 0.84294677, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.29614258, + "step": 3045, + "time_per_iteration": 2.66916561126709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079474, + "balance_loss_mlp": 1.04998136, + "epoch": 0.5859946133128127, + "flos": 558829587456.0, + "grad_norm": 0.05207738102195899, + "language_loss": 0.82137144, + "learning_rate": 0.0003860510495213634, + "loss": 0.83216619, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.29492188, + "step": 3046, + "time_per_iteration": 2.8170437812805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072174, + "balance_loss_mlp": 1.04256272, + "epoch": 0.5861869949980761, + "flos": 553431647232.0, + "grad_norm": 0.07713217072038757, + "language_loss": 0.78373164, + "learning_rate": 0.0003857477275755746, + "loss": 0.79445338, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.29589844, + "step": 3047, + "time_per_iteration": 2.639801502227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077446, + "balance_loss_mlp": 1.04678559, + "epoch": 0.5863793766833397, + "flos": 718321886208.0, + "grad_norm": 0.05564403415338841, + "language_loss": 0.84011877, + "learning_rate": 0.00038544444998645167, + "loss": 0.8508932, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.30639648, + "step": 3048, + "time_per_iteration": 3.007289409637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076921, + "balance_loss_mlp": 1.04754782, + "epoch": 0.5865717583686033, + "flos": 472041354240.0, + "grad_norm": 0.06801965614795764, + "language_loss": 0.81586641, + "learning_rate": 0.00038514121687173767, + "loss": 0.8266356, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.29345703, + "step": 3049, + "time_per_iteration": 2.637277603149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072965, + "balance_loss_mlp": 1.04397368, + "epoch": 0.5867641400538669, + "flos": 813143162880.0, + "grad_norm": 0.0576990751755922, + "language_loss": 0.81892288, + "learning_rate": 0.00038483802834915807, + "loss": 0.82965243, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.28979492, + "step": 3050, + "time_per_iteration": 2.975592613220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075399, + "balance_loss_mlp": 1.04607356, + "epoch": 0.5869565217391305, + "flos": 486285645312.0, + "grad_norm": 0.09338183491699942, + "language_loss": 0.78599441, + "learning_rate": 0.00038453488453642074, + "loss": 0.79674846, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.29296875, + "step": 3051, + "time_per_iteration": 2.668680429458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107507, + "balance_loss_mlp": 1.04581618, + "epoch": 0.587148903424394, + "flos": 569104704000.0, + "grad_norm": 0.18186948375192843, + "language_loss": 0.86825669, + "learning_rate": 0.00038423178555121697, + "loss": 0.87900746, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.29223633, + "step": 3052, + "time_per_iteration": 2.7119386196136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080518, + "balance_loss_mlp": 1.05202711, + "epoch": 0.5873412851096576, + "flos": 746948680704.0, + "grad_norm": 0.05190046933032045, + "language_loss": 0.85228276, + "learning_rate": 0.00038392873151121994, + "loss": 0.86308795, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.28466797, + "step": 3053, + "time_per_iteration": 3.0532052516937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075316, + "balance_loss_mlp": 1.04615784, + "epoch": 0.5875336667949211, + "flos": 527882144256.0, + "grad_norm": 0.06073215036153007, + "language_loss": 0.830441, + "learning_rate": 0.0003836257225340859, + "loss": 0.84119415, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.29125977, + "step": 3054, + "time_per_iteration": 2.6791739463806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077784, + "balance_loss_mlp": 1.04922152, + "epoch": 0.5877260484801847, + "flos": 823799420928.0, + "grad_norm": 0.053654559033963406, + "language_loss": 0.82283098, + "learning_rate": 0.00038332275873745336, + "loss": 0.83360887, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.28564453, + "step": 3055, + "time_per_iteration": 3.0826737880706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085261, + "balance_loss_mlp": 1.05646038, + "epoch": 0.5879184301654482, + "flos": 591325221888.0, + "grad_norm": 0.07874067829632751, + "language_loss": 0.82649648, + "learning_rate": 0.0003830198402389431, + "loss": 0.83734912, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.28759766, + "step": 3056, + "time_per_iteration": 2.71244215965271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080719, + "balance_loss_mlp": 1.06755841, + "epoch": 0.5881108118507118, + "flos": 1544953955328.0, + "grad_norm": 0.03508304466376378, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78429663, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.13183594, + "step": 3057, + "time_per_iteration": 4.991718053817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087181, + "balance_loss_mlp": 1.05900002, + "epoch": 0.5883031935359754, + "flos": 489348380160.0, + "grad_norm": 0.0604575145753954, + "language_loss": 0.83162987, + "learning_rate": 0.0003824141396066855, + "loss": 0.84250164, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.28198242, + "step": 3058, + "time_per_iteration": 2.62410044670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095213, + "balance_loss_mlp": 1.06605411, + "epoch": 0.588495575221239, + "flos": 582551654400.0, + "grad_norm": 0.05748148757470156, + "language_loss": 0.83195531, + "learning_rate": 0.000382111357708092, + "loss": 0.84290743, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.29125977, + "step": 3059, + "time_per_iteration": 2.741142511367798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099933, + "balance_loss_mlp": 1.07113242, + "epoch": 0.5886879569065026, + "flos": 660751174656.0, + "grad_norm": 0.07210182052791281, + "language_loss": 0.83736324, + "learning_rate": 0.00038180862157792864, + "loss": 0.84836257, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.28808594, + "step": 3060, + "time_per_iteration": 2.8028531074523926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095663, + "balance_loss_mlp": 1.06733847, + "epoch": 0.588880338591766, + "flos": 562392889344.0, + "grad_norm": 0.06185538750618477, + "language_loss": 0.82032192, + "learning_rate": 0.0003815059313337279, + "loss": 0.83127856, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.28295898, + "step": 3061, + "time_per_iteration": 2.661663055419922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092581, + "balance_loss_mlp": 1.0641377, + "epoch": 0.5890727202770296, + "flos": 554451568128.0, + "grad_norm": 0.054152956568787894, + "language_loss": 0.78217703, + "learning_rate": 0.00038120328709300436, + "loss": 0.7931028, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.28466797, + "step": 3062, + "time_per_iteration": 2.8524019718170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110051, + "balance_loss_mlp": 1.0717572, + "epoch": 0.5892651019622932, + "flos": 655226606592.0, + "grad_norm": 0.07045144115382113, + "language_loss": 0.83619386, + "learning_rate": 0.0003809006889732549, + "loss": 0.84719896, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.28759766, + "step": 3063, + "time_per_iteration": 2.818297863006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093698, + "balance_loss_mlp": 1.06554079, + "epoch": 0.5894574836475568, + "flos": 452970911232.0, + "grad_norm": 0.07166208719676233, + "language_loss": 0.87752122, + "learning_rate": 0.0003805981370919589, + "loss": 0.88845825, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.28173828, + "step": 3064, + "time_per_iteration": 2.523397445678711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091821, + "balance_loss_mlp": 1.06352103, + "epoch": 0.5896498653328203, + "flos": 518763750912.0, + "grad_norm": 0.052273370645306905, + "language_loss": 0.83554685, + "learning_rate": 0.0003802956315665771, + "loss": 0.84646511, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.28320312, + "step": 3065, + "time_per_iteration": 2.7017621994018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091683, + "balance_loss_mlp": 1.06428885, + "epoch": 0.5898422470180839, + "flos": 548793169920.0, + "grad_norm": 0.09115739101573021, + "language_loss": 0.81856883, + "learning_rate": 0.0003799931725145529, + "loss": 0.82948571, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.27416992, + "step": 3066, + "time_per_iteration": 2.6396725177764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091771, + "balance_loss_mlp": 1.0635426, + "epoch": 0.5900346287033474, + "flos": 524046799872.0, + "grad_norm": 0.061744960378181175, + "language_loss": 0.85826695, + "learning_rate": 0.00037969076005331083, + "loss": 0.86918467, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.28271484, + "step": 3067, + "time_per_iteration": 2.7665817737579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088445, + "balance_loss_mlp": 1.05947697, + "epoch": 0.590227010388611, + "flos": 566893154304.0, + "grad_norm": 0.062191843713449865, + "language_loss": 0.87458771, + "learning_rate": 0.00037938839430025817, + "loss": 0.88547218, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.28930664, + "step": 3068, + "time_per_iteration": 2.645289897918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080639, + "balance_loss_mlp": 1.0527916, + "epoch": 0.5904193920738746, + "flos": 583053631488.0, + "grad_norm": 0.07692636502028646, + "language_loss": 0.85409123, + "learning_rate": 0.0003790860753727835, + "loss": 0.86489761, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.27856445, + "step": 3069, + "time_per_iteration": 2.831932544708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087722, + "balance_loss_mlp": 1.05966043, + "epoch": 0.5906117737591381, + "flos": 529428773376.0, + "grad_norm": 0.05698566021180351, + "language_loss": 0.82950222, + "learning_rate": 0.00037878380338825766, + "loss": 0.84037948, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.28076172, + "step": 3070, + "time_per_iteration": 2.6856610774993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094092, + "balance_loss_mlp": 1.06655455, + "epoch": 0.5908041554444017, + "flos": 683908655616.0, + "grad_norm": 0.05699607440456078, + "language_loss": 0.81377411, + "learning_rate": 0.00037848157846403287, + "loss": 0.82471496, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.27539062, + "step": 3071, + "time_per_iteration": 2.9222235679626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090999, + "balance_loss_mlp": 1.06291366, + "epoch": 0.5909965371296653, + "flos": 549719958528.0, + "grad_norm": 0.04993960868235579, + "language_loss": 0.8303259, + "learning_rate": 0.0003781794007174435, + "loss": 0.84123588, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.28076172, + "step": 3072, + "time_per_iteration": 2.8049426078796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047655, + "balance_loss_mlp": 1.03702164, + "epoch": 0.5911889188149289, + "flos": 1491544475136.0, + "grad_norm": 0.02139881306535856, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.7512219, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.10644531, + "step": 3073, + "time_per_iteration": 4.860798597335815 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086537, + "balance_loss_mlp": 1.05854619, + "epoch": 0.5913813005001923, + "flos": 487630043136.0, + "grad_norm": 0.0539637393269004, + "language_loss": 0.81219113, + "learning_rate": 0.0003775751872264152, + "loss": 0.8230564, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.28027344, + "step": 3074, + "time_per_iteration": 2.7820684909820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081353, + "balance_loss_mlp": 1.05267119, + "epoch": 0.5915736821854559, + "flos": 573034590720.0, + "grad_norm": 0.057314841017187666, + "language_loss": 0.87226552, + "learning_rate": 0.0003772731517165527, + "loss": 0.88307905, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.28686523, + "step": 3075, + "time_per_iteration": 2.8264849185943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082687, + "balance_loss_mlp": 1.05383801, + "epoch": 0.5917660638707195, + "flos": 789183959040.0, + "grad_norm": 0.06214529816255618, + "language_loss": 0.83813703, + "learning_rate": 0.0003769711638534784, + "loss": 0.84896386, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.28857422, + "step": 3076, + "time_per_iteration": 2.9739084243774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107611, + "balance_loss_mlp": 1.04769087, + "epoch": 0.5919584455559831, + "flos": 528487428096.0, + "grad_norm": 0.06330128127303343, + "language_loss": 0.78904676, + "learning_rate": 0.00037666922375443446, + "loss": 0.79980791, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.28417969, + "step": 3077, + "time_per_iteration": 2.611528158187866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076959, + "balance_loss_mlp": 1.04815805, + "epoch": 0.5921508272412467, + "flos": 560320962048.0, + "grad_norm": 0.0824489675783013, + "language_loss": 0.81633419, + "learning_rate": 0.00037636733153664396, + "loss": 0.82710373, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.2878418, + "step": 3078, + "time_per_iteration": 2.830021619796753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074589, + "balance_loss_mlp": 1.04547811, + "epoch": 0.5923432089265102, + "flos": 563008347648.0, + "grad_norm": 0.07220859459639119, + "language_loss": 0.79744393, + "learning_rate": 0.0003760654873173124, + "loss": 0.80818975, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.29077148, + "step": 3079, + "time_per_iteration": 2.7072205543518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069705, + "balance_loss_mlp": 1.04047441, + "epoch": 0.5925355906117737, + "flos": 495488406528.0, + "grad_norm": 0.0611483797885387, + "language_loss": 0.81661952, + "learning_rate": 0.00037576369121362566, + "loss": 0.82731652, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.29174805, + "step": 3080, + "time_per_iteration": 2.6135458946228027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073309, + "balance_loss_mlp": 1.0437448, + "epoch": 0.5927279722970373, + "flos": 565940072448.0, + "grad_norm": 0.05261928263256693, + "language_loss": 0.81494981, + "learning_rate": 0.0003754619433427516, + "loss": 0.82568288, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.29516602, + "step": 3081, + "time_per_iteration": 2.935394763946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074585, + "balance_loss_mlp": 1.04502153, + "epoch": 0.5929203539823009, + "flos": 666674413056.0, + "grad_norm": 0.07109600442573788, + "language_loss": 0.77291781, + "learning_rate": 0.0003751602438218392, + "loss": 0.78366369, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.29516602, + "step": 3082, + "time_per_iteration": 2.762129306793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107369, + "balance_loss_mlp": 1.04410219, + "epoch": 0.5931127356675644, + "flos": 555484635648.0, + "grad_norm": 0.07081310094320947, + "language_loss": 0.83719951, + "learning_rate": 0.0003748585927680186, + "loss": 0.84793639, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.29589844, + "step": 3083, + "time_per_iteration": 2.6607072353363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072302, + "balance_loss_mlp": 1.04126024, + "epoch": 0.593305117352828, + "flos": 534932992512.0, + "grad_norm": 0.09668658910416093, + "language_loss": 0.82859874, + "learning_rate": 0.00037455699029840086, + "loss": 0.83932179, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.31005859, + "step": 3084, + "time_per_iteration": 2.641989231109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069753, + "balance_loss_mlp": 1.04014122, + "epoch": 0.5934974990380916, + "flos": 593683748352.0, + "grad_norm": 0.04958887884439868, + "language_loss": 0.84485245, + "learning_rate": 0.0003742554365300787, + "loss": 0.85554999, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.2956543, + "step": 3085, + "time_per_iteration": 2.8070170879364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074514, + "balance_loss_mlp": 1.0440923, + "epoch": 0.5936898807233552, + "flos": 712339011072.0, + "grad_norm": 0.06324229056117828, + "language_loss": 0.78341657, + "learning_rate": 0.0003739539315801255, + "loss": 0.79416168, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.30371094, + "step": 3086, + "time_per_iteration": 2.937530755996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076236, + "balance_loss_mlp": 1.04571867, + "epoch": 0.5938822624086187, + "flos": 391684128768.0, + "grad_norm": 0.06251001537840323, + "language_loss": 0.91790974, + "learning_rate": 0.000373652475565596, + "loss": 0.92867219, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.3046875, + "step": 3087, + "time_per_iteration": 2.484830856323242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072731, + "balance_loss_mlp": 1.0422616, + "epoch": 0.5940746440938822, + "flos": 480023373312.0, + "grad_norm": 0.06825336960690286, + "language_loss": 0.81144977, + "learning_rate": 0.00037335106860352587, + "loss": 0.82217705, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.3046875, + "step": 3088, + "time_per_iteration": 2.705796003341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079924, + "balance_loss_mlp": 1.04938293, + "epoch": 0.5942670257791458, + "flos": 483094872576.0, + "grad_norm": 0.05943406802659928, + "language_loss": 0.83409536, + "learning_rate": 0.00037304971081093146, + "loss": 0.84489465, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.30517578, + "step": 3089, + "time_per_iteration": 2.5424582958221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080838, + "balance_loss_mlp": 1.05015349, + "epoch": 0.5944594074644094, + "flos": 547656795648.0, + "grad_norm": 0.06149863143832335, + "language_loss": 0.80616403, + "learning_rate": 0.00037274840230481024, + "loss": 0.81697237, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.30664062, + "step": 3090, + "time_per_iteration": 2.7081451416015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073853, + "balance_loss_mlp": 1.04407477, + "epoch": 0.594651789149673, + "flos": 448943510016.0, + "grad_norm": 0.06332669517454644, + "language_loss": 0.79229522, + "learning_rate": 0.00037244714320214077, + "loss": 0.80303377, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.29736328, + "step": 3091, + "time_per_iteration": 2.5389420986175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080775, + "balance_loss_mlp": 1.05082965, + "epoch": 0.5948441708349365, + "flos": 595969491456.0, + "grad_norm": 0.061471299239273844, + "language_loss": 0.83137572, + "learning_rate": 0.000372145933619882, + "loss": 0.84218347, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.29931641, + "step": 3092, + "time_per_iteration": 2.8748533725738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076811, + "balance_loss_mlp": 1.04657912, + "epoch": 0.5950365525202, + "flos": 548251905024.0, + "grad_norm": 0.05871713315937548, + "language_loss": 0.82114685, + "learning_rate": 0.000371844773674974, + "loss": 0.8319149, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.30224609, + "step": 3093, + "time_per_iteration": 2.6465840339660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082816, + "balance_loss_mlp": 1.05346692, + "epoch": 0.5952289342054636, + "flos": 654385595904.0, + "grad_norm": 0.0642067113719601, + "language_loss": 0.81621695, + "learning_rate": 0.0003715436634843375, + "loss": 0.82704508, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.29345703, + "step": 3094, + "time_per_iteration": 2.9084014892578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079615, + "balance_loss_mlp": 1.05007505, + "epoch": 0.5954213158907272, + "flos": 603055245312.0, + "grad_norm": 0.04814703484993394, + "language_loss": 0.80545932, + "learning_rate": 0.00037124260316487355, + "loss": 0.81625545, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.29516602, + "step": 3095, + "time_per_iteration": 2.8632538318634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075577, + "balance_loss_mlp": 1.04727709, + "epoch": 0.5956136975759908, + "flos": 486097970688.0, + "grad_norm": 0.060441576418101065, + "language_loss": 0.89618301, + "learning_rate": 0.0003709415928334643, + "loss": 0.90693879, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.28344727, + "step": 3096, + "time_per_iteration": 2.6276299953460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077506, + "balance_loss_mlp": 1.04813242, + "epoch": 0.5958060792612543, + "flos": 658462459392.0, + "grad_norm": 0.06311167084488892, + "language_loss": 0.80587751, + "learning_rate": 0.00037064063260697233, + "loss": 0.81665254, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.29345703, + "step": 3097, + "time_per_iteration": 2.893503427505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081151, + "balance_loss_mlp": 1.05151534, + "epoch": 0.5959984609465179, + "flos": 723201882624.0, + "grad_norm": 0.06048648768573219, + "language_loss": 0.78276408, + "learning_rate": 0.0003703397226022407, + "loss": 0.79357558, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.2956543, + "step": 3098, + "time_per_iteration": 3.0289156436920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034311, + "balance_loss_mlp": 1.02305758, + "epoch": 0.5961908426317815, + "flos": 1519010164224.0, + "grad_norm": 0.01734603550218104, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76534188, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.11230469, + "step": 3099, + "time_per_iteration": 4.946389436721802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078376, + "balance_loss_mlp": 1.04978967, + "epoch": 0.596383224317045, + "flos": 532357678080.0, + "grad_norm": 0.05865367248717621, + "language_loss": 0.83124352, + "learning_rate": 0.0003697380537253339, + "loss": 0.84202731, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.28564453, + "step": 3100, + "time_per_iteration": 2.674445152282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083272, + "balance_loss_mlp": 1.05492401, + "epoch": 0.5965756060023086, + "flos": 590922169344.0, + "grad_norm": 0.050984632699602635, + "language_loss": 0.81265384, + "learning_rate": 0.0003694372950867471, + "loss": 0.82348651, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.28369141, + "step": 3101, + "time_per_iteration": 2.787538766860962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075715, + "balance_loss_mlp": 1.04772449, + "epoch": 0.5967679876875721, + "flos": 861701760000.0, + "grad_norm": 0.05184746467501943, + "language_loss": 0.77182555, + "learning_rate": 0.0003691365871370976, + "loss": 0.78258264, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.2800293, + "step": 3102, + "time_per_iteration": 3.016934871673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080662, + "balance_loss_mlp": 1.05271935, + "epoch": 0.5969603693728357, + "flos": 553574241792.0, + "grad_norm": 0.06482068820490762, + "language_loss": 0.85340202, + "learning_rate": 0.00036883592999313093, + "loss": 0.8642087, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.27978516, + "step": 3103, + "time_per_iteration": 2.689819812774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079629, + "balance_loss_mlp": 1.05218673, + "epoch": 0.5971527510580993, + "flos": 718345207296.0, + "grad_norm": 0.06496745505902583, + "language_loss": 0.79311585, + "learning_rate": 0.0003685353237715722, + "loss": 0.8039121, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.27490234, + "step": 3104, + "time_per_iteration": 2.87333083152771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083254, + "balance_loss_mlp": 1.05504966, + "epoch": 0.5973451327433629, + "flos": 647324573184.0, + "grad_norm": 0.051730016495621756, + "language_loss": 0.8144263, + "learning_rate": 0.0003682347685891274, + "loss": 0.82525891, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.28222656, + "step": 3105, + "time_per_iteration": 2.888319730758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080866, + "balance_loss_mlp": 1.05228007, + "epoch": 0.5975375144286263, + "flos": 721374446592.0, + "grad_norm": 0.060164631065922125, + "language_loss": 0.80393469, + "learning_rate": 0.0003679342645624822, + "loss": 0.8147434, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.28564453, + "step": 3106, + "time_per_iteration": 3.0317325592041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079501, + "balance_loss_mlp": 1.0513438, + "epoch": 0.5977298961138899, + "flos": 750616699392.0, + "grad_norm": 0.057913897832382336, + "language_loss": 0.81649029, + "learning_rate": 0.0003676338118083025, + "loss": 0.82728529, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.28198242, + "step": 3107, + "time_per_iteration": 2.9762744903564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083565, + "balance_loss_mlp": 1.05519295, + "epoch": 0.5979222777991535, + "flos": 530703360000.0, + "grad_norm": 0.05706871104479872, + "language_loss": 0.79560876, + "learning_rate": 0.0003673334104432347, + "loss": 0.80644441, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.28393555, + "step": 3108, + "time_per_iteration": 2.5976645946502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081192, + "balance_loss_mlp": 1.0530827, + "epoch": 0.5981146594844171, + "flos": 621459357696.0, + "grad_norm": 0.06092677674045173, + "language_loss": 0.83641863, + "learning_rate": 0.0003670330605839048, + "loss": 0.84723055, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.28125, + "step": 3109, + "time_per_iteration": 2.819420337677002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082632, + "balance_loss_mlp": 1.05480886, + "epoch": 0.5983070411696807, + "flos": 603309911040.0, + "grad_norm": 0.0537112811211955, + "language_loss": 0.76695013, + "learning_rate": 0.0003667327623469191, + "loss": 0.77777648, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.27832031, + "step": 3110, + "time_per_iteration": 2.766671657562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085165, + "balance_loss_mlp": 1.05753255, + "epoch": 0.5984994228549442, + "flos": 633187971072.0, + "grad_norm": 0.058546063064310164, + "language_loss": 0.77618361, + "learning_rate": 0.00036643251584886333, + "loss": 0.78703523, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.27661133, + "step": 3111, + "time_per_iteration": 2.789184808731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077786, + "balance_loss_mlp": 1.05105901, + "epoch": 0.5986918045402078, + "flos": 525026022912.0, + "grad_norm": 0.054896589550954444, + "language_loss": 0.81872785, + "learning_rate": 0.00036613232120630393, + "loss": 0.82950568, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.26782227, + "step": 3112, + "time_per_iteration": 2.5881965160369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081611, + "balance_loss_mlp": 1.05362022, + "epoch": 0.5988841862254713, + "flos": 482942103552.0, + "grad_norm": 0.07437964171487202, + "language_loss": 0.80355418, + "learning_rate": 0.00036583217853578643, + "loss": 0.81437027, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.27978516, + "step": 3113, + "time_per_iteration": 2.5409529209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083019, + "balance_loss_mlp": 1.05457568, + "epoch": 0.5990765679107349, + "flos": 1139658674688.0, + "grad_norm": 0.06261379626444472, + "language_loss": 0.77366924, + "learning_rate": 0.000365532087953837, + "loss": 0.78449941, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.28442383, + "step": 3114, + "time_per_iteration": 3.6426267623901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076465, + "balance_loss_mlp": 1.04842734, + "epoch": 0.5992689495959984, + "flos": 516729701376.0, + "grad_norm": 0.08299057980597005, + "language_loss": 0.88937151, + "learning_rate": 0.00036523204957696065, + "loss": 0.90013611, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.28051758, + "step": 3115, + "time_per_iteration": 2.594581365585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083776, + "balance_loss_mlp": 1.05623841, + "epoch": 0.599461331281262, + "flos": 744288998400.0, + "grad_norm": 0.06140193987839019, + "language_loss": 0.80620509, + "learning_rate": 0.00036493206352164324, + "loss": 0.81704283, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.27612305, + "step": 3116, + "time_per_iteration": 2.922367572784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076912, + "balance_loss_mlp": 1.04942214, + "epoch": 0.5996537129665256, + "flos": 592078892544.0, + "grad_norm": 0.05345315057842072, + "language_loss": 0.85505688, + "learning_rate": 0.000364632129904349, + "loss": 0.86582601, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.27514648, + "step": 3117, + "time_per_iteration": 2.765380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077238, + "balance_loss_mlp": 1.04884195, + "epoch": 0.5998460946517892, + "flos": 558735045120.0, + "grad_norm": 0.05997451129778301, + "language_loss": 0.77705157, + "learning_rate": 0.00036433224884152283, + "loss": 0.78782398, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.28393555, + "step": 3118, + "time_per_iteration": 2.714597225189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078485, + "balance_loss_mlp": 1.05032814, + "epoch": 0.6000384763370528, + "flos": 484325789184.0, + "grad_norm": 0.06439508839737945, + "language_loss": 0.77913392, + "learning_rate": 0.00036403242044958875, + "loss": 0.78991878, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.28173828, + "step": 3119, + "time_per_iteration": 2.5515971183776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107346, + "balance_loss_mlp": 1.04563642, + "epoch": 0.6002308580223162, + "flos": 596490407424.0, + "grad_norm": 0.05980235429893482, + "language_loss": 0.91155994, + "learning_rate": 0.0003637326448449507, + "loss": 0.9222945, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.27832031, + "step": 3120, + "time_per_iteration": 2.7075581550598145 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 260120304, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7074473936158720.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/training_args.bin b/sft_pretrain/Full_smoe_share/checkpoint-3120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2c6286920da78be894d16b2c1ec77f899cd590e0 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b25bd416aaf59aaeb5c9268446dadaf85f4d00dfc3ac3dfec454141b47f814d1 +size 7992 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-3120/zero_to_fp32.py b/sft_pretrain/Full_smoe_share/checkpoint-3120/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-3120/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/added_tokens.json b/sft_pretrain/Full_smoe_share/checkpoint-4160/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/config.json b/sft_pretrain/Full_smoe_share/checkpoint-4160/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bb9b0c4407eef6bd7d8c22453f95c43fd6ef0981 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_share", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/generation_config.json b/sft_pretrain/Full_smoe_share/checkpoint-4160/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b119292ca90cc5fdee82640d2f6905f3208fdbd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82a1e49d796ce02204f8291343278b6e9acc4a837c04395e309b1f57ff10a0fe +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c681a43b989fefc11c7fdee44836354d5889f100 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:641670c942bf13e467e4f565bd751ed2350909df94ff9249c22820452687e22b +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..29c5265871b7c9c9f3b812f646d55f62ebef4df3 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5e34b133b22d6bb779469a7f7c3e47c58d1cc0a10593b5a12c4e6357d6e691d +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..cdd7b95ddb961daad5191ba2147f38ad78c8a2cc --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e272309d154a2a728025970e918098502fe97b5deacaf1669a6805ac2c7f83a +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f18deca0ae15f926a825788a8fa12ee2be79950b --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e0c2df3ac7371f435e427c6ed80a2ae0c55734dd6d8d014d9d22483fc8244e3 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..35940b65314dcae551499ddff687e5696d273ff9 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbb1082d3338cc2b175f41ad34239c90eadfff79f907e577051db34f2ed181ef +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0aabcc96e1f029272e2ec90c0340161f8093a5a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0dcb7591135986ed041e623cb589796cf7719903e4aa28ea37c7196bb48f0a +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..25dddf57348426295054920f28fad4a812f5b734 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/global_step4160/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15b3ad0bf6ee2e7792b833dbd0fce4cacc6dd490883f522b9342b5cec512369d +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/latest b/sft_pretrain/Full_smoe_share/checkpoint-4160/latest new file mode 100644 index 0000000000000000000000000000000000000000..ae01dfd535e9ee314b565695c1d61230ecf4c494 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/latest @@ -0,0 +1 @@ +global_step4160 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_share/checkpoint-4160/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_share/checkpoint-4160/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8d7298e4d2772c8e8d1345effd65d8733f588451 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7278cb0885ab2e7bd12611c8c81ddf3f3a2798c07d5958f2799c876c7edbea23 +size 3759020544 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/model.safetensors.index.json b/sft_pretrain/Full_smoe_share/checkpoint-4160/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..90a56cf0153ed9062ff35ee2b372f7ab9e796c6a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731420128 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/rng_state_0.pth b/sft_pretrain/Full_smoe_share/checkpoint-4160/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/rng_state_1.pth b/sft_pretrain/Full_smoe_share/checkpoint-4160/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/rng_state_2.pth b/sft_pretrain/Full_smoe_share/checkpoint-4160/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/rng_state_3.pth b/sft_pretrain/Full_smoe_share/checkpoint-4160/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/special_tokens_map.json b/sft_pretrain/Full_smoe_share/checkpoint-4160/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/tokenizer.model b/sft_pretrain/Full_smoe_share/checkpoint-4160/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/tokenizer_config.json b/sft_pretrain/Full_smoe_share/checkpoint-4160/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/trainer_state.json b/sft_pretrain/Full_smoe_share/checkpoint-4160/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fb94ca46fe41237063781834cb349316e1c4aa63 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/trainer_state.json @@ -0,0 +1,62433 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8003078106964217, + "eval_steps": 500, + "global_step": 4160, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03958175, + "balance_loss_mlp": 3.00755191, + "epoch": 0.00019238168526356292, + "flos": 470464353792.0, + "grad_norm": 28.914608756113072, + "language_loss": 3.87018156, + "learning_rate": 0.0, + "loss": 2.58113432, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 9.5, + "step": 1, + "time_per_iteration": 23.802019834518433 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01915335, + "balance_loss_mlp": 1.25005209, + "epoch": 0.00038476337052712584, + "flos": 504311436288.0, + "grad_norm": 4.8593923560988435, + "language_loss": 2.35405588, + "learning_rate": 0.00013726078121135892, + "loss": 2.37320924, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.65625, + "step": 2, + "time_per_iteration": 2.8532886505126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01920846, + "balance_loss_mlp": 1.25708926, + "epoch": 0.0005771450557906887, + "flos": 598869282816.0, + "grad_norm": 3.0028031994213777, + "language_loss": 1.96315837, + "learning_rate": 0.00021755319103969496, + "loss": 1.9823668, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.640625, + "step": 3, + "time_per_iteration": 2.841437339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01900548, + "balance_loss_mlp": 1.26196778, + "epoch": 0.0007695267410542517, + "flos": 580133491200.0, + "grad_norm": 1.731178632358193, + "language_loss": 1.51703906, + "learning_rate": 0.00027452156242271784, + "loss": 1.53604448, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.3828125, + "step": 4, + "time_per_iteration": 2.7456114292144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01846218, + "balance_loss_mlp": 1.25188851, + "epoch": 0.0009619084263178145, + "flos": 485857861632.0, + "grad_norm": 2.5417144067747603, + "language_loss": 1.52625787, + "learning_rate": 0.0003187096642208417, + "loss": 1.54472005, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 5.93359375, + "step": 5, + "time_per_iteration": 2.6199026107788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0183984, + "balance_loss_mlp": 1.27068734, + "epoch": 0.0011542901115813775, + "flos": 559744791552.0, + "grad_norm": 1.334824335042464, + "language_loss": 1.40782702, + "learning_rate": 0.0003548139722510539, + "loss": 1.42622542, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 5.69921875, + "step": 6, + "time_per_iteration": 2.747270107269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0199186, + "balance_loss_mlp": 1.44254375, + "epoch": 0.0013466717968449403, + "flos": 533721014784.0, + "grad_norm": 1.092177996343933, + "language_loss": 1.36706996, + "learning_rate": 0.00038533972973918044, + "loss": 1.38698864, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.5, + "step": 7, + "time_per_iteration": 2.6748878955841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02260733, + "balance_loss_mlp": 1.72209811, + "epoch": 0.0015390534821085034, + "flos": 492037175808.0, + "grad_norm": 0.8384078813871362, + "language_loss": 1.30779457, + "learning_rate": 0.0004117823436340768, + "loss": 1.3304019, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.390625, + "step": 8, + "time_per_iteration": 2.6130077838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02549259, + "balance_loss_mlp": 2.01024222, + "epoch": 0.0017314351673720662, + "flos": 564402207744.0, + "grad_norm": 0.9225645938984937, + "language_loss": 1.40127456, + "learning_rate": 0.00043510638207938993, + "loss": 1.42676711, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.39453125, + "step": 9, + "time_per_iteration": 2.8516194820404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02769124, + "balance_loss_mlp": 2.22057033, + "epoch": 0.001923816852635629, + "flos": 593132308992.0, + "grad_norm": 2.3673640139094667, + "language_loss": 1.25222194, + "learning_rate": 0.00045597044543220066, + "loss": 1.27991319, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.4921875, + "step": 10, + "time_per_iteration": 2.6775431632995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02889683, + "balance_loss_mlp": 2.31366348, + "epoch": 0.002116198537899192, + "flos": 609308752896.0, + "grad_norm": 3.9279002976271125, + "language_loss": 1.24874163, + "learning_rate": 0.00047484428652143135, + "loss": 1.27763844, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 5.765625, + "step": 11, + "time_per_iteration": 2.978304386138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0309849, + "balance_loss_mlp": 2.49538684, + "epoch": 0.002308580223162755, + "flos": 544869075456.0, + "grad_norm": 1.4997276509751025, + "language_loss": 1.30425894, + "learning_rate": 0.0004920747534624128, + "loss": 1.33524382, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 6.01953125, + "step": 12, + "time_per_iteration": 2.660757064819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0325611, + "balance_loss_mlp": 2.63698483, + "epoch": 0.002500961908426318, + "flos": 644458277376.0, + "grad_norm": 0.27573519674031227, + "language_loss": 1.29333067, + "learning_rate": 0.0005079252465375872, + "loss": 1.32589173, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 6.1875, + "step": 13, + "time_per_iteration": 2.905634880065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03517619, + "balance_loss_mlp": 2.87789392, + "epoch": 0.0026933435936898806, + "flos": 487605312000.0, + "grad_norm": 0.5949349515444387, + "language_loss": 1.16881835, + "learning_rate": 0.0005226005109505393, + "loss": 1.20399451, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 6.39453125, + "step": 14, + "time_per_iteration": 2.6116466522216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03647219, + "balance_loss_mlp": 2.99872088, + "epoch": 0.0028857252789534437, + "flos": 434368949760.0, + "grad_norm": 0.7718254129229014, + "language_loss": 1.22867727, + "learning_rate": 0.0005362628552605367, + "loss": 1.26514947, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 6.484375, + "step": 15, + "time_per_iteration": 2.80147123336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03485084, + "balance_loss_mlp": 2.81407928, + "epoch": 0.0030781069642170067, + "flos": 596465676288.0, + "grad_norm": 0.7401604798059911, + "language_loss": 1.27103257, + "learning_rate": 0.0005490431248454357, + "loss": 1.30588341, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 6.71484375, + "step": 16, + "time_per_iteration": 2.72638201713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03091961, + "balance_loss_mlp": 2.46329856, + "epoch": 0.0032704886494805694, + "flos": 1537360432128.0, + "grad_norm": 0.30683115050750837, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78797078, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 6.28125, + "step": 17, + "time_per_iteration": 6.094223260879517 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03189654, + "balance_loss_mlp": 2.50453377, + "epoch": 0.0034628703347441324, + "flos": 473720403456.0, + "grad_norm": 0.3045463524910074, + "language_loss": 1.13145232, + "learning_rate": 0.0005723671632907488, + "loss": 1.16334891, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 6.859375, + "step": 18, + "time_per_iteration": 2.6759910583496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03092663, + "balance_loss_mlp": 2.39648056, + "epoch": 0.0036552520200076955, + "flos": 448303320576.0, + "grad_norm": 0.23602477180386344, + "language_loss": 1.18155861, + "learning_rate": 0.0005830738490244919, + "loss": 1.21248519, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 6.97265625, + "step": 19, + "time_per_iteration": 2.505410671234131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03039888, + "balance_loss_mlp": 2.32653999, + "epoch": 0.003847633705271258, + "flos": 635881148928.0, + "grad_norm": 0.24009706761990102, + "language_loss": 1.19359791, + "learning_rate": 0.0005932312266435596, + "loss": 1.22399676, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 7.12890625, + "step": 20, + "time_per_iteration": 2.78657603263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03040938, + "balance_loss_mlp": 2.32339382, + "epoch": 0.004040015390534821, + "flos": 589222771200.0, + "grad_norm": 0.17079239690828452, + "language_loss": 1.14516783, + "learning_rate": 0.0006028929207788754, + "loss": 1.17557728, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 7.171875, + "step": 21, + "time_per_iteration": 2.7249202728271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03095818, + "balance_loss_mlp": 2.35843754, + "epoch": 0.004232397075798384, + "flos": 756253338624.0, + "grad_norm": 0.14242736472953105, + "language_loss": 1.17636526, + "learning_rate": 0.0006121050677327902, + "loss": 1.20732355, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 7.3671875, + "step": 22, + "time_per_iteration": 2.8816165924072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03158898, + "balance_loss_mlp": 2.41388798, + "epoch": 0.004424778761061947, + "flos": 526434439680.0, + "grad_norm": 0.2087285570273359, + "language_loss": 1.07450879, + "learning_rate": 0.0006209076479463684, + "loss": 1.10609782, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 7.44140625, + "step": 23, + "time_per_iteration": 2.6234865188598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03183939, + "balance_loss_mlp": 2.43282533, + "epoch": 0.00461716044632551, + "flos": 547907079168.0, + "grad_norm": 0.1648031444861348, + "language_loss": 1.17208815, + "learning_rate": 0.0006293355346737718, + "loss": 1.20392752, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 7.50390625, + "step": 24, + "time_per_iteration": 2.6747982501983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03230874, + "balance_loss_mlp": 2.47976065, + "epoch": 0.004809542131589073, + "flos": 567293234688.0, + "grad_norm": 0.19727819873357916, + "language_loss": 1.13454294, + "learning_rate": 0.0006374193284416834, + "loss": 1.16685176, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 7.5078125, + "step": 25, + "time_per_iteration": 2.778822898864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0329228, + "balance_loss_mlp": 2.5568068, + "epoch": 0.005001923816852636, + "flos": 470391418368.0, + "grad_norm": 0.1350276315355779, + "language_loss": 1.11706781, + "learning_rate": 0.0006451860277489461, + "loss": 1.14999056, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 7.34765625, + "step": 26, + "time_per_iteration": 2.595344305038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03271905, + "balance_loss_mlp": 2.55016398, + "epoch": 0.005194305502116198, + "flos": 415283950080.0, + "grad_norm": 0.16347516382600882, + "language_loss": 1.19968891, + "learning_rate": 0.0006526595731190848, + "loss": 1.23240781, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 7.21484375, + "step": 27, + "time_per_iteration": 2.4664127826690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03288089, + "balance_loss_mlp": 2.59610367, + "epoch": 0.005386687187379761, + "flos": 628466535936.0, + "grad_norm": 0.1428829159478278, + "language_loss": 1.13108253, + "learning_rate": 0.0006598612921618983, + "loss": 1.16396332, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 6.92578125, + "step": 28, + "time_per_iteration": 2.804295778274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03294075, + "balance_loss_mlp": 2.62612176, + "epoch": 0.005579068872643324, + "flos": 886100332032.0, + "grad_norm": 0.20851883498814452, + "language_loss": 1.0600431, + "learning_rate": 0.0006668102665011454, + "loss": 1.09298372, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 6.68359375, + "step": 29, + "time_per_iteration": 3.255702495574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03199031, + "balance_loss_mlp": 2.59096837, + "epoch": 0.005771450557906887, + "flos": 547287238656.0, + "grad_norm": 0.2979528071454863, + "language_loss": 1.15479767, + "learning_rate": 0.0006735236364718957, + "loss": 1.18678796, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 6.0703125, + "step": 30, + "time_per_iteration": 2.7074596881866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03173184, + "balance_loss_mlp": 2.61356831, + "epoch": 0.00596383224317045, + "flos": 531766950912.0, + "grad_norm": 0.19339065750569648, + "language_loss": 1.13838637, + "learning_rate": 0.0006800168558381346, + "loss": 1.17011821, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 5.60546875, + "step": 31, + "time_per_iteration": 2.6867663860321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03044372, + "balance_loss_mlp": 2.54197669, + "epoch": 0.0061562139284340135, + "flos": 588813926400.0, + "grad_norm": 0.19192711986346297, + "language_loss": 1.17224455, + "learning_rate": 0.0006863039060567947, + "loss": 1.20268822, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 5.01953125, + "step": 32, + "time_per_iteration": 2.7029900550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02954172, + "balance_loss_mlp": 2.48954153, + "epoch": 0.006348595613697576, + "flos": 617929551360.0, + "grad_norm": 0.18120318877382763, + "language_loss": 1.09236336, + "learning_rate": 0.0006923974775611263, + "loss": 1.12190521, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 4.640625, + "step": 33, + "time_per_iteration": 2.7966651916503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02822322, + "balance_loss_mlp": 2.40728283, + "epoch": 0.006540977298961139, + "flos": 777564444672.0, + "grad_norm": 0.145871801521796, + "language_loss": 1.05915022, + "learning_rate": 0.0006983091239737814, + "loss": 1.0873735, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 4.15625, + "step": 34, + "time_per_iteration": 2.9987330436706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02690136, + "balance_loss_mlp": 2.31496024, + "epoch": 0.006733358984224702, + "flos": 666837356544.0, + "grad_norm": 0.3134152992972928, + "language_loss": 1.04935622, + "learning_rate": 0.0007040493939600222, + "loss": 1.07625759, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 3.75, + "step": 35, + "time_per_iteration": 2.8552193641662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02568493, + "balance_loss_mlp": 2.22154617, + "epoch": 0.006925740669488265, + "flos": 564092287488.0, + "grad_norm": 0.17701612022333574, + "language_loss": 1.05792356, + "learning_rate": 0.0007096279445021078, + "loss": 1.08360851, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 3.47070312, + "step": 36, + "time_per_iteration": 2.7224435806274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02489254, + "balance_loss_mlp": 2.16557646, + "epoch": 0.007118122354751828, + "flos": 549583156224.0, + "grad_norm": 0.13856321956275922, + "language_loss": 1.12953377, + "learning_rate": 0.0007150536386503726, + "loss": 1.15442634, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 3.23632812, + "step": 37, + "time_per_iteration": 2.868824005126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02371099, + "balance_loss_mlp": 2.08385229, + "epoch": 0.007310504040015391, + "flos": 702161409024.0, + "grad_norm": 0.1045684718913455, + "language_loss": 1.04885924, + "learning_rate": 0.0007203346302358509, + "loss": 1.0725702, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 2.87304688, + "step": 38, + "time_per_iteration": 2.9964613914489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.022844, + "balance_loss_mlp": 2.01431966, + "epoch": 0.007502885725278953, + "flos": 599022051840.0, + "grad_norm": 0.11457879899925279, + "language_loss": 1.09371829, + "learning_rate": 0.000725478437577282, + "loss": 1.11656225, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 2.703125, + "step": 39, + "time_per_iteration": 2.7697911262512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02209938, + "balance_loss_mlp": 1.9577868, + "epoch": 0.007695267410542516, + "flos": 560000867328.0, + "grad_norm": 0.09741634912607965, + "language_loss": 1.05106318, + "learning_rate": 0.0007304920078549186, + "loss": 1.07316256, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 2.51953125, + "step": 40, + "time_per_iteration": 2.6858811378479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02127988, + "balance_loss_mlp": 1.89738917, + "epoch": 0.007887649095806078, + "flos": 507906671616.0, + "grad_norm": 0.1027173821952558, + "language_loss": 1.0668, + "learning_rate": 0.0007353817735343603, + "loss": 1.08807993, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 2.30273438, + "step": 41, + "time_per_iteration": 2.7466464042663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0203117, + "balance_loss_mlp": 1.82136178, + "epoch": 0.008080030781069641, + "flos": 503642133504.0, + "grad_norm": 0.13433083641106106, + "language_loss": 1.02085233, + "learning_rate": 0.0007401537019902344, + "loss": 1.04116416, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 2.10058594, + "step": 42, + "time_per_iteration": 2.6472368240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01955875, + "balance_loss_mlp": 1.77000403, + "epoch": 0.008272412466333205, + "flos": 517764178944.0, + "grad_norm": 0.1211736659455407, + "language_loss": 1.05737603, + "learning_rate": 0.0007448133392900729, + "loss": 1.07693481, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 1.85742188, + "step": 43, + "time_per_iteration": 2.716550588607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01955604, + "balance_loss_mlp": 1.78737581, + "epoch": 0.008464794151596768, + "flos": 607673373696.0, + "grad_norm": 0.16872872054008078, + "language_loss": 1.01187599, + "learning_rate": 0.0007493658489441491, + "loss": 1.03143215, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 1.68261719, + "step": 44, + "time_per_iteration": 2.875014066696167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01891991, + "balance_loss_mlp": 1.7426461, + "epoch": 0.00865717583686033, + "flos": 537661075968.0, + "grad_norm": 0.13908928982797317, + "language_loss": 1.04866791, + "learning_rate": 0.0007538160463002316, + "loss": 1.06758785, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 1.4921875, + "step": 45, + "time_per_iteration": 2.6579010486602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01770341, + "balance_loss_mlp": 1.64674437, + "epoch": 0.008849557522123894, + "flos": 507758284800.0, + "grad_norm": 0.10189568444589565, + "language_loss": 1.07831812, + "learning_rate": 0.0007581684291577274, + "loss": 1.09602141, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 1.234375, + "step": 46, + "time_per_iteration": 2.640967845916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01721967, + "balance_loss_mlp": 1.61086416, + "epoch": 0.009041939207387457, + "flos": 625048800768.0, + "grad_norm": 0.13316435244960997, + "language_loss": 1.10805786, + "learning_rate": 0.0007624272050891776, + "loss": 1.12527752, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 1.11230469, + "step": 47, + "time_per_iteration": 2.8335459232330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01578117, + "balance_loss_mlp": 1.4876132, + "epoch": 0.00923432089265102, + "flos": 549124849152.0, + "grad_norm": 0.11283146306838601, + "language_loss": 1.0112282, + "learning_rate": 0.0007665963158851307, + "loss": 1.02700949, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.90478516, + "step": 48, + "time_per_iteration": 2.8267853260040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01494271, + "balance_loss_mlp": 1.41659403, + "epoch": 0.009426702577914583, + "flos": 562202242560.0, + "grad_norm": 0.11438710989386189, + "language_loss": 1.09804726, + "learning_rate": 0.0007706794594783609, + "loss": 1.11299002, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.77587891, + "step": 49, + "time_per_iteration": 2.767359495162964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450716, + "balance_loss_mlp": 1.37876153, + "epoch": 0.009619084263178146, + "flos": 616486228992.0, + "grad_norm": 0.12814906604020712, + "language_loss": 1.08643568, + "learning_rate": 0.0007746801096530423, + "loss": 1.10094285, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.71972656, + "step": 50, + "time_per_iteration": 2.8213155269622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0143922, + "balance_loss_mlp": 1.37599134, + "epoch": 0.009811465948441709, + "flos": 541176325632.0, + "grad_norm": 0.19317362931311696, + "language_loss": 1.13336241, + "learning_rate": 0.0007786015338021173, + "loss": 1.14775467, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.63183594, + "step": 51, + "time_per_iteration": 2.670414924621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01421394, + "balance_loss_mlp": 1.36116982, + "epoch": 0.010003847633705272, + "flos": 535608087552.0, + "grad_norm": 0.10636608126159033, + "language_loss": 1.06046486, + "learning_rate": 0.0007824468089603051, + "loss": 1.0746789, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.60205078, + "step": 52, + "time_per_iteration": 2.650749683380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01398771, + "balance_loss_mlp": 1.34627175, + "epoch": 0.010196229318968833, + "flos": 908867907072.0, + "grad_norm": 0.08734537144859746, + "language_loss": 1.05057502, + "learning_rate": 0.0007862188363098669, + "loss": 1.0645628, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.52587891, + "step": 53, + "time_per_iteration": 3.1914114952087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339892, + "balance_loss_mlp": 1.29123116, + "epoch": 0.010388611004232396, + "flos": 585594040320.0, + "grad_norm": 0.12892942806844523, + "language_loss": 1.05977488, + "learning_rate": 0.0007899203543304438, + "loss": 1.07317376, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.48608398, + "step": 54, + "time_per_iteration": 2.7370150089263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129116, + "balance_loss_mlp": 1.24609876, + "epoch": 0.01058099268949596, + "flos": 502233716736.0, + "grad_norm": 0.10351520483586135, + "language_loss": 1.19524932, + "learning_rate": 0.0007935539507422731, + "loss": 1.20816088, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.45068359, + "step": 55, + "time_per_iteration": 2.5938258171081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241218, + "balance_loss_mlp": 1.19842196, + "epoch": 0.010773374374759523, + "flos": 544170659328.0, + "grad_norm": 0.14579553174668378, + "language_loss": 1.11398613, + "learning_rate": 0.0007971220733732573, + "loss": 1.12639832, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.42822266, + "step": 56, + "time_per_iteration": 2.69441556930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214647, + "balance_loss_mlp": 1.1754272, + "epoch": 0.010965756060023086, + "flos": 525874235904.0, + "grad_norm": 0.08690334212617827, + "language_loss": 1.05753016, + "learning_rate": 0.0008006270400641869, + "loss": 1.06967664, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.39208984, + "step": 57, + "time_per_iteration": 2.72200345993042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172174, + "balance_loss_mlp": 1.13638771, + "epoch": 0.011158137745286649, + "flos": 576653147136.0, + "grad_norm": 0.1589230608581115, + "language_loss": 1.07195449, + "learning_rate": 0.0008040710477125043, + "loss": 1.08367622, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.35791016, + "step": 58, + "time_per_iteration": 2.7268636226654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116856, + "balance_loss_mlp": 1.13193893, + "epoch": 0.011350519430550212, + "flos": 529024310784.0, + "grad_norm": 0.10215076611006164, + "language_loss": 1.07557666, + "learning_rate": 0.0008074561805429771, + "loss": 1.08726227, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.36645508, + "step": 59, + "time_per_iteration": 2.6336522102355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116508, + "balance_loss_mlp": 1.13067603, + "epoch": 0.011542901115813775, + "flos": 555608291328.0, + "grad_norm": 0.1141641229712409, + "language_loss": 1.06040812, + "learning_rate": 0.0008107844176832545, + "loss": 1.07205892, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.34399414, + "step": 60, + "time_per_iteration": 2.6922121047973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181661, + "balance_loss_mlp": 1.14883125, + "epoch": 0.011735282801077338, + "flos": 571826995200.0, + "grad_norm": 0.13546354224487772, + "language_loss": 1.07509732, + "learning_rate": 0.0008140576401132568, + "loss": 1.08691382, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.32836914, + "step": 61, + "time_per_iteration": 2.632707357406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183098, + "balance_loss_mlp": 1.15415382, + "epoch": 0.0119276644863409, + "flos": 615309156864.0, + "grad_norm": 0.21921646489667587, + "language_loss": 1.08552384, + "learning_rate": 0.0008172776370494935, + "loss": 1.09735489, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.28955078, + "step": 62, + "time_per_iteration": 2.736295700073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169478, + "balance_loss_mlp": 1.14103436, + "epoch": 0.012120046171604464, + "flos": 500835474432.0, + "grad_norm": 0.08851801033761798, + "language_loss": 1.15278125, + "learning_rate": 0.0008204461118185703, + "loss": 1.16447616, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.28417969, + "step": 63, + "time_per_iteration": 2.6189370155334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164028, + "balance_loss_mlp": 1.13801682, + "epoch": 0.012312427856868027, + "flos": 473109327360.0, + "grad_norm": 0.09949063345381139, + "language_loss": 1.0443747, + "learning_rate": 0.0008235646872681536, + "loss": 1.05601501, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.26025391, + "step": 64, + "time_per_iteration": 2.5901291370391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163729, + "balance_loss_mlp": 1.13857555, + "epoch": 0.012504809542131588, + "flos": 538094651904.0, + "grad_norm": 0.13431360680602436, + "language_loss": 1.04092753, + "learning_rate": 0.0008266349107584288, + "loss": 1.05256474, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.25146484, + "step": 65, + "time_per_iteration": 2.6860554218292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162931, + "balance_loss_mlp": 1.13891053, + "epoch": 0.012697191227395151, + "flos": 608450365440.0, + "grad_norm": 0.1102068865315058, + "language_loss": 1.07257366, + "learning_rate": 0.0008296582587724851, + "loss": 1.08420289, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.24023438, + "step": 66, + "time_per_iteration": 2.7269198894500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160507, + "balance_loss_mlp": 1.1370945, + "epoch": 0.012889572912658714, + "flos": 767750607360.0, + "grad_norm": 0.08100484164865049, + "language_loss": 1.05156851, + "learning_rate": 0.0008326361411800136, + "loss": 1.06317365, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.23400879, + "step": 67, + "time_per_iteration": 2.984511613845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164469, + "balance_loss_mlp": 1.14209354, + "epoch": 0.013081954597922277, + "flos": 533604561408.0, + "grad_norm": 0.7331609098323609, + "language_loss": 1.05716372, + "learning_rate": 0.0008355699051851403, + "loss": 1.06880832, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.22363281, + "step": 68, + "time_per_iteration": 2.7606749534606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236513, + "balance_loss_mlp": 1.21256447, + "epoch": 0.01327433628318584, + "flos": 572826567168.0, + "grad_norm": 0.09768789722348739, + "language_loss": 1.12206995, + "learning_rate": 0.0008384608389860635, + "loss": 1.13443518, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.23950195, + "step": 69, + "time_per_iteration": 2.687361001968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308666, + "balance_loss_mlp": 1.28513408, + "epoch": 0.013466717968449404, + "flos": 497029243392.0, + "grad_norm": 0.20600635395561306, + "language_loss": 1.02831006, + "learning_rate": 0.000841310175171381, + "loss": 1.04139686, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.23510742, + "step": 70, + "time_per_iteration": 2.5935816764831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01326501, + "balance_loss_mlp": 1.30259991, + "epoch": 0.013659099653712967, + "flos": 565234454016.0, + "grad_norm": 0.21749814226597305, + "language_loss": 1.00826097, + "learning_rate": 0.000844119093875517, + "loss": 1.0215261, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.2388916, + "step": 71, + "time_per_iteration": 2.706749439239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327473, + "balance_loss_mlp": 1.30280876, + "epoch": 0.01385148133897653, + "flos": 573540950016.0, + "grad_norm": 0.15663283615990556, + "language_loss": 1.06174731, + "learning_rate": 0.0008468887257134666, + "loss": 1.0750221, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.24682617, + "step": 72, + "time_per_iteration": 2.6893503665924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01307936, + "balance_loss_mlp": 1.28290248, + "epoch": 0.014043863024240093, + "flos": 576539665920.0, + "grad_norm": 0.165113983041647, + "language_loss": 1.08480573, + "learning_rate": 0.0008496201545131264, + "loss": 1.09788513, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.25012207, + "step": 73, + "time_per_iteration": 2.722555637359619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228575, + "balance_loss_mlp": 1.20456624, + "epoch": 0.014236244709503656, + "flos": 938287660032.0, + "grad_norm": 0.08819174949442792, + "language_loss": 1.05711758, + "learning_rate": 0.0008523144198617317, + "loss": 1.06940317, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.2401123, + "step": 74, + "time_per_iteration": 3.1970512866973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197377, + "balance_loss_mlp": 1.17341638, + "epoch": 0.014428626394767219, + "flos": 528231352320.0, + "grad_norm": 0.4509181854760719, + "language_loss": 1.05384588, + "learning_rate": 0.0008549725194813783, + "loss": 1.06581974, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.23962402, + "step": 75, + "time_per_iteration": 2.6595916748046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153799, + "balance_loss_mlp": 1.13126826, + "epoch": 0.014621008080030782, + "flos": 803371433472.0, + "grad_norm": 0.13717241934186405, + "language_loss": 1.0561651, + "learning_rate": 0.0008575954114472099, + "loss": 1.06770301, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.2253418, + "step": 76, + "time_per_iteration": 3.126678943634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146131, + "balance_loss_mlp": 1.12299228, + "epoch": 0.014813389765294343, + "flos": 696588788736.0, + "grad_norm": 0.24880809118993477, + "language_loss": 1.04725742, + "learning_rate": 0.0008601840162606118, + "loss": 1.05871868, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.23132324, + "step": 77, + "time_per_iteration": 3.0479044914245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125538, + "balance_loss_mlp": 1.10244715, + "epoch": 0.015005771450557906, + "flos": 596702813184.0, + "grad_norm": 0.18599993070264256, + "language_loss": 1.10793126, + "learning_rate": 0.000862739218788641, + "loss": 1.11918664, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.23083496, + "step": 78, + "time_per_iteration": 2.8093104362487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206073, + "balance_loss_mlp": 1.18093228, + "epoch": 0.01519815313582147, + "flos": 549148170240.0, + "grad_norm": 0.1007392116308827, + "language_loss": 1.07089067, + "learning_rate": 0.0008652618700799138, + "loss": 1.08295143, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.25146484, + "step": 79, + "time_per_iteration": 2.657278060913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312607, + "balance_loss_mlp": 1.28511751, + "epoch": 0.015390534821085032, + "flos": 430306642944.0, + "grad_norm": 0.10464806869950885, + "language_loss": 1.06340718, + "learning_rate": 0.0008677527890662774, + "loss": 1.07653332, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.27514648, + "step": 80, + "time_per_iteration": 2.541733741760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01403725, + "balance_loss_mlp": 1.37456632, + "epoch": 0.015582916506348595, + "flos": 523854743040.0, + "grad_norm": 0.15378710965831335, + "language_loss": 1.0758636, + "learning_rate": 0.0008702127641587799, + "loss": 1.08990085, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.29125977, + "step": 81, + "time_per_iteration": 2.6628620624542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01387899, + "balance_loss_mlp": 1.36045754, + "epoch": 0.015775298191612157, + "flos": 575151598080.0, + "grad_norm": 0.16587297874586884, + "language_loss": 1.02605438, + "learning_rate": 0.0008726425547457192, + "loss": 1.03993344, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.27490234, + "step": 82, + "time_per_iteration": 2.774146795272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365036, + "balance_loss_mlp": 1.34021688, + "epoch": 0.01596767987687572, + "flos": 610040664576.0, + "grad_norm": 0.16158882984955267, + "language_loss": 1.02648211, + "learning_rate": 0.0008750428925998964, + "loss": 1.04013252, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.24829102, + "step": 83, + "time_per_iteration": 2.745786190032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01321379, + "balance_loss_mlp": 1.29746556, + "epoch": 0.016160061562139283, + "flos": 566864040960.0, + "grad_norm": 0.12210664974135504, + "language_loss": 1.08113122, + "learning_rate": 0.0008774144832015932, + "loss": 1.09434509, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.23937988, + "step": 84, + "time_per_iteration": 2.695239543914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01701738, + "balance_loss_mlp": 1.6791358, + "epoch": 0.016352443247402846, + "flos": 1410557234688.0, + "grad_norm": 0.2213803749296612, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76476049, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.22558594, + "step": 85, + "time_per_iteration": 4.597177982330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228049, + "balance_loss_mlp": 1.20597172, + "epoch": 0.01654482493266641, + "flos": 730177127424.0, + "grad_norm": 0.08119704963525505, + "language_loss": 1.03748381, + "learning_rate": 0.0008820741205014318, + "loss": 1.04976428, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.22070312, + "step": 86, + "time_per_iteration": 2.881804943084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193732, + "balance_loss_mlp": 1.17282319, + "epoch": 0.016737206617929972, + "flos": 536016932352.0, + "grad_norm": 0.06752942516789381, + "language_loss": 1.04735541, + "learning_rate": 0.0008843634575408404, + "loss": 1.05929279, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.20922852, + "step": 87, + "time_per_iteration": 2.681497812271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197059, + "balance_loss_mlp": 1.17523217, + "epoch": 0.016929588303193535, + "flos": 536706584064.0, + "grad_norm": 0.068849585693396, + "language_loss": 1.06270838, + "learning_rate": 0.0008866266301555082, + "loss": 1.0746789, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.21826172, + "step": 88, + "time_per_iteration": 2.7393336296081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188909, + "balance_loss_mlp": 1.16813099, + "epoch": 0.017121969988457098, + "flos": 526498458624.0, + "grad_norm": 0.11163273932728453, + "language_loss": 1.06937528, + "learning_rate": 0.0008888642296509615, + "loss": 1.08126438, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.20776367, + "step": 89, + "time_per_iteration": 2.5859603881835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190142, + "balance_loss_mlp": 1.16919696, + "epoch": 0.01731435167372066, + "flos": 625304876544.0, + "grad_norm": 0.08151329596812326, + "language_loss": 1.11272717, + "learning_rate": 0.0008910768275115906, + "loss": 1.12462866, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.20947266, + "step": 90, + "time_per_iteration": 2.7672746181488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188461, + "balance_loss_mlp": 1.16750431, + "epoch": 0.017506733358984224, + "flos": 496157709312.0, + "grad_norm": 0.10059554630111206, + "language_loss": 1.06862557, + "learning_rate": 0.0008932649762767675, + "loss": 1.08051026, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.20947266, + "step": 91, + "time_per_iteration": 2.5685906410217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164871, + "balance_loss_mlp": 1.14348471, + "epoch": 0.017699115044247787, + "flos": 745613047296.0, + "grad_norm": 0.10996439779682221, + "language_loss": 1.10012543, + "learning_rate": 0.0008954292103690864, + "loss": 1.11177421, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.21398926, + "step": 92, + "time_per_iteration": 2.974438428878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164995, + "balance_loss_mlp": 1.14351392, + "epoch": 0.01789149672951135, + "flos": 515257265664.0, + "grad_norm": 0.07660536936337886, + "language_loss": 1.12072349, + "learning_rate": 0.0008975700468778296, + "loss": 1.13237333, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.21496582, + "step": 93, + "time_per_iteration": 2.5806186199188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162029, + "balance_loss_mlp": 1.14116728, + "epoch": 0.018083878414774913, + "flos": 585850116096.0, + "grad_norm": 0.0766138268717318, + "language_loss": 1.04864383, + "learning_rate": 0.0008996879863005366, + "loss": 1.06026423, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.20874023, + "step": 94, + "time_per_iteration": 2.6688339710235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153951, + "balance_loss_mlp": 1.13311303, + "epoch": 0.018276260100038477, + "flos": 497103436800.0, + "grad_norm": 0.05852633811132637, + "language_loss": 1.05006421, + "learning_rate": 0.0009017835132453337, + "loss": 1.06160367, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.20849609, + "step": 95, + "time_per_iteration": 2.5905888080596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168717, + "balance_loss_mlp": 1.14677107, + "epoch": 0.01846864178530204, + "flos": 639765955584.0, + "grad_norm": 0.10434292302548942, + "language_loss": 1.05011988, + "learning_rate": 0.0009038570970964896, + "loss": 1.06180692, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.21960449, + "step": 96, + "time_per_iteration": 2.819176197052002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143672, + "balance_loss_mlp": 1.12233388, + "epoch": 0.018661023470565603, + "flos": 511411746816.0, + "grad_norm": 0.06578690538752763, + "language_loss": 1.02219808, + "learning_rate": 0.0009059091926454854, + "loss": 1.0336349, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.21362305, + "step": 97, + "time_per_iteration": 2.6332285404205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128399, + "balance_loss_mlp": 1.10685802, + "epoch": 0.018853405155829166, + "flos": 930710103552.0, + "grad_norm": 0.06319745463615938, + "language_loss": 1.01510525, + "learning_rate": 0.0009079402406897198, + "loss": 1.02638912, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.2154541, + "step": 98, + "time_per_iteration": 3.231128454208374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115682, + "balance_loss_mlp": 1.09365261, + "epoch": 0.01904578684109273, + "flos": 576209396736.0, + "grad_norm": 0.08014689887623593, + "language_loss": 1.0309999, + "learning_rate": 0.0009099506686008212, + "loss": 1.0421567, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.22045898, + "step": 99, + "time_per_iteration": 2.7899162769317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108676, + "balance_loss_mlp": 1.08750439, + "epoch": 0.019238168526356292, + "flos": 558173431296.0, + "grad_norm": 0.07479046847477189, + "language_loss": 1.06245041, + "learning_rate": 0.0009119408908644013, + "loss": 1.07353711, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.21179199, + "step": 100, + "time_per_iteration": 2.76654314994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111244, + "balance_loss_mlp": 1.09094632, + "epoch": 0.019430550211619855, + "flos": 723539506176.0, + "grad_norm": 0.1293510891653682, + "language_loss": 1.11089611, + "learning_rate": 0.0009139113095929519, + "loss": 1.12202048, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.21496582, + "step": 101, + "time_per_iteration": 2.9448165893554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113187, + "balance_loss_mlp": 1.09134769, + "epoch": 0.019622931896883418, + "flos": 499235000832.0, + "grad_norm": 0.0662757157914564, + "language_loss": 1.05513644, + "learning_rate": 0.0009158623150134762, + "loss": 1.06626844, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.21838379, + "step": 102, + "time_per_iteration": 2.561089277267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132871, + "balance_loss_mlp": 1.11103153, + "epoch": 0.01981531358214698, + "flos": 508916418048.0, + "grad_norm": 0.12924626158025887, + "language_loss": 1.05462444, + "learning_rate": 0.000917794285931332, + "loss": 1.06595314, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.21850586, + "step": 103, + "time_per_iteration": 2.6556921005249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150426, + "balance_loss_mlp": 1.12918282, + "epoch": 0.020007695267410544, + "flos": 521087371776.0, + "grad_norm": 0.12259017558591545, + "language_loss": 0.9774698, + "learning_rate": 0.0009197075901716639, + "loss": 0.98897398, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.21264648, + "step": 104, + "time_per_iteration": 2.721444845199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141997, + "balance_loss_mlp": 1.12036085, + "epoch": 0.020200076952674107, + "flos": 533013834240.0, + "grad_norm": 0.06848283791602199, + "language_loss": 1.07568073, + "learning_rate": 0.0009216025849997171, + "loss": 1.08710074, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.21655273, + "step": 105, + "time_per_iteration": 2.785515785217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138939, + "balance_loss_mlp": 1.11743319, + "epoch": 0.020392458637937667, + "flos": 684430981632.0, + "grad_norm": 0.05548353541402364, + "language_loss": 1.02272427, + "learning_rate": 0.0009234796175212258, + "loss": 1.03411365, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.21520996, + "step": 106, + "time_per_iteration": 2.917363166809082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131099, + "balance_loss_mlp": 1.10940301, + "epoch": 0.02058484032320123, + "flos": 701791852032.0, + "grad_norm": 0.08012311925806644, + "language_loss": 1.06108189, + "learning_rate": 0.000925339025064007, + "loss": 1.07239294, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.21691895, + "step": 107, + "time_per_iteration": 2.9934780597686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137249, + "balance_loss_mlp": 1.11515951, + "epoch": 0.020777222008464793, + "flos": 638772175872.0, + "grad_norm": 0.050481524705402105, + "language_loss": 0.98984301, + "learning_rate": 0.0009271811355418027, + "loss": 1.00121546, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.2208252, + "step": 108, + "time_per_iteration": 2.8833272457122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119308, + "balance_loss_mlp": 1.09725404, + "epoch": 0.020969603693728356, + "flos": 681785856000.0, + "grad_norm": 0.04498034405706927, + "language_loss": 1.05478954, + "learning_rate": 0.0009290062678013548, + "loss": 1.06598258, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.22058105, + "step": 109, + "time_per_iteration": 2.839287042617798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126832, + "balance_loss_mlp": 1.1043849, + "epoch": 0.02116198537899192, + "flos": 533140462080.0, + "grad_norm": 0.08965534617549129, + "language_loss": 1.03900754, + "learning_rate": 0.0009308147319536321, + "loss": 1.0502758, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.2244873, + "step": 110, + "time_per_iteration": 2.664785385131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127322, + "balance_loss_mlp": 1.10527992, + "epoch": 0.021354367064255482, + "flos": 717168135168.0, + "grad_norm": 0.07991094573250712, + "language_loss": 1.10446882, + "learning_rate": 0.0009326068296900676, + "loss": 1.11574197, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.22045898, + "step": 111, + "time_per_iteration": 2.826704502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118777, + "balance_loss_mlp": 1.09644949, + "epoch": 0.021546748749519045, + "flos": 519290459136.0, + "grad_norm": 0.05764113319631223, + "language_loss": 1.01306438, + "learning_rate": 0.0009343828545846161, + "loss": 1.02425218, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.2232666, + "step": 112, + "time_per_iteration": 2.774557113647461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130616, + "balance_loss_mlp": 1.10844338, + "epoch": 0.021739130434782608, + "flos": 504912337920.0, + "grad_norm": 0.11711254624088742, + "language_loss": 1.04517794, + "learning_rate": 0.0009361430923823841, + "loss": 1.0564841, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.22192383, + "step": 113, + "time_per_iteration": 2.5728189945220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143235, + "balance_loss_mlp": 1.12140775, + "epoch": 0.02193151212004617, + "flos": 463251820032.0, + "grad_norm": 0.09177669908726471, + "language_loss": 1.08950138, + "learning_rate": 0.0009378878212755459, + "loss": 1.10093367, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.21826172, + "step": 114, + "time_per_iteration": 2.4959068298339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118923, + "balance_loss_mlp": 1.09746575, + "epoch": 0.022123893805309734, + "flos": 552008673792.0, + "grad_norm": 0.05600308486582556, + "language_loss": 0.98889154, + "learning_rate": 0.0009396173121672103, + "loss": 1.00008082, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.21472168, + "step": 115, + "time_per_iteration": 2.687619924545288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131221, + "balance_loss_mlp": 1.11031187, + "epoch": 0.022316275490573297, + "flos": 635920436736.0, + "grad_norm": 0.06813536890625224, + "language_loss": 1.0438683, + "learning_rate": 0.0009413318289238633, + "loss": 1.05518055, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.20922852, + "step": 116, + "time_per_iteration": 2.7658987045288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115598, + "balance_loss_mlp": 1.09498656, + "epoch": 0.02250865717583686, + "flos": 798535107072.0, + "grad_norm": 0.10996119273554948, + "language_loss": 0.97187698, + "learning_rate": 0.0009430316286169771, + "loss": 0.98303294, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.20617676, + "step": 117, + "time_per_iteration": 3.027139186859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120875, + "balance_loss_mlp": 1.10050249, + "epoch": 0.022701038861100423, + "flos": 455851763712.0, + "grad_norm": 0.06369887166042827, + "language_loss": 1.02379179, + "learning_rate": 0.0009447169617543361, + "loss": 1.03500056, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.20373535, + "step": 118, + "time_per_iteration": 2.619460344314575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114928, + "balance_loss_mlp": 1.09472179, + "epoch": 0.022893420546363986, + "flos": 582812112384.0, + "grad_norm": 0.07832492020107534, + "language_loss": 1.08849907, + "learning_rate": 0.0009463880725016029, + "loss": 1.09964836, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.20214844, + "step": 119, + "time_per_iteration": 2.689627170562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108375, + "balance_loss_mlp": 1.08852673, + "epoch": 0.02308580223162755, + "flos": 561010613760.0, + "grad_norm": 0.05815728344132157, + "language_loss": 1.03645778, + "learning_rate": 0.0009480451988946134, + "loss": 1.0475415, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.19848633, + "step": 120, + "time_per_iteration": 2.8202247619628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111521, + "balance_loss_mlp": 1.09197092, + "epoch": 0.023278183916891113, + "flos": 770966111232.0, + "grad_norm": 0.09156908943756899, + "language_loss": 1.05033565, + "learning_rate": 0.0009496885730428627, + "loss": 1.06145096, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.1953125, + "step": 121, + "time_per_iteration": 3.060826539993286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111873, + "balance_loss_mlp": 1.09195304, + "epoch": 0.023470565602154676, + "flos": 553111552512.0, + "grad_norm": 0.07227042142752892, + "language_loss": 1.03125668, + "learning_rate": 0.0009513184213246156, + "loss": 1.04237533, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.19909668, + "step": 122, + "time_per_iteration": 2.693777322769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116574, + "balance_loss_mlp": 1.09648705, + "epoch": 0.02366294728741824, + "flos": 559744791552.0, + "grad_norm": 0.10676768106860933, + "language_loss": 1.06918037, + "learning_rate": 0.0009529349645740552, + "loss": 1.08034611, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.20080566, + "step": 123, + "time_per_iteration": 2.7788801193237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108243, + "balance_loss_mlp": 1.0888958, + "epoch": 0.0238553289726818, + "flos": 468313698816.0, + "grad_norm": 0.06448608913203197, + "language_loss": 1.05440235, + "learning_rate": 0.0009545384182608524, + "loss": 1.06548476, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.19335938, + "step": 124, + "time_per_iteration": 2.542592763900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125435, + "balance_loss_mlp": 1.10582459, + "epoch": 0.024047710657945365, + "flos": 559763730432.0, + "grad_norm": 0.07866021425619718, + "language_loss": 1.03027701, + "learning_rate": 0.0009561289926625252, + "loss": 1.04153132, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.19604492, + "step": 125, + "time_per_iteration": 2.790811538696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114447, + "balance_loss_mlp": 1.09582675, + "epoch": 0.024240092343208928, + "flos": 504528224256.0, + "grad_norm": 0.05023162105608455, + "language_loss": 1.0775013, + "learning_rate": 0.0009577068930299292, + "loss": 1.08864582, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.1862793, + "step": 126, + "time_per_iteration": 2.634744882583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131099, + "balance_loss_mlp": 1.11309838, + "epoch": 0.02443247402847249, + "flos": 435516908544.0, + "grad_norm": 0.11313548721486262, + "language_loss": 1.02903807, + "learning_rate": 0.0009592723197462087, + "loss": 1.04034901, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.18017578, + "step": 127, + "time_per_iteration": 2.673091411590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135261, + "balance_loss_mlp": 1.11693859, + "epoch": 0.024624855713736054, + "flos": 683445966336.0, + "grad_norm": 0.09449576280815732, + "language_loss": 0.99720573, + "learning_rate": 0.0009608254684795125, + "loss": 1.00855827, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.18334961, + "step": 128, + "time_per_iteration": 2.9315080642700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125442, + "balance_loss_mlp": 1.10695267, + "epoch": 0.024817237398999614, + "flos": 524721894912.0, + "grad_norm": 0.06510984253988934, + "language_loss": 1.02999425, + "learning_rate": 0.0009623665303297678, + "loss": 1.04124868, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.18493652, + "step": 129, + "time_per_iteration": 2.7419071197509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109401, + "balance_loss_mlp": 1.09171033, + "epoch": 0.025009619084263177, + "flos": 655350262272.0, + "grad_norm": 0.11817944884573778, + "language_loss": 1.06827164, + "learning_rate": 0.0009638956919697878, + "loss": 1.07936561, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.17712402, + "step": 130, + "time_per_iteration": 2.898789405822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109182, + "balance_loss_mlp": 1.09152734, + "epoch": 0.02520200076952674, + "flos": 454187271168.0, + "grad_norm": 0.08339763042198223, + "language_loss": 0.98782563, + "learning_rate": 0.0009654131357809714, + "loss": 0.99891746, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.17663574, + "step": 131, + "time_per_iteration": 2.5997226238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110117, + "balance_loss_mlp": 1.09165168, + "epoch": 0.025394382454790303, + "flos": 839427397632.0, + "grad_norm": 0.07600036723868295, + "language_loss": 1.07807457, + "learning_rate": 0.0009669190399838441, + "loss": 1.08917582, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.18469238, + "step": 132, + "time_per_iteration": 3.099355459213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123354, + "balance_loss_mlp": 1.10540128, + "epoch": 0.025586764140053866, + "flos": 580725628416.0, + "grad_norm": 0.1018451896089413, + "language_loss": 1.01215065, + "learning_rate": 0.0009684135787636724, + "loss": 1.02338421, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.17956543, + "step": 133, + "time_per_iteration": 2.8484303951263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110859, + "balance_loss_mlp": 1.09306097, + "epoch": 0.02577914582531743, + "flos": 789893959680.0, + "grad_norm": 0.0768854449505878, + "language_loss": 1.05274129, + "learning_rate": 0.0009698969223913726, + "loss": 1.06384993, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.17822266, + "step": 134, + "time_per_iteration": 3.0583713054656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099954, + "balance_loss_mlp": 1.08200145, + "epoch": 0.025971527510580992, + "flos": 594683320320.0, + "grad_norm": 0.06563028697143787, + "language_loss": 1.07862437, + "learning_rate": 0.0009713692373399265, + "loss": 1.08962393, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.1796875, + "step": 135, + "time_per_iteration": 2.6854658126831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01638015, + "balance_loss_mlp": 1.62485397, + "epoch": 0.026163909195844555, + "flos": 1576771522560.0, + "grad_norm": 0.19726256755033653, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81094241, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.13183594, + "step": 136, + "time_per_iteration": 5.296766042709351 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01420299, + "balance_loss_mlp": 1.40761507, + "epoch": 0.026356290881108118, + "flos": 1501306030080.0, + "grad_norm": 0.11305854818728235, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.7923134, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.12695312, + "step": 137, + "time_per_iteration": 4.982319355010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113827, + "balance_loss_mlp": 1.12156892, + "epoch": 0.02654867256637168, + "flos": 596841025536.0, + "grad_norm": 0.17869099152539902, + "language_loss": 1.01327038, + "learning_rate": 0.0009757216201974225, + "loss": 1.02465308, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.16699219, + "step": 138, + "time_per_iteration": 2.8622727394104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186505, + "balance_loss_mlp": 1.16889763, + "epoch": 0.026741054251635244, + "flos": 544761386496.0, + "grad_norm": 0.08591345057859309, + "language_loss": 1.05914044, + "learning_rate": 0.0009771514130396581, + "loss": 1.07100558, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.17614746, + "step": 139, + "time_per_iteration": 2.67812442779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120454, + "balance_loss_mlp": 1.18700433, + "epoch": 0.026933435936898807, + "flos": 506591387136.0, + "grad_norm": 0.10724594122721719, + "language_loss": 1.05634308, + "learning_rate": 0.00097857095638274, + "loss": 1.06838858, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.17541504, + "step": 140, + "time_per_iteration": 2.597321033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120509, + "balance_loss_mlp": 1.1880548, + "epoch": 0.02712581762216237, + "flos": 740513290752.0, + "grad_norm": 0.08882077115516282, + "language_loss": 0.97595245, + "learning_rate": 0.0009799803961288726, + "loss": 0.98800337, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.17053223, + "step": 141, + "time_per_iteration": 3.017937421798706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177855, + "balance_loss_mlp": 1.16135645, + "epoch": 0.027318199307425933, + "flos": 848023464960.0, + "grad_norm": 0.07711499257167788, + "language_loss": 1.03052521, + "learning_rate": 0.000981379875086876, + "loss": 1.04230392, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.16491699, + "step": 142, + "time_per_iteration": 3.0336825847625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154055, + "balance_loss_mlp": 1.13728189, + "epoch": 0.027510580992689496, + "flos": 575288400384.0, + "grad_norm": 0.06449204224600169, + "language_loss": 0.98759103, + "learning_rate": 0.0009827695330590185, + "loss": 0.99913156, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.16784668, + "step": 143, + "time_per_iteration": 2.635596990585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131533, + "balance_loss_mlp": 1.11474872, + "epoch": 0.02770296267795306, + "flos": 772079164416.0, + "grad_norm": 0.07528415949234718, + "language_loss": 0.98083055, + "learning_rate": 0.0009841495069248256, + "loss": 0.9921459, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.16796875, + "step": 144, + "time_per_iteration": 2.9648232460021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123928, + "balance_loss_mlp": 1.10686922, + "epoch": 0.027895344363216622, + "flos": 569123642880.0, + "grad_norm": 0.10995634154815045, + "language_loss": 0.97452384, + "learning_rate": 0.0009855199307219871, + "loss": 0.98576319, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.1706543, + "step": 145, + "time_per_iteration": 2.6601850986480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113445, + "balance_loss_mlp": 1.09632671, + "epoch": 0.028087726048480186, + "flos": 547099564032.0, + "grad_norm": 0.09468853295775125, + "language_loss": 0.98972148, + "learning_rate": 0.0009868809357244854, + "loss": 1.00085592, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.17138672, + "step": 146, + "time_per_iteration": 2.7714684009552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109349, + "balance_loss_mlp": 1.09192085, + "epoch": 0.02828010773374375, + "flos": 524519663616.0, + "grad_norm": 0.08177620360389791, + "language_loss": 1.02921426, + "learning_rate": 0.0009882326505180556, + "loss": 1.04030776, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.17443848, + "step": 147, + "time_per_iteration": 2.6678037643432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121816, + "balance_loss_mlp": 1.10459065, + "epoch": 0.02847248941900731, + "flos": 772108277760.0, + "grad_norm": 0.15200564524835, + "language_loss": 1.01768231, + "learning_rate": 0.0009895752010730906, + "loss": 1.02890062, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.17236328, + "step": 148, + "time_per_iteration": 2.944622755050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139991, + "balance_loss_mlp": 1.12333786, + "epoch": 0.028664871104270875, + "flos": 534150208512.0, + "grad_norm": 0.10043611919636293, + "language_loss": 1.0762012, + "learning_rate": 0.0009909087108150867, + "loss": 1.08760118, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.16662598, + "step": 149, + "time_per_iteration": 2.730631113052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123808, + "balance_loss_mlp": 1.10708272, + "epoch": 0.028857252789534438, + "flos": 367557599232.0, + "grad_norm": 0.08772923811196923, + "language_loss": 1.08558857, + "learning_rate": 0.0009922333006927371, + "loss": 1.09682679, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.1673584, + "step": 150, + "time_per_iteration": 2.5662901401519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108027, + "balance_loss_mlp": 1.09107542, + "epoch": 0.029049634474798, + "flos": 515232534528.0, + "grad_norm": 0.10678098958344774, + "language_loss": 1.02281368, + "learning_rate": 0.0009935490892437632, + "loss": 1.03389382, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.16967773, + "step": 151, + "time_per_iteration": 2.573842763900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110516, + "balance_loss_mlp": 1.0892458, + "epoch": 0.029242016160061564, + "flos": 587840495616.0, + "grad_norm": 0.07022496172976629, + "language_loss": 1.00216019, + "learning_rate": 0.0009948561926585687, + "loss": 1.01321173, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.15905762, + "step": 152, + "time_per_iteration": 2.762035608291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101658, + "balance_loss_mlp": 1.08582664, + "epoch": 0.029434397845325123, + "flos": 551816616960.0, + "grad_norm": 0.08132441134663608, + "language_loss": 1.04400539, + "learning_rate": 0.0009961547248418122, + "loss": 1.05502188, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.15820312, + "step": 153, + "time_per_iteration": 2.6525814533233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092159, + "balance_loss_mlp": 1.07619703, + "epoch": 0.029626779530588686, + "flos": 603221160960.0, + "grad_norm": 0.064379562707883, + "language_loss": 1.01020789, + "learning_rate": 0.0009974447974719707, + "loss": 1.02112949, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.1595459, + "step": 154, + "time_per_iteration": 2.814805746078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011076, + "balance_loss_mlp": 1.09136379, + "epoch": 0.02981916121585225, + "flos": 620808993792.0, + "grad_norm": 0.09363682514066085, + "language_loss": 1.02673674, + "learning_rate": 0.0009987265200589763, + "loss": 1.03781271, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.16235352, + "step": 155, + "time_per_iteration": 2.7394251823425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083912, + "balance_loss_mlp": 1.06821227, + "epoch": 0.030011542901115813, + "flos": 661322962944.0, + "grad_norm": 0.05837038305695058, + "language_loss": 1.02287054, + "learning_rate": 0.001, + "loss": 1.03370976, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.15686035, + "step": 156, + "time_per_iteration": 2.875622272491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091789, + "balance_loss_mlp": 1.07507551, + "epoch": 0.030203924586379376, + "flos": 651258842112.0, + "grad_norm": 0.08525763952586639, + "language_loss": 1.00171304, + "learning_rate": 0.0009999999029413921, + "loss": 1.01263094, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.16723633, + "step": 157, + "time_per_iteration": 2.8360915184020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110493, + "balance_loss_mlp": 1.09382772, + "epoch": 0.03039630627164294, + "flos": 531083091456.0, + "grad_norm": 0.08254544257661527, + "language_loss": 1.01840436, + "learning_rate": 0.0009999996117656068, + "loss": 1.02950931, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.16674805, + "step": 158, + "time_per_iteration": 2.801180124282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096617, + "balance_loss_mlp": 1.08086896, + "epoch": 0.030588687956906502, + "flos": 585914135040.0, + "grad_norm": 0.070993780506174, + "language_loss": 0.95558536, + "learning_rate": 0.0009999991264727564, + "loss": 0.96655154, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.15734863, + "step": 159, + "time_per_iteration": 2.818821668624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096163, + "balance_loss_mlp": 1.08046305, + "epoch": 0.030781069642170065, + "flos": 513026777088.0, + "grad_norm": 0.07077353312716703, + "language_loss": 1.06054807, + "learning_rate": 0.0009999984470630296, + "loss": 1.0715096, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.15686035, + "step": 160, + "time_per_iteration": 2.6040687561035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109657, + "balance_loss_mlp": 1.08113289, + "epoch": 0.030973451327433628, + "flos": 717766064640.0, + "grad_norm": 0.055279151578571405, + "language_loss": 0.94481659, + "learning_rate": 0.0009999975735366902, + "loss": 0.95578229, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.1541748, + "step": 161, + "time_per_iteration": 3.1012368202209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096261, + "balance_loss_mlp": 1.08034658, + "epoch": 0.03116583301269719, + "flos": 1109312133120.0, + "grad_norm": 0.0762466753512266, + "language_loss": 0.96279925, + "learning_rate": 0.0009999965058940775, + "loss": 0.97376186, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.15905762, + "step": 162, + "time_per_iteration": 3.5481724739074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092073, + "balance_loss_mlp": 1.07657552, + "epoch": 0.031358214697960754, + "flos": 450676403712.0, + "grad_norm": 0.0783935068916601, + "language_loss": 1.02822053, + "learning_rate": 0.0009999952441356057, + "loss": 1.03914118, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.15490723, + "step": 163, + "time_per_iteration": 2.513836145401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104121, + "balance_loss_mlp": 1.08844459, + "epoch": 0.031550596383224314, + "flos": 1254701325312.0, + "grad_norm": 0.06003254057509557, + "language_loss": 1.03039443, + "learning_rate": 0.000999993788261765, + "loss": 1.04143572, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.15661621, + "step": 164, + "time_per_iteration": 3.625434398651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097956, + "balance_loss_mlp": 1.08191097, + "epoch": 0.03174297806848788, + "flos": 667841310720.0, + "grad_norm": 0.071706058438464, + "language_loss": 1.04424524, + "learning_rate": 0.00099999213827312, + "loss": 1.0552249, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.16040039, + "step": 165, + "time_per_iteration": 2.7834768295288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111157, + "balance_loss_mlp": 1.09566009, + "epoch": 0.03193535975375144, + "flos": 551033832960.0, + "grad_norm": 0.12829100736108065, + "language_loss": 0.99657446, + "learning_rate": 0.000999990294170312, + "loss": 1.00768602, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.15478516, + "step": 166, + "time_per_iteration": 2.637387752532959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101169, + "balance_loss_mlp": 1.08545709, + "epoch": 0.032127741439015006, + "flos": 543377700864.0, + "grad_norm": 0.06852414366650764, + "language_loss": 1.03638864, + "learning_rate": 0.0009999882559540566, + "loss": 1.04740036, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.15698242, + "step": 167, + "time_per_iteration": 2.6875667572021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098336, + "balance_loss_mlp": 1.0833509, + "epoch": 0.032320123124278566, + "flos": 548104928256.0, + "grad_norm": 0.05076681603646914, + "language_loss": 1.00191641, + "learning_rate": 0.000999986023625145, + "loss": 1.01289976, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.14953613, + "step": 168, + "time_per_iteration": 2.7518744468688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03792956, + "balance_loss_mlp": 3.75500011, + "epoch": 0.03251250480954213, + "flos": 1305156865536.0, + "grad_norm": 0.6529032341502935, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.82717371, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 0.37890625, + "step": 169, + "time_per_iteration": 4.917760133743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167126, + "balance_loss_mlp": 1.15130675, + "epoch": 0.03270488649480569, + "flos": 560866609152.0, + "grad_norm": 0.09865002272530259, + "language_loss": 1.00644767, + "learning_rate": 0.0009999809766328958, + "loss": 1.01811886, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.15808105, + "step": 170, + "time_per_iteration": 2.65771746635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120248, + "balance_loss_mlp": 1.18527782, + "epoch": 0.03289726818006926, + "flos": 482120031744.0, + "grad_norm": 0.08799874436989415, + "language_loss": 1.02774751, + "learning_rate": 0.0009999781619715177, + "loss": 1.03977239, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.17211914, + "step": 171, + "time_per_iteration": 2.562926769256592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122766, + "balance_loss_mlp": 1.21033943, + "epoch": 0.03308964986533282, + "flos": 674355276288.0, + "grad_norm": 0.08542539222295185, + "language_loss": 1.02671802, + "learning_rate": 0.000999975153201402, + "loss": 1.03899455, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.17321777, + "step": 172, + "time_per_iteration": 2.8269002437591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267174, + "balance_loss_mlp": 1.24883962, + "epoch": 0.033282031550596385, + "flos": 608937785856.0, + "grad_norm": 0.120181629337785, + "language_loss": 1.00698161, + "learning_rate": 0.0009999719503237174, + "loss": 1.01965332, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.18347168, + "step": 173, + "time_per_iteration": 2.758136749267578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254087, + "balance_loss_mlp": 1.23402381, + "epoch": 0.033474413235859944, + "flos": 467801547264.0, + "grad_norm": 0.13932237496235436, + "language_loss": 1.08850026, + "learning_rate": 0.0009999685533397073, + "loss": 1.10104108, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.20056152, + "step": 174, + "time_per_iteration": 2.6060163974761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268538, + "balance_loss_mlp": 1.24870133, + "epoch": 0.03366679492112351, + "flos": 579365263872.0, + "grad_norm": 0.0855521850526334, + "language_loss": 1.01282525, + "learning_rate": 0.00099996496225069, + "loss": 1.02551055, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.19824219, + "step": 175, + "time_per_iteration": 2.6688973903656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312845, + "balance_loss_mlp": 1.29124486, + "epoch": 0.03385917660638707, + "flos": 637378315776.0, + "grad_norm": 0.0738431594221532, + "language_loss": 1.03378773, + "learning_rate": 0.0009999611770580604, + "loss": 1.04691625, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.21606445, + "step": 176, + "time_per_iteration": 2.8642566204071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01345291, + "balance_loss_mlp": 1.32329679, + "epoch": 0.03405155829165064, + "flos": 441587123712.0, + "grad_norm": 0.09985791713424727, + "language_loss": 1.02061462, + "learning_rate": 0.0009999571977632876, + "loss": 1.03406763, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.21984863, + "step": 177, + "time_per_iteration": 2.620537757873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0133899, + "balance_loss_mlp": 1.31619775, + "epoch": 0.034243939976914196, + "flos": 466097766912.0, + "grad_norm": 0.09257746092300488, + "language_loss": 1.05255055, + "learning_rate": 0.0009999530243679166, + "loss": 1.06594038, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.2277832, + "step": 178, + "time_per_iteration": 2.5526390075683594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01321119, + "balance_loss_mlp": 1.29928029, + "epoch": 0.03443632166217776, + "flos": 778919016960.0, + "grad_norm": 0.07612740556433409, + "language_loss": 1.00229979, + "learning_rate": 0.0009999486568735675, + "loss": 1.0155108, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.21850586, + "step": 179, + "time_per_iteration": 3.084320068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01314096, + "balance_loss_mlp": 1.29238796, + "epoch": 0.03462870334744132, + "flos": 1263284246016.0, + "grad_norm": 0.08380095909791664, + "language_loss": 1.00181103, + "learning_rate": 0.0009999440952819362, + "loss": 1.01495194, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.21716309, + "step": 180, + "time_per_iteration": 3.6467599868774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288371, + "balance_loss_mlp": 1.26746202, + "epoch": 0.03482108503270489, + "flos": 606899354112.0, + "grad_norm": 0.10452638314540276, + "language_loss": 1.00434995, + "learning_rate": 0.0009999393395947935, + "loss": 1.01723361, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.20935059, + "step": 181, + "time_per_iteration": 2.8092122077941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271899, + "balance_loss_mlp": 1.25226557, + "epoch": 0.03501346671796845, + "flos": 538010284032.0, + "grad_norm": 0.1078936362641923, + "language_loss": 1.03725255, + "learning_rate": 0.0009999343898139858, + "loss": 1.04997146, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.19616699, + "step": 182, + "time_per_iteration": 2.6274633407592773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260451, + "balance_loss_mlp": 1.23960137, + "epoch": 0.035205848403232015, + "flos": 518231250432.0, + "grad_norm": 0.13163794074334914, + "language_loss": 1.02352095, + "learning_rate": 0.0009999292459414348, + "loss": 1.03612542, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.20849609, + "step": 183, + "time_per_iteration": 2.5587446689605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241243, + "balance_loss_mlp": 1.22079897, + "epoch": 0.035398230088495575, + "flos": 472134486528.0, + "grad_norm": 0.11087783412260319, + "language_loss": 1.06915629, + "learning_rate": 0.0009999239079791374, + "loss": 1.08156872, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.2043457, + "step": 184, + "time_per_iteration": 2.5720386505126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264671, + "balance_loss_mlp": 1.24370217, + "epoch": 0.03559061177375914, + "flos": 511820591616.0, + "grad_norm": 0.08935796417892215, + "language_loss": 0.99749458, + "learning_rate": 0.0009999183759291659, + "loss": 1.01014113, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.2097168, + "step": 185, + "time_per_iteration": 2.7049641609191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283684, + "balance_loss_mlp": 1.26222682, + "epoch": 0.0357829934590227, + "flos": 477146903040.0, + "grad_norm": 0.1506087846083958, + "language_loss": 1.02522779, + "learning_rate": 0.0009999126497936682, + "loss": 1.03806448, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.21459961, + "step": 186, + "time_per_iteration": 2.5040838718414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266329, + "balance_loss_mlp": 1.24443007, + "epoch": 0.03597537514428627, + "flos": 644350588416.0, + "grad_norm": 0.07597181242921475, + "language_loss": 1.04941225, + "learning_rate": 0.0009999067295748676, + "loss": 1.0620755, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.21899414, + "step": 187, + "time_per_iteration": 2.8635194301605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01276828, + "balance_loss_mlp": 1.25491714, + "epoch": 0.03616775682954983, + "flos": 580916275200.0, + "grad_norm": 0.10348177684206804, + "language_loss": 1.02588224, + "learning_rate": 0.000999900615275062, + "loss": 1.03865051, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.21911621, + "step": 188, + "time_per_iteration": 2.6797780990600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272596, + "balance_loss_mlp": 1.25078082, + "epoch": 0.03636013851481339, + "flos": 382210735104.0, + "grad_norm": 0.11548780673963775, + "language_loss": 1.08482468, + "learning_rate": 0.0009998943068966256, + "loss": 1.09755063, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.21826172, + "step": 189, + "time_per_iteration": 2.446465253829956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282253, + "balance_loss_mlp": 1.25919747, + "epoch": 0.03655252020007695, + "flos": 582954706944.0, + "grad_norm": 0.10548213053156746, + "language_loss": 1.03159523, + "learning_rate": 0.0009998878044420072, + "loss": 1.04441762, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.23071289, + "step": 190, + "time_per_iteration": 2.715607166290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282371, + "balance_loss_mlp": 1.2598052, + "epoch": 0.03674490188534051, + "flos": 471376433664.0, + "grad_norm": 0.11932481378659279, + "language_loss": 0.98991239, + "learning_rate": 0.0009998811079137318, + "loss": 1.00273609, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.22558594, + "step": 191, + "time_per_iteration": 2.6401267051696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260121, + "balance_loss_mlp": 1.2387228, + "epoch": 0.03693728357060408, + "flos": 528113488896.0, + "grad_norm": 0.10247339740719702, + "language_loss": 1.0056088, + "learning_rate": 0.0009998742173143987, + "loss": 1.01821005, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.21411133, + "step": 192, + "time_per_iteration": 2.6355819702148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261897, + "balance_loss_mlp": 1.24107122, + "epoch": 0.03712966525586764, + "flos": 798657352704.0, + "grad_norm": 0.19022984523402262, + "language_loss": 1.00051641, + "learning_rate": 0.0009998671326466833, + "loss": 1.01313543, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.20837402, + "step": 193, + "time_per_iteration": 3.009938955307007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264185, + "balance_loss_mlp": 1.24278712, + "epoch": 0.037322046941131205, + "flos": 829628116992.0, + "grad_norm": 0.16347382701944235, + "language_loss": 1.01202989, + "learning_rate": 0.0009998598539133362, + "loss": 1.02467179, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.21386719, + "step": 194, + "time_per_iteration": 3.032041311264038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320641, + "balance_loss_mlp": 1.29752648, + "epoch": 0.037514428626394765, + "flos": 437460797952.0, + "grad_norm": 0.09447382654807665, + "language_loss": 1.02349281, + "learning_rate": 0.0009998523811171828, + "loss": 1.0366993, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.23132324, + "step": 195, + "time_per_iteration": 2.5140883922576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01385941, + "balance_loss_mlp": 1.36191988, + "epoch": 0.03770681031165833, + "flos": 511372459008.0, + "grad_norm": 0.174477259749112, + "language_loss": 1.02751505, + "learning_rate": 0.0009998447142611248, + "loss": 1.04137444, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.24047852, + "step": 196, + "time_per_iteration": 2.6540584564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01374932, + "balance_loss_mlp": 1.3512454, + "epoch": 0.03789919199692189, + "flos": 807102061056.0, + "grad_norm": 0.19785353386832685, + "language_loss": 0.95925725, + "learning_rate": 0.0009998368533481387, + "loss": 0.97300661, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.23657227, + "step": 197, + "time_per_iteration": 3.0361931324005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132819, + "balance_loss_mlp": 1.30602896, + "epoch": 0.03809157368218546, + "flos": 690274234368.0, + "grad_norm": 0.07201942870831356, + "language_loss": 0.98943031, + "learning_rate": 0.0009998287983812762, + "loss": 1.00271225, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.22155762, + "step": 198, + "time_per_iteration": 2.8737523555755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316145, + "balance_loss_mlp": 1.2943778, + "epoch": 0.03828395536744902, + "flos": 517675428864.0, + "grad_norm": 0.07974969111573339, + "language_loss": 1.04380584, + "learning_rate": 0.0009998205493636646, + "loss": 1.05696738, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.21789551, + "step": 199, + "time_per_iteration": 2.6439247131347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323551, + "balance_loss_mlp": 1.30098474, + "epoch": 0.038476337052712584, + "flos": 581389138944.0, + "grad_norm": 0.08769997267084173, + "language_loss": 0.97346306, + "learning_rate": 0.0009998121062985063, + "loss": 0.98669851, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.22583008, + "step": 200, + "time_per_iteration": 2.738266944885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01342622, + "balance_loss_mlp": 1.3199718, + "epoch": 0.03866871873797614, + "flos": 576791359488.0, + "grad_norm": 0.1288031319123161, + "language_loss": 0.99576765, + "learning_rate": 0.0009998034691890794, + "loss": 1.0091939, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.2265625, + "step": 201, + "time_per_iteration": 2.815068244934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01322045, + "balance_loss_mlp": 1.29940701, + "epoch": 0.03886110042323971, + "flos": 540472117248.0, + "grad_norm": 0.1480539814519598, + "language_loss": 1.04135096, + "learning_rate": 0.0009997946380387369, + "loss": 1.05457139, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.22619629, + "step": 202, + "time_per_iteration": 2.6735482215881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272248, + "balance_loss_mlp": 1.24913371, + "epoch": 0.03905348210850327, + "flos": 717694843392.0, + "grad_norm": 0.10058314649993264, + "language_loss": 1.06271195, + "learning_rate": 0.0009997856128509076, + "loss": 1.07543445, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.23132324, + "step": 203, + "time_per_iteration": 2.858497142791748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238452, + "balance_loss_mlp": 1.21574211, + "epoch": 0.039245863793766836, + "flos": 427268639232.0, + "grad_norm": 0.07713628959924962, + "language_loss": 1.01241136, + "learning_rate": 0.0009997763936290952, + "loss": 1.02479577, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.22705078, + "step": 204, + "time_per_iteration": 2.5389275550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254542, + "balance_loss_mlp": 1.22998452, + "epoch": 0.039438245479030395, + "flos": 662804163072.0, + "grad_norm": 0.10588145989282294, + "language_loss": 1.06408, + "learning_rate": 0.0009997669803768789, + "loss": 1.07662535, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.24560547, + "step": 205, + "time_per_iteration": 2.8234684467315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249653, + "balance_loss_mlp": 1.2262044, + "epoch": 0.03963062716429396, + "flos": 635063459328.0, + "grad_norm": 0.1260931618436919, + "language_loss": 1.01299226, + "learning_rate": 0.0009997573730979134, + "loss": 1.02548885, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.23461914, + "step": 206, + "time_per_iteration": 2.7586512565612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03194186, + "balance_loss_mlp": 2.85391545, + "epoch": 0.03982300884955752, + "flos": 1417813286400.0, + "grad_norm": 0.3208039945146043, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.82387388, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 3.40625, + "step": 207, + "time_per_iteration": 4.668841123580933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287285, + "balance_loss_mlp": 1.26177394, + "epoch": 0.04001539053482109, + "flos": 688769713152.0, + "grad_norm": 0.15196225676568717, + "language_loss": 1.00590456, + "learning_rate": 0.0009997375764747294, + "loss": 1.01877737, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.25512695, + "step": 208, + "time_per_iteration": 3.0460121631622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275833, + "balance_loss_mlp": 1.25076318, + "epoch": 0.04020777222008465, + "flos": 533363042304.0, + "grad_norm": 0.09666220749273949, + "language_loss": 0.97800297, + "learning_rate": 0.0009997273871381967, + "loss": 0.99076128, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.25085449, + "step": 209, + "time_per_iteration": 2.7027134895324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126115, + "balance_loss_mlp": 1.23683095, + "epoch": 0.040400153905348214, + "flos": 567661381632.0, + "grad_norm": 0.09901686865787228, + "language_loss": 1.02878523, + "learning_rate": 0.0009997170037902862, + "loss": 1.04139662, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.2434082, + "step": 210, + "time_per_iteration": 2.7203080654144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228259, + "balance_loss_mlp": 1.20371389, + "epoch": 0.040592535590611774, + "flos": 713130559488.0, + "grad_norm": 0.11653422944125434, + "language_loss": 1.0505805, + "learning_rate": 0.0009997064264350292, + "loss": 1.06286311, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.24536133, + "step": 211, + "time_per_iteration": 2.8774335384368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239303, + "balance_loss_mlp": 1.21149194, + "epoch": 0.04078491727587533, + "flos": 577824427008.0, + "grad_norm": 0.06455145782580095, + "language_loss": 0.99545413, + "learning_rate": 0.0009996956550765317, + "loss": 1.00784707, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.27770996, + "step": 212, + "time_per_iteration": 2.6957452297210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222017, + "balance_loss_mlp": 1.19556475, + "epoch": 0.0409772989611389, + "flos": 552033404928.0, + "grad_norm": 0.1270361519775568, + "language_loss": 0.94278163, + "learning_rate": 0.0009996846897189762, + "loss": 0.95500183, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.26452637, + "step": 213, + "time_per_iteration": 2.6380836963653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223712, + "balance_loss_mlp": 1.19798708, + "epoch": 0.04116968064640246, + "flos": 555347833344.0, + "grad_norm": 0.1000627367739684, + "language_loss": 1.00583601, + "learning_rate": 0.0009996735303666193, + "loss": 1.01807308, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.25720215, + "step": 214, + "time_per_iteration": 2.7703840732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205703, + "balance_loss_mlp": 1.18167019, + "epoch": 0.041362062331666026, + "flos": 578204158464.0, + "grad_norm": 0.10044224354438386, + "language_loss": 1.02544665, + "learning_rate": 0.0009996621770237937, + "loss": 1.0375036, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.24035645, + "step": 215, + "time_per_iteration": 2.747954845428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194773, + "balance_loss_mlp": 1.17049026, + "epoch": 0.041554444016929586, + "flos": 611130396672.0, + "grad_norm": 0.07439915791739656, + "language_loss": 0.98184484, + "learning_rate": 0.0009996506296949073, + "loss": 0.99379259, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.24267578, + "step": 216, + "time_per_iteration": 2.957000732421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178169, + "balance_loss_mlp": 1.15088165, + "epoch": 0.04174682570219315, + "flos": 527857413120.0, + "grad_norm": 0.07228572223559625, + "language_loss": 0.98363817, + "learning_rate": 0.0009996388883844428, + "loss": 0.99541986, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.27294922, + "step": 217, + "time_per_iteration": 2.625004529953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163207, + "balance_loss_mlp": 1.13855505, + "epoch": 0.04193920738745671, + "flos": 511258977792.0, + "grad_norm": 0.0709878545566638, + "language_loss": 1.02471972, + "learning_rate": 0.0009996269530969588, + "loss": 1.0363518, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.24645996, + "step": 218, + "time_per_iteration": 2.577202796936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153651, + "balance_loss_mlp": 1.13201451, + "epoch": 0.04213158907272028, + "flos": 571226093568.0, + "grad_norm": 0.081462998095588, + "language_loss": 1.00934064, + "learning_rate": 0.0009996148238370888, + "loss": 1.02087712, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.21655273, + "step": 219, + "time_per_iteration": 2.75849986076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128401, + "balance_loss_mlp": 1.10447621, + "epoch": 0.04232397075798384, + "flos": 963803667456.0, + "grad_norm": 0.08476688765369866, + "language_loss": 0.96862441, + "learning_rate": 0.0009996025006095421, + "loss": 0.97990847, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.23962402, + "step": 220, + "time_per_iteration": 3.316199541091919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03366003, + "balance_loss_mlp": 3.11881113, + "epoch": 0.042516352443247404, + "flos": 1468814777856.0, + "grad_norm": 0.3512460928075295, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.81149149, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 2.46875, + "step": 221, + "time_per_iteration": 5.585368633270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113685, + "balance_loss_mlp": 1.11290038, + "epoch": 0.042708734128510964, + "flos": 654419091456.0, + "grad_norm": 0.07993960649684186, + "language_loss": 0.97486591, + "learning_rate": 0.0009995772722706307, + "loss": 0.98623443, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.23950195, + "step": 222, + "time_per_iteration": 2.8408098220825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182736, + "balance_loss_mlp": 1.15682042, + "epoch": 0.04290111581377453, + "flos": 431601578496.0, + "grad_norm": 0.11511868264512252, + "language_loss": 1.11370254, + "learning_rate": 0.0009995643671690604, + "loss": 1.12553, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.25927734, + "step": 223, + "time_per_iteration": 2.4770917892456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194194, + "balance_loss_mlp": 1.16939855, + "epoch": 0.04309349749903809, + "flos": 644379701760.0, + "grad_norm": 0.13725027562770867, + "language_loss": 0.98326594, + "learning_rate": 0.0009995512681194023, + "loss": 0.99520785, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.24804688, + "step": 224, + "time_per_iteration": 2.901346445083618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011754, + "balance_loss_mlp": 1.14950812, + "epoch": 0.04328587918430166, + "flos": 830861853696.0, + "grad_norm": 0.06929706927237234, + "language_loss": 0.96731412, + "learning_rate": 0.0009995379751267417, + "loss": 0.97906816, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.25891113, + "step": 225, + "time_per_iteration": 3.238084316253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170568, + "balance_loss_mlp": 1.14375746, + "epoch": 0.043478260869565216, + "flos": 524804852736.0, + "grad_norm": 0.07435013646684872, + "language_loss": 0.98210657, + "learning_rate": 0.0009995244881962398, + "loss": 0.99381226, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.26843262, + "step": 226, + "time_per_iteration": 2.6193530559539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162667, + "balance_loss_mlp": 1.1352731, + "epoch": 0.04367064255482878, + "flos": 439253328384.0, + "grad_norm": 0.08505882003862496, + "language_loss": 0.98532695, + "learning_rate": 0.0009995108073331323, + "loss": 0.99695361, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.27416992, + "step": 227, + "time_per_iteration": 2.621875524520874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167442, + "balance_loss_mlp": 1.13921285, + "epoch": 0.04386302424009234, + "flos": 507109330944.0, + "grad_norm": 0.06754882710561792, + "language_loss": 1.01820612, + "learning_rate": 0.0009994969325427309, + "loss": 1.02988064, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.28222656, + "step": 228, + "time_per_iteration": 2.6876742839813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182366, + "balance_loss_mlp": 1.1523968, + "epoch": 0.04405540592535591, + "flos": 540432829440.0, + "grad_norm": 0.06680156886068128, + "language_loss": 0.97377843, + "learning_rate": 0.0009994828638304218, + "loss": 0.98560202, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.29980469, + "step": 229, + "time_per_iteration": 2.6631240844726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198543, + "balance_loss_mlp": 1.16969442, + "epoch": 0.04424778761061947, + "flos": 446136850944.0, + "grad_norm": 0.08411507650901279, + "language_loss": 1.03665459, + "learning_rate": 0.0009994686012016675, + "loss": 1.04864001, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.28833008, + "step": 230, + "time_per_iteration": 2.499721050262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122651, + "balance_loss_mlp": 1.19675517, + "epoch": 0.044440169295883035, + "flos": 700383435264.0, + "grad_norm": 0.09876086989002084, + "language_loss": 1.02814984, + "learning_rate": 0.000999454144662005, + "loss": 1.04041505, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.29711914, + "step": 231, + "time_per_iteration": 2.911175489425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224486, + "balance_loss_mlp": 1.19466019, + "epoch": 0.044632550981146595, + "flos": 588055873536.0, + "grad_norm": 0.10057378611284366, + "language_loss": 0.96611959, + "learning_rate": 0.0009994394942170468, + "loss": 0.97836453, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.2980957, + "step": 232, + "time_per_iteration": 2.7470107078552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012208, + "balance_loss_mlp": 1.19083118, + "epoch": 0.04482493266641016, + "flos": 554534525952.0, + "grad_norm": 0.06893435559553937, + "language_loss": 0.94648588, + "learning_rate": 0.0009994246498724808, + "loss": 0.95869386, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.29956055, + "step": 233, + "time_per_iteration": 2.7436845302581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206879, + "balance_loss_mlp": 1.17860246, + "epoch": 0.04501731435167372, + "flos": 722500646400.0, + "grad_norm": 0.08371813790363081, + "language_loss": 0.97381985, + "learning_rate": 0.00099940961163407, + "loss": 0.9858886, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.28295898, + "step": 234, + "time_per_iteration": 2.876405715942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119556, + "balance_loss_mlp": 1.16654444, + "epoch": 0.04520969603693728, + "flos": 511539784704.0, + "grad_norm": 0.08201306351282911, + "language_loss": 1.00061524, + "learning_rate": 0.0009993943795076528, + "loss": 1.01257086, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.2902832, + "step": 235, + "time_per_iteration": 2.6432723999023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168701, + "balance_loss_mlp": 1.13873136, + "epoch": 0.04540207772220085, + "flos": 364854246912.0, + "grad_norm": 0.12052684551098608, + "language_loss": 1.01575673, + "learning_rate": 0.0009993789534991427, + "loss": 1.02744377, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.29907227, + "step": 236, + "time_per_iteration": 2.4240100383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136514, + "balance_loss_mlp": 1.10954857, + "epoch": 0.045594459407464406, + "flos": 522407038464.0, + "grad_norm": 0.0561052231541492, + "language_loss": 0.96778214, + "learning_rate": 0.0009993633336145287, + "loss": 0.97914726, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.26977539, + "step": 237, + "time_per_iteration": 2.638850688934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130626, + "balance_loss_mlp": 1.10363674, + "epoch": 0.04578684109272797, + "flos": 671442338304.0, + "grad_norm": 0.06334524880145487, + "language_loss": 1.0125159, + "learning_rate": 0.0009993475198598752, + "loss": 1.02382219, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.26989746, + "step": 238, + "time_per_iteration": 3.0008814334869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109235, + "balance_loss_mlp": 1.08395052, + "epoch": 0.04597922277799153, + "flos": 541387321344.0, + "grad_norm": 0.08922144233736891, + "language_loss": 0.97379184, + "learning_rate": 0.0009993315122413212, + "loss": 0.98488414, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.25305176, + "step": 239, + "time_per_iteration": 2.620474100112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121059, + "balance_loss_mlp": 1.09551263, + "epoch": 0.0461716044632551, + "flos": 458732616192.0, + "grad_norm": 0.09980166654849132, + "language_loss": 0.97848725, + "learning_rate": 0.0009993153107650818, + "loss": 0.98969781, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.25537109, + "step": 240, + "time_per_iteration": 2.5547702312469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111352, + "balance_loss_mlp": 1.08719897, + "epoch": 0.04636398614851866, + "flos": 455009342976.0, + "grad_norm": 0.09180653876933564, + "language_loss": 0.96700346, + "learning_rate": 0.0009992989154374468, + "loss": 0.97813869, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.2635498, + "step": 241, + "time_per_iteration": 2.5366051197052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105107, + "balance_loss_mlp": 1.07833242, + "epoch": 0.046556367833782225, + "flos": 556558401024.0, + "grad_norm": 0.07962621760937992, + "language_loss": 1.03585958, + "learning_rate": 0.0009992823262647817, + "loss": 1.04691052, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26782227, + "step": 242, + "time_per_iteration": 2.726482391357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100596, + "balance_loss_mlp": 1.07384586, + "epoch": 0.046748749519045785, + "flos": 592625949696.0, + "grad_norm": 0.0814561151731407, + "language_loss": 0.97787237, + "learning_rate": 0.0009992655432535264, + "loss": 0.98887837, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.26782227, + "step": 243, + "time_per_iteration": 2.765273332595825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098204, + "balance_loss_mlp": 1.07214487, + "epoch": 0.04694113120430935, + "flos": 569596506624.0, + "grad_norm": 0.0750228199707575, + "language_loss": 0.98452473, + "learning_rate": 0.0009992485664101973, + "loss": 0.99550676, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.26037598, + "step": 244, + "time_per_iteration": 2.696781635284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_mlp": 1.08732188, + "epoch": 0.04713351288957291, + "flos": 863401158144.0, + "grad_norm": 0.08629455000399752, + "language_loss": 1.00806224, + "learning_rate": 0.000999231395741385, + "loss": 1.01922584, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.2902832, + "step": 245, + "time_per_iteration": 3.1403207778930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117221, + "balance_loss_mlp": 1.08958876, + "epoch": 0.04732589457483648, + "flos": 536961249792.0, + "grad_norm": 0.07729478564770192, + "language_loss": 0.986202, + "learning_rate": 0.0009992140312537557, + "loss": 0.99737418, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.27661133, + "step": 246, + "time_per_iteration": 2.7038867473602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111137, + "balance_loss_mlp": 1.08410013, + "epoch": 0.04751827626010004, + "flos": 761566910976.0, + "grad_norm": 0.08592122791377885, + "language_loss": 0.93525487, + "learning_rate": 0.000999196472954051, + "loss": 0.94636625, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.27050781, + "step": 247, + "time_per_iteration": 2.9575722217559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0471772, + "balance_loss_mlp": 4.51020002, + "epoch": 0.0477106579453636, + "flos": 1578961313280.0, + "grad_norm": 0.4683520251238934, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.84142572, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 2.078125, + "step": 248, + "time_per_iteration": 5.452638387680054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200681, + "balance_loss_mlp": 1.17252362, + "epoch": 0.04790303963062716, + "flos": 457535195136.0, + "grad_norm": 0.13106789232715058, + "language_loss": 1.01118052, + "learning_rate": 0.0009991607749457578, + "loss": 1.02318728, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.28173828, + "step": 249, + "time_per_iteration": 2.5066423416137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256525, + "balance_loss_mlp": 1.22541094, + "epoch": 0.04809542131589073, + "flos": 782079266304.0, + "grad_norm": 0.1327983626735717, + "language_loss": 0.98959935, + "learning_rate": 0.0009991426352510286, + "loss": 1.0021646, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.31103516, + "step": 250, + "time_per_iteration": 3.0130999088287354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250303, + "balance_loss_mlp": 1.22023845, + "epoch": 0.04828780300115429, + "flos": 558995503104.0, + "grad_norm": 0.11435576550904086, + "language_loss": 1.00191545, + "learning_rate": 0.0009991243017719422, + "loss": 1.01441836, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.30053711, + "step": 251, + "time_per_iteration": 2.6584134101867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190864, + "balance_loss_mlp": 1.16108572, + "epoch": 0.048480184686417856, + "flos": 501682277376.0, + "grad_norm": 0.08343855539664048, + "language_loss": 0.94829702, + "learning_rate": 0.0009991057745156165, + "loss": 0.96020567, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.29760742, + "step": 252, + "time_per_iteration": 2.6125926971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03043524, + "balance_loss_mlp": 2.97905564, + "epoch": 0.048672566371681415, + "flos": 1535585430528.0, + "grad_norm": 0.48807257564671885, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.84954512, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 0.64453125, + "step": 253, + "time_per_iteration": 5.0318169593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205448, + "balance_loss_mlp": 1.17426276, + "epoch": 0.04886494805694498, + "flos": 537665458176.0, + "grad_norm": 0.15081419889398517, + "language_loss": 1.02692831, + "learning_rate": 0.0009990681387000943, + "loss": 1.03898275, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.31152344, + "step": 254, + "time_per_iteration": 2.7569847106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231579, + "balance_loss_mlp": 1.20053661, + "epoch": 0.04905732974220854, + "flos": 679841966592.0, + "grad_norm": 0.10308088004196624, + "language_loss": 0.98562324, + "learning_rate": 0.0009990490301555093, + "loss": 0.99793905, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.31054688, + "step": 255, + "time_per_iteration": 2.9826152324676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01973911, + "balance_loss_mlp": 1.89609146, + "epoch": 0.04924971142747211, + "flos": 1420408949760.0, + "grad_norm": 0.14603633134579833, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.8118906, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.77734375, + "step": 256, + "time_per_iteration": 4.873262643814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01994546, + "balance_loss_mlp": 1.91596293, + "epoch": 0.04944209311273567, + "flos": 1557202074624.0, + "grad_norm": 0.1290240934598827, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.81237286, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.78515625, + "step": 257, + "time_per_iteration": 4.981585502624512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01945028, + "balance_loss_mlp": 1.87979627, + "epoch": 0.04963447479799923, + "flos": 1569985514496.0, + "grad_norm": 0.10634084131038181, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71920907, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.65234375, + "step": 258, + "time_per_iteration": 4.869063138961792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231874, + "balance_loss_mlp": 1.20192897, + "epoch": 0.049826856483262794, + "flos": 625063357440.0, + "grad_norm": 0.1721871775998346, + "language_loss": 0.93400717, + "learning_rate": 0.0009989706585723202, + "loss": 0.9463259, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.29956055, + "step": 259, + "time_per_iteration": 2.828618049621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226271, + "balance_loss_mlp": 1.1963017, + "epoch": 0.05001923816852635, + "flos": 503912765952.0, + "grad_norm": 0.13941406884376095, + "language_loss": 0.9926306, + "learning_rate": 0.0009989505813633442, + "loss": 1.0048933, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.29931641, + "step": 260, + "time_per_iteration": 2.7033097743988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167993, + "balance_loss_mlp": 1.13833416, + "epoch": 0.05021161985378992, + "flos": 587066476032.0, + "grad_norm": 0.078052738900574, + "language_loss": 0.99695522, + "learning_rate": 0.000998930310444573, + "loss": 1.00863528, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.29663086, + "step": 261, + "time_per_iteration": 2.739182949066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120344, + "balance_loss_mlp": 1.09104276, + "epoch": 0.05040400153905348, + "flos": 633029409792.0, + "grad_norm": 0.10502347912179442, + "language_loss": 0.97120214, + "learning_rate": 0.0009989098458238765, + "loss": 0.98240554, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.29296875, + "step": 262, + "time_per_iteration": 2.81984806060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109888, + "balance_loss_mlp": 1.07910872, + "epoch": 0.050596383224317046, + "flos": 553344307200.0, + "grad_norm": 0.1022419163820973, + "language_loss": 0.96531391, + "learning_rate": 0.0009988891875091998, + "loss": 0.97641277, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.30761719, + "step": 263, + "time_per_iteration": 2.816471576690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119537, + "balance_loss_mlp": 1.08949661, + "epoch": 0.050788764909580605, + "flos": 549389689344.0, + "grad_norm": 0.07930699495869925, + "language_loss": 0.91512978, + "learning_rate": 0.0009988683355085636, + "loss": 0.92632508, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.30004883, + "step": 264, + "time_per_iteration": 2.7963876724243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116935, + "balance_loss_mlp": 1.1386174, + "epoch": 0.05098114659484417, + "flos": 604812870144.0, + "grad_norm": 0.1164382368145933, + "language_loss": 1.00062299, + "learning_rate": 0.000998847289830063, + "loss": 1.01231647, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.30688477, + "step": 265, + "time_per_iteration": 2.8219666481018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180582, + "balance_loss_mlp": 1.14922965, + "epoch": 0.05117352828010773, + "flos": 438317775360.0, + "grad_norm": 0.14769195776656788, + "language_loss": 0.92838919, + "learning_rate": 0.0009988260504818682, + "loss": 0.94019508, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.31323242, + "step": 266, + "time_per_iteration": 2.527987480163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159175, + "balance_loss_mlp": 1.12753642, + "epoch": 0.0513659099653713, + "flos": 504784300032.0, + "grad_norm": 0.1223822648996979, + "language_loss": 0.99088645, + "learning_rate": 0.000998804617472226, + "loss": 1.00247824, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.31616211, + "step": 267, + "time_per_iteration": 2.6469640731811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129085, + "balance_loss_mlp": 1.09735131, + "epoch": 0.05155829165063486, + "flos": 695183344128.0, + "grad_norm": 0.09065118463065669, + "language_loss": 0.94319087, + "learning_rate": 0.0009987829908094568, + "loss": 0.95448172, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.31713867, + "step": 268, + "time_per_iteration": 2.821777105331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131208, + "balance_loss_mlp": 1.10014248, + "epoch": 0.051750673335898424, + "flos": 1347751830528.0, + "grad_norm": 0.11182301329739544, + "language_loss": 1.00247467, + "learning_rate": 0.0009987611705019569, + "loss": 1.01378679, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.31030273, + "step": 269, + "time_per_iteration": 4.288902521133423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117374, + "balance_loss_mlp": 1.08509207, + "epoch": 0.051943055021161984, + "flos": 489362936832.0, + "grad_norm": 0.06856601771993416, + "language_loss": 0.99786204, + "learning_rate": 0.0009987391565581978, + "loss": 1.00903583, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.32275391, + "step": 270, + "time_per_iteration": 2.634683132171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119573, + "balance_loss_mlp": 1.08681393, + "epoch": 0.05213543670642555, + "flos": 545504882688.0, + "grad_norm": 0.08930504281721281, + "language_loss": 0.92515171, + "learning_rate": 0.000998716948986726, + "loss": 0.93634748, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.32763672, + "step": 271, + "time_per_iteration": 2.7899389266967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120606, + "balance_loss_mlp": 1.08970654, + "epoch": 0.05232781839168911, + "flos": 603285179904.0, + "grad_norm": 0.10701715244821809, + "language_loss": 0.94677854, + "learning_rate": 0.0009986945477961633, + "loss": 0.95798463, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.30859375, + "step": 272, + "time_per_iteration": 2.703118324279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108581, + "balance_loss_mlp": 1.07789683, + "epoch": 0.052520200076952676, + "flos": 538218307584.0, + "grad_norm": 0.050944004487463904, + "language_loss": 1.00078344, + "learning_rate": 0.0009986719529952066, + "loss": 1.01186931, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.30639648, + "step": 273, + "time_per_iteration": 2.85548734664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097085, + "balance_loss_mlp": 1.06668699, + "epoch": 0.052712581762216236, + "flos": 463148513280.0, + "grad_norm": 0.06235958359183371, + "language_loss": 0.99016273, + "learning_rate": 0.000998649164592628, + "loss": 1.00113368, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.3034668, + "step": 274, + "time_per_iteration": 2.5841050148010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104755, + "balance_loss_mlp": 1.07507145, + "epoch": 0.0529049634474798, + "flos": 547749927936.0, + "grad_norm": 0.10062534885586208, + "language_loss": 0.96764064, + "learning_rate": 0.0009986261825972748, + "loss": 0.97868812, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.29663086, + "step": 275, + "time_per_iteration": 2.6752514839172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107504, + "balance_loss_mlp": 1.07798743, + "epoch": 0.05309734513274336, + "flos": 617727320064.0, + "grad_norm": 0.08071716286169645, + "language_loss": 0.98941195, + "learning_rate": 0.000998603007018069, + "loss": 1.00048697, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.29541016, + "step": 276, + "time_per_iteration": 2.8236005306243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118213, + "balance_loss_mlp": 1.08767152, + "epoch": 0.05328972681800693, + "flos": 605220304896.0, + "grad_norm": 0.07622563991542974, + "language_loss": 0.96909779, + "learning_rate": 0.0009985796378640089, + "loss": 0.98027998, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.30517578, + "step": 277, + "time_per_iteration": 2.7089598178863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110773, + "balance_loss_mlp": 1.07940567, + "epoch": 0.05348210850327049, + "flos": 604197411840.0, + "grad_norm": 0.07841820465234402, + "language_loss": 0.95740211, + "learning_rate": 0.0009985560751441665, + "loss": 0.96847939, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.28320312, + "step": 278, + "time_per_iteration": 2.834015369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108783, + "balance_loss_mlp": 1.07831299, + "epoch": 0.053674490188534055, + "flos": 630480236544.0, + "grad_norm": 0.07361828218816212, + "language_loss": 0.9799974, + "learning_rate": 0.00099853231886769, + "loss": 0.99108523, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.30444336, + "step": 279, + "time_per_iteration": 2.8200764656066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108628, + "balance_loss_mlp": 1.07937431, + "epoch": 0.053866871873797614, + "flos": 478939433472.0, + "grad_norm": 0.07512382427920342, + "language_loss": 0.98746061, + "learning_rate": 0.0009985083690438024, + "loss": 0.99854696, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.29223633, + "step": 280, + "time_per_iteration": 2.75639271736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113716, + "balance_loss_mlp": 1.08310306, + "epoch": 0.054059253559061174, + "flos": 787673645568.0, + "grad_norm": 0.09326847112688041, + "language_loss": 0.89231437, + "learning_rate": 0.0009984842256818016, + "loss": 0.90345156, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.3059082, + "step": 281, + "time_per_iteration": 3.0839526653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121888, + "balance_loss_mlp": 1.09182298, + "epoch": 0.05425163524432474, + "flos": 628076630016.0, + "grad_norm": 0.062071298051891176, + "language_loss": 0.99695373, + "learning_rate": 0.0009984598887910613, + "loss": 1.00817263, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.30029297, + "step": 282, + "time_per_iteration": 2.7197024822235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123523, + "balance_loss_mlp": 1.09283888, + "epoch": 0.0544440169295883, + "flos": 615453161472.0, + "grad_norm": 0.08448232068887077, + "language_loss": 0.95169044, + "learning_rate": 0.0009984353583810297, + "loss": 0.96292561, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.30664062, + "step": 283, + "time_per_iteration": 2.8440537452697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127605, + "balance_loss_mlp": 1.09811282, + "epoch": 0.05463639861485187, + "flos": 647471549952.0, + "grad_norm": 0.07597313108733957, + "language_loss": 0.97190034, + "learning_rate": 0.0009984106344612302, + "loss": 0.98317641, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.29492188, + "step": 284, + "time_per_iteration": 2.7592926025390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139329, + "balance_loss_mlp": 1.10843039, + "epoch": 0.054828780300115426, + "flos": 796845883392.0, + "grad_norm": 0.08116128158624439, + "language_loss": 0.93187618, + "learning_rate": 0.0009983857170412615, + "loss": 0.94326949, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.30859375, + "step": 285, + "time_per_iteration": 2.99845027923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151704, + "balance_loss_mlp": 1.12080526, + "epoch": 0.05502116198537899, + "flos": 549414420480.0, + "grad_norm": 0.07339397608587311, + "language_loss": 0.92728812, + "learning_rate": 0.000998360606130798, + "loss": 0.93880516, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.30859375, + "step": 286, + "time_per_iteration": 2.835510492324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.020519, + "balance_loss_mlp": 2.03492451, + "epoch": 0.05521354367064255, + "flos": 1406967791616.0, + "grad_norm": 0.132236598943482, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71125019, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.16992188, + "step": 287, + "time_per_iteration": 4.860529184341431 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144586, + "balance_loss_mlp": 1.11304367, + "epoch": 0.05540592535590612, + "flos": 645123197952.0, + "grad_norm": 0.09086643312306038, + "language_loss": 0.98494267, + "learning_rate": 0.0009983098038774552, + "loss": 0.99638855, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.31518555, + "step": 288, + "time_per_iteration": 2.7743642330169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0188948, + "balance_loss_mlp": 1.87336278, + "epoch": 0.05559830704116968, + "flos": 1510293413376.0, + "grad_norm": 0.09551417356683237, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.80059707, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.16113281, + "step": 289, + "time_per_iteration": 4.792251348495483 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132847, + "balance_loss_mlp": 1.10242462, + "epoch": 0.055790688726433245, + "flos": 508078379520.0, + "grad_norm": 0.0647793178171594, + "language_loss": 0.95675349, + "learning_rate": 0.0009982582277800948, + "loss": 0.96808195, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.30371094, + "step": 290, + "time_per_iteration": 2.6280908584594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129981, + "balance_loss_mlp": 1.09931993, + "epoch": 0.055983070411696804, + "flos": 657570576384.0, + "grad_norm": 0.06216394577533418, + "language_loss": 1.02967191, + "learning_rate": 0.0009982321495648908, + "loss": 1.04097176, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.30639648, + "step": 291, + "time_per_iteration": 2.823817491531372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152465, + "balance_loss_mlp": 1.11880052, + "epoch": 0.05617545209696037, + "flos": 587051919360.0, + "grad_norm": 0.0720353654192766, + "language_loss": 0.94905466, + "learning_rate": 0.0009982058779188115, + "loss": 0.96057928, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.33666992, + "step": 292, + "time_per_iteration": 2.716226577758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143466, + "balance_loss_mlp": 1.11175609, + "epoch": 0.05636783378222393, + "flos": 611331217920.0, + "grad_norm": 0.0752196942414692, + "language_loss": 1.02053797, + "learning_rate": 0.0009981794128520567, + "loss": 1.03197265, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.31689453, + "step": 293, + "time_per_iteration": 2.80366587638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140969, + "balance_loss_mlp": 1.10878265, + "epoch": 0.0565602154674875, + "flos": 667847102976.0, + "grad_norm": 0.08694547176554791, + "language_loss": 0.9927811, + "learning_rate": 0.000998152754374901, + "loss": 1.0041908, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.32202148, + "step": 294, + "time_per_iteration": 2.8787214756011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125522, + "balance_loss_mlp": 1.09493268, + "epoch": 0.05675259715275106, + "flos": 616963474944.0, + "grad_norm": 0.06320951422559969, + "language_loss": 0.95261526, + "learning_rate": 0.0009981259024976943, + "loss": 0.96387053, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.30566406, + "step": 295, + "time_per_iteration": 2.709763765335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130922, + "balance_loss_mlp": 1.1013341, + "epoch": 0.05694497883801462, + "flos": 751424214528.0, + "grad_norm": 0.09363516749561916, + "language_loss": 0.92460728, + "learning_rate": 0.0009980988572308612, + "loss": 0.93591654, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.2956543, + "step": 296, + "time_per_iteration": 2.975036859512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106731, + "balance_loss_mlp": 1.07781124, + "epoch": 0.05713736052327818, + "flos": 711669708288.0, + "grad_norm": 0.09684297288520326, + "language_loss": 0.95852935, + "learning_rate": 0.0009980716185849015, + "loss": 0.96959662, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.28881836, + "step": 297, + "time_per_iteration": 2.9913201332092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121697, + "balance_loss_mlp": 1.09196591, + "epoch": 0.05732974220854175, + "flos": 468737100288.0, + "grad_norm": 0.06404931541311756, + "language_loss": 0.92133576, + "learning_rate": 0.0009980441865703904, + "loss": 0.9325527, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.29711914, + "step": 298, + "time_per_iteration": 2.660911798477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118174, + "balance_loss_mlp": 1.08896804, + "epoch": 0.05752212389380531, + "flos": 601143441408.0, + "grad_norm": 0.07725734784298466, + "language_loss": 1.00405884, + "learning_rate": 0.000998016561197978, + "loss": 1.01524067, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.29150391, + "step": 299, + "time_per_iteration": 2.7028987407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115684, + "balance_loss_mlp": 1.0875026, + "epoch": 0.057714505579068875, + "flos": 678344799744.0, + "grad_norm": 0.0924919324941274, + "language_loss": 0.92369866, + "learning_rate": 0.0009979887424783895, + "loss": 0.93485552, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.28173828, + "step": 300, + "time_per_iteration": 2.920323610305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121145, + "balance_loss_mlp": 1.09222448, + "epoch": 0.057906887264332435, + "flos": 595604316672.0, + "grad_norm": 0.08285851214595771, + "language_loss": 0.91748977, + "learning_rate": 0.0009979607304224248, + "loss": 0.92870122, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.2890625, + "step": 301, + "time_per_iteration": 2.725109815597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124265, + "balance_loss_mlp": 1.09512997, + "epoch": 0.058099268949596, + "flos": 551855904768.0, + "grad_norm": 0.08389393001078431, + "language_loss": 0.98122084, + "learning_rate": 0.000997932525040959, + "loss": 0.99246347, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.29101562, + "step": 302, + "time_per_iteration": 2.6472513675689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102086, + "balance_loss_mlp": 1.07419097, + "epoch": 0.05829165063485956, + "flos": 507906671616.0, + "grad_norm": 0.09664842170862178, + "language_loss": 1.00482607, + "learning_rate": 0.000997904126344943, + "loss": 1.01584697, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.27880859, + "step": 303, + "time_per_iteration": 2.6413466930389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108073, + "balance_loss_mlp": 1.07920086, + "epoch": 0.05848403232012313, + "flos": 614949774336.0, + "grad_norm": 0.07742483031734765, + "language_loss": 0.96304786, + "learning_rate": 0.0009978755343454018, + "loss": 0.9741286, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.28881836, + "step": 304, + "time_per_iteration": 2.7825212478637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108843, + "balance_loss_mlp": 1.0789448, + "epoch": 0.05867641400538669, + "flos": 499835902464.0, + "grad_norm": 0.09214287188489759, + "language_loss": 0.97051907, + "learning_rate": 0.0009978467490534355, + "loss": 0.98160744, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.29858398, + "step": 305, + "time_per_iteration": 2.625734806060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105646, + "balance_loss_mlp": 1.0759151, + "epoch": 0.05886879569065025, + "flos": 531019072512.0, + "grad_norm": 0.07804737007565601, + "language_loss": 0.94819117, + "learning_rate": 0.00099781777048022, + "loss": 0.95924759, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.296875, + "step": 306, + "time_per_iteration": 2.71183180809021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095659, + "balance_loss_mlp": 1.06554723, + "epoch": 0.05906117737591381, + "flos": 488811497472.0, + "grad_norm": 0.08882969665455022, + "language_loss": 0.96051329, + "learning_rate": 0.0009977885986370057, + "loss": 0.97146988, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.30126953, + "step": 307, + "time_per_iteration": 2.551680088043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101228, + "balance_loss_mlp": 1.0711869, + "epoch": 0.05925355906117737, + "flos": 591213150720.0, + "grad_norm": 0.07969081592203556, + "language_loss": 0.92546368, + "learning_rate": 0.000997759233535118, + "loss": 0.93647587, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.30029297, + "step": 308, + "time_per_iteration": 2.81258487701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118732, + "balance_loss_mlp": 1.08861959, + "epoch": 0.05944594074644094, + "flos": 563373522432.0, + "grad_norm": 0.08786467203130244, + "language_loss": 0.97749913, + "learning_rate": 0.0009977296751859576, + "loss": 0.98868644, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.30102539, + "step": 309, + "time_per_iteration": 2.7263362407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105319, + "balance_loss_mlp": 1.07611227, + "epoch": 0.0596383224317045, + "flos": 538483147776.0, + "grad_norm": 0.06446924521708428, + "language_loss": 1.00202072, + "learning_rate": 0.0009976999236009998, + "loss": 1.01307392, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.29174805, + "step": 310, + "time_per_iteration": 2.762798309326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104, + "balance_loss_mlp": 1.0751754, + "epoch": 0.059830704116968066, + "flos": 560684726784.0, + "grad_norm": 0.07707725190270151, + "language_loss": 1.00980616, + "learning_rate": 0.0009976699787917955, + "loss": 1.02084613, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.2878418, + "step": 311, + "time_per_iteration": 2.681075096130371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02018517, + "balance_loss_mlp": 1.99772644, + "epoch": 0.060023085802231625, + "flos": 1569759962112.0, + "grad_norm": 0.13809188064678232, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.75461507, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.20800781, + "step": 312, + "time_per_iteration": 4.931787014007568 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115143, + "balance_loss_mlp": 1.08445871, + "epoch": 0.06021546748749519, + "flos": 482415395328.0, + "grad_norm": 0.08749443672960691, + "language_loss": 0.93570709, + "learning_rate": 0.0009976095095472243, + "loss": 0.94685858, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.30688477, + "step": 313, + "time_per_iteration": 2.5869529247283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101349, + "balance_loss_mlp": 1.07152247, + "epoch": 0.06040784917275875, + "flos": 619889407488.0, + "grad_norm": 0.1052711311589574, + "language_loss": 0.94373065, + "learning_rate": 0.0009975789851353334, + "loss": 0.95474416, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.29785156, + "step": 314, + "time_per_iteration": 2.825021505355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091881, + "balance_loss_mlp": 1.06434321, + "epoch": 0.06060023085802232, + "flos": 483292721664.0, + "grad_norm": 0.0790023799752532, + "language_loss": 0.96930784, + "learning_rate": 0.0009975482675461487, + "loss": 0.98022664, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.27563477, + "step": 315, + "time_per_iteration": 2.657176971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092449, + "balance_loss_mlp": 1.06493592, + "epoch": 0.06079261254328588, + "flos": 581620483584.0, + "grad_norm": 0.08103250083402935, + "language_loss": 0.94523442, + "learning_rate": 0.0009975173567915952, + "loss": 0.95615894, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.27502441, + "step": 316, + "time_per_iteration": 2.7485179901123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087945, + "balance_loss_mlp": 1.06031179, + "epoch": 0.060984994228549444, + "flos": 687492306432.0, + "grad_norm": 0.09749512289660646, + "language_loss": 0.88217789, + "learning_rate": 0.000997486252883674, + "loss": 0.89305735, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.27685547, + "step": 317, + "time_per_iteration": 2.848203420639038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083831, + "balance_loss_mlp": 1.05665123, + "epoch": 0.061177375913813004, + "flos": 1314284327424.0, + "grad_norm": 0.0666962391969605, + "language_loss": 0.94262481, + "learning_rate": 0.0009974549558344602, + "loss": 0.95346314, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.27197266, + "step": 318, + "time_per_iteration": 3.6451311111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095985, + "balance_loss_mlp": 1.06921029, + "epoch": 0.06136975759907657, + "flos": 574072040448.0, + "grad_norm": 0.08376464388690433, + "language_loss": 1.02536392, + "learning_rate": 0.000997423465656105, + "loss": 1.03632367, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.26831055, + "step": 319, + "time_per_iteration": 2.7345075607299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091659, + "balance_loss_mlp": 1.06395483, + "epoch": 0.06156213928434013, + "flos": 527281242624.0, + "grad_norm": 0.0893807265100656, + "language_loss": 1.00347686, + "learning_rate": 0.0009973917823608335, + "loss": 1.01439345, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.27734375, + "step": 320, + "time_per_iteration": 2.59576153755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092889, + "balance_loss_mlp": 1.0656141, + "epoch": 0.061754520969603696, + "flos": 495238123008.0, + "grad_norm": 0.0805868867251315, + "language_loss": 0.95831037, + "learning_rate": 0.0009973599059609462, + "loss": 0.96923929, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.27294922, + "step": 321, + "time_per_iteration": 2.7188515663146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098538, + "balance_loss_mlp": 1.07090497, + "epoch": 0.061946902654867256, + "flos": 439839673344.0, + "grad_norm": 0.07327098118113982, + "language_loss": 0.93067813, + "learning_rate": 0.000997327836468819, + "loss": 0.94166344, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.27685547, + "step": 322, + "time_per_iteration": 2.6020476818084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112868, + "balance_loss_mlp": 1.08469939, + "epoch": 0.06213928434013082, + "flos": 598490961408.0, + "grad_norm": 0.08699924077148347, + "language_loss": 0.95677376, + "learning_rate": 0.000997295573896902, + "loss": 0.96790254, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.28137207, + "step": 323, + "time_per_iteration": 2.829726457595825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01600081, + "balance_loss_mlp": 1.58253336, + "epoch": 0.06233166602539438, + "flos": 1449393716736.0, + "grad_norm": 0.0733345350087818, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.82796121, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.17578125, + "step": 324, + "time_per_iteration": 4.711047410964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01522296, + "balance_loss_mlp": 1.50503409, + "epoch": 0.06252404771065795, + "flos": 1462504453632.0, + "grad_norm": 0.05691363452686859, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80094236, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.17285156, + "step": 325, + "time_per_iteration": 4.9186623096466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221563, + "balance_loss_mlp": 1.19023478, + "epoch": 0.06271642939592151, + "flos": 464059335168.0, + "grad_norm": 0.14041524981394118, + "language_loss": 0.90815508, + "learning_rate": 0.000997197627828043, + "loss": 0.9203707, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.31323242, + "step": 326, + "time_per_iteration": 2.5453081130981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200774, + "balance_loss_mlp": 1.17032802, + "epoch": 0.06290881108118507, + "flos": 532111776768.0, + "grad_norm": 0.12119005069833769, + "language_loss": 0.85965139, + "learning_rate": 0.0009971645930629716, + "loss": 0.87165916, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.30419922, + "step": 327, + "time_per_iteration": 2.7031009197235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169691, + "balance_loss_mlp": 1.13969803, + "epoch": 0.06310119276644863, + "flos": 673262572032.0, + "grad_norm": 0.07816671551275867, + "language_loss": 0.99088198, + "learning_rate": 0.0009971313652814872, + "loss": 1.00257885, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.29956055, + "step": 328, + "time_per_iteration": 2.8222203254699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157161, + "balance_loss_mlp": 1.12542796, + "epoch": 0.0632935744517122, + "flos": 770404497408.0, + "grad_norm": 0.09350719298211221, + "language_loss": 0.96469927, + "learning_rate": 0.0009970979444964903, + "loss": 0.97627091, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.31713867, + "step": 329, + "time_per_iteration": 2.965010643005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142135, + "balance_loss_mlp": 1.11214232, + "epoch": 0.06348595613697576, + "flos": 561649393152.0, + "grad_norm": 0.10929900711039164, + "language_loss": 0.9773742, + "learning_rate": 0.0009970643307209556, + "loss": 0.98879552, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.29980469, + "step": 330, + "time_per_iteration": 2.816967248916626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122149, + "balance_loss_mlp": 1.09279943, + "epoch": 0.06367833782223932, + "flos": 675891730944.0, + "grad_norm": 0.09151857562667157, + "language_loss": 0.94555062, + "learning_rate": 0.0009970305239679334, + "loss": 0.95677209, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.29321289, + "step": 331, + "time_per_iteration": 2.8171606063842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103955, + "balance_loss_mlp": 1.07594109, + "epoch": 0.06387071950750288, + "flos": 495035891712.0, + "grad_norm": 0.0852127129346853, + "language_loss": 0.98894572, + "learning_rate": 0.0009969965242505483, + "loss": 0.99998534, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.28027344, + "step": 332, + "time_per_iteration": 2.663892984390259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110112, + "balance_loss_mlp": 1.08111989, + "epoch": 0.06406310119276645, + "flos": 533170985472.0, + "grad_norm": 0.06505292490812643, + "language_loss": 0.94837928, + "learning_rate": 0.0009969623315820007, + "loss": 0.9594804, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.28979492, + "step": 333, + "time_per_iteration": 2.7053513526916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100319, + "balance_loss_mlp": 1.07256722, + "epoch": 0.06425548287803001, + "flos": 455940513792.0, + "grad_norm": 0.09842187194277592, + "language_loss": 0.95016736, + "learning_rate": 0.000996927945975565, + "loss": 0.96117055, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.27758789, + "step": 334, + "time_per_iteration": 2.599308490753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113066, + "balance_loss_mlp": 1.08405077, + "epoch": 0.06444786456329357, + "flos": 559817574912.0, + "grad_norm": 0.0758688902805758, + "language_loss": 0.9173829, + "learning_rate": 0.0009968933674445906, + "loss": 0.92851353, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.29003906, + "step": 335, + "time_per_iteration": 2.6885735988616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117272, + "balance_loss_mlp": 1.08863783, + "epoch": 0.06464024624855713, + "flos": 665769383424.0, + "grad_norm": 0.08483114639707492, + "language_loss": 0.94787967, + "learning_rate": 0.0009968585960025028, + "loss": 0.95905232, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.28613281, + "step": 336, + "time_per_iteration": 3.0145304203033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01664619, + "balance_loss_mlp": 1.64468718, + "epoch": 0.0648326279338207, + "flos": 1520578704384.0, + "grad_norm": 0.07989076612991787, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.79317814, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.19921875, + "step": 337, + "time_per_iteration": 4.812415361404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113857, + "balance_loss_mlp": 1.08729684, + "epoch": 0.06502500961908426, + "flos": 1142872768512.0, + "grad_norm": 0.10710041073234706, + "language_loss": 0.93311036, + "learning_rate": 0.0009967884744390583, + "loss": 0.94424891, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.26611328, + "step": 338, + "time_per_iteration": 3.551198959350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010994, + "balance_loss_mlp": 1.07226825, + "epoch": 0.06521739130434782, + "flos": 582339248640.0, + "grad_norm": 0.09192445713744875, + "language_loss": 0.93620086, + "learning_rate": 0.0009967531243449256, + "loss": 0.94719481, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.27148438, + "step": 339, + "time_per_iteration": 2.659802198410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093825, + "balance_loss_mlp": 1.06592965, + "epoch": 0.06540977298961138, + "flos": 497398800384.0, + "grad_norm": 0.08159898153834201, + "language_loss": 1.01212323, + "learning_rate": 0.000996717581394126, + "loss": 1.02306151, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.27905273, + "step": 340, + "time_per_iteration": 2.570789337158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085985, + "balance_loss_mlp": 1.05887651, + "epoch": 0.06560215467487496, + "flos": 542613855744.0, + "grad_norm": 0.08632134404445381, + "language_loss": 1.01338696, + "learning_rate": 0.000996681845600459, + "loss": 1.02424693, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.27124023, + "step": 341, + "time_per_iteration": 2.676576852798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092801, + "balance_loss_mlp": 1.06526327, + "epoch": 0.06579453636013852, + "flos": 413230961664.0, + "grad_norm": 0.09337377055156564, + "language_loss": 0.93410671, + "learning_rate": 0.0009966459169777982, + "loss": 0.94503474, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.27563477, + "step": 342, + "time_per_iteration": 2.5015692710876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093959, + "balance_loss_mlp": 1.06565928, + "epoch": 0.06598691804540208, + "flos": 560354457600.0, + "grad_norm": 0.06741983677161045, + "language_loss": 1.02151966, + "learning_rate": 0.0009966097955400924, + "loss": 1.03245926, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.28320312, + "step": 343, + "time_per_iteration": 2.679197311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108203, + "balance_loss_mlp": 1.054111, + "epoch": 0.06617929973066564, + "flos": 571789117440.0, + "grad_norm": 0.10243167176705169, + "language_loss": 0.95901835, + "learning_rate": 0.0009965734813013652, + "loss": 0.96983862, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.27954102, + "step": 344, + "time_per_iteration": 2.7926278114318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094237, + "balance_loss_mlp": 1.06638968, + "epoch": 0.06637168141592921, + "flos": 490234470912.0, + "grad_norm": 0.07573309355987462, + "language_loss": 0.97904384, + "learning_rate": 0.0009965369742757151, + "loss": 0.98998624, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.27856445, + "step": 345, + "time_per_iteration": 2.5709216594696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094661, + "balance_loss_mlp": 1.06564522, + "epoch": 0.06656406310119277, + "flos": 1078735656960.0, + "grad_norm": 0.07452264052062355, + "language_loss": 0.94766545, + "learning_rate": 0.0009965002744773152, + "loss": 0.95861208, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.28979492, + "step": 346, + "time_per_iteration": 3.500114679336548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110333, + "balance_loss_mlp": 1.0740993, + "epoch": 0.06675644478645633, + "flos": 513421065216.0, + "grad_norm": 0.06770544307121987, + "language_loss": 0.92343372, + "learning_rate": 0.0009964633819204139, + "loss": 0.93446708, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.29223633, + "step": 347, + "time_per_iteration": 2.660534143447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01495519, + "balance_loss_mlp": 1.47739971, + "epoch": 0.06694882647171989, + "flos": 1446359943168.0, + "grad_norm": 0.07316018638585145, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83296633, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.18164062, + "step": 348, + "time_per_iteration": 4.936125040054321 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01453408, + "balance_loss_mlp": 1.43557465, + "epoch": 0.06714120815698346, + "flos": 1551230784000.0, + "grad_norm": 0.05966333264944154, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76607287, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.17871094, + "step": 349, + "time_per_iteration": 4.916368722915649 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121413, + "balance_loss_mlp": 1.09161115, + "epoch": 0.06733358984224702, + "flos": 879689673216.0, + "grad_norm": 0.09818918049538049, + "language_loss": 0.91932184, + "learning_rate": 0.000996351547842304, + "loss": 0.93053597, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.29760742, + "step": 350, + "time_per_iteration": 3.1482698917388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116118, + "balance_loss_mlp": 1.08686399, + "epoch": 0.06752597152751058, + "flos": 518654651904.0, + "grad_norm": 0.08574695638310478, + "language_loss": 0.9006294, + "learning_rate": 0.0009963138843953744, + "loss": 0.91179061, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.29223633, + "step": 351, + "time_per_iteration": 2.6286985874176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112662, + "balance_loss_mlp": 1.09572136, + "epoch": 0.06771835321277414, + "flos": 539366266368.0, + "grad_norm": 0.062103550545623463, + "language_loss": 0.94588864, + "learning_rate": 0.000996276028262306, + "loss": 0.95715487, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.30859375, + "step": 352, + "time_per_iteration": 2.8076047897338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118319, + "balance_loss_mlp": 1.08899331, + "epoch": 0.0679107348980377, + "flos": 460430604288.0, + "grad_norm": 0.08848881047736162, + "language_loss": 1.00543904, + "learning_rate": 0.0009962379794577964, + "loss": 1.01662219, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.29296875, + "step": 353, + "time_per_iteration": 2.6257643699645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126251, + "balance_loss_mlp": 1.09525669, + "epoch": 0.06810311658330127, + "flos": 635601752064.0, + "grad_norm": 0.07023516682391727, + "language_loss": 0.91387081, + "learning_rate": 0.000996199737996617, + "loss": 0.92513329, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.31005859, + "step": 354, + "time_per_iteration": 2.9115777015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108023, + "balance_loss_mlp": 1.07862616, + "epoch": 0.06829549826856483, + "flos": 464443448832.0, + "grad_norm": 0.10590106261560671, + "language_loss": 0.99111325, + "learning_rate": 0.0009961613038936149, + "loss": 1.00219345, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.29345703, + "step": 355, + "time_per_iteration": 2.632269859313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106903, + "balance_loss_mlp": 1.07848334, + "epoch": 0.06848787995382839, + "flos": 634335929856.0, + "grad_norm": 0.06351615461114794, + "language_loss": 0.92452097, + "learning_rate": 0.000996122677163711, + "loss": 0.93559003, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.28417969, + "step": 356, + "time_per_iteration": 2.8401455879211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116364, + "balance_loss_mlp": 1.08880246, + "epoch": 0.06868026163909195, + "flos": 806023913472.0, + "grad_norm": 0.08494375059258584, + "language_loss": 0.98204505, + "learning_rate": 0.000996083857821902, + "loss": 0.99320877, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.27612305, + "step": 357, + "time_per_iteration": 3.0309877395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123871, + "balance_loss_mlp": 1.09387815, + "epoch": 0.06887264332435553, + "flos": 438997252608.0, + "grad_norm": 0.09643576242322613, + "language_loss": 0.95811963, + "learning_rate": 0.0009960448458832588, + "loss": 0.96935833, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.30004883, + "step": 358, + "time_per_iteration": 2.6974990367889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119311, + "balance_loss_mlp": 1.09053433, + "epoch": 0.06906502500961909, + "flos": 484513463808.0, + "grad_norm": 0.08018524599206517, + "language_loss": 0.95721531, + "learning_rate": 0.000996005641362927, + "loss": 0.96840835, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.28735352, + "step": 359, + "time_per_iteration": 2.589519739151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125244, + "balance_loss_mlp": 1.09663391, + "epoch": 0.06925740669488265, + "flos": 733293706752.0, + "grad_norm": 0.08939873306910956, + "language_loss": 0.98375708, + "learning_rate": 0.0009959662442761274, + "loss": 0.99500948, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.28613281, + "step": 360, + "time_per_iteration": 2.9202845096588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121734, + "balance_loss_mlp": 1.09360027, + "epoch": 0.0694497883801462, + "flos": 552127947264.0, + "grad_norm": 0.08129648248307358, + "language_loss": 0.92418718, + "learning_rate": 0.000995926654638155, + "loss": 0.93540448, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.28149414, + "step": 361, + "time_per_iteration": 2.807333469390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125246, + "balance_loss_mlp": 1.09706521, + "epoch": 0.06964217006540978, + "flos": 677708992512.0, + "grad_norm": 0.09207283388165423, + "language_loss": 0.94086993, + "learning_rate": 0.00099588687246438, + "loss": 0.95212233, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.28222656, + "step": 362, + "time_per_iteration": 2.841938018798828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144139, + "balance_loss_mlp": 1.1155293, + "epoch": 0.06983455175067334, + "flos": 523987163136.0, + "grad_norm": 0.09456174795196681, + "language_loss": 1.01274741, + "learning_rate": 0.0009958468977702471, + "loss": 1.02418876, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.28588867, + "step": 363, + "time_per_iteration": 2.633852958679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01648964, + "balance_loss_mlp": 1.62617075, + "epoch": 0.0700269334359369, + "flos": 1575943658496.0, + "grad_norm": 0.13616610145697036, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81383669, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.22753906, + "step": 364, + "time_per_iteration": 4.863068580627441 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011272, + "balance_loss_mlp": 1.09961534, + "epoch": 0.07021931512120046, + "flos": 1012848274944.0, + "grad_norm": 0.09005148424800312, + "language_loss": 0.90165555, + "learning_rate": 0.0009957663708830612, + "loss": 0.91292757, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.27612305, + "step": 365, + "time_per_iteration": 3.281414031982422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123586, + "balance_loss_mlp": 1.09442711, + "epoch": 0.07041169680646403, + "flos": 822622348800.0, + "grad_norm": 0.09334468540758137, + "language_loss": 0.91653895, + "learning_rate": 0.0009957258187212714, + "loss": 0.92777479, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.29174805, + "step": 366, + "time_per_iteration": 3.038696050643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01445219, + "balance_loss_mlp": 1.42652738, + "epoch": 0.07060407849172759, + "flos": 1413670993920.0, + "grad_norm": 0.06427367616648676, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80640084, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.18652344, + "step": 367, + "time_per_iteration": 4.7983925342559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116044, + "balance_loss_mlp": 1.08788657, + "epoch": 0.07079646017699115, + "flos": 512652837888.0, + "grad_norm": 0.13146714334583684, + "language_loss": 0.89768213, + "learning_rate": 0.0009956441370400167, + "loss": 0.90884256, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.28173828, + "step": 368, + "time_per_iteration": 2.6321308612823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119626, + "balance_loss_mlp": 1.09201741, + "epoch": 0.07098884186225471, + "flos": 540240772608.0, + "grad_norm": 0.12272393932614807, + "language_loss": 0.9541142, + "learning_rate": 0.0009956030075522636, + "loss": 0.96531045, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.27636719, + "step": 369, + "time_per_iteration": 2.772404909133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114007, + "balance_loss_mlp": 1.08685124, + "epoch": 0.07118122354751828, + "flos": 548419230720.0, + "grad_norm": 0.09366652552108264, + "language_loss": 0.95805156, + "learning_rate": 0.0009955616856543587, + "loss": 0.96919167, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.2722168, + "step": 370, + "time_per_iteration": 2.628877878189087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113669, + "balance_loss_mlp": 1.08608413, + "epoch": 0.07137360523278184, + "flos": 620612554752.0, + "grad_norm": 0.08609469252939483, + "language_loss": 0.88399851, + "learning_rate": 0.0009955201713623448, + "loss": 0.89513522, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.27612305, + "step": 371, + "time_per_iteration": 2.7591450214385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328242, + "balance_loss_mlp": 1.31155288, + "epoch": 0.0715659869180454, + "flos": 1501850115072.0, + "grad_norm": 0.05190160953718325, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78000963, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.16699219, + "step": 372, + "time_per_iteration": 4.995140552520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101786, + "balance_loss_mlp": 1.07563186, + "epoch": 0.07175836860330896, + "flos": 495246887424.0, + "grad_norm": 0.13457072532657127, + "language_loss": 1.02136469, + "learning_rate": 0.0009954365656605333, + "loss": 1.03238261, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.26184082, + "step": 373, + "time_per_iteration": 2.56646990776062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106565, + "balance_loss_mlp": 1.07979035, + "epoch": 0.07195075028857253, + "flos": 785387902464.0, + "grad_norm": 0.08663326270818063, + "language_loss": 0.94899744, + "learning_rate": 0.0009953944742831947, + "loss": 0.96006304, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.26831055, + "step": 374, + "time_per_iteration": 2.9695053100585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102573, + "balance_loss_mlp": 1.07596529, + "epoch": 0.0721431319738361, + "flos": 592799067648.0, + "grad_norm": 0.09289035836035217, + "language_loss": 0.97933537, + "learning_rate": 0.0009953521905766642, + "loss": 0.99036103, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.26647949, + "step": 375, + "time_per_iteration": 2.942178249359131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113342, + "balance_loss_mlp": 1.08630502, + "epoch": 0.07233551365909965, + "flos": 547981272576.0, + "grad_norm": 0.10463311528366259, + "language_loss": 0.97135454, + "learning_rate": 0.0009953097145573577, + "loss": 0.98248798, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.27075195, + "step": 376, + "time_per_iteration": 2.6447842121124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115303, + "balance_loss_mlp": 1.08645439, + "epoch": 0.07252789534436321, + "flos": 957170428416.0, + "grad_norm": 0.10778381820568583, + "language_loss": 0.93408906, + "learning_rate": 0.000995267046241766, + "loss": 0.94524205, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.28808594, + "step": 377, + "time_per_iteration": 3.281200647354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106472, + "balance_loss_mlp": 1.07807684, + "epoch": 0.07272027702962677, + "flos": 507398902272.0, + "grad_norm": 0.08395054735439604, + "language_loss": 0.93929148, + "learning_rate": 0.0009952241856464547, + "loss": 0.95035625, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.28393555, + "step": 378, + "time_per_iteration": 2.6047444343566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011332, + "balance_loss_mlp": 1.10265875, + "epoch": 0.07291265871489035, + "flos": 612128558592.0, + "grad_norm": 0.10390894184481733, + "language_loss": 0.9941417, + "learning_rate": 0.0009951811327880632, + "loss": 1.00547373, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.30541992, + "step": 379, + "time_per_iteration": 2.726473331451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142545, + "balance_loss_mlp": 1.11162257, + "epoch": 0.0731050404001539, + "flos": 495502963200.0, + "grad_norm": 0.10097597522795056, + "language_loss": 0.93640876, + "learning_rate": 0.0009951378876833063, + "loss": 0.94783425, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.30908203, + "step": 380, + "time_per_iteration": 2.5623717308044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113686, + "balance_loss_mlp": 1.10598469, + "epoch": 0.07329742208541747, + "flos": 639677205504.0, + "grad_norm": 0.09709945532148136, + "language_loss": 1.0008266, + "learning_rate": 0.0009950944503489736, + "loss": 1.01219511, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.30834961, + "step": 381, + "time_per_iteration": 2.7424862384796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125905, + "balance_loss_mlp": 1.0951966, + "epoch": 0.07348980377068103, + "flos": 815999284224.0, + "grad_norm": 0.08729931882910318, + "language_loss": 0.94688666, + "learning_rate": 0.0009950508208019285, + "loss": 0.95814574, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.30664062, + "step": 382, + "time_per_iteration": 3.011807441711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115566, + "balance_loss_mlp": 1.08612156, + "epoch": 0.0736821854559446, + "flos": 508383917568.0, + "grad_norm": 0.09192641530722392, + "language_loss": 0.98937929, + "learning_rate": 0.0009950069990591096, + "loss": 1.00053501, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.29418945, + "step": 383, + "time_per_iteration": 2.634744644165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266456, + "balance_loss_mlp": 1.25081599, + "epoch": 0.07387456714120816, + "flos": 1553801716224.0, + "grad_norm": 0.07157218635827683, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77667826, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.15625, + "step": 384, + "time_per_iteration": 4.909826993942261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123479, + "balance_loss_mlp": 1.093009, + "epoch": 0.07406694882647172, + "flos": 525219489792.0, + "grad_norm": 0.09152581134979716, + "language_loss": 0.9216727, + "learning_rate": 0.0009949187790542777, + "loss": 0.93290746, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.30419922, + "step": 385, + "time_per_iteration": 2.7053511142730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126669, + "balance_loss_mlp": 1.09605598, + "epoch": 0.07425933051173528, + "flos": 497468611584.0, + "grad_norm": 0.0847962235917395, + "language_loss": 0.87653643, + "learning_rate": 0.0009948743808265148, + "loss": 0.88780314, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.30566406, + "step": 386, + "time_per_iteration": 2.678089141845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138501, + "balance_loss_mlp": 1.10865068, + "epoch": 0.07445171219699885, + "flos": 504740630016.0, + "grad_norm": 0.08492617281736899, + "language_loss": 0.97336739, + "learning_rate": 0.0009948297904714782, + "loss": 0.98475236, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.29833984, + "step": 387, + "time_per_iteration": 2.7185778617858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146366, + "balance_loss_mlp": 1.11620593, + "epoch": 0.07464409388226241, + "flos": 553693515264.0, + "grad_norm": 0.07151378861674496, + "language_loss": 0.90523744, + "learning_rate": 0.0009947850080064796, + "loss": 0.91670114, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.30151367, + "step": 388, + "time_per_iteration": 2.799166202545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158036, + "balance_loss_mlp": 1.12689841, + "epoch": 0.07483647556752597, + "flos": 776511028224.0, + "grad_norm": 0.11664332596196766, + "language_loss": 0.94951898, + "learning_rate": 0.0009947400334489047, + "loss": 0.96109939, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.31103516, + "step": 389, + "time_per_iteration": 3.0231211185455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146122, + "balance_loss_mlp": 1.11646235, + "epoch": 0.07502885725278953, + "flos": 612256596480.0, + "grad_norm": 0.09913116245985863, + "language_loss": 0.85822582, + "learning_rate": 0.0009946948668162145, + "loss": 0.86968708, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.29638672, + "step": 390, + "time_per_iteration": 2.8080904483795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129912, + "balance_loss_mlp": 1.09910846, + "epoch": 0.0752212389380531, + "flos": 688324552704.0, + "grad_norm": 0.1060751216039937, + "language_loss": 0.91006148, + "learning_rate": 0.0009946495081259441, + "loss": 0.92136061, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.30786133, + "step": 391, + "time_per_iteration": 2.853335380554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125709, + "balance_loss_mlp": 1.09528649, + "epoch": 0.07541362062331666, + "flos": 765362967552.0, + "grad_norm": 0.10996734320487103, + "language_loss": 0.93701887, + "learning_rate": 0.0009946039573957035, + "loss": 0.94827592, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.30371094, + "step": 392, + "time_per_iteration": 2.926420211791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107845, + "balance_loss_mlp": 1.07887673, + "epoch": 0.07560600230858022, + "flos": 588460336128.0, + "grad_norm": 0.10253812696642157, + "language_loss": 0.91059798, + "learning_rate": 0.000994558214643177, + "loss": 0.92167646, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.28979492, + "step": 393, + "time_per_iteration": 2.783536434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103583, + "balance_loss_mlp": 1.07344699, + "epoch": 0.07579838399384378, + "flos": 749508028416.0, + "grad_norm": 0.08274248346409746, + "language_loss": 0.91916323, + "learning_rate": 0.000994512279886123, + "loss": 0.93019903, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.30078125, + "step": 394, + "time_per_iteration": 3.0799474716186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099135, + "balance_loss_mlp": 1.06902301, + "epoch": 0.07599076567910736, + "flos": 523185440256.0, + "grad_norm": 0.06927054930208885, + "language_loss": 0.93251747, + "learning_rate": 0.0009944661531423758, + "loss": 0.9435088, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.30078125, + "step": 395, + "time_per_iteration": 2.6641883850097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103492, + "balance_loss_mlp": 1.07383251, + "epoch": 0.07618314736437092, + "flos": 550812662784.0, + "grad_norm": 0.09904896099194287, + "language_loss": 0.91404933, + "learning_rate": 0.000994419834429843, + "loss": 0.92508423, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.29638672, + "step": 396, + "time_per_iteration": 2.661850690841675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114049, + "balance_loss_mlp": 1.08257747, + "epoch": 0.07637552904963447, + "flos": 697901253120.0, + "grad_norm": 0.10979610845710805, + "language_loss": 0.93416023, + "learning_rate": 0.0009943733237665069, + "loss": 0.94530076, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.31445312, + "step": 397, + "time_per_iteration": 2.854339361190796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111663, + "balance_loss_mlp": 1.08561158, + "epoch": 0.07656791073489803, + "flos": 579066928128.0, + "grad_norm": 0.07380051857889673, + "language_loss": 0.9521122, + "learning_rate": 0.0009943266211704248, + "loss": 0.96327847, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.30981445, + "step": 398, + "time_per_iteration": 2.958059787750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110413, + "balance_loss_mlp": 1.0786798, + "epoch": 0.0767602924201616, + "flos": 416923711488.0, + "grad_norm": 0.09100164928673704, + "language_loss": 0.97291386, + "learning_rate": 0.000994279726659728, + "loss": 0.98401797, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.31713867, + "step": 399, + "time_per_iteration": 2.5242953300476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128703, + "balance_loss_mlp": 1.09413218, + "epoch": 0.07695267410542517, + "flos": 482671471104.0, + "grad_norm": 0.09258616119375639, + "language_loss": 0.92782032, + "learning_rate": 0.0009942326402526231, + "loss": 0.93910736, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.34594727, + "step": 400, + "time_per_iteration": 2.5207695960998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143337, + "balance_loss_mlp": 1.10955346, + "epoch": 0.07714505579068873, + "flos": 530742647808.0, + "grad_norm": 0.07710774358121592, + "language_loss": 0.92332727, + "learning_rate": 0.0009941853619673902, + "loss": 0.93476063, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.33789062, + "step": 401, + "time_per_iteration": 2.6304752826690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142379, + "balance_loss_mlp": 1.10947704, + "epoch": 0.07733743747595229, + "flos": 804635845632.0, + "grad_norm": 0.09709488616354546, + "language_loss": 0.95104444, + "learning_rate": 0.0009941378918223844, + "loss": 0.96246827, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.32885742, + "step": 402, + "time_per_iteration": 3.0903730392456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136269, + "balance_loss_mlp": 1.10186553, + "epoch": 0.07752981916121585, + "flos": 622192679424.0, + "grad_norm": 0.09176808059924663, + "language_loss": 0.88839906, + "learning_rate": 0.0009940902298360354, + "loss": 0.89976174, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.34423828, + "step": 403, + "time_per_iteration": 2.7252347469329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129357, + "balance_loss_mlp": 1.09478593, + "epoch": 0.07772220084647942, + "flos": 727961195520.0, + "grad_norm": 0.08094022735558755, + "language_loss": 0.96807957, + "learning_rate": 0.0009940423760268473, + "loss": 0.9793731, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.34619141, + "step": 404, + "time_per_iteration": 2.912560224533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136255, + "balance_loss_mlp": 1.0998956, + "epoch": 0.07791458253174298, + "flos": 555149984256.0, + "grad_norm": 0.1131644160055788, + "language_loss": 0.90535253, + "learning_rate": 0.0009939943304133982, + "loss": 0.91671515, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.36352539, + "step": 405, + "time_per_iteration": 2.691524028778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128492, + "balance_loss_mlp": 1.09301567, + "epoch": 0.07810696421700654, + "flos": 552919495680.0, + "grad_norm": 0.0877419108538044, + "language_loss": 0.97356665, + "learning_rate": 0.0009939460930143416, + "loss": 0.9848516, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.35522461, + "step": 406, + "time_per_iteration": 2.624150276184082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130223, + "balance_loss_mlp": 1.09484172, + "epoch": 0.0782993459022701, + "flos": 650323289088.0, + "grad_norm": 0.0945833964014614, + "language_loss": 0.92588282, + "learning_rate": 0.0009938976638484043, + "loss": 0.93718511, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.35400391, + "step": 407, + "time_per_iteration": 2.943443775177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132372, + "balance_loss_mlp": 1.09625125, + "epoch": 0.07849172758753367, + "flos": 495926364672.0, + "grad_norm": 0.11302097827133319, + "language_loss": 0.90334702, + "learning_rate": 0.0009938490429343887, + "loss": 0.91467071, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.36157227, + "step": 408, + "time_per_iteration": 2.5614538192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156856, + "balance_loss_mlp": 1.11870956, + "epoch": 0.07868410927279723, + "flos": 577696389120.0, + "grad_norm": 0.08706398753077066, + "language_loss": 0.9151262, + "learning_rate": 0.0009938002302911709, + "loss": 0.92669487, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.38134766, + "step": 409, + "time_per_iteration": 2.7606911659240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185423, + "balance_loss_mlp": 1.14758611, + "epoch": 0.07887649095806079, + "flos": 522698019840.0, + "grad_norm": 0.11763043112663725, + "language_loss": 0.93195748, + "learning_rate": 0.0009937512259377015, + "loss": 0.94381177, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.37841797, + "step": 410, + "time_per_iteration": 2.664318323135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187972, + "balance_loss_mlp": 1.15199518, + "epoch": 0.07906887264332435, + "flos": 556958481408.0, + "grad_norm": 0.10450629225071802, + "language_loss": 0.93972069, + "learning_rate": 0.000993702029893006, + "loss": 0.95160043, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.359375, + "step": 411, + "time_per_iteration": 2.78944730758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182604, + "balance_loss_mlp": 1.14679348, + "epoch": 0.07926125432858792, + "flos": 821641715712.0, + "grad_norm": 0.0999267349206771, + "language_loss": 0.93036819, + "learning_rate": 0.0009936526421761838, + "loss": 0.94219422, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.3581543, + "step": 412, + "time_per_iteration": 3.070317268371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138713, + "balance_loss_mlp": 1.1031884, + "epoch": 0.07945363601385148, + "flos": 562072794624.0, + "grad_norm": 0.103699157973277, + "language_loss": 0.95454085, + "learning_rate": 0.000993603062806409, + "loss": 0.96592796, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.35546875, + "step": 413, + "time_per_iteration": 2.6778509616851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111475, + "balance_loss_mlp": 1.080966, + "epoch": 0.07964601769911504, + "flos": 517615792128.0, + "grad_norm": 0.1031900517026183, + "language_loss": 0.96687901, + "learning_rate": 0.0009935532918029298, + "loss": 0.97802651, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.33813477, + "step": 414, + "time_per_iteration": 2.598691701889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115143, + "balance_loss_mlp": 1.08016729, + "epoch": 0.0798383993843786, + "flos": 538956011520.0, + "grad_norm": 0.10374121868926973, + "language_loss": 0.91896659, + "learning_rate": 0.0009935033291850694, + "loss": 0.93011802, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.35009766, + "step": 415, + "time_per_iteration": 2.6626100540161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136571, + "balance_loss_mlp": 1.10121322, + "epoch": 0.08003078106964218, + "flos": 484901959680.0, + "grad_norm": 0.1007950470797911, + "language_loss": 0.94399852, + "learning_rate": 0.0009934531749722247, + "loss": 0.95536423, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.35351562, + "step": 416, + "time_per_iteration": 2.6062543392181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161455, + "balance_loss_mlp": 1.12671685, + "epoch": 0.08022316275490574, + "flos": 517999905792.0, + "grad_norm": 0.14193661609984684, + "language_loss": 0.91743952, + "learning_rate": 0.0009934028291838672, + "loss": 0.92905408, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.34790039, + "step": 417, + "time_per_iteration": 2.7159759998321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170119, + "balance_loss_mlp": 1.134166, + "epoch": 0.0804155444401693, + "flos": 493755512832.0, + "grad_norm": 0.12060272101738621, + "language_loss": 0.87969685, + "learning_rate": 0.0009933522918395433, + "loss": 0.89139807, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.35961914, + "step": 418, + "time_per_iteration": 2.6525259017944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288605, + "balance_loss_mlp": 1.26361907, + "epoch": 0.08060792612543285, + "flos": 1580567579136.0, + "grad_norm": 0.05680606480361405, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79539704, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.24902344, + "step": 419, + "time_per_iteration": 4.8565216064453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147061, + "balance_loss_mlp": 1.11074984, + "epoch": 0.08080030781069643, + "flos": 525090041856.0, + "grad_norm": 0.12828879348175987, + "language_loss": 1.03302395, + "learning_rate": 0.000993250642561551, + "loss": 1.04449451, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.36279297, + "step": 420, + "time_per_iteration": 2.6118712425231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139372, + "balance_loss_mlp": 1.10284615, + "epoch": 0.08099268949595999, + "flos": 546459374592.0, + "grad_norm": 0.09279765906948532, + "language_loss": 0.90646845, + "learning_rate": 0.0009931995306673466, + "loss": 0.91786218, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.36499023, + "step": 421, + "time_per_iteration": 2.7097063064575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137947, + "balance_loss_mlp": 1.10170722, + "epoch": 0.08118507118122355, + "flos": 510116811264.0, + "grad_norm": 0.12264346802799699, + "language_loss": 0.9584164, + "learning_rate": 0.000993148227296103, + "loss": 0.96979594, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.36254883, + "step": 422, + "time_per_iteration": 2.6224865913391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112855, + "balance_loss_mlp": 1.093431, + "epoch": 0.08137745286648711, + "flos": 720339969024.0, + "grad_norm": 0.09272021371299098, + "language_loss": 0.85445499, + "learning_rate": 0.000993096732467738, + "loss": 0.86574042, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.3515625, + "step": 423, + "time_per_iteration": 2.9733965396881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140481, + "balance_loss_mlp": 1.10407472, + "epoch": 0.08156983455175067, + "flos": 679313848320.0, + "grad_norm": 0.12206645659912072, + "language_loss": 0.90398526, + "learning_rate": 0.0009930450462022435, + "loss": 0.91539013, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.36376953, + "step": 424, + "time_per_iteration": 2.8079323768615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300116, + "balance_loss_mlp": 1.2751298, + "epoch": 0.08176221623701424, + "flos": 1452577135104.0, + "grad_norm": 0.07506497844528874, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80489922, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.24902344, + "step": 425, + "time_per_iteration": 4.905512809753418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121946, + "balance_loss_mlp": 1.08668423, + "epoch": 0.0819545979222778, + "flos": 1556034071040.0, + "grad_norm": 0.10499242287280508, + "language_loss": 0.89529157, + "learning_rate": 0.0009929410994402065, + "loss": 0.90651101, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.35327148, + "step": 426, + "time_per_iteration": 3.7398970127105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141941, + "balance_loss_mlp": 1.1045804, + "epoch": 0.08214697960754136, + "flos": 512456398848.0, + "grad_norm": 0.10023640482449404, + "language_loss": 0.93921095, + "learning_rate": 0.0009928888389840196, + "loss": 0.95063031, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.3737793, + "step": 427, + "time_per_iteration": 2.71114182472229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119992, + "balance_loss_mlp": 1.08430111, + "epoch": 0.08233936129280492, + "flos": 594850646016.0, + "grad_norm": 0.11276239209208863, + "language_loss": 0.96473306, + "learning_rate": 0.0009928363871714147, + "loss": 0.97593296, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.35742188, + "step": 428, + "time_per_iteration": 2.719052314758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118917, + "balance_loss_mlp": 1.0826056, + "epoch": 0.08253174297806849, + "flos": 571758594048.0, + "grad_norm": 0.08720961611908505, + "language_loss": 0.91275012, + "learning_rate": 0.0009927837440227556, + "loss": 0.92393929, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.36303711, + "step": 429, + "time_per_iteration": 2.854044198989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098875, + "balance_loss_mlp": 1.06418514, + "epoch": 0.08272412466333205, + "flos": 623065623552.0, + "grad_norm": 0.07075242488451733, + "language_loss": 0.87952864, + "learning_rate": 0.0009927309095584798, + "loss": 0.89051735, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.34692383, + "step": 430, + "time_per_iteration": 2.9898674488067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102101, + "balance_loss_mlp": 1.06748247, + "epoch": 0.08291650634859561, + "flos": 513745542144.0, + "grad_norm": 0.11797379038125863, + "language_loss": 0.97102249, + "learning_rate": 0.0009926778837991, + "loss": 0.9820435, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.34643555, + "step": 431, + "time_per_iteration": 2.577531099319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111279, + "balance_loss_mlp": 1.07582581, + "epoch": 0.08310888803385917, + "flos": 667073083392.0, + "grad_norm": 0.09137951270996447, + "language_loss": 0.95161557, + "learning_rate": 0.000992624666765202, + "loss": 0.96272832, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.35498047, + "step": 432, + "time_per_iteration": 2.841384172439575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141665, + "balance_loss_mlp": 1.10540199, + "epoch": 0.08330126971912274, + "flos": 582995404800.0, + "grad_norm": 0.1226792169188856, + "language_loss": 0.92907685, + "learning_rate": 0.000992571258477447, + "loss": 0.94049346, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.36279297, + "step": 433, + "time_per_iteration": 2.810683250427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131869, + "balance_loss_mlp": 1.0957005, + "epoch": 0.0834936514043863, + "flos": 561064458240.0, + "grad_norm": 0.09107414958413955, + "language_loss": 0.88094407, + "learning_rate": 0.0009925176589565695, + "loss": 0.8922627, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.36206055, + "step": 434, + "time_per_iteration": 2.7925446033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112767, + "balance_loss_mlp": 1.09081006, + "epoch": 0.08368603308964986, + "flos": 494272046592.0, + "grad_norm": 0.12869710653201102, + "language_loss": 0.96048987, + "learning_rate": 0.0009924638682233791, + "loss": 0.97176659, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.36865234, + "step": 435, + "time_per_iteration": 2.578301191329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293384, + "balance_loss_mlp": 1.26963747, + "epoch": 0.08387841477491342, + "flos": 1388322312192.0, + "grad_norm": 0.05787730041443156, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80857974, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.23730469, + "step": 436, + "time_per_iteration": 4.577009201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105613, + "balance_loss_mlp": 1.07092249, + "epoch": 0.084070796460177, + "flos": 798642796032.0, + "grad_norm": 0.09893423016048233, + "language_loss": 0.86262441, + "learning_rate": 0.0009923557132036668, + "loss": 0.87368047, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.34716797, + "step": 437, + "time_per_iteration": 3.0512332916259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111937, + "balance_loss_mlp": 1.07641208, + "epoch": 0.08426317814544056, + "flos": 558681200640.0, + "grad_norm": 0.08022134137003532, + "language_loss": 0.92201281, + "learning_rate": 0.0009923013489591345, + "loss": 0.93313217, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.35571289, + "step": 438, + "time_per_iteration": 2.74950909614563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101532, + "balance_loss_mlp": 1.06724763, + "epoch": 0.08445555983070412, + "flos": 810057106944.0, + "grad_norm": 0.100162941065544, + "language_loss": 0.90520388, + "learning_rate": 0.0009922467935862681, + "loss": 0.91621923, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.34326172, + "step": 439, + "time_per_iteration": 3.0904464721679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117546, + "balance_loss_mlp": 1.08307123, + "epoch": 0.08464794151596768, + "flos": 509939311104.0, + "grad_norm": 0.0868598025723284, + "language_loss": 0.93269211, + "learning_rate": 0.0009921920471062478, + "loss": 0.94386756, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.34521484, + "step": 440, + "time_per_iteration": 2.5794718265533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129423, + "balance_loss_mlp": 1.09458995, + "epoch": 0.08484032320123125, + "flos": 556149556224.0, + "grad_norm": 0.08760481485615552, + "language_loss": 0.90004873, + "learning_rate": 0.0009921371095403281, + "loss": 0.91134298, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.34863281, + "step": 441, + "time_per_iteration": 2.6602251529693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146723, + "balance_loss_mlp": 1.11010158, + "epoch": 0.08503270488649481, + "flos": 527103742464.0, + "grad_norm": 0.0774335957746243, + "language_loss": 0.93349928, + "learning_rate": 0.0009920819809098379, + "loss": 0.9449665, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.3659668, + "step": 442, + "time_per_iteration": 2.601776123046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154219, + "balance_loss_mlp": 1.11693072, + "epoch": 0.08522508657175837, + "flos": 613989490176.0, + "grad_norm": 0.07362842569129122, + "language_loss": 0.88841242, + "learning_rate": 0.0009920266612361798, + "loss": 0.89995468, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.37255859, + "step": 443, + "time_per_iteration": 2.730400800704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134029, + "balance_loss_mlp": 1.09712195, + "epoch": 0.08541746825702193, + "flos": 619495119360.0, + "grad_norm": 0.07691784169579122, + "language_loss": 0.90311241, + "learning_rate": 0.0009919711505408308, + "loss": 0.91445279, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.36889648, + "step": 444, + "time_per_iteration": 2.784175395965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136596, + "balance_loss_mlp": 1.0992831, + "epoch": 0.08560984994228549, + "flos": 482671471104.0, + "grad_norm": 0.10632405925705127, + "language_loss": 0.87768185, + "learning_rate": 0.000991915448845342, + "loss": 0.8890478, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.37329102, + "step": 445, + "time_per_iteration": 2.5208120346069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131693, + "balance_loss_mlp": 1.09581065, + "epoch": 0.08580223162754906, + "flos": 516897027072.0, + "grad_norm": 0.08773057765175464, + "language_loss": 0.96764338, + "learning_rate": 0.000991859556171339, + "loss": 0.97896028, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.35888672, + "step": 446, + "time_per_iteration": 2.62111759185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121751, + "balance_loss_mlp": 1.08582091, + "epoch": 0.08599461331281262, + "flos": 531215511552.0, + "grad_norm": 0.09700121256693707, + "language_loss": 0.97393352, + "learning_rate": 0.000991803472540521, + "loss": 0.98515099, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.359375, + "step": 447, + "time_per_iteration": 2.6023707389831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106051, + "balance_loss_mlp": 1.07155204, + "epoch": 0.08618699499807618, + "flos": 789966743040.0, + "grad_norm": 0.08203891217845936, + "language_loss": 0.9339667, + "learning_rate": 0.0009917471979746615, + "loss": 0.94502723, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.34521484, + "step": 448, + "time_per_iteration": 3.032045841217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108032, + "balance_loss_mlp": 1.07288861, + "epoch": 0.08637937668333974, + "flos": 565707317760.0, + "grad_norm": 0.07141468257554369, + "language_loss": 0.93266523, + "learning_rate": 0.0009916907324956086, + "loss": 0.94374555, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.35180664, + "step": 449, + "time_per_iteration": 2.7145769596099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124616, + "balance_loss_mlp": 1.08820987, + "epoch": 0.08657175836860331, + "flos": 444930665472.0, + "grad_norm": 0.07969277456361384, + "language_loss": 0.88546509, + "learning_rate": 0.0009916340761252837, + "loss": 0.89671123, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.36376953, + "step": 450, + "time_per_iteration": 2.623152017593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137489, + "balance_loss_mlp": 1.10108209, + "epoch": 0.08676414005386687, + "flos": 843789450240.0, + "grad_norm": 0.11402885145068274, + "language_loss": 0.86408567, + "learning_rate": 0.0009915772288856832, + "loss": 0.87546057, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.36474609, + "step": 451, + "time_per_iteration": 3.069053888320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137973, + "balance_loss_mlp": 1.10178065, + "epoch": 0.08695652173913043, + "flos": 602995608576.0, + "grad_norm": 0.09443027615205003, + "language_loss": 0.88496101, + "learning_rate": 0.000991520190798877, + "loss": 0.89634073, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.36206055, + "step": 452, + "time_per_iteration": 2.8196520805358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146235, + "balance_loss_mlp": 1.10906577, + "epoch": 0.08714890342439399, + "flos": 730423028736.0, + "grad_norm": 0.10286670415776202, + "language_loss": 0.95532084, + "learning_rate": 0.0009914629618870089, + "loss": 0.96678317, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.37158203, + "step": 453, + "time_per_iteration": 2.8787243366241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247018, + "balance_loss_mlp": 1.22422564, + "epoch": 0.08734128510965757, + "flos": 1481518232064.0, + "grad_norm": 0.049899161357568285, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79922891, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.22753906, + "step": 454, + "time_per_iteration": 4.787290811538696 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212398, + "balance_loss_mlp": 1.19036818, + "epoch": 0.08753366679492113, + "flos": 1522214083584.0, + "grad_norm": 0.0324381166824538, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82640362, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.22070312, + "step": 455, + "time_per_iteration": 4.818731784820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120368, + "balance_loss_mlp": 1.08324623, + "epoch": 0.08772604848018468, + "flos": 720935078400.0, + "grad_norm": 0.09487211541236003, + "language_loss": 0.89355373, + "learning_rate": 0.0009912901304235883, + "loss": 0.90475744, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.37133789, + "step": 456, + "time_per_iteration": 2.8851993083953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011174, + "balance_loss_mlp": 1.08108902, + "epoch": 0.08791843016544824, + "flos": 707926086144.0, + "grad_norm": 0.09303414624011808, + "language_loss": 0.85744059, + "learning_rate": 0.000991232138434397, + "loss": 0.86861455, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.36352539, + "step": 457, + "time_per_iteration": 2.8450586795806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118359, + "balance_loss_mlp": 1.08126163, + "epoch": 0.08811081185071182, + "flos": 472799407104.0, + "grad_norm": 0.11356405017629323, + "language_loss": 0.91543031, + "learning_rate": 0.000991173955731976, + "loss": 0.92661393, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.37084961, + "step": 458, + "time_per_iteration": 2.6324169635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117714, + "balance_loss_mlp": 1.08190393, + "epoch": 0.08830319353597538, + "flos": 684647769600.0, + "grad_norm": 0.08091220448679284, + "language_loss": 0.98039645, + "learning_rate": 0.0009911155823389137, + "loss": 0.99157357, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.3581543, + "step": 459, + "time_per_iteration": 2.9783670902252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121069, + "balance_loss_mlp": 1.08451915, + "epoch": 0.08849557522123894, + "flos": 573235411968.0, + "grad_norm": 0.0940583187075056, + "language_loss": 0.93095994, + "learning_rate": 0.000991057018277873, + "loss": 0.94217062, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.36499023, + "step": 460, + "time_per_iteration": 2.742830276489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112002, + "balance_loss_mlp": 1.08380461, + "epoch": 0.0886879569065025, + "flos": 564303283200.0, + "grad_norm": 0.10556048763009983, + "language_loss": 0.92411214, + "learning_rate": 0.0009909982635715898, + "loss": 0.93531239, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.36279297, + "step": 461, + "time_per_iteration": 2.613490581512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111287, + "balance_loss_mlp": 1.07595301, + "epoch": 0.08888033859176607, + "flos": 563609249280.0, + "grad_norm": 0.07908948831956038, + "language_loss": 0.92236221, + "learning_rate": 0.0009909393182428751, + "loss": 0.93347514, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.35351562, + "step": 462, + "time_per_iteration": 2.654144048690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109293, + "balance_loss_mlp": 1.07331538, + "epoch": 0.08907272027702963, + "flos": 465517214208.0, + "grad_norm": 0.06646518051532449, + "language_loss": 0.87202108, + "learning_rate": 0.000990880182314614, + "loss": 0.88311398, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.359375, + "step": 463, + "time_per_iteration": 2.705138921737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108897, + "balance_loss_mlp": 1.07473207, + "epoch": 0.08926510196229319, + "flos": 681200921088.0, + "grad_norm": 0.06803924695737752, + "language_loss": 0.88676465, + "learning_rate": 0.0009908208558097643, + "loss": 0.89785367, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.34204102, + "step": 464, + "time_per_iteration": 2.971322536468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120032, + "balance_loss_mlp": 1.08412576, + "epoch": 0.08945748364755675, + "flos": 596411831808.0, + "grad_norm": 0.15708102336048957, + "language_loss": 0.90012753, + "learning_rate": 0.000990761338751359, + "loss": 0.91132784, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.35913086, + "step": 465, + "time_per_iteration": 2.7719008922576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01301625, + "balance_loss_mlp": 1.28073931, + "epoch": 0.08964986533282032, + "flos": 1585082400768.0, + "grad_norm": 0.06799997970585842, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74961245, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.20898438, + "step": 466, + "time_per_iteration": 4.991540193557739 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131475, + "balance_loss_mlp": 1.09637952, + "epoch": 0.08984224701808388, + "flos": 533268499968.0, + "grad_norm": 0.10779867371948758, + "language_loss": 0.9214865, + "learning_rate": 0.0009906417330663815, + "loss": 0.93280125, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.35131836, + "step": 467, + "time_per_iteration": 2.7089412212371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124394, + "balance_loss_mlp": 1.08917928, + "epoch": 0.09003462870334744, + "flos": 478702296576.0, + "grad_norm": 0.08471126953208015, + "language_loss": 0.88495421, + "learning_rate": 0.0009905816444862442, + "loss": 0.89619815, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.35253906, + "step": 468, + "time_per_iteration": 2.616262435913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129147, + "balance_loss_mlp": 1.09371758, + "epoch": 0.090227010388611, + "flos": 653307448320.0, + "grad_norm": 0.07702844129808738, + "language_loss": 0.87126988, + "learning_rate": 0.0009905213654454216, + "loss": 0.88256133, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.35473633, + "step": 469, + "time_per_iteration": 2.9097750186920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143119, + "balance_loss_mlp": 1.10678387, + "epoch": 0.09041939207387456, + "flos": 617894645760.0, + "grad_norm": 0.09194049655048094, + "language_loss": 0.92914081, + "learning_rate": 0.0009904608959673158, + "loss": 0.9405719, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.36328125, + "step": 470, + "time_per_iteration": 2.8030929565429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141965, + "balance_loss_mlp": 1.10491443, + "epoch": 0.09061177375913813, + "flos": 454137808896.0, + "grad_norm": 0.10933441897375067, + "language_loss": 0.92262268, + "learning_rate": 0.000990400236075403, + "loss": 0.93404239, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.37036133, + "step": 471, + "time_per_iteration": 2.4859976768493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117092, + "balance_loss_mlp": 1.08183014, + "epoch": 0.0908041554444017, + "flos": 543982984704.0, + "grad_norm": 0.08808088949589198, + "language_loss": 0.90884256, + "learning_rate": 0.0009903393857932338, + "loss": 0.92001355, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.35302734, + "step": 472, + "time_per_iteration": 2.6540582180023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115458, + "balance_loss_mlp": 1.07933736, + "epoch": 0.09099653712966525, + "flos": 564052999680.0, + "grad_norm": 0.08261940405294126, + "language_loss": 0.88272375, + "learning_rate": 0.0009902783451444317, + "loss": 0.89387828, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.36108398, + "step": 473, + "time_per_iteration": 2.7061197757720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116197, + "balance_loss_mlp": 1.0812211, + "epoch": 0.09118891881492881, + "flos": 474300956160.0, + "grad_norm": 0.11656166861680099, + "language_loss": 0.93563545, + "learning_rate": 0.0009902171141526956, + "loss": 0.94679749, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.34960938, + "step": 474, + "time_per_iteration": 2.524653911590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111201, + "balance_loss_mlp": 1.0760566, + "epoch": 0.09138130050019239, + "flos": 545579076096.0, + "grad_norm": 0.07692578036886621, + "language_loss": 0.81933677, + "learning_rate": 0.000990155692841797, + "loss": 0.83045685, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.35961914, + "step": 475, + "time_per_iteration": 2.9645543098449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106767, + "balance_loss_mlp": 1.07281613, + "epoch": 0.09157368218545595, + "flos": 732397441536.0, + "grad_norm": 0.08052092373184025, + "language_loss": 0.93009984, + "learning_rate": 0.0009900940812355818, + "loss": 0.94116753, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.33959961, + "step": 476, + "time_per_iteration": 2.8816893100738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107557, + "balance_loss_mlp": 1.07289076, + "epoch": 0.0917660638707195, + "flos": 610709967360.0, + "grad_norm": 0.14442514829584613, + "language_loss": 0.87309504, + "learning_rate": 0.00099003227935797, + "loss": 0.88417065, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.34716797, + "step": 477, + "time_per_iteration": 2.760615825653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122629, + "balance_loss_mlp": 1.08827257, + "epoch": 0.09195844555598306, + "flos": 655561257984.0, + "grad_norm": 0.12539398809889843, + "language_loss": 0.9113583, + "learning_rate": 0.000989970287232955, + "loss": 0.92258459, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.34399414, + "step": 478, + "time_per_iteration": 2.826150894165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119533, + "balance_loss_mlp": 1.08720374, + "epoch": 0.09215082724124664, + "flos": 476339387904.0, + "grad_norm": 0.06731886459053077, + "language_loss": 0.89701962, + "learning_rate": 0.0009899081048846043, + "loss": 0.90821493, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.32324219, + "step": 479, + "time_per_iteration": 2.580028772354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143549, + "balance_loss_mlp": 1.1092639, + "epoch": 0.0923432089265102, + "flos": 524051182080.0, + "grad_norm": 0.1155425244176876, + "language_loss": 0.9372611, + "learning_rate": 0.0009898457323370593, + "loss": 0.94869661, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.34301758, + "step": 480, + "time_per_iteration": 2.6090288162231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135277, + "balance_loss_mlp": 1.10132647, + "epoch": 0.09253559061177376, + "flos": 545302651392.0, + "grad_norm": 0.08946460297910715, + "language_loss": 0.92488086, + "learning_rate": 0.000989783169614535, + "loss": 0.93623364, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.33984375, + "step": 481, + "time_per_iteration": 2.6434848308563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130787, + "balance_loss_mlp": 1.28212094, + "epoch": 0.09272797229703732, + "flos": 1537222219776.0, + "grad_norm": 0.06384431456169105, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80060625, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.2578125, + "step": 482, + "time_per_iteration": 4.903714656829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120119, + "balance_loss_mlp": 1.08695483, + "epoch": 0.09292035398230089, + "flos": 689501624832.0, + "grad_norm": 0.0974321715773629, + "language_loss": 0.90389109, + "learning_rate": 0.000989657473741779, + "loss": 0.91509223, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.33178711, + "step": 483, + "time_per_iteration": 2.841749668121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132007, + "balance_loss_mlp": 1.09858036, + "epoch": 0.09311273566756445, + "flos": 509482414080.0, + "grad_norm": 0.07196755449742197, + "language_loss": 0.91361248, + "learning_rate": 0.0009895943406403465, + "loss": 0.9249326, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.33447266, + "step": 484, + "time_per_iteration": 2.728733539581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146209, + "balance_loss_mlp": 1.11137581, + "epoch": 0.09330511735282801, + "flos": 659111413248.0, + "grad_norm": 0.10097789553078372, + "language_loss": 0.84299308, + "learning_rate": 0.0009895310174615338, + "loss": 0.85445517, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.34863281, + "step": 485, + "time_per_iteration": 2.74460506439209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214233, + "balance_loss_mlp": 1.19239426, + "epoch": 0.09349749903809157, + "flos": 1452054809088.0, + "grad_norm": 0.04007792490845654, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76932752, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.21875, + "step": 486, + "time_per_iteration": 4.653090715408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135045, + "balance_loss_mlp": 1.10161829, + "epoch": 0.09368988072335514, + "flos": 520614508032.0, + "grad_norm": 0.07938978312310574, + "language_loss": 0.89514428, + "learning_rate": 0.0009894038009701782, + "loss": 0.90649474, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.33447266, + "step": 487, + "time_per_iteration": 2.6534616947174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145632, + "balance_loss_mlp": 1.1106087, + "epoch": 0.0938822624086187, + "flos": 497502107136.0, + "grad_norm": 0.09344776572677456, + "language_loss": 0.87733328, + "learning_rate": 0.0009893399077070253, + "loss": 0.88878953, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.35083008, + "step": 488, + "time_per_iteration": 2.5616586208343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130599, + "balance_loss_mlp": 1.09702933, + "epoch": 0.09407464409388226, + "flos": 532948405248.0, + "grad_norm": 0.08887912188605798, + "language_loss": 0.87485397, + "learning_rate": 0.0009892758244652718, + "loss": 0.8861599, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.3359375, + "step": 489, + "time_per_iteration": 2.6878652572631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114876, + "balance_loss_mlp": 1.08078194, + "epoch": 0.09426702577914582, + "flos": 585736634880.0, + "grad_norm": 0.08770205653150476, + "language_loss": 0.91117108, + "learning_rate": 0.0009892115512697968, + "loss": 0.92231989, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.34130859, + "step": 490, + "time_per_iteration": 2.67647123336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113824, + "balance_loss_mlp": 1.0808506, + "epoch": 0.0944594074644094, + "flos": 503081929728.0, + "grad_norm": 0.06826247830552083, + "language_loss": 0.94586283, + "learning_rate": 0.0009891470881455537, + "loss": 0.95700109, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.32983398, + "step": 491, + "time_per_iteration": 2.7388105392456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109252, + "balance_loss_mlp": 1.07627821, + "epoch": 0.09465178914967295, + "flos": 570748847616.0, + "grad_norm": 0.08083030362482532, + "language_loss": 0.90903842, + "learning_rate": 0.0009890824351175692, + "loss": 0.92013097, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.32983398, + "step": 492, + "time_per_iteration": 2.710557222366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108934, + "balance_loss_mlp": 1.07586551, + "epoch": 0.09484417083493651, + "flos": 549098707968.0, + "grad_norm": 0.07986708443523517, + "language_loss": 0.96040058, + "learning_rate": 0.0009890175922109435, + "loss": 0.97148991, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.33081055, + "step": 493, + "time_per_iteration": 2.748145341873169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119937, + "balance_loss_mlp": 1.08686852, + "epoch": 0.09503655252020007, + "flos": 823552109568.0, + "grad_norm": 0.1003982234968368, + "language_loss": 0.93827844, + "learning_rate": 0.0009889525594508513, + "loss": 0.94947779, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.33081055, + "step": 494, + "time_per_iteration": 2.9940547943115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113416, + "balance_loss_mlp": 1.08037138, + "epoch": 0.09522893420546363, + "flos": 404397757440.0, + "grad_norm": 0.06206488721584602, + "language_loss": 0.88783181, + "learning_rate": 0.0009888873368625404, + "loss": 0.89896601, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.33056641, + "step": 495, + "time_per_iteration": 2.5122387409210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129, + "balance_loss_mlp": 1.09557426, + "epoch": 0.0954213158907272, + "flos": 690707810304.0, + "grad_norm": 0.08099902604416225, + "language_loss": 0.9180485, + "learning_rate": 0.0009888219244713326, + "loss": 0.92933846, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.33447266, + "step": 496, + "time_per_iteration": 2.8516368865966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145932, + "balance_loss_mlp": 1.11152768, + "epoch": 0.09561369757599077, + "flos": 518739019776.0, + "grad_norm": 0.09295440988952328, + "language_loss": 0.91113585, + "learning_rate": 0.0009887563223026229, + "loss": 0.92259514, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.34423828, + "step": 497, + "time_per_iteration": 2.7165610790252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226975, + "balance_loss_mlp": 1.20780587, + "epoch": 0.09580607926125433, + "flos": 1384825849344.0, + "grad_norm": 0.04473280554485948, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80295134, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.19140625, + "step": 498, + "time_per_iteration": 4.9133830070495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158093, + "balance_loss_mlp": 1.12261629, + "epoch": 0.09599846094651789, + "flos": 717090969600.0, + "grad_norm": 0.0716278208231272, + "language_loss": 0.91129965, + "learning_rate": 0.0009886245487346482, + "loss": 0.92288053, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.35522461, + "step": 499, + "time_per_iteration": 3.074453353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151408, + "balance_loss_mlp": 1.1164794, + "epoch": 0.09619084263178146, + "flos": 385824909312.0, + "grad_norm": 0.09258819117654143, + "language_loss": 0.93041325, + "learning_rate": 0.0009885583773865422, + "loss": 0.94192737, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.34912109, + "step": 500, + "time_per_iteration": 2.4206974506378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129637, + "balance_loss_mlp": 1.09482849, + "epoch": 0.09638322431704502, + "flos": 533869401600.0, + "grad_norm": 0.08421486249996342, + "language_loss": 0.90840685, + "learning_rate": 0.0009884920163632524, + "loss": 0.9197033, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.34814453, + "step": 501, + "time_per_iteration": 2.653083324432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133899, + "balance_loss_mlp": 1.09875655, + "epoch": 0.09657560600230858, + "flos": 500426629632.0, + "grad_norm": 0.08831216016047307, + "language_loss": 0.92406952, + "learning_rate": 0.000988425465690543, + "loss": 0.93540847, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.35180664, + "step": 502, + "time_per_iteration": 2.5902318954467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129227, + "balance_loss_mlp": 1.09363079, + "epoch": 0.09676798768757214, + "flos": 528995197440.0, + "grad_norm": 0.08884204924947281, + "language_loss": 0.89819443, + "learning_rate": 0.0009883587253942505, + "loss": 0.90948665, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.35595703, + "step": 503, + "time_per_iteration": 2.7927231788635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134871, + "balance_loss_mlp": 1.09956098, + "epoch": 0.09696036937283571, + "flos": 463379857920.0, + "grad_norm": 0.08422879575374595, + "language_loss": 0.96091402, + "learning_rate": 0.0009882917955002862, + "loss": 0.97226262, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.35302734, + "step": 504, + "time_per_iteration": 2.538280963897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117737, + "balance_loss_mlp": 1.08297515, + "epoch": 0.09715275105809927, + "flos": 534716204544.0, + "grad_norm": 0.07639016770494517, + "language_loss": 0.89420688, + "learning_rate": 0.0009882246760346343, + "loss": 0.9053843, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.34790039, + "step": 505, + "time_per_iteration": 2.6242942810058594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124058, + "balance_loss_mlp": 1.08834267, + "epoch": 0.09734513274336283, + "flos": 454713979392.0, + "grad_norm": 0.11518068103281653, + "language_loss": 0.92468822, + "learning_rate": 0.0009881573670233533, + "loss": 0.93592882, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.35742188, + "step": 506, + "time_per_iteration": 2.516587734222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114247, + "balance_loss_mlp": 1.08074903, + "epoch": 0.09753751442862639, + "flos": 508551243264.0, + "grad_norm": 0.07574597822432369, + "language_loss": 0.8811729, + "learning_rate": 0.0009880898684925747, + "loss": 0.89231527, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.33520508, + "step": 507, + "time_per_iteration": 2.693880081176758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107378, + "balance_loss_mlp": 1.07402313, + "epoch": 0.09772989611388996, + "flos": 484030425600.0, + "grad_norm": 0.07603441014422499, + "language_loss": 0.86951101, + "learning_rate": 0.0009880221804685037, + "loss": 0.88058472, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.33374023, + "step": 508, + "time_per_iteration": 2.5847270488739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01468428, + "balance_loss_mlp": 1.44983101, + "epoch": 0.09792227779915352, + "flos": 1565306339328.0, + "grad_norm": 0.12348847609036423, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80812848, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.18554688, + "step": 509, + "time_per_iteration": 4.756307601928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123963, + "balance_loss_mlp": 1.09103727, + "epoch": 0.09811465948441708, + "flos": 587529165312.0, + "grad_norm": 0.08757433726580034, + "language_loss": 0.93106389, + "learning_rate": 0.0009878862360456733, + "loss": 0.9423036, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.32910156, + "step": 510, + "time_per_iteration": 2.6813509464263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110714, + "balance_loss_mlp": 1.07759809, + "epoch": 0.09830704116968064, + "flos": 612719285760.0, + "grad_norm": 0.08240718915912659, + "language_loss": 0.86918676, + "learning_rate": 0.0009878179796996922, + "loss": 0.88029397, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.33129883, + "step": 511, + "time_per_iteration": 2.7128310203552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113196, + "balance_loss_mlp": 1.08112836, + "epoch": 0.09849942285494422, + "flos": 538528227840.0, + "grad_norm": 0.07802243599022093, + "language_loss": 0.90101254, + "learning_rate": 0.0009877495339659754, + "loss": 0.91214454, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.32055664, + "step": 512, + "time_per_iteration": 2.8097684383392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102205, + "balance_loss_mlp": 1.07035255, + "epoch": 0.09869180454020778, + "flos": 620193535488.0, + "grad_norm": 0.09144065810451378, + "language_loss": 0.850245, + "learning_rate": 0.000987680898871096, + "loss": 0.86126709, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.31835938, + "step": 513, + "time_per_iteration": 2.7469441890716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108259, + "balance_loss_mlp": 1.07502341, + "epoch": 0.09888418622547133, + "flos": 811375363584.0, + "grad_norm": 0.10540688433367246, + "language_loss": 0.85520494, + "learning_rate": 0.0009876120744417, + "loss": 0.86628759, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.33251953, + "step": 514, + "time_per_iteration": 2.9515652656555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101037, + "balance_loss_mlp": 1.06818295, + "epoch": 0.0990765679107349, + "flos": 535548450816.0, + "grad_norm": 0.09508855922632749, + "language_loss": 0.93521011, + "learning_rate": 0.0009875430607045078, + "loss": 0.94622052, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.32861328, + "step": 515, + "time_per_iteration": 2.7193381786346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109471, + "balance_loss_mlp": 1.06164145, + "epoch": 0.09926894959599845, + "flos": 587607740928.0, + "grad_norm": 0.07449645219133615, + "language_loss": 0.90591514, + "learning_rate": 0.000987473857686313, + "loss": 0.91686225, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.33081055, + "step": 516, + "time_per_iteration": 2.7179975509643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115382, + "balance_loss_mlp": 1.08100188, + "epoch": 0.09946133128126203, + "flos": 640947409920.0, + "grad_norm": 0.10856360121839106, + "language_loss": 0.92182052, + "learning_rate": 0.0009874044654139824, + "loss": 0.9329744, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.34423828, + "step": 517, + "time_per_iteration": 2.7596991062164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135092, + "balance_loss_mlp": 1.10104585, + "epoch": 0.09965371296652559, + "flos": 465546327552.0, + "grad_norm": 0.10414801938878855, + "language_loss": 0.9130857, + "learning_rate": 0.0009873348839144563, + "loss": 0.92443669, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.34082031, + "step": 518, + "time_per_iteration": 2.5228705406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117267, + "balance_loss_mlp": 1.1381228, + "epoch": 0.09984609465178915, + "flos": 483365505024.0, + "grad_norm": 0.09626367264756285, + "language_loss": 0.94683075, + "learning_rate": 0.000987265113214749, + "loss": 0.95855749, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.34545898, + "step": 519, + "time_per_iteration": 2.5458812713623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118966, + "balance_loss_mlp": 1.15339625, + "epoch": 0.1000384763370527, + "flos": 568764260352.0, + "grad_norm": 0.12320854939875277, + "language_loss": 0.94298297, + "learning_rate": 0.0009871951533419476, + "loss": 0.95487958, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.36279297, + "step": 520, + "time_per_iteration": 2.663461208343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156318, + "balance_loss_mlp": 1.12010193, + "epoch": 0.10023085802231628, + "flos": 545515057152.0, + "grad_norm": 0.08720896475780489, + "language_loss": 0.86881042, + "learning_rate": 0.0009871250043232132, + "loss": 0.8803736, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.36206055, + "step": 521, + "time_per_iteration": 2.7820796966552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140376, + "balance_loss_mlp": 1.1049943, + "epoch": 0.10042323970757984, + "flos": 503208557568.0, + "grad_norm": 0.08876661910472074, + "language_loss": 0.85204661, + "learning_rate": 0.0009870546661857797, + "loss": 0.86345041, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.35375977, + "step": 522, + "time_per_iteration": 2.634274482727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152268, + "balance_loss_mlp": 1.11583781, + "epoch": 0.1006156213928434, + "flos": 770084402688.0, + "grad_norm": 0.08623162465623763, + "language_loss": 0.92886114, + "learning_rate": 0.0009869841389569553, + "loss": 0.94038385, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.36401367, + "step": 523, + "time_per_iteration": 3.0027353763580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151068, + "balance_loss_mlp": 1.11571026, + "epoch": 0.10080800307810696, + "flos": 489786338304.0, + "grad_norm": 0.07820731611640971, + "language_loss": 0.86882633, + "learning_rate": 0.0009869134226641206, + "loss": 0.880337, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.35424805, + "step": 524, + "time_per_iteration": 2.5850446224212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159546, + "balance_loss_mlp": 1.12330627, + "epoch": 0.10100038476337053, + "flos": 454478252544.0, + "grad_norm": 0.07931950894681525, + "language_loss": 0.86448371, + "learning_rate": 0.0009868425173347303, + "loss": 0.8760792, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.36254883, + "step": 525, + "time_per_iteration": 2.6873726844787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171885, + "balance_loss_mlp": 1.13617015, + "epoch": 0.10119276644863409, + "flos": 556155348480.0, + "grad_norm": 0.09671662269899156, + "language_loss": 0.94872439, + "learning_rate": 0.0009867714229963125, + "loss": 0.96044326, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.35717773, + "step": 526, + "time_per_iteration": 2.697547197341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155786, + "balance_loss_mlp": 1.12083411, + "epoch": 0.10138514813389765, + "flos": 515990587392.0, + "grad_norm": 0.10324452979849556, + "language_loss": 0.9236598, + "learning_rate": 0.000986700139676468, + "loss": 0.93521762, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.34960938, + "step": 527, + "time_per_iteration": 2.5702626705169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171342, + "balance_loss_mlp": 1.1346494, + "epoch": 0.10157752981916121, + "flos": 500323322880.0, + "grad_norm": 0.08227699709590157, + "language_loss": 0.89510548, + "learning_rate": 0.0009866286674028717, + "loss": 0.90681893, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.36694336, + "step": 528, + "time_per_iteration": 2.699542284011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141961, + "balance_loss_mlp": 1.1081537, + "epoch": 0.10176991150442478, + "flos": 656444376576.0, + "grad_norm": 0.0843490367773928, + "language_loss": 0.8638742, + "learning_rate": 0.0009865570062032717, + "loss": 0.87529385, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.33837891, + "step": 529, + "time_per_iteration": 2.941728353500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114882, + "balance_loss_mlp": 1.11420166, + "epoch": 0.10196229318968834, + "flos": 572974953984.0, + "grad_norm": 0.07671472850746988, + "language_loss": 0.9148134, + "learning_rate": 0.0009864851561054893, + "loss": 0.9263016, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.34643555, + "step": 530, + "time_per_iteration": 2.7894959449768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147452, + "balance_loss_mlp": 1.1134541, + "epoch": 0.1021546748749519, + "flos": 517946061312.0, + "grad_norm": 0.08702044825545475, + "language_loss": 0.90471494, + "learning_rate": 0.0009864131171374191, + "loss": 0.91618943, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.34033203, + "step": 531, + "time_per_iteration": 2.6681158542633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144724, + "balance_loss_mlp": 1.11139297, + "epoch": 0.10234705656021546, + "flos": 609470286336.0, + "grad_norm": 0.0664826941787488, + "language_loss": 0.89538574, + "learning_rate": 0.0009863408893270292, + "loss": 0.90683293, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.33349609, + "step": 532, + "time_per_iteration": 2.7965428829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129812, + "balance_loss_mlp": 1.09576535, + "epoch": 0.10253943824547904, + "flos": 601473710592.0, + "grad_norm": 0.08878024025613328, + "language_loss": 0.84706688, + "learning_rate": 0.0009862684727023605, + "loss": 0.858365, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.34082031, + "step": 533, + "time_per_iteration": 2.7238268852233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116987, + "balance_loss_mlp": 1.08453798, + "epoch": 0.1027318199307426, + "flos": 662647011840.0, + "grad_norm": 0.1682383439962665, + "language_loss": 0.87668955, + "learning_rate": 0.0009861958672915283, + "loss": 0.8878594, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.32446289, + "step": 534, + "time_per_iteration": 2.7945988178253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096267, + "balance_loss_mlp": 1.06415248, + "epoch": 0.10292420161600616, + "flos": 682962928128.0, + "grad_norm": 0.0654465541126679, + "language_loss": 0.88598454, + "learning_rate": 0.0009861230731227201, + "loss": 0.89694726, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.32104492, + "step": 535, + "time_per_iteration": 2.8504462242126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094849, + "balance_loss_mlp": 1.06180418, + "epoch": 0.10311658330126972, + "flos": 490042414080.0, + "grad_norm": 0.09703481929017231, + "language_loss": 0.90092826, + "learning_rate": 0.0009860500902241973, + "loss": 0.91187674, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.33056641, + "step": 536, + "time_per_iteration": 2.6230618953704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093921, + "balance_loss_mlp": 1.06028032, + "epoch": 0.10330896498653329, + "flos": 431508446208.0, + "grad_norm": 0.07541190921269121, + "language_loss": 0.94890571, + "learning_rate": 0.0009859769186242942, + "loss": 0.95984495, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.33642578, + "step": 537, + "time_per_iteration": 2.5023155212402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090493, + "balance_loss_mlp": 1.05802083, + "epoch": 0.10350134667179685, + "flos": 549330052608.0, + "grad_norm": 0.08038513642950565, + "language_loss": 0.87629044, + "learning_rate": 0.0009859035583514187, + "loss": 0.88719535, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.32470703, + "step": 538, + "time_per_iteration": 2.617408514022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102021, + "balance_loss_mlp": 1.06885695, + "epoch": 0.10369372835706041, + "flos": 640327569408.0, + "grad_norm": 0.08463096218018039, + "language_loss": 0.88947332, + "learning_rate": 0.0009858300094340517, + "loss": 0.9004935, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.33178711, + "step": 539, + "time_per_iteration": 2.7788918018341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102321, + "balance_loss_mlp": 1.06989646, + "epoch": 0.10388611004232397, + "flos": 521500598784.0, + "grad_norm": 0.08363201697238119, + "language_loss": 0.84166092, + "learning_rate": 0.0009857562719007473, + "loss": 0.85268414, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.32421875, + "step": 540, + "time_per_iteration": 2.6021273136138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106344, + "balance_loss_mlp": 1.07349014, + "epoch": 0.10407849172758753, + "flos": 702111946752.0, + "grad_norm": 0.07699058030721453, + "language_loss": 0.86313522, + "learning_rate": 0.0009856823457801331, + "loss": 0.87419868, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.32861328, + "step": 541, + "time_per_iteration": 2.898247003555298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121037, + "balance_loss_mlp": 1.0881114, + "epoch": 0.1042708734128511, + "flos": 502652736000.0, + "grad_norm": 0.09427475874312204, + "language_loss": 0.92884254, + "learning_rate": 0.00098560823110091, + "loss": 0.94005299, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.3293457, + "step": 542, + "time_per_iteration": 2.628246784210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117484, + "balance_loss_mlp": 1.08441556, + "epoch": 0.10446325509811466, + "flos": 485331153408.0, + "grad_norm": 0.09038961872332987, + "language_loss": 0.93836176, + "learning_rate": 0.000985533927891851, + "loss": 0.94953668, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.33081055, + "step": 543, + "time_per_iteration": 2.6802377700805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104521, + "balance_loss_mlp": 1.07114232, + "epoch": 0.10465563678337822, + "flos": 568365590016.0, + "grad_norm": 0.07979198382497373, + "language_loss": 0.91847962, + "learning_rate": 0.0009854594361818044, + "loss": 0.9295249, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.33398438, + "step": 544, + "time_per_iteration": 2.6934244632720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097218, + "balance_loss_mlp": 1.06372046, + "epoch": 0.10484801846864178, + "flos": 625806853632.0, + "grad_norm": 0.070981397623147, + "language_loss": 0.91175914, + "learning_rate": 0.0009853847559996897, + "loss": 0.92273128, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.33520508, + "step": 545, + "time_per_iteration": 2.7615010738372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121637, + "balance_loss_mlp": 1.08713746, + "epoch": 0.10504040015390535, + "flos": 743063874048.0, + "grad_norm": 0.07225830349373973, + "language_loss": 0.90024251, + "learning_rate": 0.0009853098873745, + "loss": 0.91145885, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.34545898, + "step": 546, + "time_per_iteration": 2.995853900909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128427, + "balance_loss_mlp": 1.09407067, + "epoch": 0.10523278183916891, + "flos": 586382616576.0, + "grad_norm": 0.08430865527250554, + "language_loss": 0.89361405, + "learning_rate": 0.0009852348303353027, + "loss": 0.90489835, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.34399414, + "step": 547, + "time_per_iteration": 2.7888100147247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141582, + "balance_loss_mlp": 1.106511, + "epoch": 0.10542516352443247, + "flos": 869270552064.0, + "grad_norm": 0.07123259169118071, + "language_loss": 0.82929194, + "learning_rate": 0.000985159584911237, + "loss": 0.84070778, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.35107422, + "step": 548, + "time_per_iteration": 3.11181902885437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141867, + "balance_loss_mlp": 1.10658062, + "epoch": 0.10561754520969603, + "flos": 505182970368.0, + "grad_norm": 0.1040806422735416, + "language_loss": 0.89825702, + "learning_rate": 0.0009850841511315162, + "loss": 0.90967572, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.35327148, + "step": 549, + "time_per_iteration": 2.638000726699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130362, + "balance_loss_mlp": 1.09493339, + "epoch": 0.1058099268949596, + "flos": 559690947072.0, + "grad_norm": 0.07056487851665215, + "language_loss": 0.9078036, + "learning_rate": 0.0009850085290254256, + "loss": 0.9191072, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.35424805, + "step": 550, + "time_per_iteration": 2.774028778076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117666, + "balance_loss_mlp": 1.08273757, + "epoch": 0.10600230858022316, + "flos": 561773048832.0, + "grad_norm": 0.06745406591759516, + "language_loss": 0.87385082, + "learning_rate": 0.0009849327186223246, + "loss": 0.88502753, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.34936523, + "step": 551, + "time_per_iteration": 2.7669272422790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110145, + "balance_loss_mlp": 1.06845236, + "epoch": 0.10619469026548672, + "flos": 494079989760.0, + "grad_norm": 0.0691737715515626, + "language_loss": 0.94504517, + "learning_rate": 0.000984856719951646, + "loss": 0.95605963, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.33007812, + "step": 552, + "time_per_iteration": 2.5428550243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111854, + "balance_loss_mlp": 1.07747412, + "epoch": 0.10638707195075028, + "flos": 675843678720.0, + "grad_norm": 0.09712099675981889, + "language_loss": 0.91101605, + "learning_rate": 0.0009847805330428943, + "loss": 0.92213452, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.34399414, + "step": 553, + "time_per_iteration": 2.9055614471435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122894, + "balance_loss_mlp": 1.08846664, + "epoch": 0.10657945363601386, + "flos": 487811925504.0, + "grad_norm": 0.09294887941398464, + "language_loss": 0.92195344, + "learning_rate": 0.0009847041579256481, + "loss": 0.93318236, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.34448242, + "step": 554, + "time_per_iteration": 2.5995588302612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124167, + "balance_loss_mlp": 1.08859539, + "epoch": 0.10677183532127742, + "flos": 482706376704.0, + "grad_norm": 0.08058010800108027, + "language_loss": 0.94049567, + "learning_rate": 0.0009846275946295592, + "loss": 0.9517374, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.35595703, + "step": 555, + "time_per_iteration": 2.6564345359802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114817, + "balance_loss_mlp": 1.07919669, + "epoch": 0.10696421700654098, + "flos": 655917668352.0, + "grad_norm": 0.06398894491712905, + "language_loss": 0.86843902, + "learning_rate": 0.0009845508431843518, + "loss": 0.87958717, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.35620117, + "step": 556, + "time_per_iteration": 3.0014877319335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112252, + "balance_loss_mlp": 1.07675159, + "epoch": 0.10715659869180454, + "flos": 567483881472.0, + "grad_norm": 0.06905237280169106, + "language_loss": 0.87712479, + "learning_rate": 0.0009844739036198233, + "loss": 0.88824731, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.35522461, + "step": 557, + "time_per_iteration": 2.6663765907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126883, + "balance_loss_mlp": 1.09026217, + "epoch": 0.10734898037706811, + "flos": 540432829440.0, + "grad_norm": 0.08117667522677224, + "language_loss": 0.94649851, + "learning_rate": 0.0009843967759658448, + "loss": 0.95776731, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.36621094, + "step": 558, + "time_per_iteration": 2.6776351928710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01325803, + "balance_loss_mlp": 1.29795551, + "epoch": 0.10754136206233167, + "flos": 1475870008320.0, + "grad_norm": 0.07702272040631068, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74093556, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.27929688, + "step": 559, + "time_per_iteration": 4.862372398376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112585, + "balance_loss_mlp": 1.08906162, + "epoch": 0.10773374374759523, + "flos": 512155243008.0, + "grad_norm": 0.07411063690195181, + "language_loss": 0.94592023, + "learning_rate": 0.000984241956509384, + "loss": 0.95717871, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.36767578, + "step": 560, + "time_per_iteration": 2.6602537631988525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152944, + "balance_loss_mlp": 1.11455846, + "epoch": 0.10792612543285879, + "flos": 496261016064.0, + "grad_norm": 0.08630165838839422, + "language_loss": 0.89956963, + "learning_rate": 0.0009841642647670078, + "loss": 0.91109908, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.38378906, + "step": 561, + "time_per_iteration": 2.5539767742156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153249, + "balance_loss_mlp": 1.11433935, + "epoch": 0.10811850711812235, + "flos": 735131317248.0, + "grad_norm": 0.09499730641116207, + "language_loss": 0.84606594, + "learning_rate": 0.0009840863850553944, + "loss": 0.85759842, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.38867188, + "step": 562, + "time_per_iteration": 2.972862720489502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139333, + "balance_loss_mlp": 1.10261655, + "epoch": 0.10831088880338592, + "flos": 611257024512.0, + "grad_norm": 0.08740431235801023, + "language_loss": 0.90812922, + "learning_rate": 0.0009840083174047782, + "loss": 0.91952258, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.3671875, + "step": 563, + "time_per_iteration": 2.728081464767456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133161, + "balance_loss_mlp": 1.09739876, + "epoch": 0.10850327048864948, + "flos": 556022928384.0, + "grad_norm": 0.09202985623691126, + "language_loss": 0.85552108, + "learning_rate": 0.0009839300618454685, + "loss": 0.8668527, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.35791016, + "step": 564, + "time_per_iteration": 2.833817958831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130452, + "balance_loss_mlp": 1.09538078, + "epoch": 0.10869565217391304, + "flos": 602902476288.0, + "grad_norm": 0.06834466327041812, + "language_loss": 0.90596354, + "learning_rate": 0.0009838516184078466, + "loss": 0.91726804, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.35131836, + "step": 565, + "time_per_iteration": 2.8160781860351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154605, + "balance_loss_mlp": 1.1185081, + "epoch": 0.1088880338591766, + "flos": 525922288128.0, + "grad_norm": 0.07188227567019471, + "language_loss": 0.87634718, + "learning_rate": 0.0009837729871223669, + "loss": 0.88789332, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.36083984, + "step": 566, + "time_per_iteration": 2.62117600440979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177195, + "balance_loss_mlp": 1.1406219, + "epoch": 0.10908041554444017, + "flos": 619986921984.0, + "grad_norm": 0.08533641778088655, + "language_loss": 0.88115579, + "learning_rate": 0.0009836941680195568, + "loss": 0.89292771, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.36547852, + "step": 567, + "time_per_iteration": 2.828911542892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165998, + "balance_loss_mlp": 1.12994933, + "epoch": 0.10927279722970373, + "flos": 897740195328.0, + "grad_norm": 0.08003102464580239, + "language_loss": 0.83622086, + "learning_rate": 0.0009836151611300166, + "loss": 0.84788084, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.3605957, + "step": 568, + "time_per_iteration": 3.2273471355438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114699, + "balance_loss_mlp": 1.11177564, + "epoch": 0.10946517891496729, + "flos": 528408852480.0, + "grad_norm": 0.13762061821089808, + "language_loss": 0.94344527, + "learning_rate": 0.0009835359664844194, + "loss": 0.95491517, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.35253906, + "step": 569, + "time_per_iteration": 2.61690616607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424326, + "balance_loss_mlp": 1.39514339, + "epoch": 0.10965756060023085, + "flos": 1559944714752.0, + "grad_norm": 0.09677893451051751, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82461131, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.29101562, + "step": 570, + "time_per_iteration": 4.929012298583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129446, + "balance_loss_mlp": 1.09449339, + "epoch": 0.10984994228549443, + "flos": 512820163584.0, + "grad_norm": 0.10645850756285262, + "language_loss": 0.9142105, + "learning_rate": 0.0009833770140481118, + "loss": 0.92550498, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.34985352, + "step": 571, + "time_per_iteration": 2.6662757396698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122373, + "balance_loss_mlp": 1.08689654, + "epoch": 0.11004232397075799, + "flos": 954314307072.0, + "grad_norm": 0.12031633973381815, + "language_loss": 0.82440388, + "learning_rate": 0.000983297256319112, + "loss": 0.83562756, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.35522461, + "step": 572, + "time_per_iteration": 3.218076467514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134932, + "balance_loss_mlp": 1.09850204, + "epoch": 0.11023470565602154, + "flos": 487921024512.0, + "grad_norm": 0.08427819288291502, + "language_loss": 0.86899912, + "learning_rate": 0.000983217310957477, + "loss": 0.88034844, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.36425781, + "step": 573, + "time_per_iteration": 2.778571128845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144342, + "balance_loss_mlp": 1.10803151, + "epoch": 0.1104270873412851, + "flos": 655521970176.0, + "grad_norm": 0.06509507329480971, + "language_loss": 0.90168923, + "learning_rate": 0.000983137177994244, + "loss": 0.91313267, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.36352539, + "step": 574, + "time_per_iteration": 2.872412919998169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137496, + "balance_loss_mlp": 1.10221016, + "epoch": 0.11061946902654868, + "flos": 723097165824.0, + "grad_norm": 0.06653120926816534, + "language_loss": 0.85785711, + "learning_rate": 0.0009830568574605235, + "loss": 0.86923206, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.35302734, + "step": 575, + "time_per_iteration": 2.923383951187134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145913, + "balance_loss_mlp": 1.10984039, + "epoch": 0.11081185071181224, + "flos": 835113397248.0, + "grad_norm": 0.0865486301410286, + "language_loss": 0.87525302, + "learning_rate": 0.0009829763493874992, + "loss": 0.88671219, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.36083984, + "step": 576, + "time_per_iteration": 3.032942056655884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134136, + "balance_loss_mlp": 1.09753847, + "epoch": 0.1110042323970758, + "flos": 608776252416.0, + "grad_norm": 0.08630194081372794, + "language_loss": 0.93183506, + "learning_rate": 0.0009828956538064264, + "loss": 0.94317639, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.36621094, + "step": 577, + "time_per_iteration": 2.8152406215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125176, + "balance_loss_mlp": 1.0888648, + "epoch": 0.11119661408233936, + "flos": 595643604480.0, + "grad_norm": 0.07101537919866721, + "language_loss": 0.90824157, + "learning_rate": 0.0009828147707486344, + "loss": 0.91949332, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.36328125, + "step": 578, + "time_per_iteration": 2.724550485610962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118046, + "balance_loss_mlp": 1.08209252, + "epoch": 0.11138899576760293, + "flos": 555573385728.0, + "grad_norm": 0.08130034202286071, + "language_loss": 0.86348194, + "learning_rate": 0.0009827337002455245, + "loss": 0.8746624, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.35961914, + "step": 579, + "time_per_iteration": 2.652369976043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111199, + "balance_loss_mlp": 1.07579851, + "epoch": 0.11158137745286649, + "flos": 689418667008.0, + "grad_norm": 0.06366605788409145, + "language_loss": 0.88115346, + "learning_rate": 0.0009826524423285712, + "loss": 0.89227337, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.36181641, + "step": 580, + "time_per_iteration": 2.947925567626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108192, + "balance_loss_mlp": 1.07192874, + "epoch": 0.11177375913813005, + "flos": 762688728576.0, + "grad_norm": 0.08930617061108917, + "language_loss": 0.88938302, + "learning_rate": 0.0009825709970293218, + "loss": 0.90046495, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.36303711, + "step": 581, + "time_per_iteration": 2.8744056224823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103655, + "balance_loss_mlp": 1.06731987, + "epoch": 0.11196614082339361, + "flos": 806211588096.0, + "grad_norm": 0.07222891797599594, + "language_loss": 0.95056951, + "learning_rate": 0.0009824893643793956, + "loss": 0.96160614, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.36328125, + "step": 582, + "time_per_iteration": 3.051945209503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104834, + "balance_loss_mlp": 1.06811786, + "epoch": 0.11215852250865718, + "flos": 558350931456.0, + "grad_norm": 0.0803498647914251, + "language_loss": 0.88078201, + "learning_rate": 0.0009824075444104857, + "loss": 0.89183033, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.3671875, + "step": 583, + "time_per_iteration": 2.6833813190460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111764, + "balance_loss_mlp": 1.07507193, + "epoch": 0.11235090419392074, + "flos": 513322140672.0, + "grad_norm": 0.08148632832875594, + "language_loss": 0.93207705, + "learning_rate": 0.000982325537154357, + "loss": 0.94319463, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.36694336, + "step": 584, + "time_per_iteration": 2.582225799560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113364, + "balance_loss_mlp": 1.07574129, + "epoch": 0.1125432858791843, + "flos": 491209311744.0, + "grad_norm": 0.08313203670373176, + "language_loss": 0.93823397, + "learning_rate": 0.0009822433426428484, + "loss": 0.94936764, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.37597656, + "step": 585, + "time_per_iteration": 2.568070888519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113305, + "balance_loss_mlp": 1.07594514, + "epoch": 0.11273566756444786, + "flos": 510476193792.0, + "grad_norm": 0.07694998173228458, + "language_loss": 0.86627567, + "learning_rate": 0.0009821609609078697, + "loss": 0.87740874, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.37304688, + "step": 586, + "time_per_iteration": 2.658702850341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103349, + "balance_loss_mlp": 1.06775331, + "epoch": 0.11292804924971142, + "flos": 622149009408.0, + "grad_norm": 0.10421690738013599, + "language_loss": 0.89634144, + "learning_rate": 0.0009820783919814045, + "loss": 0.90737498, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.35620117, + "step": 587, + "time_per_iteration": 2.803866386413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109643, + "balance_loss_mlp": 1.07295036, + "epoch": 0.113120430934975, + "flos": 477811823616.0, + "grad_norm": 0.07979925286699333, + "language_loss": 0.82699567, + "learning_rate": 0.0009819956358955095, + "loss": 0.83809209, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.36669922, + "step": 588, + "time_per_iteration": 2.5929653644561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110362, + "balance_loss_mlp": 1.07433677, + "epoch": 0.11331281262023855, + "flos": 466801975296.0, + "grad_norm": 0.07216149622243874, + "language_loss": 0.83354205, + "learning_rate": 0.0009819126926823127, + "loss": 0.84464574, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.36035156, + "step": 589, + "time_per_iteration": 2.5229685306549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122798, + "balance_loss_mlp": 1.08658195, + "epoch": 0.11350519430550211, + "flos": 650164727808.0, + "grad_norm": 0.08255396626581768, + "language_loss": 0.86631322, + "learning_rate": 0.000981829562374016, + "loss": 0.87754118, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.36279297, + "step": 590, + "time_per_iteration": 2.7939858436584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124668, + "balance_loss_mlp": 1.08804727, + "epoch": 0.11369757599076567, + "flos": 557547798528.0, + "grad_norm": 0.07763031144810686, + "language_loss": 0.97565413, + "learning_rate": 0.0009817462450028933, + "loss": 0.98690081, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.3659668, + "step": 591, + "time_per_iteration": 2.651886224746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115476, + "balance_loss_mlp": 1.07918823, + "epoch": 0.11388995767602925, + "flos": 570774988800.0, + "grad_norm": 0.0679599519530346, + "language_loss": 0.85396111, + "learning_rate": 0.0009816627406012916, + "loss": 0.86511576, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.36303711, + "step": 592, + "time_per_iteration": 2.8203041553497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117009, + "balance_loss_mlp": 1.08079314, + "epoch": 0.1140823393612928, + "flos": 740069540352.0, + "grad_norm": 0.07941270182617734, + "language_loss": 0.84330916, + "learning_rate": 0.0009815790492016295, + "loss": 0.85447925, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.36254883, + "step": 593, + "time_per_iteration": 2.952115058898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111293, + "balance_loss_mlp": 1.07529223, + "epoch": 0.11427472104655637, + "flos": 698694211584.0, + "grad_norm": 0.08575724683449225, + "language_loss": 0.86948562, + "learning_rate": 0.0009814951708363993, + "loss": 0.88059855, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.35986328, + "step": 594, + "time_per_iteration": 2.851818084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259601, + "balance_loss_mlp": 1.23633182, + "epoch": 0.11446710273181993, + "flos": 1476387952128.0, + "grad_norm": 0.04120161092279284, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79250586, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.23242188, + "step": 595, + "time_per_iteration": 4.775157928466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107308, + "balance_loss_mlp": 1.07159305, + "epoch": 0.1146594844170835, + "flos": 494641603584.0, + "grad_norm": 0.06441778711855077, + "language_loss": 0.87857854, + "learning_rate": 0.0009813268533395648, + "loss": 0.8896516, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.35717773, + "step": 596, + "time_per_iteration": 2.5812032222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117343, + "balance_loss_mlp": 1.08096087, + "epoch": 0.11485186610234706, + "flos": 474596319744.0, + "grad_norm": 0.07680000680618568, + "language_loss": 0.87010378, + "learning_rate": 0.0009812424142733073, + "loss": 0.8812772, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.36401367, + "step": 597, + "time_per_iteration": 2.5546822547912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108076, + "balance_loss_mlp": 1.07212269, + "epoch": 0.11504424778761062, + "flos": 730858014720.0, + "grad_norm": 0.05681390422854521, + "language_loss": 0.8607024, + "learning_rate": 0.000981157788372175, + "loss": 0.87178314, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.35961914, + "step": 598, + "time_per_iteration": 3.0337140560150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111428, + "balance_loss_mlp": 1.07851696, + "epoch": 0.11523662947287418, + "flos": 545539788288.0, + "grad_norm": 0.06941688855783729, + "language_loss": 0.89018178, + "learning_rate": 0.0009810729756690223, + "loss": 0.90132457, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.35791016, + "step": 599, + "time_per_iteration": 2.7217423915863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105745, + "balance_loss_mlp": 1.06981504, + "epoch": 0.11542901115813775, + "flos": 774737436672.0, + "grad_norm": 0.06146114558588388, + "language_loss": 0.91738331, + "learning_rate": 0.0009809879761967766, + "loss": 0.92844075, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.35961914, + "step": 600, + "time_per_iteration": 2.9604732990264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111848, + "balance_loss_mlp": 1.08178735, + "epoch": 0.11562139284340131, + "flos": 730585972224.0, + "grad_norm": 0.09570347165582511, + "language_loss": 0.86368775, + "learning_rate": 0.0009809027899884378, + "loss": 0.87487245, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.36669922, + "step": 601, + "time_per_iteration": 2.9237759113311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114897, + "balance_loss_mlp": 1.07787061, + "epoch": 0.11581377452866487, + "flos": 535589148672.0, + "grad_norm": 0.05752007897304988, + "language_loss": 0.88791043, + "learning_rate": 0.0009808174170770779, + "loss": 0.89905941, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.37036133, + "step": 602, + "time_per_iteration": 2.8171939849853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192093, + "balance_loss_mlp": 1.1680603, + "epoch": 0.11600615621392843, + "flos": 1554968613888.0, + "grad_norm": 0.017614530082332158, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86090338, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.24023438, + "step": 603, + "time_per_iteration": 4.935450077056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109032, + "balance_loss_mlp": 1.07360268, + "epoch": 0.116198537899192, + "flos": 537178037760.0, + "grad_norm": 0.08737735767926022, + "language_loss": 0.93595141, + "learning_rate": 0.0009806461112779462, + "loss": 0.94704169, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.35449219, + "step": 604, + "time_per_iteration": 2.644521951675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110331, + "balance_loss_mlp": 1.07454431, + "epoch": 0.11639091958445556, + "flos": 453970483200.0, + "grad_norm": 0.09922875403821595, + "language_loss": 0.8811909, + "learning_rate": 0.0009805601784566814, + "loss": 0.89229423, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.3581543, + "step": 605, + "time_per_iteration": 2.4736506938934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107193, + "balance_loss_mlp": 1.07209802, + "epoch": 0.11658330126971912, + "flos": 554815332864.0, + "grad_norm": 0.08013857685507157, + "language_loss": 0.95075512, + "learning_rate": 0.0009804740590654089, + "loss": 0.9618271, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.35131836, + "step": 606, + "time_per_iteration": 2.665424346923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121356, + "balance_loss_mlp": 1.08540201, + "epoch": 0.11677568295498268, + "flos": 716025968640.0, + "grad_norm": 0.09308217257663119, + "language_loss": 0.89792109, + "learning_rate": 0.0009803877531375635, + "loss": 0.90913463, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.359375, + "step": 607, + "time_per_iteration": 2.854362964630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123257, + "balance_loss_mlp": 1.08725595, + "epoch": 0.11696806464024626, + "flos": 609474668544.0, + "grad_norm": 0.12019278373574431, + "language_loss": 0.90837669, + "learning_rate": 0.0009803012607066523, + "loss": 0.91960925, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.36035156, + "step": 608, + "time_per_iteration": 2.7351131439208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132428, + "balance_loss_mlp": 1.0963558, + "epoch": 0.11716044632550981, + "flos": 520127087616.0, + "grad_norm": 0.06325710240785508, + "language_loss": 0.89651906, + "learning_rate": 0.0009802145818062543, + "loss": 0.90784335, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.36083984, + "step": 609, + "time_per_iteration": 2.706399440765381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126801, + "balance_loss_mlp": 1.09060943, + "epoch": 0.11735282801077337, + "flos": 507246133248.0, + "grad_norm": 0.08665503616765245, + "language_loss": 0.91646838, + "learning_rate": 0.0009801277164700212, + "loss": 0.9277364, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.36230469, + "step": 610, + "time_per_iteration": 2.591233730316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116515, + "balance_loss_mlp": 1.08137226, + "epoch": 0.11754520969603693, + "flos": 686339965440.0, + "grad_norm": 0.07536960859650275, + "language_loss": 0.8969053, + "learning_rate": 0.0009800406647316776, + "loss": 0.90807045, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.35180664, + "step": 611, + "time_per_iteration": 2.8590939044952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199931, + "balance_loss_mlp": 1.17360973, + "epoch": 0.1177375913813005, + "flos": 1541673022464.0, + "grad_norm": 0.02828241364524735, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.7811439, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.26367188, + "step": 612, + "time_per_iteration": 4.794836759567261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126093, + "balance_loss_mlp": 1.08999705, + "epoch": 0.11792997306656407, + "flos": 520269682176.0, + "grad_norm": 0.07086643363198573, + "language_loss": 0.88838685, + "learning_rate": 0.000979866002183916, + "loss": 0.89964771, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.36132812, + "step": 613, + "time_per_iteration": 2.6570141315460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113543, + "balance_loss_mlp": 1.07711244, + "epoch": 0.11812235475182763, + "flos": 665980379136.0, + "grad_norm": 0.0718552990374983, + "language_loss": 0.89756042, + "learning_rate": 0.0009797783914423082, + "loss": 0.90869588, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.36425781, + "step": 614, + "time_per_iteration": 2.8077588081359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104148, + "balance_loss_mlp": 1.06867135, + "epoch": 0.11831473643709119, + "flos": 621021399552.0, + "grad_norm": 0.06673690234795807, + "language_loss": 0.84267712, + "learning_rate": 0.0009796905944342094, + "loss": 0.85371858, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.35498047, + "step": 615, + "time_per_iteration": 2.848975896835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109381, + "balance_loss_mlp": 1.07271254, + "epoch": 0.11850711812235475, + "flos": 456438108672.0, + "grad_norm": 0.05638104592328917, + "language_loss": 0.88746947, + "learning_rate": 0.0009796026111937057, + "loss": 0.89856327, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.36645508, + "step": 616, + "time_per_iteration": 2.6446924209594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099952, + "balance_loss_mlp": 1.06347418, + "epoch": 0.11869949980761832, + "flos": 513598565376.0, + "grad_norm": 0.0626967176734064, + "language_loss": 0.88544255, + "learning_rate": 0.0009795144417549552, + "loss": 0.89644206, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.36474609, + "step": 617, + "time_per_iteration": 2.69419527053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103508, + "balance_loss_mlp": 1.0669111, + "epoch": 0.11889188149288188, + "flos": 534732171264.0, + "grad_norm": 0.05994069078035177, + "language_loss": 0.89591199, + "learning_rate": 0.0009794260861521883, + "loss": 0.90694714, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.36621094, + "step": 618, + "time_per_iteration": 2.771303653717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098344, + "balance_loss_mlp": 1.06262898, + "epoch": 0.11908426317814544, + "flos": 498344527872.0, + "grad_norm": 0.09079788596459537, + "language_loss": 0.86586368, + "learning_rate": 0.0009793375444197075, + "loss": 0.87684715, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.35742188, + "step": 619, + "time_per_iteration": 2.6239778995513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103508, + "balance_loss_mlp": 1.06724489, + "epoch": 0.119276644863409, + "flos": 659598833664.0, + "grad_norm": 0.07776663130635876, + "language_loss": 0.84681749, + "learning_rate": 0.000979248816591888, + "loss": 0.85785258, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.36254883, + "step": 620, + "time_per_iteration": 2.7932288646698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106745, + "balance_loss_mlp": 1.07043433, + "epoch": 0.11946902654867257, + "flos": 758396487168.0, + "grad_norm": 0.06665125523581683, + "language_loss": 0.85644066, + "learning_rate": 0.0009791599027031766, + "loss": 0.86750811, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.36303711, + "step": 621, + "time_per_iteration": 3.0138871669769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108028, + "balance_loss_mlp": 1.0721699, + "epoch": 0.11966140823393613, + "flos": 680697533952.0, + "grad_norm": 0.06722173914854768, + "language_loss": 0.85452718, + "learning_rate": 0.0009790708027880932, + "loss": 0.86560744, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.359375, + "step": 622, + "time_per_iteration": 2.8520967960357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217123, + "balance_loss_mlp": 1.192518, + "epoch": 0.11985378991919969, + "flos": 1450268070912.0, + "grad_norm": 0.04692620020290901, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78644413, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.24511719, + "step": 623, + "time_per_iteration": 4.820342302322388 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117536, + "balance_loss_mlp": 1.08251202, + "epoch": 0.12004617160446325, + "flos": 527586780672.0, + "grad_norm": 0.0795104629545964, + "language_loss": 0.93134129, + "learning_rate": 0.0009788920450172487, + "loss": 0.94251657, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.35058594, + "step": 624, + "time_per_iteration": 2.617030143737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112825, + "balance_loss_mlp": 1.09265435, + "epoch": 0.12023855328972682, + "flos": 473980861440.0, + "grad_norm": 0.07884849751459712, + "language_loss": 0.90174961, + "learning_rate": 0.0009788023872308875, + "loss": 0.91303217, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.35620117, + "step": 625, + "time_per_iteration": 2.5254392623901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218941, + "balance_loss_mlp": 1.19519401, + "epoch": 0.12043093497499038, + "flos": 1530954155520.0, + "grad_norm": 0.02704118444179952, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76647937, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.23730469, + "step": 626, + "time_per_iteration": 4.7286646366119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114575, + "balance_loss_mlp": 1.07936025, + "epoch": 0.12062331666025394, + "flos": 539571469824.0, + "grad_norm": 0.06954804859514781, + "language_loss": 0.9379338, + "learning_rate": 0.0009786225140303285, + "loss": 0.94907951, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.35253906, + "step": 627, + "time_per_iteration": 2.648557424545288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117384, + "balance_loss_mlp": 1.08155024, + "epoch": 0.1208156983455175, + "flos": 511634327040.0, + "grad_norm": 0.07877419782543724, + "language_loss": 0.91490531, + "learning_rate": 0.0009785322986859634, + "loss": 0.92607915, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.35864258, + "step": 628, + "time_per_iteration": 2.7282159328460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125787, + "balance_loss_mlp": 1.09014332, + "epoch": 0.12100808003078108, + "flos": 596195043840.0, + "grad_norm": 0.07794762914430453, + "language_loss": 0.92512405, + "learning_rate": 0.0009784418975588838, + "loss": 0.936382, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.35668945, + "step": 629, + "time_per_iteration": 2.709716320037842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117597, + "balance_loss_mlp": 1.08099949, + "epoch": 0.12120046171604464, + "flos": 522698019840.0, + "grad_norm": 0.06704717834334661, + "language_loss": 0.92910212, + "learning_rate": 0.0009783513106841862, + "loss": 0.94027811, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.3659668, + "step": 630, + "time_per_iteration": 2.7247745990753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268595, + "balance_loss_mlp": 1.24303675, + "epoch": 0.1213928434013082, + "flos": 1553605277184.0, + "grad_norm": 0.050831706918094084, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.78001297, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.25585938, + "step": 631, + "time_per_iteration": 4.973435163497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108872, + "balance_loss_mlp": 1.07263255, + "epoch": 0.12158522508657175, + "flos": 495143580672.0, + "grad_norm": 0.05936012058015608, + "language_loss": 0.87115383, + "learning_rate": 0.0009781695798326854, + "loss": 0.88224256, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.36303711, + "step": 632, + "time_per_iteration": 2.6066951751708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107124, + "balance_loss_mlp": 1.07109857, + "epoch": 0.12177760677183531, + "flos": 475335433728.0, + "grad_norm": 0.07579280109985519, + "language_loss": 0.87447512, + "learning_rate": 0.0009780784359264365, + "loss": 0.88554639, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.3605957, + "step": 633, + "time_per_iteration": 2.6627957820892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232879, + "balance_loss_mlp": 1.20541322, + "epoch": 0.12196998845709889, + "flos": 1467630351360.0, + "grad_norm": 0.035928730821781295, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75421578, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.27539062, + "step": 634, + "time_per_iteration": 4.774393796920776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097947, + "balance_loss_mlp": 1.06185055, + "epoch": 0.12216237014236245, + "flos": 586279309824.0, + "grad_norm": 0.06269897945868624, + "language_loss": 0.87202692, + "learning_rate": 0.000977895591329867, + "loss": 0.88300645, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.36108398, + "step": 635, + "time_per_iteration": 2.805889129638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108158, + "balance_loss_mlp": 1.0710839, + "epoch": 0.12235475182762601, + "flos": 597721324032.0, + "grad_norm": 0.0813284132777598, + "language_loss": 0.86332333, + "learning_rate": 0.000977803890710533, + "loss": 0.87440491, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.37060547, + "step": 636, + "time_per_iteration": 2.740208864212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104442, + "balance_loss_mlp": 1.06927526, + "epoch": 0.12254713351288957, + "flos": 497487550464.0, + "grad_norm": 0.05990721463683031, + "language_loss": 0.92840338, + "learning_rate": 0.0009777120045912774, + "loss": 0.93944776, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.35205078, + "step": 637, + "time_per_iteration": 2.599487543106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099731, + "balance_loss_mlp": 1.06246591, + "epoch": 0.12273951519815314, + "flos": 605565130752.0, + "grad_norm": 0.06926890859373311, + "language_loss": 0.89462954, + "learning_rate": 0.0009776199330077736, + "loss": 0.90562689, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.37231445, + "step": 638, + "time_per_iteration": 2.7127702236175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109285, + "balance_loss_mlp": 1.07263994, + "epoch": 0.1229318968834167, + "flos": 597578729472.0, + "grad_norm": 0.06829584029278382, + "language_loss": 0.91875821, + "learning_rate": 0.0009775276759957667, + "loss": 0.92985106, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.36645508, + "step": 639, + "time_per_iteration": 2.7092959880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109534, + "balance_loss_mlp": 1.07269859, + "epoch": 0.12312427856868026, + "flos": 678082931712.0, + "grad_norm": 0.08396579350539743, + "language_loss": 0.8972953, + "learning_rate": 0.0009774352335910745, + "loss": 0.90839064, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.36816406, + "step": 640, + "time_per_iteration": 2.810391664505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104708, + "balance_loss_mlp": 1.067729, + "epoch": 0.12331666025394382, + "flos": 608656978944.0, + "grad_norm": 0.07323302973942612, + "language_loss": 0.94222069, + "learning_rate": 0.000977342605829586, + "loss": 0.95326775, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.36962891, + "step": 641, + "time_per_iteration": 2.7107834815979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112179, + "balance_loss_mlp": 1.07624888, + "epoch": 0.12350904193920739, + "flos": 762172194816.0, + "grad_norm": 0.07665420533577341, + "language_loss": 0.85291827, + "learning_rate": 0.0009772497927472623, + "loss": 0.86404008, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.359375, + "step": 642, + "time_per_iteration": 3.0403058528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116924, + "balance_loss_mlp": 1.08006442, + "epoch": 0.12370142362447095, + "flos": 540699079680.0, + "grad_norm": 0.07222690714452404, + "language_loss": 0.84284675, + "learning_rate": 0.0009771567943801368, + "loss": 0.85401607, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.3684082, + "step": 643, + "time_per_iteration": 2.684351682662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112565, + "balance_loss_mlp": 1.07615817, + "epoch": 0.12389380530973451, + "flos": 547848852480.0, + "grad_norm": 0.07333206449495522, + "language_loss": 0.88927472, + "learning_rate": 0.0009770636107643152, + "loss": 0.9004004, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.36450195, + "step": 644, + "time_per_iteration": 2.697791337966919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124284, + "balance_loss_mlp": 1.0884738, + "epoch": 0.12408618699499807, + "flos": 540048715776.0, + "grad_norm": 0.07501614361753556, + "language_loss": 0.87213039, + "learning_rate": 0.0009769702419359738, + "loss": 0.88337326, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.35864258, + "step": 645, + "time_per_iteration": 2.614753246307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132185, + "balance_loss_mlp": 1.09604049, + "epoch": 0.12427856868026164, + "flos": 745451513856.0, + "grad_norm": 0.08258832766371556, + "language_loss": 0.88905025, + "learning_rate": 0.000976876687931362, + "loss": 0.90037215, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.36181641, + "step": 646, + "time_per_iteration": 2.9785215854644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125302, + "balance_loss_mlp": 1.08853781, + "epoch": 0.1244709503655252, + "flos": 533460556800.0, + "grad_norm": 0.0911173559535341, + "language_loss": 0.84276652, + "learning_rate": 0.0009767829487868005, + "loss": 0.85401952, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.36767578, + "step": 647, + "time_per_iteration": 2.578190326690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115883, + "balance_loss_mlp": 1.07911873, + "epoch": 0.12466333205078876, + "flos": 507847034880.0, + "grad_norm": 0.07020857762254842, + "language_loss": 0.88315135, + "learning_rate": 0.000976689024538682, + "loss": 0.89431018, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.36743164, + "step": 648, + "time_per_iteration": 2.6223652362823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111573, + "balance_loss_mlp": 1.07841754, + "epoch": 0.12485571373605232, + "flos": 681023420928.0, + "grad_norm": 0.08555408637061691, + "language_loss": 0.86419356, + "learning_rate": 0.0009765949152234716, + "loss": 0.87535083, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.37280273, + "step": 649, + "time_per_iteration": 2.882483959197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304556, + "balance_loss_mlp": 1.27480125, + "epoch": 0.1250480954213159, + "flos": 1329402668544.0, + "grad_norm": 0.07016402939707722, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79990637, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.296875, + "step": 650, + "time_per_iteration": 4.66938042640686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109485, + "balance_loss_mlp": 1.05882525, + "epoch": 0.12524047710657946, + "flos": 938140683264.0, + "grad_norm": 0.06927891842453628, + "language_loss": 0.81679136, + "learning_rate": 0.0009764061415379919, + "loss": 0.82773983, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.36035156, + "step": 651, + "time_per_iteration": 3.2698771953582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096413, + "balance_loss_mlp": 1.05874252, + "epoch": 0.12543285879184302, + "flos": 513642235392.0, + "grad_norm": 0.07412805631018828, + "language_loss": 0.88318801, + "learning_rate": 0.0009763114772410109, + "loss": 0.89415216, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.37646484, + "step": 652, + "time_per_iteration": 2.5928001403808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115398, + "balance_loss_mlp": 1.0775615, + "epoch": 0.12562524047710658, + "flos": 717991617024.0, + "grad_norm": 0.06901346528680578, + "language_loss": 0.85726613, + "learning_rate": 0.0009762166280235146, + "loss": 0.86842012, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.37817383, + "step": 653, + "time_per_iteration": 2.954763412475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135328, + "balance_loss_mlp": 1.0974437, + "epoch": 0.12581762216237014, + "flos": 563441923584.0, + "grad_norm": 0.10573688852470094, + "language_loss": 0.86465615, + "learning_rate": 0.0009761215939223267, + "loss": 0.87600946, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.37866211, + "step": 654, + "time_per_iteration": 2.715884208679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132858, + "balance_loss_mlp": 1.09599805, + "epoch": 0.1260100038476337, + "flos": 481642785792.0, + "grad_norm": 0.09937756240260763, + "language_loss": 0.85917866, + "learning_rate": 0.0009760263749743428, + "loss": 0.87050724, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.3684082, + "step": 655, + "time_per_iteration": 2.565927505493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115454, + "balance_loss_mlp": 1.07847536, + "epoch": 0.12620238553289725, + "flos": 575269461504.0, + "grad_norm": 0.07472608136964497, + "language_loss": 0.89487195, + "learning_rate": 0.0009759309712165299, + "loss": 0.90602648, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.36962891, + "step": 656, + "time_per_iteration": 2.721547842025757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095241, + "balance_loss_mlp": 1.06002665, + "epoch": 0.12639476721816084, + "flos": 530909973504.0, + "grad_norm": 0.06565081457641837, + "language_loss": 0.92494375, + "learning_rate": 0.0009758353826859272, + "loss": 0.9358961, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.3527832, + "step": 657, + "time_per_iteration": 2.6744871139526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095397, + "balance_loss_mlp": 1.05891895, + "epoch": 0.1265871489034244, + "flos": 689654393856.0, + "grad_norm": 0.09523432489761414, + "language_loss": 0.88095021, + "learning_rate": 0.0009757396094196456, + "loss": 0.89190418, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36499023, + "step": 658, + "time_per_iteration": 2.909353256225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103392, + "balance_loss_mlp": 1.06801057, + "epoch": 0.12677953058868796, + "flos": 536863735296.0, + "grad_norm": 0.06690202483268812, + "language_loss": 0.8320483, + "learning_rate": 0.0009756436514548673, + "loss": 0.84308219, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.35449219, + "step": 659, + "time_per_iteration": 2.865816831588745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096361, + "balance_loss_mlp": 1.06143236, + "epoch": 0.12697191227395152, + "flos": 518749194240.0, + "grad_norm": 0.06842887259152383, + "language_loss": 0.87790155, + "learning_rate": 0.0009755475088288466, + "loss": 0.88886517, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.34985352, + "step": 660, + "time_per_iteration": 2.727024793624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095382, + "balance_loss_mlp": 1.06145549, + "epoch": 0.12716429395921508, + "flos": 566341714944.0, + "grad_norm": 0.09688683984474739, + "language_loss": 0.89628965, + "learning_rate": 0.0009754511815789095, + "loss": 0.90724349, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.33959961, + "step": 661, + "time_per_iteration": 2.857279062271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099651, + "balance_loss_mlp": 1.06441295, + "epoch": 0.12735667564447864, + "flos": 513844466688.0, + "grad_norm": 0.0675215866547423, + "language_loss": 0.85062414, + "learning_rate": 0.0009753546697424533, + "loss": 0.86162066, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.3527832, + "step": 662, + "time_per_iteration": 2.670924425125122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112335, + "balance_loss_mlp": 1.07750201, + "epoch": 0.1275490573297422, + "flos": 541023556608.0, + "grad_norm": 0.0877117205425541, + "language_loss": 0.89430654, + "learning_rate": 0.0009752579733569475, + "loss": 0.90542984, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.34887695, + "step": 663, + "time_per_iteration": 2.708876609802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270721, + "balance_loss_mlp": 1.24678338, + "epoch": 0.12774143901500576, + "flos": 1557872787456.0, + "grad_norm": 0.04579657173262409, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7615211, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.23925781, + "step": 664, + "time_per_iteration": 4.956411123275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112296, + "balance_loss_mlp": 1.07724893, + "epoch": 0.12793382070026935, + "flos": 613462781952.0, + "grad_norm": 0.07589772420679435, + "language_loss": 0.88920283, + "learning_rate": 0.0009750640270890217, + "loss": 0.90032578, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.35083008, + "step": 665, + "time_per_iteration": 2.7128844261169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111829, + "balance_loss_mlp": 1.08357668, + "epoch": 0.1281262023855329, + "flos": 707386231296.0, + "grad_norm": 0.09170618066625874, + "language_loss": 0.9529534, + "learning_rate": 0.0009749667772818983, + "loss": 0.9641363, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.34765625, + "step": 666, + "time_per_iteration": 3.001779794692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119074, + "balance_loss_mlp": 1.16718388, + "epoch": 0.12831858407079647, + "flos": 1424250086400.0, + "grad_norm": 0.026171542208985103, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78126681, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.23535156, + "step": 667, + "time_per_iteration": 4.816860914230347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097707, + "balance_loss_mlp": 1.06239688, + "epoch": 0.12851096575606002, + "flos": 448869316608.0, + "grad_norm": 0.08174433959814813, + "language_loss": 0.94348264, + "learning_rate": 0.0009747717245101093, + "loss": 0.95445979, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.35351562, + "step": 668, + "time_per_iteration": 2.5237252712249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092064, + "balance_loss_mlp": 1.05851901, + "epoch": 0.12870334744132358, + "flos": 479697486336.0, + "grad_norm": 0.09843416488997592, + "language_loss": 0.84683162, + "learning_rate": 0.00097467392162117, + "loss": 0.85775226, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.33544922, + "step": 669, + "time_per_iteration": 2.6030120849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103326, + "balance_loss_mlp": 1.06987596, + "epoch": 0.12889572912658714, + "flos": 638633963520.0, + "grad_norm": 0.06975318327908253, + "language_loss": 0.90683615, + "learning_rate": 0.0009745759344474708, + "loss": 0.91786939, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.3347168, + "step": 670, + "time_per_iteration": 2.81622576713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112132, + "balance_loss_mlp": 1.08779824, + "epoch": 0.1290881108118507, + "flos": 509693409792.0, + "grad_norm": 0.09191121702256037, + "language_loss": 0.88668084, + "learning_rate": 0.0009744777630270536, + "loss": 0.89789402, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.33544922, + "step": 671, + "time_per_iteration": 2.573746681213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131545, + "balance_loss_mlp": 1.09673548, + "epoch": 0.12928049249711426, + "flos": 670746894336.0, + "grad_norm": 0.0798229463492689, + "language_loss": 0.92632008, + "learning_rate": 0.000974379407398032, + "loss": 0.93763554, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.34863281, + "step": 672, + "time_per_iteration": 2.8804330825805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128596, + "balance_loss_mlp": 1.09471667, + "epoch": 0.12947287418237785, + "flos": 793158925824.0, + "grad_norm": 0.060594592327224854, + "language_loss": 0.81539643, + "learning_rate": 0.0009742808675985913, + "loss": 0.82668233, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.33911133, + "step": 673, + "time_per_iteration": 3.093003273010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144697, + "balance_loss_mlp": 1.11019778, + "epoch": 0.1296652558676414, + "flos": 485222054400.0, + "grad_norm": 0.09187527541403225, + "language_loss": 0.90132761, + "learning_rate": 0.0009741821436669876, + "loss": 0.91277468, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.34521484, + "step": 674, + "time_per_iteration": 2.585315227508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122846, + "balance_loss_mlp": 1.08925223, + "epoch": 0.12985763755290497, + "flos": 453226987008.0, + "grad_norm": 0.08498532425721701, + "language_loss": 0.91794449, + "learning_rate": 0.0009740832356415492, + "loss": 0.92917299, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.3359375, + "step": 675, + "time_per_iteration": 2.4971120357513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112081, + "balance_loss_mlp": 1.08714533, + "epoch": 0.13005001923816853, + "flos": 824719007232.0, + "grad_norm": 0.07677288344190451, + "language_loss": 0.87289226, + "learning_rate": 0.0009739841435606756, + "loss": 0.88410038, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.33691406, + "step": 676, + "time_per_iteration": 3.04789137840271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110492, + "balance_loss_mlp": 1.07670832, + "epoch": 0.1302424009234321, + "flos": 531107822592.0, + "grad_norm": 0.05631932912809994, + "language_loss": 0.89408028, + "learning_rate": 0.0009738848674628377, + "loss": 0.90518522, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.33789062, + "step": 677, + "time_per_iteration": 2.7033560276031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115712, + "balance_loss_mlp": 1.08161807, + "epoch": 0.13043478260869565, + "flos": 525626924544.0, + "grad_norm": 0.06061927769746001, + "language_loss": 0.88112855, + "learning_rate": 0.000973785407386578, + "loss": 0.8922857, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.34130859, + "step": 678, + "time_per_iteration": 2.7593955993652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111886, + "balance_loss_mlp": 1.07671893, + "epoch": 0.1306271642939592, + "flos": 625862108160.0, + "grad_norm": 0.0561156652888081, + "language_loss": 0.86748564, + "learning_rate": 0.0009736857633705103, + "loss": 0.87860453, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.35180664, + "step": 679, + "time_per_iteration": 2.859600067138672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104313, + "balance_loss_mlp": 1.07002795, + "epoch": 0.13081954597922277, + "flos": 550438723584.0, + "grad_norm": 0.058910355701146846, + "language_loss": 0.92178285, + "learning_rate": 0.0009735859354533196, + "loss": 0.93282604, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.34301758, + "step": 680, + "time_per_iteration": 2.7124130725860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097833, + "balance_loss_mlp": 1.06321418, + "epoch": 0.13101192766448633, + "flos": 536651329536.0, + "grad_norm": 0.0839399897160516, + "language_loss": 0.91048056, + "learning_rate": 0.0009734859236737628, + "loss": 0.92145896, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.34643555, + "step": 681, + "time_per_iteration": 2.627246856689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097126, + "balance_loss_mlp": 1.06102967, + "epoch": 0.13120430934974991, + "flos": 503258019840.0, + "grad_norm": 0.07457249787820815, + "language_loss": 0.92922121, + "learning_rate": 0.0009733857280706678, + "loss": 0.94019246, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.36108398, + "step": 682, + "time_per_iteration": 2.656088352203369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011015, + "balance_loss_mlp": 1.06669104, + "epoch": 0.13139669103501347, + "flos": 614014221312.0, + "grad_norm": 0.08799075641073119, + "language_loss": 0.83452725, + "learning_rate": 0.000973285348682934, + "loss": 0.84554225, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.34838867, + "step": 683, + "time_per_iteration": 2.714932441711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250838, + "balance_loss_mlp": 1.22547078, + "epoch": 0.13158907272027703, + "flos": 1484163357696.0, + "grad_norm": 0.05910904833943088, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.7914921, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.25390625, + "step": 684, + "time_per_iteration": 4.823149681091309 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102069, + "balance_loss_mlp": 1.06754637, + "epoch": 0.1317814544055406, + "flos": 985049344512.0, + "grad_norm": 0.06093749611395137, + "language_loss": 0.84928876, + "learning_rate": 0.0009730840387095046, + "loss": 0.86030942, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.34570312, + "step": 685, + "time_per_iteration": 3.2810635566711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112887, + "balance_loss_mlp": 1.07876921, + "epoch": 0.13197383609080415, + "flos": 611163892224.0, + "grad_norm": 0.0719979787644836, + "language_loss": 0.90753949, + "learning_rate": 0.0009729831082019642, + "loss": 0.91866839, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.34155273, + "step": 686, + "time_per_iteration": 2.8016421794891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121765, + "balance_loss_mlp": 1.08740878, + "epoch": 0.1321662177760677, + "flos": 494116305408.0, + "grad_norm": 0.06743381273529321, + "language_loss": 0.88199198, + "learning_rate": 0.0009728819940660958, + "loss": 0.89320958, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.34375, + "step": 687, + "time_per_iteration": 2.753110885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123642, + "balance_loss_mlp": 1.08966768, + "epoch": 0.13235859946133127, + "flos": 495591713280.0, + "grad_norm": 0.07411002639607889, + "language_loss": 0.84702134, + "learning_rate": 0.0009727806963411557, + "loss": 0.85825777, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.34008789, + "step": 688, + "time_per_iteration": 2.638277292251587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118088, + "balance_loss_mlp": 1.08342147, + "epoch": 0.13255098114659483, + "flos": 511417539072.0, + "grad_norm": 0.07589947069642403, + "language_loss": 0.86972356, + "learning_rate": 0.000972679215066471, + "loss": 0.88090444, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.34692383, + "step": 689, + "time_per_iteration": 2.6977994441986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102626, + "balance_loss_mlp": 1.06865191, + "epoch": 0.13274336283185842, + "flos": 547114120704.0, + "grad_norm": 0.07819243817703804, + "language_loss": 0.98617494, + "learning_rate": 0.0009725775502814401, + "loss": 0.99720132, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.33984375, + "step": 690, + "time_per_iteration": 2.648946523666382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094739, + "balance_loss_mlp": 1.05864239, + "epoch": 0.13293574451712198, + "flos": 640465781760.0, + "grad_norm": 0.059114915842817355, + "language_loss": 0.84878647, + "learning_rate": 0.0009724757020255327, + "loss": 0.85973388, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.36108398, + "step": 691, + "time_per_iteration": 2.8732690811157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082897, + "balance_loss_mlp": 1.04782593, + "epoch": 0.13312812620238554, + "flos": 491234042880.0, + "grad_norm": 0.07438205452368939, + "language_loss": 0.87005877, + "learning_rate": 0.0009723736703382902, + "loss": 0.88088775, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.35107422, + "step": 692, + "time_per_iteration": 2.554645299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107941, + "balance_loss_mlp": 1.04352796, + "epoch": 0.1333205078876491, + "flos": 508693837824.0, + "grad_norm": 0.08618570028449021, + "language_loss": 0.82726276, + "learning_rate": 0.0009722714552593244, + "loss": 0.8380568, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.35888672, + "step": 693, + "time_per_iteration": 2.6300699710845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108349, + "balance_loss_mlp": 1.04763222, + "epoch": 0.13351288957291266, + "flos": 418474722816.0, + "grad_norm": 0.09336455895373029, + "language_loss": 0.93701726, + "learning_rate": 0.000972169056828319, + "loss": 0.94785213, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.35864258, + "step": 694, + "time_per_iteration": 2.4744653701782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089355, + "balance_loss_mlp": 1.05309105, + "epoch": 0.13370527125817622, + "flos": 615614694912.0, + "grad_norm": 0.09775538219544704, + "language_loss": 0.87267971, + "learning_rate": 0.0009720664750850283, + "loss": 0.88357329, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.36279297, + "step": 695, + "time_per_iteration": 2.819199562072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087558, + "balance_loss_mlp": 1.05196249, + "epoch": 0.13389765294343978, + "flos": 625757391360.0, + "grad_norm": 0.08995446617022443, + "language_loss": 0.92670894, + "learning_rate": 0.0009719637100692784, + "loss": 0.93758452, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.35644531, + "step": 696, + "time_per_iteration": 2.710566997528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089346, + "balance_loss_mlp": 1.05460882, + "epoch": 0.13409003462870334, + "flos": 609391710720.0, + "grad_norm": 0.07471473065547057, + "language_loss": 0.82606006, + "learning_rate": 0.0009718607618209661, + "loss": 0.83695352, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.34765625, + "step": 697, + "time_per_iteration": 2.860895872116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100417, + "balance_loss_mlp": 1.06677604, + "epoch": 0.13428241631396692, + "flos": 683499810816.0, + "grad_norm": 0.06757273414028586, + "language_loss": 0.87573737, + "learning_rate": 0.0009717576303800595, + "loss": 0.88674152, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.33666992, + "step": 698, + "time_per_iteration": 3.044128894805908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105218, + "balance_loss_mlp": 1.07102871, + "epoch": 0.13447479799923048, + "flos": 508565799936.0, + "grad_norm": 0.06392403589518669, + "language_loss": 0.85563833, + "learning_rate": 0.0009716543157865975, + "loss": 0.86669052, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.34228516, + "step": 699, + "time_per_iteration": 2.6879220008850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124277, + "balance_loss_mlp": 1.08968258, + "epoch": 0.13466717968449404, + "flos": 897124737024.0, + "grad_norm": 0.10281325358067626, + "language_loss": 0.83577156, + "learning_rate": 0.0009715508180806907, + "loss": 0.84701437, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.34643555, + "step": 700, + "time_per_iteration": 3.1908302307128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132528, + "balance_loss_mlp": 1.09848189, + "epoch": 0.1348595613697576, + "flos": 989501557248.0, + "grad_norm": 0.07337445630948206, + "language_loss": 0.89328271, + "learning_rate": 0.0009714471373025202, + "loss": 0.90460801, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.34082031, + "step": 701, + "time_per_iteration": 3.438918113708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121396, + "balance_loss_mlp": 1.08704007, + "epoch": 0.13505194305502116, + "flos": 487580580864.0, + "grad_norm": 0.06971370423164719, + "language_loss": 0.88653499, + "learning_rate": 0.0009713432734923386, + "loss": 0.89774895, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.34399414, + "step": 702, + "time_per_iteration": 2.640204668045044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118965, + "balance_loss_mlp": 1.08372688, + "epoch": 0.13524432474028472, + "flos": 613103399424.0, + "grad_norm": 0.06937758634579687, + "language_loss": 0.8635335, + "learning_rate": 0.0009712392266904696, + "loss": 0.87472308, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.3527832, + "step": 703, + "time_per_iteration": 2.7081639766693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107456, + "balance_loss_mlp": 1.07381546, + "epoch": 0.13543670642554828, + "flos": 904425868800.0, + "grad_norm": 0.059624368341773884, + "language_loss": 0.8470363, + "learning_rate": 0.0009711349969373076, + "loss": 0.8581109, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.33666992, + "step": 704, + "time_per_iteration": 3.185788154602051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120367, + "balance_loss_mlp": 1.08629751, + "epoch": 0.13562908811081184, + "flos": 550335416832.0, + "grad_norm": 0.06837289886431508, + "language_loss": 0.80139232, + "learning_rate": 0.0009710305842733178, + "loss": 0.81259602, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.34106445, + "step": 705, + "time_per_iteration": 2.7622249126434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119894, + "balance_loss_mlp": 1.08534753, + "epoch": 0.1358214697960754, + "flos": 507797572608.0, + "grad_norm": 0.07938339172549091, + "language_loss": 0.89516854, + "learning_rate": 0.0009709259887390373, + "loss": 0.90636754, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.34570312, + "step": 706, + "time_per_iteration": 2.5919415950775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112775, + "balance_loss_mlp": 1.09141469, + "epoch": 0.136013851481339, + "flos": 528640197120.0, + "grad_norm": 0.10398540964391637, + "language_loss": 0.90775406, + "learning_rate": 0.0009708212103750737, + "loss": 0.9190315, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.36328125, + "step": 707, + "time_per_iteration": 2.601414680480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118489, + "balance_loss_mlp": 1.0827502, + "epoch": 0.13620623316660255, + "flos": 658772379648.0, + "grad_norm": 0.10289617102375577, + "language_loss": 0.87215245, + "learning_rate": 0.0009707162492221051, + "loss": 0.88333738, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.35766602, + "step": 708, + "time_per_iteration": 2.9150781631469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107244, + "balance_loss_mlp": 1.07193458, + "epoch": 0.1363986148518661, + "flos": 671583522816.0, + "grad_norm": 0.07053364895365258, + "language_loss": 0.88057113, + "learning_rate": 0.0009706111053208815, + "loss": 0.89164358, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.35375977, + "step": 709, + "time_per_iteration": 2.8282904624938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104491, + "balance_loss_mlp": 1.06801295, + "epoch": 0.13659099653712967, + "flos": 472828520448.0, + "grad_norm": 0.06130049777218646, + "language_loss": 0.85717642, + "learning_rate": 0.0009705057787122232, + "loss": 0.86822134, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.36499023, + "step": 710, + "time_per_iteration": 2.577875852584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115861, + "balance_loss_mlp": 1.07890666, + "epoch": 0.13678337822239323, + "flos": 452483490816.0, + "grad_norm": 0.06671527486676954, + "language_loss": 0.91032815, + "learning_rate": 0.0009704002694370216, + "loss": 0.92148674, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.36962891, + "step": 711, + "time_per_iteration": 2.5226385593414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113129, + "balance_loss_mlp": 1.09509826, + "epoch": 0.13697575990765679, + "flos": 519373416960.0, + "grad_norm": 0.06767720569390717, + "language_loss": 0.8601349, + "learning_rate": 0.0009702945775362388, + "loss": 0.8714478, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.36206055, + "step": 712, + "time_per_iteration": 2.6134419441223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129624, + "balance_loss_mlp": 1.09214449, + "epoch": 0.13716814159292035, + "flos": 480145618944.0, + "grad_norm": 0.06923332159298135, + "language_loss": 0.86543357, + "learning_rate": 0.0009701887030509086, + "loss": 0.87672985, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.37426758, + "step": 713, + "time_per_iteration": 2.6801493167877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123971, + "balance_loss_mlp": 1.08735013, + "epoch": 0.1373605232781839, + "flos": 545376844800.0, + "grad_norm": 0.08447530320779993, + "language_loss": 0.90941691, + "learning_rate": 0.0009700826460221346, + "loss": 0.92065662, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.36645508, + "step": 714, + "time_per_iteration": 2.6499831676483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124692, + "balance_loss_mlp": 1.0878799, + "epoch": 0.1375529049634475, + "flos": 708473143296.0, + "grad_norm": 0.08158263793675288, + "language_loss": 0.92094153, + "learning_rate": 0.0009699764064910921, + "loss": 0.93218845, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.36816406, + "step": 715, + "time_per_iteration": 2.8663330078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100593, + "balance_loss_mlp": 1.0652591, + "epoch": 0.13774528664871105, + "flos": 486452971008.0, + "grad_norm": 0.0638700652453299, + "language_loss": 0.86489999, + "learning_rate": 0.0009698699844990268, + "loss": 0.87590599, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.35351562, + "step": 716, + "time_per_iteration": 2.680769443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097936, + "balance_loss_mlp": 1.06236374, + "epoch": 0.1379376683339746, + "flos": 679885636608.0, + "grad_norm": 0.06268585455781102, + "language_loss": 0.87917447, + "learning_rate": 0.0009697633800872555, + "loss": 0.89015377, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.35595703, + "step": 717, + "time_per_iteration": 2.965280532836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095044, + "balance_loss_mlp": 1.05956769, + "epoch": 0.13813005001923817, + "flos": 610628419584.0, + "grad_norm": 0.06824665625382514, + "language_loss": 0.9079777, + "learning_rate": 0.0009696565932971655, + "loss": 0.91892809, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.35498047, + "step": 718, + "time_per_iteration": 2.896911144256592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089845, + "balance_loss_mlp": 1.05451119, + "epoch": 0.13832243170450173, + "flos": 588431222784.0, + "grad_norm": 0.09498294885790176, + "language_loss": 0.89284754, + "learning_rate": 0.0009695496241702153, + "loss": 0.90374601, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.35375977, + "step": 719, + "time_per_iteration": 2.7762036323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100053, + "balance_loss_mlp": 1.0647912, + "epoch": 0.1385148133897653, + "flos": 699674844672.0, + "grad_norm": 0.06645840883514359, + "language_loss": 0.85660797, + "learning_rate": 0.0009694424727479339, + "loss": 0.86760849, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.3527832, + "step": 720, + "time_per_iteration": 2.899481773376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105877, + "balance_loss_mlp": 1.06997156, + "epoch": 0.13870719507502885, + "flos": 597977399808.0, + "grad_norm": 0.0836580120862117, + "language_loss": 0.88687581, + "learning_rate": 0.0009693351390719213, + "loss": 0.89793456, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.35913086, + "step": 721, + "time_per_iteration": 2.7242469787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116308, + "balance_loss_mlp": 1.08071184, + "epoch": 0.1388995767602924, + "flos": 586279309824.0, + "grad_norm": 0.0677561083547336, + "language_loss": 0.90886325, + "learning_rate": 0.000969227623183848, + "loss": 0.9200263, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.35595703, + "step": 722, + "time_per_iteration": 2.819762706756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122367, + "balance_loss_mlp": 1.08719993, + "epoch": 0.139091958445556, + "flos": 650810709504.0, + "grad_norm": 0.06096675577850975, + "language_loss": 0.9079504, + "learning_rate": 0.0009691199251254554, + "loss": 0.91917408, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.35180664, + "step": 723, + "time_per_iteration": 2.9057154655456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111876, + "balance_loss_mlp": 1.08368921, + "epoch": 0.13928434013081956, + "flos": 575446961664.0, + "grad_norm": 0.07869545166834224, + "language_loss": 0.86502081, + "learning_rate": 0.0009690120449385555, + "loss": 0.87620842, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.35107422, + "step": 724, + "time_per_iteration": 2.753779411315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117404, + "balance_loss_mlp": 1.08164096, + "epoch": 0.13947672181608312, + "flos": 562954503168.0, + "grad_norm": 0.05745765153927115, + "language_loss": 0.92949581, + "learning_rate": 0.0009689039826650312, + "loss": 0.94066983, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.35791016, + "step": 725, + "time_per_iteration": 2.7707176208496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01358579, + "balance_loss_mlp": 1.33788455, + "epoch": 0.13966910350134668, + "flos": 1520699387904.0, + "grad_norm": 0.08980106345901108, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77881646, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.20703125, + "step": 726, + "time_per_iteration": 4.990100383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122131, + "balance_loss_mlp": 1.08632064, + "epoch": 0.13986148518661023, + "flos": 499604557824.0, + "grad_norm": 0.08882129772973828, + "language_loss": 0.8687858, + "learning_rate": 0.0009686873120259941, + "loss": 0.88000709, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.35839844, + "step": 727, + "time_per_iteration": 2.598994255065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124101, + "balance_loss_mlp": 1.08914924, + "epoch": 0.1400538668718738, + "flos": 598381862400.0, + "grad_norm": 0.060515823337661194, + "language_loss": 0.86860693, + "learning_rate": 0.0009685787037446004, + "loss": 0.879848, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.34985352, + "step": 728, + "time_per_iteration": 2.818753957748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117164, + "balance_loss_mlp": 1.08252215, + "epoch": 0.14024624855713735, + "flos": 593757941760.0, + "grad_norm": 0.07103959200550099, + "language_loss": 0.86954272, + "learning_rate": 0.0009684699135448201, + "loss": 0.88071442, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.34667969, + "step": 729, + "time_per_iteration": 2.7140605449676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117139, + "balance_loss_mlp": 1.08190084, + "epoch": 0.1404386302424009, + "flos": 506335311360.0, + "grad_norm": 0.05207553557344927, + "language_loss": 0.91554511, + "learning_rate": 0.0009683609414688895, + "loss": 0.92671645, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.3527832, + "step": 730, + "time_per_iteration": 2.700392961502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115792, + "balance_loss_mlp": 1.08076811, + "epoch": 0.14063101192766447, + "flos": 573132105216.0, + "grad_norm": 0.0649489891311747, + "language_loss": 0.85963869, + "learning_rate": 0.0009682517875591154, + "loss": 0.87079668, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.35058594, + "step": 731, + "time_per_iteration": 2.7288033962249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108316, + "balance_loss_mlp": 1.07329249, + "epoch": 0.14082339361292806, + "flos": 564333806592.0, + "grad_norm": 0.08055333626892905, + "language_loss": 0.8568505, + "learning_rate": 0.0009681424518578749, + "loss": 0.86793363, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.35058594, + "step": 732, + "time_per_iteration": 2.7607100009918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098228, + "balance_loss_mlp": 1.06337106, + "epoch": 0.14101577529819162, + "flos": 463336187904.0, + "grad_norm": 0.057006483972196494, + "language_loss": 0.87377727, + "learning_rate": 0.000968032934407616, + "loss": 0.8847596, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.34912109, + "step": 733, + "time_per_iteration": 2.5924746990203857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109135, + "balance_loss_mlp": 1.05708933, + "epoch": 0.14120815698345518, + "flos": 595791991296.0, + "grad_norm": 0.06839942690263572, + "language_loss": 0.81019294, + "learning_rate": 0.0009679232352508571, + "loss": 0.82110655, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.34301758, + "step": 734, + "time_per_iteration": 2.7993721961975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100792, + "balance_loss_mlp": 1.06455231, + "epoch": 0.14140053866871874, + "flos": 534864591360.0, + "grad_norm": 0.05863508932167985, + "language_loss": 0.80278933, + "learning_rate": 0.0009678133544301871, + "loss": 0.8137973, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.36254883, + "step": 735, + "time_per_iteration": 2.673874855041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094272, + "balance_loss_mlp": 1.05881953, + "epoch": 0.1415929203539823, + "flos": 520013606400.0, + "grad_norm": 0.05551108490857041, + "language_loss": 0.91367602, + "learning_rate": 0.0009677032919882658, + "loss": 0.92461878, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.35473633, + "step": 736, + "time_per_iteration": 2.654606580734253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096366, + "balance_loss_mlp": 1.06012654, + "epoch": 0.14178530203924586, + "flos": 482095300608.0, + "grad_norm": 0.07346959128329188, + "language_loss": 0.91181809, + "learning_rate": 0.000967593047967823, + "loss": 0.92278177, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.36230469, + "step": 737, + "time_per_iteration": 2.559713125228882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096247, + "balance_loss_mlp": 1.06096137, + "epoch": 0.14197768372450942, + "flos": 676339863552.0, + "grad_norm": 0.08415375039396082, + "language_loss": 0.86267197, + "learning_rate": 0.0009674826224116593, + "loss": 0.87363446, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.35302734, + "step": 738, + "time_per_iteration": 2.809206485748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097639, + "balance_loss_mlp": 1.06197131, + "epoch": 0.14217006540977298, + "flos": 445802199552.0, + "grad_norm": 0.07057178035488912, + "language_loss": 0.86339009, + "learning_rate": 0.0009673720153626455, + "loss": 0.87436646, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.35668945, + "step": 739, + "time_per_iteration": 2.612968683242798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104371, + "balance_loss_mlp": 1.06848931, + "epoch": 0.14236244709503657, + "flos": 496261016064.0, + "grad_norm": 0.07271668848978735, + "language_loss": 0.87052834, + "learning_rate": 0.0009672612268637235, + "loss": 0.88157207, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.35913086, + "step": 740, + "time_per_iteration": 2.61069393157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110664, + "balance_loss_mlp": 1.0753777, + "epoch": 0.14255482878030012, + "flos": 648022989312.0, + "grad_norm": 0.0891355718419961, + "language_loss": 0.84501529, + "learning_rate": 0.0009671502569579048, + "loss": 0.85612196, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.35302734, + "step": 741, + "time_per_iteration": 2.735647201538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110487, + "balance_loss_mlp": 1.07122874, + "epoch": 0.14274721046556368, + "flos": 535888894464.0, + "grad_norm": 0.08695556970227908, + "language_loss": 0.89623845, + "learning_rate": 0.0009670391056882719, + "loss": 0.90728712, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.33666992, + "step": 742, + "time_per_iteration": 2.7107605934143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112128, + "balance_loss_mlp": 1.07879674, + "epoch": 0.14293959215082724, + "flos": 956677215744.0, + "grad_norm": 0.07027307452403737, + "language_loss": 0.88442421, + "learning_rate": 0.0009669277730979776, + "loss": 0.89554548, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.33349609, + "step": 743, + "time_per_iteration": 3.188511371612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106141, + "balance_loss_mlp": 1.07295275, + "epoch": 0.1431319738360908, + "flos": 692766590976.0, + "grad_norm": 0.060274127994165407, + "language_loss": 0.85487998, + "learning_rate": 0.0009668162592302449, + "loss": 0.86594141, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.33203125, + "step": 744, + "time_per_iteration": 2.912363290786743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111089, + "balance_loss_mlp": 1.07715416, + "epoch": 0.14332435552135436, + "flos": 565174817280.0, + "grad_norm": 0.05989361998422495, + "language_loss": 0.86368543, + "learning_rate": 0.0009667045641283676, + "loss": 0.8747943, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.33764648, + "step": 745, + "time_per_iteration": 2.705873489379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105291, + "balance_loss_mlp": 1.07246089, + "epoch": 0.14351673720661792, + "flos": 738045665280.0, + "grad_norm": 0.07442691981713179, + "language_loss": 0.94493437, + "learning_rate": 0.0009665926878357092, + "loss": 0.95598727, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.32836914, + "step": 746, + "time_per_iteration": 2.941594362258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112013, + "balance_loss_mlp": 1.07865858, + "epoch": 0.14370911889188148, + "flos": 548951731200.0, + "grad_norm": 0.0692560914525881, + "language_loss": 0.91247988, + "learning_rate": 0.0009664806303957043, + "loss": 0.92359996, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.33374023, + "step": 747, + "time_per_iteration": 2.70877742767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112762, + "balance_loss_mlp": 1.0790261, + "epoch": 0.14390150057714507, + "flos": 589973469696.0, + "grad_norm": 0.06347995643195156, + "language_loss": 0.87284487, + "learning_rate": 0.0009663683918518571, + "loss": 0.88397241, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.33764648, + "step": 748, + "time_per_iteration": 2.9173765182495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128804, + "balance_loss_mlp": 1.09583056, + "epoch": 0.14409388226240863, + "flos": 590773782528.0, + "grad_norm": 0.07165520049303264, + "language_loss": 0.85690349, + "learning_rate": 0.0009662559722477428, + "loss": 0.8681916, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.32983398, + "step": 749, + "time_per_iteration": 2.6703925132751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293618, + "balance_loss_mlp": 1.26653337, + "epoch": 0.1442862639476722, + "flos": 1510418479104.0, + "grad_norm": 0.05750783583060037, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77456594, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.27148438, + "step": 750, + "time_per_iteration": 5.001406192779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114826, + "balance_loss_mlp": 1.11492896, + "epoch": 0.14447864563293575, + "flos": 496493770752.0, + "grad_norm": 0.0903406164143912, + "language_loss": 0.88906193, + "learning_rate": 0.0009660305900333632, + "loss": 0.90054452, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.33349609, + "step": 751, + "time_per_iteration": 2.6897666454315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151429, + "balance_loss_mlp": 1.11859906, + "epoch": 0.1446710273181993, + "flos": 589400271360.0, + "grad_norm": 0.07731756572669998, + "language_loss": 0.82109559, + "learning_rate": 0.0009659176275105992, + "loss": 0.83260989, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.32836914, + "step": 752, + "time_per_iteration": 2.7144923210144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156541, + "balance_loss_mlp": 1.12294829, + "epoch": 0.14486340900346287, + "flos": 585521256960.0, + "grad_norm": 0.08104938710710845, + "language_loss": 0.8584373, + "learning_rate": 0.0009658044841025701, + "loss": 0.87000269, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.33618164, + "step": 753, + "time_per_iteration": 2.7651891708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134874, + "balance_loss_mlp": 1.10116172, + "epoch": 0.14505579068872643, + "flos": 504405978624.0, + "grad_norm": 0.06446620792536047, + "language_loss": 0.80912805, + "learning_rate": 0.0009656911598532021, + "loss": 0.82047671, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.33740234, + "step": 754, + "time_per_iteration": 2.6575491428375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113793, + "balance_loss_mlp": 1.10345459, + "epoch": 0.14524817237399, + "flos": 486566452224.0, + "grad_norm": 0.0617560649750725, + "language_loss": 0.89835, + "learning_rate": 0.0009655776548064917, + "loss": 0.90972924, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.3449707, + "step": 755, + "time_per_iteration": 2.684224843978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133332, + "balance_loss_mlp": 1.100263, + "epoch": 0.14544055405925355, + "flos": 727857888768.0, + "grad_norm": 0.0723196770544797, + "language_loss": 0.88265425, + "learning_rate": 0.0009654639690065054, + "loss": 0.89398754, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.33081055, + "step": 756, + "time_per_iteration": 2.8975589275360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133271, + "balance_loss_mlp": 1.10063124, + "epoch": 0.14563293574451713, + "flos": 593359271424.0, + "grad_norm": 0.0666179485403068, + "language_loss": 0.87639153, + "learning_rate": 0.00096535010249738, + "loss": 0.88772416, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.32641602, + "step": 757, + "time_per_iteration": 2.7852935791015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118669, + "balance_loss_mlp": 1.08555305, + "epoch": 0.1458253174297807, + "flos": 560192924160.0, + "grad_norm": 0.06671579144124269, + "language_loss": 0.82458985, + "learning_rate": 0.0009652360553233224, + "loss": 0.83577645, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.33129883, + "step": 758, + "time_per_iteration": 2.790372610092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231318, + "balance_loss_mlp": 1.20690441, + "epoch": 0.14601769911504425, + "flos": 1557025984512.0, + "grad_norm": 0.06334391267713868, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.75005066, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.24414062, + "step": 759, + "time_per_iteration": 4.9441094398498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113265, + "balance_loss_mlp": 1.08062565, + "epoch": 0.1462100808003078, + "flos": 865922628096.0, + "grad_norm": 0.06716213865762054, + "language_loss": 0.81441242, + "learning_rate": 0.0009650074191575883, + "loss": 0.82554507, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.32641602, + "step": 760, + "time_per_iteration": 3.2887775897979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109667, + "balance_loss_mlp": 1.07664585, + "epoch": 0.14640246248557137, + "flos": 522673288704.0, + "grad_norm": 0.06510043774355635, + "language_loss": 0.85560381, + "learning_rate": 0.0009648928302546766, + "loss": 0.86670047, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.33032227, + "step": 761, + "time_per_iteration": 2.6996572017669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095513, + "balance_loss_mlp": 1.06308818, + "epoch": 0.14659484417083493, + "flos": 1030121805312.0, + "grad_norm": 0.06592560206527708, + "language_loss": 0.85148716, + "learning_rate": 0.0009647780608643613, + "loss": 0.86244226, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.32421875, + "step": 762, + "time_per_iteration": 3.3860111236572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101334, + "balance_loss_mlp": 1.06843269, + "epoch": 0.1467872258560985, + "flos": 500426629632.0, + "grad_norm": 0.08422515931666542, + "language_loss": 0.87252343, + "learning_rate": 0.0009646631110312001, + "loss": 0.88353688, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.32910156, + "step": 763, + "time_per_iteration": 2.62996506690979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097579, + "balance_loss_mlp": 1.06455803, + "epoch": 0.14697960754136205, + "flos": 547514201088.0, + "grad_norm": 0.05843071383105212, + "language_loss": 0.88439989, + "learning_rate": 0.0009645479807998203, + "loss": 0.89537567, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.33032227, + "step": 764, + "time_per_iteration": 2.7762649059295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091998, + "balance_loss_mlp": 1.059955, + "epoch": 0.14717198922662564, + "flos": 517586678784.0, + "grad_norm": 0.06085607876830046, + "language_loss": 0.92027354, + "learning_rate": 0.0009644326702149196, + "loss": 0.93119353, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.3203125, + "step": 765, + "time_per_iteration": 2.7927489280700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093686, + "balance_loss_mlp": 1.0607841, + "epoch": 0.1473643709118892, + "flos": 731661147648.0, + "grad_norm": 0.07854715386493856, + "language_loss": 0.84577298, + "learning_rate": 0.0009643171793212653, + "loss": 0.85670984, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.32910156, + "step": 766, + "time_per_iteration": 3.1133480072021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092266, + "balance_loss_mlp": 1.05976951, + "epoch": 0.14755675259715276, + "flos": 620257554432.0, + "grad_norm": 0.102413583922894, + "language_loss": 0.89411926, + "learning_rate": 0.0009642015081636952, + "loss": 0.90504193, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.32495117, + "step": 767, + "time_per_iteration": 2.715090274810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098654, + "balance_loss_mlp": 1.06658697, + "epoch": 0.14774913428241632, + "flos": 451981513728.0, + "grad_norm": 0.07135930824346515, + "language_loss": 0.8782866, + "learning_rate": 0.0009640856567871166, + "loss": 0.88927317, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.32055664, + "step": 768, + "time_per_iteration": 2.550196409225464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105008, + "balance_loss_mlp": 1.07258272, + "epoch": 0.14794151596767988, + "flos": 836881196544.0, + "grad_norm": 0.05799185647214189, + "language_loss": 0.8870768, + "learning_rate": 0.0009639696252365072, + "loss": 0.8981269, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.32421875, + "step": 769, + "time_per_iteration": 3.0786449909210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100829, + "balance_loss_mlp": 1.06869006, + "epoch": 0.14813389765294344, + "flos": 685765204992.0, + "grad_norm": 0.05886019056348146, + "language_loss": 0.81861567, + "learning_rate": 0.0009638534135569144, + "loss": 0.82962394, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.32128906, + "step": 770, + "time_per_iteration": 2.9026055335998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109058, + "balance_loss_mlp": 1.07641852, + "epoch": 0.148326279338207, + "flos": 509625008640.0, + "grad_norm": 0.061687073411883335, + "language_loss": 0.89819336, + "learning_rate": 0.0009637370217934554, + "loss": 0.909284, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.32641602, + "step": 771, + "time_per_iteration": 2.651155471801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102766, + "balance_loss_mlp": 1.07062733, + "epoch": 0.14851866102347056, + "flos": 587869608960.0, + "grad_norm": 0.06890537390791286, + "language_loss": 0.82949096, + "learning_rate": 0.0009636204499913175, + "loss": 0.84051859, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.32128906, + "step": 772, + "time_per_iteration": 2.8484935760498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109887, + "balance_loss_mlp": 1.06713676, + "epoch": 0.14871104270873411, + "flos": 690722366976.0, + "grad_norm": 0.05724303399039588, + "language_loss": 0.88008785, + "learning_rate": 0.0009635036981957581, + "loss": 0.89107656, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.31713867, + "step": 773, + "time_per_iteration": 2.875896453857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098837, + "balance_loss_mlp": 1.06586373, + "epoch": 0.1489034243939977, + "flos": 654803205120.0, + "grad_norm": 0.06792329386178385, + "language_loss": 0.90737289, + "learning_rate": 0.0009633867664521043, + "loss": 0.91836131, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.32983398, + "step": 774, + "time_per_iteration": 2.8590240478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105117, + "balance_loss_mlp": 1.07202482, + "epoch": 0.14909580607926126, + "flos": 475595891712.0, + "grad_norm": 0.07543072164382301, + "language_loss": 0.86562771, + "learning_rate": 0.0009632696548057527, + "loss": 0.87667894, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.33105469, + "step": 775, + "time_per_iteration": 2.598287343978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103717, + "balance_loss_mlp": 1.07136405, + "epoch": 0.14928818776452482, + "flos": 610789953024.0, + "grad_norm": 0.06953515395492163, + "language_loss": 0.8490293, + "learning_rate": 0.0009631523633021704, + "loss": 0.86006653, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.32348633, + "step": 776, + "time_per_iteration": 2.842017650604248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097387, + "balance_loss_mlp": 1.0640794, + "epoch": 0.14948056944978838, + "flos": 561487859712.0, + "grad_norm": 0.0785359858255581, + "language_loss": 0.87875742, + "learning_rate": 0.0009630348919868936, + "loss": 0.88973129, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.33325195, + "step": 777, + "time_per_iteration": 2.693345308303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096515, + "balance_loss_mlp": 1.06244552, + "epoch": 0.14967295113505194, + "flos": 448972623360.0, + "grad_norm": 0.0986803150049228, + "language_loss": 0.81203282, + "learning_rate": 0.0009629172409055293, + "loss": 0.82299805, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.34106445, + "step": 778, + "time_per_iteration": 2.50610613822937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100631, + "balance_loss_mlp": 1.06780052, + "epoch": 0.1498653328203155, + "flos": 571000541184.0, + "grad_norm": 0.06451123510709528, + "language_loss": 0.872877, + "learning_rate": 0.0009627994101037531, + "loss": 0.88388336, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.32836914, + "step": 779, + "time_per_iteration": 2.735919713973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093349, + "balance_loss_mlp": 1.06016171, + "epoch": 0.15005771450557906, + "flos": 630918194688.0, + "grad_norm": 0.06921626087658436, + "language_loss": 0.89007759, + "learning_rate": 0.0009626813996273114, + "loss": 0.90101105, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.33203125, + "step": 780, + "time_per_iteration": 2.8758528232574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089076, + "balance_loss_mlp": 1.05646062, + "epoch": 0.15025009619084262, + "flos": 577633780224.0, + "grad_norm": 0.07846674622794232, + "language_loss": 0.88800216, + "learning_rate": 0.0009625632095220198, + "loss": 0.89889288, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.32617188, + "step": 781, + "time_per_iteration": 2.822981357574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091834, + "balance_loss_mlp": 1.05874181, + "epoch": 0.1504424778761062, + "flos": 483646311936.0, + "grad_norm": 0.06496680151927305, + "language_loss": 0.86870086, + "learning_rate": 0.0009624448398337637, + "loss": 0.87961924, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.33105469, + "step": 782, + "time_per_iteration": 2.5370984077453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093814, + "balance_loss_mlp": 1.06022096, + "epoch": 0.15063485956136977, + "flos": 762167812608.0, + "grad_norm": 0.05765358341264215, + "language_loss": 0.89159006, + "learning_rate": 0.0009623262906084984, + "loss": 0.90252817, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.33618164, + "step": 783, + "time_per_iteration": 3.005157709121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099941, + "balance_loss_mlp": 1.06773031, + "epoch": 0.15082724124663333, + "flos": 497369687040.0, + "grad_norm": 0.06003141928684199, + "language_loss": 0.90186155, + "learning_rate": 0.0009622075618922486, + "loss": 0.91286093, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.32202148, + "step": 784, + "time_per_iteration": 2.660804510116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093154, + "balance_loss_mlp": 1.06142032, + "epoch": 0.15101962293189689, + "flos": 509476621824.0, + "grad_norm": 0.06057287359381707, + "language_loss": 0.86789852, + "learning_rate": 0.0009620886537311091, + "loss": 0.87883008, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.31713867, + "step": 785, + "time_per_iteration": 2.6273694038391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095369, + "balance_loss_mlp": 1.06210947, + "epoch": 0.15121200461716044, + "flos": 457520638464.0, + "grad_norm": 0.08138425523138582, + "language_loss": 0.84774673, + "learning_rate": 0.000961969566171244, + "loss": 0.85870039, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.33276367, + "step": 786, + "time_per_iteration": 2.5942111015319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095064, + "balance_loss_mlp": 1.06223416, + "epoch": 0.151404386302424, + "flos": 537729477120.0, + "grad_norm": 0.07863928657369654, + "language_loss": 0.90186292, + "learning_rate": 0.0009618502992588873, + "loss": 0.9128136, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.32836914, + "step": 787, + "time_per_iteration": 2.619929790496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091478, + "balance_loss_mlp": 1.05955386, + "epoch": 0.15159676798768756, + "flos": 687858891264.0, + "grad_norm": 0.0744293727729202, + "language_loss": 0.88114512, + "learning_rate": 0.0009617308530403424, + "loss": 0.89205992, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.3190918, + "step": 788, + "time_per_iteration": 2.9888041019439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093086, + "balance_loss_mlp": 1.0604943, + "epoch": 0.15178914967295112, + "flos": 545042193408.0, + "grad_norm": 0.06582928588586826, + "language_loss": 0.87262332, + "learning_rate": 0.0009616112275619825, + "loss": 0.8835541, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.32592773, + "step": 789, + "time_per_iteration": 2.7160654067993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099005, + "balance_loss_mlp": 1.0666275, + "epoch": 0.1519815313582147, + "flos": 511510671360.0, + "grad_norm": 0.05890477263154721, + "language_loss": 0.83453441, + "learning_rate": 0.0009614914228702503, + "loss": 0.84552449, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.32373047, + "step": 790, + "time_per_iteration": 2.67269229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106158, + "balance_loss_mlp": 1.07342279, + "epoch": 0.15217391304347827, + "flos": 683747122176.0, + "grad_norm": 0.05177473030839046, + "language_loss": 0.88909948, + "learning_rate": 0.0009613714390116581, + "loss": 0.90016103, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.32739258, + "step": 791, + "time_per_iteration": 2.978431224822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104946, + "balance_loss_mlp": 1.07304585, + "epoch": 0.15236629472874183, + "flos": 643873342464.0, + "grad_norm": 0.07017768347884551, + "language_loss": 0.8558737, + "learning_rate": 0.0009612512760327879, + "loss": 0.86692309, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.31884766, + "step": 792, + "time_per_iteration": 2.854128837585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108697, + "balance_loss_mlp": 1.07562804, + "epoch": 0.1525586764140054, + "flos": 412654791168.0, + "grad_norm": 0.06359759833531073, + "language_loss": 0.84205759, + "learning_rate": 0.0009611309339802909, + "loss": 0.85314453, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.33081055, + "step": 793, + "time_per_iteration": 2.46451997756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108319, + "balance_loss_mlp": 1.07510698, + "epoch": 0.15275105809926895, + "flos": 802444644864.0, + "grad_norm": 0.051071876240168755, + "language_loss": 0.84049302, + "learning_rate": 0.0009610104129008881, + "loss": 0.85157621, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.33227539, + "step": 794, + "time_per_iteration": 3.111494541168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011005, + "balance_loss_mlp": 1.06836164, + "epoch": 0.1529434397845325, + "flos": 612143115264.0, + "grad_norm": 0.06279651541206067, + "language_loss": 0.88408649, + "learning_rate": 0.0009608897128413701, + "loss": 0.89509147, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.32128906, + "step": 795, + "time_per_iteration": 2.7248153686523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103807, + "balance_loss_mlp": 1.07121563, + "epoch": 0.15313582146979607, + "flos": 614941009920.0, + "grad_norm": 0.04889604688954522, + "language_loss": 0.85449052, + "learning_rate": 0.0009607688338485965, + "loss": 0.86552852, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.32592773, + "step": 796, + "time_per_iteration": 2.8646762371063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100645, + "balance_loss_mlp": 1.06731439, + "epoch": 0.15332820315505963, + "flos": 793256440320.0, + "grad_norm": 0.057433682914461805, + "language_loss": 0.90353924, + "learning_rate": 0.0009606477759694969, + "loss": 0.91454566, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.33349609, + "step": 797, + "time_per_iteration": 3.0346486568450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108023, + "balance_loss_mlp": 1.0744772, + "epoch": 0.1535205848403232, + "flos": 549945510912.0, + "grad_norm": 0.08021572729531513, + "language_loss": 0.87206727, + "learning_rate": 0.0009605265392510703, + "loss": 0.88314748, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.33544922, + "step": 798, + "time_per_iteration": 2.6084530353546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097421, + "balance_loss_mlp": 1.065521, + "epoch": 0.15371296652558677, + "flos": 535691045376.0, + "grad_norm": 0.06650858832922667, + "language_loss": 0.91961598, + "learning_rate": 0.0009604051237403846, + "loss": 0.93059021, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.31884766, + "step": 799, + "time_per_iteration": 2.629930019378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111802, + "balance_loss_mlp": 1.07951975, + "epoch": 0.15390534821085033, + "flos": 395002939392.0, + "grad_norm": 0.12724142526344331, + "language_loss": 0.85673767, + "learning_rate": 0.0009602835294845776, + "loss": 0.86785567, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.32275391, + "step": 800, + "time_per_iteration": 2.4388976097106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116786, + "balance_loss_mlp": 1.08374119, + "epoch": 0.1540977298961139, + "flos": 535587738624.0, + "grad_norm": 0.06962057985754792, + "language_loss": 0.9036696, + "learning_rate": 0.0009601617565308565, + "loss": 0.91483742, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.33056641, + "step": 801, + "time_per_iteration": 2.6220815181732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112115, + "balance_loss_mlp": 1.08829629, + "epoch": 0.15429011158137745, + "flos": 723388147200.0, + "grad_norm": 0.07662224573984003, + "language_loss": 0.86584908, + "learning_rate": 0.0009600398049264977, + "loss": 0.87706065, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.32861328, + "step": 802, + "time_per_iteration": 2.9767894744873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122446, + "balance_loss_mlp": 1.08870947, + "epoch": 0.154482493266641, + "flos": 620209502208.0, + "grad_norm": 0.07007784052810237, + "language_loss": 0.91261709, + "learning_rate": 0.0009599176747188469, + "loss": 0.9238416, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.33764648, + "step": 803, + "time_per_iteration": 2.8329989910125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105402, + "balance_loss_mlp": 1.07242846, + "epoch": 0.15467487495190457, + "flos": 525351909888.0, + "grad_norm": 0.06284855896117353, + "language_loss": 0.82565022, + "learning_rate": 0.0009597953659553196, + "loss": 0.83670425, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.32983398, + "step": 804, + "time_per_iteration": 2.6918182373046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101021, + "balance_loss_mlp": 1.06814265, + "epoch": 0.15486725663716813, + "flos": 527473299456.0, + "grad_norm": 0.06479523616705579, + "language_loss": 0.88566583, + "learning_rate": 0.0009596728786833997, + "loss": 0.89667606, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.32885742, + "step": 805, + "time_per_iteration": 2.609287977218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110234, + "balance_loss_mlp": 1.06829393, + "epoch": 0.1550596383224317, + "flos": 1048118482944.0, + "grad_norm": 0.07111390229237131, + "language_loss": 0.89488924, + "learning_rate": 0.0009595502129506415, + "loss": 0.90591264, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.34082031, + "step": 806, + "time_per_iteration": 3.403404951095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096653, + "balance_loss_mlp": 1.0634892, + "epoch": 0.15525202000769528, + "flos": 613438050816.0, + "grad_norm": 0.08216570532607727, + "language_loss": 0.82236785, + "learning_rate": 0.0009594273688046678, + "loss": 0.83333433, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.33178711, + "step": 807, + "time_per_iteration": 2.7215962409973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093065, + "balance_loss_mlp": 1.05968678, + "epoch": 0.15544440169295884, + "flos": 532805810688.0, + "grad_norm": 0.06904253720821768, + "language_loss": 0.85279024, + "learning_rate": 0.000959304346293171, + "loss": 0.86372089, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.33398438, + "step": 808, + "time_per_iteration": 2.6801698207855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098465, + "balance_loss_mlp": 1.06661189, + "epoch": 0.1556367833782224, + "flos": 644433546240.0, + "grad_norm": 0.09111957868284204, + "language_loss": 0.87858826, + "learning_rate": 0.0009591811454639125, + "loss": 0.88957286, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.31835938, + "step": 809, + "time_per_iteration": 2.7565882205963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094758, + "balance_loss_mlp": 1.06187963, + "epoch": 0.15582916506348596, + "flos": 543540644352.0, + "grad_norm": 0.06649225570292959, + "language_loss": 0.87746191, + "learning_rate": 0.0009590577663647234, + "loss": 0.8884095, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.32885742, + "step": 810, + "time_per_iteration": 2.7346410751342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106143, + "balance_loss_mlp": 1.07233548, + "epoch": 0.15602154674874952, + "flos": 579740613120.0, + "grad_norm": 0.0619187082363415, + "language_loss": 0.85968214, + "learning_rate": 0.0009589342090435036, + "loss": 0.87074351, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.33837891, + "step": 811, + "time_per_iteration": 2.771869421005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114592, + "balance_loss_mlp": 1.08226287, + "epoch": 0.15621392843401308, + "flos": 534982454784.0, + "grad_norm": 0.07419416671079432, + "language_loss": 0.87060148, + "learning_rate": 0.0009588104735482223, + "loss": 0.88174742, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.32324219, + "step": 812, + "time_per_iteration": 2.6792666912078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122998, + "balance_loss_mlp": 1.09007227, + "epoch": 0.15640631011927664, + "flos": 550635162624.0, + "grad_norm": 0.08530784328603107, + "language_loss": 0.83981705, + "learning_rate": 0.0009586865599269177, + "loss": 0.85104704, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.3293457, + "step": 813, + "time_per_iteration": 2.6273813247680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122852, + "balance_loss_mlp": 1.09109521, + "epoch": 0.1565986918045402, + "flos": 637190641152.0, + "grad_norm": 0.09596754940168085, + "language_loss": 0.88191104, + "learning_rate": 0.0009585624682276977, + "loss": 0.8931396, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.31738281, + "step": 814, + "time_per_iteration": 2.7389183044433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114804, + "balance_loss_mlp": 1.08361948, + "epoch": 0.15679107348980378, + "flos": 490569122304.0, + "grad_norm": 0.07403121037751308, + "language_loss": 0.87196732, + "learning_rate": 0.0009584381984987386, + "loss": 0.88311541, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.31152344, + "step": 815, + "time_per_iteration": 2.588653802871704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118789, + "balance_loss_mlp": 1.0867933, + "epoch": 0.15698345517506734, + "flos": 529689231360.0, + "grad_norm": 0.05796420471157715, + "language_loss": 0.89563668, + "learning_rate": 0.0009583137507882864, + "loss": 0.90682459, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.31982422, + "step": 816, + "time_per_iteration": 2.6771223545074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120947, + "balance_loss_mlp": 1.08945227, + "epoch": 0.1571758368603309, + "flos": 545779897344.0, + "grad_norm": 0.06695321751464198, + "language_loss": 0.80875123, + "learning_rate": 0.000958189125144656, + "loss": 0.81996059, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.31469727, + "step": 817, + "time_per_iteration": 2.648407220840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142778, + "balance_loss_mlp": 1.11125922, + "epoch": 0.15736821854559446, + "flos": 565377048576.0, + "grad_norm": 0.07474790639920047, + "language_loss": 0.87800574, + "learning_rate": 0.0009580643216162313, + "loss": 0.8894335, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.31494141, + "step": 818, + "time_per_iteration": 2.663799285888672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140784, + "balance_loss_mlp": 1.10940814, + "epoch": 0.15756060023085802, + "flos": 500707436544.0, + "grad_norm": 0.10531827445817923, + "language_loss": 0.79636216, + "learning_rate": 0.0009579393402514652, + "loss": 0.80777001, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.31347656, + "step": 819, + "time_per_iteration": 2.5795977115631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128459, + "balance_loss_mlp": 1.09617746, + "epoch": 0.15775298191612158, + "flos": 519014034432.0, + "grad_norm": 0.06561760213255555, + "language_loss": 0.90222132, + "learning_rate": 0.0009578141810988801, + "loss": 0.91350597, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.32275391, + "step": 820, + "time_per_iteration": 2.6019015312194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120432, + "balance_loss_mlp": 1.08807814, + "epoch": 0.15794536360138514, + "flos": 465891153408.0, + "grad_norm": 0.07003821866302876, + "language_loss": 0.90498698, + "learning_rate": 0.0009576888442070668, + "loss": 0.91619134, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.32348633, + "step": 821, + "time_per_iteration": 2.5933666229248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109361, + "balance_loss_mlp": 1.07614923, + "epoch": 0.1581377452866487, + "flos": 516911583744.0, + "grad_norm": 0.06959801001512317, + "language_loss": 0.92461467, + "learning_rate": 0.0009575633296246854, + "loss": 0.93570817, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.33227539, + "step": 822, + "time_per_iteration": 2.584195375442505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104383, + "balance_loss_mlp": 1.07198191, + "epoch": 0.15833012697191226, + "flos": 549522109440.0, + "grad_norm": 0.0738821286657961, + "language_loss": 0.82797432, + "learning_rate": 0.0009574376374004652, + "loss": 0.83901811, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.32397461, + "step": 823, + "time_per_iteration": 2.6445696353912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099566, + "balance_loss_mlp": 1.0669024, + "epoch": 0.15852250865717585, + "flos": 487206641664.0, + "grad_norm": 0.07930768625104477, + "language_loss": 0.8015238, + "learning_rate": 0.000957311767583204, + "loss": 0.81251943, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.32666016, + "step": 824, + "time_per_iteration": 2.590190887451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284074, + "balance_loss_mlp": 1.26194882, + "epoch": 0.1587148903424394, + "flos": 1309041672192.0, + "grad_norm": 0.06857459467376774, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83355665, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.22167969, + "step": 825, + "time_per_iteration": 4.729644060134888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091191, + "balance_loss_mlp": 1.05766964, + "epoch": 0.15890727202770297, + "flos": 466634649600.0, + "grad_norm": 0.10530356830759573, + "language_loss": 0.91383988, + "learning_rate": 0.0009570594953650961, + "loss": 0.92475176, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.33544922, + "step": 826, + "time_per_iteration": 2.5222439765930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099421, + "balance_loss_mlp": 1.06580353, + "epoch": 0.15909965371296653, + "flos": 776733608448.0, + "grad_norm": 0.07312615216486826, + "language_loss": 0.80215907, + "learning_rate": 0.00095693309306219, + "loss": 0.81315327, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.33642578, + "step": 827, + "time_per_iteration": 3.104602098464966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091547, + "balance_loss_mlp": 1.0577873, + "epoch": 0.1592920353982301, + "flos": 1077852538368.0, + "grad_norm": 0.06629059991756085, + "language_loss": 0.87921345, + "learning_rate": 0.0009568065133621244, + "loss": 0.89012897, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.33789062, + "step": 828, + "time_per_iteration": 3.349937915802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088059, + "balance_loss_mlp": 1.05324984, + "epoch": 0.15948441708349365, + "flos": 725307305472.0, + "grad_norm": 0.06785059542129762, + "language_loss": 0.84638405, + "learning_rate": 0.0009566797563140422, + "loss": 0.85726464, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.34863281, + "step": 829, + "time_per_iteration": 2.883561849594116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096047, + "balance_loss_mlp": 1.06085658, + "epoch": 0.1596767987687572, + "flos": 578447087616.0, + "grad_norm": 0.06369088806732512, + "language_loss": 0.87693489, + "learning_rate": 0.0009565528219671547, + "loss": 0.88789535, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.35229492, + "step": 830, + "time_per_iteration": 2.929800510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098337, + "balance_loss_mlp": 1.06412435, + "epoch": 0.15986918045402077, + "flos": 528728947200.0, + "grad_norm": 0.06081537703934319, + "language_loss": 0.84958434, + "learning_rate": 0.0009564257103707418, + "loss": 0.86056769, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.3425293, + "step": 831, + "time_per_iteration": 2.631542444229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105374, + "balance_loss_mlp": 1.0715903, + "epoch": 0.16006156213928435, + "flos": 574313559552.0, + "grad_norm": 0.06950481232518824, + "language_loss": 0.91362834, + "learning_rate": 0.0009562984215741533, + "loss": 0.92468208, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.33789062, + "step": 832, + "time_per_iteration": 2.669194459915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093997, + "balance_loss_mlp": 1.05973649, + "epoch": 0.1602539438245479, + "flos": 515258675712.0, + "grad_norm": 0.06093058452920847, + "language_loss": 0.82276815, + "learning_rate": 0.0009561709556268065, + "loss": 0.83370817, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.34301758, + "step": 833, + "time_per_iteration": 2.747171401977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096504, + "balance_loss_mlp": 1.06298196, + "epoch": 0.16044632550981147, + "flos": 620730418176.0, + "grad_norm": 0.09598386402958035, + "language_loss": 0.93858409, + "learning_rate": 0.0009560433125781884, + "loss": 0.9495492, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.33544922, + "step": 834, + "time_per_iteration": 2.7381722927093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090718, + "balance_loss_mlp": 1.05645716, + "epoch": 0.16063870719507503, + "flos": 560817146880.0, + "grad_norm": 0.06748577773497036, + "language_loss": 0.92278147, + "learning_rate": 0.0009559154924778544, + "loss": 0.93368864, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.34301758, + "step": 835, + "time_per_iteration": 2.7790255546569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079826, + "balance_loss_mlp": 1.04625726, + "epoch": 0.1608310888803386, + "flos": 804778440192.0, + "grad_norm": 0.07378429569225692, + "language_loss": 0.85029173, + "learning_rate": 0.0009557874953754284, + "loss": 0.86109, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.33569336, + "step": 836, + "time_per_iteration": 3.0223195552825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082807, + "balance_loss_mlp": 1.04883218, + "epoch": 0.16102347056560215, + "flos": 600311195136.0, + "grad_norm": 0.08025480036652383, + "language_loss": 0.83386606, + "learning_rate": 0.0009556593213206038, + "loss": 0.84469414, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.34008789, + "step": 837, + "time_per_iteration": 2.7436904907226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086966, + "balance_loss_mlp": 1.05318165, + "epoch": 0.1612158522508657, + "flos": 553235208192.0, + "grad_norm": 0.0690426934286745, + "language_loss": 0.87355983, + "learning_rate": 0.0009555309703631414, + "loss": 0.88442945, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.33813477, + "step": 838, + "time_per_iteration": 2.6828954219818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097306, + "balance_loss_mlp": 1.06364167, + "epoch": 0.16140823393612927, + "flos": 555701423616.0, + "grad_norm": 0.07092577785176474, + "language_loss": 0.87526888, + "learning_rate": 0.0009554024425528722, + "loss": 0.88624191, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.33691406, + "step": 839, + "time_per_iteration": 2.6739652156829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110874, + "balance_loss_mlp": 1.07797241, + "epoch": 0.16160061562139286, + "flos": 543613427712.0, + "grad_norm": 0.09046955561085915, + "language_loss": 0.88719451, + "learning_rate": 0.0009552737379396948, + "loss": 0.89830327, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.32910156, + "step": 840, + "time_per_iteration": 2.63122820854187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110692, + "balance_loss_mlp": 1.07757533, + "epoch": 0.16179299730665642, + "flos": 603590717952.0, + "grad_norm": 0.06735134703819705, + "language_loss": 0.88063818, + "learning_rate": 0.0009551448565735767, + "loss": 0.89174509, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.33129883, + "step": 841, + "time_per_iteration": 2.741941452026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121097, + "balance_loss_mlp": 1.08790874, + "epoch": 0.16198537899191998, + "flos": 786821050368.0, + "grad_norm": 0.06426805463858033, + "language_loss": 0.84472924, + "learning_rate": 0.0009550157985045543, + "loss": 0.85594022, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.33203125, + "step": 842, + "time_per_iteration": 3.045841693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102839, + "balance_loss_mlp": 1.07041371, + "epoch": 0.16217776067718354, + "flos": 519550917120.0, + "grad_norm": 0.06545460719380305, + "language_loss": 0.89229876, + "learning_rate": 0.0009548865637827321, + "loss": 0.90332717, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.32421875, + "step": 843, + "time_per_iteration": 2.6820054054260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100372, + "balance_loss_mlp": 1.06701708, + "epoch": 0.1623701423624471, + "flos": 505015644672.0, + "grad_norm": 0.09211303705947127, + "language_loss": 0.89927554, + "learning_rate": 0.0009547571524582838, + "loss": 0.91027921, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.33374023, + "step": 844, + "time_per_iteration": 2.592280149459839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097994, + "balance_loss_mlp": 1.06587958, + "epoch": 0.16256252404771065, + "flos": 496940493312.0, + "grad_norm": 0.07125004392928289, + "language_loss": 0.91891497, + "learning_rate": 0.0009546275645814512, + "loss": 0.92989492, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.32104492, + "step": 845, + "time_per_iteration": 2.6273765563964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097571, + "balance_loss_mlp": 1.06531262, + "epoch": 0.16275490573297421, + "flos": 502110061056.0, + "grad_norm": 0.07293740056217544, + "language_loss": 0.89635444, + "learning_rate": 0.0009544978002025446, + "loss": 0.90733016, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.32250977, + "step": 846, + "time_per_iteration": 2.5906271934509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091646, + "balance_loss_mlp": 1.05821955, + "epoch": 0.16294728741823777, + "flos": 506952179712.0, + "grad_norm": 0.052168896342380144, + "language_loss": 0.86807543, + "learning_rate": 0.0009543678593719434, + "loss": 0.8789919, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.33447266, + "step": 847, + "time_per_iteration": 2.747469186782837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098148, + "balance_loss_mlp": 1.06510353, + "epoch": 0.16313966910350133, + "flos": 509418395136.0, + "grad_norm": 0.05056297173362441, + "language_loss": 0.87167078, + "learning_rate": 0.0009542377421400945, + "loss": 0.88265228, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.33056641, + "step": 848, + "time_per_iteration": 2.7777974605560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102364, + "balance_loss_mlp": 1.06950974, + "epoch": 0.16333205078876492, + "flos": 543712352256.0, + "grad_norm": 0.06627324615029867, + "language_loss": 0.83542728, + "learning_rate": 0.0009541074485575145, + "loss": 0.84645092, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.32861328, + "step": 849, + "time_per_iteration": 2.7575085163116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105099, + "balance_loss_mlp": 1.07288873, + "epoch": 0.16352443247402848, + "flos": 507477477888.0, + "grad_norm": 0.05751037996071174, + "language_loss": 0.9190414, + "learning_rate": 0.0009539769786747874, + "loss": 0.93009233, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.32202148, + "step": 850, + "time_per_iteration": 2.6389074325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109592, + "balance_loss_mlp": 1.06261301, + "epoch": 0.16371681415929204, + "flos": 541851420672.0, + "grad_norm": 0.07235435681682932, + "language_loss": 0.81106341, + "learning_rate": 0.0009538463325425665, + "loss": 0.82202262, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.33325195, + "step": 851, + "time_per_iteration": 2.7013468742370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109998, + "balance_loss_mlp": 1.06695926, + "epoch": 0.1639091958445556, + "flos": 520501026816.0, + "grad_norm": 0.07286475265539226, + "language_loss": 0.86075503, + "learning_rate": 0.0009537155102115728, + "loss": 0.87175477, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.33032227, + "step": 852, + "time_per_iteration": 2.5927765369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089138, + "balance_loss_mlp": 1.05668926, + "epoch": 0.16410157752981916, + "flos": 547149026304.0, + "grad_norm": 0.07079739805294577, + "language_loss": 0.83340597, + "learning_rate": 0.0009535845117325961, + "loss": 0.84429741, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.32446289, + "step": 853, + "time_per_iteration": 2.6400251388549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090734, + "balance_loss_mlp": 1.05780828, + "epoch": 0.16429395921508272, + "flos": 582561828864.0, + "grad_norm": 0.055390341552487656, + "language_loss": 0.93137228, + "learning_rate": 0.0009534533371564946, + "loss": 0.9422797, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.3293457, + "step": 854, + "time_per_iteration": 2.794569492340088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097604, + "balance_loss_mlp": 1.06424975, + "epoch": 0.16448634090034628, + "flos": 530678628864.0, + "grad_norm": 0.07789269087805807, + "language_loss": 0.88390946, + "learning_rate": 0.0009533219865341949, + "loss": 0.89488548, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.33374023, + "step": 855, + "time_per_iteration": 2.5882935523986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110169, + "balance_loss_mlp": 1.07721937, + "epoch": 0.16467872258560984, + "flos": 491623948800.0, + "grad_norm": 0.07176827599451206, + "language_loss": 0.85993397, + "learning_rate": 0.0009531904599166916, + "loss": 0.87103564, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.32958984, + "step": 856, + "time_per_iteration": 2.6384060382843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108589, + "balance_loss_mlp": 1.07585454, + "epoch": 0.16487110427087343, + "flos": 506015216640.0, + "grad_norm": 0.08966352124388614, + "language_loss": 0.84823519, + "learning_rate": 0.0009530587573550478, + "loss": 0.85932112, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.32739258, + "step": 857, + "time_per_iteration": 2.6009740829467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139199, + "balance_loss_mlp": 1.11554801, + "epoch": 0.16506348595613698, + "flos": 1432006553088.0, + "grad_norm": 0.0480168233011906, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75458586, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.23632812, + "step": 858, + "time_per_iteration": 5.006503105163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108022, + "balance_loss_mlp": 1.07712269, + "epoch": 0.16525586764140054, + "flos": 476890827264.0, + "grad_norm": 0.08332018813054971, + "language_loss": 0.89907712, + "learning_rate": 0.0009527948246039337, + "loss": 0.91015732, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.30859375, + "step": 859, + "time_per_iteration": 2.5502097606658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113676, + "balance_loss_mlp": 1.08313441, + "epoch": 0.1654482493266641, + "flos": 880737297408.0, + "grad_norm": 0.06488618871597049, + "language_loss": 0.87213862, + "learning_rate": 0.000952662594516931, + "loss": 0.88327539, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.30493164, + "step": 860, + "time_per_iteration": 3.091632604598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112348, + "balance_loss_mlp": 1.08059049, + "epoch": 0.16564063101192766, + "flos": 626527028736.0, + "grad_norm": 0.18119016536128274, + "language_loss": 0.86193782, + "learning_rate": 0.0009525301886907234, + "loss": 0.8730613, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.31738281, + "step": 861, + "time_per_iteration": 2.8586955070495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115106, + "balance_loss_mlp": 1.08372974, + "epoch": 0.16583301269719122, + "flos": 561250722816.0, + "grad_norm": 0.06494583254435107, + "language_loss": 0.87565315, + "learning_rate": 0.0009523976071767155, + "loss": 0.88680422, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.31347656, + "step": 862, + "time_per_iteration": 2.6474006175994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113562, + "balance_loss_mlp": 1.08228135, + "epoch": 0.16602539438245478, + "flos": 567510022656.0, + "grad_norm": 0.05844730537287504, + "language_loss": 0.87850058, + "learning_rate": 0.00095226485002638, + "loss": 0.88963622, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.3125, + "step": 863, + "time_per_iteration": 2.7738211154937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101227, + "balance_loss_mlp": 1.06894565, + "epoch": 0.16621777606771834, + "flos": 574589984256.0, + "grad_norm": 0.05720313452307963, + "language_loss": 0.88969022, + "learning_rate": 0.0009521319172912576, + "loss": 0.90070248, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.32275391, + "step": 864, + "time_per_iteration": 2.762932538986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108698, + "balance_loss_mlp": 1.07624936, + "epoch": 0.16641015775298193, + "flos": 514292599296.0, + "grad_norm": 0.0631928299213439, + "language_loss": 0.94547617, + "learning_rate": 0.0009519988090229579, + "loss": 0.95656317, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.32446289, + "step": 865, + "time_per_iteration": 2.672088384628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105777, + "balance_loss_mlp": 1.07332826, + "epoch": 0.1666025394382455, + "flos": 621395338752.0, + "grad_norm": 0.06928181027356142, + "language_loss": 0.87572587, + "learning_rate": 0.0009518655252731576, + "loss": 0.8867836, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.32446289, + "step": 866, + "time_per_iteration": 2.754418134689331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104109, + "balance_loss_mlp": 1.07049167, + "epoch": 0.16679492112350905, + "flos": 548528329728.0, + "grad_norm": 0.059497633162238536, + "language_loss": 0.90014684, + "learning_rate": 0.0009517320660936022, + "loss": 0.91118789, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.33642578, + "step": 867, + "time_per_iteration": 2.73579478263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103888, + "balance_loss_mlp": 1.07117677, + "epoch": 0.1669873028087726, + "flos": 665379477504.0, + "grad_norm": 0.06138762269806642, + "language_loss": 0.82812411, + "learning_rate": 0.0009515984315361051, + "loss": 0.83916301, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.32714844, + "step": 868, + "time_per_iteration": 2.7929019927978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102635, + "balance_loss_mlp": 1.07016206, + "epoch": 0.16717968449403617, + "flos": 538305647616.0, + "grad_norm": 0.07711570113555911, + "language_loss": 0.8657794, + "learning_rate": 0.000951464621652548, + "loss": 0.87680572, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.32470703, + "step": 869, + "time_per_iteration": 2.6135518550872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105976, + "balance_loss_mlp": 1.07381344, + "epoch": 0.16737206617929973, + "flos": 529833235968.0, + "grad_norm": 0.07032317085354448, + "language_loss": 0.78791183, + "learning_rate": 0.0009513306364948804, + "loss": 0.79897159, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.3215332, + "step": 870, + "time_per_iteration": 2.7745420932769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101249, + "balance_loss_mlp": 1.06949186, + "epoch": 0.1675644478645633, + "flos": 480529732608.0, + "grad_norm": 0.0706094790942469, + "language_loss": 0.88557035, + "learning_rate": 0.0009511964761151197, + "loss": 0.89658284, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.31738281, + "step": 871, + "time_per_iteration": 2.570082902908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112728, + "balance_loss_mlp": 1.08147156, + "epoch": 0.16775682954982685, + "flos": 494311334400.0, + "grad_norm": 0.06741449701936619, + "language_loss": 0.90011156, + "learning_rate": 0.0009510621405653521, + "loss": 0.91123885, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.31225586, + "step": 872, + "time_per_iteration": 2.5378525257110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098432, + "balance_loss_mlp": 1.06860542, + "epoch": 0.1679492112350904, + "flos": 751694846976.0, + "grad_norm": 0.07031527693840728, + "language_loss": 0.8401826, + "learning_rate": 0.0009509276298977309, + "loss": 0.85116696, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.29760742, + "step": 873, + "time_per_iteration": 2.9614696502685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102853, + "balance_loss_mlp": 1.07121444, + "epoch": 0.168141592920354, + "flos": 1135413075456.0, + "grad_norm": 0.07037881289732177, + "language_loss": 0.8146044, + "learning_rate": 0.0009507929441644778, + "loss": 0.82563293, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.31616211, + "step": 874, + "time_per_iteration": 3.5029537677764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105025, + "balance_loss_mlp": 1.07403064, + "epoch": 0.16833397460561755, + "flos": 632114205696.0, + "grad_norm": 0.07204378854359271, + "language_loss": 0.8568964, + "learning_rate": 0.0009506580834178826, + "loss": 0.86794662, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.30957031, + "step": 875, + "time_per_iteration": 2.738445281982422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105989, + "balance_loss_mlp": 1.07420754, + "epoch": 0.1685263562908811, + "flos": 541171943424.0, + "grad_norm": 0.06279104396907492, + "language_loss": 0.91300583, + "learning_rate": 0.0009505230477103028, + "loss": 0.92406577, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.31762695, + "step": 876, + "time_per_iteration": 2.7304844856262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121558, + "balance_loss_mlp": 1.0900147, + "epoch": 0.16871873797614467, + "flos": 619036812288.0, + "grad_norm": 0.07749651336428325, + "language_loss": 0.81126654, + "learning_rate": 0.0009503878370941641, + "loss": 0.82248211, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.31518555, + "step": 877, + "time_per_iteration": 2.7332048416137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121651, + "balance_loss_mlp": 1.09063232, + "epoch": 0.16891111966140823, + "flos": 606067107840.0, + "grad_norm": 0.08158970109830238, + "language_loss": 0.88660848, + "learning_rate": 0.0009502524516219595, + "loss": 0.897825, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.30981445, + "step": 878, + "time_per_iteration": 2.810194730758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120277, + "balance_loss_mlp": 1.08942604, + "epoch": 0.1691035013466718, + "flos": 552058136064.0, + "grad_norm": 0.08439254905993104, + "language_loss": 0.89592326, + "learning_rate": 0.0009501168913462506, + "loss": 0.90712607, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.30810547, + "step": 879, + "time_per_iteration": 2.6550278663635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181395, + "balance_loss_mlp": 1.15822113, + "epoch": 0.16929588303193535, + "flos": 1475544121344.0, + "grad_norm": 0.05511344701971209, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80303323, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.23144531, + "step": 880, + "time_per_iteration": 4.798918962478638 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120202, + "balance_loss_mlp": 1.08894515, + "epoch": 0.1694882647171989, + "flos": 925850456064.0, + "grad_norm": 0.05479331137197536, + "language_loss": 0.85038209, + "learning_rate": 0.0009498452465949042, + "loss": 0.86158419, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.31225586, + "step": 881, + "time_per_iteration": 3.2795042991638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114024, + "balance_loss_mlp": 1.08295763, + "epoch": 0.1696806464024625, + "flos": 545829359616.0, + "grad_norm": 0.06005284109203957, + "language_loss": 0.91010857, + "learning_rate": 0.0009497091622247285, + "loss": 0.92124879, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.31030273, + "step": 882, + "time_per_iteration": 2.741497755050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114536, + "balance_loss_mlp": 1.0833751, + "epoch": 0.16987302808772606, + "flos": 528970466304.0, + "grad_norm": 0.08668021784836823, + "language_loss": 0.9325586, + "learning_rate": 0.0009495729032619723, + "loss": 0.94370389, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.3112793, + "step": 883, + "time_per_iteration": 2.6621923446655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102424, + "balance_loss_mlp": 1.07035685, + "epoch": 0.17006540977298962, + "flos": 754855096320.0, + "grad_norm": 0.06301404020698688, + "language_loss": 0.84119958, + "learning_rate": 0.0009494364697595354, + "loss": 0.85222387, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.32055664, + "step": 884, + "time_per_iteration": 2.8904953002929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102364, + "balance_loss_mlp": 1.07022548, + "epoch": 0.17025779145825318, + "flos": 558532813824.0, + "grad_norm": 0.06367673921209963, + "language_loss": 0.89062482, + "learning_rate": 0.0009492998617703867, + "loss": 0.9016484, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.32128906, + "step": 885, + "time_per_iteration": 2.65053129196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088714, + "balance_loss_mlp": 1.05779076, + "epoch": 0.17045017314351674, + "flos": 511963186176.0, + "grad_norm": 0.06771442044112419, + "language_loss": 0.87296236, + "learning_rate": 0.0009491630793475619, + "loss": 0.88384956, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.30908203, + "step": 886, + "time_per_iteration": 2.601238965988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095605, + "balance_loss_mlp": 1.06346607, + "epoch": 0.1706425548287803, + "flos": 508674898944.0, + "grad_norm": 0.064396115452368, + "language_loss": 0.85120332, + "learning_rate": 0.0009490261225441643, + "loss": 0.86215937, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.32128906, + "step": 887, + "time_per_iteration": 2.865694999694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089924, + "balance_loss_mlp": 1.05797613, + "epoch": 0.17083493651404386, + "flos": 717016776192.0, + "grad_norm": 0.06834327453619109, + "language_loss": 0.90091348, + "learning_rate": 0.0009488889914133656, + "loss": 0.91181278, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.31933594, + "step": 888, + "time_per_iteration": 3.0129144191741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092844, + "balance_loss_mlp": 1.06077635, + "epoch": 0.17102731819930742, + "flos": 558852908544.0, + "grad_norm": 0.06591248507341309, + "language_loss": 0.88667148, + "learning_rate": 0.0009487516860084047, + "loss": 0.89759994, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.32055664, + "step": 889, + "time_per_iteration": 2.738736867904663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087561, + "balance_loss_mlp": 1.05644727, + "epoch": 0.17121969988457098, + "flos": 494542679040.0, + "grad_norm": 0.07350534216298948, + "language_loss": 0.88845301, + "learning_rate": 0.0009486142063825884, + "loss": 0.89932865, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.31079102, + "step": 890, + "time_per_iteration": 2.5697011947631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117181, + "balance_loss_mlp": 1.15197396, + "epoch": 0.17141208156983456, + "flos": 1548088063488.0, + "grad_norm": 0.0550236747402086, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73598027, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.19824219, + "step": 891, + "time_per_iteration": 4.955617904663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092709, + "balance_loss_mlp": 1.06119013, + "epoch": 0.17160446325509812, + "flos": 619282713600.0, + "grad_norm": 0.06911805131577382, + "language_loss": 0.9061746, + "learning_rate": 0.0009483387246819542, + "loss": 0.91710162, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.31494141, + "step": 892, + "time_per_iteration": 2.725799798965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121181, + "balance_loss_mlp": 1.10153532, + "epoch": 0.17179684494036168, + "flos": 1381026972672.0, + "grad_norm": 0.032113973586073014, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83406758, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.19628906, + "step": 893, + "time_per_iteration": 4.664165735244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089705, + "balance_loss_mlp": 1.05813849, + "epoch": 0.17198922662562524, + "flos": 492386383872.0, + "grad_norm": 0.0574582553480054, + "language_loss": 0.89272118, + "learning_rate": 0.0009480625467392688, + "loss": 0.90361822, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.31542969, + "step": 894, + "time_per_iteration": 2.637554883956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109509, + "balance_loss_mlp": 1.08910024, + "epoch": 0.1721816083108888, + "flos": 1457529914880.0, + "grad_norm": 0.027611634873128267, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79104185, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.20410156, + "step": 895, + "time_per_iteration": 4.76848030090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088509, + "balance_loss_mlp": 1.05822968, + "epoch": 0.17237398999615236, + "flos": 527853030912.0, + "grad_norm": 0.05350045539937067, + "language_loss": 0.87532026, + "learning_rate": 0.0009477856729834196, + "loss": 0.88620532, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.30249023, + "step": 896, + "time_per_iteration": 2.7219061851501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093646, + "balance_loss_mlp": 1.06267512, + "epoch": 0.17256637168141592, + "flos": 603644562432.0, + "grad_norm": 0.06021872133739316, + "language_loss": 0.89942896, + "learning_rate": 0.0009476469753098809, + "loss": 0.9103654, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.30932617, + "step": 897, + "time_per_iteration": 2.6990017890930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109486, + "balance_loss_mlp": 1.06398487, + "epoch": 0.17275875336667948, + "flos": 509437334016.0, + "grad_norm": 0.072864012804074, + "language_loss": 0.86893761, + "learning_rate": 0.0009475081038443738, + "loss": 0.87988615, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.30834961, + "step": 898, + "time_per_iteration": 2.5972931385040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091914, + "balance_loss_mlp": 1.06030011, + "epoch": 0.17295113505194307, + "flos": 664951693824.0, + "grad_norm": 0.07073516416365672, + "language_loss": 0.85445154, + "learning_rate": 0.0009473690586408124, + "loss": 0.86537069, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.31591797, + "step": 899, + "time_per_iteration": 2.821336507797241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085642, + "balance_loss_mlp": 1.05421829, + "epoch": 0.17314351673720663, + "flos": 555125253120.0, + "grad_norm": 0.061416888012907525, + "language_loss": 0.86083823, + "learning_rate": 0.0009472298397531792, + "loss": 0.87169468, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.31396484, + "step": 900, + "time_per_iteration": 2.7345612049102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090461, + "balance_loss_mlp": 1.058918, + "epoch": 0.17333589842247019, + "flos": 503361326592.0, + "grad_norm": 0.060849230911096945, + "language_loss": 0.86217213, + "learning_rate": 0.0009470904472355235, + "loss": 0.87307668, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.31518555, + "step": 901, + "time_per_iteration": 2.637425661087036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089284, + "balance_loss_mlp": 1.05755067, + "epoch": 0.17352828010773375, + "flos": 555924003840.0, + "grad_norm": 0.07830588235472731, + "language_loss": 0.79847336, + "learning_rate": 0.0009469508811419626, + "loss": 0.80936623, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.31713867, + "step": 902, + "time_per_iteration": 2.70833683013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149006, + "balance_loss_mlp": 1.12678576, + "epoch": 0.1737206617929973, + "flos": 1553711556096.0, + "grad_norm": 0.05917050619752012, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72762835, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.22265625, + "step": 903, + "time_per_iteration": 4.776138782501221 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088102, + "balance_loss_mlp": 1.05562961, + "epoch": 0.17391304347826086, + "flos": 516390667776.0, + "grad_norm": 0.07262085456902109, + "language_loss": 0.83503735, + "learning_rate": 0.0009466712284439292, + "loss": 0.84591836, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.32470703, + "step": 904, + "time_per_iteration": 2.7372140884399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086515, + "balance_loss_mlp": 1.05385172, + "epoch": 0.17410542516352442, + "flos": 540773273088.0, + "grad_norm": 0.09192064511302059, + "language_loss": 0.88356638, + "learning_rate": 0.0009465311419480276, + "loss": 0.89443153, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.32666016, + "step": 905, + "time_per_iteration": 2.677032470703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109277, + "balance_loss_mlp": 1.06036901, + "epoch": 0.17429780684878798, + "flos": 623542869504.0, + "grad_norm": 0.07898220644020008, + "language_loss": 0.88434756, + "learning_rate": 0.0009463908820933622, + "loss": 0.89527524, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.32397461, + "step": 906, + "time_per_iteration": 2.8139841556549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097047, + "balance_loss_mlp": 1.06505144, + "epoch": 0.17449018853405157, + "flos": 575368386048.0, + "grad_norm": 0.0868003192310251, + "language_loss": 0.82122958, + "learning_rate": 0.0009462504489343868, + "loss": 0.83220005, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.31982422, + "step": 907, + "time_per_iteration": 2.8445968627929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103323, + "balance_loss_mlp": 1.07106495, + "epoch": 0.17468257021931513, + "flos": 533499844608.0, + "grad_norm": 0.09920963499058721, + "language_loss": 0.88653374, + "learning_rate": 0.0009461098425256222, + "loss": 0.89756691, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.32250977, + "step": 908, + "time_per_iteration": 2.5947494506835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109512, + "balance_loss_mlp": 1.07784963, + "epoch": 0.1748749519045787, + "flos": 540496848384.0, + "grad_norm": 0.09355765751058653, + "language_loss": 0.86340624, + "learning_rate": 0.0009459690629216567, + "loss": 0.87450135, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.31640625, + "step": 909, + "time_per_iteration": 2.621044874191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112453, + "balance_loss_mlp": 1.08155417, + "epoch": 0.17506733358984225, + "flos": 498373641216.0, + "grad_norm": 0.07034154505215827, + "language_loss": 0.8701601, + "learning_rate": 0.0009458281101771457, + "loss": 0.88128459, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.30859375, + "step": 910, + "time_per_iteration": 2.674091100692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115198, + "balance_loss_mlp": 1.08508539, + "epoch": 0.1752597152751058, + "flos": 622621873152.0, + "grad_norm": 0.09036058743894539, + "language_loss": 0.82642829, + "learning_rate": 0.0009456869843468122, + "loss": 0.83758032, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.30053711, + "step": 911, + "time_per_iteration": 2.830397605895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105257, + "balance_loss_mlp": 1.07378554, + "epoch": 0.17545209696036937, + "flos": 520717814784.0, + "grad_norm": 0.0879185530474863, + "language_loss": 0.78465313, + "learning_rate": 0.0009455456854854459, + "loss": 0.79570568, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.31445312, + "step": 912, + "time_per_iteration": 2.621293067932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102771, + "balance_loss_mlp": 1.07196748, + "epoch": 0.17564447864563293, + "flos": 461750270976.0, + "grad_norm": 0.0647038307980506, + "language_loss": 0.8401655, + "learning_rate": 0.0009454042136479039, + "loss": 0.85119313, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.30786133, + "step": 913, + "time_per_iteration": 2.5675978660583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095649, + "balance_loss_mlp": 1.0655843, + "epoch": 0.1758368603308965, + "flos": 480416251392.0, + "grad_norm": 0.06520052548040499, + "language_loss": 0.82717437, + "learning_rate": 0.0009452625688891103, + "loss": 0.83813089, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.30004883, + "step": 914, + "time_per_iteration": 2.5443828105926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156407, + "balance_loss_mlp": 1.13332844, + "epoch": 0.17602924201616005, + "flos": 1478160133632.0, + "grad_norm": 0.06121421634548094, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79891145, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.23046875, + "step": 915, + "time_per_iteration": 4.5826005935668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117447, + "balance_loss_mlp": 1.08642912, + "epoch": 0.17622162370142364, + "flos": 602010593280.0, + "grad_norm": 0.07309570223890104, + "language_loss": 0.93135887, + "learning_rate": 0.0009449787608278015, + "loss": 0.94253331, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.30981445, + "step": 916, + "time_per_iteration": 2.7787418365478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120355, + "balance_loss_mlp": 1.08926511, + "epoch": 0.1764140053866872, + "flos": 442473214464.0, + "grad_norm": 0.10226900865330964, + "language_loss": 0.92397296, + "learning_rate": 0.0009448365976354704, + "loss": 0.93517655, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.31054688, + "step": 917, + "time_per_iteration": 2.5531399250030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124705, + "balance_loss_mlp": 1.09247112, + "epoch": 0.17660638707195075, + "flos": 500362610688.0, + "grad_norm": 0.07454694115091837, + "language_loss": 0.89785659, + "learning_rate": 0.0009446942617422558, + "loss": 0.90910363, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.32226562, + "step": 918, + "time_per_iteration": 2.583489418029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123972, + "balance_loss_mlp": 1.09250093, + "epoch": 0.17679876875721431, + "flos": 538621360128.0, + "grad_norm": 0.06638545773718021, + "language_loss": 0.85658622, + "learning_rate": 0.0009445517532034176, + "loss": 0.86782598, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.31445312, + "step": 919, + "time_per_iteration": 2.6830263137817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122334, + "balance_loss_mlp": 1.09107733, + "epoch": 0.17699115044247787, + "flos": 497477376000.0, + "grad_norm": 0.09547651267352689, + "language_loss": 0.88907313, + "learning_rate": 0.0009444090720742824, + "loss": 0.90029645, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.31225586, + "step": 920, + "time_per_iteration": 2.5984437465667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123289, + "balance_loss_mlp": 1.09181738, + "epoch": 0.17718353212774143, + "flos": 662444780544.0, + "grad_norm": 0.10483808909193337, + "language_loss": 0.87128365, + "learning_rate": 0.0009442662184102439, + "loss": 0.8825165, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.31445312, + "step": 921, + "time_per_iteration": 2.772568941116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097725, + "balance_loss_mlp": 1.06737399, + "epoch": 0.177375913813005, + "flos": 582340658688.0, + "grad_norm": 0.057071439682559955, + "language_loss": 0.87210095, + "learning_rate": 0.000944123192266763, + "loss": 0.88307822, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.30297852, + "step": 922, + "time_per_iteration": 2.8091742992401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122306, + "balance_loss_mlp": 1.09004784, + "epoch": 0.17756829549826855, + "flos": 552285098496.0, + "grad_norm": 0.07267069192247201, + "language_loss": 0.83557594, + "learning_rate": 0.0009439799936993671, + "loss": 0.84679902, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.32250977, + "step": 923, + "time_per_iteration": 2.7226145267486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147891, + "balance_loss_mlp": 1.11494136, + "epoch": 0.17776067718353214, + "flos": 556060806144.0, + "grad_norm": 0.14883746036090706, + "language_loss": 0.88219315, + "learning_rate": 0.0009438366227636511, + "loss": 0.89367205, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.32958984, + "step": 924, + "time_per_iteration": 2.6409950256347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121758, + "balance_loss_mlp": 1.08950043, + "epoch": 0.1779530588687957, + "flos": 658161303552.0, + "grad_norm": 0.07347120708699749, + "language_loss": 0.85914218, + "learning_rate": 0.0009436930795152763, + "loss": 0.87035978, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.32250977, + "step": 925, + "time_per_iteration": 2.811595916748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107553, + "balance_loss_mlp": 1.07419825, + "epoch": 0.17814544055405926, + "flos": 644187644928.0, + "grad_norm": 0.07224950530739313, + "language_loss": 0.86246336, + "learning_rate": 0.0009435493640099713, + "loss": 0.87353885, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.33374023, + "step": 926, + "time_per_iteration": 2.775090456008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098588, + "balance_loss_mlp": 1.06513751, + "epoch": 0.17833782223932282, + "flos": 460672123392.0, + "grad_norm": 0.06608942550370576, + "language_loss": 0.83981788, + "learning_rate": 0.0009434054763035314, + "loss": 0.85080379, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.3347168, + "step": 927, + "time_per_iteration": 2.621683359146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089168, + "balance_loss_mlp": 1.05559874, + "epoch": 0.17853020392458638, + "flos": 759212766720.0, + "grad_norm": 0.06566794669431841, + "language_loss": 0.85671836, + "learning_rate": 0.0009432614164518185, + "loss": 0.86761004, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.33569336, + "step": 928, + "time_per_iteration": 3.011759042739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108639, + "balance_loss_mlp": 1.05320191, + "epoch": 0.17872258560984994, + "flos": 782320785408.0, + "grad_norm": 0.06622036101375141, + "language_loss": 0.84125841, + "learning_rate": 0.000943117184510762, + "loss": 0.85212231, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.33203125, + "step": 929, + "time_per_iteration": 2.9782960414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166041, + "balance_loss_mlp": 1.14010072, + "epoch": 0.1789149672951135, + "flos": 1459095482880.0, + "grad_norm": 0.044814265222739694, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79956007, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.25976562, + "step": 930, + "time_per_iteration": 5.011061906814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086461, + "balance_loss_mlp": 1.0529635, + "epoch": 0.17910734898037706, + "flos": 503598463488.0, + "grad_norm": 0.09835801245739735, + "language_loss": 0.88482547, + "learning_rate": 0.0009428282045846674, + "loss": 0.89569014, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.33520508, + "step": 931, + "time_per_iteration": 2.700901508331299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084257, + "balance_loss_mlp": 1.04899526, + "epoch": 0.17929973066564064, + "flos": 745895264256.0, + "grad_norm": 0.0790312068568768, + "language_loss": 0.88828444, + "learning_rate": 0.0009426834567118214, + "loss": 0.89912701, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.3527832, + "step": 932, + "time_per_iteration": 3.0847127437591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108852, + "balance_loss_mlp": 1.05557072, + "epoch": 0.1794921123509042, + "flos": 712875893760.0, + "grad_norm": 0.05851377965258845, + "language_loss": 0.80669105, + "learning_rate": 0.0009425385369740155, + "loss": 0.81757629, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.32958984, + "step": 933, + "time_per_iteration": 3.0405056476593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089624, + "balance_loss_mlp": 1.05517268, + "epoch": 0.17968449403616776, + "flos": 632838763008.0, + "grad_norm": 0.08098153489662575, + "language_loss": 0.86808264, + "learning_rate": 0.0009423934454275125, + "loss": 0.87897891, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.34472656, + "step": 934, + "time_per_iteration": 2.832589626312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090759, + "balance_loss_mlp": 1.05683184, + "epoch": 0.17987687572143132, + "flos": 536060602368.0, + "grad_norm": 0.0889712704970151, + "language_loss": 0.91607213, + "learning_rate": 0.0009422481821286418, + "loss": 0.92697972, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.33935547, + "step": 935, + "time_per_iteration": 2.739004611968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099753, + "balance_loss_mlp": 1.06589735, + "epoch": 0.18006925740669488, + "flos": 537818227200.0, + "grad_norm": 0.11621731552094582, + "language_loss": 0.87764728, + "learning_rate": 0.0009421027471337998, + "loss": 0.88864481, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.33886719, + "step": 936, + "time_per_iteration": 2.663978099822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093554, + "balance_loss_mlp": 1.06100953, + "epoch": 0.18026163909195844, + "flos": 539255757312.0, + "grad_norm": 0.08193839025260119, + "language_loss": 0.8197844, + "learning_rate": 0.0009419571404994493, + "loss": 0.83071995, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.32543945, + "step": 937, + "time_per_iteration": 2.680880308151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087323, + "balance_loss_mlp": 1.05427766, + "epoch": 0.180454020777222, + "flos": 500382959616.0, + "grad_norm": 0.08083617156557357, + "language_loss": 0.90250957, + "learning_rate": 0.00094181136228212, + "loss": 0.91338283, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.33056641, + "step": 938, + "time_per_iteration": 2.635734796524048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083703, + "balance_loss_mlp": 1.05134988, + "epoch": 0.18064640246248556, + "flos": 498689353728.0, + "grad_norm": 0.0738614516115471, + "language_loss": 0.85650909, + "learning_rate": 0.0009416654125384077, + "loss": 0.86734617, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.32348633, + "step": 939, + "time_per_iteration": 2.713120460510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092435, + "balance_loss_mlp": 1.06744874, + "epoch": 0.18083878414774912, + "flos": 1518572358144.0, + "grad_norm": 0.04310930319536216, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80864811, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.25, + "step": 940, + "time_per_iteration": 4.928712606430054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084719, + "balance_loss_mlp": 1.05372477, + "epoch": 0.1810311658330127, + "flos": 727006703616.0, + "grad_norm": 0.06379600043785322, + "language_loss": 0.83724225, + "learning_rate": 0.000941372998698552, + "loss": 0.84808946, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.30957031, + "step": 941, + "time_per_iteration": 2.9594616889953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092193, + "balance_loss_mlp": 1.0600785, + "epoch": 0.18122354751827627, + "flos": 564643726848.0, + "grad_norm": 0.07993905082854055, + "language_loss": 0.81844771, + "learning_rate": 0.0009412265347159336, + "loss": 0.82936954, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.32104492, + "step": 942, + "time_per_iteration": 2.705883741378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089966, + "balance_loss_mlp": 1.05847049, + "epoch": 0.18141592920353983, + "flos": 519024208896.0, + "grad_norm": 0.08204750484488939, + "language_loss": 0.84816301, + "learning_rate": 0.0009410798994339829, + "loss": 0.85906267, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.31469727, + "step": 943, + "time_per_iteration": 2.606898546218872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086355, + "balance_loss_mlp": 1.0538584, + "epoch": 0.1816083108888034, + "flos": 512219261952.0, + "grad_norm": 0.06564936273566103, + "language_loss": 0.88176167, + "learning_rate": 0.000940933092909628, + "loss": 0.89262521, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.32495117, + "step": 944, + "time_per_iteration": 2.568862199783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089679, + "balance_loss_mlp": 1.058375, + "epoch": 0.18180069257406695, + "flos": 492144864768.0, + "grad_norm": 0.06967818448900699, + "language_loss": 0.83546078, + "learning_rate": 0.0009407861151998649, + "loss": 0.84635758, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.31274414, + "step": 945, + "time_per_iteration": 2.5752463340759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108574, + "balance_loss_mlp": 1.05338621, + "epoch": 0.1819930742593305, + "flos": 569891870208.0, + "grad_norm": 0.07045774982796042, + "language_loss": 0.86168265, + "learning_rate": 0.0009406389663617552, + "loss": 0.87254012, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.32348633, + "step": 946, + "time_per_iteration": 2.6582529544830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084969, + "balance_loss_mlp": 1.05299747, + "epoch": 0.18218545594459407, + "flos": 605693168640.0, + "grad_norm": 0.08074656744529311, + "language_loss": 0.8540619, + "learning_rate": 0.000940491646452427, + "loss": 0.86491156, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.31958008, + "step": 947, + "time_per_iteration": 2.7117488384246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080922, + "balance_loss_mlp": 1.04914129, + "epoch": 0.18237783762985763, + "flos": 548419230720.0, + "grad_norm": 0.0614528539730692, + "language_loss": 0.90478814, + "learning_rate": 0.000940344155529075, + "loss": 0.91559744, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.31762695, + "step": 948, + "time_per_iteration": 2.675457239151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086239, + "balance_loss_mlp": 1.05472016, + "epoch": 0.1825702193151212, + "flos": 450509078016.0, + "grad_norm": 0.06480396750006864, + "language_loss": 0.8689037, + "learning_rate": 0.0009401964936489605, + "loss": 0.87976611, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.31494141, + "step": 949, + "time_per_iteration": 2.5517518520355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086362, + "balance_loss_mlp": 1.05446136, + "epoch": 0.18276260100038477, + "flos": 588962313216.0, + "grad_norm": 0.07386346522147075, + "language_loss": 0.84915626, + "learning_rate": 0.0009400486608694108, + "loss": 0.86001992, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.31884766, + "step": 950, + "time_per_iteration": 2.744371175765991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089338, + "balance_loss_mlp": 1.05769992, + "epoch": 0.18295498268564833, + "flos": 786988376064.0, + "grad_norm": 0.07193745080732644, + "language_loss": 0.86961377, + "learning_rate": 0.0009399006572478195, + "loss": 0.88050711, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.31616211, + "step": 951, + "time_per_iteration": 3.0956904888153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108625, + "balance_loss_mlp": 1.05427814, + "epoch": 0.1831473643709119, + "flos": 577878271488.0, + "grad_norm": 0.06892976413128309, + "language_loss": 0.90901303, + "learning_rate": 0.0009397524828416468, + "loss": 0.9198755, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.31958008, + "step": 952, + "time_per_iteration": 2.7130446434020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093503, + "balance_loss_mlp": 1.06179333, + "epoch": 0.18333974605617545, + "flos": 566622521856.0, + "grad_norm": 0.06752223069443862, + "language_loss": 0.96249408, + "learning_rate": 0.0009396041377084192, + "loss": 0.97342908, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.31689453, + "step": 953, + "time_per_iteration": 2.66972279548645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101927, + "balance_loss_mlp": 1.07043195, + "epoch": 0.183532127741439, + "flos": 526725421056.0, + "grad_norm": 0.07502219242723109, + "language_loss": 0.87290752, + "learning_rate": 0.0009394556219057295, + "loss": 0.88392681, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.31469727, + "step": 954, + "time_per_iteration": 2.659264326095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109814, + "balance_loss_mlp": 1.07810426, + "epoch": 0.18372450942670257, + "flos": 594259918848.0, + "grad_norm": 0.08651848853121004, + "language_loss": 0.8329587, + "learning_rate": 0.0009393069354912362, + "loss": 0.84405684, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.31689453, + "step": 955, + "time_per_iteration": 2.77437686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111165, + "balance_loss_mlp": 1.080266, + "epoch": 0.18391689111196613, + "flos": 644720145408.0, + "grad_norm": 0.07817657388257933, + "language_loss": 0.82119787, + "learning_rate": 0.0009391580785226649, + "loss": 0.83230954, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.30859375, + "step": 956, + "time_per_iteration": 2.867492437362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094598, + "balance_loss_mlp": 1.06903911, + "epoch": 0.18410927279722972, + "flos": 1456246563840.0, + "grad_norm": 0.05003344342080426, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.8043505, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.25585938, + "step": 957, + "time_per_iteration": 4.762399196624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108233, + "balance_loss_mlp": 1.07757246, + "epoch": 0.18430165448249328, + "flos": 658437728256.0, + "grad_norm": 0.06311489935861506, + "language_loss": 0.86409998, + "learning_rate": 0.0009388598531545196, + "loss": 0.87518233, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.30615234, + "step": 958, + "time_per_iteration": 2.8768551349639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102291, + "balance_loss_mlp": 1.07160664, + "epoch": 0.18449403616775684, + "flos": 517679811072.0, + "grad_norm": 0.07254101069499316, + "language_loss": 0.85046387, + "learning_rate": 0.000938710484870727, + "loss": 0.86148679, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.30639648, + "step": 959, + "time_per_iteration": 2.569592237472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123147, + "balance_loss_mlp": 1.09262919, + "epoch": 0.1846864178530204, + "flos": 552481537536.0, + "grad_norm": 0.07612110690317586, + "language_loss": 0.85695219, + "learning_rate": 0.0009385609462644189, + "loss": 0.86818361, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.3046875, + "step": 960, + "time_per_iteration": 2.6880924701690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127108, + "balance_loss_mlp": 1.09682918, + "epoch": 0.18487879953828396, + "flos": 465930441216.0, + "grad_norm": 0.08874671943740564, + "language_loss": 0.85532272, + "learning_rate": 0.0009384112373936514, + "loss": 0.86659384, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.30249023, + "step": 961, + "time_per_iteration": 2.6328110694885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117939, + "balance_loss_mlp": 1.08651531, + "epoch": 0.18507118122354752, + "flos": 648200489472.0, + "grad_norm": 0.0643111022382676, + "language_loss": 0.91187119, + "learning_rate": 0.0009382613583165467, + "loss": 0.92305064, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.31396484, + "step": 962, + "time_per_iteration": 2.7885348796844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116435, + "balance_loss_mlp": 1.08522642, + "epoch": 0.18526356290881107, + "flos": 626486330880.0, + "grad_norm": 0.08357757161984174, + "language_loss": 0.89136612, + "learning_rate": 0.0009381113090912928, + "loss": 0.90253055, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.31176758, + "step": 963, + "time_per_iteration": 2.7291858196258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109665, + "balance_loss_mlp": 1.07812214, + "epoch": 0.18545594459407463, + "flos": 432497843712.0, + "grad_norm": 0.08435952646587867, + "language_loss": 0.89444733, + "learning_rate": 0.000937961089776144, + "loss": 0.90554392, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.31518555, + "step": 964, + "time_per_iteration": 2.5736470222473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102378, + "balance_loss_mlp": 1.07090628, + "epoch": 0.1856483262793382, + "flos": 748720862208.0, + "grad_norm": 0.0989838613647617, + "language_loss": 0.82349026, + "learning_rate": 0.0009378107004294208, + "loss": 0.83451402, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.31445312, + "step": 965, + "time_per_iteration": 2.980569362640381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112627, + "balance_loss_mlp": 1.07982063, + "epoch": 0.18584070796460178, + "flos": 530058788352.0, + "grad_norm": 0.07592153009574268, + "language_loss": 0.91147316, + "learning_rate": 0.0009376601411095096, + "loss": 0.92259943, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.328125, + "step": 966, + "time_per_iteration": 2.6635591983795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136435, + "balance_loss_mlp": 1.10331881, + "epoch": 0.18603308964986534, + "flos": 482863527936.0, + "grad_norm": 0.16243248674453353, + "language_loss": 0.86357069, + "learning_rate": 0.0009375094118748622, + "loss": 0.87493503, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.33129883, + "step": 967, + "time_per_iteration": 2.522481679916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157231, + "balance_loss_mlp": 1.12368488, + "epoch": 0.1862254713351289, + "flos": 800976591360.0, + "grad_norm": 0.09362045292578998, + "language_loss": 0.90268016, + "learning_rate": 0.0009373585127839976, + "loss": 0.9142524, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.33544922, + "step": 968, + "time_per_iteration": 2.97210693359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152937, + "balance_loss_mlp": 1.1203692, + "epoch": 0.18641785302039246, + "flos": 478082456064.0, + "grad_norm": 0.0858654394488603, + "language_loss": 0.90605009, + "learning_rate": 0.0009372074438954994, + "loss": 0.91757941, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.32568359, + "step": 969, + "time_per_iteration": 2.541006088256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143226, + "balance_loss_mlp": 1.11030006, + "epoch": 0.18661023470565602, + "flos": 388695587328.0, + "grad_norm": 0.08996217866854661, + "language_loss": 0.91142356, + "learning_rate": 0.0009370562052680181, + "loss": 0.92285585, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.3293457, + "step": 970, + "time_per_iteration": 2.4985642433166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113885, + "balance_loss_mlp": 1.0805068, + "epoch": 0.18680261639091958, + "flos": 564402207744.0, + "grad_norm": 0.07707645065684006, + "language_loss": 0.88999593, + "learning_rate": 0.0009369047969602695, + "loss": 0.90113479, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.33398438, + "step": 971, + "time_per_iteration": 2.7079591751098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109357, + "balance_loss_mlp": 1.05985761, + "epoch": 0.18699499807618314, + "flos": 479018009088.0, + "grad_norm": 0.28998936625974164, + "language_loss": 0.86178541, + "learning_rate": 0.0009367532190310357, + "loss": 0.87272114, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.33740234, + "step": 972, + "time_per_iteration": 2.5647881031036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090596, + "balance_loss_mlp": 1.05535769, + "epoch": 0.1871873797614467, + "flos": 553022802432.0, + "grad_norm": 0.12045660132436305, + "language_loss": 0.89086068, + "learning_rate": 0.0009366014715391644, + "loss": 0.90176666, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.3527832, + "step": 973, + "time_per_iteration": 2.670271396636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098789, + "balance_loss_mlp": 1.06316936, + "epoch": 0.18737976144671029, + "flos": 552526617600.0, + "grad_norm": 0.06161121065256625, + "language_loss": 0.83607596, + "learning_rate": 0.0009364495545435693, + "loss": 0.84706378, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.35644531, + "step": 974, + "time_per_iteration": 2.7562968730926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115355, + "balance_loss_mlp": 1.08068919, + "epoch": 0.18757214313197385, + "flos": 502002372096.0, + "grad_norm": 0.0775906753320085, + "language_loss": 0.88572645, + "learning_rate": 0.0009362974681032297, + "loss": 0.89688003, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.34692383, + "step": 975, + "time_per_iteration": 2.618015766143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115715, + "balance_loss_mlp": 1.08102489, + "epoch": 0.1877645248172374, + "flos": 674691337728.0, + "grad_norm": 0.0743374582836454, + "language_loss": 0.87880743, + "learning_rate": 0.0009361452122771907, + "loss": 0.88996458, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.34716797, + "step": 976, + "time_per_iteration": 2.8973281383514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111997, + "balance_loss_mlp": 1.07754576, + "epoch": 0.18795690650250096, + "flos": 404771696640.0, + "grad_norm": 0.09294234225416288, + "language_loss": 0.83035111, + "learning_rate": 0.0009359927871245635, + "loss": 0.84147108, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.34472656, + "step": 977, + "time_per_iteration": 2.4868054389953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113361, + "balance_loss_mlp": 1.079983, + "epoch": 0.18814928818776452, + "flos": 637599485952.0, + "grad_norm": 0.08516170058225998, + "language_loss": 0.86584175, + "learning_rate": 0.0009358401927045246, + "loss": 0.87697542, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.33398438, + "step": 978, + "time_per_iteration": 2.8482747077941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110398, + "balance_loss_mlp": 1.07160234, + "epoch": 0.18834166987302808, + "flos": 1137825446400.0, + "grad_norm": 0.09204359799181126, + "language_loss": 0.88258326, + "learning_rate": 0.0009356874290763166, + "loss": 0.89362299, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.32373047, + "step": 979, + "time_per_iteration": 3.4733643531799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097908, + "balance_loss_mlp": 1.06529236, + "epoch": 0.18853405155829164, + "flos": 504538398720.0, + "grad_norm": 0.0915662715535259, + "language_loss": 0.88419032, + "learning_rate": 0.0009355344962992474, + "loss": 0.89516938, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.32617188, + "step": 980, + "time_per_iteration": 2.650907039642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098514, + "balance_loss_mlp": 1.06606519, + "epoch": 0.1887264332435552, + "flos": 607879987200.0, + "grad_norm": 0.13079327807375027, + "language_loss": 0.87520993, + "learning_rate": 0.0009353813944326908, + "loss": 0.88619506, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.32446289, + "step": 981, + "time_per_iteration": 2.937286138534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090956, + "balance_loss_mlp": 1.05845952, + "epoch": 0.1889188149288188, + "flos": 552264749568.0, + "grad_norm": 0.0755425770798311, + "language_loss": 0.82502437, + "learning_rate": 0.0009352281235360863, + "loss": 0.83593392, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.32495117, + "step": 982, + "time_per_iteration": 2.6979949474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096297, + "balance_loss_mlp": 1.06380093, + "epoch": 0.18911119661408235, + "flos": 418332128256.0, + "grad_norm": 0.0751009418062393, + "language_loss": 0.8470037, + "learning_rate": 0.0009350746836689389, + "loss": 0.85796672, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.32495117, + "step": 983, + "time_per_iteration": 2.538175582885742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131236, + "balance_loss_mlp": 1.10624993, + "epoch": 0.1893035782993459, + "flos": 1481141320704.0, + "grad_norm": 0.036870034223354546, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82570457, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.24902344, + "step": 984, + "time_per_iteration": 4.979044198989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097045, + "balance_loss_mlp": 1.0640955, + "epoch": 0.18949595998460947, + "flos": 508220974080.0, + "grad_norm": 0.0642225711410905, + "language_loss": 0.82250404, + "learning_rate": 0.0009347672972613634, + "loss": 0.83347452, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.3293457, + "step": 985, + "time_per_iteration": 2.593069553375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086897, + "balance_loss_mlp": 1.05339909, + "epoch": 0.18968834166987303, + "flos": 530812459008.0, + "grad_norm": 0.0802805585104316, + "language_loss": 0.85205728, + "learning_rate": 0.0009346133508402735, + "loss": 0.86292624, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.33520508, + "step": 986, + "time_per_iteration": 2.68485426902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095707, + "balance_loss_mlp": 1.06216192, + "epoch": 0.1898807233551366, + "flos": 499515807744.0, + "grad_norm": 0.09481546728284458, + "language_loss": 0.84014487, + "learning_rate": 0.0009344592356873166, + "loss": 0.85110188, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.33544922, + "step": 987, + "time_per_iteration": 2.6432511806488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105999, + "balance_loss_mlp": 1.07068968, + "epoch": 0.19007310504040015, + "flos": 601936399872.0, + "grad_norm": 0.06245857415063817, + "language_loss": 0.78166318, + "learning_rate": 0.0009343049518623255, + "loss": 0.79272318, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.35327148, + "step": 988, + "time_per_iteration": 2.7121620178222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120949, + "balance_loss_mlp": 1.085639, + "epoch": 0.1902654867256637, + "flos": 601374786048.0, + "grad_norm": 0.05952536728335112, + "language_loss": 0.83312774, + "learning_rate": 0.0009341504994251985, + "loss": 0.84433722, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.35327148, + "step": 989, + "time_per_iteration": 2.852208375930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107971, + "balance_loss_mlp": 1.05224383, + "epoch": 0.19045786841092727, + "flos": 1574925147648.0, + "grad_norm": 0.03692041129742979, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74600208, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.27539062, + "step": 990, + "time_per_iteration": 4.994582414627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137443, + "balance_loss_mlp": 1.09991539, + "epoch": 0.19065025009619085, + "flos": 681280906752.0, + "grad_norm": 0.056855766240422066, + "language_loss": 0.81516898, + "learning_rate": 0.0009338410889544574, + "loss": 0.82654339, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.37524414, + "step": 991, + "time_per_iteration": 3.017310380935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011468, + "balance_loss_mlp": 1.10831964, + "epoch": 0.1908426317814544, + "flos": 601971305472.0, + "grad_norm": 0.07195285392178245, + "language_loss": 0.87761319, + "learning_rate": 0.000933686131040967, + "loss": 0.88908118, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.38427734, + "step": 992, + "time_per_iteration": 2.8278160095214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144526, + "balance_loss_mlp": 1.10726154, + "epoch": 0.19103501346671797, + "flos": 586027616256.0, + "grad_norm": 0.07034922378143431, + "language_loss": 0.90235877, + "learning_rate": 0.0009335310047555883, + "loss": 0.91380405, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.37255859, + "step": 993, + "time_per_iteration": 2.8100597858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114172, + "balance_loss_mlp": 1.1050992, + "epoch": 0.19122739515198153, + "flos": 545494708224.0, + "grad_norm": 0.06860817272021875, + "language_loss": 0.88542485, + "learning_rate": 0.0009333757101585467, + "loss": 0.896842, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.36621094, + "step": 994, + "time_per_iteration": 2.6616106033325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131089, + "balance_loss_mlp": 1.0961132, + "epoch": 0.1914197768372451, + "flos": 521171739648.0, + "grad_norm": 0.0687364291234037, + "language_loss": 0.9324351, + "learning_rate": 0.0009332202473101329, + "loss": 0.94374597, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.3503418, + "step": 995, + "time_per_iteration": 2.7158730030059814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128597, + "balance_loss_mlp": 1.09400272, + "epoch": 0.19161215852250865, + "flos": 610961660928.0, + "grad_norm": 0.07471533178048465, + "language_loss": 0.82843316, + "learning_rate": 0.0009330646162707028, + "loss": 0.83971918, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.34619141, + "step": 996, + "time_per_iteration": 2.7293272018432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111164, + "balance_loss_mlp": 1.07730889, + "epoch": 0.1918045402077722, + "flos": 846281806848.0, + "grad_norm": 0.05994533952598048, + "language_loss": 0.84315574, + "learning_rate": 0.0009329088171006779, + "loss": 0.85426736, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.33886719, + "step": 997, + "time_per_iteration": 3.140655517578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110059, + "balance_loss_mlp": 1.07672858, + "epoch": 0.19199692189303577, + "flos": 465699096576.0, + "grad_norm": 0.06034276327327584, + "language_loss": 0.85438752, + "learning_rate": 0.0009327528498605446, + "loss": 0.86548805, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.33349609, + "step": 998, + "time_per_iteration": 2.5440673828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111119, + "balance_loss_mlp": 1.0778836, + "epoch": 0.19218930357829936, + "flos": 531318818304.0, + "grad_norm": 0.07596013514481052, + "language_loss": 0.89179873, + "learning_rate": 0.0009325967146108548, + "loss": 0.90290987, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.33251953, + "step": 999, + "time_per_iteration": 2.658561944961548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110413, + "balance_loss_mlp": 1.07856011, + "epoch": 0.19238168526356292, + "flos": 601350054912.0, + "grad_norm": 0.07750808981236326, + "language_loss": 0.8717553, + "learning_rate": 0.0009324404114122258, + "loss": 0.88285947, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.31835938, + "step": 1000, + "time_per_iteration": 2.7275264263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107767, + "balance_loss_mlp": 1.07667685, + "epoch": 0.19257406694882648, + "flos": 571690192896.0, + "grad_norm": 0.11937061799335263, + "language_loss": 0.86227536, + "learning_rate": 0.0009322839403253397, + "loss": 0.873353, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.31054688, + "step": 1001, + "time_per_iteration": 2.788405656814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110663, + "balance_loss_mlp": 1.0798831, + "epoch": 0.19276644863409004, + "flos": 801478568448.0, + "grad_norm": 0.07054171225662055, + "language_loss": 0.84055525, + "learning_rate": 0.0009321273014109439, + "loss": 0.85166192, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.30737305, + "step": 1002, + "time_per_iteration": 2.942535877227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110952, + "balance_loss_mlp": 1.0799818, + "epoch": 0.1929588303193536, + "flos": 563024314368.0, + "grad_norm": 0.057550289991663166, + "language_loss": 0.84200853, + "learning_rate": 0.0009319704947298513, + "loss": 0.85311806, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.30932617, + "step": 1003, + "time_per_iteration": 2.919499158859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110832, + "balance_loss_mlp": 1.07933664, + "epoch": 0.19315121200461716, + "flos": 626550349824.0, + "grad_norm": 0.07245253176429253, + "language_loss": 0.88662004, + "learning_rate": 0.0009318135203429393, + "loss": 0.89772838, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.31469727, + "step": 1004, + "time_per_iteration": 2.7168095111846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118727, + "balance_loss_mlp": 1.08770871, + "epoch": 0.19334359368988072, + "flos": 517169069568.0, + "grad_norm": 0.17670411464250102, + "language_loss": 0.8771624, + "learning_rate": 0.0009316563783111511, + "loss": 0.88834965, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.30981445, + "step": 1005, + "time_per_iteration": 2.7140395641326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116793, + "balance_loss_mlp": 1.08501196, + "epoch": 0.19353597537514428, + "flos": 693751606272.0, + "grad_norm": 0.08689807004334223, + "language_loss": 0.81857723, + "learning_rate": 0.0009314990686954943, + "loss": 0.82974517, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.31762695, + "step": 1006, + "time_per_iteration": 2.904555320739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106968, + "balance_loss_mlp": 1.07482958, + "epoch": 0.19372835706040784, + "flos": 1209665180160.0, + "grad_norm": 0.05703714693088015, + "language_loss": 0.80953801, + "learning_rate": 0.000931341591557042, + "loss": 0.82060766, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.32128906, + "step": 1007, + "time_per_iteration": 3.6937167644500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092779, + "balance_loss_mlp": 1.06078339, + "epoch": 0.19392073874567142, + "flos": 520368606720.0, + "grad_norm": 0.08309123344760973, + "language_loss": 0.87180555, + "learning_rate": 0.0009311839469569325, + "loss": 0.88273335, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.31982422, + "step": 1008, + "time_per_iteration": 2.6189959049224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099869, + "balance_loss_mlp": 1.06746829, + "epoch": 0.19411312043093498, + "flos": 588543293952.0, + "grad_norm": 0.10100018073420348, + "language_loss": 0.8730033, + "learning_rate": 0.0009310261349563687, + "loss": 0.88400197, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.32397461, + "step": 1009, + "time_per_iteration": 2.6890206336975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108302, + "balance_loss_mlp": 1.07516217, + "epoch": 0.19430550211619854, + "flos": 579085867008.0, + "grad_norm": 0.08933629042911205, + "language_loss": 0.85340321, + "learning_rate": 0.0009308681556166186, + "loss": 0.86448622, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.33154297, + "step": 1010, + "time_per_iteration": 2.824448585510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098531, + "balance_loss_mlp": 1.06546259, + "epoch": 0.1944978838014621, + "flos": 620848281600.0, + "grad_norm": 0.16096270434238172, + "language_loss": 0.87149101, + "learning_rate": 0.0009307100089990152, + "loss": 0.88247633, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.33081055, + "step": 1011, + "time_per_iteration": 2.74092173576355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105966, + "balance_loss_mlp": 1.07070398, + "epoch": 0.19469026548672566, + "flos": 598440089088.0, + "grad_norm": 0.08074644620093238, + "language_loss": 0.83646113, + "learning_rate": 0.0009305516951649568, + "loss": 0.84752083, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.3527832, + "step": 1012, + "time_per_iteration": 2.7069194316864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110286, + "balance_loss_mlp": 1.06726432, + "epoch": 0.19488264717198922, + "flos": 551890810368.0, + "grad_norm": 0.06954368088501534, + "language_loss": 0.86469871, + "learning_rate": 0.0009303932141759057, + "loss": 0.8757273, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.35595703, + "step": 1013, + "time_per_iteration": 2.7547597885131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109212, + "balance_loss_mlp": 1.07352042, + "epoch": 0.19507502885725278, + "flos": 665842166784.0, + "grad_norm": 0.08663105683367789, + "language_loss": 0.83731425, + "learning_rate": 0.0009302345660933902, + "loss": 0.84840637, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.35742188, + "step": 1014, + "time_per_iteration": 2.789421319961548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120247, + "balance_loss_mlp": 1.0850327, + "epoch": 0.19526741054251634, + "flos": 670771625472.0, + "grad_norm": 0.07248055996229082, + "language_loss": 0.85224003, + "learning_rate": 0.0009300757509790026, + "loss": 0.86344242, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.35229492, + "step": 1015, + "time_per_iteration": 2.8293235301971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138983, + "balance_loss_mlp": 1.10412574, + "epoch": 0.19545979222777993, + "flos": 446983653888.0, + "grad_norm": 0.08486300836715333, + "language_loss": 0.90133542, + "learning_rate": 0.0009299167688944005, + "loss": 0.91272521, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.34912109, + "step": 1016, + "time_per_iteration": 2.5042884349823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130604, + "balance_loss_mlp": 1.09453082, + "epoch": 0.1956521739130435, + "flos": 568813722624.0, + "grad_norm": 0.08182270058547457, + "language_loss": 0.86074531, + "learning_rate": 0.0009297576199013063, + "loss": 0.87205136, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.36108398, + "step": 1017, + "time_per_iteration": 2.678986072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01400492, + "balance_loss_mlp": 1.36921108, + "epoch": 0.19584455559830705, + "flos": 1454969157120.0, + "grad_norm": 0.11724614930420041, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74402618, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.3125, + "step": 1018, + "time_per_iteration": 4.915104627609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214569, + "balance_loss_mlp": 1.18538666, + "epoch": 0.1960369372835706, + "flos": 1590320369664.0, + "grad_norm": 0.08011150215373515, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.8064087, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.29101562, + "step": 1019, + "time_per_iteration": 5.440853834152222 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100562, + "balance_loss_mlp": 1.06565762, + "epoch": 0.19622931896883417, + "flos": 615709237248.0, + "grad_norm": 0.05949147024105531, + "language_loss": 0.86637676, + "learning_rate": 0.0009292791720892659, + "loss": 0.8773824, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.34960938, + "step": 1020, + "time_per_iteration": 2.8909873962402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110075, + "balance_loss_mlp": 1.06534433, + "epoch": 0.19642170065409773, + "flos": 465950790144.0, + "grad_norm": 0.08017401986968183, + "language_loss": 0.8851831, + "learning_rate": 0.0009291193560807218, + "loss": 0.89619064, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.35424805, + "step": 1021, + "time_per_iteration": 2.5876846313476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108676, + "balance_loss_mlp": 1.07329464, + "epoch": 0.19661408233936128, + "flos": 515040477696.0, + "grad_norm": 0.061421548763730266, + "language_loss": 0.86832839, + "learning_rate": 0.0009289593734732688, + "loss": 0.87941515, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.35400391, + "step": 1022, + "time_per_iteration": 2.5790083408355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116018, + "balance_loss_mlp": 1.08097017, + "epoch": 0.19680646402462484, + "flos": 392427624960.0, + "grad_norm": 0.06446420344630455, + "language_loss": 0.93862659, + "learning_rate": 0.0009287992243290175, + "loss": 0.94978678, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.35083008, + "step": 1023, + "time_per_iteration": 2.474393844604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126064, + "balance_loss_mlp": 1.09120703, + "epoch": 0.19699884570988843, + "flos": 626122566144.0, + "grad_norm": 0.06850198630338038, + "language_loss": 0.90312016, + "learning_rate": 0.0009286389087101435, + "loss": 0.91438079, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.34887695, + "step": 1024, + "time_per_iteration": 2.835756540298462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143856, + "balance_loss_mlp": 1.10885596, + "epoch": 0.197191227395152, + "flos": 557710742016.0, + "grad_norm": 0.06824019021489727, + "language_loss": 0.88388735, + "learning_rate": 0.0009284784266788864, + "loss": 0.8953259, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.3503418, + "step": 1025, + "time_per_iteration": 2.702479839324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144786, + "balance_loss_mlp": 1.11033428, + "epoch": 0.19738360908041555, + "flos": 664681061376.0, + "grad_norm": 0.08832519553576638, + "language_loss": 0.92221844, + "learning_rate": 0.0009283177782975512, + "loss": 0.93366635, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.34472656, + "step": 1026, + "time_per_iteration": 2.9851789474487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132889, + "balance_loss_mlp": 1.09850955, + "epoch": 0.1975759907656791, + "flos": 522244094976.0, + "grad_norm": 0.07134152927872167, + "language_loss": 0.87642545, + "learning_rate": 0.000928156963628507, + "loss": 0.88775432, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.34423828, + "step": 1027, + "time_per_iteration": 2.61114239692688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131535, + "balance_loss_mlp": 1.09686899, + "epoch": 0.19776837245094267, + "flos": 462233309184.0, + "grad_norm": 0.0723355054215018, + "language_loss": 0.88370252, + "learning_rate": 0.0009279959827341877, + "loss": 0.8950178, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.34692383, + "step": 1028, + "time_per_iteration": 2.7794618606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118297, + "balance_loss_mlp": 1.08248627, + "epoch": 0.19796075413620623, + "flos": 502809887232.0, + "grad_norm": 0.08314527790784168, + "language_loss": 0.87832725, + "learning_rate": 0.0009278348356770915, + "loss": 0.88951027, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.3581543, + "step": 1029, + "time_per_iteration": 2.5507349967956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111156, + "balance_loss_mlp": 1.07565451, + "epoch": 0.1981531358214698, + "flos": 507281038848.0, + "grad_norm": 0.08630189211983, + "language_loss": 0.85379845, + "learning_rate": 0.0009276735225197814, + "loss": 0.864914, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.359375, + "step": 1030, + "time_per_iteration": 2.597379207611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102081, + "balance_loss_mlp": 1.06650949, + "epoch": 0.19834551750673335, + "flos": 531275148288.0, + "grad_norm": 0.0907652175310469, + "language_loss": 0.85545719, + "learning_rate": 0.0009275120433248847, + "loss": 0.86647797, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.35571289, + "step": 1031, + "time_per_iteration": 2.687185287475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110602, + "balance_loss_mlp": 1.07545948, + "epoch": 0.1985378991919969, + "flos": 775147691520.0, + "grad_norm": 0.07461022440082729, + "language_loss": 0.85621846, + "learning_rate": 0.0009273503981550931, + "loss": 0.86732447, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.35205078, + "step": 1032, + "time_per_iteration": 3.0693371295928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101575, + "balance_loss_mlp": 1.06586027, + "epoch": 0.1987302808772605, + "flos": 434063411712.0, + "grad_norm": 0.15106160662845974, + "language_loss": 0.86904788, + "learning_rate": 0.0009271885870731626, + "loss": 0.88006359, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.35717773, + "step": 1033, + "time_per_iteration": 2.506413459777832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111392, + "balance_loss_mlp": 1.07536733, + "epoch": 0.19892266256252406, + "flos": 553342897152.0, + "grad_norm": 0.08761306204685197, + "language_loss": 0.88616383, + "learning_rate": 0.0009270266101419143, + "loss": 0.89727777, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.36035156, + "step": 1034, + "time_per_iteration": 2.6036899089813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098535, + "balance_loss_mlp": 1.06425047, + "epoch": 0.19911504424778761, + "flos": 549596302848.0, + "grad_norm": 0.06384965023316368, + "language_loss": 0.84987146, + "learning_rate": 0.0009268644674242328, + "loss": 0.86085683, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.34301758, + "step": 1035, + "time_per_iteration": 2.7015764713287354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113397, + "balance_loss_mlp": 1.07806361, + "epoch": 0.19930742593305117, + "flos": 518024636928.0, + "grad_norm": 0.07882877348480413, + "language_loss": 0.80515361, + "learning_rate": 0.0009267021589830678, + "loss": 0.81628758, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.35327148, + "step": 1036, + "time_per_iteration": 2.643951892852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01709033, + "balance_loss_mlp": 1.66611803, + "epoch": 0.19949980761831473, + "flos": 1508516849664.0, + "grad_norm": 0.11391778300632174, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.79336113, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.4296875, + "step": 1037, + "time_per_iteration": 4.949443101882935 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103323, + "balance_loss_mlp": 1.0683465, + "epoch": 0.1996921893035783, + "flos": 697803738624.0, + "grad_norm": 0.08774205983796875, + "language_loss": 0.92838657, + "learning_rate": 0.000926377045182406, + "loss": 0.93941981, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.35009766, + "step": 1038, + "time_per_iteration": 2.9512856006622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112905, + "balance_loss_mlp": 1.07821524, + "epoch": 0.19988457098884185, + "flos": 726682226688.0, + "grad_norm": 0.06255968137292814, + "language_loss": 0.87761998, + "learning_rate": 0.0009262142399491296, + "loss": 0.888749, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.34716797, + "step": 1039, + "time_per_iteration": 3.0552709102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112668, + "balance_loss_mlp": 1.09187126, + "epoch": 0.2000769526741054, + "flos": 560275881984.0, + "grad_norm": 0.06862779420362043, + "language_loss": 0.87532222, + "learning_rate": 0.0009260512692448105, + "loss": 0.88658899, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.34863281, + "step": 1040, + "time_per_iteration": 2.6962392330169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140867, + "balance_loss_mlp": 1.10615349, + "epoch": 0.200269334359369, + "flos": 571758594048.0, + "grad_norm": 0.07166596959521815, + "language_loss": 0.84091032, + "learning_rate": 0.000925888133132719, + "loss": 0.852319, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.34741211, + "step": 1041, + "time_per_iteration": 2.791015148162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01724521, + "balance_loss_mlp": 1.67225933, + "epoch": 0.20046171604463256, + "flos": 1485362340864.0, + "grad_norm": 0.16089622263247963, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.8133496, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.5234375, + "step": 1042, + "time_per_iteration": 4.978717565536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116458, + "balance_loss_mlp": 1.08169639, + "epoch": 0.20065409772989612, + "flos": 496266808320.0, + "grad_norm": 0.06766738281342395, + "language_loss": 0.80769098, + "learning_rate": 0.0009255613649386244, + "loss": 0.81885552, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.34790039, + "step": 1043, + "time_per_iteration": 2.6604766845703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122505, + "balance_loss_mlp": 1.08709943, + "epoch": 0.20084647941515968, + "flos": 579094631424.0, + "grad_norm": 0.07361728486384381, + "language_loss": 0.78999138, + "learning_rate": 0.0009253977329834838, + "loss": 0.80121642, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.35449219, + "step": 1044, + "time_per_iteration": 2.7036681175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108398, + "balance_loss_mlp": 1.07227719, + "epoch": 0.20103886110042324, + "flos": 641775273984.0, + "grad_norm": 0.08623717161971375, + "language_loss": 0.86596096, + "learning_rate": 0.0009252339358742965, + "loss": 0.87704492, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.36108398, + "step": 1045, + "time_per_iteration": 2.874620199203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118791, + "balance_loss_mlp": 1.08369565, + "epoch": 0.2012312427856868, + "flos": 441720953856.0, + "grad_norm": 0.06963930913543727, + "language_loss": 0.82984746, + "learning_rate": 0.000925069973674654, + "loss": 0.84103537, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.35107422, + "step": 1046, + "time_per_iteration": 2.628878116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106078, + "balance_loss_mlp": 1.07017231, + "epoch": 0.20142362447095036, + "flos": 554135855616.0, + "grad_norm": 0.07870556033127275, + "language_loss": 0.88610631, + "learning_rate": 0.000924905846448212, + "loss": 0.89716709, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.35913086, + "step": 1047, + "time_per_iteration": 2.747220754623413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109682, + "balance_loss_mlp": 1.0750165, + "epoch": 0.20161600615621392, + "flos": 669988841472.0, + "grad_norm": 0.10747792176710873, + "language_loss": 0.85372317, + "learning_rate": 0.0009247415542586906, + "loss": 0.86482, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.34667969, + "step": 1048, + "time_per_iteration": 2.8556973934173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119311, + "balance_loss_mlp": 1.08285666, + "epoch": 0.2018083878414775, + "flos": 572788689408.0, + "grad_norm": 0.2214820598260846, + "language_loss": 0.83177209, + "learning_rate": 0.0009245770971698735, + "loss": 0.84296525, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.36450195, + "step": 1049, + "time_per_iteration": 2.9050869941711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132964, + "balance_loss_mlp": 1.09798741, + "epoch": 0.20200076952674106, + "flos": 425624495616.0, + "grad_norm": 0.08175342307012821, + "language_loss": 0.88327754, + "learning_rate": 0.0009244124752456087, + "loss": 0.89460719, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.34985352, + "step": 1050, + "time_per_iteration": 2.5141613483428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151097, + "balance_loss_mlp": 1.11557305, + "epoch": 0.20219315121200462, + "flos": 536326852608.0, + "grad_norm": 0.06393011823673703, + "language_loss": 0.85371649, + "learning_rate": 0.0009242476885498081, + "loss": 0.86522746, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.35522461, + "step": 1051, + "time_per_iteration": 2.727687358856201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176333, + "balance_loss_mlp": 1.14171457, + "epoch": 0.20238553289726818, + "flos": 477634323456.0, + "grad_norm": 0.09914193731013146, + "language_loss": 0.80802011, + "learning_rate": 0.0009240827371464474, + "loss": 0.81978351, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.34643555, + "step": 1052, + "time_per_iteration": 2.552121877670288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191475, + "balance_loss_mlp": 1.15521157, + "epoch": 0.20257791458253174, + "flos": 1151611430400.0, + "grad_norm": 0.1023503287046967, + "language_loss": 0.83863074, + "learning_rate": 0.0009239176210995666, + "loss": 0.85054547, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.36230469, + "step": 1053, + "time_per_iteration": 3.47882342338562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190284, + "balance_loss_mlp": 1.15561819, + "epoch": 0.2027702962677953, + "flos": 666606011904.0, + "grad_norm": 0.09115683042396579, + "language_loss": 0.93677175, + "learning_rate": 0.0009237523404732695, + "loss": 0.94867456, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.34692383, + "step": 1054, + "time_per_iteration": 2.8701720237731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173476, + "balance_loss_mlp": 1.13838029, + "epoch": 0.20296267795305886, + "flos": 641011428864.0, + "grad_norm": 0.10782024136876088, + "language_loss": 0.8421399, + "learning_rate": 0.0009235868953317235, + "loss": 0.85387468, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.3515625, + "step": 1055, + "time_per_iteration": 2.8210723400115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161281, + "balance_loss_mlp": 1.12682986, + "epoch": 0.20315505963832242, + "flos": 930187777536.0, + "grad_norm": 0.07346272336072437, + "language_loss": 0.85227096, + "learning_rate": 0.0009234212857391602, + "loss": 0.86388373, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.3449707, + "step": 1056, + "time_per_iteration": 3.2212936878204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153084, + "balance_loss_mlp": 1.11727369, + "epoch": 0.20334744132358598, + "flos": 561818128896.0, + "grad_norm": 0.054845505201833546, + "language_loss": 0.89240777, + "learning_rate": 0.000923255511759875, + "loss": 0.90393853, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.3581543, + "step": 1057, + "time_per_iteration": 2.834444522857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156175, + "balance_loss_mlp": 1.12146115, + "epoch": 0.20353982300884957, + "flos": 643902455808.0, + "grad_norm": 0.10969304378799022, + "language_loss": 0.84913409, + "learning_rate": 0.000923089573458227, + "loss": 0.86069584, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.34716797, + "step": 1058, + "time_per_iteration": 2.8832740783691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115137, + "balance_loss_mlp": 1.1168946, + "epoch": 0.20373220469411313, + "flos": 651101690880.0, + "grad_norm": 0.24205150411640483, + "language_loss": 0.83790255, + "learning_rate": 0.0009229234708986392, + "loss": 0.84941626, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.3449707, + "step": 1059, + "time_per_iteration": 2.8837289810180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01633401, + "balance_loss_mlp": 1.57885134, + "epoch": 0.2039245863793767, + "flos": 1436939136000.0, + "grad_norm": 0.08953482343612705, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.83300292, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.546875, + "step": 1060, + "time_per_iteration": 4.667459011077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158699, + "balance_loss_mlp": 1.1247009, + "epoch": 0.20411696806464025, + "flos": 596678082048.0, + "grad_norm": 0.0736942782322193, + "language_loss": 0.84963936, + "learning_rate": 0.0009225907732636548, + "loss": 0.86122632, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.34033203, + "step": 1061, + "time_per_iteration": 2.7532095909118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164545, + "balance_loss_mlp": 1.12954497, + "epoch": 0.2043093497499038, + "flos": 573530775552.0, + "grad_norm": 0.09512005659435491, + "language_loss": 0.8641578, + "learning_rate": 0.0009224241783174227, + "loss": 0.87580323, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.35009766, + "step": 1062, + "time_per_iteration": 2.683047294616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147761, + "balance_loss_mlp": 1.11347604, + "epoch": 0.20450173143516737, + "flos": 630061217280.0, + "grad_norm": 0.07955707081408017, + "language_loss": 0.85456479, + "learning_rate": 0.0009222574193715802, + "loss": 0.86604244, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.34326172, + "step": 1063, + "time_per_iteration": 2.8293161392211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139319, + "balance_loss_mlp": 1.10474837, + "epoch": 0.20469411312043093, + "flos": 573718450176.0, + "grad_norm": 0.08617592440024102, + "language_loss": 0.85715151, + "learning_rate": 0.000922090496490869, + "loss": 0.8685447, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.34619141, + "step": 1064, + "time_per_iteration": 2.749298334121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124198, + "balance_loss_mlp": 1.08865011, + "epoch": 0.20488649480569449, + "flos": 636748300800.0, + "grad_norm": 0.06572729358097257, + "language_loss": 0.89767212, + "learning_rate": 0.0009219234097400937, + "loss": 0.90891409, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.35595703, + "step": 1065, + "time_per_iteration": 2.8508355617523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107138, + "balance_loss_mlp": 1.07175696, + "epoch": 0.20507887649095807, + "flos": 975383894016.0, + "grad_norm": 0.05918330788086957, + "language_loss": 0.82970631, + "learning_rate": 0.0009217561591841237, + "loss": 0.8407777, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.35400391, + "step": 1066, + "time_per_iteration": 3.3216452598571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102073, + "balance_loss_mlp": 1.06566656, + "epoch": 0.20527125817622163, + "flos": 485940819456.0, + "grad_norm": 0.09526156176010836, + "language_loss": 0.81088316, + "learning_rate": 0.0009215887448878913, + "loss": 0.82190394, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.36401367, + "step": 1067, + "time_per_iteration": 2.596022129058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099777, + "balance_loss_mlp": 1.06191611, + "epoch": 0.2054636398614852, + "flos": 526921860096.0, + "grad_norm": 0.072135210200994, + "language_loss": 0.84963661, + "learning_rate": 0.0009214211669163922, + "loss": 0.86063439, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.37841797, + "step": 1068, + "time_per_iteration": 4.440082311630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096187, + "balance_loss_mlp": 1.05923223, + "epoch": 0.20565602154674875, + "flos": 557898416640.0, + "grad_norm": 0.07010547570027807, + "language_loss": 0.93398243, + "learning_rate": 0.0009212534253346862, + "loss": 0.94494426, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.36938477, + "step": 1069, + "time_per_iteration": 2.699843406677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096083, + "balance_loss_mlp": 1.05912852, + "epoch": 0.2058484032320123, + "flos": 503976784896.0, + "grad_norm": 0.07799270520419531, + "language_loss": 0.83685625, + "learning_rate": 0.0009210855202078964, + "loss": 0.84781706, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.36962891, + "step": 1070, + "time_per_iteration": 2.5999720096588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010932, + "balance_loss_mlp": 1.05810475, + "epoch": 0.20604078491727587, + "flos": 432950358528.0, + "grad_norm": 0.0723710550133871, + "language_loss": 0.86933672, + "learning_rate": 0.0009209174516012091, + "loss": 0.88026869, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.35131836, + "step": 1071, + "time_per_iteration": 2.503551483154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092567, + "balance_loss_mlp": 1.05794883, + "epoch": 0.20623316660253943, + "flos": 608421252096.0, + "grad_norm": 0.05962541016594441, + "language_loss": 0.88928151, + "learning_rate": 0.0009207492195798747, + "loss": 0.90020716, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.34667969, + "step": 1072, + "time_per_iteration": 2.8607378005981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094226, + "balance_loss_mlp": 1.05972731, + "epoch": 0.206425548287803, + "flos": 480184906752.0, + "grad_norm": 0.06398863953592046, + "language_loss": 0.84846818, + "learning_rate": 0.0009205808242092061, + "loss": 0.85941041, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.34521484, + "step": 1073, + "time_per_iteration": 2.644134044647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095063, + "balance_loss_mlp": 1.06080186, + "epoch": 0.20661792997306658, + "flos": 949007937024.0, + "grad_norm": 0.06666861242543158, + "language_loss": 0.82488537, + "learning_rate": 0.0009204122655545808, + "loss": 0.83583593, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.34277344, + "step": 1074, + "time_per_iteration": 3.3254919052124023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110011, + "balance_loss_mlp": 1.07582152, + "epoch": 0.20681031165833014, + "flos": 603206604288.0, + "grad_norm": 0.0719401545163873, + "language_loss": 0.81125832, + "learning_rate": 0.0009202435436814388, + "loss": 0.82235849, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.34228516, + "step": 1075, + "time_per_iteration": 2.704252243041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105303, + "balance_loss_mlp": 1.0707798, + "epoch": 0.2070026933435937, + "flos": 708665200128.0, + "grad_norm": 0.06775779875999222, + "language_loss": 0.89715004, + "learning_rate": 0.0009200746586552836, + "loss": 0.90820301, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.34545898, + "step": 1076, + "time_per_iteration": 2.897177219390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102531, + "balance_loss_mlp": 1.06869972, + "epoch": 0.20719507502885726, + "flos": 829456409088.0, + "grad_norm": 0.12065235325240355, + "language_loss": 0.83624744, + "learning_rate": 0.0009199056105416825, + "loss": 0.84727275, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.33862305, + "step": 1077, + "time_per_iteration": 3.0771028995513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106086, + "balance_loss_mlp": 1.07218289, + "epoch": 0.20738745671412082, + "flos": 637993774080.0, + "grad_norm": 0.06486814220319007, + "language_loss": 0.8622663, + "learning_rate": 0.0009197363994062654, + "loss": 0.8733272, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.33935547, + "step": 1078, + "time_per_iteration": 2.807009696960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112785, + "balance_loss_mlp": 1.07914448, + "epoch": 0.20757983839938438, + "flos": 685258845696.0, + "grad_norm": 0.06985523034062016, + "language_loss": 0.84313667, + "learning_rate": 0.0009195670253147262, + "loss": 0.85426456, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.33642578, + "step": 1079, + "time_per_iteration": 2.9738564491271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114515, + "balance_loss_mlp": 1.0817802, + "epoch": 0.20777222008464794, + "flos": 519024208896.0, + "grad_norm": 0.09202653272357895, + "language_loss": 0.81912923, + "learning_rate": 0.0009193974883328216, + "loss": 0.8302744, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.32739258, + "step": 1080, + "time_per_iteration": 2.639878511428833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121501, + "balance_loss_mlp": 1.08721614, + "epoch": 0.2079646017699115, + "flos": 511136732160.0, + "grad_norm": 0.059797822691547486, + "language_loss": 0.86745334, + "learning_rate": 0.0009192277885263718, + "loss": 0.87866837, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.34326172, + "step": 1081, + "time_per_iteration": 4.060026407241821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120473, + "balance_loss_mlp": 1.08671248, + "epoch": 0.20815698345517505, + "flos": 931409929728.0, + "grad_norm": 0.0682125291941454, + "language_loss": 0.86169523, + "learning_rate": 0.0009190579259612602, + "loss": 0.87289995, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.33789062, + "step": 1082, + "time_per_iteration": 3.2795815467834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134326, + "balance_loss_mlp": 1.10132933, + "epoch": 0.20834936514043864, + "flos": 632114205696.0, + "grad_norm": 0.06852391956291448, + "language_loss": 0.86675245, + "learning_rate": 0.000918887900703433, + "loss": 0.87809569, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.33007812, + "step": 1083, + "time_per_iteration": 2.813777208328247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137242, + "balance_loss_mlp": 1.1025995, + "epoch": 0.2085417468257022, + "flos": 394170693120.0, + "grad_norm": 0.07184608102087402, + "language_loss": 0.90139276, + "learning_rate": 0.0009187177128188999, + "loss": 0.91276515, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.34667969, + "step": 1084, + "time_per_iteration": 2.4950854778289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01361857, + "balance_loss_mlp": 1.30883229, + "epoch": 0.20873412851096576, + "flos": 1401387969024.0, + "grad_norm": 0.057507491560350586, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78518397, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.53125, + "step": 1085, + "time_per_iteration": 4.9323132038116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116843, + "balance_loss_mlp": 1.08279717, + "epoch": 0.20892651019622932, + "flos": 447599112192.0, + "grad_norm": 0.0734883897044225, + "language_loss": 0.85634506, + "learning_rate": 0.000918376849434071, + "loss": 0.86751348, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.34057617, + "step": 1086, + "time_per_iteration": 2.504467487335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110856, + "balance_loss_mlp": 1.07680964, + "epoch": 0.20911889188149288, + "flos": 492863629824.0, + "grad_norm": 0.07305298195252904, + "language_loss": 0.90630972, + "learning_rate": 0.0009182061740661098, + "loss": 0.91741836, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.34057617, + "step": 1087, + "time_per_iteration": 2.5760254859924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111346, + "balance_loss_mlp": 1.0785315, + "epoch": 0.20931127356675644, + "flos": 840928946688.0, + "grad_norm": 0.05349746945174757, + "language_loss": 0.84760422, + "learning_rate": 0.0009180353363361127, + "loss": 0.85873878, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.34912109, + "step": 1088, + "time_per_iteration": 3.0988333225250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111767, + "balance_loss_mlp": 1.07593286, + "epoch": 0.20950365525202, + "flos": 756796013568.0, + "grad_norm": 0.0658577902216117, + "language_loss": 0.81715566, + "learning_rate": 0.0009178643363104044, + "loss": 0.82827336, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.35864258, + "step": 1089, + "time_per_iteration": 3.1410629749298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106557, + "balance_loss_mlp": 1.07155704, + "epoch": 0.20969603693728356, + "flos": 472301812224.0, + "grad_norm": 0.10460691940838339, + "language_loss": 0.90569937, + "learning_rate": 0.0009176931740553735, + "loss": 0.91676497, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.35009766, + "step": 1090, + "time_per_iteration": 2.529330253601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112911, + "balance_loss_mlp": 1.07698107, + "epoch": 0.20988841862254715, + "flos": 976507121664.0, + "grad_norm": 0.07113631656774884, + "language_loss": 0.82557011, + "learning_rate": 0.0009175218496374708, + "loss": 0.83669925, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.359375, + "step": 1091, + "time_per_iteration": 3.347742795944214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110472, + "balance_loss_mlp": 1.07356465, + "epoch": 0.2100808003078107, + "flos": 1092697731072.0, + "grad_norm": 0.08284412758413852, + "language_loss": 0.85813856, + "learning_rate": 0.0009173503631232103, + "loss": 0.86924326, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.36914062, + "step": 1092, + "time_per_iteration": 3.378859758377075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103445, + "balance_loss_mlp": 1.06684804, + "epoch": 0.21027318199307427, + "flos": 1012567468032.0, + "grad_norm": 0.09413161778101656, + "language_loss": 0.81595004, + "learning_rate": 0.0009171787145791691, + "loss": 0.82698447, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.36621094, + "step": 1093, + "time_per_iteration": 3.215574026107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099602, + "balance_loss_mlp": 1.06214595, + "epoch": 0.21046556367833782, + "flos": 521141216256.0, + "grad_norm": 0.0806437411167059, + "language_loss": 0.80327773, + "learning_rate": 0.000917006904071987, + "loss": 0.81427377, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.37451172, + "step": 1094, + "time_per_iteration": 2.6117537021636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100105, + "balance_loss_mlp": 1.06377053, + "epoch": 0.21065794536360138, + "flos": 603437948928.0, + "grad_norm": 0.08991830585001004, + "language_loss": 0.87576157, + "learning_rate": 0.0009168349316683669, + "loss": 0.88676262, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.36352539, + "step": 1095, + "time_per_iteration": 2.740950345993042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101819, + "balance_loss_mlp": 1.06650949, + "epoch": 0.21085032704886494, + "flos": 603045070848.0, + "grad_norm": 0.06267137937039592, + "language_loss": 0.8218863, + "learning_rate": 0.0009166627974350741, + "loss": 0.83290446, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.35327148, + "step": 1096, + "time_per_iteration": 2.887326240539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098665, + "balance_loss_mlp": 1.06206763, + "epoch": 0.2110427087341285, + "flos": 637382697984.0, + "grad_norm": 0.07019696164219995, + "language_loss": 0.89238816, + "learning_rate": 0.0009164905014389373, + "loss": 0.90337479, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.3659668, + "step": 1097, + "time_per_iteration": 2.7609455585479736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105326, + "balance_loss_mlp": 1.06908655, + "epoch": 0.21123509041939206, + "flos": 522667496448.0, + "grad_norm": 0.06528725154368942, + "language_loss": 0.8638711, + "learning_rate": 0.0009163180437468476, + "loss": 0.87492442, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.36254883, + "step": 1098, + "time_per_iteration": 2.5998973846435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096402, + "balance_loss_mlp": 1.06009042, + "epoch": 0.21142747210465565, + "flos": 450938271744.0, + "grad_norm": 0.06547964129234486, + "language_loss": 0.85908926, + "learning_rate": 0.000916145424425759, + "loss": 0.87005323, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.36303711, + "step": 1099, + "time_per_iteration": 2.6804425716400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101989, + "balance_loss_mlp": 1.06601155, + "epoch": 0.2116198537899192, + "flos": 875813630976.0, + "grad_norm": 0.08063804967749887, + "language_loss": 0.90475744, + "learning_rate": 0.0009159726435426885, + "loss": 0.91577733, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.35986328, + "step": 1100, + "time_per_iteration": 3.1017394065856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100921, + "balance_loss_mlp": 1.06499124, + "epoch": 0.21181223547518277, + "flos": 523410992640.0, + "grad_norm": 0.08023517310436831, + "language_loss": 0.90250683, + "learning_rate": 0.0009157997011647154, + "loss": 0.9135161, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.359375, + "step": 1101, + "time_per_iteration": 2.5878560543060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096622, + "balance_loss_mlp": 1.06045425, + "epoch": 0.21200461716044633, + "flos": 572014669824.0, + "grad_norm": 0.05508329212621071, + "language_loss": 0.86001104, + "learning_rate": 0.0009156265973589817, + "loss": 0.87097728, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.36206055, + "step": 1102, + "time_per_iteration": 2.7933261394500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097006, + "balance_loss_mlp": 1.06121981, + "epoch": 0.2121969988457099, + "flos": 544869075456.0, + "grad_norm": 0.06583201442001711, + "language_loss": 0.89802408, + "learning_rate": 0.0009154533321926926, + "loss": 0.90899414, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.35791016, + "step": 1103, + "time_per_iteration": 2.647494316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096343, + "balance_loss_mlp": 1.0598892, + "epoch": 0.21238938053097345, + "flos": 843489704448.0, + "grad_norm": 0.06603869229078199, + "language_loss": 0.87027407, + "learning_rate": 0.0009152799057331156, + "loss": 0.88123751, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.36499023, + "step": 1104, + "time_per_iteration": 3.1623916625976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097231, + "balance_loss_mlp": 1.06134939, + "epoch": 0.212581762216237, + "flos": 445984081920.0, + "grad_norm": 0.07161611233178561, + "language_loss": 0.90831178, + "learning_rate": 0.0009151063180475805, + "loss": 0.91928405, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.35913086, + "step": 1105, + "time_per_iteration": 2.5515594482421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099591, + "balance_loss_mlp": 1.06516361, + "epoch": 0.21277414390150057, + "flos": 514129655808.0, + "grad_norm": 0.08899576142412509, + "language_loss": 0.83941323, + "learning_rate": 0.0009149325692034803, + "loss": 0.85040915, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.34472656, + "step": 1106, + "time_per_iteration": 2.561875343322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300575, + "balance_loss_mlp": 1.25708735, + "epoch": 0.21296652558676413, + "flos": 1484790552576.0, + "grad_norm": 0.05662804479307553, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80504, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.43554688, + "step": 1107, + "time_per_iteration": 4.880220174789429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104038, + "balance_loss_mlp": 1.06870413, + "epoch": 0.21315890727202771, + "flos": 845689669632.0, + "grad_norm": 0.06711298172071122, + "language_loss": 0.87037283, + "learning_rate": 0.0009145845883094678, + "loss": 0.88141322, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.35375977, + "step": 1108, + "time_per_iteration": 3.0598409175872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105103, + "balance_loss_mlp": 1.06931639, + "epoch": 0.21335128895729127, + "flos": 629086376448.0, + "grad_norm": 0.06803775359788228, + "language_loss": 0.8464098, + "learning_rate": 0.000914410356394654, + "loss": 0.85746086, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.35839844, + "step": 1109, + "time_per_iteration": 2.776258945465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103906, + "balance_loss_mlp": 1.06799972, + "epoch": 0.21354367064255483, + "flos": 710649787392.0, + "grad_norm": 0.052025780444459935, + "language_loss": 0.84733951, + "learning_rate": 0.0009142359635914709, + "loss": 0.85837853, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.35913086, + "step": 1110, + "time_per_iteration": 3.057307243347168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096278, + "balance_loss_mlp": 1.05996692, + "epoch": 0.2137360523278184, + "flos": 455950688256.0, + "grad_norm": 0.10914443694781037, + "language_loss": 0.84286684, + "learning_rate": 0.0009140614099676245, + "loss": 0.85382962, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.36328125, + "step": 1111, + "time_per_iteration": 2.6110692024230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087884, + "balance_loss_mlp": 1.0517633, + "epoch": 0.21392843401308195, + "flos": 665749034496.0, + "grad_norm": 0.09545242357915729, + "language_loss": 0.82540983, + "learning_rate": 0.0009138866955908821, + "loss": 0.83628869, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.36132812, + "step": 1112, + "time_per_iteration": 2.870765209197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100291, + "balance_loss_mlp": 1.06445658, + "epoch": 0.2141208156983455, + "flos": 748656843264.0, + "grad_norm": 0.06321568237144509, + "language_loss": 0.8048408, + "learning_rate": 0.0009137118205290738, + "loss": 0.8158437, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.35864258, + "step": 1113, + "time_per_iteration": 4.381570100784302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097039, + "balance_loss_mlp": 1.06091869, + "epoch": 0.21431319738360907, + "flos": 418898124288.0, + "grad_norm": 0.06328361159326604, + "language_loss": 0.89779603, + "learning_rate": 0.0009135367848500924, + "loss": 0.90876651, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.36157227, + "step": 1114, + "time_per_iteration": 2.511164665222168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096629, + "balance_loss_mlp": 1.06034184, + "epoch": 0.21450557906887263, + "flos": 608849035776.0, + "grad_norm": 0.08987717155463379, + "language_loss": 0.86417669, + "learning_rate": 0.0009133615886218927, + "loss": 0.87514299, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.36303711, + "step": 1115, + "time_per_iteration": 2.7101125717163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089806, + "balance_loss_mlp": 1.05337584, + "epoch": 0.21469796075413622, + "flos": 561649393152.0, + "grad_norm": 0.07119429557645003, + "language_loss": 0.87869287, + "learning_rate": 0.0009131862319124917, + "loss": 0.88959092, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.36425781, + "step": 1116, + "time_per_iteration": 2.6387155055999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092992, + "balance_loss_mlp": 1.05648971, + "epoch": 0.21489034243939978, + "flos": 594363225600.0, + "grad_norm": 0.06965010238630005, + "language_loss": 0.83447617, + "learning_rate": 0.0009130107147899691, + "loss": 0.84540606, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.36499023, + "step": 1117, + "time_per_iteration": 2.723092794418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094347, + "balance_loss_mlp": 1.05805993, + "epoch": 0.21508272412466334, + "flos": 441661317120.0, + "grad_norm": 0.055087901571477416, + "language_loss": 0.84983969, + "learning_rate": 0.0009128350373224665, + "loss": 0.8607831, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.36352539, + "step": 1118, + "time_per_iteration": 2.5449509620666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178954, + "balance_loss_mlp": 1.14500344, + "epoch": 0.2152751058099269, + "flos": 1495397348352.0, + "grad_norm": 0.021865185871831474, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82635385, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.33984375, + "step": 1119, + "time_per_iteration": 4.641271114349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103175, + "balance_loss_mlp": 1.06648207, + "epoch": 0.21546748749519046, + "flos": 493759895040.0, + "grad_norm": 0.07523243301623007, + "language_loss": 0.85678464, + "learning_rate": 0.0009124832016254005, + "loss": 0.86781639, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.36694336, + "step": 1120, + "time_per_iteration": 2.655371904373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109795, + "balance_loss_mlp": 1.06163859, + "epoch": 0.21565986918045402, + "flos": 634241387520.0, + "grad_norm": 0.07092227494936269, + "language_loss": 0.87677884, + "learning_rate": 0.0009123070435324316, + "loss": 0.88775837, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.36352539, + "step": 1121, + "time_per_iteration": 2.777632236480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166186, + "balance_loss_mlp": 1.13337982, + "epoch": 0.21585225086571758, + "flos": 1582502704128.0, + "grad_norm": 0.01899876446696313, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.7904197, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.328125, + "step": 1122, + "time_per_iteration": 4.966520547866821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089218, + "balance_loss_mlp": 1.0522635, + "epoch": 0.21604463255098114, + "flos": 683799556608.0, + "grad_norm": 0.060329223802114536, + "language_loss": 0.86415493, + "learning_rate": 0.0009119542471995752, + "loss": 0.87504709, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.36938477, + "step": 1123, + "time_per_iteration": 2.8373889923095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090311, + "balance_loss_mlp": 1.05438125, + "epoch": 0.2162370142362447, + "flos": 780660675072.0, + "grad_norm": 0.06176848453484022, + "language_loss": 0.81323773, + "learning_rate": 0.0009117776090966554, + "loss": 0.82414079, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.359375, + "step": 1124, + "time_per_iteration": 2.999127149581909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087355, + "balance_loss_mlp": 1.0507102, + "epoch": 0.21642939592150828, + "flos": 1001745294336.0, + "grad_norm": 0.07470238986110685, + "language_loss": 0.86757743, + "learning_rate": 0.0009116008111274899, + "loss": 0.87845105, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.36669922, + "step": 1125, + "time_per_iteration": 3.3534371852874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160744, + "balance_loss_mlp": 1.13022673, + "epoch": 0.21662177760677184, + "flos": 1481867440128.0, + "grad_norm": 0.021433456679081614, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80267668, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.3046875, + "step": 1126, + "time_per_iteration": 4.8522608280181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086571, + "balance_loss_mlp": 1.04975939, + "epoch": 0.2168141592920354, + "flos": 887030092800.0, + "grad_norm": 0.07895568764354688, + "language_loss": 0.85050654, + "learning_rate": 0.0009112467358650396, + "loss": 0.86137229, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.36816406, + "step": 1127, + "time_per_iteration": 3.157684803009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090796, + "balance_loss_mlp": 1.05472374, + "epoch": 0.21700654097729896, + "flos": 545682382848.0, + "grad_norm": 0.05660039583272807, + "language_loss": 0.86175025, + "learning_rate": 0.0009110694587092192, + "loss": 0.87265825, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.36108398, + "step": 1128, + "time_per_iteration": 2.755575656890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088206, + "balance_loss_mlp": 1.052562, + "epoch": 0.21719892266256252, + "flos": 509270008320.0, + "grad_norm": 0.077592311143443, + "language_loss": 0.81304091, + "learning_rate": 0.0009108920219620815, + "loss": 0.82392299, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.35693359, + "step": 1129, + "time_per_iteration": 2.639261484146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091101, + "balance_loss_mlp": 1.05548096, + "epoch": 0.21739130434782608, + "flos": 543150738432.0, + "grad_norm": 0.06998872933736075, + "language_loss": 0.8949976, + "learning_rate": 0.0009107144256925133, + "loss": 0.90590858, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.35620117, + "step": 1130, + "time_per_iteration": 2.685058832168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096157, + "balance_loss_mlp": 1.0606091, + "epoch": 0.21758368603308964, + "flos": 616564804608.0, + "grad_norm": 0.08228743876345572, + "language_loss": 0.81527102, + "learning_rate": 0.0009105366699694638, + "loss": 0.82623267, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.35546875, + "step": 1131, + "time_per_iteration": 2.726532220840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087405, + "balance_loss_mlp": 1.0526911, + "epoch": 0.2177760677183532, + "flos": 634813175808.0, + "grad_norm": 0.05363867293402688, + "language_loss": 0.81731898, + "learning_rate": 0.0009103587548619439, + "loss": 0.82819301, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.34741211, + "step": 1132, + "time_per_iteration": 2.856782913208008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096099, + "balance_loss_mlp": 1.05978799, + "epoch": 0.2179684494036168, + "flos": 532181587968.0, + "grad_norm": 0.0659512575968049, + "language_loss": 0.85836411, + "learning_rate": 0.0009101806804390261, + "loss": 0.8693251, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.36328125, + "step": 1133, + "time_per_iteration": 2.789860725402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093043, + "balance_loss_mlp": 1.056494, + "epoch": 0.21816083108888035, + "flos": 474980433408.0, + "grad_norm": 0.06887538910693401, + "language_loss": 0.90261114, + "learning_rate": 0.0009100024467698453, + "loss": 0.91354156, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.3659668, + "step": 1134, + "time_per_iteration": 2.6074166297912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095464, + "balance_loss_mlp": 1.05786586, + "epoch": 0.2183532127741439, + "flos": 577198794240.0, + "grad_norm": 0.07516267041517319, + "language_loss": 0.82424915, + "learning_rate": 0.0009098240539235981, + "loss": 0.83520383, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.37573242, + "step": 1135, + "time_per_iteration": 2.6695401668548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095721, + "balance_loss_mlp": 1.05809808, + "epoch": 0.21854559445940747, + "flos": 593832135168.0, + "grad_norm": 0.07818229339121877, + "language_loss": 0.87811279, + "learning_rate": 0.0009096455019695423, + "loss": 0.88907003, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.3762207, + "step": 1136, + "time_per_iteration": 4.259606838226318 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088969, + "balance_loss_mlp": 1.05180001, + "epoch": 0.21873797614467103, + "flos": 408464446464.0, + "grad_norm": 0.07138569527580692, + "language_loss": 0.89539087, + "learning_rate": 0.000909466790976998, + "loss": 0.90628058, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.37182617, + "step": 1137, + "time_per_iteration": 2.4586610794067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086709, + "balance_loss_mlp": 1.0483948, + "epoch": 0.21893035782993459, + "flos": 893824865280.0, + "grad_norm": 0.07428895088203294, + "language_loss": 0.82083362, + "learning_rate": 0.0009092879210153473, + "loss": 0.83170068, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.38305664, + "step": 1138, + "time_per_iteration": 3.097928524017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087285, + "balance_loss_mlp": 1.04944801, + "epoch": 0.21912273951519814, + "flos": 467392702464.0, + "grad_norm": 0.07001266476470332, + "language_loss": 0.88581419, + "learning_rate": 0.0009091088921540333, + "loss": 0.89668703, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.37817383, + "step": 1139, + "time_per_iteration": 2.5904369354248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138075, + "balance_loss_mlp": 1.11270714, + "epoch": 0.2193151212004617, + "flos": 1531262665728.0, + "grad_norm": 0.032290681216211516, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76646751, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.25390625, + "step": 1140, + "time_per_iteration": 4.913591623306274 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090399, + "balance_loss_mlp": 1.05353999, + "epoch": 0.2195075028857253, + "flos": 590901820416.0, + "grad_norm": 0.1397659602768512, + "language_loss": 0.84288347, + "learning_rate": 0.0009087503580104985, + "loss": 0.85378748, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.36865234, + "step": 1141, + "time_per_iteration": 2.6825575828552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103171, + "balance_loss_mlp": 1.06602514, + "epoch": 0.21969988457098885, + "flos": 636033917952.0, + "grad_norm": 0.0722566511462073, + "language_loss": 0.79141879, + "learning_rate": 0.0009085708528674728, + "loss": 0.80245048, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.37133789, + "step": 1142, + "time_per_iteration": 2.8078551292419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104306, + "balance_loss_mlp": 1.06551528, + "epoch": 0.2198922662562524, + "flos": 911974311936.0, + "grad_norm": 0.06720954872782575, + "language_loss": 0.8638975, + "learning_rate": 0.0009083911891031745, + "loss": 0.87494051, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.38793945, + "step": 1143, + "time_per_iteration": 3.1356892585754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110101, + "balance_loss_mlp": 1.07328963, + "epoch": 0.22008464794151597, + "flos": 822603409920.0, + "grad_norm": 0.08162422903338651, + "language_loss": 0.91253042, + "learning_rate": 0.0009082113667873553, + "loss": 0.92363143, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.3684082, + "step": 1144, + "time_per_iteration": 3.1446304321289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112165, + "balance_loss_mlp": 1.07387483, + "epoch": 0.22027702962677953, + "flos": 459416475648.0, + "grad_norm": 0.0676762249982335, + "language_loss": 0.90471655, + "learning_rate": 0.0009080313859898283, + "loss": 0.91583818, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.38256836, + "step": 1145, + "time_per_iteration": 2.5298025608062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110814, + "balance_loss_mlp": 1.07082736, + "epoch": 0.2204694113120431, + "flos": 530998723584.0, + "grad_norm": 0.13336101787368373, + "language_loss": 0.91929018, + "learning_rate": 0.0009078512467804684, + "loss": 0.93037164, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.37304688, + "step": 1146, + "time_per_iteration": 2.6156158447265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105973, + "balance_loss_mlp": 1.06882787, + "epoch": 0.22066179299730665, + "flos": 522382307328.0, + "grad_norm": 0.06165136945539885, + "language_loss": 0.89993024, + "learning_rate": 0.0009076709492292119, + "loss": 0.91098994, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.37133789, + "step": 1147, + "time_per_iteration": 2.617534875869751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095299, + "balance_loss_mlp": 1.06032324, + "epoch": 0.2208541746825702, + "flos": 546188742144.0, + "grad_norm": 0.11177878536303132, + "language_loss": 0.88637269, + "learning_rate": 0.0009074904934060562, + "loss": 0.89732569, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.34985352, + "step": 1148, + "time_per_iteration": 2.6782190799713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086783, + "balance_loss_mlp": 1.05237889, + "epoch": 0.22104655636783377, + "flos": 708404742144.0, + "grad_norm": 0.0637571078176039, + "language_loss": 0.84905714, + "learning_rate": 0.0009073098793810607, + "loss": 0.85992491, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.34423828, + "step": 1149, + "time_per_iteration": 2.956638813018799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085311, + "balance_loss_mlp": 1.04969168, + "epoch": 0.22123893805309736, + "flos": 584594468352.0, + "grad_norm": 0.07731387173425769, + "language_loss": 0.8803097, + "learning_rate": 0.000907129107224346, + "loss": 0.89116287, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.35595703, + "step": 1150, + "time_per_iteration": 2.724456548690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081381, + "balance_loss_mlp": 1.04623771, + "epoch": 0.22143131973836092, + "flos": 492002270208.0, + "grad_norm": 0.0527541061714234, + "language_loss": 0.88156152, + "learning_rate": 0.0009069481770060939, + "loss": 0.89237529, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.35180664, + "step": 1151, + "time_per_iteration": 2.6539950370788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084376, + "balance_loss_mlp": 1.04811299, + "epoch": 0.22162370142362448, + "flos": 1079227459584.0, + "grad_norm": 0.06610336138884995, + "language_loss": 0.83768857, + "learning_rate": 0.000906767088796548, + "loss": 0.84853232, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.36279297, + "step": 1152, + "time_per_iteration": 3.4304041862487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087423, + "balance_loss_mlp": 1.05147004, + "epoch": 0.22181608310888803, + "flos": 492258345984.0, + "grad_norm": 0.06692160227790218, + "language_loss": 0.87012255, + "learning_rate": 0.0009065858426660127, + "loss": 0.88099682, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.35986328, + "step": 1153, + "time_per_iteration": 2.639326333999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089801, + "balance_loss_mlp": 1.05480099, + "epoch": 0.2220084647941516, + "flos": 723687892992.0, + "grad_norm": 0.07963844060104928, + "language_loss": 0.84658396, + "learning_rate": 0.0009064044386848543, + "loss": 0.85748196, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.3503418, + "step": 1154, + "time_per_iteration": 2.904387950897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094705, + "balance_loss_mlp": 1.05992007, + "epoch": 0.22220084647941515, + "flos": 488988997632.0, + "grad_norm": 0.07985092329826342, + "language_loss": 0.88786525, + "learning_rate": 0.0009062228769234997, + "loss": 0.89881229, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.34838867, + "step": 1155, + "time_per_iteration": 2.547041416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095087, + "balance_loss_mlp": 1.05977738, + "epoch": 0.2223932281646787, + "flos": 536025696768.0, + "grad_norm": 0.067267193175655, + "language_loss": 0.80872244, + "learning_rate": 0.0009060411574524376, + "loss": 0.81967336, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.35327148, + "step": 1156, + "time_per_iteration": 2.647101402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100356, + "balance_loss_mlp": 1.06561852, + "epoch": 0.22258560984994227, + "flos": 931034580480.0, + "grad_norm": 0.07018019580992392, + "language_loss": 0.87947989, + "learning_rate": 0.0009058592803422178, + "loss": 0.8904835, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.34765625, + "step": 1157, + "time_per_iteration": 3.161827564239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087783, + "balance_loss_mlp": 1.05688405, + "epoch": 0.22277799153520586, + "flos": 1198998443520.0, + "grad_norm": 0.0269537140509509, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79798073, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.30859375, + "step": 1158, + "time_per_iteration": 4.827271223068237 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100633, + "balance_loss_mlp": 1.06608617, + "epoch": 0.22297037322046942, + "flos": 501052262400.0, + "grad_norm": 0.10870396219255896, + "language_loss": 0.89957273, + "learning_rate": 0.00090549505348681, + "loss": 0.91057909, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.34594727, + "step": 1159, + "time_per_iteration": 2.5724213123321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115899, + "balance_loss_mlp": 1.08144796, + "epoch": 0.22316275490573298, + "flos": 752413612032.0, + "grad_norm": 0.06607938149323832, + "language_loss": 0.83976638, + "learning_rate": 0.0009053127038830275, + "loss": 0.85092539, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.3449707, + "step": 1160, + "time_per_iteration": 2.979442834854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108838, + "balance_loss_mlp": 1.07538772, + "epoch": 0.22335513659099654, + "flos": 514553057280.0, + "grad_norm": 0.07010640296313479, + "language_loss": 0.86946774, + "learning_rate": 0.000905130196922898, + "loss": 0.88055611, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.3347168, + "step": 1161, + "time_per_iteration": 2.582780361175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114166, + "balance_loss_mlp": 1.0797379, + "epoch": 0.2235475182762601, + "flos": 484286501376.0, + "grad_norm": 0.056850955952103474, + "language_loss": 0.86954904, + "learning_rate": 0.0009049475326772769, + "loss": 0.88069069, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.34472656, + "step": 1162, + "time_per_iteration": 2.572434902191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116085, + "balance_loss_mlp": 1.08270645, + "epoch": 0.22373989996152366, + "flos": 469698794496.0, + "grad_norm": 0.07142312953148652, + "language_loss": 0.82233834, + "learning_rate": 0.0009047647112170811, + "loss": 0.83349919, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.33398438, + "step": 1163, + "time_per_iteration": 2.7467033863067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104962, + "balance_loss_mlp": 1.07115388, + "epoch": 0.22393228164678722, + "flos": 1270512594432.0, + "grad_norm": 0.07009650422776509, + "language_loss": 0.87291974, + "learning_rate": 0.0009045817326132876, + "loss": 0.88396937, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.33837891, + "step": 1164, + "time_per_iteration": 3.6699986457824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096597, + "balance_loss_mlp": 1.06150198, + "epoch": 0.22412466333205078, + "flos": 596052449280.0, + "grad_norm": 0.07687995911666942, + "language_loss": 0.8312459, + "learning_rate": 0.0009043985969369357, + "loss": 0.84221184, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.35131836, + "step": 1165, + "time_per_iteration": 2.8716225624084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099545, + "balance_loss_mlp": 1.06461644, + "epoch": 0.22431704501731436, + "flos": 608136062976.0, + "grad_norm": 0.062241931717823204, + "language_loss": 0.84419966, + "learning_rate": 0.0009042153042591245, + "loss": 0.85519511, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.34960938, + "step": 1166, + "time_per_iteration": 2.8038439750671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094194, + "balance_loss_mlp": 1.05971861, + "epoch": 0.22450942670257792, + "flos": 906203842560.0, + "grad_norm": 0.05754676867835885, + "language_loss": 0.85229421, + "learning_rate": 0.0009040318546510146, + "loss": 0.86323619, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.3449707, + "step": 1167, + "time_per_iteration": 3.166391372680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101578, + "balance_loss_mlp": 1.06672144, + "epoch": 0.22470180838784148, + "flos": 565032222720.0, + "grad_norm": 0.06328547350255756, + "language_loss": 0.84822267, + "learning_rate": 0.0009038482481838275, + "loss": 0.85923845, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.34887695, + "step": 1168, + "time_per_iteration": 2.6582534313201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092575, + "balance_loss_mlp": 1.05726552, + "epoch": 0.22489419007310504, + "flos": 834109443072.0, + "grad_norm": 0.05398415615287821, + "language_loss": 0.8685748, + "learning_rate": 0.0009036644849288455, + "loss": 0.87950051, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.35327148, + "step": 1169, + "time_per_iteration": 3.131391763687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102043, + "balance_loss_mlp": 1.06735337, + "epoch": 0.2250865717583686, + "flos": 580788237312.0, + "grad_norm": 0.06156740204868492, + "language_loss": 0.85189641, + "learning_rate": 0.0009034805649574118, + "loss": 0.86291689, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.34716797, + "step": 1170, + "time_per_iteration": 2.662177801132202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093313, + "balance_loss_mlp": 1.05991113, + "epoch": 0.22527895344363216, + "flos": 600091435008.0, + "grad_norm": 0.07489985201842045, + "language_loss": 0.85256809, + "learning_rate": 0.0009032964883409308, + "loss": 0.86350119, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.33422852, + "step": 1171, + "time_per_iteration": 2.872305393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102332, + "balance_loss_mlp": 0.9971894, + "epoch": 0.22547133512889572, + "flos": 1440009073152.0, + "grad_norm": 0.01784679187957182, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74073857, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.26171875, + "step": 1172, + "time_per_iteration": 4.968618154525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090705, + "balance_loss_mlp": 1.05649197, + "epoch": 0.22566371681415928, + "flos": 490377065472.0, + "grad_norm": 0.05674331384718379, + "language_loss": 0.87210125, + "learning_rate": 0.0009029278654587462, + "loss": 0.88300836, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.3425293, + "step": 1173, + "time_per_iteration": 2.5812408924102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085168, + "balance_loss_mlp": 1.05043077, + "epoch": 0.22585609849942284, + "flos": 604334214144.0, + "grad_norm": 0.06970392839419266, + "language_loss": 0.82089472, + "learning_rate": 0.0009027433193361548, + "loss": 0.83174634, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.34765625, + "step": 1174, + "time_per_iteration": 2.7284860610961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090288, + "balance_loss_mlp": 1.0550499, + "epoch": 0.22604848018468643, + "flos": 635280247296.0, + "grad_norm": 0.05615396633220104, + "language_loss": 0.86867499, + "learning_rate": 0.00090255861685474, + "loss": 0.87957788, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.3527832, + "step": 1175, + "time_per_iteration": 2.7265548706054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085885, + "balance_loss_mlp": 1.05040812, + "epoch": 0.22624086186995, + "flos": 479633467392.0, + "grad_norm": 0.06159717434172949, + "language_loss": 0.91109395, + "learning_rate": 0.0009023737580862095, + "loss": 0.92195278, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.35473633, + "step": 1176, + "time_per_iteration": 2.5320050716400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089039, + "balance_loss_mlp": 1.05468273, + "epoch": 0.22643324355521355, + "flos": 495566982144.0, + "grad_norm": 0.05820331342721636, + "language_loss": 0.82901466, + "learning_rate": 0.0009021887431023321, + "loss": 0.83990508, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.34399414, + "step": 1177, + "time_per_iteration": 2.619271755218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094278, + "balance_loss_mlp": 1.05939722, + "epoch": 0.2266256252404771, + "flos": 561271071744.0, + "grad_norm": 0.05650773027793175, + "language_loss": 0.86773884, + "learning_rate": 0.0009020035719749369, + "loss": 0.8786816, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.34912109, + "step": 1178, + "time_per_iteration": 2.7209300994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010885, + "balance_loss_mlp": 1.05536032, + "epoch": 0.22681800692574067, + "flos": 579353527296.0, + "grad_norm": 0.07505314575513819, + "language_loss": 0.77450001, + "learning_rate": 0.0009018182447759136, + "loss": 0.78538495, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.33154297, + "step": 1179, + "time_per_iteration": 2.957627534866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092549, + "balance_loss_mlp": 1.05793107, + "epoch": 0.22701038861100423, + "flos": 739842577920.0, + "grad_norm": 0.0724719412784609, + "language_loss": 0.79327267, + "learning_rate": 0.0009016327615772126, + "loss": 0.80419827, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.34619141, + "step": 1180, + "time_per_iteration": 2.9636237621307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098683, + "balance_loss_mlp": 1.06425512, + "epoch": 0.2272027702962678, + "flos": 576996562944.0, + "grad_norm": 0.06868963719018656, + "language_loss": 0.87725425, + "learning_rate": 0.0009014471224508451, + "loss": 0.88824105, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.34448242, + "step": 1181, + "time_per_iteration": 2.6756978034973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101065, + "balance_loss_mlp": 1.06725717, + "epoch": 0.22739515198153135, + "flos": 544012098048.0, + "grad_norm": 0.08625014316755293, + "language_loss": 0.8279528, + "learning_rate": 0.0009012613274688823, + "loss": 0.83896345, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.33837891, + "step": 1182, + "time_per_iteration": 2.679690361022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106597, + "balance_loss_mlp": 1.0716213, + "epoch": 0.22758753366679493, + "flos": 439932805632.0, + "grad_norm": 0.07160666852762332, + "language_loss": 0.87420428, + "learning_rate": 0.0009010753767034565, + "loss": 0.8852703, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.35009766, + "step": 1183, + "time_per_iteration": 2.56422758102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110957, + "balance_loss_mlp": 1.07514668, + "epoch": 0.2277799153520585, + "flos": 729104772096.0, + "grad_norm": 0.07593119142071596, + "language_loss": 0.7905606, + "learning_rate": 0.0009008892702267599, + "loss": 0.80167019, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.35839844, + "step": 1184, + "time_per_iteration": 2.96954345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138099, + "balance_loss_mlp": 1.10255075, + "epoch": 0.22797229703732205, + "flos": 526641053184.0, + "grad_norm": 0.08993468677273868, + "language_loss": 0.88719535, + "learning_rate": 0.0009007030081110457, + "loss": 0.89857626, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.35571289, + "step": 1185, + "time_per_iteration": 2.639239549636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124853, + "balance_loss_mlp": 1.08923352, + "epoch": 0.2281646787225856, + "flos": 535159954944.0, + "grad_norm": 0.08461110053036625, + "language_loss": 0.84618473, + "learning_rate": 0.000900516590428627, + "loss": 0.85743326, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.35668945, + "step": 1186, + "time_per_iteration": 2.6506764888763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120731, + "balance_loss_mlp": 1.08637488, + "epoch": 0.22835706040784917, + "flos": 541107924480.0, + "grad_norm": 0.07299458038970587, + "language_loss": 0.89267749, + "learning_rate": 0.0009003300172518778, + "loss": 0.90388483, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.34399414, + "step": 1187, + "time_per_iteration": 2.6919267177581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107415, + "balance_loss_mlp": 1.07291603, + "epoch": 0.22854944209311273, + "flos": 790297012224.0, + "grad_norm": 0.06786881834878318, + "language_loss": 0.83963048, + "learning_rate": 0.0009001432886532321, + "loss": 0.85070467, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.34521484, + "step": 1188, + "time_per_iteration": 2.9668681621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103209, + "balance_loss_mlp": 1.07002091, + "epoch": 0.2287418237783763, + "flos": 469047020544.0, + "grad_norm": 0.06096375157572686, + "language_loss": 0.86560941, + "learning_rate": 0.0008999564047051843, + "loss": 0.87664151, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.33203125, + "step": 1189, + "time_per_iteration": 2.520157814025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103848, + "balance_loss_mlp": 1.07070816, + "epoch": 0.22893420546363985, + "flos": 467786990592.0, + "grad_norm": 0.07257222459915597, + "language_loss": 0.84934878, + "learning_rate": 0.0008997693654802894, + "loss": 0.86038733, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.33154297, + "step": 1190, + "time_per_iteration": 2.6376004219055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117207, + "balance_loss_mlp": 1.08375657, + "epoch": 0.22912658714890344, + "flos": 625974179328.0, + "grad_norm": 0.056681488577390256, + "language_loss": 0.86392069, + "learning_rate": 0.0008995821710511625, + "loss": 0.87509274, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.3347168, + "step": 1191, + "time_per_iteration": 2.727444887161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116667, + "balance_loss_mlp": 1.08369398, + "epoch": 0.229318968834167, + "flos": 502785156096.0, + "grad_norm": 0.06323137320540088, + "language_loss": 0.85004956, + "learning_rate": 0.0008993948214904786, + "loss": 0.86121625, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.32983398, + "step": 1192, + "time_per_iteration": 2.5774295330047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086531, + "balance_loss_mlp": 1.06097257, + "epoch": 0.22951135051943056, + "flos": 1374108544512.0, + "grad_norm": 0.030992800338245956, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79508746, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.25585938, + "step": 1193, + "time_per_iteration": 4.854384422302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122219, + "balance_loss_mlp": 1.08934152, + "epoch": 0.22970373220469412, + "flos": 644045050368.0, + "grad_norm": 0.06852039575110529, + "language_loss": 0.7808823, + "learning_rate": 0.0008990196572654427, + "loss": 0.79210448, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.32861328, + "step": 1194, + "time_per_iteration": 2.873081684112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112553, + "balance_loss_mlp": 1.07943714, + "epoch": 0.22989611388995768, + "flos": 499945001472.0, + "grad_norm": 0.05701230798072306, + "language_loss": 0.87415946, + "learning_rate": 0.0008988318427467426, + "loss": 0.88528502, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.33105469, + "step": 1195, + "time_per_iteration": 2.702685832977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109796, + "balance_loss_mlp": 1.06522477, + "epoch": 0.23008849557522124, + "flos": 1096071796224.0, + "grad_norm": 0.06940657308766013, + "language_loss": 0.85968834, + "learning_rate": 0.0008986438733877887, + "loss": 0.87066793, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.32739258, + "step": 1196, + "time_per_iteration": 3.4571969509124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096888, + "balance_loss_mlp": 1.06482017, + "epoch": 0.2302808772604848, + "flos": 683313546240.0, + "grad_norm": 0.04726997036122248, + "language_loss": 0.83756924, + "learning_rate": 0.0008984557492615576, + "loss": 0.8485381, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.32055664, + "step": 1197, + "time_per_iteration": 2.9306819438934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090156, + "balance_loss_mlp": 1.05718327, + "epoch": 0.23047325894574835, + "flos": 528664928256.0, + "grad_norm": 0.05994921168989351, + "language_loss": 0.89349306, + "learning_rate": 0.0008982674704410854, + "loss": 0.90439463, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.32983398, + "step": 1198, + "time_per_iteration": 2.706496238708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089604, + "balance_loss_mlp": 1.05648804, + "epoch": 0.23066564063101191, + "flos": 682427455488.0, + "grad_norm": 0.06548245075345789, + "language_loss": 0.7739616, + "learning_rate": 0.0008980790369994682, + "loss": 0.78485769, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.33129883, + "step": 1199, + "time_per_iteration": 2.962169647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109754, + "balance_loss_mlp": 1.06375623, + "epoch": 0.2308580223162755, + "flos": 558247624704.0, + "grad_norm": 0.06722903582933262, + "language_loss": 0.86851013, + "learning_rate": 0.000897890449009863, + "loss": 0.87948549, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.33813477, + "step": 1200, + "time_per_iteration": 2.6820433139801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092921, + "balance_loss_mlp": 1.05877972, + "epoch": 0.23105040400153906, + "flos": 555406060032.0, + "grad_norm": 0.051980143810921, + "language_loss": 0.89933294, + "learning_rate": 0.0008977017065454853, + "loss": 0.91026211, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.34179688, + "step": 1201, + "time_per_iteration": 2.6699435710906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098988, + "balance_loss_mlp": 1.0640595, + "epoch": 0.23124278568680262, + "flos": 704474855424.0, + "grad_norm": 0.0699249838794834, + "language_loss": 0.80333388, + "learning_rate": 0.0008975128096796121, + "loss": 0.81432372, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.34936523, + "step": 1202, + "time_per_iteration": 2.891552448272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096208, + "balance_loss_mlp": 1.0627346, + "epoch": 0.23143516737206618, + "flos": 612469002240.0, + "grad_norm": 0.08096245126913681, + "language_loss": 0.85447264, + "learning_rate": 0.0008973237584855794, + "loss": 0.86543471, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.33496094, + "step": 1203, + "time_per_iteration": 2.897143840789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109386, + "balance_loss_mlp": 1.06007552, + "epoch": 0.23162754905732974, + "flos": 389030238720.0, + "grad_norm": 0.07003086272099243, + "language_loss": 0.82261837, + "learning_rate": 0.0008971345530367832, + "loss": 0.83355689, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.33789062, + "step": 1204, + "time_per_iteration": 2.4648683071136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090052, + "balance_loss_mlp": 1.05619669, + "epoch": 0.2318199307425933, + "flos": 667481928192.0, + "grad_norm": 0.0706025487590865, + "language_loss": 0.84670615, + "learning_rate": 0.0008969451934066799, + "loss": 0.85760665, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.33862305, + "step": 1205, + "time_per_iteration": 2.7628865242004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096032, + "balance_loss_mlp": 1.06274843, + "epoch": 0.23201231242785686, + "flos": 666093860352.0, + "grad_norm": 0.07866862210425928, + "language_loss": 0.79702371, + "learning_rate": 0.0008967556796687854, + "loss": 0.80798399, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.33276367, + "step": 1206, + "time_per_iteration": 2.8876569271087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099743, + "balance_loss_mlp": 1.06746101, + "epoch": 0.23220469411312042, + "flos": 748498281984.0, + "grad_norm": 0.05955020850576899, + "language_loss": 0.83383894, + "learning_rate": 0.0008965660118966752, + "loss": 0.84483635, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.32275391, + "step": 1207, + "time_per_iteration": 2.8915722370147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092945, + "balance_loss_mlp": 1.06087792, + "epoch": 0.232397075798384, + "flos": 666763163136.0, + "grad_norm": 0.05733195861059391, + "language_loss": 0.89860612, + "learning_rate": 0.0008963761901639851, + "loss": 0.90953553, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.32055664, + "step": 1208, + "time_per_iteration": 2.839872121810913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100551, + "balance_loss_mlp": 1.06843603, + "epoch": 0.23258945748364757, + "flos": 609937357824.0, + "grad_norm": 0.0677808606719883, + "language_loss": 0.83122128, + "learning_rate": 0.0008961862145444103, + "loss": 0.84222686, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.32104492, + "step": 1209, + "time_per_iteration": 2.723395824432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109998, + "balance_loss_mlp": 1.07726288, + "epoch": 0.23278183916891113, + "flos": 489397842432.0, + "grad_norm": 0.06757554355714504, + "language_loss": 0.8539983, + "learning_rate": 0.0008959960851117059, + "loss": 0.86509824, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.32739258, + "step": 1210, + "time_per_iteration": 2.5843160152435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112763, + "balance_loss_mlp": 1.08055305, + "epoch": 0.23297422085417469, + "flos": 511314232320.0, + "grad_norm": 0.06719057665627333, + "language_loss": 0.83744979, + "learning_rate": 0.0008958058019396868, + "loss": 0.84857744, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.32202148, + "step": 1211, + "time_per_iteration": 2.790137529373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110465, + "balance_loss_mlp": 1.07865953, + "epoch": 0.23316660253943824, + "flos": 546145072128.0, + "grad_norm": 0.061561154104104274, + "language_loss": 0.86634141, + "learning_rate": 0.0008956153651022274, + "loss": 0.877446, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.31787109, + "step": 1212, + "time_per_iteration": 2.6943769454956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107151, + "balance_loss_mlp": 1.07506013, + "epoch": 0.2333589842247018, + "flos": 509998947840.0, + "grad_norm": 0.056352889191353187, + "language_loss": 0.84060359, + "learning_rate": 0.0008954247746732618, + "loss": 0.85167515, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.32080078, + "step": 1213, + "time_per_iteration": 2.635540723800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105894, + "balance_loss_mlp": 1.07504261, + "epoch": 0.23355136590996536, + "flos": 662834686464.0, + "grad_norm": 0.059598265922157306, + "language_loss": 0.90450746, + "learning_rate": 0.0008952340307267837, + "loss": 0.91556644, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.30810547, + "step": 1214, + "time_per_iteration": 2.8842196464538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098908, + "balance_loss_mlp": 1.06817579, + "epoch": 0.23374374759522892, + "flos": 508206417408.0, + "grad_norm": 0.059513387141436946, + "language_loss": 0.83485198, + "learning_rate": 0.0008950431333368468, + "loss": 0.84584105, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.30688477, + "step": 1215, + "time_per_iteration": 2.606269121170044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098575, + "balance_loss_mlp": 1.06662679, + "epoch": 0.2339361292804925, + "flos": 1293964028928.0, + "grad_norm": 0.05495395288746111, + "language_loss": 0.84313607, + "learning_rate": 0.0008948520825775634, + "loss": 0.85412186, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.31933594, + "step": 1216, + "time_per_iteration": 3.6454994678497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099032, + "balance_loss_mlp": 1.06782317, + "epoch": 0.23412851096575607, + "flos": 705617021952.0, + "grad_norm": 0.06066187191945671, + "language_loss": 0.83935732, + "learning_rate": 0.0008946608785231067, + "loss": 0.85034764, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.31176758, + "step": 1217, + "time_per_iteration": 2.9157872200012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098088, + "balance_loss_mlp": 1.06599677, + "epoch": 0.23432089265101963, + "flos": 438036968448.0, + "grad_norm": 0.058216777953853424, + "language_loss": 0.84654021, + "learning_rate": 0.0008944695212477084, + "loss": 0.85752106, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.32080078, + "step": 1218, + "time_per_iteration": 2.473067045211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103602, + "balance_loss_mlp": 1.07158232, + "epoch": 0.2345132743362832, + "flos": 480697058304.0, + "grad_norm": 0.06075167680795146, + "language_loss": 0.86133409, + "learning_rate": 0.0008942780108256599, + "loss": 0.87237012, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.32006836, + "step": 1219, + "time_per_iteration": 2.581594705581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100646, + "balance_loss_mlp": 1.06819737, + "epoch": 0.23470565602154675, + "flos": 411231817728.0, + "grad_norm": 0.07971641299609675, + "language_loss": 0.86269408, + "learning_rate": 0.0008940863473313121, + "loss": 0.87370056, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.32446289, + "step": 1220, + "time_per_iteration": 2.453798532485962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108448, + "balance_loss_mlp": 1.0764761, + "epoch": 0.2348980377068103, + "flos": 545189170176.0, + "grad_norm": 0.07248436265958902, + "language_loss": 0.87226778, + "learning_rate": 0.0008938945308390756, + "loss": 0.88335222, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.31958008, + "step": 1221, + "time_per_iteration": 2.6299164295196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092799, + "balance_loss_mlp": 1.06099391, + "epoch": 0.23509041939207387, + "flos": 575465900544.0, + "grad_norm": 0.0746326386118845, + "language_loss": 0.86801684, + "learning_rate": 0.00089370256142342, + "loss": 0.87894481, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.31787109, + "step": 1222, + "time_per_iteration": 2.7373716831207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099884, + "balance_loss_mlp": 1.0675782, + "epoch": 0.23528280107733743, + "flos": 588568025088.0, + "grad_norm": 0.06792905088784162, + "language_loss": 0.84961808, + "learning_rate": 0.0008935104391588746, + "loss": 0.86061692, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.32299805, + "step": 1223, + "time_per_iteration": 2.786801338195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101357, + "balance_loss_mlp": 1.06850326, + "epoch": 0.235475182762601, + "flos": 822948235776.0, + "grad_norm": 0.053660170998325075, + "language_loss": 0.8281433, + "learning_rate": 0.0008933181641200276, + "loss": 0.83915687, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.32861328, + "step": 1224, + "time_per_iteration": 3.1502432823181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102432, + "balance_loss_mlp": 1.06948209, + "epoch": 0.23566756444786457, + "flos": 679865287680.0, + "grad_norm": 0.06465671729424353, + "language_loss": 0.85675979, + "learning_rate": 0.0008931257363815271, + "loss": 0.86778408, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.32958984, + "step": 1225, + "time_per_iteration": 2.9370880126953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110561, + "balance_loss_mlp": 1.07370961, + "epoch": 0.23585994613312813, + "flos": 701481931776.0, + "grad_norm": 0.07282820073226746, + "language_loss": 0.89753437, + "learning_rate": 0.0008929331560180798, + "loss": 0.9085905, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.31884766, + "step": 1226, + "time_per_iteration": 2.977869749069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122954, + "balance_loss_mlp": 1.09045768, + "epoch": 0.2360523278183917, + "flos": 523923144192.0, + "grad_norm": 0.053569811561680475, + "language_loss": 0.90818799, + "learning_rate": 0.0008927404231044525, + "loss": 0.91941756, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.32495117, + "step": 1227, + "time_per_iteration": 2.683979034423828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111641, + "balance_loss_mlp": 1.07909656, + "epoch": 0.23624470950365525, + "flos": 524027860992.0, + "grad_norm": 0.06109587035495086, + "language_loss": 0.81612283, + "learning_rate": 0.0008925475377154703, + "loss": 0.82723922, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.32543945, + "step": 1228, + "time_per_iteration": 2.734614610671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119771, + "balance_loss_mlp": 1.08577275, + "epoch": 0.2364370911889188, + "flos": 596525313024.0, + "grad_norm": 0.06451716518904643, + "language_loss": 0.82344091, + "learning_rate": 0.0008923544999260183, + "loss": 0.83463866, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.34033203, + "step": 1229, + "time_per_iteration": 2.740309000015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108587, + "balance_loss_mlp": 1.07561386, + "epoch": 0.23662947287418237, + "flos": 756519588864.0, + "grad_norm": 0.0665465772726836, + "language_loss": 0.91460836, + "learning_rate": 0.00089216130981104, + "loss": 0.92569423, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.32983398, + "step": 1230, + "time_per_iteration": 3.1343088150024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103991, + "balance_loss_mlp": 1.07120848, + "epoch": 0.23682185455944593, + "flos": 545907935232.0, + "grad_norm": 0.061759964990198334, + "language_loss": 0.81970417, + "learning_rate": 0.000891967967445539, + "loss": 0.83074409, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.32788086, + "step": 1231, + "time_per_iteration": 2.67669677734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100144, + "balance_loss_mlp": 1.06829166, + "epoch": 0.2370142362447095, + "flos": 661977709056.0, + "grad_norm": 0.04660382532121484, + "language_loss": 0.88927996, + "learning_rate": 0.0008917744729045772, + "loss": 0.90028143, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.31835938, + "step": 1232, + "time_per_iteration": 2.87488055229187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098328, + "balance_loss_mlp": 1.06695223, + "epoch": 0.23720661792997308, + "flos": 683361598464.0, + "grad_norm": 0.054845027384176535, + "language_loss": 0.83439517, + "learning_rate": 0.0008915808262632757, + "loss": 0.84537846, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.31347656, + "step": 1233, + "time_per_iteration": 2.884615659713745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111142, + "balance_loss_mlp": 1.0800519, + "epoch": 0.23739899961523664, + "flos": 558631738368.0, + "grad_norm": 0.058607558308664987, + "language_loss": 0.93242431, + "learning_rate": 0.0008913870275968148, + "loss": 0.94353569, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.31054688, + "step": 1234, + "time_per_iteration": 2.7355458736419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110869, + "balance_loss_mlp": 1.07740974, + "epoch": 0.2375913813005002, + "flos": 889144128000.0, + "grad_norm": 0.0661901036623414, + "language_loss": 0.87537754, + "learning_rate": 0.0008911930769804342, + "loss": 0.88646448, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.3125, + "step": 1235, + "time_per_iteration": 3.247985363006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115104, + "balance_loss_mlp": 1.08396649, + "epoch": 0.23778376298576376, + "flos": 640810607616.0, + "grad_norm": 0.053926277509791044, + "language_loss": 0.90842855, + "learning_rate": 0.0008909989744894318, + "loss": 0.91957957, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.31103516, + "step": 1236, + "time_per_iteration": 2.8457424640655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116546, + "balance_loss_mlp": 1.08598089, + "epoch": 0.23797614467102732, + "flos": 616540073472.0, + "grad_norm": 0.07410834458794652, + "language_loss": 0.81166267, + "learning_rate": 0.0008908047201991649, + "loss": 0.82282805, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.30517578, + "step": 1237, + "time_per_iteration": 2.743232011795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103348, + "balance_loss_mlp": 1.07218719, + "epoch": 0.23816852635629088, + "flos": 623941539840.0, + "grad_norm": 0.0897055957170317, + "language_loss": 0.8615526, + "learning_rate": 0.0008906103141850502, + "loss": 0.87258613, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.3112793, + "step": 1238, + "time_per_iteration": 2.8931751251220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103231, + "balance_loss_mlp": 1.07164085, + "epoch": 0.23836090804155444, + "flos": 521180504064.0, + "grad_norm": 0.0595559706342315, + "language_loss": 0.87583494, + "learning_rate": 0.0008904157565225621, + "loss": 0.88686728, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.31567383, + "step": 1239, + "time_per_iteration": 2.681567430496216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096601, + "balance_loss_mlp": 1.06546402, + "epoch": 0.238553289726818, + "flos": 1153527616512.0, + "grad_norm": 0.07926394914951292, + "language_loss": 0.81636947, + "learning_rate": 0.000890221047287235, + "loss": 0.82733548, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.31103516, + "step": 1240, + "time_per_iteration": 3.5042829513549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096214, + "balance_loss_mlp": 1.06450391, + "epoch": 0.23874567141208156, + "flos": 499600175616.0, + "grad_norm": 0.06383986480013222, + "language_loss": 0.90398014, + "learning_rate": 0.0008900261865546615, + "loss": 0.91494226, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.31689453, + "step": 1241, + "time_per_iteration": 2.656243324279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098327, + "balance_loss_mlp": 1.06533027, + "epoch": 0.23893805309734514, + "flos": 556657325568.0, + "grad_norm": 0.07463092576288201, + "language_loss": 0.84907639, + "learning_rate": 0.0008898311744004936, + "loss": 0.86005968, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.33007812, + "step": 1242, + "time_per_iteration": 2.7337045669555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089408, + "balance_loss_mlp": 1.05583906, + "epoch": 0.2391304347826087, + "flos": 549009957888.0, + "grad_norm": 0.057670085451747476, + "language_loss": 0.86718595, + "learning_rate": 0.0008896360109004414, + "loss": 0.87808001, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.3359375, + "step": 1243, + "time_per_iteration": 2.6334750652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090579, + "balance_loss_mlp": 1.05667567, + "epoch": 0.23932281646787226, + "flos": 515794148352.0, + "grad_norm": 0.055695642571784755, + "language_loss": 0.84363699, + "learning_rate": 0.0008894406961302742, + "loss": 0.85454273, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.33935547, + "step": 1244, + "time_per_iteration": 2.612278699874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092282, + "balance_loss_mlp": 1.05840266, + "epoch": 0.23951519815313582, + "flos": 743353445376.0, + "grad_norm": 0.053835846346086756, + "language_loss": 0.83682489, + "learning_rate": 0.0008892452301658201, + "loss": 0.84774774, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.33911133, + "step": 1245, + "time_per_iteration": 2.999476432800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095498, + "balance_loss_mlp": 1.06169045, + "epoch": 0.23970757983839938, + "flos": 553855048704.0, + "grad_norm": 0.07830491582761978, + "language_loss": 0.83242297, + "learning_rate": 0.0008890496130829653, + "loss": 0.84337801, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.33837891, + "step": 1246, + "time_per_iteration": 2.6750991344451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093391, + "balance_loss_mlp": 1.05913019, + "epoch": 0.23989996152366294, + "flos": 480416251392.0, + "grad_norm": 0.06104300334873528, + "language_loss": 0.85340333, + "learning_rate": 0.0008888538449576555, + "loss": 0.86433721, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.34301758, + "step": 1247, + "time_per_iteration": 2.5646800994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095388, + "balance_loss_mlp": 1.06131816, + "epoch": 0.2400923432089265, + "flos": 485069285376.0, + "grad_norm": 0.05789610317969602, + "language_loss": 0.82348001, + "learning_rate": 0.0008886579258658944, + "loss": 0.83443391, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.34082031, + "step": 1248, + "time_per_iteration": 2.562016487121582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087072, + "balance_loss_mlp": 1.05283499, + "epoch": 0.24028472489419006, + "flos": 623247505920.0, + "grad_norm": 0.05381401206887855, + "language_loss": 0.84731787, + "learning_rate": 0.0008884618558837446, + "loss": 0.85818857, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.34277344, + "step": 1249, + "time_per_iteration": 2.8163750171661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093014, + "balance_loss_mlp": 1.05927801, + "epoch": 0.24047710657945365, + "flos": 601302002688.0, + "grad_norm": 0.06053052424994898, + "language_loss": 0.86413568, + "learning_rate": 0.0008882656350873273, + "loss": 0.8750658, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.33764648, + "step": 1250, + "time_per_iteration": 2.844723701477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088368, + "balance_loss_mlp": 1.05546594, + "epoch": 0.2406694882647172, + "flos": 841199579136.0, + "grad_norm": 0.06849099956300345, + "language_loss": 0.87088066, + "learning_rate": 0.0008880692635528219, + "loss": 0.88176429, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.32910156, + "step": 1251, + "time_per_iteration": 3.0528526306152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108089, + "balance_loss_mlp": 1.048823, + "epoch": 0.24086186994998077, + "flos": 526789440000.0, + "grad_norm": 0.06290905233547327, + "language_loss": 0.88876319, + "learning_rate": 0.0008878727413564669, + "loss": 0.89957213, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.32055664, + "step": 1252, + "time_per_iteration": 2.758507251739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077691, + "balance_loss_mlp": 1.05194211, + "epoch": 0.24105425163524433, + "flos": 1337464673280.0, + "grad_norm": 0.04466256972049361, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81213295, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.2578125, + "step": 1253, + "time_per_iteration": 4.847649097442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088304, + "balance_loss_mlp": 1.05616474, + "epoch": 0.24124663332050789, + "flos": 613822164480.0, + "grad_norm": 0.059681429897919615, + "language_loss": 0.78408957, + "learning_rate": 0.0008874792452834528, + "loss": 0.79497254, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.32128906, + "step": 1254, + "time_per_iteration": 2.754746198654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091821, + "balance_loss_mlp": 1.06061172, + "epoch": 0.24143901500577145, + "flos": 575278225920.0, + "grad_norm": 0.07362958371245172, + "language_loss": 0.87187612, + "learning_rate": 0.0008872822715595626, + "loss": 0.88279426, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.31176758, + "step": 1255, + "time_per_iteration": 2.662929058074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109326, + "balance_loss_mlp": 1.06200314, + "epoch": 0.241631396691035, + "flos": 494941349376.0, + "grad_norm": 0.08064600620778418, + "language_loss": 0.86789644, + "learning_rate": 0.0008870851474793598, + "loss": 0.87882906, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.31225586, + "step": 1256, + "time_per_iteration": 2.550830841064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096341, + "balance_loss_mlp": 1.06434524, + "epoch": 0.24182377837629856, + "flos": 635891323392.0, + "grad_norm": 0.05836545436632832, + "language_loss": 0.89218223, + "learning_rate": 0.0008868878731193752, + "loss": 0.90314561, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.31982422, + "step": 1257, + "time_per_iteration": 2.850184440612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095001, + "balance_loss_mlp": 1.06400657, + "epoch": 0.24201616006156215, + "flos": 514938580992.0, + "grad_norm": 0.05536217997614851, + "language_loss": 0.89056414, + "learning_rate": 0.0008866904485561973, + "loss": 0.90151417, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.30957031, + "step": 1258, + "time_per_iteration": 2.7176461219787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107248, + "balance_loss_mlp": 1.0765636, + "epoch": 0.2422085417468257, + "flos": 614837703168.0, + "grad_norm": 0.0620425495695956, + "language_loss": 0.82697642, + "learning_rate": 0.000886492873866473, + "loss": 0.83804893, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.30639648, + "step": 1259, + "time_per_iteration": 2.881246328353882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106717, + "balance_loss_mlp": 1.07631803, + "epoch": 0.24240092343208927, + "flos": 585515464704.0, + "grad_norm": 0.0764912621319216, + "language_loss": 0.84458697, + "learning_rate": 0.000886295149126908, + "loss": 0.85565412, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.3034668, + "step": 1260, + "time_per_iteration": 2.711789846420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102095, + "balance_loss_mlp": 1.07148254, + "epoch": 0.24259330511735283, + "flos": 761930675712.0, + "grad_norm": 0.05050860424869067, + "language_loss": 0.85437667, + "learning_rate": 0.0008860972744142655, + "loss": 0.86539763, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.30566406, + "step": 1261, + "time_per_iteration": 2.924192190170288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101503, + "balance_loss_mlp": 1.07146263, + "epoch": 0.2427856868026164, + "flos": 626566316544.0, + "grad_norm": 0.05198228858732316, + "language_loss": 0.81767958, + "learning_rate": 0.0008858992498053671, + "loss": 0.82869458, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.30004883, + "step": 1262, + "time_per_iteration": 2.8300395011901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069733, + "balance_loss_mlp": 1.04455626, + "epoch": 0.24297806848787995, + "flos": 1510840470528.0, + "grad_norm": 0.04093384265265131, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77658486, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.25195312, + "step": 1263, + "time_per_iteration": 4.837641716003418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103166, + "balance_loss_mlp": 1.07217157, + "epoch": 0.2431704501731435, + "flos": 541669538304.0, + "grad_norm": 0.05948216339756903, + "language_loss": 0.83247912, + "learning_rate": 0.0008855027512063817, + "loss": 0.84351087, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.30957031, + "step": 1264, + "time_per_iteration": 2.7277276515960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102812, + "balance_loss_mlp": 1.07191277, + "epoch": 0.24336283185840707, + "flos": 523588492800.0, + "grad_norm": 0.06194442365761257, + "language_loss": 0.8589493, + "learning_rate": 0.0008853042773702292, + "loss": 0.86997747, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.30859375, + "step": 1265, + "time_per_iteration": 2.7305567264556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103281, + "balance_loss_mlp": 1.07197642, + "epoch": 0.24355521354367063, + "flos": 536839004160.0, + "grad_norm": 0.0568893751116151, + "language_loss": 0.87145638, + "learning_rate": 0.0008851056539456896, + "loss": 0.88248914, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.31274414, + "step": 1266, + "time_per_iteration": 2.6886072158813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109953, + "balance_loss_mlp": 1.06767774, + "epoch": 0.24374759522893422, + "flos": 930050975232.0, + "grad_norm": 0.06669847345827673, + "language_loss": 0.81623918, + "learning_rate": 0.0008849068810098755, + "loss": 0.82723451, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.31835938, + "step": 1267, + "time_per_iteration": 3.302135705947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092019, + "balance_loss_mlp": 1.06049967, + "epoch": 0.24393997691419778, + "flos": 427564002816.0, + "grad_norm": 0.06302829877877653, + "language_loss": 0.82764143, + "learning_rate": 0.0008847079586399575, + "loss": 0.83856159, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.31494141, + "step": 1268, + "time_per_iteration": 2.469602584838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088761, + "balance_loss_mlp": 1.05755162, + "epoch": 0.24413235859946134, + "flos": 578582479872.0, + "grad_norm": 0.062034835544456234, + "language_loss": 0.85665154, + "learning_rate": 0.0008845088869131641, + "loss": 0.86753917, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.31176758, + "step": 1269, + "time_per_iteration": 2.6822941303253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090407, + "balance_loss_mlp": 1.05864954, + "epoch": 0.2443247402847249, + "flos": 529600481280.0, + "grad_norm": 0.06778965234687388, + "language_loss": 0.88905638, + "learning_rate": 0.0008843096659067818, + "loss": 0.8999604, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.31738281, + "step": 1270, + "time_per_iteration": 2.594064235687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087142, + "balance_loss_mlp": 1.05555153, + "epoch": 0.24451712196998845, + "flos": 695996651520.0, + "grad_norm": 0.05697237066827103, + "language_loss": 0.85987377, + "learning_rate": 0.000884110295698155, + "loss": 0.87074518, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.31567383, + "step": 1271, + "time_per_iteration": 2.974696636199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083849, + "balance_loss_mlp": 1.0512805, + "epoch": 0.24470950365525201, + "flos": 529575750144.0, + "grad_norm": 0.06068289501227115, + "language_loss": 0.85902673, + "learning_rate": 0.0008839107763646861, + "loss": 0.86986518, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.32568359, + "step": 1272, + "time_per_iteration": 2.607771158218384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085636, + "balance_loss_mlp": 1.0507555, + "epoch": 0.24490188534051557, + "flos": 491091448320.0, + "grad_norm": 0.061464799303267155, + "language_loss": 0.9008882, + "learning_rate": 0.0008837111079838353, + "loss": 0.91174459, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.34912109, + "step": 1273, + "time_per_iteration": 2.708512306213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107918, + "balance_loss_mlp": 1.0463264, + "epoch": 0.24509426702577913, + "flos": 473916842496.0, + "grad_norm": 0.06335862765515422, + "language_loss": 0.89847112, + "learning_rate": 0.000883511290633121, + "loss": 0.9092629, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.32861328, + "step": 1274, + "time_per_iteration": 2.5415730476379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077904, + "balance_loss_mlp": 1.04423904, + "epoch": 0.24528664871104272, + "flos": 550329624576.0, + "grad_norm": 0.04937694398035677, + "language_loss": 0.92408085, + "learning_rate": 0.000883311324390119, + "loss": 0.93485993, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.33691406, + "step": 1275, + "time_per_iteration": 2.734423875808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108035, + "balance_loss_mlp": 1.0457077, + "epoch": 0.24547903039630628, + "flos": 825546871296.0, + "grad_norm": 0.07292672859625873, + "language_loss": 0.80929816, + "learning_rate": 0.0008831112093324629, + "loss": 0.82010162, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.34667969, + "step": 1276, + "time_per_iteration": 3.0507287979125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076455, + "balance_loss_mlp": 1.04209912, + "epoch": 0.24567141208156984, + "flos": 591325221888.0, + "grad_norm": 0.0707858001482728, + "language_loss": 0.88982868, + "learning_rate": 0.0008829109455378444, + "loss": 0.90059322, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.34375, + "step": 1277, + "time_per_iteration": 2.6684513092041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076963, + "balance_loss_mlp": 1.04284549, + "epoch": 0.2458637937668334, + "flos": 547611715584.0, + "grad_norm": 0.05561589900472309, + "language_loss": 0.86233819, + "learning_rate": 0.000882710533084013, + "loss": 0.87310779, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.34155273, + "step": 1278, + "time_per_iteration": 2.623353958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074564, + "balance_loss_mlp": 1.04013681, + "epoch": 0.24605617545209696, + "flos": 515641379328.0, + "grad_norm": 0.04936271772538766, + "language_loss": 0.89139968, + "learning_rate": 0.0008825099720487755, + "loss": 0.90214527, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.34448242, + "step": 1279, + "time_per_iteration": 2.6549813747406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069233, + "balance_loss_mlp": 1.04853857, + "epoch": 0.24624855713736052, + "flos": 1510953951744.0, + "grad_norm": 0.028817901818472227, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76330376, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.20703125, + "step": 1280, + "time_per_iteration": 4.85357141494751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066002, + "balance_loss_mlp": 1.04521215, + "epoch": 0.24644093882262408, + "flos": 1526826419712.0, + "grad_norm": 0.026145975527968417, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79010111, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.20800781, + "step": 1281, + "time_per_iteration": 4.780989408493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083115, + "balance_loss_mlp": 1.04983163, + "epoch": 0.24663332050788764, + "flos": 658811667456.0, + "grad_norm": 0.06975718656823436, + "language_loss": 0.89050984, + "learning_rate": 0.0008819073982335619, + "loss": 0.90134096, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.33300781, + "step": 1282, + "time_per_iteration": 2.8345205783843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085728, + "balance_loss_mlp": 1.05361331, + "epoch": 0.24682570219315123, + "flos": 541510977024.0, + "grad_norm": 0.062337694406813374, + "language_loss": 0.84269708, + "learning_rate": 0.0008817062436519235, + "loss": 0.85355437, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.32104492, + "step": 1283, + "time_per_iteration": 2.6846866607666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089504, + "balance_loss_mlp": 1.05612516, + "epoch": 0.24701808387841478, + "flos": 440455131648.0, + "grad_norm": 0.06365108043104846, + "language_loss": 0.89943874, + "learning_rate": 0.0008815049408787788, + "loss": 0.91033375, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.33398438, + "step": 1284, + "time_per_iteration": 2.5116872787475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081689, + "balance_loss_mlp": 1.04916823, + "epoch": 0.24721046556367834, + "flos": 467826278400.0, + "grad_norm": 0.059551230096427064, + "language_loss": 0.85302055, + "learning_rate": 0.0008813034899922805, + "loss": 0.86383736, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.32519531, + "step": 1285, + "time_per_iteration": 2.5286993980407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080955, + "balance_loss_mlp": 1.04931688, + "epoch": 0.2474028472489419, + "flos": 504183398400.0, + "grad_norm": 0.06660544793665324, + "language_loss": 0.89506048, + "learning_rate": 0.0008811018910706387, + "loss": 0.90586996, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.31616211, + "step": 1286, + "time_per_iteration": 2.552616834640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079345, + "balance_loss_mlp": 1.04756403, + "epoch": 0.24759522893420546, + "flos": 479707660800.0, + "grad_norm": 0.07038813341767636, + "language_loss": 0.81879961, + "learning_rate": 0.0008809001441921211, + "loss": 0.82959306, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.31762695, + "step": 1287, + "time_per_iteration": 2.704249143600464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082412, + "balance_loss_mlp": 1.05132163, + "epoch": 0.24778761061946902, + "flos": 533446000128.0, + "grad_norm": 0.054805193397824324, + "language_loss": 0.85345185, + "learning_rate": 0.0008806982494350528, + "loss": 0.86427593, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.31054688, + "step": 1288, + "time_per_iteration": 2.65993070602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084976, + "balance_loss_mlp": 1.05359983, + "epoch": 0.24797999230473258, + "flos": 559513446912.0, + "grad_norm": 0.05430799794632807, + "language_loss": 0.90285796, + "learning_rate": 0.0008804962068778161, + "loss": 0.91370773, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.31347656, + "step": 1289, + "time_per_iteration": 2.8633711338043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086217, + "balance_loss_mlp": 1.05515075, + "epoch": 0.24817237398999614, + "flos": 623912426496.0, + "grad_norm": 0.06485439157304855, + "language_loss": 0.81069577, + "learning_rate": 0.0008802940165988511, + "loss": 0.82155788, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.31030273, + "step": 1290, + "time_per_iteration": 2.877063274383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084315, + "balance_loss_mlp": 1.05341625, + "epoch": 0.2483647556752597, + "flos": 611981581824.0, + "grad_norm": 0.058113292585204916, + "language_loss": 0.88358063, + "learning_rate": 0.000880091678676655, + "loss": 0.89442384, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.30859375, + "step": 1291, + "time_per_iteration": 2.800182342529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088307, + "balance_loss_mlp": 1.05814719, + "epoch": 0.2485571373605233, + "flos": 583270419456.0, + "grad_norm": 0.05744202885681841, + "language_loss": 0.88709044, + "learning_rate": 0.0008798891931897821, + "loss": 0.89797354, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.30126953, + "step": 1292, + "time_per_iteration": 2.8186981678009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090758, + "balance_loss_mlp": 1.06009781, + "epoch": 0.24874951904578685, + "flos": 494503391232.0, + "grad_norm": 0.06335011869227863, + "language_loss": 0.84085584, + "learning_rate": 0.0008796865602168447, + "loss": 0.85176343, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.30615234, + "step": 1293, + "time_per_iteration": 2.5642354488372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093183, + "balance_loss_mlp": 1.06218874, + "epoch": 0.2489419007310504, + "flos": 455925957120.0, + "grad_norm": 0.055204532335327836, + "language_loss": 0.88449144, + "learning_rate": 0.0008794837798365115, + "loss": 0.89542329, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.30957031, + "step": 1294, + "time_per_iteration": 2.640967607498169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103465, + "balance_loss_mlp": 1.07256651, + "epoch": 0.24913428241631397, + "flos": 485198733312.0, + "grad_norm": 0.05342912575045942, + "language_loss": 0.88282919, + "learning_rate": 0.0008792808521275089, + "loss": 0.8938638, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.30859375, + "step": 1295, + "time_per_iteration": 2.743216037750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106969, + "balance_loss_mlp": 1.07638037, + "epoch": 0.24932666410157753, + "flos": 518654651904.0, + "grad_norm": 0.05542201073335728, + "language_loss": 0.87427896, + "learning_rate": 0.0008790777771686206, + "loss": 0.88534868, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.30541992, + "step": 1296, + "time_per_iteration": 2.5764553546905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109723, + "balance_loss_mlp": 1.07934809, + "epoch": 0.2495190457868411, + "flos": 472365831168.0, + "grad_norm": 0.061211557913471215, + "language_loss": 0.85332036, + "learning_rate": 0.0008788745550386872, + "loss": 0.86441755, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.30322266, + "step": 1297, + "time_per_iteration": 2.635064125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111335, + "balance_loss_mlp": 1.08226037, + "epoch": 0.24971142747210465, + "flos": 745559202816.0, + "grad_norm": 0.055423812451341224, + "language_loss": 0.79893327, + "learning_rate": 0.0008786711858166063, + "loss": 0.81006682, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.31054688, + "step": 1298, + "time_per_iteration": 3.002070903778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113917, + "balance_loss_mlp": 1.08387578, + "epoch": 0.2499038091573682, + "flos": 749222839296.0, + "grad_norm": 0.06342841372026603, + "language_loss": 0.8358891, + "learning_rate": 0.0008784676695813332, + "loss": 0.84702826, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.29980469, + "step": 1299, + "time_per_iteration": 2.941793918609619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116177, + "balance_loss_mlp": 1.08573055, + "epoch": 0.2500961908426318, + "flos": 744741513216.0, + "grad_norm": 0.05313888632052142, + "language_loss": 0.84205985, + "learning_rate": 0.0008782640064118796, + "loss": 0.85322165, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.30395508, + "step": 1300, + "time_per_iteration": 2.9038445949554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113921, + "balance_loss_mlp": 1.11441469, + "epoch": 0.2502885725278953, + "flos": 1416652180992.0, + "grad_norm": 0.03742785755303804, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77323961, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.24804688, + "step": 1301, + "time_per_iteration": 4.97193169593811 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108076, + "balance_loss_mlp": 1.0781548, + "epoch": 0.2504809542131589, + "flos": 514961902080.0, + "grad_norm": 0.06725713094725487, + "language_loss": 0.86707664, + "learning_rate": 0.0008778562395867648, + "loss": 0.87815738, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.29882812, + "step": 1302, + "time_per_iteration": 2.6434335708618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109494, + "balance_loss_mlp": 1.064852, + "epoch": 0.25067333589842244, + "flos": 525562905600.0, + "grad_norm": 0.0573305289073435, + "language_loss": 0.83713615, + "learning_rate": 0.0008776521360894127, + "loss": 0.84808552, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.30029297, + "step": 1303, + "time_per_iteration": 2.664281129837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087429, + "balance_loss_mlp": 1.06206167, + "epoch": 0.25086571758368603, + "flos": 1473085108224.0, + "grad_norm": 0.030879512397293623, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80049491, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.25390625, + "step": 1304, + "time_per_iteration": 4.7838218212127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096771, + "balance_loss_mlp": 1.06682515, + "epoch": 0.2510580992689496, + "flos": 528128045568.0, + "grad_norm": 0.05889583885024225, + "language_loss": 0.90380585, + "learning_rate": 0.0008772434893213186, + "loss": 0.91477358, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.29882812, + "step": 1305, + "time_per_iteration": 2.619591236114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092992, + "balance_loss_mlp": 1.06228364, + "epoch": 0.25125048095421315, + "flos": 517192390656.0, + "grad_norm": 0.05643683756415757, + "language_loss": 0.84055364, + "learning_rate": 0.0008770389462092276, + "loss": 0.85148358, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.30664062, + "step": 1306, + "time_per_iteration": 2.646378517150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090271, + "balance_loss_mlp": 1.05860949, + "epoch": 0.25144286263947674, + "flos": 620160039936.0, + "grad_norm": 0.07421628365380602, + "language_loss": 0.86343837, + "learning_rate": 0.0008768342567176357, + "loss": 0.87434107, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.31640625, + "step": 1307, + "time_per_iteration": 2.807349681854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089217, + "balance_loss_mlp": 1.0562675, + "epoch": 0.25163524432474027, + "flos": 503534444544.0, + "grad_norm": 0.06024308313144323, + "language_loss": 0.90521109, + "learning_rate": 0.0008766294209260107, + "loss": 0.91610324, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.32958984, + "step": 1308, + "time_per_iteration": 2.652209758758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087002, + "balance_loss_mlp": 1.05510211, + "epoch": 0.25182762601000386, + "flos": 508821875712.0, + "grad_norm": 0.07044022402077256, + "language_loss": 0.90948963, + "learning_rate": 0.0008764244389138767, + "loss": 0.92035961, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.31884766, + "step": 1309, + "time_per_iteration": 2.583214044570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086884, + "balance_loss_mlp": 1.05386305, + "epoch": 0.2520200076952674, + "flos": 633596815872.0, + "grad_norm": 0.07007920023055086, + "language_loss": 0.82157373, + "learning_rate": 0.000876219310760815, + "loss": 0.83244258, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.33032227, + "step": 1310, + "time_per_iteration": 2.8652145862579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010922, + "balance_loss_mlp": 1.05956042, + "epoch": 0.252212389380531, + "flos": 494385527808.0, + "grad_norm": 0.05921747328918915, + "language_loss": 0.81032491, + "learning_rate": 0.0008760140365464631, + "loss": 0.82124686, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.32641602, + "step": 1311, + "time_per_iteration": 2.620199203491211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090848, + "balance_loss_mlp": 1.05799365, + "epoch": 0.2524047710657945, + "flos": 490298489856.0, + "grad_norm": 0.06933033432447253, + "language_loss": 0.87204492, + "learning_rate": 0.0008758086163505156, + "loss": 0.88295335, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.32861328, + "step": 1312, + "time_per_iteration": 2.5809056758880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085438, + "balance_loss_mlp": 1.05253649, + "epoch": 0.2525971527510581, + "flos": 647136898560.0, + "grad_norm": 0.05785086559723577, + "language_loss": 0.89221275, + "learning_rate": 0.0008756030502527239, + "loss": 0.90306717, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.32910156, + "step": 1313, + "time_per_iteration": 2.8305885791778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084056, + "balance_loss_mlp": 1.05201209, + "epoch": 0.2527895344363217, + "flos": 568991222784.0, + "grad_norm": 0.05540107069612798, + "language_loss": 0.90540659, + "learning_rate": 0.0008753973383328954, + "loss": 0.91624713, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.3203125, + "step": 1314, + "time_per_iteration": 2.8095338344573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084266, + "balance_loss_mlp": 1.0518887, + "epoch": 0.2529819161215852, + "flos": 513795004416.0, + "grad_norm": 0.06960735937341114, + "language_loss": 0.83534479, + "learning_rate": 0.0008751914806708952, + "loss": 0.84618747, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.32373047, + "step": 1315, + "time_per_iteration": 2.6356046199798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084811, + "balance_loss_mlp": 1.05357838, + "epoch": 0.2531742978068488, + "flos": 530979784704.0, + "grad_norm": 0.05966295966929829, + "language_loss": 0.82178831, + "learning_rate": 0.0008749854773466439, + "loss": 0.83263648, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.31201172, + "step": 1316, + "time_per_iteration": 2.673590898513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083614, + "balance_loss_mlp": 1.05199969, + "epoch": 0.25336667949211233, + "flos": 596362369536.0, + "grad_norm": 0.060440864571565875, + "language_loss": 0.84378719, + "learning_rate": 0.0008747793284401192, + "loss": 0.85462332, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.31591797, + "step": 1317, + "time_per_iteration": 2.672581195831299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078628, + "balance_loss_mlp": 1.04701352, + "epoch": 0.2535590611773759, + "flos": 601764691968.0, + "grad_norm": 0.06760844062466466, + "language_loss": 0.85858786, + "learning_rate": 0.0008745730340313551, + "loss": 0.8693741, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.31591797, + "step": 1318, + "time_per_iteration": 2.7483184337615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088603, + "balance_loss_mlp": 1.05775118, + "epoch": 0.25375144286263945, + "flos": 495079561728.0, + "grad_norm": 0.06356165501521222, + "language_loss": 0.84280074, + "learning_rate": 0.0008743665942004422, + "loss": 0.85368681, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.30834961, + "step": 1319, + "time_per_iteration": 2.659477472305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094218, + "balance_loss_mlp": 1.06362879, + "epoch": 0.25394382454790304, + "flos": 512219261952.0, + "grad_norm": 0.06511177952096096, + "language_loss": 0.92719352, + "learning_rate": 0.0008741600090275277, + "loss": 0.93813574, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.30541992, + "step": 1320, + "time_per_iteration": 2.6192221641540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088577, + "balance_loss_mlp": 1.05758274, + "epoch": 0.25413620623316663, + "flos": 958586047488.0, + "grad_norm": 0.06459884228420558, + "language_loss": 0.84290528, + "learning_rate": 0.0008739532785928151, + "loss": 0.853791, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.30957031, + "step": 1321, + "time_per_iteration": 3.438142776489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166929, + "balance_loss_mlp": 1.14528096, + "epoch": 0.25432858791843016, + "flos": 1576445635584.0, + "grad_norm": 0.062216562760273944, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.7606051, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.21679688, + "step": 1322, + "time_per_iteration": 4.881207466125488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109523, + "balance_loss_mlp": 1.06502271, + "epoch": 0.25452096960369375, + "flos": 583530877440.0, + "grad_norm": 0.0660267567978659, + "language_loss": 0.8296389, + "learning_rate": 0.0008735393822590908, + "loss": 0.84059119, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.30151367, + "step": 1323, + "time_per_iteration": 2.7254581451416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097272, + "balance_loss_mlp": 1.06723142, + "epoch": 0.2547133512889573, + "flos": 508344629760.0, + "grad_norm": 0.07409821223339019, + "language_loss": 0.87412238, + "learning_rate": 0.0008733322165207681, + "loss": 0.88509512, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.30029297, + "step": 1324, + "time_per_iteration": 2.6910648345947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102689, + "balance_loss_mlp": 1.07295775, + "epoch": 0.25490573297422087, + "flos": 782266940928.0, + "grad_norm": 0.06686348955430095, + "language_loss": 0.83012944, + "learning_rate": 0.0008731249058420247, + "loss": 0.84115636, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.29663086, + "step": 1325, + "time_per_iteration": 3.0301432609558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105708, + "balance_loss_mlp": 1.07499993, + "epoch": 0.2550981146594844, + "flos": 509610451968.0, + "grad_norm": 0.057218587703981125, + "language_loss": 0.90547103, + "learning_rate": 0.0008729174503033459, + "loss": 0.91652811, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.30664062, + "step": 1326, + "time_per_iteration": 2.668544292449951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107604, + "balance_loss_mlp": 1.07706285, + "epoch": 0.255290496344748, + "flos": 676360212480.0, + "grad_norm": 0.08872727493885958, + "language_loss": 0.82430828, + "learning_rate": 0.0008727098499852728, + "loss": 0.83538437, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.30493164, + "step": 1327, + "time_per_iteration": 2.8206427097320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102439, + "balance_loss_mlp": 1.07175469, + "epoch": 0.2554828780300115, + "flos": 537524273664.0, + "grad_norm": 0.05995612334517853, + "language_loss": 0.8945381, + "learning_rate": 0.0008725021049684034, + "loss": 0.90556252, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.30639648, + "step": 1328, + "time_per_iteration": 2.7788021564483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110018, + "balance_loss_mlp": 1.06906641, + "epoch": 0.2556752597152751, + "flos": 823828534272.0, + "grad_norm": 0.07693053452424695, + "language_loss": 0.82675111, + "learning_rate": 0.000872294215333391, + "loss": 0.83775294, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.31079102, + "step": 1329, + "time_per_iteration": 3.208423137664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089607, + "balance_loss_mlp": 1.05820751, + "epoch": 0.2558676414005387, + "flos": 570517502976.0, + "grad_norm": 0.05833009001407562, + "language_loss": 0.83099753, + "learning_rate": 0.0008720861811609457, + "loss": 0.84189361, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.3137207, + "step": 1330, + "time_per_iteration": 2.723451614379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082701, + "balance_loss_mlp": 1.05122948, + "epoch": 0.2560600230858022, + "flos": 486419475456.0, + "grad_norm": 0.06841234134213905, + "language_loss": 0.83759737, + "learning_rate": 0.0008718780025318338, + "loss": 0.84842432, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.31445312, + "step": 1331, + "time_per_iteration": 2.7594637870788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084055, + "balance_loss_mlp": 1.05244088, + "epoch": 0.2562524047710658, + "flos": 512874008064.0, + "grad_norm": 0.059488371229756976, + "language_loss": 0.83890998, + "learning_rate": 0.0008716696795268771, + "loss": 0.84975058, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.31591797, + "step": 1332, + "time_per_iteration": 2.719435453414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086447, + "balance_loss_mlp": 1.05516648, + "epoch": 0.25644478645632934, + "flos": 634498873344.0, + "grad_norm": 0.09040651922247907, + "language_loss": 0.85621184, + "learning_rate": 0.0008714612122269538, + "loss": 0.86707628, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.3125, + "step": 1333, + "time_per_iteration": 2.846071481704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087221, + "balance_loss_mlp": 1.05517721, + "epoch": 0.25663716814159293, + "flos": 436353537024.0, + "grad_norm": 0.06079891504044088, + "language_loss": 0.8881824, + "learning_rate": 0.0008712526007129982, + "loss": 0.89905459, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.3203125, + "step": 1334, + "time_per_iteration": 2.5539238452911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084143, + "balance_loss_mlp": 1.05226636, + "epoch": 0.25682954982685646, + "flos": 497892013056.0, + "grad_norm": 0.06135189476637687, + "language_loss": 0.90600282, + "learning_rate": 0.0008710438450660003, + "loss": 0.91684425, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.31835938, + "step": 1335, + "time_per_iteration": 2.6957638263702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081701, + "balance_loss_mlp": 1.04984844, + "epoch": 0.25702193151212005, + "flos": 457471176192.0, + "grad_norm": 0.09152684925001835, + "language_loss": 0.86861122, + "learning_rate": 0.0008708349453670064, + "loss": 0.87942821, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.31835938, + "step": 1336, + "time_per_iteration": 2.569918632507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080209, + "balance_loss_mlp": 1.04854655, + "epoch": 0.2572143131973836, + "flos": 598002130944.0, + "grad_norm": 0.055029840901202824, + "language_loss": 0.91123867, + "learning_rate": 0.0008706259016971185, + "loss": 0.92204076, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.31640625, + "step": 1337, + "time_per_iteration": 2.7755186557769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077567, + "balance_loss_mlp": 1.04554725, + "epoch": 0.25740669488264717, + "flos": 698004559872.0, + "grad_norm": 0.08019888390454845, + "language_loss": 0.82668757, + "learning_rate": 0.0008704167141374944, + "loss": 0.83746326, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.32006836, + "step": 1338, + "time_per_iteration": 2.8559322357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073148, + "balance_loss_mlp": 1.04184318, + "epoch": 0.25759907656791076, + "flos": 502130409984.0, + "grad_norm": 0.06412343972447931, + "language_loss": 0.88389909, + "learning_rate": 0.0008702073827693482, + "loss": 0.89463055, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.31274414, + "step": 1339, + "time_per_iteration": 2.725090265274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077981, + "balance_loss_mlp": 1.04662943, + "epoch": 0.2577914582531743, + "flos": 773541425664.0, + "grad_norm": 0.06471871877048396, + "language_loss": 0.88798392, + "learning_rate": 0.0008699979076739494, + "loss": 0.89876378, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.31323242, + "step": 1340, + "time_per_iteration": 2.9663493633270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074686, + "balance_loss_mlp": 1.04354882, + "epoch": 0.2579838399384379, + "flos": 459431032320.0, + "grad_norm": 0.0844279622703065, + "language_loss": 0.88438749, + "learning_rate": 0.0008697882889326234, + "loss": 0.89513433, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.31103516, + "step": 1341, + "time_per_iteration": 2.5622262954711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081353, + "balance_loss_mlp": 1.05047798, + "epoch": 0.2581762216237014, + "flos": 568917029376.0, + "grad_norm": 0.07114901487039385, + "language_loss": 0.86560714, + "learning_rate": 0.0008695785266267515, + "loss": 0.87642074, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.30834961, + "step": 1342, + "time_per_iteration": 2.7169957160949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083185, + "balance_loss_mlp": 1.05309629, + "epoch": 0.258368603308965, + "flos": 603906430464.0, + "grad_norm": 0.06303738321086937, + "language_loss": 0.82804394, + "learning_rate": 0.0008693686208377704, + "loss": 0.83887577, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.30053711, + "step": 1343, + "time_per_iteration": 2.8591935634613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090607, + "balance_loss_mlp": 1.06142426, + "epoch": 0.2585609849942285, + "flos": 491204929536.0, + "grad_norm": 0.06465186244058573, + "language_loss": 0.88812125, + "learning_rate": 0.0008691585716471733, + "loss": 0.89902723, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.29150391, + "step": 1344, + "time_per_iteration": 2.6713430881500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099449, + "balance_loss_mlp": 1.07119632, + "epoch": 0.2587533666794921, + "flos": 640455607296.0, + "grad_norm": 0.0588719911399204, + "language_loss": 0.85261089, + "learning_rate": 0.0008689483791365079, + "loss": 0.86360538, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.28271484, + "step": 1345, + "time_per_iteration": 2.820528030395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112565, + "balance_loss_mlp": 1.08457518, + "epoch": 0.2589457483647557, + "flos": 576564397056.0, + "grad_norm": 0.06280839806958106, + "language_loss": 0.89176255, + "learning_rate": 0.0008687380433873786, + "loss": 0.90288818, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.28027344, + "step": 1346, + "time_per_iteration": 2.8161351680755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122151, + "balance_loss_mlp": 1.09442306, + "epoch": 0.25913813005001923, + "flos": 535164337152.0, + "grad_norm": 0.09019918884346267, + "language_loss": 0.82469404, + "learning_rate": 0.0008685275644814448, + "loss": 0.83591551, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.27734375, + "step": 1347, + "time_per_iteration": 2.693267822265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122406, + "balance_loss_mlp": 1.09403384, + "epoch": 0.2593305117352828, + "flos": 720713908224.0, + "grad_norm": 0.0763626786758855, + "language_loss": 0.83996952, + "learning_rate": 0.0008683169425004216, + "loss": 0.85119361, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.28393555, + "step": 1348, + "time_per_iteration": 2.9267332553863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104312, + "balance_loss_mlp": 1.07582057, + "epoch": 0.25952289342054635, + "flos": 709782635520.0, + "grad_norm": 0.0999879699530973, + "language_loss": 0.82942533, + "learning_rate": 0.0008681061775260799, + "loss": 0.84046841, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.28491211, + "step": 1349, + "time_per_iteration": 2.8389806747436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104623, + "balance_loss_mlp": 1.0761795, + "epoch": 0.25971527510580994, + "flos": 455688820224.0, + "grad_norm": 0.06848449496170159, + "language_loss": 0.9182089, + "learning_rate": 0.0008678952696402458, + "loss": 0.92925513, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.28442383, + "step": 1350, + "time_per_iteration": 2.520573377609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091244, + "balance_loss_mlp": 1.06270587, + "epoch": 0.25990765679107347, + "flos": 612223100928.0, + "grad_norm": 0.06363942150358032, + "language_loss": 0.86753285, + "learning_rate": 0.000867684218924801, + "loss": 0.87844533, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.28564453, + "step": 1351, + "time_per_iteration": 2.9015109539031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094999, + "balance_loss_mlp": 1.07382762, + "epoch": 0.26010003847633706, + "flos": 1537105766400.0, + "grad_norm": 0.03643594447100183, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80042088, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.21191406, + "step": 1352, + "time_per_iteration": 4.897913217544556 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089345, + "balance_loss_mlp": 1.05987692, + "epoch": 0.2602924201616006, + "flos": 715947393024.0, + "grad_norm": 0.05004222260192376, + "language_loss": 0.8488791, + "learning_rate": 0.0008672616893328834, + "loss": 0.85977256, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.29394531, + "step": 1353, + "time_per_iteration": 2.930330991744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089197, + "balance_loss_mlp": 1.05925155, + "epoch": 0.2604848018468642, + "flos": 643241917440.0, + "grad_norm": 0.06508424080641521, + "language_loss": 0.90170342, + "learning_rate": 0.0008670502106204512, + "loss": 0.91259539, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.29882812, + "step": 1354, + "time_per_iteration": 2.8581433296203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088042, + "balance_loss_mlp": 1.05821621, + "epoch": 0.26067718353212777, + "flos": 516783545856.0, + "grad_norm": 0.07357469643966064, + "language_loss": 0.81904948, + "learning_rate": 0.0008668385894064892, + "loss": 0.82992983, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.2980957, + "step": 1355, + "time_per_iteration": 2.6258199214935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086225, + "balance_loss_mlp": 1.05565977, + "epoch": 0.2608695652173913, + "flos": 822361890816.0, + "grad_norm": 0.05598612189883674, + "language_loss": 0.88435078, + "learning_rate": 0.0008666268257731562, + "loss": 0.89521307, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.30517578, + "step": 1356, + "time_per_iteration": 3.0935704708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096188, + "balance_loss_mlp": 1.06557548, + "epoch": 0.2610619469026549, + "flos": 1007451744768.0, + "grad_norm": 0.05877228431721195, + "language_loss": 0.85582316, + "learning_rate": 0.0008664149198026662, + "loss": 0.86678505, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.3059082, + "step": 1357, + "time_per_iteration": 3.3150172233581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093826, + "balance_loss_mlp": 1.06407189, + "epoch": 0.2612543285879184, + "flos": 536523291648.0, + "grad_norm": 0.08010917030088013, + "language_loss": 0.88609982, + "learning_rate": 0.0008662028715772883, + "loss": 0.8970381, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.29736328, + "step": 1358, + "time_per_iteration": 2.652510166168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117948, + "balance_loss_mlp": 1.08781219, + "epoch": 0.261446710273182, + "flos": 519166803456.0, + "grad_norm": 0.068011575409632, + "language_loss": 0.8599565, + "learning_rate": 0.0008659906811793467, + "loss": 0.87113595, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.30078125, + "step": 1359, + "time_per_iteration": 2.6895272731781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120144, + "balance_loss_mlp": 1.08917356, + "epoch": 0.26163909195844554, + "flos": 582975055872.0, + "grad_norm": 0.06541737550876531, + "language_loss": 0.89626461, + "learning_rate": 0.0008657783486912215, + "loss": 0.90746599, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.30932617, + "step": 1360, + "time_per_iteration": 2.762763738632202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112864, + "balance_loss_mlp": 1.09752679, + "epoch": 0.2618314736437091, + "flos": 958362057216.0, + "grad_norm": 0.08393806981558949, + "language_loss": 0.89884281, + "learning_rate": 0.0008655658741953472, + "loss": 0.91012919, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.31079102, + "step": 1361, + "time_per_iteration": 3.2099156379699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108189, + "balance_loss_mlp": 1.07740927, + "epoch": 0.26202385532897265, + "flos": 574530347520.0, + "grad_norm": 0.05266132623937494, + "language_loss": 0.88221049, + "learning_rate": 0.0008653532577742136, + "loss": 0.89329231, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.30761719, + "step": 1362, + "time_per_iteration": 2.6699323654174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097872, + "balance_loss_mlp": 1.06756878, + "epoch": 0.26221623701423624, + "flos": 445240585728.0, + "grad_norm": 0.06436829867728516, + "language_loss": 0.86740243, + "learning_rate": 0.0008651404995103659, + "loss": 0.87838113, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.30273438, + "step": 1363, + "time_per_iteration": 2.5310258865356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094148, + "balance_loss_mlp": 1.06286716, + "epoch": 0.26240861869949983, + "flos": 535459700736.0, + "grad_norm": 0.05795299669830668, + "language_loss": 0.8642996, + "learning_rate": 0.0008649275994864041, + "loss": 0.87524116, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.3125, + "step": 1364, + "time_per_iteration": 2.675330638885498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110185, + "balance_loss_mlp": 1.07066512, + "epoch": 0.26260100038476336, + "flos": 564940500480.0, + "grad_norm": 0.05147405231292679, + "language_loss": 0.83778602, + "learning_rate": 0.0008647145577849834, + "loss": 0.84880447, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.31152344, + "step": 1365, + "time_per_iteration": 2.817330837249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099582, + "balance_loss_mlp": 1.06913614, + "epoch": 0.26279338207002695, + "flos": 612745426944.0, + "grad_norm": 0.05119291352940178, + "language_loss": 0.82886052, + "learning_rate": 0.0008645013744888139, + "loss": 0.83985633, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.30395508, + "step": 1366, + "time_per_iteration": 2.9056894779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093325, + "balance_loss_mlp": 1.06318903, + "epoch": 0.2629857637552905, + "flos": 522555425280.0, + "grad_norm": 0.08887633390516779, + "language_loss": 0.8772788, + "learning_rate": 0.0008642880496806607, + "loss": 0.88821203, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.30102539, + "step": 1367, + "time_per_iteration": 2.8175759315490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094726, + "balance_loss_mlp": 1.0635649, + "epoch": 0.26317814544055407, + "flos": 534273864192.0, + "grad_norm": 0.0720053964715196, + "language_loss": 0.84128964, + "learning_rate": 0.0008640745834433437, + "loss": 0.85223687, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.3112793, + "step": 1368, + "time_per_iteration": 2.7703893184661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085473, + "balance_loss_mlp": 1.05559897, + "epoch": 0.2633705271258176, + "flos": 555235762176.0, + "grad_norm": 0.058958451803685384, + "language_loss": 0.86905044, + "learning_rate": 0.000863860975859738, + "loss": 0.87990516, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.29833984, + "step": 1369, + "time_per_iteration": 2.913543224334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093496, + "balance_loss_mlp": 1.06309724, + "epoch": 0.2635629088110812, + "flos": 552136711680.0, + "grad_norm": 0.07885033776141591, + "language_loss": 0.87845421, + "learning_rate": 0.0008636472270127733, + "loss": 0.8893891, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.3034668, + "step": 1370, + "time_per_iteration": 2.6615941524505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093443, + "balance_loss_mlp": 1.06368852, + "epoch": 0.2637552904963448, + "flos": 455752839168.0, + "grad_norm": 0.06686078076555955, + "language_loss": 0.90047085, + "learning_rate": 0.0008634333369854345, + "loss": 0.91140521, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.29736328, + "step": 1371, + "time_per_iteration": 2.611501932144165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109652, + "balance_loss_mlp": 1.06666958, + "epoch": 0.2639476721816083, + "flos": 612847323648.0, + "grad_norm": 0.05135890593758564, + "language_loss": 0.87519878, + "learning_rate": 0.0008632193058607608, + "loss": 0.88616395, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.29833984, + "step": 1372, + "time_per_iteration": 2.7420408725738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096239, + "balance_loss_mlp": 1.06681848, + "epoch": 0.2641400538668719, + "flos": 571645112832.0, + "grad_norm": 0.07070265457366111, + "language_loss": 0.80896008, + "learning_rate": 0.0008630051337218466, + "loss": 0.81992251, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.29394531, + "step": 1373, + "time_per_iteration": 2.694157123565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097092, + "balance_loss_mlp": 1.06762338, + "epoch": 0.2643324355521354, + "flos": 581979866112.0, + "grad_norm": 0.06318549857397857, + "language_loss": 0.8188293, + "learning_rate": 0.0008627908206518409, + "loss": 0.82980019, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.29418945, + "step": 1374, + "time_per_iteration": 2.703380823135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023426, + "balance_loss_mlp": 1.00330341, + "epoch": 0.264524817237399, + "flos": 1543845284352.0, + "grad_norm": 0.017765090827900253, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76174676, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.20117188, + "step": 1375, + "time_per_iteration": 4.995063781738281 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092603, + "balance_loss_mlp": 1.06237197, + "epoch": 0.26471719892266254, + "flos": 517783117824.0, + "grad_norm": 0.0561933760173491, + "language_loss": 0.9114545, + "learning_rate": 0.0008623617720514241, + "loss": 0.92238057, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.30224609, + "step": 1376, + "time_per_iteration": 2.666578769683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093572, + "balance_loss_mlp": 1.06276798, + "epoch": 0.26490958060792613, + "flos": 516936314880.0, + "grad_norm": 0.06268473823371516, + "language_loss": 0.84907627, + "learning_rate": 0.0008621470366875848, + "loss": 0.86001205, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.30761719, + "step": 1377, + "time_per_iteration": 2.576968193054199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087995, + "balance_loss_mlp": 1.05661869, + "epoch": 0.26510196229318966, + "flos": 596298350592.0, + "grad_norm": 0.05801174228437736, + "language_loss": 0.87514544, + "learning_rate": 0.0008619321607257966, + "loss": 0.88602537, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.31347656, + "step": 1378, + "time_per_iteration": 2.6873912811279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083836, + "balance_loss_mlp": 1.05396187, + "epoch": 0.26529434397845325, + "flos": 685488780288.0, + "grad_norm": 0.06612008054140536, + "language_loss": 0.81601393, + "learning_rate": 0.000861717144249482, + "loss": 0.82685226, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.2980957, + "step": 1379, + "time_per_iteration": 2.861531972885132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082319, + "balance_loss_mlp": 1.05220687, + "epoch": 0.26548672566371684, + "flos": 424127328768.0, + "grad_norm": 0.06041061044303736, + "language_loss": 0.89415485, + "learning_rate": 0.0008615019873421175, + "loss": 0.90497804, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.30053711, + "step": 1380, + "time_per_iteration": 2.4596564769744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080185, + "balance_loss_mlp": 1.04973865, + "epoch": 0.26567910734898037, + "flos": 489619012608.0, + "grad_norm": 0.12029414194163875, + "language_loss": 0.85435975, + "learning_rate": 0.0008612866900872349, + "loss": 0.86516166, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.30395508, + "step": 1381, + "time_per_iteration": 2.5492422580718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078246, + "balance_loss_mlp": 1.0483005, + "epoch": 0.26587148903424396, + "flos": 533947977216.0, + "grad_norm": 0.06111803920627532, + "language_loss": 0.87957448, + "learning_rate": 0.0008610712525684197, + "loss": 0.89035696, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.29882812, + "step": 1382, + "time_per_iteration": 2.632847309112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083959, + "balance_loss_mlp": 1.05356061, + "epoch": 0.2660638707195075, + "flos": 1017067732992.0, + "grad_norm": 0.07781171288722535, + "language_loss": 0.84130585, + "learning_rate": 0.0008608556748693121, + "loss": 0.85214543, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.3034668, + "step": 1383, + "time_per_iteration": 3.246919631958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108693, + "balance_loss_mlp": 1.05522013, + "epoch": 0.2662562524047711, + "flos": 523712148480.0, + "grad_norm": 0.052993237489823604, + "language_loss": 0.85963714, + "learning_rate": 0.000860639957073607, + "loss": 0.87050641, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.31689453, + "step": 1384, + "time_per_iteration": 2.7504889965057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086729, + "balance_loss_mlp": 1.05537665, + "epoch": 0.2664486340900346, + "flos": 552107598336.0, + "grad_norm": 0.06878538642870029, + "language_loss": 0.87610686, + "learning_rate": 0.0008604240992650534, + "loss": 0.88697416, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.31347656, + "step": 1385, + "time_per_iteration": 2.6546881198883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082661, + "balance_loss_mlp": 1.05135679, + "epoch": 0.2666410157752982, + "flos": 469895233536.0, + "grad_norm": 0.05853696199287041, + "language_loss": 0.89197159, + "learning_rate": 0.0008602081015274545, + "loss": 0.90279818, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.31274414, + "step": 1386, + "time_per_iteration": 2.7526328563690186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091919, + "balance_loss_mlp": 1.06061459, + "epoch": 0.2668333974605617, + "flos": 569645968896.0, + "grad_norm": 0.05264786586341277, + "language_loss": 0.83147365, + "learning_rate": 0.0008599919639446684, + "loss": 0.8423928, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.31274414, + "step": 1387, + "time_per_iteration": 2.6775026321411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093309, + "balance_loss_mlp": 1.06126583, + "epoch": 0.2670257791458253, + "flos": 398755325952.0, + "grad_norm": 0.06747698326814106, + "language_loss": 0.79790741, + "learning_rate": 0.000859775686600607, + "loss": 0.80884051, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.3203125, + "step": 1388, + "time_per_iteration": 2.535508871078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090634, + "balance_loss_mlp": 1.05921042, + "epoch": 0.2672181608310889, + "flos": 515587534848.0, + "grad_norm": 0.06336986871451572, + "language_loss": 0.84764999, + "learning_rate": 0.0008595592695792367, + "loss": 0.85855639, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.31396484, + "step": 1389, + "time_per_iteration": 2.6549055576324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097148, + "balance_loss_mlp": 1.06593931, + "epoch": 0.26741054251635243, + "flos": 507270864384.0, + "grad_norm": 0.055901377362424544, + "language_loss": 0.90619266, + "learning_rate": 0.0008593427129645778, + "loss": 0.91716409, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.31176758, + "step": 1390, + "time_per_iteration": 2.6070477962493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096463, + "balance_loss_mlp": 1.06542134, + "epoch": 0.267602924201616, + "flos": 576357783552.0, + "grad_norm": 0.06788313950064188, + "language_loss": 0.85213327, + "learning_rate": 0.0008591260168407052, + "loss": 0.86309791, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.31005859, + "step": 1391, + "time_per_iteration": 2.794921398162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092372, + "balance_loss_mlp": 1.05963671, + "epoch": 0.26779530588687955, + "flos": 523731087360.0, + "grad_norm": 0.052723370404498295, + "language_loss": 0.82993329, + "learning_rate": 0.0008589091812917479, + "loss": 0.84085703, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.32739258, + "step": 1392, + "time_per_iteration": 2.634734869003296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088674, + "balance_loss_mlp": 1.05727446, + "epoch": 0.26798768757214314, + "flos": 556508938752.0, + "grad_norm": 0.06846284491975779, + "language_loss": 0.85420829, + "learning_rate": 0.0008586922064018887, + "loss": 0.86509502, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.3137207, + "step": 1393, + "time_per_iteration": 2.662095308303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108591, + "balance_loss_mlp": 1.05408156, + "epoch": 0.2681800692574067, + "flos": 930246004224.0, + "grad_norm": 0.07721778370466406, + "language_loss": 0.89049023, + "learning_rate": 0.0008584750922553651, + "loss": 0.90134937, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.31811523, + "step": 1394, + "time_per_iteration": 3.15010666847229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082121, + "balance_loss_mlp": 1.05053067, + "epoch": 0.26837245094267026, + "flos": 700771931136.0, + "grad_norm": 0.054821616219537066, + "language_loss": 0.83275163, + "learning_rate": 0.0008582578389364677, + "loss": 0.8435728, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.31567383, + "step": 1395, + "time_per_iteration": 2.9199917316436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086932, + "balance_loss_mlp": 1.05469775, + "epoch": 0.26856483262793385, + "flos": 592892199936.0, + "grad_norm": 0.049938668546041676, + "language_loss": 0.91772366, + "learning_rate": 0.0008580404465295422, + "loss": 0.92859298, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.32226562, + "step": 1396, + "time_per_iteration": 2.8488125801086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079266, + "balance_loss_mlp": 1.04891562, + "epoch": 0.2687572143131974, + "flos": 713943866880.0, + "grad_norm": 0.06204428603549851, + "language_loss": 0.87966394, + "learning_rate": 0.0008578229151189876, + "loss": 0.89045662, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.30297852, + "step": 1397, + "time_per_iteration": 2.92258620262146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081241, + "balance_loss_mlp": 1.04867268, + "epoch": 0.26894959599846097, + "flos": 467481452544.0, + "grad_norm": 0.06429333021146523, + "language_loss": 0.81249309, + "learning_rate": 0.0008576052447892573, + "loss": 0.82330555, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.32568359, + "step": 1398, + "time_per_iteration": 2.551042318344116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083897, + "balance_loss_mlp": 1.05163908, + "epoch": 0.2691419776837245, + "flos": 468470850048.0, + "grad_norm": 0.0671833421183549, + "language_loss": 0.86040235, + "learning_rate": 0.000857387435624858, + "loss": 0.87124133, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.32250977, + "step": 1399, + "time_per_iteration": 2.5816056728363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086843, + "balance_loss_mlp": 1.05382252, + "epoch": 0.2693343593689881, + "flos": 937244418048.0, + "grad_norm": 0.05003222473195782, + "language_loss": 0.87953913, + "learning_rate": 0.0008571694877103513, + "loss": 0.89040762, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.33032227, + "step": 1400, + "time_per_iteration": 3.256469488143921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108756, + "balance_loss_mlp": 1.05542135, + "epoch": 0.2695267410542516, + "flos": 577303511040.0, + "grad_norm": 0.056643414184275494, + "language_loss": 0.87665725, + "learning_rate": 0.0008569514011303515, + "loss": 0.88753277, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.32128906, + "step": 1401, + "time_per_iteration": 2.782273054122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084611, + "balance_loss_mlp": 1.05275857, + "epoch": 0.2697191227395152, + "flos": 556539462144.0, + "grad_norm": 0.06127144796082157, + "language_loss": 0.8767277, + "learning_rate": 0.0008567331759695277, + "loss": 0.88757378, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.31835938, + "step": 1402, + "time_per_iteration": 2.696514368057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084545, + "balance_loss_mlp": 1.05178595, + "epoch": 0.26991150442477874, + "flos": 529024310784.0, + "grad_norm": 0.07491599518741582, + "language_loss": 0.86524475, + "learning_rate": 0.0008565148123126023, + "loss": 0.87609023, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.32763672, + "step": 1403, + "time_per_iteration": 2.6686785221099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088194, + "balance_loss_mlp": 1.05510116, + "epoch": 0.2701038861100423, + "flos": 531737837568.0, + "grad_norm": 0.050644669708274456, + "language_loss": 0.8574301, + "learning_rate": 0.0008562963102443516, + "loss": 0.86831206, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.33105469, + "step": 1404, + "time_per_iteration": 2.693836212158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085916, + "balance_loss_mlp": 1.05232334, + "epoch": 0.2702962677953059, + "flos": 734908737024.0, + "grad_norm": 0.06951419199959312, + "language_loss": 0.84958577, + "learning_rate": 0.0008560776698496056, + "loss": 0.8604449, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.33618164, + "step": 1405, + "time_per_iteration": 2.892805814743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084222, + "balance_loss_mlp": 1.05093896, + "epoch": 0.27048864948056944, + "flos": 574453181952.0, + "grad_norm": 0.07287556066439085, + "language_loss": 0.85794389, + "learning_rate": 0.0008558588912132481, + "loss": 0.8687861, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.33300781, + "step": 1406, + "time_per_iteration": 2.821922540664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098005, + "balance_loss_mlp": 1.07587957, + "epoch": 0.27068103116583303, + "flos": 1423091953152.0, + "grad_norm": 0.044578698770804955, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77556992, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.22167969, + "step": 1407, + "time_per_iteration": 4.952622652053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082949, + "balance_loss_mlp": 1.05016637, + "epoch": 0.27087341285109656, + "flos": 531742219776.0, + "grad_norm": 0.05991157104862915, + "language_loss": 0.82959783, + "learning_rate": 0.0008554209195555016, + "loss": 0.84042734, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.32788086, + "step": 1408, + "time_per_iteration": 2.6937575340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085788, + "balance_loss_mlp": 1.05403042, + "epoch": 0.27106579453636015, + "flos": 581108332032.0, + "grad_norm": 0.06960051295953752, + "language_loss": 0.88047969, + "learning_rate": 0.0008552017267041483, + "loss": 0.89133757, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.31738281, + "step": 1409, + "time_per_iteration": 2.7926084995269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093218, + "balance_loss_mlp": 1.06134176, + "epoch": 0.2712581762216237, + "flos": 506533160448.0, + "grad_norm": 0.07424010893339522, + "language_loss": 0.8324914, + "learning_rate": 0.0008549823959512549, + "loss": 0.8434236, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.31860352, + "step": 1410, + "time_per_iteration": 2.660325050354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098331, + "balance_loss_mlp": 1.06724083, + "epoch": 0.27145055790688727, + "flos": 997019476992.0, + "grad_norm": 0.062062202361739795, + "language_loss": 0.86755967, + "learning_rate": 0.0008547629273819728, + "loss": 0.87854296, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.31054688, + "step": 1411, + "time_per_iteration": 3.3994545936584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098737, + "balance_loss_mlp": 1.06736147, + "epoch": 0.2716429395921508, + "flos": 546420086784.0, + "grad_norm": 0.06335672358829844, + "language_loss": 0.83453959, + "learning_rate": 0.0008545433210815074, + "loss": 0.84552693, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.31347656, + "step": 1412, + "time_per_iteration": 2.644434690475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102874, + "balance_loss_mlp": 1.07123613, + "epoch": 0.2718353212774144, + "flos": 572954605056.0, + "grad_norm": 0.06340025797507488, + "language_loss": 0.87345338, + "learning_rate": 0.0008543235771351176, + "loss": 0.88448215, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.31616211, + "step": 1413, + "time_per_iteration": 2.7854721546173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098411, + "balance_loss_mlp": 1.0675596, + "epoch": 0.272027702962678, + "flos": 643986823680.0, + "grad_norm": 0.05399278560092938, + "language_loss": 0.84545946, + "learning_rate": 0.0008541036956281154, + "loss": 0.85644352, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.30834961, + "step": 1414, + "time_per_iteration": 2.8788704872131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091962, + "balance_loss_mlp": 1.06056201, + "epoch": 0.2722200846479415, + "flos": 653410755072.0, + "grad_norm": 0.07883268546047513, + "language_loss": 0.81883514, + "learning_rate": 0.0008538836766458665, + "loss": 0.82975471, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.3137207, + "step": 1415, + "time_per_iteration": 2.8526153564453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087599, + "balance_loss_mlp": 1.05732012, + "epoch": 0.2724124663332051, + "flos": 579346324992.0, + "grad_norm": 0.060849568603238105, + "language_loss": 0.84889638, + "learning_rate": 0.0008536635202737897, + "loss": 0.85977244, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.30224609, + "step": 1416, + "time_per_iteration": 2.837353467941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089772, + "balance_loss_mlp": 1.05903983, + "epoch": 0.2726048480184686, + "flos": 537178037760.0, + "grad_norm": 0.07898075745209039, + "language_loss": 0.82057679, + "learning_rate": 0.0008534432265973573, + "loss": 0.83147448, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.30688477, + "step": 1417, + "time_per_iteration": 2.5948355197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091815, + "balance_loss_mlp": 1.05891299, + "epoch": 0.2727972297037322, + "flos": 995360776704.0, + "grad_norm": 0.06605458024108496, + "language_loss": 0.87714171, + "learning_rate": 0.000853222795702095, + "loss": 0.88805991, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.32910156, + "step": 1418, + "time_per_iteration": 3.4183547496795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109188, + "balance_loss_mlp": 1.05842948, + "epoch": 0.27298961138899575, + "flos": 605924513280.0, + "grad_norm": 0.04642939327926388, + "language_loss": 0.83471483, + "learning_rate": 0.0008530022276735813, + "loss": 0.84563363, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.33447266, + "step": 1419, + "time_per_iteration": 2.711695432662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086506, + "balance_loss_mlp": 1.05293703, + "epoch": 0.27318199307425933, + "flos": 529059216384.0, + "grad_norm": 0.05938997521105461, + "language_loss": 0.85724676, + "learning_rate": 0.0008527815225974489, + "loss": 0.86811179, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.3359375, + "step": 1420, + "time_per_iteration": 2.648448944091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086523, + "balance_loss_mlp": 1.05407453, + "epoch": 0.2733743747595229, + "flos": 408809272320.0, + "grad_norm": 0.07492898694353861, + "language_loss": 0.87982917, + "learning_rate": 0.0008525606805593829, + "loss": 0.89069438, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.32446289, + "step": 1421, + "time_per_iteration": 2.4182560443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082421, + "balance_loss_mlp": 1.04997277, + "epoch": 0.27356675644478645, + "flos": 515976030720.0, + "grad_norm": 0.06962089633364145, + "language_loss": 0.82760686, + "learning_rate": 0.0008523397016451213, + "loss": 0.83843112, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.32446289, + "step": 1422, + "time_per_iteration": 2.587892532348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083477, + "balance_loss_mlp": 1.05021799, + "epoch": 0.27375913813005004, + "flos": 1051914539520.0, + "grad_norm": 0.053513553181154576, + "language_loss": 0.8711561, + "learning_rate": 0.0008521185859404564, + "loss": 0.88199091, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.33276367, + "step": 1423, + "time_per_iteration": 3.372192859649658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108477, + "balance_loss_mlp": 1.0513202, + "epoch": 0.27395151981531357, + "flos": 624507535872.0, + "grad_norm": 0.059986100163812936, + "language_loss": 0.89238524, + "learning_rate": 0.0008518973335312326, + "loss": 0.90323293, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.33447266, + "step": 1424, + "time_per_iteration": 2.791482448577881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082662, + "balance_loss_mlp": 1.04921198, + "epoch": 0.27414390150057716, + "flos": 550112836608.0, + "grad_norm": 0.06956472940992567, + "language_loss": 0.8333236, + "learning_rate": 0.0008516759445033477, + "loss": 0.84415025, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.3347168, + "step": 1425, + "time_per_iteration": 2.6889073848724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082757, + "balance_loss_mlp": 1.05088091, + "epoch": 0.2743362831858407, + "flos": 539596200960.0, + "grad_norm": 0.0615305422895171, + "language_loss": 0.84459686, + "learning_rate": 0.0008514544189427526, + "loss": 0.85542446, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.31860352, + "step": 1426, + "time_per_iteration": 2.797384738922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094312, + "balance_loss_mlp": 1.06143463, + "epoch": 0.2745286648711043, + "flos": 468352986624.0, + "grad_norm": 0.061840511174045036, + "language_loss": 0.86558306, + "learning_rate": 0.0008512327569354511, + "loss": 0.87652624, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.32885742, + "step": 1427, + "time_per_iteration": 2.533623695373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096344, + "balance_loss_mlp": 1.06418157, + "epoch": 0.2747210465563678, + "flos": 472617524736.0, + "grad_norm": 0.06551541099381472, + "language_loss": 0.83328068, + "learning_rate": 0.0008510109585675001, + "loss": 0.84424412, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.3215332, + "step": 1428, + "time_per_iteration": 2.623915672302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125702, + "balance_loss_mlp": 1.10653293, + "epoch": 0.2749134282416314, + "flos": 1314345070080.0, + "grad_norm": 0.06717437310459566, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82279044, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.19140625, + "step": 1429, + "time_per_iteration": 4.737167596817017 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096832, + "balance_loss_mlp": 1.06517005, + "epoch": 0.275105809926895, + "flos": 970445670912.0, + "grad_norm": 0.06718416370196487, + "language_loss": 0.80457842, + "learning_rate": 0.0008505669530941415, + "loss": 0.81554675, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.31640625, + "step": 1430, + "time_per_iteration": 3.380617141723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103498, + "balance_loss_mlp": 1.07169294, + "epoch": 0.2752981916121585, + "flos": 527089185792.0, + "grad_norm": 0.06498994038544256, + "language_loss": 0.83560073, + "learning_rate": 0.000850344746161112, + "loss": 0.8466357, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.31787109, + "step": 1431, + "time_per_iteration": 2.5917775630950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110093, + "balance_loss_mlp": 1.06883883, + "epoch": 0.2754905732974221, + "flos": 453487444992.0, + "grad_norm": 0.06649249705457211, + "language_loss": 0.87664711, + "learning_rate": 0.0008501224032121894, + "loss": 0.88765645, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.32080078, + "step": 1432, + "time_per_iteration": 2.493826150894165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101828, + "balance_loss_mlp": 1.06906962, + "epoch": 0.27568295498268564, + "flos": 497216918016.0, + "grad_norm": 0.06530156063230687, + "language_loss": 0.8172394, + "learning_rate": 0.0008498999243336946, + "loss": 0.82825768, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.32763672, + "step": 1433, + "time_per_iteration": 2.625955581665039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104254, + "balance_loss_mlp": 1.07275844, + "epoch": 0.2758753366679492, + "flos": 607890161664.0, + "grad_norm": 0.056445052388478564, + "language_loss": 0.87110436, + "learning_rate": 0.0008496773096120021, + "loss": 0.88214689, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.31469727, + "step": 1434, + "time_per_iteration": 2.8644402027130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093048, + "balance_loss_mlp": 1.06169593, + "epoch": 0.27606771835321275, + "flos": 739803290112.0, + "grad_norm": 0.07767765628739494, + "language_loss": 0.84306771, + "learning_rate": 0.0008494545591335381, + "loss": 0.85399818, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.31323242, + "step": 1435, + "time_per_iteration": 2.9069130420684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094657, + "balance_loss_mlp": 1.06366265, + "epoch": 0.27626010003847634, + "flos": 554279860224.0, + "grad_norm": 0.04344696113506711, + "language_loss": 0.86938953, + "learning_rate": 0.0008492316729847823, + "loss": 0.88033605, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.30957031, + "step": 1436, + "time_per_iteration": 2.844926595687866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091812, + "balance_loss_mlp": 1.06050754, + "epoch": 0.2764524817237399, + "flos": 542270439936.0, + "grad_norm": 0.055139322891005815, + "language_loss": 0.79749823, + "learning_rate": 0.0008490086512522664, + "loss": 0.80841637, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.31274414, + "step": 1437, + "time_per_iteration": 2.722158670425415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092682, + "balance_loss_mlp": 1.06121063, + "epoch": 0.27664486340900346, + "flos": 406027344384.0, + "grad_norm": 0.06334111858493886, + "language_loss": 0.90728873, + "learning_rate": 0.0008487854940225755, + "loss": 0.91821557, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.31445312, + "step": 1438, + "time_per_iteration": 2.43622088432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091549, + "balance_loss_mlp": 1.05991077, + "epoch": 0.27683724509426705, + "flos": 521884712448.0, + "grad_norm": 0.05907133214000555, + "language_loss": 0.89962572, + "learning_rate": 0.0008485622013823466, + "loss": 0.91054124, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.31616211, + "step": 1439, + "time_per_iteration": 2.61773943901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093806, + "balance_loss_mlp": 1.06154847, + "epoch": 0.2770296267795306, + "flos": 535085761536.0, + "grad_norm": 0.06492331678063241, + "language_loss": 0.82635379, + "learning_rate": 0.00084833877341827, + "loss": 0.83729184, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.32250977, + "step": 1440, + "time_per_iteration": 2.625870704650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092721, + "balance_loss_mlp": 1.06139278, + "epoch": 0.27722200846479417, + "flos": 487747906560.0, + "grad_norm": 0.06674971698169922, + "language_loss": 0.80478823, + "learning_rate": 0.000848115210217088, + "loss": 0.81571543, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.31298828, + "step": 1441, + "time_per_iteration": 2.577364444732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086558, + "balance_loss_mlp": 1.05410933, + "epoch": 0.2774143901500577, + "flos": 618012509184.0, + "grad_norm": 0.055312199129178424, + "language_loss": 0.81684244, + "learning_rate": 0.0008478915118655952, + "loss": 0.82770801, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.32446289, + "step": 1442, + "time_per_iteration": 2.714303493499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089692, + "balance_loss_mlp": 1.05710077, + "epoch": 0.2776067718353213, + "flos": 513563659776.0, + "grad_norm": 0.049794988647852687, + "language_loss": 0.86386287, + "learning_rate": 0.0008476676784506393, + "loss": 0.87475979, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.32592773, + "step": 1443, + "time_per_iteration": 2.68835711479187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087996, + "balance_loss_mlp": 1.05664372, + "epoch": 0.2777991535205848, + "flos": 1003985957376.0, + "grad_norm": 0.05900532389488003, + "language_loss": 0.82031631, + "learning_rate": 0.0008474437100591201, + "loss": 0.83119631, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.31323242, + "step": 1444, + "time_per_iteration": 3.3359997272491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084677, + "balance_loss_mlp": 1.05160809, + "epoch": 0.2779915352058484, + "flos": 550005147648.0, + "grad_norm": 0.054436577911169556, + "language_loss": 0.85231566, + "learning_rate": 0.0008472196067779898, + "loss": 0.86316246, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.33081055, + "step": 1445, + "time_per_iteration": 2.7946455478668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080884, + "balance_loss_mlp": 1.04850721, + "epoch": 0.278183916891112, + "flos": 873444930048.0, + "grad_norm": 0.08667298623079295, + "language_loss": 0.85239732, + "learning_rate": 0.0008469953686942531, + "loss": 0.86320615, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.32373047, + "step": 1446, + "time_per_iteration": 3.0761613845825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081651, + "balance_loss_mlp": 1.04927349, + "epoch": 0.2783762985763755, + "flos": 623782978560.0, + "grad_norm": 0.07591437330096602, + "language_loss": 0.8283245, + "learning_rate": 0.0008467709958949668, + "loss": 0.83914101, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.32373047, + "step": 1447, + "time_per_iteration": 2.7922093868255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082166, + "balance_loss_mlp": 1.0504328, + "epoch": 0.2785686802616391, + "flos": 581571021312.0, + "grad_norm": 0.0636917665663464, + "language_loss": 0.86192262, + "learning_rate": 0.0008465464884672403, + "loss": 0.8727442, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.31713867, + "step": 1448, + "time_per_iteration": 2.679574966430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108413, + "balance_loss_mlp": 1.05211091, + "epoch": 0.27876106194690264, + "flos": 587032980480.0, + "grad_norm": 0.06494062959974968, + "language_loss": 0.85664314, + "learning_rate": 0.0008463218464982348, + "loss": 0.86748445, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.32006836, + "step": 1449, + "time_per_iteration": 2.8746044635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086598, + "balance_loss_mlp": 1.05524611, + "epoch": 0.27895344363216623, + "flos": 875621574144.0, + "grad_norm": 0.05859002353759583, + "language_loss": 0.87554371, + "learning_rate": 0.0008460970700751645, + "loss": 0.88640976, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.31323242, + "step": 1450, + "time_per_iteration": 3.0630292892456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085303, + "balance_loss_mlp": 1.05447531, + "epoch": 0.27914582531742976, + "flos": 603630005760.0, + "grad_norm": 0.06644970008868617, + "language_loss": 0.8732717, + "learning_rate": 0.000845872159285295, + "loss": 0.8841247, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.30786133, + "step": 1451, + "time_per_iteration": 2.7334539890289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149095, + "balance_loss_mlp": 1.13173842, + "epoch": 0.27933820700269335, + "flos": 1496892953088.0, + "grad_norm": 0.04059568749878616, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78915942, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.17382812, + "step": 1452, + "time_per_iteration": 4.913143634796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087672, + "balance_loss_mlp": 1.05617714, + "epoch": 0.2795305886879569, + "flos": 1031445854208.0, + "grad_norm": 0.05755695164820471, + "language_loss": 0.86085773, + "learning_rate": 0.0008454219349544836, + "loss": 0.87173438, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.31469727, + "step": 1453, + "time_per_iteration": 3.3649299144744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086718, + "balance_loss_mlp": 1.05569983, + "epoch": 0.27972297037322047, + "flos": 606766934016.0, + "grad_norm": 0.059728326526783365, + "language_loss": 0.8137995, + "learning_rate": 0.000845196621588334, + "loss": 0.82466674, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.30981445, + "step": 1454, + "time_per_iteration": 2.7774734497070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082042, + "balance_loss_mlp": 1.05095196, + "epoch": 0.27991535205848406, + "flos": 630085948416.0, + "grad_norm": 0.0559695634724148, + "language_loss": 0.76184201, + "learning_rate": 0.0008449711742049706, + "loss": 0.77266252, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.31054688, + "step": 1455, + "time_per_iteration": 2.75393009185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107969, + "balance_loss_mlp": 1.04814696, + "epoch": 0.2801077337437476, + "flos": 549034689024.0, + "grad_norm": 0.06397369460964857, + "language_loss": 0.83309555, + "learning_rate": 0.0008447455928919196, + "loss": 0.84389246, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.31518555, + "step": 1456, + "time_per_iteration": 2.6542584896087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082481, + "balance_loss_mlp": 1.05177259, + "epoch": 0.2803001154290112, + "flos": 486516989952.0, + "grad_norm": 0.06274060179370718, + "language_loss": 0.86886203, + "learning_rate": 0.0008445198777367595, + "loss": 0.87968683, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.30664062, + "step": 1457, + "time_per_iteration": 2.6488282680511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089589, + "balance_loss_mlp": 1.05883336, + "epoch": 0.2804924971142747, + "flos": 521820693504.0, + "grad_norm": 0.06557026121847803, + "language_loss": 0.8106361, + "learning_rate": 0.0008442940288271208, + "loss": 0.82153201, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.30712891, + "step": 1458, + "time_per_iteration": 2.67258882522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096326, + "balance_loss_mlp": 1.06454456, + "epoch": 0.2806848787995383, + "flos": 527410690560.0, + "grad_norm": 0.07361561415976156, + "language_loss": 0.86939961, + "learning_rate": 0.0008440680462506856, + "loss": 0.88036287, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.31762695, + "step": 1459, + "time_per_iteration": 2.7335550785064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105041, + "balance_loss_mlp": 1.07354569, + "epoch": 0.2808772604848018, + "flos": 485246785536.0, + "grad_norm": 0.05419081251366802, + "language_loss": 0.86197531, + "learning_rate": 0.0008438419300951883, + "loss": 0.87302566, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.31469727, + "step": 1460, + "time_per_iteration": 2.6306796073913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106375, + "balance_loss_mlp": 1.07459426, + "epoch": 0.2810696421700654, + "flos": 617840801280.0, + "grad_norm": 0.08520166677325354, + "language_loss": 0.8634038, + "learning_rate": 0.0008436156804484148, + "loss": 0.87446761, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.31762695, + "step": 1461, + "time_per_iteration": 2.761599063873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110218, + "balance_loss_mlp": 1.0698266, + "epoch": 0.28126202385532895, + "flos": 454521922560.0, + "grad_norm": 0.06649626079325978, + "language_loss": 0.88025403, + "learning_rate": 0.0008433892973982031, + "loss": 0.89127588, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.32348633, + "step": 1462, + "time_per_iteration": 2.572810173034668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110576, + "balance_loss_mlp": 1.07333505, + "epoch": 0.28145440554059253, + "flos": 530447284224.0, + "grad_norm": 0.06397092621415032, + "language_loss": 0.85030043, + "learning_rate": 0.0008431627810324431, + "loss": 0.86135799, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.32421875, + "step": 1463, + "time_per_iteration": 2.6855740547180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109382, + "balance_loss_mlp": 1.0774579, + "epoch": 0.2816467872258561, + "flos": 451996070400.0, + "grad_norm": 0.06457367310459801, + "language_loss": 0.81006026, + "learning_rate": 0.000842936131439076, + "loss": 0.82115412, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.3190918, + "step": 1464, + "time_per_iteration": 2.5868756771087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102286, + "balance_loss_mlp": 1.07188725, + "epoch": 0.28183916891111965, + "flos": 472464755712.0, + "grad_norm": 0.06483114531916107, + "language_loss": 0.87564301, + "learning_rate": 0.0008427093487060951, + "loss": 0.88666582, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.3034668, + "step": 1465, + "time_per_iteration": 2.6775078773498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104224, + "balance_loss_mlp": 1.07294393, + "epoch": 0.28203155059638324, + "flos": 556770806784.0, + "grad_norm": 0.05163652452488039, + "language_loss": 0.84608126, + "learning_rate": 0.000842482432921545, + "loss": 0.85712349, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.3125, + "step": 1466, + "time_per_iteration": 2.844379186630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090816, + "balance_loss_mlp": 1.05955911, + "epoch": 0.28222393228164677, + "flos": 416756385792.0, + "grad_norm": 0.05726454257462379, + "language_loss": 0.86823475, + "learning_rate": 0.0008422553841735225, + "loss": 0.87914288, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.31225586, + "step": 1467, + "time_per_iteration": 2.4838902950286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087334, + "balance_loss_mlp": 1.05624461, + "epoch": 0.28241631396691036, + "flos": 604629577728.0, + "grad_norm": 0.07863392491108157, + "language_loss": 0.8442952, + "learning_rate": 0.0008420282025501757, + "loss": 0.85516858, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.31054688, + "step": 1468, + "time_per_iteration": 2.7528913021087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108248, + "balance_loss_mlp": 1.05169988, + "epoch": 0.2826086956521739, + "flos": 572698529280.0, + "grad_norm": 0.056003117579575636, + "language_loss": 0.852718, + "learning_rate": 0.0008418008881397043, + "loss": 0.86354285, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.30737305, + "step": 1469, + "time_per_iteration": 2.6801319122314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078886, + "balance_loss_mlp": 1.0479157, + "epoch": 0.2828010773374375, + "flos": 842367886848.0, + "grad_norm": 0.04937894089719141, + "language_loss": 0.82587177, + "learning_rate": 0.0008415734410303595, + "loss": 0.83666062, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.30932617, + "step": 1470, + "time_per_iteration": 3.1880481243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076388, + "balance_loss_mlp": 1.04551327, + "epoch": 0.28299345902270107, + "flos": 542402860032.0, + "grad_norm": 0.053571151454841835, + "language_loss": 0.90790403, + "learning_rate": 0.0008413458613104444, + "loss": 0.91866791, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.30834961, + "step": 1471, + "time_per_iteration": 2.6801347732543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107956, + "balance_loss_mlp": 1.04832768, + "epoch": 0.2831858407079646, + "flos": 571320635904.0, + "grad_norm": 0.054274543729309115, + "language_loss": 0.82964969, + "learning_rate": 0.0008411181490683129, + "loss": 0.84044528, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.31201172, + "step": 1472, + "time_per_iteration": 2.732304096221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107702, + "balance_loss_mlp": 1.04619205, + "epoch": 0.2833782223932282, + "flos": 763491861504.0, + "grad_norm": 0.05901735675502878, + "language_loss": 0.82318664, + "learning_rate": 0.0008408903043923707, + "loss": 0.83395684, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.30786133, + "step": 1473, + "time_per_iteration": 3.0503528118133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079988, + "balance_loss_mlp": 1.04906487, + "epoch": 0.2835706040784917, + "flos": 538793068032.0, + "grad_norm": 0.06313039437285956, + "language_loss": 0.81015414, + "learning_rate": 0.0008406623273710754, + "loss": 0.82095402, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.30883789, + "step": 1474, + "time_per_iteration": 2.606189727783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081508, + "balance_loss_mlp": 1.05008459, + "epoch": 0.2837629857637553, + "flos": 530329420800.0, + "grad_norm": 0.06295911479055617, + "language_loss": 0.82597101, + "learning_rate": 0.0008404342180929351, + "loss": 0.83678609, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.31396484, + "step": 1475, + "time_per_iteration": 2.620607614517212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073574, + "balance_loss_mlp": 1.04222226, + "epoch": 0.28395536744901884, + "flos": 539763526656.0, + "grad_norm": 0.06425181584365489, + "language_loss": 0.81938702, + "learning_rate": 0.00084020597664651, + "loss": 0.83012277, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.31323242, + "step": 1476, + "time_per_iteration": 2.7725043296813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083628, + "balance_loss_mlp": 1.05232406, + "epoch": 0.2841477491342824, + "flos": 573344510976.0, + "grad_norm": 0.06074887859321084, + "language_loss": 0.83907133, + "learning_rate": 0.0008399776031204111, + "loss": 0.84990764, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.31274414, + "step": 1477, + "time_per_iteration": 2.7300467491149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092258, + "balance_loss_mlp": 1.06081057, + "epoch": 0.28434013081954596, + "flos": 571802264064.0, + "grad_norm": 0.05838491012274946, + "language_loss": 0.80185568, + "learning_rate": 0.0008397490976033009, + "loss": 0.81277823, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.31420898, + "step": 1478, + "time_per_iteration": 2.650667905807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080543, + "balance_loss_mlp": 1.062042, + "epoch": 0.28453251250480954, + "flos": 1552554832896.0, + "grad_norm": 0.03640521186287318, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78960192, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.18457031, + "step": 1479, + "time_per_iteration": 4.764774322509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108592, + "balance_loss_mlp": 1.07654858, + "epoch": 0.28472489419007313, + "flos": 748720862208.0, + "grad_norm": 0.05702144306517339, + "language_loss": 0.85150903, + "learning_rate": 0.0008392916909509525, + "loss": 0.86259496, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.3203125, + "step": 1480, + "time_per_iteration": 3.0437960624694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104226, + "balance_loss_mlp": 1.07289815, + "epoch": 0.28491727587533666, + "flos": 489914376192.0, + "grad_norm": 0.06780557774925215, + "language_loss": 0.84802043, + "learning_rate": 0.0008390627899932954, + "loss": 0.85906273, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.31298828, + "step": 1481, + "time_per_iteration": 2.596781015396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100766, + "balance_loss_mlp": 1.0693903, + "epoch": 0.28510965756060025, + "flos": 728671196160.0, + "grad_norm": 0.07875184362779108, + "language_loss": 0.88996881, + "learning_rate": 0.000838833757399789, + "loss": 0.90097642, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.31347656, + "step": 1482, + "time_per_iteration": 2.94795560836792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084024, + "balance_loss_mlp": 1.05274367, + "epoch": 0.2853020392458638, + "flos": 551300083200.0, + "grad_norm": 0.07597770471398792, + "language_loss": 0.80484587, + "learning_rate": 0.0008386045932593515, + "loss": 0.81568611, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.3125, + "step": 1483, + "time_per_iteration": 2.6795289516448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079675, + "balance_loss_mlp": 1.0484184, + "epoch": 0.28549442093112737, + "flos": 754456425984.0, + "grad_norm": 0.05859914190414705, + "language_loss": 0.86136287, + "learning_rate": 0.0008383752976609525, + "loss": 0.8721596, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.31225586, + "step": 1484, + "time_per_iteration": 2.900468349456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080474, + "balance_loss_mlp": 1.04878783, + "epoch": 0.2856868026163909, + "flos": 538311439872.0, + "grad_norm": 0.0559282187978278, + "language_loss": 0.80215633, + "learning_rate": 0.0008381458706936123, + "loss": 0.81296104, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.31665039, + "step": 1485, + "time_per_iteration": 2.6815216541290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081909, + "balance_loss_mlp": 1.05031872, + "epoch": 0.2858791843016545, + "flos": 583487207424.0, + "grad_norm": 0.06658109550051822, + "language_loss": 0.87213105, + "learning_rate": 0.0008379163124464025, + "loss": 0.88295019, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.31567383, + "step": 1486, + "time_per_iteration": 2.7246947288513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098145, + "balance_loss_mlp": 1.06572032, + "epoch": 0.286071565986918, + "flos": 644503357440.0, + "grad_norm": 0.06266105362217729, + "language_loss": 0.76595891, + "learning_rate": 0.0008376866230084452, + "loss": 0.77694035, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.32421875, + "step": 1487, + "time_per_iteration": 2.8626444339752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102537, + "balance_loss_mlp": 1.07006407, + "epoch": 0.2862639476721816, + "flos": 491120561664.0, + "grad_norm": 0.07368717199594518, + "language_loss": 0.86109662, + "learning_rate": 0.000837456802468914, + "loss": 0.87212193, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.32470703, + "step": 1488, + "time_per_iteration": 2.5964457988739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109506, + "balance_loss_mlp": 1.07736683, + "epoch": 0.2864563293574452, + "flos": 521363796480.0, + "grad_norm": 0.0834333673185767, + "language_loss": 0.85148358, + "learning_rate": 0.0008372268509170331, + "loss": 0.86257863, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.32128906, + "step": 1489, + "time_per_iteration": 2.690129518508911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109667, + "balance_loss_mlp": 1.06500769, + "epoch": 0.2866487110427087, + "flos": 546834723840.0, + "grad_norm": 0.06354137393554884, + "language_loss": 0.84668255, + "learning_rate": 0.0008369967684420779, + "loss": 0.85764927, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.31640625, + "step": 1490, + "time_per_iteration": 2.71195912361145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084028, + "balance_loss_mlp": 1.0523901, + "epoch": 0.2868410927279723, + "flos": 481977437184.0, + "grad_norm": 0.054809792311278624, + "language_loss": 0.84395373, + "learning_rate": 0.0008367665551333736, + "loss": 0.85479403, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.31616211, + "step": 1491, + "time_per_iteration": 2.604795217514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083682, + "balance_loss_mlp": 1.05223465, + "epoch": 0.28703347441323585, + "flos": 724578365952.0, + "grad_norm": 0.06594588712207736, + "language_loss": 0.85254663, + "learning_rate": 0.0008365362110802977, + "loss": 0.86338341, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.31420898, + "step": 1492, + "time_per_iteration": 2.8853299617767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086881, + "balance_loss_mlp": 1.05619645, + "epoch": 0.28722585609849943, + "flos": 634670581248.0, + "grad_norm": 0.057648204576232445, + "language_loss": 0.82509673, + "learning_rate": 0.0008363057363722773, + "loss": 0.83596557, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.30664062, + "step": 1493, + "time_per_iteration": 2.8410117626190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088416, + "balance_loss_mlp": 1.05916238, + "epoch": 0.28741823778376296, + "flos": 509974216704.0, + "grad_norm": 0.06315135639172008, + "language_loss": 0.8381595, + "learning_rate": 0.0008360751310987906, + "loss": 0.84904373, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.29199219, + "step": 1494, + "time_per_iteration": 2.6032519340515137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088448, + "balance_loss_mlp": 1.05821633, + "epoch": 0.28761061946902655, + "flos": 603458297856.0, + "grad_norm": 0.0504042487563093, + "language_loss": 0.85491359, + "learning_rate": 0.0008358443953493666, + "loss": 0.865798, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.30175781, + "step": 1495, + "time_per_iteration": 2.859473943710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095118, + "balance_loss_mlp": 1.06586444, + "epoch": 0.28780300115429014, + "flos": 406977454080.0, + "grad_norm": 0.05765908021852543, + "language_loss": 0.87930727, + "learning_rate": 0.0008356135292135851, + "loss": 0.89025843, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.29223633, + "step": 1496, + "time_per_iteration": 2.5534088611602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092831, + "balance_loss_mlp": 1.06357718, + "epoch": 0.28799538283955367, + "flos": 374726310912.0, + "grad_norm": 0.06886872222290924, + "language_loss": 0.91869086, + "learning_rate": 0.0008353825327810758, + "loss": 0.92961913, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.29223633, + "step": 1497, + "time_per_iteration": 2.4516804218292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109953, + "balance_loss_mlp": 1.0700376, + "epoch": 0.28818776452481726, + "flos": 591645316608.0, + "grad_norm": 0.06787386534843613, + "language_loss": 0.81638563, + "learning_rate": 0.00083515140614152, + "loss": 0.8273809, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.29467773, + "step": 1498, + "time_per_iteration": 2.6799356937408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100433, + "balance_loss_mlp": 1.07136989, + "epoch": 0.2883801462100808, + "flos": 534819511296.0, + "grad_norm": 0.07094138317708479, + "language_loss": 0.861467, + "learning_rate": 0.0008349201493846485, + "loss": 0.87247133, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.2902832, + "step": 1499, + "time_per_iteration": 2.6408841609954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101106, + "balance_loss_mlp": 1.07190013, + "epoch": 0.2885725278953444, + "flos": 479850255360.0, + "grad_norm": 0.05864167405563355, + "language_loss": 0.88756049, + "learning_rate": 0.0008346887626002432, + "loss": 0.89857149, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.29174805, + "step": 1500, + "time_per_iteration": 2.527707099914551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102169, + "balance_loss_mlp": 1.07277215, + "epoch": 0.2887649095806079, + "flos": 463798877184.0, + "grad_norm": 0.05528939811548228, + "language_loss": 0.8596012, + "learning_rate": 0.000834457245878137, + "loss": 0.87062287, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.29345703, + "step": 1501, + "time_per_iteration": 2.6287105083465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097625, + "balance_loss_mlp": 1.0678941, + "epoch": 0.2889572912658715, + "flos": 930631527936.0, + "grad_norm": 0.05829487367290223, + "language_loss": 0.81370407, + "learning_rate": 0.000834225599308212, + "loss": 0.82468033, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.296875, + "step": 1502, + "time_per_iteration": 3.2405459880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097665, + "balance_loss_mlp": 1.06762409, + "epoch": 0.28914967295113503, + "flos": 569848200192.0, + "grad_norm": 0.0632270740356206, + "language_loss": 0.85299563, + "learning_rate": 0.0008339938229804016, + "loss": 0.86397231, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.30029297, + "step": 1503, + "time_per_iteration": 2.736917495727539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238462, + "balance_loss_mlp": 1.22091448, + "epoch": 0.2893420546363986, + "flos": 1485803119104.0, + "grad_norm": 0.0713987899259734, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76673281, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.17578125, + "step": 1504, + "time_per_iteration": 4.942230701446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085898, + "balance_loss_mlp": 1.0553329, + "epoch": 0.2895344363216622, + "flos": 469938903552.0, + "grad_norm": 0.06317842242163065, + "language_loss": 0.83872586, + "learning_rate": 0.0008335298814111094, + "loss": 0.84958482, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.30517578, + "step": 1505, + "time_per_iteration": 2.552032232284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082854, + "balance_loss_mlp": 1.05138254, + "epoch": 0.28972681800692573, + "flos": 647909508096.0, + "grad_norm": 0.05888591645587949, + "language_loss": 0.87955916, + "learning_rate": 0.0008332977163497455, + "loss": 0.89038765, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.31445312, + "step": 1506, + "time_per_iteration": 2.792531728744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080802, + "balance_loss_mlp": 1.0491637, + "epoch": 0.2899191996921893, + "flos": 571955033088.0, + "grad_norm": 0.058262801056698586, + "language_loss": 0.83412617, + "learning_rate": 0.0008330654218907325, + "loss": 0.84493423, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.31616211, + "step": 1507, + "time_per_iteration": 2.67161226272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082791, + "balance_loss_mlp": 1.05151033, + "epoch": 0.29011158137745285, + "flos": 661037773824.0, + "grad_norm": 0.053562219876337476, + "language_loss": 0.8135345, + "learning_rate": 0.0008328329981242548, + "loss": 0.8243624, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.3125, + "step": 1508, + "time_per_iteration": 2.8886146545410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082272, + "balance_loss_mlp": 1.05006218, + "epoch": 0.29030396306271644, + "flos": 535933974528.0, + "grad_norm": 0.059525688681207785, + "language_loss": 0.87796283, + "learning_rate": 0.0008326004451405475, + "loss": 0.88878554, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.32202148, + "step": 1509, + "time_per_iteration": 2.7613890171051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081166, + "balance_loss_mlp": 1.04919386, + "epoch": 0.29049634474798, + "flos": 511707110400.0, + "grad_norm": 0.06566805569484924, + "language_loss": 0.82636976, + "learning_rate": 0.0008323677630298957, + "loss": 0.83718145, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.31958008, + "step": 1510, + "time_per_iteration": 2.5723018646240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082627, + "balance_loss_mlp": 1.0500108, + "epoch": 0.29068872643324356, + "flos": 613454017536.0, + "grad_norm": 0.0587639353811087, + "language_loss": 0.84588593, + "learning_rate": 0.0008321349518826345, + "loss": 0.85671222, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.32617188, + "step": 1511, + "time_per_iteration": 2.7943453788757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085904, + "balance_loss_mlp": 1.05417013, + "epoch": 0.2908811081185071, + "flos": 546164011008.0, + "grad_norm": 0.07149106056529789, + "language_loss": 0.94572604, + "learning_rate": 0.0008319020117891491, + "loss": 0.95658505, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.31713867, + "step": 1512, + "time_per_iteration": 2.6216046810150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083214, + "balance_loss_mlp": 1.05095613, + "epoch": 0.2910734898037707, + "flos": 604516096512.0, + "grad_norm": 0.062137158428294176, + "language_loss": 0.87139338, + "learning_rate": 0.0008316689428398751, + "loss": 0.88222551, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.32250977, + "step": 1513, + "time_per_iteration": 2.7016332149505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082979, + "balance_loss_mlp": 1.05217493, + "epoch": 0.29126587148903427, + "flos": 574383370752.0, + "grad_norm": 0.048438835392173675, + "language_loss": 0.88380623, + "learning_rate": 0.0008314357451252979, + "loss": 0.89463598, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.30761719, + "step": 1514, + "time_per_iteration": 2.7707033157348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108486, + "balance_loss_mlp": 1.05329311, + "epoch": 0.2914582531742978, + "flos": 570802692096.0, + "grad_norm": 0.17247024929444854, + "language_loss": 0.87881547, + "learning_rate": 0.0008312024187359527, + "loss": 0.88966405, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.31542969, + "step": 1515, + "time_per_iteration": 2.6432881355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071467, + "balance_loss_mlp": 1.04083025, + "epoch": 0.2916506348595614, + "flos": 730523363328.0, + "grad_norm": 0.05532389066983382, + "language_loss": 0.86925149, + "learning_rate": 0.000830968963762425, + "loss": 0.8799662, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.3059082, + "step": 1516, + "time_per_iteration": 3.024911403656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070719, + "balance_loss_mlp": 1.03955793, + "epoch": 0.2918430165448249, + "flos": 510220118016.0, + "grad_norm": 0.06371457252332635, + "language_loss": 0.83926201, + "learning_rate": 0.0008307353802953497, + "loss": 0.84996927, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.3112793, + "step": 1517, + "time_per_iteration": 2.6853716373443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072896, + "balance_loss_mlp": 1.04202044, + "epoch": 0.2920353982300885, + "flos": 630096122880.0, + "grad_norm": 0.04882989118503786, + "language_loss": 0.86122108, + "learning_rate": 0.0008305016684254125, + "loss": 0.87195003, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.30859375, + "step": 1518, + "time_per_iteration": 2.799062728881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077466, + "balance_loss_mlp": 1.04589891, + "epoch": 0.29222777991535204, + "flos": 501411644928.0, + "grad_norm": 0.06769299348115199, + "language_loss": 0.86794329, + "learning_rate": 0.0008302678282433479, + "loss": 0.87871796, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.31542969, + "step": 1519, + "time_per_iteration": 2.607813835144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079374, + "balance_loss_mlp": 1.0473547, + "epoch": 0.2924201616006156, + "flos": 486522782208.0, + "grad_norm": 0.06836141022194388, + "language_loss": 0.84857148, + "learning_rate": 0.0008300338598399411, + "loss": 0.85936522, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.32006836, + "step": 1520, + "time_per_iteration": 2.6339783668518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079776, + "balance_loss_mlp": 1.04677844, + "epoch": 0.2926125432858792, + "flos": 476211350016.0, + "grad_norm": 0.07756319993269217, + "language_loss": 0.94405806, + "learning_rate": 0.0008297997633060263, + "loss": 0.9548558, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.33007812, + "step": 1521, + "time_per_iteration": 2.534118175506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072717, + "balance_loss_mlp": 1.03991103, + "epoch": 0.29280492497114274, + "flos": 676379151360.0, + "grad_norm": 0.05829817081366362, + "language_loss": 0.85078239, + "learning_rate": 0.0008295655387324883, + "loss": 0.86150956, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.328125, + "step": 1522, + "time_per_iteration": 2.8296775817871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072427, + "balance_loss_mlp": 1.04031241, + "epoch": 0.29299730665640633, + "flos": 458175384576.0, + "grad_norm": 0.07682732219120929, + "language_loss": 0.8501184, + "learning_rate": 0.0008293311862102609, + "loss": 0.8608427, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.32104492, + "step": 1523, + "time_per_iteration": 2.5440309047698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077196, + "balance_loss_mlp": 1.044366, + "epoch": 0.29318968834166986, + "flos": 446343464448.0, + "grad_norm": 0.0685602534850527, + "language_loss": 0.88674849, + "learning_rate": 0.0008290967058303275, + "loss": 0.89752042, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.32836914, + "step": 1524, + "time_per_iteration": 2.47611403465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107333, + "balance_loss_mlp": 1.04138136, + "epoch": 0.29338207002693345, + "flos": 450085676544.0, + "grad_norm": 0.06274350285183052, + "language_loss": 0.86149156, + "learning_rate": 0.0008288620976837219, + "loss": 0.87222481, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.31933594, + "step": 1525, + "time_per_iteration": 2.497141122817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076595, + "balance_loss_mlp": 1.04409802, + "epoch": 0.293574451712197, + "flos": 502027103232.0, + "grad_norm": 0.056882926132582716, + "language_loss": 0.82547259, + "learning_rate": 0.000828627361861527, + "loss": 0.8362385, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.32495117, + "step": 1526, + "time_per_iteration": 2.567631959915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074146, + "balance_loss_mlp": 1.04157782, + "epoch": 0.29376683339746057, + "flos": 696158184960.0, + "grad_norm": 0.06286177552115993, + "language_loss": 0.84273493, + "learning_rate": 0.0008283924984548752, + "loss": 0.85347635, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.32568359, + "step": 1527, + "time_per_iteration": 2.8300318717956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075655, + "balance_loss_mlp": 1.04270601, + "epoch": 0.2939592150827241, + "flos": 478353088512.0, + "grad_norm": 0.05246647038375997, + "language_loss": 0.84726572, + "learning_rate": 0.0008281575075549485, + "loss": 0.85802233, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.32958984, + "step": 1528, + "time_per_iteration": 2.574363946914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144512, + "balance_loss_mlp": 1.12400758, + "epoch": 0.2941515967679877, + "flos": 1484482042368.0, + "grad_norm": 0.05743835109314035, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78497207, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.20507812, + "step": 1529, + "time_per_iteration": 4.712693452835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085379, + "balance_loss_mlp": 1.05316901, + "epoch": 0.2943439784532513, + "flos": 673848916992.0, + "grad_norm": 0.06778682509264199, + "language_loss": 0.90275097, + "learning_rate": 0.0008276871436402469, + "loss": 0.9136048, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.32202148, + "step": 1530, + "time_per_iteration": 2.839327812194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098938, + "balance_loss_mlp": 1.06801534, + "epoch": 0.2945363601385148, + "flos": 576031896576.0, + "grad_norm": 0.05712547612295055, + "language_loss": 0.87684029, + "learning_rate": 0.000827451770808083, + "loss": 0.88782966, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.30908203, + "step": 1531, + "time_per_iteration": 2.6601221561431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101215, + "balance_loss_mlp": 1.06921971, + "epoch": 0.2947287418237784, + "flos": 480416251392.0, + "grad_norm": 0.06660356736231628, + "language_loss": 0.82939392, + "learning_rate": 0.0008272162708478674, + "loss": 0.84040606, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.31982422, + "step": 1532, + "time_per_iteration": 2.5689916610717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093792, + "balance_loss_mlp": 1.06234503, + "epoch": 0.2949211235090419, + "flos": 557917355520.0, + "grad_norm": 0.09954158315547566, + "language_loss": 0.86026615, + "learning_rate": 0.000826980643851029, + "loss": 0.87120402, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.31420898, + "step": 1533, + "time_per_iteration": 2.668490409851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096886, + "balance_loss_mlp": 1.06560588, + "epoch": 0.2951135051943055, + "flos": 483646311936.0, + "grad_norm": 0.06068587162994625, + "language_loss": 0.84473491, + "learning_rate": 0.0008267448899090464, + "loss": 0.85570371, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.3125, + "step": 1534, + "time_per_iteration": 2.5667166709899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111174, + "balance_loss_mlp": 1.08053756, + "epoch": 0.29530588687956905, + "flos": 550015322112.0, + "grad_norm": 0.07629507960375684, + "language_loss": 0.80660546, + "learning_rate": 0.0008265090091134473, + "loss": 0.81771713, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.3059082, + "step": 1535, + "time_per_iteration": 2.8708250522613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108767, + "balance_loss_mlp": 1.07793915, + "epoch": 0.29549826856483263, + "flos": 672731481600.0, + "grad_norm": 0.06117244877185189, + "language_loss": 0.80140841, + "learning_rate": 0.0008262730015558088, + "loss": 0.81249607, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.30786133, + "step": 1536, + "time_per_iteration": 2.872954845428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100004, + "balance_loss_mlp": 1.06960511, + "epoch": 0.29569065025009617, + "flos": 764300786688.0, + "grad_norm": 0.058742702923310866, + "language_loss": 0.82196116, + "learning_rate": 0.0008260368673277574, + "loss": 0.8329612, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.3034668, + "step": 1537, + "time_per_iteration": 3.1321218013763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099555, + "balance_loss_mlp": 1.06963336, + "epoch": 0.29588303193535975, + "flos": 543398049792.0, + "grad_norm": 0.0781542924594719, + "language_loss": 0.83699298, + "learning_rate": 0.0008258006065209682, + "loss": 0.84798855, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.29882812, + "step": 1538, + "time_per_iteration": 2.7713711261749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108634, + "balance_loss_mlp": 1.0791415, + "epoch": 0.29607541362062334, + "flos": 596648968704.0, + "grad_norm": 0.060396297474130736, + "language_loss": 0.80198979, + "learning_rate": 0.0008255642192271657, + "loss": 0.81307614, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.29443359, + "step": 1539, + "time_per_iteration": 2.770426034927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104575, + "balance_loss_mlp": 1.07525003, + "epoch": 0.29626779530588687, + "flos": 609588149760.0, + "grad_norm": 0.061957869610313854, + "language_loss": 0.8370012, + "learning_rate": 0.0008253277055381241, + "loss": 0.8480469, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.29296875, + "step": 1540, + "time_per_iteration": 2.818236827850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101049, + "balance_loss_mlp": 1.07196212, + "epoch": 0.29646017699115046, + "flos": 867050237952.0, + "grad_norm": 0.0808235318545815, + "language_loss": 0.85973728, + "learning_rate": 0.0008250910655456658, + "loss": 0.8707478, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.29052734, + "step": 1541, + "time_per_iteration": 3.122596502304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097236, + "balance_loss_mlp": 1.06888783, + "epoch": 0.296652558676414, + "flos": 495616444416.0, + "grad_norm": 0.06915250684599016, + "language_loss": 0.83763367, + "learning_rate": 0.0008248542993416625, + "loss": 0.84860599, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.28369141, + "step": 1542, + "time_per_iteration": 2.5910961627960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093651, + "balance_loss_mlp": 1.06408739, + "epoch": 0.2968449403616776, + "flos": 571275555840.0, + "grad_norm": 0.05605218699384054, + "language_loss": 0.8378318, + "learning_rate": 0.0008246174070180352, + "loss": 0.84876835, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.29516602, + "step": 1543, + "time_per_iteration": 2.6633899211883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010919, + "balance_loss_mlp": 1.06312323, + "epoch": 0.2970373220469411, + "flos": 793799115264.0, + "grad_norm": 0.07006000939384768, + "language_loss": 0.83787405, + "learning_rate": 0.0008243803886667537, + "loss": 0.84879309, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.28759766, + "step": 1544, + "time_per_iteration": 3.114450216293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092222, + "balance_loss_mlp": 1.0623486, + "epoch": 0.2972297037322047, + "flos": 660736617984.0, + "grad_norm": 0.06063612617340172, + "language_loss": 0.78866625, + "learning_rate": 0.0008241432443798364, + "loss": 0.79958844, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.2980957, + "step": 1545, + "time_per_iteration": 2.830487012863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095453, + "balance_loss_mlp": 1.06491208, + "epoch": 0.29742208541746823, + "flos": 596849789952.0, + "grad_norm": 0.05072672460675934, + "language_loss": 0.85210156, + "learning_rate": 0.0008239059742493512, + "loss": 0.86305606, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.30493164, + "step": 1546, + "time_per_iteration": 2.7311577796936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096869, + "balance_loss_mlp": 1.06654167, + "epoch": 0.2976144671027318, + "flos": 769519816704.0, + "grad_norm": 0.06216195389248957, + "language_loss": 0.87149853, + "learning_rate": 0.0008236685783674142, + "loss": 0.88246721, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.30273438, + "step": 1547, + "time_per_iteration": 3.122184991836548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195158, + "balance_loss_mlp": 1.17408168, + "epoch": 0.2978068487879954, + "flos": 1483980065280.0, + "grad_norm": 0.0711099730375168, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77416348, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.2109375, + "step": 1548, + "time_per_iteration": 4.884527683258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112807, + "balance_loss_mlp": 1.08190823, + "epoch": 0.29799923047325894, + "flos": 475079357952.0, + "grad_norm": 0.0721948840315393, + "language_loss": 0.82155961, + "learning_rate": 0.0008231934097178955, + "loss": 0.83268768, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.30859375, + "step": 1549, + "time_per_iteration": 2.6486034393310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099898, + "balance_loss_mlp": 1.06845081, + "epoch": 0.2981916121585225, + "flos": 759464460288.0, + "grad_norm": 0.06744191732210313, + "language_loss": 0.85654205, + "learning_rate": 0.0008229556371347903, + "loss": 0.86754102, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.31420898, + "step": 1550, + "time_per_iteration": 2.973072052001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096299, + "balance_loss_mlp": 1.06530416, + "epoch": 0.29838399384378606, + "flos": 874642351104.0, + "grad_norm": 0.063776129703287, + "language_loss": 0.79039407, + "learning_rate": 0.0008227177391691874, + "loss": 0.80135703, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.30957031, + "step": 1551, + "time_per_iteration": 3.121493339538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091, + "balance_loss_mlp": 1.05948138, + "epoch": 0.29857637552904964, + "flos": 579389995008.0, + "grad_norm": 0.06994546641795159, + "language_loss": 0.89363164, + "learning_rate": 0.0008224797159134463, + "loss": 0.90454161, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.31494141, + "step": 1552, + "time_per_iteration": 2.714345932006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085005, + "balance_loss_mlp": 1.05272293, + "epoch": 0.2987687572143132, + "flos": 836048950272.0, + "grad_norm": 0.0687696840960861, + "language_loss": 0.83498526, + "learning_rate": 0.0008222415674599765, + "loss": 0.84583527, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.32275391, + "step": 1553, + "time_per_iteration": 3.0709471702575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108737, + "balance_loss_mlp": 1.05482578, + "epoch": 0.29896113889957676, + "flos": 566800022016.0, + "grad_norm": 0.05942841135237563, + "language_loss": 0.83069479, + "learning_rate": 0.0008220032939012349, + "loss": 0.84156853, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.32543945, + "step": 1554, + "time_per_iteration": 2.6579041481018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084574, + "balance_loss_mlp": 1.05069458, + "epoch": 0.29915352058484035, + "flos": 498370669056.0, + "grad_norm": 0.05066559322117623, + "language_loss": 0.87862611, + "learning_rate": 0.0008217648953297277, + "loss": 0.88947189, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.33886719, + "step": 1555, + "time_per_iteration": 2.854501962661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080889, + "balance_loss_mlp": 1.04836845, + "epoch": 0.2993459022701039, + "flos": 591837373440.0, + "grad_norm": 0.06306800858294438, + "language_loss": 0.78177649, + "learning_rate": 0.0008215263718380095, + "loss": 0.79258537, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.32519531, + "step": 1556, + "time_per_iteration": 2.679813861846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072521, + "balance_loss_mlp": 1.03988135, + "epoch": 0.29953828395536747, + "flos": 572107802112.0, + "grad_norm": 0.05857921257987888, + "language_loss": 0.84453404, + "learning_rate": 0.0008212877235186833, + "loss": 0.8552593, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.32641602, + "step": 1557, + "time_per_iteration": 2.674333333969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074496, + "balance_loss_mlp": 1.0575211, + "epoch": 0.299730665640631, + "flos": 1503855051264.0, + "grad_norm": 0.03849586533955073, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78812063, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.16992188, + "step": 1558, + "time_per_iteration": 4.915595531463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073624, + "balance_loss_mlp": 1.04193807, + "epoch": 0.2999230473258946, + "flos": 513538928640.0, + "grad_norm": 0.06731849387550101, + "language_loss": 0.80882478, + "learning_rate": 0.0008208100527678611, + "loss": 0.81956106, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.31665039, + "step": 1559, + "time_per_iteration": 2.584726572036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107407, + "balance_loss_mlp": 1.04162097, + "epoch": 0.3001154290111581, + "flos": 834128381952.0, + "grad_norm": 0.07382200765663921, + "language_loss": 0.78279877, + "learning_rate": 0.0008205710305218135, + "loss": 0.79353946, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.32446289, + "step": 1560, + "time_per_iteration": 3.0383710861206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074163, + "balance_loss_mlp": 1.04302561, + "epoch": 0.3003078106964217, + "flos": 556485617664.0, + "grad_norm": 0.058207727477831525, + "language_loss": 0.89512408, + "learning_rate": 0.0008203318838190541, + "loss": 0.90586567, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.31103516, + "step": 1561, + "time_per_iteration": 2.76627516746521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077695, + "balance_loss_mlp": 1.04662895, + "epoch": 0.30050019238168524, + "flos": 525897556992.0, + "grad_norm": 0.06168132254821995, + "language_loss": 0.85111785, + "learning_rate": 0.0008200926127524281, + "loss": 0.86189479, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.31030273, + "step": 1562, + "time_per_iteration": 2.6629600524902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077501, + "balance_loss_mlp": 1.04641104, + "epoch": 0.3006925740669488, + "flos": 577582907904.0, + "grad_norm": 0.05613480590592382, + "language_loss": 0.82944739, + "learning_rate": 0.0008198532174148289, + "loss": 0.84022236, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.31054688, + "step": 1563, + "time_per_iteration": 2.7358763217926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059198, + "balance_loss_mlp": 1.042413, + "epoch": 0.3008849557522124, + "flos": 1489408528896.0, + "grad_norm": 0.031593282863211954, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81745368, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.16796875, + "step": 1564, + "time_per_iteration": 4.9148335456848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082495, + "balance_loss_mlp": 1.05264509, + "epoch": 0.30107733743747594, + "flos": 509565371904.0, + "grad_norm": 0.06408713771925002, + "language_loss": 0.88499033, + "learning_rate": 0.0008193740542985244, + "loss": 0.89581525, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.2980957, + "step": 1565, + "time_per_iteration": 2.6895992755889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010784, + "balance_loss_mlp": 1.04955089, + "epoch": 0.30126971912273953, + "flos": 587425858560.0, + "grad_norm": 0.05458149708053591, + "language_loss": 0.86310005, + "learning_rate": 0.0008191342867058467, + "loss": 0.87388408, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.28833008, + "step": 1566, + "time_per_iteration": 2.7972991466522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087074, + "balance_loss_mlp": 1.05708098, + "epoch": 0.30146210080800306, + "flos": 601822918656.0, + "grad_norm": 0.07332398387540356, + "language_loss": 0.8337127, + "learning_rate": 0.0008188943952142509, + "loss": 0.84458339, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.29931641, + "step": 1567, + "time_per_iteration": 2.7908260822296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090521, + "balance_loss_mlp": 1.06203008, + "epoch": 0.30165448249326665, + "flos": 917424686592.0, + "grad_norm": 0.06528974392408285, + "language_loss": 0.82496703, + "learning_rate": 0.0008186543799168711, + "loss": 0.83587217, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.28491211, + "step": 1568, + "time_per_iteration": 3.1478142738342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090151, + "balance_loss_mlp": 1.06170726, + "epoch": 0.3018468641785302, + "flos": 776953368576.0, + "grad_norm": 0.05489125757590388, + "language_loss": 0.87973905, + "learning_rate": 0.0008184142409068892, + "loss": 0.89064056, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.28466797, + "step": 1569, + "time_per_iteration": 3.0216779708862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085926, + "balance_loss_mlp": 1.05767381, + "epoch": 0.30203924586379377, + "flos": 522101500416.0, + "grad_norm": 0.055531787765466835, + "language_loss": 0.86334872, + "learning_rate": 0.000818173978277536, + "loss": 0.87420803, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.2824707, + "step": 1570, + "time_per_iteration": 2.679858922958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092107, + "balance_loss_mlp": 1.06378245, + "epoch": 0.3022316275490573, + "flos": 524288318976.0, + "grad_norm": 0.07890485552513911, + "language_loss": 0.83764422, + "learning_rate": 0.000817933592122089, + "loss": 0.84856522, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.28344727, + "step": 1571, + "time_per_iteration": 2.7156453132629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097909, + "balance_loss_mlp": 1.06936991, + "epoch": 0.3024240092343209, + "flos": 479672755200.0, + "grad_norm": 0.06172775968750255, + "language_loss": 0.83209121, + "learning_rate": 0.0008176930825338749, + "loss": 0.84307027, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.28564453, + "step": 1572, + "time_per_iteration": 2.6125760078430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092858, + "balance_loss_mlp": 1.06474876, + "epoch": 0.3026163909195845, + "flos": 686901579264.0, + "grad_norm": 0.07609523017386281, + "language_loss": 0.88406599, + "learning_rate": 0.0008174524496062679, + "loss": 0.8949945, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.28100586, + "step": 1573, + "time_per_iteration": 2.9266738891601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093192, + "balance_loss_mlp": 1.06472516, + "epoch": 0.302808772604848, + "flos": 542654553600.0, + "grad_norm": 0.061281594343297996, + "language_loss": 0.85176635, + "learning_rate": 0.0008172116934326894, + "loss": 0.86269826, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.28466797, + "step": 1574, + "time_per_iteration": 2.78182315826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093702, + "balance_loss_mlp": 1.06499696, + "epoch": 0.3030011542901116, + "flos": 474852395520.0, + "grad_norm": 0.061003462460527645, + "language_loss": 0.87581599, + "learning_rate": 0.0008169708141066097, + "loss": 0.88675308, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.28686523, + "step": 1575, + "time_per_iteration": 2.579521894454956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095615, + "balance_loss_mlp": 1.06631374, + "epoch": 0.30319353597537513, + "flos": 481233940992.0, + "grad_norm": 0.06494361929352876, + "language_loss": 0.90285015, + "learning_rate": 0.0008167298117215465, + "loss": 0.91380632, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.29272461, + "step": 1576, + "time_per_iteration": 2.576373815536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109664, + "balance_loss_mlp": 1.06729078, + "epoch": 0.3033859176606387, + "flos": 704455916544.0, + "grad_norm": 0.06029453435911351, + "language_loss": 0.87511861, + "learning_rate": 0.0008164886863710649, + "loss": 0.88608503, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.29296875, + "step": 1577, + "time_per_iteration": 2.913679599761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097606, + "balance_loss_mlp": 1.06847095, + "epoch": 0.30357829934590225, + "flos": 764344456704.0, + "grad_norm": 0.06219192746352704, + "language_loss": 0.86087388, + "learning_rate": 0.0008162474381487783, + "loss": 0.87184995, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.29101562, + "step": 1578, + "time_per_iteration": 3.0120038986206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089575, + "balance_loss_mlp": 1.05979693, + "epoch": 0.30377068103116583, + "flos": 532082663424.0, + "grad_norm": 0.07133259007734825, + "language_loss": 0.84352636, + "learning_rate": 0.0008160060671483475, + "loss": 0.85442215, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.29711914, + "step": 1579, + "time_per_iteration": 2.6448450088500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087505, + "balance_loss_mlp": 1.05729711, + "epoch": 0.3039630627164294, + "flos": 509934928896.0, + "grad_norm": 0.06969729270721756, + "language_loss": 0.83291966, + "learning_rate": 0.0008157645734634809, + "loss": 0.8437947, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.30200195, + "step": 1580, + "time_per_iteration": 2.623994827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219684, + "balance_loss_mlp": 1.20118308, + "epoch": 0.30415544440169295, + "flos": 1505206803456.0, + "grad_norm": 0.06785469110901753, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78116179, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.18457031, + "step": 1581, + "time_per_iteration": 4.945984840393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134498, + "balance_loss_mlp": 1.11723626, + "epoch": 0.30434782608695654, + "flos": 1457976637440.0, + "grad_norm": 0.04727039603147748, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74348998, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.17285156, + "step": 1582, + "time_per_iteration": 4.907581567764282 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094198, + "balance_loss_mlp": 1.06482506, + "epoch": 0.3045402077722201, + "flos": 482312088576.0, + "grad_norm": 0.06103997784231323, + "language_loss": 0.83613545, + "learning_rate": 0.000815039357240067, + "loss": 0.84707743, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.29345703, + "step": 1583, + "time_per_iteration": 2.6569504737854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098856, + "balance_loss_mlp": 1.07053173, + "epoch": 0.30473258945748366, + "flos": 543220549632.0, + "grad_norm": 0.05926881191118497, + "language_loss": 0.85445809, + "learning_rate": 0.0008147973737554952, + "loss": 0.86544669, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.28344727, + "step": 1584, + "time_per_iteration": 2.8048319816589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105359, + "balance_loss_mlp": 1.07682085, + "epoch": 0.3049249711427472, + "flos": 566789847552.0, + "grad_norm": 0.06192456547731419, + "language_loss": 0.85451925, + "learning_rate": 0.000814555268055744, + "loss": 0.86557281, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.28540039, + "step": 1585, + "time_per_iteration": 2.6496644020080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111676, + "balance_loss_mlp": 1.08265996, + "epoch": 0.3051173528280108, + "flos": 527970894336.0, + "grad_norm": 0.06812003210241727, + "language_loss": 0.87046736, + "learning_rate": 0.0008143130402348073, + "loss": 0.88158417, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.28979492, + "step": 1586, + "time_per_iteration": 2.6643214225769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105818, + "balance_loss_mlp": 1.07644498, + "epoch": 0.3053097345132743, + "flos": 586097427456.0, + "grad_norm": 0.055468457342214825, + "language_loss": 0.79345113, + "learning_rate": 0.0008140706903867265, + "loss": 0.80450928, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.29345703, + "step": 1587, + "time_per_iteration": 2.793938159942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095768, + "balance_loss_mlp": 1.06610858, + "epoch": 0.3055021161985379, + "flos": 606810604032.0, + "grad_norm": 0.06572122415162869, + "language_loss": 0.90151691, + "learning_rate": 0.0008138282186055897, + "loss": 0.91247463, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.29614258, + "step": 1588, + "time_per_iteration": 2.7083215713500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093617, + "balance_loss_mlp": 1.06414866, + "epoch": 0.3056944978838015, + "flos": 573594794496.0, + "grad_norm": 0.07456080522357873, + "language_loss": 0.82026887, + "learning_rate": 0.0008135856249855331, + "loss": 0.83120513, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.29467773, + "step": 1589, + "time_per_iteration": 2.6640753746032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108636, + "balance_loss_mlp": 1.05720115, + "epoch": 0.305886879569065, + "flos": 633640485888.0, + "grad_norm": 0.06169186885540492, + "language_loss": 0.89804673, + "learning_rate": 0.0008133429096207398, + "loss": 0.90891039, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.29125977, + "step": 1590, + "time_per_iteration": 2.7599587440490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180768, + "balance_loss_mlp": 1.16407835, + "epoch": 0.3060792612543286, + "flos": 1368227414016.0, + "grad_norm": 0.058161185258212886, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76493025, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.16699219, + "step": 1591, + "time_per_iteration": 4.928807973861694 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092058, + "balance_loss_mlp": 1.06149244, + "epoch": 0.30627164293959214, + "flos": 518290887168.0, + "grad_norm": 0.05378358074526122, + "language_loss": 0.86363673, + "learning_rate": 0.0008128571140339123, + "loss": 0.87455726, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.30517578, + "step": 1592, + "time_per_iteration": 2.6374073028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093229, + "balance_loss_mlp": 1.06182945, + "epoch": 0.3064640246248557, + "flos": 455354168832.0, + "grad_norm": 0.059608258439458016, + "language_loss": 0.87261879, + "learning_rate": 0.0008126140340004805, + "loss": 0.88355112, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.3137207, + "step": 1593, + "time_per_iteration": 2.5177900791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106947, + "balance_loss_mlp": 1.07528496, + "epoch": 0.30665640631011926, + "flos": 849718480896.0, + "grad_norm": 0.05384575425533411, + "language_loss": 0.82083076, + "learning_rate": 0.0008123708325995172, + "loss": 0.83190024, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.31640625, + "step": 1594, + "time_per_iteration": 3.230646848678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106567, + "balance_loss_mlp": 1.07466626, + "epoch": 0.30684878799538284, + "flos": 757996406784.0, + "grad_norm": 0.05828956025392548, + "language_loss": 0.79435146, + "learning_rate": 0.0008121275099254414, + "loss": 0.80541706, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.31884766, + "step": 1595, + "time_per_iteration": 2.902198553085327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100784, + "balance_loss_mlp": 1.07000458, + "epoch": 0.3070411696806464, + "flos": 517320428544.0, + "grad_norm": 0.0810481792888773, + "language_loss": 0.87996, + "learning_rate": 0.0008118840660727194, + "loss": 0.89096785, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.30761719, + "step": 1596, + "time_per_iteration": 2.6448442935943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086051, + "balance_loss_mlp": 1.05465174, + "epoch": 0.30723355136590996, + "flos": 843883992576.0, + "grad_norm": 0.06221817840069264, + "language_loss": 0.87278962, + "learning_rate": 0.0008116405011358644, + "loss": 0.88365012, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.3137207, + "step": 1597, + "time_per_iteration": 3.1513490676879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084783, + "balance_loss_mlp": 1.05455184, + "epoch": 0.30742593305117355, + "flos": 465905710080.0, + "grad_norm": 0.05780846158028219, + "language_loss": 0.79670262, + "learning_rate": 0.0008113968152094369, + "loss": 0.80755049, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.30175781, + "step": 1598, + "time_per_iteration": 2.5093207359313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081718, + "balance_loss_mlp": 1.05160582, + "epoch": 0.3076183147364371, + "flos": 686286120960.0, + "grad_norm": 0.05742950260468591, + "language_loss": 0.822034, + "learning_rate": 0.0008111530083880438, + "loss": 0.83285123, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.30078125, + "step": 1599, + "time_per_iteration": 2.9002020359039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083164, + "balance_loss_mlp": 1.05333805, + "epoch": 0.30781069642170067, + "flos": 613729032192.0, + "grad_norm": 0.066825138462863, + "language_loss": 0.86253393, + "learning_rate": 0.0008109090807663399, + "loss": 0.87336552, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.29760742, + "step": 1600, + "time_per_iteration": 2.8091297149658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078593, + "balance_loss_mlp": 1.04921985, + "epoch": 0.3080030781069642, + "flos": 590021521920.0, + "grad_norm": 0.05248494232095894, + "language_loss": 0.88362008, + "learning_rate": 0.0008106650324390257, + "loss": 0.89440602, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.29370117, + "step": 1601, + "time_per_iteration": 2.8476614952087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080904, + "balance_loss_mlp": 1.05072021, + "epoch": 0.3081954597922278, + "flos": 562353601536.0, + "grad_norm": 0.06836714374526962, + "language_loss": 0.81128752, + "learning_rate": 0.0008104208635008493, + "loss": 0.82209659, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.30151367, + "step": 1602, + "time_per_iteration": 2.6952836513519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108624, + "balance_loss_mlp": 1.05665243, + "epoch": 0.3083878414774913, + "flos": 447599112192.0, + "grad_norm": 0.06376665529861299, + "language_loss": 0.81538713, + "learning_rate": 0.0008101765740466058, + "loss": 0.82624954, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.29541016, + "step": 1603, + "time_per_iteration": 2.4948389530181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080977, + "balance_loss_mlp": 1.05098414, + "epoch": 0.3085802231627549, + "flos": 493297205760.0, + "grad_norm": 0.06931980864978393, + "language_loss": 0.84338289, + "learning_rate": 0.0008099321641711364, + "loss": 0.85419261, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.29931641, + "step": 1604, + "time_per_iteration": 2.707308769226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093892, + "balance_loss_mlp": 1.06249225, + "epoch": 0.3087726048480185, + "flos": 487437986304.0, + "grad_norm": 0.060864651717696075, + "language_loss": 0.83160985, + "learning_rate": 0.0008096876339693295, + "loss": 0.84254879, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.3137207, + "step": 1605, + "time_per_iteration": 2.731968402862549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094701, + "balance_loss_mlp": 1.06353974, + "epoch": 0.308964986533282, + "flos": 730265877504.0, + "grad_norm": 0.06509347225319946, + "language_loss": 0.8101337, + "learning_rate": 0.0008094429835361206, + "loss": 0.8210808, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.3112793, + "step": 1606, + "time_per_iteration": 2.9290759563446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089997, + "balance_loss_mlp": 1.05914617, + "epoch": 0.3091573682185456, + "flos": 605131554816.0, + "grad_norm": 0.057098253953708926, + "language_loss": 0.8565855, + "learning_rate": 0.0008091982129664908, + "loss": 0.86748546, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.30810547, + "step": 1607, + "time_per_iteration": 2.698822021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087412, + "balance_loss_mlp": 1.05558348, + "epoch": 0.30934974990380915, + "flos": 460081396224.0, + "grad_norm": 0.06809183454795278, + "language_loss": 0.82921505, + "learning_rate": 0.0008089533223554687, + "loss": 0.8400892, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.31811523, + "step": 1608, + "time_per_iteration": 2.7226502895355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108171, + "balance_loss_mlp": 1.05116844, + "epoch": 0.30954213158907273, + "flos": 553142075904.0, + "grad_norm": 0.05457453553086006, + "language_loss": 0.85192972, + "learning_rate": 0.0008087083117981294, + "loss": 0.86274683, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.30493164, + "step": 1609, + "time_per_iteration": 2.8990776538848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079, + "balance_loss_mlp": 1.04733825, + "epoch": 0.30973451327433627, + "flos": 552776901120.0, + "grad_norm": 0.05682891267097286, + "language_loss": 0.87723553, + "learning_rate": 0.0008084631813895943, + "loss": 0.88802552, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.31665039, + "step": 1610, + "time_per_iteration": 2.8217973709106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077424, + "balance_loss_mlp": 1.04538095, + "epoch": 0.30992689495959985, + "flos": 565430893056.0, + "grad_norm": 0.06653230383850259, + "language_loss": 0.83695799, + "learning_rate": 0.0008082179312250315, + "loss": 0.84773219, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.3203125, + "step": 1611, + "time_per_iteration": 2.6502630710601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157933, + "balance_loss_mlp": 1.13905036, + "epoch": 0.3101192766448634, + "flos": 1441621131264.0, + "grad_norm": 0.03907624866068961, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.81013775, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.18847656, + "step": 1612, + "time_per_iteration": 4.846347332000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142611, + "balance_loss_mlp": 1.12401426, + "epoch": 0.31031165833012697, + "flos": 1531086575616.0, + "grad_norm": 0.03590336133433786, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77771938, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.18554688, + "step": 1613, + "time_per_iteration": 5.076608896255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085959, + "balance_loss_mlp": 1.05432057, + "epoch": 0.31050404001539056, + "flos": 991534196736.0, + "grad_norm": 0.06574200684353006, + "language_loss": 0.81847739, + "learning_rate": 0.0008074814631475545, + "loss": 0.829337, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.31616211, + "step": 1614, + "time_per_iteration": 3.354888916015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086834, + "balance_loss_mlp": 1.05552983, + "epoch": 0.3106964217006541, + "flos": 445748355072.0, + "grad_norm": 0.058665683967318874, + "language_loss": 0.79078931, + "learning_rate": 0.0008072357349114907, + "loss": 0.80165768, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.31274414, + "step": 1615, + "time_per_iteration": 2.66959810256958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085653, + "balance_loss_mlp": 1.05427742, + "epoch": 0.3108888033859177, + "flos": 510259405824.0, + "grad_norm": 0.07028059658598983, + "language_loss": 0.88604105, + "learning_rate": 0.0008069898873959363, + "loss": 0.89689755, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.31347656, + "step": 1616, + "time_per_iteration": 2.652873992919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081821, + "balance_loss_mlp": 1.04932451, + "epoch": 0.3110811850711812, + "flos": 520471913472.0, + "grad_norm": 0.0549356144381418, + "language_loss": 0.85724425, + "learning_rate": 0.0008067439206963375, + "loss": 0.86806244, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.32495117, + "step": 1617, + "time_per_iteration": 2.651966094970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078025, + "balance_loss_mlp": 1.04707837, + "epoch": 0.3112735667564448, + "flos": 686085299712.0, + "grad_norm": 0.06196009796144799, + "language_loss": 0.86023569, + "learning_rate": 0.0008064978349081873, + "loss": 0.87101597, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.30908203, + "step": 1618, + "time_per_iteration": 2.9655556678771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076741, + "balance_loss_mlp": 1.04403007, + "epoch": 0.31146594844170833, + "flos": 532786871808.0, + "grad_norm": 0.05286958899784421, + "language_loss": 0.86531937, + "learning_rate": 0.0008062516301270245, + "loss": 0.87608671, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.32714844, + "step": 1619, + "time_per_iteration": 2.6688730716705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077635, + "balance_loss_mlp": 1.04668832, + "epoch": 0.3116583301269719, + "flos": 679187220480.0, + "grad_norm": 0.04767982292239376, + "language_loss": 0.88103712, + "learning_rate": 0.0008060053064484343, + "loss": 0.89181346, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.30908203, + "step": 1620, + "time_per_iteration": 2.9296655654907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078439, + "balance_loss_mlp": 1.04794526, + "epoch": 0.31185071181223545, + "flos": 585855908352.0, + "grad_norm": 0.062218975842766755, + "language_loss": 0.85253787, + "learning_rate": 0.0008057588639680482, + "loss": 0.86332226, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.3046875, + "step": 1621, + "time_per_iteration": 2.7567451000213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077048, + "balance_loss_mlp": 1.04686427, + "epoch": 0.31204309349749904, + "flos": 725090517504.0, + "grad_norm": 0.06694670244497776, + "language_loss": 0.82797694, + "learning_rate": 0.0008055123027815434, + "loss": 0.83874738, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.30151367, + "step": 1622, + "time_per_iteration": 2.9208602905273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077079, + "balance_loss_mlp": 1.04610825, + "epoch": 0.3122354751827626, + "flos": 576558604800.0, + "grad_norm": 0.1782498685509151, + "language_loss": 0.84590065, + "learning_rate": 0.0008052656229846436, + "loss": 0.85667145, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.30932617, + "step": 1623, + "time_per_iteration": 2.7155866622924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073968, + "balance_loss_mlp": 1.04328322, + "epoch": 0.31242785686802615, + "flos": 575672514048.0, + "grad_norm": 0.060959339396114136, + "language_loss": 0.90353578, + "learning_rate": 0.0008050188246731182, + "loss": 0.91427553, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.30664062, + "step": 1624, + "time_per_iteration": 2.6797330379486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076074, + "balance_loss_mlp": 1.04412627, + "epoch": 0.31262023855328974, + "flos": 736490271744.0, + "grad_norm": 0.055606567643031936, + "language_loss": 0.81689882, + "learning_rate": 0.0008047719079427834, + "loss": 0.82765961, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.31933594, + "step": 1625, + "time_per_iteration": 3.0065042972564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130441, + "balance_loss_mlp": 1.11031902, + "epoch": 0.3128126202385533, + "flos": 1558395113472.0, + "grad_norm": 0.04475298972307083, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75482148, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.20117188, + "step": 1626, + "time_per_iteration": 4.811471462249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079382, + "balance_loss_mlp": 1.04688525, + "epoch": 0.31300500192381686, + "flos": 514666538496.0, + "grad_norm": 0.07327685166102689, + "language_loss": 0.86126161, + "learning_rate": 0.0008042777196091757, + "loss": 0.87205535, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.32495117, + "step": 1627, + "time_per_iteration": 2.673499584197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108474, + "balance_loss_mlp": 1.05241048, + "epoch": 0.3131973836090804, + "flos": 526370420736.0, + "grad_norm": 0.055253724304277024, + "language_loss": 0.81718934, + "learning_rate": 0.0008040304481977643, + "loss": 0.82803679, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.32324219, + "step": 1628, + "time_per_iteration": 2.655608654022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088465, + "balance_loss_mlp": 1.0556109, + "epoch": 0.313389765294344, + "flos": 822473961984.0, + "grad_norm": 0.07469207399290811, + "language_loss": 0.86699098, + "learning_rate": 0.0008037830587512649, + "loss": 0.87787557, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.32861328, + "step": 1629, + "time_per_iteration": 3.092052459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108569, + "balance_loss_mlp": 1.0538609, + "epoch": 0.31358214697960757, + "flos": 393604697088.0, + "grad_norm": 0.05491200172004239, + "language_loss": 0.78946573, + "learning_rate": 0.0008035355513657224, + "loss": 0.80032265, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.31811523, + "step": 1630, + "time_per_iteration": 2.539320468902588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082653, + "balance_loss_mlp": 1.05111051, + "epoch": 0.3137745286648711, + "flos": 571611617280.0, + "grad_norm": 0.05139869194515267, + "language_loss": 0.92925692, + "learning_rate": 0.0008032879261372279, + "loss": 0.94008344, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.31518555, + "step": 1631, + "time_per_iteration": 2.779520034790039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107671, + "balance_loss_mlp": 1.05868566, + "epoch": 0.3139669103501347, + "flos": 1497614690304.0, + "grad_norm": 0.031013784922197977, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80712551, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.18066406, + "step": 1632, + "time_per_iteration": 5.371822357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078996, + "balance_loss_mlp": 1.04828787, + "epoch": 0.3141592920353982, + "flos": 525090041856.0, + "grad_norm": 0.055553714952817974, + "language_loss": 0.87074977, + "learning_rate": 0.0008027923225359748, + "loss": 0.8815397, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.30688477, + "step": 1633, + "time_per_iteration": 2.6381123065948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078973, + "balance_loss_mlp": 1.04797852, + "epoch": 0.3143516737206618, + "flos": 592989714432.0, + "grad_norm": 0.05859649155609266, + "language_loss": 0.88228178, + "learning_rate": 0.0008025443443556267, + "loss": 0.89307147, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.30957031, + "step": 1634, + "time_per_iteration": 2.7031404972076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078155, + "balance_loss_mlp": 1.04785156, + "epoch": 0.31454405540592534, + "flos": 648034573824.0, + "grad_norm": 0.052081770011180493, + "language_loss": 0.88152099, + "learning_rate": 0.000802296248717147, + "loss": 0.89230251, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.30273438, + "step": 1635, + "time_per_iteration": 2.9598543643951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080703, + "balance_loss_mlp": 1.05080533, + "epoch": 0.3147364370911889, + "flos": 642543501312.0, + "grad_norm": 0.066530556652877, + "language_loss": 0.78616363, + "learning_rate": 0.0008020480357168554, + "loss": 0.79697067, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.29833984, + "step": 1636, + "time_per_iteration": 2.797565221786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010823, + "balance_loss_mlp": 1.05261683, + "epoch": 0.31492881877645246, + "flos": 471607778304.0, + "grad_norm": 0.1046412191682548, + "language_loss": 0.87883365, + "learning_rate": 0.0008017997054511165, + "loss": 0.88965666, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.29638672, + "step": 1637, + "time_per_iteration": 2.559032440185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078208, + "balance_loss_mlp": 1.04733276, + "epoch": 0.31512120046171604, + "flos": 629135838720.0, + "grad_norm": 0.05513941849331592, + "language_loss": 0.85624552, + "learning_rate": 0.0008015512580163407, + "loss": 0.86702752, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.30834961, + "step": 1638, + "time_per_iteration": 2.779050827026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074987, + "balance_loss_mlp": 1.04363525, + "epoch": 0.31531358214697963, + "flos": 703460726784.0, + "grad_norm": 0.05557291013478606, + "language_loss": 0.81019449, + "learning_rate": 0.0008013026935089838, + "loss": 0.82094443, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.31323242, + "step": 1639, + "time_per_iteration": 2.89715838432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076962, + "balance_loss_mlp": 1.04701638, + "epoch": 0.31550596383224316, + "flos": 572275127808.0, + "grad_norm": 0.06613944709877946, + "language_loss": 0.8358075, + "learning_rate": 0.0008010540120255472, + "loss": 0.84657711, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.29882812, + "step": 1640, + "time_per_iteration": 2.651386260986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077047, + "balance_loss_mlp": 1.0463388, + "epoch": 0.31569834551750675, + "flos": 658047822336.0, + "grad_norm": 0.07317243700129339, + "language_loss": 0.86339968, + "learning_rate": 0.0008008052136625774, + "loss": 0.87417012, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.30688477, + "step": 1641, + "time_per_iteration": 2.7859702110290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077184, + "balance_loss_mlp": 1.04642797, + "epoch": 0.3158907272027703, + "flos": 566002681344.0, + "grad_norm": 0.05078324108170858, + "language_loss": 0.86915755, + "learning_rate": 0.0008005562985166666, + "loss": 0.87992936, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.30712891, + "step": 1642, + "time_per_iteration": 2.770359516143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078554, + "balance_loss_mlp": 1.04775047, + "epoch": 0.31608310888803387, + "flos": 536622216192.0, + "grad_norm": 0.048579646337906, + "language_loss": 0.85256124, + "learning_rate": 0.0008003072666844524, + "loss": 0.86334682, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.30761719, + "step": 1643, + "time_per_iteration": 2.6892380714416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081754, + "balance_loss_mlp": 1.05076003, + "epoch": 0.3162754905732974, + "flos": 486428239872.0, + "grad_norm": 0.06943709441331726, + "language_loss": 0.82542813, + "learning_rate": 0.0008000581182626173, + "loss": 0.83624566, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.30981445, + "step": 1644, + "time_per_iteration": 2.550408124923706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085079, + "balance_loss_mlp": 1.05496669, + "epoch": 0.316467872258561, + "flos": 529792538112.0, + "grad_norm": 0.05777646040930187, + "language_loss": 0.86256635, + "learning_rate": 0.0007998088533478894, + "loss": 0.87341708, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.30053711, + "step": 1645, + "time_per_iteration": 2.646522283554077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081027, + "balance_loss_mlp": 1.05019915, + "epoch": 0.3166602539438245, + "flos": 443197771776.0, + "grad_norm": 0.07748310873558778, + "language_loss": 0.84388101, + "learning_rate": 0.000799559472037042, + "loss": 0.85469127, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.30786133, + "step": 1646, + "time_per_iteration": 2.562495708465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081594, + "balance_loss_mlp": 1.05112433, + "epoch": 0.3168526356290881, + "flos": 645513103872.0, + "grad_norm": 0.0644603274178606, + "language_loss": 0.87469906, + "learning_rate": 0.0007993099744268932, + "loss": 0.88551497, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.30419922, + "step": 1647, + "time_per_iteration": 2.905468225479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074972, + "balance_loss_mlp": 1.04414475, + "epoch": 0.3170450173143517, + "flos": 585889403904.0, + "grad_norm": 0.06139744482341488, + "language_loss": 0.87846816, + "learning_rate": 0.000799060360614307, + "loss": 0.88921791, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.30786133, + "step": 1648, + "time_per_iteration": 2.6811182498931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083311, + "balance_loss_mlp": 1.05250716, + "epoch": 0.3172373989996152, + "flos": 826763231232.0, + "grad_norm": 0.05150264807756507, + "language_loss": 0.83281147, + "learning_rate": 0.0007988106306961917, + "loss": 0.84364462, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.30761719, + "step": 1649, + "time_per_iteration": 3.132918119430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078108, + "balance_loss_mlp": 1.04840076, + "epoch": 0.3174297806848788, + "flos": 527153204736.0, + "grad_norm": 0.0787550229152594, + "language_loss": 0.84213352, + "learning_rate": 0.0007985607847695014, + "loss": 0.85291457, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.29663086, + "step": 1650, + "time_per_iteration": 2.690056085586548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078554, + "balance_loss_mlp": 1.04784608, + "epoch": 0.31762216237014235, + "flos": 712855544832.0, + "grad_norm": 0.0566788479410698, + "language_loss": 0.82883936, + "learning_rate": 0.0007983108229312345, + "loss": 0.83962488, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.30664062, + "step": 1651, + "time_per_iteration": 2.918217182159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077737, + "balance_loss_mlp": 1.04679036, + "epoch": 0.31781454405540593, + "flos": 483567736320.0, + "grad_norm": 0.0674507609019882, + "language_loss": 0.86496019, + "learning_rate": 0.0007980607452784351, + "loss": 0.87573761, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.30908203, + "step": 1652, + "time_per_iteration": 2.5508391857147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081941, + "balance_loss_mlp": 1.052019, + "epoch": 0.31800692574066947, + "flos": 548483249664.0, + "grad_norm": 0.06063063486045483, + "language_loss": 0.90349394, + "learning_rate": 0.0007978105519081919, + "loss": 0.91431332, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.29858398, + "step": 1653, + "time_per_iteration": 2.6819515228271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079168, + "balance_loss_mlp": 1.04910302, + "epoch": 0.31819930742593305, + "flos": 516640951296.0, + "grad_norm": 0.0738675373878511, + "language_loss": 0.87538201, + "learning_rate": 0.0007975602429176385, + "loss": 0.88617373, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.30004883, + "step": 1654, + "time_per_iteration": 2.586261034011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084486, + "balance_loss_mlp": 1.05356312, + "epoch": 0.31839168911119664, + "flos": 455748456960.0, + "grad_norm": 0.051475836139836105, + "language_loss": 0.81585073, + "learning_rate": 0.0007973098184039536, + "loss": 0.82669556, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.30883789, + "step": 1655, + "time_per_iteration": 2.66395902633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083198, + "balance_loss_mlp": 1.05291927, + "epoch": 0.3185840707964602, + "flos": 625719513600.0, + "grad_norm": 0.059751712008043044, + "language_loss": 0.86801946, + "learning_rate": 0.0007970592784643602, + "loss": 0.87885141, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.30224609, + "step": 1656, + "time_per_iteration": 2.9186086654663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087926, + "balance_loss_mlp": 1.05855238, + "epoch": 0.31877645248172376, + "flos": 567213249024.0, + "grad_norm": 0.07875703275612048, + "language_loss": 0.85285407, + "learning_rate": 0.0007968086231961272, + "loss": 0.86373335, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.29321289, + "step": 1657, + "time_per_iteration": 2.6505343914031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089245, + "balance_loss_mlp": 1.05941832, + "epoch": 0.3189688341669873, + "flos": 489338205696.0, + "grad_norm": 0.08653253817480935, + "language_loss": 0.8381049, + "learning_rate": 0.0007965578526965671, + "loss": 0.84899735, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.29785156, + "step": 1658, + "time_per_iteration": 2.5884180068969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089397, + "balance_loss_mlp": 1.05995274, + "epoch": 0.3191612158522509, + "flos": 575948938752.0, + "grad_norm": 0.05523051502884026, + "language_loss": 0.86312473, + "learning_rate": 0.0007963069670630377, + "loss": 0.87401861, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.29394531, + "step": 1659, + "time_per_iteration": 2.750601291656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089678, + "balance_loss_mlp": 1.05997133, + "epoch": 0.3193535975375144, + "flos": 537867689472.0, + "grad_norm": 0.06732717892338919, + "language_loss": 0.8810066, + "learning_rate": 0.0007960559663929416, + "loss": 0.89190334, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.29663086, + "step": 1660, + "time_per_iteration": 2.6370737552642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096524, + "balance_loss_mlp": 1.06633985, + "epoch": 0.319545979222778, + "flos": 733954245120.0, + "grad_norm": 0.0532651376254825, + "language_loss": 0.87495023, + "learning_rate": 0.0007958048507837259, + "loss": 0.88591546, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.30151367, + "step": 1661, + "time_per_iteration": 2.942779779434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093392, + "balance_loss_mlp": 1.06316066, + "epoch": 0.31973836090804153, + "flos": 764136433152.0, + "grad_norm": 0.07710421129836972, + "language_loss": 0.87092876, + "learning_rate": 0.0007955536203328822, + "loss": 0.8818627, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.30175781, + "step": 1662, + "time_per_iteration": 2.8991520404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100595, + "balance_loss_mlp": 1.07072091, + "epoch": 0.3199307425933051, + "flos": 560252560896.0, + "grad_norm": 0.05380031942726595, + "language_loss": 0.8344577, + "learning_rate": 0.0007953022751379469, + "loss": 0.84546363, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.2980957, + "step": 1663, + "time_per_iteration": 2.795117139816284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102553, + "balance_loss_mlp": 1.07239294, + "epoch": 0.3201231242785687, + "flos": 751019751936.0, + "grad_norm": 0.0657811186180598, + "language_loss": 0.81884921, + "learning_rate": 0.000795050815296501, + "loss": 0.82987475, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.30151367, + "step": 1664, + "time_per_iteration": 2.969935894012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099283, + "balance_loss_mlp": 1.06890798, + "epoch": 0.32031550596383224, + "flos": 496157709312.0, + "grad_norm": 0.058736361347452894, + "language_loss": 0.93026185, + "learning_rate": 0.0007947992409061695, + "loss": 0.94125462, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.30322266, + "step": 1665, + "time_per_iteration": 2.585144281387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092624, + "balance_loss_mlp": 1.06182027, + "epoch": 0.3205078876490958, + "flos": 731294562816.0, + "grad_norm": 0.05523611327933496, + "language_loss": 0.8654207, + "learning_rate": 0.0007945475520646226, + "loss": 0.87634689, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.30761719, + "step": 1666, + "time_per_iteration": 2.9349849224090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092223, + "balance_loss_mlp": 1.06249237, + "epoch": 0.32070026933435936, + "flos": 549177283584.0, + "grad_norm": 0.05521997897435197, + "language_loss": 0.84546125, + "learning_rate": 0.0007942957488695743, + "loss": 0.85638344, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.296875, + "step": 1667, + "time_per_iteration": 2.6538572311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083745, + "balance_loss_mlp": 1.0539664, + "epoch": 0.32089265101962294, + "flos": 744949536768.0, + "grad_norm": 0.05331163349230756, + "language_loss": 0.81038171, + "learning_rate": 0.0007940438314187833, + "loss": 0.82121915, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.29760742, + "step": 1668, + "time_per_iteration": 3.009927988052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108075, + "balance_loss_mlp": 1.05016077, + "epoch": 0.3210850327048865, + "flos": 493937395200.0, + "grad_norm": 0.06087879277496283, + "language_loss": 0.80221838, + "learning_rate": 0.0007937917998100529, + "loss": 0.81302583, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.30541992, + "step": 1669, + "time_per_iteration": 2.5703017711639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072786, + "balance_loss_mlp": 1.0426501, + "epoch": 0.32127741439015006, + "flos": 530383265280.0, + "grad_norm": 0.07064769089672658, + "language_loss": 0.78527176, + "learning_rate": 0.0007935396541412302, + "loss": 0.79599965, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.30102539, + "step": 1670, + "time_per_iteration": 2.625499725341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081422, + "balance_loss_mlp": 1.05099988, + "epoch": 0.3214697960754136, + "flos": 500948955648.0, + "grad_norm": 0.0720065018777928, + "language_loss": 0.8546167, + "learning_rate": 0.0007932873945102068, + "loss": 0.86543095, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.30395508, + "step": 1671, + "time_per_iteration": 2.6188762187957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074685, + "balance_loss_mlp": 1.05713737, + "epoch": 0.3216621777606772, + "flos": 1382579394048.0, + "grad_norm": 0.027722134190714592, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76836461, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.17578125, + "step": 1672, + "time_per_iteration": 4.9278037548065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081072, + "balance_loss_mlp": 1.05057812, + "epoch": 0.32185455944594077, + "flos": 571260999168.0, + "grad_norm": 0.053011814820585035, + "language_loss": 0.86121267, + "learning_rate": 0.0007927825337533461, + "loss": 0.87202334, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.3046875, + "step": 1673, + "time_per_iteration": 2.6787123680114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075926, + "balance_loss_mlp": 1.0452652, + "epoch": 0.3220469411312043, + "flos": 543652715520.0, + "grad_norm": 0.06681709765508774, + "language_loss": 0.84770656, + "learning_rate": 0.0007925299328235131, + "loss": 0.85846579, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.30615234, + "step": 1674, + "time_per_iteration": 2.638434410095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080022, + "balance_loss_mlp": 1.04890847, + "epoch": 0.3222393228164679, + "flos": 490884834816.0, + "grad_norm": 0.06949369164102485, + "language_loss": 0.84795958, + "learning_rate": 0.000792277218323488, + "loss": 0.85875976, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.31103516, + "step": 1675, + "time_per_iteration": 2.5852880477905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077792, + "balance_loss_mlp": 1.04653537, + "epoch": 0.3224317045017314, + "flos": 490145720832.0, + "grad_norm": 0.06490362841252771, + "language_loss": 0.84737194, + "learning_rate": 0.0007920243903513833, + "loss": 0.85814989, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.31225586, + "step": 1676, + "time_per_iteration": 2.558058261871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083649, + "balance_loss_mlp": 1.0523684, + "epoch": 0.322624086186995, + "flos": 575505188352.0, + "grad_norm": 0.0667244817356676, + "language_loss": 0.83645618, + "learning_rate": 0.0007917714490053556, + "loss": 0.84729266, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.3125, + "step": 1677, + "time_per_iteration": 2.6619315147399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082413, + "balance_loss_mlp": 1.05046487, + "epoch": 0.32281646787225854, + "flos": 628974305280.0, + "grad_norm": 0.05833648566333407, + "language_loss": 0.85744321, + "learning_rate": 0.0007915183943836055, + "loss": 0.8682673, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.31933594, + "step": 1678, + "time_per_iteration": 2.8658525943756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079268, + "balance_loss_mlp": 1.04729617, + "epoch": 0.3230088495575221, + "flos": 781036024320.0, + "grad_norm": 0.06725353636254193, + "language_loss": 0.84315777, + "learning_rate": 0.0007912652265843773, + "loss": 0.8539505, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.31958008, + "step": 1679, + "time_per_iteration": 3.0343000888824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108303, + "balance_loss_mlp": 1.05019951, + "epoch": 0.3232012312427857, + "flos": 535839432192.0, + "grad_norm": 0.062193961969532426, + "language_loss": 0.81564045, + "learning_rate": 0.0007910119457059597, + "loss": 0.82647079, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.32836914, + "step": 1680, + "time_per_iteration": 2.6963257789611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085423, + "balance_loss_mlp": 1.05333161, + "epoch": 0.32339361292804925, + "flos": 704515553280.0, + "grad_norm": 0.0682304205879652, + "language_loss": 0.80304003, + "learning_rate": 0.0007907585518466849, + "loss": 0.81389421, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.32080078, + "step": 1681, + "time_per_iteration": 2.969540596008301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081665, + "balance_loss_mlp": 1.05026531, + "epoch": 0.32358599461331283, + "flos": 452099377152.0, + "grad_norm": 0.06175447283803796, + "language_loss": 0.89361274, + "learning_rate": 0.000790505045104929, + "loss": 0.90442938, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.3137207, + "step": 1682, + "time_per_iteration": 2.5148813724517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082889, + "balance_loss_mlp": 1.05108356, + "epoch": 0.32377837629857636, + "flos": 600597794304.0, + "grad_norm": 0.061424377243362256, + "language_loss": 0.87097234, + "learning_rate": 0.0007902514255791125, + "loss": 0.88180125, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.31787109, + "step": 1683, + "time_per_iteration": 2.7773754596710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078151, + "balance_loss_mlp": 1.04696608, + "epoch": 0.32397075798383995, + "flos": 807180636672.0, + "grad_norm": 0.06766194852988328, + "language_loss": 0.87911332, + "learning_rate": 0.0007899976933676986, + "loss": 0.88989484, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.31176758, + "step": 1684, + "time_per_iteration": 2.9700520038604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078008, + "balance_loss_mlp": 1.04589295, + "epoch": 0.3241631396691035, + "flos": 601414073856.0, + "grad_norm": 0.061649412189834635, + "language_loss": 0.87300712, + "learning_rate": 0.0007897438485691955, + "loss": 0.88378721, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.32104492, + "step": 1685, + "time_per_iteration": 2.6798696517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077233, + "balance_loss_mlp": 1.04483223, + "epoch": 0.32435552135436707, + "flos": 473980861440.0, + "grad_norm": 0.06379930216662907, + "language_loss": 0.823452, + "learning_rate": 0.0007894898912821542, + "loss": 0.83422434, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.32397461, + "step": 1686, + "time_per_iteration": 2.5478906631469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071757, + "balance_loss_mlp": 1.03978539, + "epoch": 0.3245479030396306, + "flos": 537824019456.0, + "grad_norm": 0.05321818652056826, + "language_loss": 0.86522776, + "learning_rate": 0.0007892358216051695, + "loss": 0.87594533, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.31958008, + "step": 1687, + "time_per_iteration": 2.735633134841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075777, + "balance_loss_mlp": 1.04251742, + "epoch": 0.3247402847248942, + "flos": 547394927616.0, + "grad_norm": 0.0608133700269358, + "language_loss": 0.91922832, + "learning_rate": 0.0007889816396368803, + "loss": 0.92998612, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.33276367, + "step": 1688, + "time_per_iteration": 2.6234939098358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077878, + "balance_loss_mlp": 1.04497576, + "epoch": 0.3249326664101578, + "flos": 377941814784.0, + "grad_norm": 0.0630363811740232, + "language_loss": 0.85370868, + "learning_rate": 0.0007887273454759687, + "loss": 0.86448747, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.32910156, + "step": 1689, + "time_per_iteration": 2.4698379039764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074583, + "balance_loss_mlp": 1.04184794, + "epoch": 0.3251250480954213, + "flos": 527818125312.0, + "grad_norm": 0.06604183912716106, + "language_loss": 0.82445431, + "learning_rate": 0.0007884729392211603, + "loss": 0.83520007, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.32739258, + "step": 1690, + "time_per_iteration": 2.6488864421844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081602, + "balance_loss_mlp": 1.04920113, + "epoch": 0.3253174297806849, + "flos": 449435312640.0, + "grad_norm": 0.06849578130600678, + "language_loss": 0.85280114, + "learning_rate": 0.0007882184209712245, + "loss": 0.86361718, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.32397461, + "step": 1691, + "time_per_iteration": 2.5209100246429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080531, + "balance_loss_mlp": 1.04874992, + "epoch": 0.32550981146594843, + "flos": 703855014912.0, + "grad_norm": 0.06225581397596747, + "language_loss": 0.8573736, + "learning_rate": 0.000787963790824974, + "loss": 0.8681789, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.31762695, + "step": 1692, + "time_per_iteration": 2.9696617126464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092322, + "balance_loss_mlp": 1.06054115, + "epoch": 0.325702193151212, + "flos": 392491643904.0, + "grad_norm": 0.0857009989212748, + "language_loss": 0.89660913, + "learning_rate": 0.0007877090488812651, + "loss": 0.90753233, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.31762695, + "step": 1693, + "time_per_iteration": 2.431861639022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086739, + "balance_loss_mlp": 1.05553031, + "epoch": 0.32589457483647555, + "flos": 577223525376.0, + "grad_norm": 0.07076453254267401, + "language_loss": 0.8368417, + "learning_rate": 0.0007874541952389973, + "loss": 0.84770912, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.31176758, + "step": 1694, + "time_per_iteration": 2.647468328475952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084594, + "balance_loss_mlp": 1.05293202, + "epoch": 0.32608695652173914, + "flos": 498092834304.0, + "grad_norm": 0.060562687008333366, + "language_loss": 0.86582285, + "learning_rate": 0.0007871992299971136, + "loss": 0.87666881, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.31640625, + "step": 1695, + "time_per_iteration": 2.553171396255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092239, + "balance_loss_mlp": 1.0608871, + "epoch": 0.32627933820700267, + "flos": 590858150400.0, + "grad_norm": 0.05969457295977618, + "language_loss": 0.84301764, + "learning_rate": 0.0007869441532546001, + "loss": 0.85394001, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.31323242, + "step": 1696, + "time_per_iteration": 2.752049446105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093733, + "balance_loss_mlp": 1.06247652, + "epoch": 0.32647171989226625, + "flos": 608790809088.0, + "grad_norm": 0.05927141137383595, + "language_loss": 0.79686946, + "learning_rate": 0.0007866889651104867, + "loss": 0.80780673, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.31225586, + "step": 1697, + "time_per_iteration": 2.7691686153411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109533, + "balance_loss_mlp": 1.06388259, + "epoch": 0.32666410157752984, + "flos": 476896619520.0, + "grad_norm": 0.0715366482234757, + "language_loss": 0.83218181, + "learning_rate": 0.000786433665663846, + "loss": 0.84313512, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.31420898, + "step": 1698, + "time_per_iteration": 2.717372179031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098821, + "balance_loss_mlp": 1.06816053, + "epoch": 0.3268564832627934, + "flos": 718060018176.0, + "grad_norm": 0.05645489658390659, + "language_loss": 0.86431837, + "learning_rate": 0.0007861782550137942, + "loss": 0.87530661, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.30615234, + "step": 1699, + "time_per_iteration": 2.9035465717315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103768, + "balance_loss_mlp": 1.07394195, + "epoch": 0.32704886494805696, + "flos": 768469372416.0, + "grad_norm": 0.11170286971508382, + "language_loss": 0.85853553, + "learning_rate": 0.0007859227332594901, + "loss": 0.86957312, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.29785156, + "step": 1700, + "time_per_iteration": 2.9302797317504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093978, + "balance_loss_mlp": 1.06508183, + "epoch": 0.3272412466333205, + "flos": 849540980736.0, + "grad_norm": 0.07200471053268022, + "language_loss": 0.84801477, + "learning_rate": 0.0007856671005001365, + "loss": 0.85895455, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.28881836, + "step": 1701, + "time_per_iteration": 3.1760013103485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090985, + "balance_loss_mlp": 1.06225514, + "epoch": 0.3274336283185841, + "flos": 831224208384.0, + "grad_norm": 0.07453437515979243, + "language_loss": 0.81870627, + "learning_rate": 0.0007854113568349787, + "loss": 0.82961613, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.28686523, + "step": 1702, + "time_per_iteration": 3.1038365364074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087868, + "balance_loss_mlp": 1.05770779, + "epoch": 0.3276260100038476, + "flos": 691721938944.0, + "grad_norm": 0.07528598974040544, + "language_loss": 0.80317354, + "learning_rate": 0.0007851555023633052, + "loss": 0.81405228, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.30102539, + "step": 1703, + "time_per_iteration": 2.847515106201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085228, + "balance_loss_mlp": 1.0558784, + "epoch": 0.3278183916891112, + "flos": 435831211008.0, + "grad_norm": 0.08040178147570827, + "language_loss": 0.82301831, + "learning_rate": 0.0007848995371844474, + "loss": 0.83387053, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.29296875, + "step": 1704, + "time_per_iteration": 2.5442426204681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098029, + "balance_loss_mlp": 1.06872725, + "epoch": 0.3280107733743748, + "flos": 460883119104.0, + "grad_norm": 0.06101842979524802, + "language_loss": 0.80441558, + "learning_rate": 0.0007846434613977801, + "loss": 0.81539583, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.29296875, + "step": 1705, + "time_per_iteration": 2.5023465156555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091561, + "balance_loss_mlp": 1.06242633, + "epoch": 0.3282031550596383, + "flos": 679018484736.0, + "grad_norm": 0.07007502801083235, + "language_loss": 0.78621399, + "learning_rate": 0.0007843872751027203, + "loss": 0.79712963, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.29125977, + "step": 1706, + "time_per_iteration": 2.790001392364502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094895, + "balance_loss_mlp": 1.06549811, + "epoch": 0.3283955367449019, + "flos": 544821023232.0, + "grad_norm": 0.05836443006497643, + "language_loss": 0.87259293, + "learning_rate": 0.0007841309783987287, + "loss": 0.88354194, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.29345703, + "step": 1707, + "time_per_iteration": 2.7478153705596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097713, + "balance_loss_mlp": 1.0684588, + "epoch": 0.32858791843016544, + "flos": 481017153024.0, + "grad_norm": 0.05888352709782848, + "language_loss": 0.89055538, + "learning_rate": 0.0007838745713853084, + "loss": 0.90153247, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.29199219, + "step": 1708, + "time_per_iteration": 2.588653802871704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088275, + "balance_loss_mlp": 1.05925906, + "epoch": 0.328780300115429, + "flos": 566529389568.0, + "grad_norm": 0.06397878577513526, + "language_loss": 0.8386358, + "learning_rate": 0.0007836180541620053, + "loss": 0.8495186, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.29003906, + "step": 1709, + "time_per_iteration": 2.7023067474365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091144, + "balance_loss_mlp": 1.06191421, + "epoch": 0.32897268180069256, + "flos": 475787948544.0, + "grad_norm": 0.05521592697878337, + "language_loss": 0.86435962, + "learning_rate": 0.0007833614268284082, + "loss": 0.87527102, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.29199219, + "step": 1710, + "time_per_iteration": 2.538080930709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090653, + "balance_loss_mlp": 1.0721513, + "epoch": 0.32916506348595614, + "flos": 1576517008896.0, + "grad_norm": 0.029520146980468998, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75200427, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.18457031, + "step": 1711, + "time_per_iteration": 4.909448862075806 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089192, + "balance_loss_mlp": 1.05965161, + "epoch": 0.3293574451712197, + "flos": 482646739968.0, + "grad_norm": 0.07803051984240059, + "language_loss": 0.78501904, + "learning_rate": 0.0007828478422289016, + "loss": 0.79591095, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.29492188, + "step": 1712, + "time_per_iteration": 2.5883195400238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092173, + "balance_loss_mlp": 1.06210816, + "epoch": 0.32954982685648326, + "flos": 622266872832.0, + "grad_norm": 0.05953292046858541, + "language_loss": 0.88987601, + "learning_rate": 0.0007825908851623833, + "loss": 0.90079772, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.30004883, + "step": 1713, + "time_per_iteration": 2.7441718578338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089127, + "balance_loss_mlp": 1.05973005, + "epoch": 0.32974220854174685, + "flos": 544697367552.0, + "grad_norm": 0.06609176393308323, + "language_loss": 0.8478905, + "learning_rate": 0.0007823338183843533, + "loss": 0.85878181, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.29394531, + "step": 1714, + "time_per_iteration": 2.6771602630615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092243, + "balance_loss_mlp": 1.06291747, + "epoch": 0.3299345902270104, + "flos": 981740708352.0, + "grad_norm": 0.10875146541446083, + "language_loss": 0.80569458, + "learning_rate": 0.0007820766419946141, + "loss": 0.81661701, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.29321289, + "step": 1715, + "time_per_iteration": 3.3068225383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108798, + "balance_loss_mlp": 1.07052732, + "epoch": 0.33012697191227397, + "flos": 1402857432576.0, + "grad_norm": 0.03503617860008252, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80760461, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.17480469, + "step": 1716, + "time_per_iteration": 5.048320293426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091106, + "balance_loss_mlp": 1.06201911, + "epoch": 0.3303193535975375, + "flos": 504897781248.0, + "grad_norm": 0.06576145610663801, + "language_loss": 0.76379126, + "learning_rate": 0.0007815619607794288, + "loss": 0.77470231, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.29052734, + "step": 1717, + "time_per_iteration": 2.6151187419891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094733, + "balance_loss_mlp": 1.06440604, + "epoch": 0.3305117352828011, + "flos": 937602390528.0, + "grad_norm": 0.08930544150493325, + "language_loss": 0.82491159, + "learning_rate": 0.0007813044561538001, + "loss": 0.835859, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.30273438, + "step": 1718, + "time_per_iteration": 3.1329195499420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089209, + "balance_loss_mlp": 1.05928707, + "epoch": 0.3307041169680646, + "flos": 721176597504.0, + "grad_norm": 0.06440748712139703, + "language_loss": 0.88832355, + "learning_rate": 0.0007810468423160958, + "loss": 0.8992157, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.29882812, + "step": 1719, + "time_per_iteration": 2.8785343170166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091515, + "balance_loss_mlp": 1.06195092, + "epoch": 0.3308964986533282, + "flos": 583315499520.0, + "grad_norm": 0.05842798757545397, + "language_loss": 0.81825691, + "learning_rate": 0.0007807891193663306, + "loss": 0.82917207, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.29492188, + "step": 1720, + "time_per_iteration": 2.775949478149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108911, + "balance_loss_mlp": 1.05956948, + "epoch": 0.33108888033859174, + "flos": 473340672000.0, + "grad_norm": 0.1056737351826848, + "language_loss": 0.82154363, + "learning_rate": 0.0007805312874045614, + "loss": 0.83243477, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.29516602, + "step": 1721, + "time_per_iteration": 2.528573513031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089843, + "balance_loss_mlp": 1.06054103, + "epoch": 0.3312812620238553, + "flos": 385913659392.0, + "grad_norm": 0.06879892565652022, + "language_loss": 0.86894739, + "learning_rate": 0.0007802733465308874, + "loss": 0.87984586, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.29272461, + "step": 1722, + "time_per_iteration": 2.4575133323669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087083, + "balance_loss_mlp": 1.05811512, + "epoch": 0.3314736437091189, + "flos": 494292395520.0, + "grad_norm": 0.06801648197756033, + "language_loss": 0.84311831, + "learning_rate": 0.0007800152968454501, + "loss": 0.85398912, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.28930664, + "step": 1723, + "time_per_iteration": 2.729114294052124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091782, + "balance_loss_mlp": 1.06300533, + "epoch": 0.33166602539438245, + "flos": 653346736128.0, + "grad_norm": 0.049597969001903774, + "language_loss": 0.90648681, + "learning_rate": 0.0007797571384484334, + "loss": 0.91740465, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.28759766, + "step": 1724, + "time_per_iteration": 2.8813512325286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084172, + "balance_loss_mlp": 1.05463219, + "epoch": 0.33185840707964603, + "flos": 520550489088.0, + "grad_norm": 0.060917196813517045, + "language_loss": 0.91917408, + "learning_rate": 0.0007794988714400633, + "loss": 0.9300158, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.29516602, + "step": 1725, + "time_per_iteration": 2.6094837188720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088265, + "balance_loss_mlp": 1.05896294, + "epoch": 0.33205078876490957, + "flos": 436712919552.0, + "grad_norm": 0.06883363868640566, + "language_loss": 0.85331756, + "learning_rate": 0.0007792404959206079, + "loss": 0.86420023, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.29272461, + "step": 1726, + "time_per_iteration": 2.4982993602752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083027, + "balance_loss_mlp": 1.05396366, + "epoch": 0.33224317045017315, + "flos": 768400971264.0, + "grad_norm": 0.0595205364190525, + "language_loss": 0.81498575, + "learning_rate": 0.0007789820119903774, + "loss": 0.82581604, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.29052734, + "step": 1727, + "time_per_iteration": 2.9797775745391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059791, + "balance_loss_mlp": 1.04043114, + "epoch": 0.3324355521354367, + "flos": 1465656090624.0, + "grad_norm": 0.028746370774938412, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79552454, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.19335938, + "step": 1728, + "time_per_iteration": 4.892562627792358 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090227, + "balance_loss_mlp": 1.05982828, + "epoch": 0.3326279338207003, + "flos": 496415195136.0, + "grad_norm": 0.10868743625457102, + "language_loss": 0.83712173, + "learning_rate": 0.0007784647192990428, + "loss": 0.84802401, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.3034668, + "step": 1729, + "time_per_iteration": 2.721163749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093021, + "balance_loss_mlp": 1.06283677, + "epoch": 0.33282031550596386, + "flos": 635600342016.0, + "grad_norm": 0.06834187729314575, + "language_loss": 0.80591226, + "learning_rate": 0.0007782059107387696, + "loss": 0.81684244, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.30151367, + "step": 1730, + "time_per_iteration": 2.8358583450317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097893, + "balance_loss_mlp": 1.06768548, + "epoch": 0.3330126971912274, + "flos": 689210643456.0, + "grad_norm": 0.06518025115488765, + "language_loss": 0.88646144, + "learning_rate": 0.0007779469941693826, + "loss": 0.89744031, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.30175781, + "step": 1731, + "time_per_iteration": 2.8069489002227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105874, + "balance_loss_mlp": 1.0744741, + "epoch": 0.333205078876491, + "flos": 566184563712.0, + "grad_norm": 0.0738487456517703, + "language_loss": 0.76712036, + "learning_rate": 0.0007776879696914029, + "loss": 0.77817911, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.3137207, + "step": 1732, + "time_per_iteration": 2.8068690299987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116621, + "balance_loss_mlp": 1.08479202, + "epoch": 0.3333974605617545, + "flos": 640618550784.0, + "grad_norm": 0.06155067702851775, + "language_loss": 0.88390094, + "learning_rate": 0.000777428837405392, + "loss": 0.89506716, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.31811523, + "step": 1733, + "time_per_iteration": 2.8412673473358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107208, + "balance_loss_mlp": 1.07530773, + "epoch": 0.3335898422470181, + "flos": 461597501952.0, + "grad_norm": 0.0682339524169846, + "language_loss": 0.86804128, + "learning_rate": 0.0007771695974119544, + "loss": 0.87911332, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.31884766, + "step": 1734, + "time_per_iteration": 2.512354612350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103901, + "balance_loss_mlp": 1.07159579, + "epoch": 0.33378222393228163, + "flos": 852504791040.0, + "grad_norm": 0.0845052703087739, + "language_loss": 0.75201118, + "learning_rate": 0.0007769102498117359, + "loss": 0.7630502, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.32299805, + "step": 1735, + "time_per_iteration": 3.107100248336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090989, + "balance_loss_mlp": 1.05777764, + "epoch": 0.3339746056175452, + "flos": 954256080384.0, + "grad_norm": 0.061332510780765306, + "language_loss": 0.79977, + "learning_rate": 0.000776650794705424, + "loss": 0.81067985, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.33227539, + "step": 1736, + "time_per_iteration": 3.259875535964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092848, + "balance_loss_mlp": 1.06116199, + "epoch": 0.33416698730280875, + "flos": 544559155200.0, + "grad_norm": 0.05236613872795896, + "language_loss": 0.82229674, + "learning_rate": 0.0007763912321937483, + "loss": 0.83322519, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.31665039, + "step": 1737, + "time_per_iteration": 2.704059600830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088373, + "balance_loss_mlp": 1.05506587, + "epoch": 0.33435936898807234, + "flos": 1013652817920.0, + "grad_norm": 0.07890071498287932, + "language_loss": 0.82297349, + "learning_rate": 0.0007761315623774799, + "loss": 0.83385718, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.33325195, + "step": 1738, + "time_per_iteration": 3.399148464202881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089424, + "balance_loss_mlp": 1.0574522, + "epoch": 0.3345517506733359, + "flos": 614935217664.0, + "grad_norm": 0.09967891290955513, + "language_loss": 0.87632757, + "learning_rate": 0.0007758717853574313, + "loss": 0.88722181, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.31958008, + "step": 1739, + "time_per_iteration": 2.772089958190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103829, + "balance_loss_mlp": 1.0729773, + "epoch": 0.33474413235859946, + "flos": 494350622208.0, + "grad_norm": 0.06672668023604937, + "language_loss": 0.90074134, + "learning_rate": 0.0007756119012344571, + "loss": 0.91177964, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.30810547, + "step": 1740, + "time_per_iteration": 2.5482232570648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108279, + "balance_loss_mlp": 1.07707, + "epoch": 0.33493651404386304, + "flos": 628105743360.0, + "grad_norm": 0.07840140242610649, + "language_loss": 0.84438574, + "learning_rate": 0.0007753519101094535, + "loss": 0.85546857, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.31176758, + "step": 1741, + "time_per_iteration": 2.749004602432251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102131, + "balance_loss_mlp": 1.07173228, + "epoch": 0.3351288957291266, + "flos": 513474909696.0, + "grad_norm": 0.07002932741488781, + "language_loss": 0.86241812, + "learning_rate": 0.0007750918120833575, + "loss": 0.87343943, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.3034668, + "step": 1742, + "time_per_iteration": 2.600731611251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110577, + "balance_loss_mlp": 1.0753479, + "epoch": 0.33532127741439016, + "flos": 647008860672.0, + "grad_norm": 0.07258867640739639, + "language_loss": 0.87368989, + "learning_rate": 0.0007748316072571485, + "loss": 0.88474762, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.30395508, + "step": 1743, + "time_per_iteration": 2.7698371410369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109732, + "balance_loss_mlp": 1.07902408, + "epoch": 0.3355136590996537, + "flos": 768134721024.0, + "grad_norm": 0.05763877458348602, + "language_loss": 0.79041934, + "learning_rate": 0.0007745712957318467, + "loss": 0.80151671, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.30664062, + "step": 1744, + "time_per_iteration": 2.967310667037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104284, + "balance_loss_mlp": 1.07412386, + "epoch": 0.3357060407849173, + "flos": 595259490816.0, + "grad_norm": 0.052786515694630796, + "language_loss": 0.86410165, + "learning_rate": 0.0007743108776085141, + "loss": 0.87514448, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.30102539, + "step": 1745, + "time_per_iteration": 2.771803855895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102511, + "balance_loss_mlp": 1.07049131, + "epoch": 0.3358984224701808, + "flos": 598288730112.0, + "grad_norm": 0.06089020802257528, + "language_loss": 0.82798052, + "learning_rate": 0.0007740503529882543, + "loss": 0.83900565, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.32006836, + "step": 1746, + "time_per_iteration": 2.805392026901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095402, + "balance_loss_mlp": 1.064551, + "epoch": 0.3360908041554444, + "flos": 578055771648.0, + "grad_norm": 0.0569869068698716, + "language_loss": 0.90718448, + "learning_rate": 0.0007737897219722114, + "loss": 0.9181385, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.30810547, + "step": 1747, + "time_per_iteration": 2.699065923690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090863, + "balance_loss_mlp": 1.05970204, + "epoch": 0.336283185840708, + "flos": 513332315136.0, + "grad_norm": 0.07943976371979472, + "language_loss": 0.80688596, + "learning_rate": 0.0007735289846615716, + "loss": 0.81779456, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.31152344, + "step": 1748, + "time_per_iteration": 2.6637260913848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094297, + "balance_loss_mlp": 1.06356478, + "epoch": 0.3364755675259715, + "flos": 524716102656.0, + "grad_norm": 0.06884386609789231, + "language_loss": 0.81979561, + "learning_rate": 0.0007732681411575621, + "loss": 0.83073854, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.30712891, + "step": 1749, + "time_per_iteration": 2.673060417175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087083, + "balance_loss_mlp": 1.0555166, + "epoch": 0.3366679492112351, + "flos": 554594162688.0, + "grad_norm": 0.052237930998467595, + "language_loss": 0.87234819, + "learning_rate": 0.0007730071915614514, + "loss": 0.88321906, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.31542969, + "step": 1750, + "time_per_iteration": 2.707857370376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089836, + "balance_loss_mlp": 1.05896115, + "epoch": 0.33686033089649864, + "flos": 427051851264.0, + "grad_norm": 0.08336153438972979, + "language_loss": 0.88963622, + "learning_rate": 0.0007727461359745489, + "loss": 0.90053463, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.30859375, + "step": 1751, + "time_per_iteration": 2.482837438583374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093668, + "balance_loss_mlp": 1.06307864, + "epoch": 0.3370527125817622, + "flos": 541452750336.0, + "grad_norm": 0.05330176149069141, + "language_loss": 0.86016554, + "learning_rate": 0.0007724849744982056, + "loss": 0.87110221, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.30541992, + "step": 1752, + "time_per_iteration": 2.690420389175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097033, + "balance_loss_mlp": 1.06668198, + "epoch": 0.33724509426702576, + "flos": 541836864000.0, + "grad_norm": 0.0643678921459399, + "language_loss": 0.81981385, + "learning_rate": 0.0007722237072338131, + "loss": 0.8307842, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.30322266, + "step": 1753, + "time_per_iteration": 2.7154347896575928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097395, + "balance_loss_mlp": 1.06694901, + "epoch": 0.33743747595228935, + "flos": 472557888000.0, + "grad_norm": 0.07107791288081117, + "language_loss": 0.85213387, + "learning_rate": 0.0007719623342828046, + "loss": 0.8631078, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.30419922, + "step": 1754, + "time_per_iteration": 2.5009355545043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109586, + "balance_loss_mlp": 1.06426978, + "epoch": 0.33762985763755293, + "flos": 469564964352.0, + "grad_norm": 0.06326183968549627, + "language_loss": 0.84134084, + "learning_rate": 0.000771700855746654, + "loss": 0.85229945, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.31567383, + "step": 1755, + "time_per_iteration": 2.63323712348938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082281, + "balance_loss_mlp": 1.05071473, + "epoch": 0.33782223932281646, + "flos": 492002270208.0, + "grad_norm": 0.06130822269954804, + "language_loss": 0.88395244, + "learning_rate": 0.0007714392717268763, + "loss": 0.89477527, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.31542969, + "step": 1756, + "time_per_iteration": 2.6147336959838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083496, + "balance_loss_mlp": 1.05219221, + "epoch": 0.33801462100808005, + "flos": 464827562496.0, + "grad_norm": 0.05731341996908033, + "language_loss": 0.86388242, + "learning_rate": 0.0007711775823250273, + "loss": 0.87471741, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.31298828, + "step": 1757, + "time_per_iteration": 2.5304934978485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085861, + "balance_loss_mlp": 1.05455685, + "epoch": 0.3382070026933436, + "flos": 795319603200.0, + "grad_norm": 0.061357664780502266, + "language_loss": 0.83481395, + "learning_rate": 0.0007709157876427039, + "loss": 0.84567261, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.31274414, + "step": 1758, + "time_per_iteration": 3.1116981506347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074204, + "balance_loss_mlp": 1.04189849, + "epoch": 0.33839938437860717, + "flos": 508181686272.0, + "grad_norm": 0.0592835704233285, + "language_loss": 0.85574573, + "learning_rate": 0.0007706538877815439, + "loss": 0.86648774, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.32299805, + "step": 1759, + "time_per_iteration": 2.635298728942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077747, + "balance_loss_mlp": 1.04730105, + "epoch": 0.3385917660638707, + "flos": 483986755584.0, + "grad_norm": 0.04672826561746397, + "language_loss": 0.83449262, + "learning_rate": 0.0007703918828432259, + "loss": 0.84527004, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.30419922, + "step": 1760, + "time_per_iteration": 2.664783477783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071091, + "balance_loss_mlp": 1.04023945, + "epoch": 0.3387841477491343, + "flos": 545071306752.0, + "grad_norm": 0.061026274734732225, + "language_loss": 0.88914752, + "learning_rate": 0.000770129772929469, + "loss": 0.89985847, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.30810547, + "step": 1761, + "time_per_iteration": 2.7082738876342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070304, + "balance_loss_mlp": 1.03914273, + "epoch": 0.3389765294343978, + "flos": 719487373824.0, + "grad_norm": 0.058866792995701266, + "language_loss": 0.88234216, + "learning_rate": 0.0007698675581420334, + "loss": 0.89304519, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.3112793, + "step": 1762, + "time_per_iteration": 2.9119746685028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107099, + "balance_loss_mlp": 1.03966177, + "epoch": 0.3391689111196614, + "flos": 699596269056.0, + "grad_norm": 0.06738514708484569, + "language_loss": 0.78819811, + "learning_rate": 0.0007696052385827199, + "loss": 0.79890805, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.31298828, + "step": 1763, + "time_per_iteration": 2.9451980590820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107403, + "balance_loss_mlp": 1.04172421, + "epoch": 0.339361292804925, + "flos": 626806425600.0, + "grad_norm": 0.0719800357998311, + "language_loss": 0.78192145, + "learning_rate": 0.00076934281435337, + "loss": 0.79266179, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.32299805, + "step": 1764, + "time_per_iteration": 2.8267600536346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071904, + "balance_loss_mlp": 1.03931201, + "epoch": 0.33955367449018853, + "flos": 609302960640.0, + "grad_norm": 0.06414673033674093, + "language_loss": 0.85701221, + "learning_rate": 0.0007690802855558658, + "loss": 0.86773127, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.32592773, + "step": 1765, + "time_per_iteration": 2.8825321197509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060485, + "balance_loss_mlp": 1.04322386, + "epoch": 0.3397460561754521, + "flos": 1452494177280.0, + "grad_norm": 0.027152559638010845, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.7743544, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.17285156, + "step": 1766, + "time_per_iteration": 4.890359401702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078911, + "balance_loss_mlp": 1.04684353, + "epoch": 0.33993843786071565, + "flos": 487068429312.0, + "grad_norm": 0.06170687350837257, + "language_loss": 0.89089799, + "learning_rate": 0.0007685549146641262, + "loss": 0.90168703, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.32055664, + "step": 1767, + "time_per_iteration": 2.539238691329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077091, + "balance_loss_mlp": 1.04557216, + "epoch": 0.34013081954597923, + "flos": 417115768320.0, + "grad_norm": 0.05571629344022593, + "language_loss": 0.8822673, + "learning_rate": 0.0007682920727738579, + "loss": 0.89303821, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.31494141, + "step": 1768, + "time_per_iteration": 2.512801170349121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080619, + "balance_loss_mlp": 1.04931498, + "epoch": 0.34032320123124277, + "flos": 437293472256.0, + "grad_norm": 0.06175400371418068, + "language_loss": 0.8474735, + "learning_rate": 0.000768029126723369, + "loss": 0.85827971, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.31274414, + "step": 1769, + "time_per_iteration": 2.5238869190216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075433, + "balance_loss_mlp": 1.04515338, + "epoch": 0.34051558291650635, + "flos": 457353312768.0, + "grad_norm": 0.06596681609056877, + "language_loss": 0.81544566, + "learning_rate": 0.0007677660766147447, + "loss": 0.82620001, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.30224609, + "step": 1770, + "time_per_iteration": 2.5212690830230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037514, + "balance_loss_mlp": 1.02063394, + "epoch": 0.3407079646017699, + "flos": 1558029938688.0, + "grad_norm": 0.014856007486746849, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73508459, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.16894531, + "step": 1771, + "time_per_iteration": 4.967731475830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081508, + "balance_loss_mlp": 1.05113387, + "epoch": 0.3409003462870335, + "flos": 492312190464.0, + "grad_norm": 0.075322249241395, + "language_loss": 0.79792535, + "learning_rate": 0.0007672396646316306, + "loss": 0.8087405, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.30322266, + "step": 1772, + "time_per_iteration": 2.524365186691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084918, + "balance_loss_mlp": 1.05451918, + "epoch": 0.34109272797229706, + "flos": 808145303040.0, + "grad_norm": 0.05910937608565349, + "language_loss": 0.80291271, + "learning_rate": 0.000766976302961512, + "loss": 0.81376183, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.30371094, + "step": 1773, + "time_per_iteration": 3.002929925918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086798, + "balance_loss_mlp": 1.0563519, + "epoch": 0.3412851096575606, + "flos": 469903997952.0, + "grad_norm": 0.0625889066862488, + "language_loss": 0.81081951, + "learning_rate": 0.0007667128376420003, + "loss": 0.82168746, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.30395508, + "step": 1774, + "time_per_iteration": 2.5821964740753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083599, + "balance_loss_mlp": 1.05336761, + "epoch": 0.3414774913428242, + "flos": 595402085376.0, + "grad_norm": 0.06267075227744807, + "language_loss": 0.84329379, + "learning_rate": 0.0007664492687753817, + "loss": 0.85412979, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.30175781, + "step": 1775, + "time_per_iteration": 2.7457377910614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077211, + "balance_loss_mlp": 1.04769528, + "epoch": 0.3416698730280877, + "flos": 527202667008.0, + "grad_norm": 0.054581176728495925, + "language_loss": 0.81518859, + "learning_rate": 0.000766185596463983, + "loss": 0.8259607, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.29516602, + "step": 1776, + "time_per_iteration": 2.655543804168701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078997, + "balance_loss_mlp": 1.04993343, + "epoch": 0.3418622547133513, + "flos": 874272794112.0, + "grad_norm": 0.06969464274274284, + "language_loss": 0.76725864, + "learning_rate": 0.0007659218208101706, + "loss": 0.77804863, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.29003906, + "step": 1777, + "time_per_iteration": 3.1378567218780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093158, + "balance_loss_mlp": 1.06411862, + "epoch": 0.34205463639861483, + "flos": 603462680064.0, + "grad_norm": 0.0529989301900612, + "language_loss": 0.84699291, + "learning_rate": 0.0007656579419163515, + "loss": 0.85792446, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.29052734, + "step": 1778, + "time_per_iteration": 2.8120994567871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091459, + "balance_loss_mlp": 1.06239629, + "epoch": 0.3422470180838784, + "flos": 463547183616.0, + "grad_norm": 0.06282493199141514, + "language_loss": 0.76994503, + "learning_rate": 0.0007653939598849724, + "loss": 0.78085959, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.2902832, + "step": 1779, + "time_per_iteration": 2.5995492935180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087203, + "balance_loss_mlp": 1.07051396, + "epoch": 0.34243939976914195, + "flos": 1585584377856.0, + "grad_norm": 0.04507156484415478, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83967406, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.16699219, + "step": 1780, + "time_per_iteration": 4.9175097942352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102047, + "balance_loss_mlp": 1.07186341, + "epoch": 0.34263178145440554, + "flos": 872662146048.0, + "grad_norm": 0.05745476314946865, + "language_loss": 0.79740059, + "learning_rate": 0.000764865686819522, + "loss": 0.80842102, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.30151367, + "step": 1781, + "time_per_iteration": 3.1022064685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097852, + "balance_loss_mlp": 1.06907511, + "epoch": 0.3428241631396691, + "flos": 506630674944.0, + "grad_norm": 0.061017866945560745, + "language_loss": 0.85627258, + "learning_rate": 0.0007646013959905449, + "loss": 0.8672511, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.28759766, + "step": 1782, + "time_per_iteration": 2.625312566757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090603, + "balance_loss_mlp": 1.06030035, + "epoch": 0.34301654482493266, + "flos": 879669324288.0, + "grad_norm": 0.05493462983431466, + "language_loss": 0.80768538, + "learning_rate": 0.0007643370024341949, + "loss": 0.81859136, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.30249023, + "step": 1783, + "time_per_iteration": 3.1206953525543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092054, + "balance_loss_mlp": 1.06284761, + "epoch": 0.34320892651019624, + "flos": 431537559552.0, + "grad_norm": 0.04934338548004703, + "language_loss": 0.8289808, + "learning_rate": 0.0007640725062531195, + "loss": 0.83990133, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.29174805, + "step": 1784, + "time_per_iteration": 2.518277645111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092006, + "balance_loss_mlp": 1.06165504, + "epoch": 0.3434013081954598, + "flos": 463404589056.0, + "grad_norm": 0.061838155255473454, + "language_loss": 0.8616311, + "learning_rate": 0.0007638079075500047, + "loss": 0.8725512, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.30297852, + "step": 1785, + "time_per_iteration": 2.566340684890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056366, + "balance_loss_mlp": 1.04101145, + "epoch": 0.34359368988072336, + "flos": 1556499276288.0, + "grad_norm": 0.03141321768780463, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76237035, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.15332031, + "step": 1786, + "time_per_iteration": 4.984891891479492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081906, + "balance_loss_mlp": 1.05088782, + "epoch": 0.3437860715659869, + "flos": 495267236352.0, + "grad_norm": 0.0502662811310507, + "language_loss": 0.83153242, + "learning_rate": 0.0007632784029886026, + "loss": 0.84235144, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.30981445, + "step": 1787, + "time_per_iteration": 2.6574935913085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078963, + "balance_loss_mlp": 1.04832625, + "epoch": 0.3439784532512505, + "flos": 717942154752.0, + "grad_norm": 0.058652751735253, + "language_loss": 0.85391539, + "learning_rate": 0.0007630134973358873, + "loss": 0.86470503, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.3059082, + "step": 1788, + "time_per_iteration": 2.920311450958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088088, + "balance_loss_mlp": 1.05702209, + "epoch": 0.34417083493651407, + "flos": 565598218752.0, + "grad_norm": 0.05633660644162356, + "language_loss": 0.86888337, + "learning_rate": 0.0007627484895722763, + "loss": 0.87976426, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.31030273, + "step": 1789, + "time_per_iteration": 2.648061513900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082316, + "balance_loss_mlp": 1.05268025, + "epoch": 0.3443632166217776, + "flos": 795988905984.0, + "grad_norm": 0.08125120447961011, + "language_loss": 0.79987907, + "learning_rate": 0.0007624833798006552, + "loss": 0.8107022, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.29614258, + "step": 1790, + "time_per_iteration": 3.083303689956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082489, + "balance_loss_mlp": 1.05249596, + "epoch": 0.3445555983070412, + "flos": 569045067264.0, + "grad_norm": 0.06337905919609309, + "language_loss": 0.83924425, + "learning_rate": 0.0007622181681239483, + "loss": 0.85006905, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.29931641, + "step": 1791, + "time_per_iteration": 2.6613235473632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078031, + "balance_loss_mlp": 1.04677427, + "epoch": 0.3447479799923047, + "flos": 568524151296.0, + "grad_norm": 0.05139164694864183, + "language_loss": 0.84563744, + "learning_rate": 0.0007619528546451202, + "loss": 0.85641772, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.31225586, + "step": 1792, + "time_per_iteration": 2.7847092151641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082474, + "balance_loss_mlp": 1.05183685, + "epoch": 0.3449403616775683, + "flos": 967323299328.0, + "grad_norm": 0.060391852587241154, + "language_loss": 0.8357141, + "learning_rate": 0.0007616874394671745, + "loss": 0.84653878, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.3059082, + "step": 1793, + "time_per_iteration": 3.3427343368530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087726, + "balance_loss_mlp": 1.05632687, + "epoch": 0.34513274336283184, + "flos": 568340858880.0, + "grad_norm": 0.07229882199780847, + "language_loss": 0.85033429, + "learning_rate": 0.0007614219226931547, + "loss": 0.86121154, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.3137207, + "step": 1794, + "time_per_iteration": 2.6797611713409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090025, + "balance_loss_mlp": 1.05931664, + "epoch": 0.3453251250480954, + "flos": 460715793408.0, + "grad_norm": 0.057715322830613675, + "language_loss": 0.84206641, + "learning_rate": 0.0007611563044261435, + "loss": 0.85296667, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.30664062, + "step": 1795, + "time_per_iteration": 2.5259828567504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086711, + "balance_loss_mlp": 1.05543017, + "epoch": 0.34551750673335896, + "flos": 415397431296.0, + "grad_norm": 0.06328741897936851, + "language_loss": 0.86560625, + "learning_rate": 0.0007608905847692631, + "loss": 0.87647337, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.3125, + "step": 1796, + "time_per_iteration": 2.472182035446167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081946, + "balance_loss_mlp": 1.05014098, + "epoch": 0.34570988841862255, + "flos": 587540749824.0, + "grad_norm": 0.053847624873276365, + "language_loss": 0.86582637, + "learning_rate": 0.0007606247638256749, + "loss": 0.8766458, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.31787109, + "step": 1797, + "time_per_iteration": 2.842547655105591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147955, + "balance_loss_mlp": 1.13145602, + "epoch": 0.34590227010388613, + "flos": 1566835439616.0, + "grad_norm": 0.06482996241123744, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79318249, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.16503906, + "step": 1798, + "time_per_iteration": 4.918993949890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075567, + "balance_loss_mlp": 1.06011796, + "epoch": 0.34609465178914967, + "flos": 1536950177280.0, + "grad_norm": 0.04230684388330953, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80402768, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.15429688, + "step": 1799, + "time_per_iteration": 4.791706323623657 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077617, + "balance_loss_mlp": 1.04724216, + "epoch": 0.34628703347441325, + "flos": 609075998208.0, + "grad_norm": 0.06124115711212235, + "language_loss": 0.85762143, + "learning_rate": 0.0007598266943068686, + "loss": 0.86839759, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.30322266, + "step": 1800, + "time_per_iteration": 2.743213415145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083873, + "balance_loss_mlp": 1.05266404, + "epoch": 0.3464794151596768, + "flos": 473084596224.0, + "grad_norm": 0.13184352245004016, + "language_loss": 0.83900499, + "learning_rate": 0.0007595604692488507, + "loss": 0.84984374, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.31176758, + "step": 1801, + "time_per_iteration": 2.542065382003784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082571, + "balance_loss_mlp": 1.05105186, + "epoch": 0.34667179684494037, + "flos": 605397805056.0, + "grad_norm": 0.0617697315453188, + "language_loss": 0.82875979, + "learning_rate": 0.0007592941434205215, + "loss": 0.83958554, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.31494141, + "step": 1802, + "time_per_iteration": 2.803941488265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077417, + "balance_loss_mlp": 1.06292093, + "epoch": 0.3468641785302039, + "flos": 1564053511680.0, + "grad_norm": 0.03209988868756776, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74648476, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.14453125, + "step": 1803, + "time_per_iteration": 5.115894794464111 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073735, + "balance_loss_mlp": 1.04176331, + "epoch": 0.3470565602154675, + "flos": 906902258688.0, + "grad_norm": 0.057797440709038125, + "language_loss": 0.7980904, + "learning_rate": 0.0007587611898665566, + "loss": 0.80882776, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.31958008, + "step": 1804, + "time_per_iteration": 3.0783464908599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080437, + "balance_loss_mlp": 1.04958522, + "epoch": 0.347248941900731, + "flos": 638613614592.0, + "grad_norm": 0.052922401600576395, + "language_loss": 0.8228178, + "learning_rate": 0.0007584945623478315, + "loss": 0.83362216, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.30810547, + "step": 1805, + "time_per_iteration": 2.8341996669769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107388, + "balance_loss_mlp": 1.04178858, + "epoch": 0.3474413235859946, + "flos": 847009336320.0, + "grad_norm": 0.05986711270473425, + "language_loss": 0.81165981, + "learning_rate": 0.000758227834472617, + "loss": 0.82239866, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.32080078, + "step": 1806, + "time_per_iteration": 3.0486085414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082567, + "balance_loss_mlp": 1.04971278, + "epoch": 0.3476337052712582, + "flos": 515395478016.0, + "grad_norm": 0.06433807190471491, + "language_loss": 0.77163357, + "learning_rate": 0.0007579610063444664, + "loss": 0.78245926, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.32861328, + "step": 1807, + "time_per_iteration": 2.7597365379333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073013, + "balance_loss_mlp": 1.04068375, + "epoch": 0.34782608695652173, + "flos": 913161558528.0, + "grad_norm": 0.06573509148212295, + "language_loss": 0.8740322, + "learning_rate": 0.0007576940780669712, + "loss": 0.88476229, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.32324219, + "step": 1808, + "time_per_iteration": 3.2193737030029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073839, + "balance_loss_mlp": 1.04060304, + "epoch": 0.3480184686417853, + "flos": 773374099968.0, + "grad_norm": 0.07068655640298144, + "language_loss": 0.84018815, + "learning_rate": 0.0007574270497437624, + "loss": 0.85092652, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.33251953, + "step": 1809, + "time_per_iteration": 2.958071708679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074122, + "balance_loss_mlp": 1.04255509, + "epoch": 0.34821085032704885, + "flos": 576549840384.0, + "grad_norm": 0.05267537563651592, + "language_loss": 0.88190216, + "learning_rate": 0.000757159921478509, + "loss": 0.89264333, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.31542969, + "step": 1810, + "time_per_iteration": 2.743820905685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011251, + "balance_loss_mlp": 1.10993648, + "epoch": 0.34840323201231244, + "flos": 1524176911872.0, + "grad_norm": 0.032772528197798495, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75575733, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.15136719, + "step": 1811, + "time_per_iteration": 4.734825372695923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077463, + "balance_loss_mlp": 1.04713607, + "epoch": 0.34859561369757597, + "flos": 508910625792.0, + "grad_norm": 0.06138203683055377, + "language_loss": 0.87334222, + "learning_rate": 0.0007566253655367423, + "loss": 0.88411689, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.30273438, + "step": 1812, + "time_per_iteration": 2.5963358879089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081319, + "balance_loss_mlp": 1.04946637, + "epoch": 0.34878799538283956, + "flos": 548390117376.0, + "grad_norm": 0.05073723218815133, + "language_loss": 0.89626348, + "learning_rate": 0.000756357938067762, + "loss": 0.90707672, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.31835938, + "step": 1813, + "time_per_iteration": 2.6791560649871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088512, + "balance_loss_mlp": 1.05615854, + "epoch": 0.34898037706810314, + "flos": 983251021824.0, + "grad_norm": 0.07107132576327291, + "language_loss": 0.82739902, + "learning_rate": 0.0007560904110718033, + "loss": 0.83828408, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.32324219, + "step": 1814, + "time_per_iteration": 3.251187801361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084533, + "balance_loss_mlp": 1.05244136, + "epoch": 0.3491727587533667, + "flos": 681298435584.0, + "grad_norm": 0.056660731031110724, + "language_loss": 0.83390886, + "learning_rate": 0.0007558227846527297, + "loss": 0.84475422, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.32080078, + "step": 1815, + "time_per_iteration": 2.852786064147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086181, + "balance_loss_mlp": 1.05358887, + "epoch": 0.34936514043863026, + "flos": 393811310592.0, + "grad_norm": 0.06752757018776132, + "language_loss": 0.83192128, + "learning_rate": 0.0007555550589144429, + "loss": 0.84278309, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.32592773, + "step": 1816, + "time_per_iteration": 2.4226694107055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108673, + "balance_loss_mlp": 1.05568814, + "epoch": 0.3495575221238938, + "flos": 461120256000.0, + "grad_norm": 0.05637535729014081, + "language_loss": 0.84440207, + "learning_rate": 0.000755287233960883, + "loss": 0.85526937, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.31005859, + "step": 1817, + "time_per_iteration": 2.556528329849243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081988, + "balance_loss_mlp": 1.04963493, + "epoch": 0.3497499038091574, + "flos": 723859600896.0, + "grad_norm": 0.06861190177202381, + "language_loss": 0.77555025, + "learning_rate": 0.0007550193098960292, + "loss": 0.7863701, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.32348633, + "step": 1818, + "time_per_iteration": 2.9168636798858643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081065, + "balance_loss_mlp": 1.04902124, + "epoch": 0.3499422854944209, + "flos": 827364132864.0, + "grad_norm": 0.04890635253674866, + "language_loss": 0.85897982, + "learning_rate": 0.0007547512868238988, + "loss": 0.86979043, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.3203125, + "step": 1819, + "time_per_iteration": 3.147949695587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086999, + "balance_loss_mlp": 1.05583739, + "epoch": 0.3501346671796845, + "flos": 493214247936.0, + "grad_norm": 0.07359678742691168, + "language_loss": 0.83527619, + "learning_rate": 0.0007544831648485473, + "loss": 0.84614623, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.3112793, + "step": 1820, + "time_per_iteration": 2.683906078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084002, + "balance_loss_mlp": 1.05272126, + "epoch": 0.35032704886494803, + "flos": 578479173120.0, + "grad_norm": 0.07119738396785501, + "language_loss": 0.81087327, + "learning_rate": 0.0007542149440740694, + "loss": 0.82171333, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.3125, + "step": 1821, + "time_per_iteration": 2.738029718399048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107983, + "balance_loss_mlp": 1.04850197, + "epoch": 0.3505194305502116, + "flos": 584383472640.0, + "grad_norm": 0.07229829340096756, + "language_loss": 0.8569001, + "learning_rate": 0.000753946624604597, + "loss": 0.86769843, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.31298828, + "step": 1822, + "time_per_iteration": 2.7263731956481934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079169, + "balance_loss_mlp": 1.04795969, + "epoch": 0.3507118122354752, + "flos": 526705072128.0, + "grad_norm": 0.05660966900473529, + "language_loss": 0.87968546, + "learning_rate": 0.0007536782065443015, + "loss": 0.89047718, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.31176758, + "step": 1823, + "time_per_iteration": 2.64158034324646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108426, + "balance_loss_mlp": 1.05386138, + "epoch": 0.35090419392073874, + "flos": 511269152256.0, + "grad_norm": 0.06227259781784348, + "language_loss": 0.74483079, + "learning_rate": 0.0007534096899973919, + "loss": 0.75567335, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.3034668, + "step": 1824, + "time_per_iteration": 2.609548807144165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079276, + "balance_loss_mlp": 1.04804349, + "epoch": 0.3510965756060023, + "flos": 563728522752.0, + "grad_norm": 0.05520550621954613, + "language_loss": 0.82636261, + "learning_rate": 0.0007531410750681154, + "loss": 0.83715534, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.31201172, + "step": 1825, + "time_per_iteration": 2.7306325435638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094474, + "balance_loss_mlp": 1.06352782, + "epoch": 0.35128895729126586, + "flos": 1020107146752.0, + "grad_norm": 0.04890512262044313, + "language_loss": 0.86351258, + "learning_rate": 0.0007528723618607575, + "loss": 0.8744573, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.30908203, + "step": 1826, + "time_per_iteration": 3.4343338012695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088582, + "balance_loss_mlp": 1.05782557, + "epoch": 0.35148133897652944, + "flos": 587972915712.0, + "grad_norm": 0.05382597898667073, + "language_loss": 0.82364488, + "learning_rate": 0.0007526035504796422, + "loss": 0.83453071, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.30737305, + "step": 1827, + "time_per_iteration": 2.7783889770507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088781, + "balance_loss_mlp": 1.05721426, + "epoch": 0.351673720661793, + "flos": 495054830592.0, + "grad_norm": 0.07196751046410012, + "language_loss": 0.86701363, + "learning_rate": 0.0007523346410291312, + "loss": 0.87790149, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.31542969, + "step": 1828, + "time_per_iteration": 2.7925262451171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096578, + "balance_loss_mlp": 1.06434393, + "epoch": 0.35186610234705656, + "flos": 762339520512.0, + "grad_norm": 0.05953464089235074, + "language_loss": 0.84491026, + "learning_rate": 0.0007520656336136245, + "loss": 0.85587609, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.32226562, + "step": 1829, + "time_per_iteration": 2.9498770236968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095972, + "balance_loss_mlp": 1.0648104, + "epoch": 0.3520584840323201, + "flos": 625822820352.0, + "grad_norm": 0.05500553487662277, + "language_loss": 0.87983966, + "learning_rate": 0.0007517965283375599, + "loss": 0.89079928, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.3112793, + "step": 1830, + "time_per_iteration": 2.838120698928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097926, + "balance_loss_mlp": 1.06566763, + "epoch": 0.3522508657175837, + "flos": 537124193280.0, + "grad_norm": 0.053691241766720514, + "language_loss": 0.89336729, + "learning_rate": 0.0007515273253054132, + "loss": 0.90434659, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.32250977, + "step": 1831, + "time_per_iteration": 2.6600866317749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092956, + "balance_loss_mlp": 1.06191444, + "epoch": 0.35244324740284727, + "flos": 567105560064.0, + "grad_norm": 0.05928754583625919, + "language_loss": 0.82674569, + "learning_rate": 0.0007512580246216988, + "loss": 0.83767527, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.31005859, + "step": 1832, + "time_per_iteration": 2.7806639671325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089641, + "balance_loss_mlp": 1.05752611, + "epoch": 0.3526356290881108, + "flos": 512809989120.0, + "grad_norm": 0.0631616677310412, + "language_loss": 0.84810489, + "learning_rate": 0.000750988626390968, + "loss": 0.85900134, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.32104492, + "step": 1833, + "time_per_iteration": 2.5985615253448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087885, + "balance_loss_mlp": 1.0560801, + "epoch": 0.3528280107733744, + "flos": 595496627712.0, + "grad_norm": 0.053730319302775706, + "language_loss": 0.84857321, + "learning_rate": 0.0007507191307178108, + "loss": 0.85945207, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.31787109, + "step": 1834, + "time_per_iteration": 2.822472095489502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088823, + "balance_loss_mlp": 1.05785227, + "epoch": 0.3530203924586379, + "flos": 550969814016.0, + "grad_norm": 0.07238185360826516, + "language_loss": 0.74172056, + "learning_rate": 0.0007504495377068543, + "loss": 0.75260878, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.30932617, + "step": 1835, + "time_per_iteration": 2.758622884750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094119, + "balance_loss_mlp": 1.06250441, + "epoch": 0.3532127741439015, + "flos": 652662876672.0, + "grad_norm": 0.06860617015764896, + "language_loss": 0.81217551, + "learning_rate": 0.0007501798474627642, + "loss": 0.82311678, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.31591797, + "step": 1836, + "time_per_iteration": 2.932610034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095464, + "balance_loss_mlp": 1.06568563, + "epoch": 0.35340515582916504, + "flos": 722452594176.0, + "grad_norm": 0.06442397939494823, + "language_loss": 0.83527768, + "learning_rate": 0.0007499100600902433, + "loss": 0.8462323, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.29736328, + "step": 1837, + "time_per_iteration": 3.0089991092681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089306, + "balance_loss_mlp": 1.05845428, + "epoch": 0.35359753751442863, + "flos": 594619301376.0, + "grad_norm": 0.06893251529793973, + "language_loss": 0.83798671, + "learning_rate": 0.0007496401756940324, + "loss": 0.84887969, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.30810547, + "step": 1838, + "time_per_iteration": 2.6746418476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090814, + "balance_loss_mlp": 1.06029606, + "epoch": 0.3537899191996922, + "flos": 632384838144.0, + "grad_norm": 0.06403380726847299, + "language_loss": 0.82561135, + "learning_rate": 0.0007493701943789098, + "loss": 0.83651948, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.3046875, + "step": 1839, + "time_per_iteration": 2.7678062915802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092399, + "balance_loss_mlp": 1.06307316, + "epoch": 0.35398230088495575, + "flos": 506118523392.0, + "grad_norm": 0.057234368489623245, + "language_loss": 0.82641804, + "learning_rate": 0.000749100116249692, + "loss": 0.83734202, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.29272461, + "step": 1840, + "time_per_iteration": 2.6124982833862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091953, + "balance_loss_mlp": 1.0616498, + "epoch": 0.35417468257021933, + "flos": 507783015936.0, + "grad_norm": 0.09225915028059628, + "language_loss": 0.86273944, + "learning_rate": 0.0007488299414112321, + "loss": 0.87365901, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.30249023, + "step": 1841, + "time_per_iteration": 2.615434169769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087223, + "balance_loss_mlp": 1.05737281, + "epoch": 0.35436706425548287, + "flos": 656133046272.0, + "grad_norm": 0.0557731038759208, + "language_loss": 0.77796137, + "learning_rate": 0.0007485596699684215, + "loss": 0.78883362, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.2980957, + "step": 1842, + "time_per_iteration": 2.83414626121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087281, + "balance_loss_mlp": 1.05561948, + "epoch": 0.35455944594074645, + "flos": 652322433024.0, + "grad_norm": 0.04938820360777142, + "language_loss": 0.85113978, + "learning_rate": 0.000748289302026189, + "loss": 0.86201257, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.31640625, + "step": 1843, + "time_per_iteration": 2.8805251121520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084484, + "balance_loss_mlp": 1.05403841, + "epoch": 0.35475182762601, + "flos": 848240252928.0, + "grad_norm": 0.06499404847276229, + "language_loss": 0.85830677, + "learning_rate": 0.0007480188376895004, + "loss": 0.86915159, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.30395508, + "step": 1844, + "time_per_iteration": 3.0965142250061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062747, + "balance_loss_mlp": 1.04624832, + "epoch": 0.3549442093112736, + "flos": 1520644133376.0, + "grad_norm": 0.026974392702602535, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74874085, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.16503906, + "step": 1845, + "time_per_iteration": 5.003226280212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088616, + "balance_loss_mlp": 1.05738342, + "epoch": 0.3551365909965371, + "flos": 651087134208.0, + "grad_norm": 0.11496133406812095, + "language_loss": 0.78570682, + "learning_rate": 0.0007474776202528074, + "loss": 0.79659295, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.31201172, + "step": 1846, + "time_per_iteration": 2.9579098224639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089072, + "balance_loss_mlp": 1.05736208, + "epoch": 0.3553289726818007, + "flos": 897094213632.0, + "grad_norm": 0.06294098896241457, + "language_loss": 0.81369591, + "learning_rate": 0.000747206867362922, + "loss": 0.82458663, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.31689453, + "step": 1847, + "time_per_iteration": 3.0886905193328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109789, + "balance_loss_mlp": 1.06656218, + "epoch": 0.3555213543670643, + "flos": 688181958144.0, + "grad_norm": 0.060378794046525276, + "language_loss": 0.83593512, + "learning_rate": 0.0007469360184988194, + "loss": 0.84691405, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.31298828, + "step": 1848, + "time_per_iteration": 2.861438512802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109845, + "balance_loss_mlp": 1.06724131, + "epoch": 0.3557137360523278, + "flos": 538305647616.0, + "grad_norm": 0.06250375704468988, + "language_loss": 0.86663848, + "learning_rate": 0.0007466650737656518, + "loss": 0.87762296, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.31176758, + "step": 1849, + "time_per_iteration": 2.620384454727173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098996, + "balance_loss_mlp": 1.06754851, + "epoch": 0.3559061177375914, + "flos": 402039230976.0, + "grad_norm": 0.05619364173691644, + "language_loss": 0.90150386, + "learning_rate": 0.0007463940332686098, + "loss": 0.91249382, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.31420898, + "step": 1850, + "time_per_iteration": 2.499337911605835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097613, + "balance_loss_mlp": 1.06711888, + "epoch": 0.35609849942285493, + "flos": 696238170624.0, + "grad_norm": 0.05220134930851383, + "language_loss": 0.8454684, + "learning_rate": 0.0007461228971129205, + "loss": 0.85644454, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.30444336, + "step": 1851, + "time_per_iteration": 2.91583251953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090798, + "balance_loss_mlp": 1.06049538, + "epoch": 0.3562908811081185, + "flos": 568660953600.0, + "grad_norm": 0.06507053577711389, + "language_loss": 0.85374135, + "learning_rate": 0.0007458516654038483, + "loss": 0.8646493, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.30297852, + "step": 1852, + "time_per_iteration": 2.710845947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093158, + "balance_loss_mlp": 1.06221175, + "epoch": 0.35648326279338205, + "flos": 682081219584.0, + "grad_norm": 0.055267605083424515, + "language_loss": 0.86826843, + "learning_rate": 0.0007455803382466946, + "loss": 0.87919998, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.30908203, + "step": 1853, + "time_per_iteration": 2.8157601356506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089896, + "balance_loss_mlp": 1.05894923, + "epoch": 0.35667564447864564, + "flos": 628840475136.0, + "grad_norm": 0.06143674576014299, + "language_loss": 0.87150055, + "learning_rate": 0.0007453089157467979, + "loss": 0.8823995, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.30908203, + "step": 1854, + "time_per_iteration": 2.7985024452209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101035, + "balance_loss_mlp": 1.06946826, + "epoch": 0.35686802616390917, + "flos": 813685837824.0, + "grad_norm": 0.06203911404438901, + "language_loss": 0.82222199, + "learning_rate": 0.0007450373980095341, + "loss": 0.83323234, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.31542969, + "step": 1855, + "time_per_iteration": 3.0960283279418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101415, + "balance_loss_mlp": 1.07108843, + "epoch": 0.35706040784917276, + "flos": 525922288128.0, + "grad_norm": 0.05169641299516589, + "language_loss": 0.86845142, + "learning_rate": 0.0007447657851403155, + "loss": 0.87946558, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.30322266, + "step": 1856, + "time_per_iteration": 2.6420810222625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106839, + "balance_loss_mlp": 1.07689333, + "epoch": 0.35725278953443634, + "flos": 511698345984.0, + "grad_norm": 0.07027910399075639, + "language_loss": 0.78771162, + "learning_rate": 0.0007444940772445915, + "loss": 0.79878008, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.29907227, + "step": 1857, + "time_per_iteration": 2.748770236968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109389, + "balance_loss_mlp": 1.06420684, + "epoch": 0.3574451712196999, + "flos": 487162971648.0, + "grad_norm": 0.057407361829253975, + "language_loss": 0.80228555, + "learning_rate": 0.0007442222744278484, + "loss": 0.81322443, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.29663086, + "step": 1858, + "time_per_iteration": 2.652111530303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094475, + "balance_loss_mlp": 1.06410074, + "epoch": 0.35763755290496346, + "flos": 550384879104.0, + "grad_norm": 0.045384089682170406, + "language_loss": 0.8399753, + "learning_rate": 0.0007439503767956099, + "loss": 0.85092002, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.30371094, + "step": 1859, + "time_per_iteration": 2.703261375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044367, + "balance_loss_mlp": 1.03111064, + "epoch": 0.357829934590227, + "flos": 1503300791808.0, + "grad_norm": 0.02493030642290896, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80715972, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.1328125, + "step": 1860, + "time_per_iteration": 4.983760833740234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092897, + "balance_loss_mlp": 1.06242704, + "epoch": 0.3580223162754906, + "flos": 568410670080.0, + "grad_norm": 0.05045998946960442, + "language_loss": 0.85959804, + "learning_rate": 0.000743406297506922, + "loss": 0.87052703, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.30419922, + "step": 1861, + "time_per_iteration": 2.740078926086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090008, + "balance_loss_mlp": 1.05956221, + "epoch": 0.3582146979607541, + "flos": 626153089536.0, + "grad_norm": 0.05968554082553822, + "language_loss": 0.8392486, + "learning_rate": 0.0007431341160617031, + "loss": 0.85014868, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.30395508, + "step": 1862, + "time_per_iteration": 2.8886373043060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076671, + "balance_loss_mlp": 1.04631984, + "epoch": 0.3584070796460177, + "flos": 507010406400.0, + "grad_norm": 0.053643840261235066, + "language_loss": 0.88015211, + "learning_rate": 0.0007428618402234491, + "loss": 0.89091879, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.30297852, + "step": 1863, + "time_per_iteration": 2.687030553817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074194, + "balance_loss_mlp": 1.04334283, + "epoch": 0.3585994613312813, + "flos": 606190763520.0, + "grad_norm": 0.062332671108041963, + "language_loss": 0.80358481, + "learning_rate": 0.0007425894700978668, + "loss": 0.81432676, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.30810547, + "step": 1864, + "time_per_iteration": 2.7334656715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072556, + "balance_loss_mlp": 1.04101336, + "epoch": 0.3587918430165448, + "flos": 1412338484736.0, + "grad_norm": 0.050645747658019255, + "language_loss": 0.79510379, + "learning_rate": 0.0007423170057906996, + "loss": 0.80582935, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.31542969, + "step": 1865, + "time_per_iteration": 3.8669073581695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076041, + "balance_loss_mlp": 1.04452205, + "epoch": 0.3589842247018084, + "flos": 478313800704.0, + "grad_norm": 0.06345597879427126, + "language_loss": 0.86289865, + "learning_rate": 0.0007420444474077275, + "loss": 0.87365907, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.31518555, + "step": 1866, + "time_per_iteration": 2.5648367404937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080689, + "balance_loss_mlp": 1.04878831, + "epoch": 0.35917660638707194, + "flos": 504464205312.0, + "grad_norm": 0.058480526362169126, + "language_loss": 0.89744091, + "learning_rate": 0.0007417717950547671, + "loss": 0.90824777, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.31884766, + "step": 1867, + "time_per_iteration": 2.5665245056152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074714, + "balance_loss_mlp": 1.0600276, + "epoch": 0.3593689880723355, + "flos": 1491294191616.0, + "grad_norm": 0.04131149216661822, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77071321, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.14648438, + "step": 1868, + "time_per_iteration": 4.900072813034058 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091662, + "balance_loss_mlp": 1.06035757, + "epoch": 0.35956136975759906, + "flos": 528369564672.0, + "grad_norm": 0.04948067344873762, + "language_loss": 0.84714514, + "learning_rate": 0.0007412262088623299, + "loss": 0.85806173, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.31274414, + "step": 1869, + "time_per_iteration": 2.72872257232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109305, + "balance_loss_mlp": 1.06255615, + "epoch": 0.35975375144286265, + "flos": 534647803392.0, + "grad_norm": 0.0631690153505957, + "language_loss": 0.79514921, + "learning_rate": 0.0007409532752346684, + "loss": 0.80607969, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.30444336, + "step": 1870, + "time_per_iteration": 2.646813154220581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084384, + "balance_loss_mlp": 1.05436683, + "epoch": 0.3599461331281262, + "flos": 504695549952.0, + "grad_norm": 0.05200384527654752, + "language_loss": 0.88430232, + "learning_rate": 0.0007406802480606491, + "loss": 0.89514613, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.29956055, + "step": 1871, + "time_per_iteration": 2.6335039138793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088571, + "balance_loss_mlp": 1.05819631, + "epoch": 0.36013851481338977, + "flos": 511283708928.0, + "grad_norm": 0.058340376963862656, + "language_loss": 0.90469301, + "learning_rate": 0.0007404071274462707, + "loss": 0.91557872, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.3034668, + "step": 1872, + "time_per_iteration": 2.579155206680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088392, + "balance_loss_mlp": 1.05911398, + "epoch": 0.36033089649865335, + "flos": 547330908672.0, + "grad_norm": 0.06288764850432389, + "language_loss": 0.83945811, + "learning_rate": 0.0007401339134975682, + "loss": 0.85034204, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.29272461, + "step": 1873, + "time_per_iteration": 2.6590254306793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089736, + "balance_loss_mlp": 1.06024313, + "epoch": 0.3605232781839169, + "flos": 458416903680.0, + "grad_norm": 0.07025897777145818, + "language_loss": 0.84501064, + "learning_rate": 0.0007398606063206122, + "loss": 0.85590804, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.29467773, + "step": 1874, + "time_per_iteration": 2.6330654621124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084755, + "balance_loss_mlp": 1.05545354, + "epoch": 0.36071565986918047, + "flos": 509309296128.0, + "grad_norm": 0.05525815693458704, + "language_loss": 0.78668261, + "learning_rate": 0.0007395872060215101, + "loss": 0.79753017, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.29296875, + "step": 1875, + "time_per_iteration": 2.665205955505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087119, + "balance_loss_mlp": 1.05853248, + "epoch": 0.360908041554444, + "flos": 558931484160.0, + "grad_norm": 0.05566722247490556, + "language_loss": 0.88191175, + "learning_rate": 0.0007393137127064056, + "loss": 0.89278299, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.28588867, + "step": 1876, + "time_per_iteration": 2.67520809173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083262, + "balance_loss_mlp": 1.05479455, + "epoch": 0.3611004232397076, + "flos": 523588492800.0, + "grad_norm": 0.05183280051917729, + "language_loss": 0.84175742, + "learning_rate": 0.0007390401264814779, + "loss": 0.85258996, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.28491211, + "step": 1877, + "time_per_iteration": 2.621708631515503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083828, + "balance_loss_mlp": 1.05559897, + "epoch": 0.3612928049249711, + "flos": 540728193024.0, + "grad_norm": 0.059598774698536174, + "language_loss": 0.84762645, + "learning_rate": 0.0007387664474529427, + "loss": 0.85846466, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.28222656, + "step": 1878, + "time_per_iteration": 2.64604115486145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085745, + "balance_loss_mlp": 1.0567776, + "epoch": 0.3614851866102347, + "flos": 552289480704.0, + "grad_norm": 0.05278661870548292, + "language_loss": 0.90893793, + "learning_rate": 0.0007384926757270518, + "loss": 0.91979533, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.28955078, + "step": 1879, + "time_per_iteration": 2.63849139213562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094605, + "balance_loss_mlp": 1.0652554, + "epoch": 0.36167756829549824, + "flos": 771734338560.0, + "grad_norm": 0.05095981973878578, + "language_loss": 0.79965544, + "learning_rate": 0.0007382188114100924, + "loss": 0.81060153, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.29296875, + "step": 1880, + "time_per_iteration": 2.967137098312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096998, + "balance_loss_mlp": 1.06731534, + "epoch": 0.36186994998076183, + "flos": 711560609280.0, + "grad_norm": 0.0523610100033388, + "language_loss": 0.81541228, + "learning_rate": 0.0007379448546083884, + "loss": 0.82638228, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.29663086, + "step": 1881, + "time_per_iteration": 2.935075283050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089574, + "balance_loss_mlp": 1.06036723, + "epoch": 0.3620623316660254, + "flos": 747209138688.0, + "grad_norm": 0.056326792126263736, + "language_loss": 0.88131809, + "learning_rate": 0.0007376708054282992, + "loss": 0.89221382, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.29174805, + "step": 1882, + "time_per_iteration": 2.9548256397247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080549, + "balance_loss_mlp": 1.05074644, + "epoch": 0.36225471335128895, + "flos": 482312088576.0, + "grad_norm": 0.053377968629185854, + "language_loss": 0.8395232, + "learning_rate": 0.0007373966639762201, + "loss": 0.85032874, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.29785156, + "step": 1883, + "time_per_iteration": 2.5978147983551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079871, + "balance_loss_mlp": 1.05085516, + "epoch": 0.36244709503655254, + "flos": 506655406080.0, + "grad_norm": 0.055969169447774005, + "language_loss": 0.88542271, + "learning_rate": 0.0007371224303585822, + "loss": 0.8962214, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.29003906, + "step": 1884, + "time_per_iteration": 2.573521614074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122192, + "balance_loss_mlp": 1.10817313, + "epoch": 0.36263947672181607, + "flos": 1393302643200.0, + "grad_norm": 0.05390094690370155, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81479263, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.140625, + "step": 1885, + "time_per_iteration": 4.762617826461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077599, + "balance_loss_mlp": 1.04722452, + "epoch": 0.36283185840707965, + "flos": 652991735808.0, + "grad_norm": 0.05279204841925659, + "language_loss": 0.8277564, + "learning_rate": 0.0007365736870525335, + "loss": 0.83853239, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.30322266, + "step": 1886, + "time_per_iteration": 2.8206799030303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071958, + "balance_loss_mlp": 1.04182231, + "epoch": 0.3630242400923432, + "flos": 488619440640.0, + "grad_norm": 0.0631822735743998, + "language_loss": 0.82252121, + "learning_rate": 0.000736299177577164, + "loss": 0.83324087, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.30102539, + "step": 1887, + "time_per_iteration": 2.5644423961639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075611, + "balance_loss_mlp": 1.04516482, + "epoch": 0.3632166217776068, + "flos": 516892644864.0, + "grad_norm": 0.06952119877485304, + "language_loss": 0.83928037, + "learning_rate": 0.0007360245763623174, + "loss": 0.8500365, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.30395508, + "step": 1888, + "time_per_iteration": 2.68868088722229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076643, + "balance_loss_mlp": 1.04614949, + "epoch": 0.36340900346287036, + "flos": 645881250816.0, + "grad_norm": 0.05500458280543127, + "language_loss": 0.89759338, + "learning_rate": 0.0007357498835146039, + "loss": 0.90835977, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.30444336, + "step": 1889, + "time_per_iteration": 2.841135263442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078037, + "balance_loss_mlp": 1.04716182, + "epoch": 0.3636013851481339, + "flos": 553057708032.0, + "grad_norm": 0.05518095134274227, + "language_loss": 0.86945391, + "learning_rate": 0.0007354750991406684, + "loss": 0.8802343, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.30834961, + "step": 1890, + "time_per_iteration": 2.6954762935638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079646, + "balance_loss_mlp": 1.04810333, + "epoch": 0.3637937668333975, + "flos": 546395355648.0, + "grad_norm": 0.060964398763012274, + "language_loss": 0.80524838, + "learning_rate": 0.0007352002233471919, + "loss": 0.81604487, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.31518555, + "step": 1891, + "time_per_iteration": 2.6167404651641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079464, + "balance_loss_mlp": 1.04973292, + "epoch": 0.363986148518661, + "flos": 537838576128.0, + "grad_norm": 0.06807309201777603, + "language_loss": 0.79092562, + "learning_rate": 0.0007349252562408906, + "loss": 0.80172026, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.296875, + "step": 1892, + "time_per_iteration": 2.6944479942321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091379, + "balance_loss_mlp": 1.06071806, + "epoch": 0.3641785302039246, + "flos": 659895607296.0, + "grad_norm": 0.05563142804906438, + "language_loss": 0.81399196, + "learning_rate": 0.0007346501979285158, + "loss": 0.82490575, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.30615234, + "step": 1893, + "time_per_iteration": 2.8852903842926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074867, + "balance_loss_mlp": 1.06208813, + "epoch": 0.36437091188918813, + "flos": 1467911158272.0, + "grad_norm": 0.02944776437417564, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.8161397, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.12792969, + "step": 1894, + "time_per_iteration": 4.784174680709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114227, + "balance_loss_mlp": 1.0819447, + "epoch": 0.3645632935744517, + "flos": 597012733440.0, + "grad_norm": 0.051755500006301046, + "language_loss": 0.8558799, + "learning_rate": 0.0007340998081127308, + "loss": 0.86702216, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.32275391, + "step": 1895, + "time_per_iteration": 2.807494878768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121943, + "balance_loss_mlp": 1.09023345, + "epoch": 0.36475567525971525, + "flos": 599214108672.0, + "grad_norm": 0.06567695066031824, + "language_loss": 0.90748346, + "learning_rate": 0.0007338244768230007, + "loss": 0.9187029, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.31689453, + "step": 1896, + "time_per_iteration": 2.7678794860839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118221, + "balance_loss_mlp": 1.08694077, + "epoch": 0.36494805694497884, + "flos": 798047686656.0, + "grad_norm": 0.07782470610585689, + "language_loss": 0.8913762, + "learning_rate": 0.0007335490547545578, + "loss": 0.90255845, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.3125, + "step": 1897, + "time_per_iteration": 3.0801138877868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112607, + "balance_loss_mlp": 1.0822562, + "epoch": 0.3651404386302424, + "flos": 637023315456.0, + "grad_norm": 0.05264242736204855, + "language_loss": 0.82653165, + "learning_rate": 0.0007332735420143308, + "loss": 0.83765769, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.30297852, + "step": 1898, + "time_per_iteration": 2.7581489086151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094572, + "balance_loss_mlp": 1.06338716, + "epoch": 0.36533282031550596, + "flos": 491337349632.0, + "grad_norm": 0.06387883695900265, + "language_loss": 0.8681283, + "learning_rate": 0.0007329979387092826, + "loss": 0.87907398, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.31152344, + "step": 1899, + "time_per_iteration": 2.586489677429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090283, + "balance_loss_mlp": 1.05964673, + "epoch": 0.36552520200076954, + "flos": 855587874816.0, + "grad_norm": 0.054083416077733606, + "language_loss": 0.83626556, + "learning_rate": 0.0007327222449464124, + "loss": 0.84716845, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.3059082, + "step": 1900, + "time_per_iteration": 3.2495076656341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010843, + "balance_loss_mlp": 1.0518986, + "epoch": 0.3657175836860331, + "flos": 483449872896.0, + "grad_norm": 0.05500564094416643, + "language_loss": 0.88598847, + "learning_rate": 0.0007324464608327538, + "loss": 0.89683151, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.32397461, + "step": 1901, + "time_per_iteration": 2.617971897125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079363, + "balance_loss_mlp": 1.04786777, + "epoch": 0.36590996537129666, + "flos": 434561006592.0, + "grad_norm": 0.0538418205513684, + "language_loss": 0.88291639, + "learning_rate": 0.0007321705864753758, + "loss": 0.89371002, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.31469727, + "step": 1902, + "time_per_iteration": 2.69343638420105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074418, + "balance_loss_mlp": 1.04294717, + "epoch": 0.3661023470565602, + "flos": 711880704000.0, + "grad_norm": 0.056477009868628435, + "language_loss": 0.84098166, + "learning_rate": 0.0007318946219813823, + "loss": 0.85172582, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.31469727, + "step": 1903, + "time_per_iteration": 3.010847568511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074916, + "balance_loss_mlp": 1.04232407, + "epoch": 0.3662947287418238, + "flos": 564495340032.0, + "grad_norm": 0.05768945263904951, + "language_loss": 0.89714533, + "learning_rate": 0.000731618567457912, + "loss": 0.90789449, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.32592773, + "step": 1904, + "time_per_iteration": 2.6410703659057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076588, + "balance_loss_mlp": 1.0440681, + "epoch": 0.3664871104270873, + "flos": 789391982592.0, + "grad_norm": 0.05570087619571841, + "language_loss": 0.86445332, + "learning_rate": 0.000731342423012139, + "loss": 0.87521917, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.32519531, + "step": 1905, + "time_per_iteration": 3.054703712463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074863, + "balance_loss_mlp": 1.04312992, + "epoch": 0.3666794921123509, + "flos": 752202616320.0, + "grad_norm": 0.05663901457074664, + "language_loss": 0.82393479, + "learning_rate": 0.0007310661887512722, + "loss": 0.83468342, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.31713867, + "step": 1906, + "time_per_iteration": 3.0096654891967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076944, + "balance_loss_mlp": 1.04532969, + "epoch": 0.3668718737976145, + "flos": 523264015872.0, + "grad_norm": 0.07427377535541638, + "language_loss": 0.8207258, + "learning_rate": 0.0007307898647825549, + "loss": 0.83149529, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.31591797, + "step": 1907, + "time_per_iteration": 2.67525315284729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075733, + "balance_loss_mlp": 1.04347432, + "epoch": 0.367064255482878, + "flos": 571698957312.0, + "grad_norm": 0.07021562329929035, + "language_loss": 0.89152002, + "learning_rate": 0.0007305134512132659, + "loss": 0.90227735, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.32250977, + "step": 1908, + "time_per_iteration": 2.6483352184295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079044, + "balance_loss_mlp": 1.0476923, + "epoch": 0.3672566371681416, + "flos": 446880347136.0, + "grad_norm": 0.07878350898766671, + "language_loss": 0.83255082, + "learning_rate": 0.0007302369481507183, + "loss": 0.84334129, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.31323242, + "step": 1909, + "time_per_iteration": 2.5106606483459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108859, + "balance_loss_mlp": 1.09207463, + "epoch": 0.36744901885340514, + "flos": 1539275208192.0, + "grad_norm": 0.039316944601114644, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.8107062, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.16796875, + "step": 1910, + "time_per_iteration": 4.845642566680908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073802, + "balance_loss_mlp": 1.04287899, + "epoch": 0.36764140053866873, + "flos": 563417192448.0, + "grad_norm": 0.05282525969479425, + "language_loss": 0.8551507, + "learning_rate": 0.000729683673975274, + "loss": 0.86588871, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.30883789, + "step": 1911, + "time_per_iteration": 2.643991470336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077837, + "balance_loss_mlp": 1.04648542, + "epoch": 0.36783378222393226, + "flos": 1216168971264.0, + "grad_norm": 0.06579029503933971, + "language_loss": 0.83071077, + "learning_rate": 0.0007294069030771774, + "loss": 0.84148908, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.31323242, + "step": 1912, + "time_per_iteration": 3.6745028495788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081127, + "balance_loss_mlp": 1.05053759, + "epoch": 0.36802616390919585, + "flos": 498476947968.0, + "grad_norm": 0.055639286508135585, + "language_loss": 0.90529931, + "learning_rate": 0.0007291300431154224, + "loss": 0.91611063, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.30541992, + "step": 1913, + "time_per_iteration": 2.6364145278930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020102, + "balance_loss_mlp": 1.00503433, + "epoch": 0.36821854559445943, + "flos": 1581281961984.0, + "grad_norm": 0.014819520409209537, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71409839, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.15039062, + "step": 1914, + "time_per_iteration": 4.986552000045776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089166, + "balance_loss_mlp": 1.05895889, + "epoch": 0.36841092727972297, + "flos": 835261784064.0, + "grad_norm": 0.07166131614104637, + "language_loss": 0.80129957, + "learning_rate": 0.0007285760564309179, + "loss": 0.81219125, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.30151367, + "step": 1915, + "time_per_iteration": 3.105180025100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082922, + "balance_loss_mlp": 1.05362058, + "epoch": 0.36860330896498655, + "flos": 689517591552.0, + "grad_norm": 0.07315246202889085, + "language_loss": 0.85023272, + "learning_rate": 0.0007282989299232448, + "loss": 0.86106199, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.29272461, + "step": 1916, + "time_per_iteration": 3.0501549243927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085882, + "balance_loss_mlp": 1.05710506, + "epoch": 0.3687956906502501, + "flos": 553919067648.0, + "grad_norm": 0.0682472178493412, + "language_loss": 0.83468378, + "learning_rate": 0.0007280217147820668, + "loss": 0.84554267, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.28735352, + "step": 1917, + "time_per_iteration": 2.61570143699646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096949, + "balance_loss_mlp": 1.06836295, + "epoch": 0.3689880723355137, + "flos": 576426184704.0, + "grad_norm": 0.06368361877082852, + "language_loss": 0.79183483, + "learning_rate": 0.0007277444111150079, + "loss": 0.80280429, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.28613281, + "step": 1918, + "time_per_iteration": 2.7004950046539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108995, + "balance_loss_mlp": 1.06124449, + "epoch": 0.3691804540207772, + "flos": 528615465984.0, + "grad_norm": 0.07280537378335762, + "language_loss": 0.84052753, + "learning_rate": 0.0007274670190297272, + "loss": 0.85142708, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.28710938, + "step": 1919, + "time_per_iteration": 2.598128080368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098824, + "balance_loss_mlp": 1.06902122, + "epoch": 0.3693728357060408, + "flos": 560729806848.0, + "grad_norm": 0.05243134255501039, + "language_loss": 0.82081646, + "learning_rate": 0.0007271895386339179, + "loss": 0.83180475, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.29736328, + "step": 1920, + "time_per_iteration": 2.7843308448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093148, + "balance_loss_mlp": 1.06360769, + "epoch": 0.3695652173913043, + "flos": 579488919552.0, + "grad_norm": 0.058714378397154585, + "language_loss": 0.83102447, + "learning_rate": 0.0007269119700353073, + "loss": 0.8419559, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.29492188, + "step": 1921, + "time_per_iteration": 2.7665817737579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089369, + "balance_loss_mlp": 1.06052053, + "epoch": 0.3697575990765679, + "flos": 512629516800.0, + "grad_norm": 0.04695414461356542, + "language_loss": 0.84780574, + "learning_rate": 0.0007266343133416571, + "loss": 0.85869944, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.28833008, + "step": 1922, + "time_per_iteration": 2.779585361480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065569, + "balance_loss_mlp": 1.05011928, + "epoch": 0.3699499807618315, + "flos": 1569826953216.0, + "grad_norm": 0.04139595668748732, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78182483, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.15429688, + "step": 1923, + "time_per_iteration": 4.841213703155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085527, + "balance_loss_mlp": 1.05591547, + "epoch": 0.37014236244709503, + "flos": 497093262336.0, + "grad_norm": 0.07673769099321799, + "language_loss": 0.84293365, + "learning_rate": 0.0007260787361004556, + "loss": 0.85378897, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.2956543, + "step": 1924, + "time_per_iteration": 2.5501017570495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023058, + "balance_loss_mlp": 1.00875258, + "epoch": 0.3703347441323586, + "flos": 1443562048512.0, + "grad_norm": 0.01226438472350035, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74784565, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.14257812, + "step": 1925, + "time_per_iteration": 4.9058191776275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079083, + "balance_loss_mlp": 1.05040073, + "epoch": 0.37052712581762215, + "flos": 563324060160.0, + "grad_norm": 0.0733591012555623, + "language_loss": 0.87266588, + "learning_rate": 0.0007255228077730903, + "loss": 0.88345671, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.28686523, + "step": 1926, + "time_per_iteration": 2.6776785850524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080805, + "balance_loss_mlp": 1.05281413, + "epoch": 0.37071950750288574, + "flos": 925706451456.0, + "grad_norm": 0.05143591599053885, + "language_loss": 0.81313562, + "learning_rate": 0.0007252447122218632, + "loss": 0.82394373, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.2800293, + "step": 1927, + "time_per_iteration": 3.1710472106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077781, + "balance_loss_mlp": 1.04907489, + "epoch": 0.37091188918814927, + "flos": 418090609152.0, + "grad_norm": 0.07597924069729044, + "language_loss": 0.88653511, + "learning_rate": 0.0007249665292228834, + "loss": 0.89731288, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.28686523, + "step": 1928, + "time_per_iteration": 2.580092191696167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108352, + "balance_loss_mlp": 1.0547905, + "epoch": 0.37110427087341286, + "flos": 462941899776.0, + "grad_norm": 0.05796370091963761, + "language_loss": 0.8379482, + "learning_rate": 0.000724688258884151, + "loss": 0.84878337, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.28710938, + "step": 1929, + "time_per_iteration": 2.6322267055511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086107, + "balance_loss_mlp": 1.05740142, + "epoch": 0.3712966525586764, + "flos": 849303843840.0, + "grad_norm": 0.049384577339976525, + "language_loss": 0.86327779, + "learning_rate": 0.0007244099013137002, + "loss": 0.87413883, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.28710938, + "step": 1930, + "time_per_iteration": 3.09224009513855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087908, + "balance_loss_mlp": 1.05951214, + "epoch": 0.37148903424394, + "flos": 925555092480.0, + "grad_norm": 0.06129670734370297, + "language_loss": 0.88767004, + "learning_rate": 0.0007241314566195993, + "loss": 0.89854914, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.28393555, + "step": 1931, + "time_per_iteration": 3.238381862640381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094186, + "balance_loss_mlp": 1.06531322, + "epoch": 0.37168141592920356, + "flos": 519565473792.0, + "grad_norm": 0.05545779345638414, + "language_loss": 0.85434037, + "learning_rate": 0.0007238529249099496, + "loss": 0.86528224, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.28833008, + "step": 1932, + "time_per_iteration": 2.632279872894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159138, + "balance_loss_mlp": 1.1475507, + "epoch": 0.3718737976144671, + "flos": 1445107267584.0, + "grad_norm": 0.054961579821259376, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.79016018, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.11572266, + "step": 1933, + "time_per_iteration": 4.920037746429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098131, + "balance_loss_mlp": 1.06902027, + "epoch": 0.3720661792997307, + "flos": 759218558976.0, + "grad_norm": 0.06411393233522368, + "language_loss": 0.80432916, + "learning_rate": 0.000723295600876581, + "loss": 0.81531054, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.29101562, + "step": 1934, + "time_per_iteration": 3.060438632965088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093478, + "balance_loss_mlp": 1.06510615, + "epoch": 0.3722585609849942, + "flos": 516686031360.0, + "grad_norm": 0.054125512250282885, + "language_loss": 0.87856102, + "learning_rate": 0.0007230168087692344, + "loss": 0.88949579, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.28393555, + "step": 1935, + "time_per_iteration": 2.655176877975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095042, + "balance_loss_mlp": 1.06607461, + "epoch": 0.3724509426702578, + "flos": 782114171904.0, + "grad_norm": 0.053712544631880174, + "language_loss": 0.82501912, + "learning_rate": 0.0007227379300790839, + "loss": 0.83596957, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.28955078, + "step": 1936, + "time_per_iteration": 3.05722713470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086223, + "balance_loss_mlp": 1.05668318, + "epoch": 0.37264332435552133, + "flos": 391502246400.0, + "grad_norm": 0.05452705072121448, + "language_loss": 0.85148442, + "learning_rate": 0.0007224589649143997, + "loss": 0.86234665, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.29492188, + "step": 1937, + "time_per_iteration": 2.593818187713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089067, + "balance_loss_mlp": 1.06021869, + "epoch": 0.3728357060407849, + "flos": 542599299072.0, + "grad_norm": 0.08689315573767935, + "language_loss": 0.80660325, + "learning_rate": 0.0007221799133834861, + "loss": 0.81749392, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.28833008, + "step": 1938, + "time_per_iteration": 2.6238772869110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087089, + "balance_loss_mlp": 1.05869377, + "epoch": 0.3730280877260485, + "flos": 433344646656.0, + "grad_norm": 0.06550449761554421, + "language_loss": 0.81904262, + "learning_rate": 0.00072190077559468, + "loss": 0.8299135, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.28417969, + "step": 1939, + "time_per_iteration": 2.5338878631591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086155, + "balance_loss_mlp": 1.05649543, + "epoch": 0.37322046941131204, + "flos": 531230068224.0, + "grad_norm": 0.05171807924061888, + "language_loss": 0.89000612, + "learning_rate": 0.0007216215516563527, + "loss": 0.90086764, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.29589844, + "step": 1940, + "time_per_iteration": 2.717912435531616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083323, + "balance_loss_mlp": 1.05449796, + "epoch": 0.3734128510965756, + "flos": 531294087168.0, + "grad_norm": 0.06398735943962416, + "language_loss": 0.83462608, + "learning_rate": 0.0007213422416769083, + "loss": 0.84545934, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.28808594, + "step": 1941, + "time_per_iteration": 2.6354072093963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107949, + "balance_loss_mlp": 1.0511179, + "epoch": 0.37360523278183916, + "flos": 500195284992.0, + "grad_norm": 0.05310409823342424, + "language_loss": 0.75118601, + "learning_rate": 0.0007210628457647849, + "loss": 0.76198089, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.28369141, + "step": 1942, + "time_per_iteration": 2.573251724243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080746, + "balance_loss_mlp": 1.05118251, + "epoch": 0.37379761446710275, + "flos": 547652413440.0, + "grad_norm": 0.05561530112530558, + "language_loss": 0.78689432, + "learning_rate": 0.000720783364028453, + "loss": 0.79770184, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.29516602, + "step": 1943, + "time_per_iteration": 2.782897472381592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078848, + "balance_loss_mlp": 1.04935515, + "epoch": 0.3739899961523663, + "flos": 475517316096.0, + "grad_norm": 0.05583674557333592, + "language_loss": 0.87426305, + "learning_rate": 0.0007205037965764177, + "loss": 0.88505149, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.29467773, + "step": 1944, + "time_per_iteration": 2.577195167541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076729, + "balance_loss_mlp": 1.04740369, + "epoch": 0.37418237783762986, + "flos": 611626581504.0, + "grad_norm": 0.05970518460248593, + "language_loss": 0.8568424, + "learning_rate": 0.0007202241435172161, + "loss": 0.86760962, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.29296875, + "step": 1945, + "time_per_iteration": 2.7356040477752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077347, + "balance_loss_mlp": 1.04849827, + "epoch": 0.3743747595228934, + "flos": 765953694720.0, + "grad_norm": 0.057784843601785166, + "language_loss": 0.88219595, + "learning_rate": 0.0007199444049594198, + "loss": 0.89296943, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.28833008, + "step": 1946, + "time_per_iteration": 2.997744560241699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075997, + "balance_loss_mlp": 1.04681468, + "epoch": 0.374567141208157, + "flos": 524120993280.0, + "grad_norm": 0.05996621635377081, + "language_loss": 0.83343232, + "learning_rate": 0.0007196645810116322, + "loss": 0.84419227, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.29150391, + "step": 1947, + "time_per_iteration": 2.6596434116363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071198, + "balance_loss_mlp": 1.04308891, + "epoch": 0.37475952289342057, + "flos": 681067090944.0, + "grad_norm": 0.07792528533349045, + "language_loss": 0.8387686, + "learning_rate": 0.0007193846717824912, + "loss": 0.84948057, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.28149414, + "step": 1948, + "time_per_iteration": 2.87357759475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067752, + "balance_loss_mlp": 1.04031014, + "epoch": 0.3749519045786841, + "flos": 460061047296.0, + "grad_norm": 0.06284621907245236, + "language_loss": 0.88014293, + "learning_rate": 0.0007191046773806669, + "loss": 0.89082038, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.27514648, + "step": 1949, + "time_per_iteration": 2.616118907928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073776, + "balance_loss_mlp": 1.04473686, + "epoch": 0.3751442862639477, + "flos": 954471458304.0, + "grad_norm": 0.06080214721481266, + "language_loss": 0.83072305, + "learning_rate": 0.0007188245979148631, + "loss": 0.84146082, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.29003906, + "step": 1950, + "time_per_iteration": 3.212918281555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080402, + "balance_loss_mlp": 1.05164886, + "epoch": 0.3753366679492112, + "flos": 527483473920.0, + "grad_norm": 0.06034460157863772, + "language_loss": 0.87560785, + "learning_rate": 0.0007185444334938157, + "loss": 0.88641185, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.28735352, + "step": 1951, + "time_per_iteration": 2.6847927570343018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074706, + "balance_loss_mlp": 1.04635811, + "epoch": 0.3755290496344748, + "flos": 521535504384.0, + "grad_norm": 0.07362347851216991, + "language_loss": 0.85023165, + "learning_rate": 0.0007182641842262947, + "loss": 0.86097872, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.28320312, + "step": 1952, + "time_per_iteration": 2.6011481285095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080682, + "balance_loss_mlp": 1.05252457, + "epoch": 0.37572143131973834, + "flos": 620810403840.0, + "grad_norm": 0.05143100601063952, + "language_loss": 0.77525514, + "learning_rate": 0.0007179838502211022, + "loss": 0.78606194, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.28198242, + "step": 1953, + "time_per_iteration": 2.8322203159332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082839, + "balance_loss_mlp": 1.05487227, + "epoch": 0.37591381300500193, + "flos": 770635842048.0, + "grad_norm": 0.06528688845841664, + "language_loss": 0.86487108, + "learning_rate": 0.0007177034315870738, + "loss": 0.87569952, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.27978516, + "step": 1954, + "time_per_iteration": 2.9551377296447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077022, + "balance_loss_mlp": 1.04896057, + "epoch": 0.37610619469026546, + "flos": 520191106560.0, + "grad_norm": 0.059767476828271, + "language_loss": 0.90968794, + "learning_rate": 0.0007174229284330773, + "loss": 0.9204582, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.28076172, + "step": 1955, + "time_per_iteration": 2.5916919708251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076033, + "balance_loss_mlp": 1.0481143, + "epoch": 0.37629857637552905, + "flos": 598524456960.0, + "grad_norm": 0.06317358450106399, + "language_loss": 0.87043428, + "learning_rate": 0.0007171423408680141, + "loss": 0.88119459, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.27954102, + "step": 1956, + "time_per_iteration": 2.8243377208709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072634, + "balance_loss_mlp": 1.04352272, + "epoch": 0.37649095806079264, + "flos": 564687396864.0, + "grad_norm": 0.057758823731725896, + "language_loss": 0.89565909, + "learning_rate": 0.0007168616690008176, + "loss": 0.90638542, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.29125977, + "step": 1957, + "time_per_iteration": 2.6314306259155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074053, + "balance_loss_mlp": 1.04572916, + "epoch": 0.37668333974605617, + "flos": 592196755968.0, + "grad_norm": 0.055146864479517985, + "language_loss": 0.86279052, + "learning_rate": 0.0007165809129404545, + "loss": 0.87353098, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.28320312, + "step": 1958, + "time_per_iteration": 2.7625439167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074993, + "balance_loss_mlp": 1.044595, + "epoch": 0.37687572143131975, + "flos": 419257506816.0, + "grad_norm": 0.06141204693847206, + "language_loss": 0.85977095, + "learning_rate": 0.0007163000727959239, + "loss": 0.87052089, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.30371094, + "step": 1959, + "time_per_iteration": 2.473407506942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061387, + "balance_loss_mlp": 1.04622388, + "epoch": 0.3770681031165833, + "flos": 1356484243968.0, + "grad_norm": 0.02935416999593297, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79020452, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.15136719, + "step": 1960, + "time_per_iteration": 4.8784215450286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079277, + "balance_loss_mlp": 1.04973722, + "epoch": 0.3772604848018469, + "flos": 644592107520.0, + "grad_norm": 0.05722982355969982, + "language_loss": 0.84446192, + "learning_rate": 0.00071573814069052, + "loss": 0.85525477, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.29541016, + "step": 1961, + "time_per_iteration": 2.929955244064331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078902, + "balance_loss_mlp": 1.05031538, + "epoch": 0.3774528664871104, + "flos": 901265619456.0, + "grad_norm": 0.053564242831421076, + "language_loss": 0.88053226, + "learning_rate": 0.0007154570489478081, + "loss": 0.8913213, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.28540039, + "step": 1962, + "time_per_iteration": 3.1691505908966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079242, + "balance_loss_mlp": 1.05001187, + "epoch": 0.377645248172374, + "flos": 787717315584.0, + "grad_norm": 0.05213464978332433, + "language_loss": 0.86570239, + "learning_rate": 0.0007151758735572514, + "loss": 0.87649477, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.29174805, + "step": 1963, + "time_per_iteration": 2.9893381595611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080371, + "balance_loss_mlp": 1.05190408, + "epoch": 0.3778376298576376, + "flos": 586417522176.0, + "grad_norm": 0.06256473208381459, + "language_loss": 0.80730724, + "learning_rate": 0.0007148946146280119, + "loss": 0.81811094, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.28442383, + "step": 1964, + "time_per_iteration": 2.8270015716552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015118, + "balance_loss_mlp": 1.00214851, + "epoch": 0.3780300115429011, + "flos": 1396014759936.0, + "grad_norm": 0.01808471901321765, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73207271, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.12988281, + "step": 1965, + "time_per_iteration": 4.895836353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018206, + "balance_loss_mlp": 1.00561714, + "epoch": 0.3782223932281647, + "flos": 1356935348736.0, + "grad_norm": 0.021930840707602553, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76360154, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.12597656, + "step": 1966, + "time_per_iteration": 5.0023956298828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091314, + "balance_loss_mlp": 1.06358576, + "epoch": 0.37841477491342823, + "flos": 703811344896.0, + "grad_norm": 0.04479252262380658, + "language_loss": 0.83477217, + "learning_rate": 0.0007140503377003022, + "loss": 0.84568524, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.27734375, + "step": 1967, + "time_per_iteration": 3.0142691135406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097939, + "balance_loss_mlp": 1.07011509, + "epoch": 0.3786071565986918, + "flos": 528856985088.0, + "grad_norm": 0.049620821678558774, + "language_loss": 0.8500334, + "learning_rate": 0.000713768745708599, + "loss": 0.86101276, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.27856445, + "step": 1968, + "time_per_iteration": 2.6556408405303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109518, + "balance_loss_mlp": 1.06807137, + "epoch": 0.37879953828395535, + "flos": 992872802304.0, + "grad_norm": 0.05249502952466034, + "language_loss": 0.7739228, + "learning_rate": 0.0007134870707245085, + "loss": 0.78487462, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.27148438, + "step": 1969, + "time_per_iteration": 3.2944319248199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097317, + "balance_loss_mlp": 1.0706377, + "epoch": 0.37899191996921894, + "flos": 626358292992.0, + "grad_norm": 0.06611086672726225, + "language_loss": 0.84358507, + "learning_rate": 0.0007132053128573864, + "loss": 0.85455823, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.26733398, + "step": 1970, + "time_per_iteration": 2.745910167694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101147, + "balance_loss_mlp": 1.07422984, + "epoch": 0.37918430165448247, + "flos": 686005314048.0, + "grad_norm": 0.07389156257299019, + "language_loss": 0.83986598, + "learning_rate": 0.0007129234722166211, + "loss": 0.8508774, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.26977539, + "step": 1971, + "time_per_iteration": 2.8552701473236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095612, + "balance_loss_mlp": 1.06881404, + "epoch": 0.37937668333974606, + "flos": 475374721536.0, + "grad_norm": 0.0464186232668544, + "language_loss": 0.90731955, + "learning_rate": 0.0007126415489116328, + "loss": 0.91827571, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.26818848, + "step": 1972, + "time_per_iteration": 2.6738507747650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089531, + "balance_loss_mlp": 1.06185079, + "epoch": 0.37956906502500964, + "flos": 707271340032.0, + "grad_norm": 0.05397666452651625, + "language_loss": 0.81034803, + "learning_rate": 0.0007123595430518736, + "loss": 0.82124341, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.27685547, + "step": 1973, + "time_per_iteration": 2.8551318645477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089653, + "balance_loss_mlp": 1.06225908, + "epoch": 0.3797614467102732, + "flos": 426421836288.0, + "grad_norm": 0.07183677804285386, + "language_loss": 0.86159599, + "learning_rate": 0.0007120774547468282, + "loss": 0.87249249, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.27416992, + "step": 1974, + "time_per_iteration": 2.5466248989105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091836, + "balance_loss_mlp": 1.06477594, + "epoch": 0.37995382839553676, + "flos": 481588941312.0, + "grad_norm": 0.057862181788604236, + "language_loss": 0.81643212, + "learning_rate": 0.0007117952841060128, + "loss": 0.82735044, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.27099609, + "step": 1975, + "time_per_iteration": 2.6863863468170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010857, + "balance_loss_mlp": 1.05813885, + "epoch": 0.3801462100808003, + "flos": 560286056448.0, + "grad_norm": 0.06251241790432795, + "language_loss": 0.83861643, + "learning_rate": 0.0007115130312389756, + "loss": 0.84947342, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.27587891, + "step": 1976, + "time_per_iteration": 2.6821115016937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088536, + "balance_loss_mlp": 1.0602119, + "epoch": 0.3803385917660639, + "flos": 464699524608.0, + "grad_norm": 0.063889045898505, + "language_loss": 0.79037011, + "learning_rate": 0.0007112306962552973, + "loss": 0.80125546, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.28320312, + "step": 1977, + "time_per_iteration": 2.5958874225616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086598, + "balance_loss_mlp": 1.05877423, + "epoch": 0.3805309734513274, + "flos": 521614080000.0, + "grad_norm": 0.055122671956433805, + "language_loss": 0.85178941, + "learning_rate": 0.0007109482792645896, + "loss": 0.8626554, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.27832031, + "step": 1978, + "time_per_iteration": 2.706073760986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081892, + "balance_loss_mlp": 1.05363917, + "epoch": 0.380723355136591, + "flos": 591128782848.0, + "grad_norm": 0.06407360303991923, + "language_loss": 0.83617824, + "learning_rate": 0.0007106657803764969, + "loss": 0.84699714, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.2824707, + "step": 1979, + "time_per_iteration": 2.7429239749908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078619, + "balance_loss_mlp": 1.05022287, + "epoch": 0.38091573682185453, + "flos": 622394910720.0, + "grad_norm": 0.07177583644367627, + "language_loss": 0.8165133, + "learning_rate": 0.0007103831997006948, + "loss": 0.82729954, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.28393555, + "step": 1980, + "time_per_iteration": 2.7360527515411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072489, + "balance_loss_mlp": 1.04361689, + "epoch": 0.3811081185071181, + "flos": 568716208128.0, + "grad_norm": 0.06360208542685557, + "language_loss": 0.85186386, + "learning_rate": 0.0007101005373468908, + "loss": 0.86258882, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.28833008, + "step": 1981, + "time_per_iteration": 2.925529718399048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066146, + "balance_loss_mlp": 1.03775024, + "epoch": 0.3813005001923817, + "flos": 584550798336.0, + "grad_norm": 0.051682910059599525, + "language_loss": 0.86574209, + "learning_rate": 0.0007098177934248242, + "loss": 0.87640351, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.28369141, + "step": 1982, + "time_per_iteration": 2.7813186645507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066502, + "balance_loss_mlp": 1.03770101, + "epoch": 0.38149288187764524, + "flos": 621287649792.0, + "grad_norm": 0.06153978169673806, + "language_loss": 0.85434651, + "learning_rate": 0.0007095349680442661, + "loss": 0.86501151, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.2878418, + "step": 1983, + "time_per_iteration": 2.878678321838379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069354, + "balance_loss_mlp": 1.04062414, + "epoch": 0.3816852635629088, + "flos": 570414196224.0, + "grad_norm": 0.05550499316869274, + "language_loss": 0.78828371, + "learning_rate": 0.0007092520613150188, + "loss": 0.79897726, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.28710938, + "step": 1984, + "time_per_iteration": 2.667602300643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069998, + "balance_loss_mlp": 1.04057729, + "epoch": 0.38187764524817236, + "flos": 565313029632.0, + "grad_norm": 0.04940974411679134, + "language_loss": 0.81105816, + "learning_rate": 0.0007089690733469165, + "loss": 0.82175809, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.29394531, + "step": 1985, + "time_per_iteration": 2.7445921897888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077693, + "balance_loss_mlp": 1.04924965, + "epoch": 0.38207002693343595, + "flos": 630932751360.0, + "grad_norm": 0.0710841944315155, + "language_loss": 0.82154202, + "learning_rate": 0.000708686004249825, + "loss": 0.8323189, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.28442383, + "step": 1986, + "time_per_iteration": 2.803262948989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075438, + "balance_loss_mlp": 1.0459218, + "epoch": 0.3822624086186995, + "flos": 548507980800.0, + "grad_norm": 0.053095768122865476, + "language_loss": 0.91283715, + "learning_rate": 0.0007084028541336413, + "loss": 0.92359161, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.29467773, + "step": 1987, + "time_per_iteration": 2.693894147872925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075901, + "balance_loss_mlp": 1.04807711, + "epoch": 0.38245479030396307, + "flos": 613571880960.0, + "grad_norm": 0.04978295407195845, + "language_loss": 0.86100876, + "learning_rate": 0.0007081196231082942, + "loss": 0.87176782, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.27807617, + "step": 1988, + "time_per_iteration": 2.8127198219299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079083, + "balance_loss_mlp": 1.05097318, + "epoch": 0.38264717198922665, + "flos": 667787466240.0, + "grad_norm": 0.05417702481979702, + "language_loss": 0.80060172, + "learning_rate": 0.0007078363112837436, + "loss": 0.81139255, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.28125, + "step": 1989, + "time_per_iteration": 2.8839027881622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077016, + "balance_loss_mlp": 1.04866838, + "epoch": 0.3828395536744902, + "flos": 454521922560.0, + "grad_norm": 0.05590772319077314, + "language_loss": 0.84895635, + "learning_rate": 0.000707552918769981, + "loss": 0.85972643, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.28344727, + "step": 1990, + "time_per_iteration": 2.4921815395355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075886, + "balance_loss_mlp": 1.04837239, + "epoch": 0.3830319353597538, + "flos": 499191330816.0, + "grad_norm": 0.05219115858491499, + "language_loss": 0.8389315, + "learning_rate": 0.000707269445677029, + "loss": 0.84969032, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.27563477, + "step": 1991, + "time_per_iteration": 2.716742753982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080405, + "balance_loss_mlp": 1.05205727, + "epoch": 0.3832243170450173, + "flos": 743787021312.0, + "grad_norm": 0.061454112768806295, + "language_loss": 0.85369635, + "learning_rate": 0.0007069858921149416, + "loss": 0.8645004, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.28344727, + "step": 1992, + "time_per_iteration": 2.953749418258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077015, + "balance_loss_mlp": 1.04919195, + "epoch": 0.3834166987302809, + "flos": 577937908224.0, + "grad_norm": 0.04324001999537677, + "language_loss": 0.86024761, + "learning_rate": 0.0007067022581938043, + "loss": 0.87101781, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.27880859, + "step": 1993, + "time_per_iteration": 2.818094491958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072064, + "balance_loss_mlp": 1.04502726, + "epoch": 0.3836090804155444, + "flos": 536194432512.0, + "grad_norm": 0.06003802076808944, + "language_loss": 0.83055973, + "learning_rate": 0.0007064185440237334, + "loss": 0.84128034, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.27075195, + "step": 1994, + "time_per_iteration": 2.7304775714874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078237, + "balance_loss_mlp": 1.05043745, + "epoch": 0.383801462100808, + "flos": 601587191808.0, + "grad_norm": 0.054248337050939024, + "language_loss": 0.84367561, + "learning_rate": 0.0007061347497148764, + "loss": 0.85445797, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.27807617, + "step": 1995, + "time_per_iteration": 2.747483015060425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074409, + "balance_loss_mlp": 1.04706264, + "epoch": 0.38399384378607154, + "flos": 572427896832.0, + "grad_norm": 0.06054830939074019, + "language_loss": 0.86660719, + "learning_rate": 0.0007058508753774122, + "loss": 0.87735128, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.27392578, + "step": 1996, + "time_per_iteration": 2.6960108280181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078362, + "balance_loss_mlp": 1.05165958, + "epoch": 0.38418622547133513, + "flos": 536513117184.0, + "grad_norm": 0.05196412840141252, + "language_loss": 0.86974967, + "learning_rate": 0.0007055669211215505, + "loss": 0.88053334, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.26733398, + "step": 1997, + "time_per_iteration": 2.6327381134033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076337, + "balance_loss_mlp": 1.04775071, + "epoch": 0.3843786071565987, + "flos": 572673798144.0, + "grad_norm": 0.06669720231739994, + "language_loss": 0.77213579, + "learning_rate": 0.0007052828870575322, + "loss": 0.78289914, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.28588867, + "step": 1998, + "time_per_iteration": 2.6813313961029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085238, + "balance_loss_mlp": 1.05808222, + "epoch": 0.38457098884186225, + "flos": 728361275904.0, + "grad_norm": 0.053007093293579055, + "language_loss": 0.8636111, + "learning_rate": 0.0007049987732956291, + "loss": 0.87446344, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.27197266, + "step": 1999, + "time_per_iteration": 2.9743165969848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071346, + "balance_loss_mlp": 1.04323626, + "epoch": 0.38476337052712584, + "flos": 583123442688.0, + "grad_norm": 0.046114011394728885, + "language_loss": 0.82846403, + "learning_rate": 0.0007047145799461439, + "loss": 0.83917749, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.28149414, + "step": 2000, + "time_per_iteration": 2.85295033454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077125, + "balance_loss_mlp": 1.0488013, + "epoch": 0.38495575221238937, + "flos": 552787075584.0, + "grad_norm": 0.06118237782788499, + "language_loss": 0.8185212, + "learning_rate": 0.00070443030711941, + "loss": 0.82929248, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.28295898, + "step": 2001, + "time_per_iteration": 2.7602195739746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078319, + "balance_loss_mlp": 1.04918385, + "epoch": 0.38514813389765296, + "flos": 654173190144.0, + "grad_norm": 0.06801983854699947, + "language_loss": 0.82348108, + "learning_rate": 0.0007041459549257924, + "loss": 0.83426422, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.29101562, + "step": 2002, + "time_per_iteration": 2.8562166690826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074316, + "balance_loss_mlp": 1.04565787, + "epoch": 0.3853405155829165, + "flos": 867715158528.0, + "grad_norm": 0.07124544558687326, + "language_loss": 0.7826004, + "learning_rate": 0.0007038615234756859, + "loss": 0.79334354, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.28662109, + "step": 2003, + "time_per_iteration": 3.1888484954833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071637, + "balance_loss_mlp": 1.0429796, + "epoch": 0.3855328972681801, + "flos": 546164011008.0, + "grad_norm": 0.060193135665447615, + "language_loss": 0.83578098, + "learning_rate": 0.000703577012879517, + "loss": 0.8464973, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.28662109, + "step": 2004, + "time_per_iteration": 2.6438684463500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069967, + "balance_loss_mlp": 1.04185688, + "epoch": 0.3857252789534436, + "flos": 533819939328.0, + "grad_norm": 0.05830751128665357, + "language_loss": 0.8852784, + "learning_rate": 0.0007032924232477423, + "loss": 0.89597809, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.28149414, + "step": 2005, + "time_per_iteration": 2.6632285118103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071337, + "balance_loss_mlp": 1.04253602, + "epoch": 0.3859176606387072, + "flos": 491514849792.0, + "grad_norm": 0.05522600702951118, + "language_loss": 0.8025552, + "learning_rate": 0.0007030077546908493, + "loss": 0.81326854, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.28808594, + "step": 2006, + "time_per_iteration": 2.6748647689819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107831, + "balance_loss_mlp": 1.06600749, + "epoch": 0.3861100423239708, + "flos": 1486278955008.0, + "grad_norm": 0.04192005891791234, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84142971, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.12255859, + "step": 2007, + "time_per_iteration": 4.758062124252319 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084632, + "balance_loss_mlp": 1.05614078, + "epoch": 0.3863024240092343, + "flos": 473493441024.0, + "grad_norm": 0.06495221526254255, + "language_loss": 0.79320729, + "learning_rate": 0.0007024381812438117, + "loss": 0.80405354, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.28515625, + "step": 2008, + "time_per_iteration": 2.557239532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095356, + "balance_loss_mlp": 1.06607771, + "epoch": 0.3864948056944979, + "flos": 716258723328.0, + "grad_norm": 0.09570560546772983, + "language_loss": 0.83017313, + "learning_rate": 0.0007021532765747951, + "loss": 0.84112668, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.29248047, + "step": 2009, + "time_per_iteration": 2.984100580215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088616, + "balance_loss_mlp": 1.06031561, + "epoch": 0.38668718737976143, + "flos": 727302067200.0, + "grad_norm": 0.05400711762269546, + "language_loss": 0.78963518, + "learning_rate": 0.0007018682934229162, + "loss": 0.80052131, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.28295898, + "step": 2010, + "time_per_iteration": 2.9302892684936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080883, + "balance_loss_mlp": 1.05220175, + "epoch": 0.386879569065025, + "flos": 525218079744.0, + "grad_norm": 0.05212566321061033, + "language_loss": 0.82523775, + "learning_rate": 0.0007015832318988152, + "loss": 0.83604658, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.28662109, + "step": 2011, + "time_per_iteration": 2.65934157371521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102736, + "balance_loss_mlp": 1.0158205, + "epoch": 0.38707195075028855, + "flos": 1527036005376.0, + "grad_norm": 0.016832038405886617, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74917436, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.11523438, + "step": 2012, + "time_per_iteration": 4.964378595352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076687, + "balance_loss_mlp": 1.04776716, + "epoch": 0.38726433243555214, + "flos": 557045821440.0, + "grad_norm": 0.05730560331399072, + "language_loss": 0.83868068, + "learning_rate": 0.0007010128741766604, + "loss": 0.84944755, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.28857422, + "step": 2013, + "time_per_iteration": 2.7196977138519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069593, + "balance_loss_mlp": 1.04005277, + "epoch": 0.38745671412081567, + "flos": 553431647232.0, + "grad_norm": 0.0608937159393576, + "language_loss": 0.843593, + "learning_rate": 0.0007007275782000391, + "loss": 0.85428894, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.29492188, + "step": 2014, + "time_per_iteration": 2.635704517364502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072342, + "balance_loss_mlp": 1.04351759, + "epoch": 0.38764909580607926, + "flos": 458175384576.0, + "grad_norm": 0.061731808628827385, + "language_loss": 0.84906852, + "learning_rate": 0.0007004422042940605, + "loss": 0.85979199, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.2878418, + "step": 2015, + "time_per_iteration": 2.500502824783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072405, + "balance_loss_mlp": 1.04246008, + "epoch": 0.38784147749134285, + "flos": 521973462528.0, + "grad_norm": 0.06410146749924231, + "language_loss": 0.89413089, + "learning_rate": 0.0007001567525695169, + "loss": 0.90485489, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.29931641, + "step": 2016, + "time_per_iteration": 2.6305129528045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072622, + "balance_loss_mlp": 1.04410672, + "epoch": 0.3880338591766064, + "flos": 665696600064.0, + "grad_norm": 0.057933083917186774, + "language_loss": 0.83612067, + "learning_rate": 0.0006998712231372303, + "loss": 0.84684694, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.28491211, + "step": 2017, + "time_per_iteration": 3.0175724029541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070577, + "balance_loss_mlp": 1.04141831, + "epoch": 0.38822624086186996, + "flos": 593660427264.0, + "grad_norm": 0.04866320553491467, + "language_loss": 0.86211008, + "learning_rate": 0.0006995856161080532, + "loss": 0.87281585, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.29101562, + "step": 2018, + "time_per_iteration": 2.879014015197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071313, + "balance_loss_mlp": 1.04193974, + "epoch": 0.3884186225471335, + "flos": 612256596480.0, + "grad_norm": 0.05910223086818918, + "language_loss": 0.81994784, + "learning_rate": 0.0006992999315928679, + "loss": 0.83066106, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.29345703, + "step": 2019, + "time_per_iteration": 2.794605255126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078638, + "balance_loss_mlp": 1.04826391, + "epoch": 0.3886110042323971, + "flos": 606737820672.0, + "grad_norm": 0.0551019421553566, + "language_loss": 0.86098075, + "learning_rate": 0.0006990141697025871, + "loss": 0.8717671, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.3034668, + "step": 2020, + "time_per_iteration": 2.808492422103882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056045, + "balance_loss_mlp": 1.04388523, + "epoch": 0.3888033859176606, + "flos": 1527289108992.0, + "grad_norm": 0.03291843471702338, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77415681, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.12158203, + "step": 2021, + "time_per_iteration": 4.747381687164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070034, + "balance_loss_mlp": 1.04109025, + "epoch": 0.3889957676029242, + "flos": 692145340416.0, + "grad_norm": 0.0700535467402408, + "language_loss": 0.82436341, + "learning_rate": 0.0006984424142405392, + "loss": 0.83506376, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.28930664, + "step": 2022, + "time_per_iteration": 2.8081154823303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070367, + "balance_loss_mlp": 1.04144704, + "epoch": 0.3891881492881878, + "flos": 514937170944.0, + "grad_norm": 0.06604387927811756, + "language_loss": 0.81889653, + "learning_rate": 0.0006981564208907474, + "loss": 0.82960021, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.2890625, + "step": 2023, + "time_per_iteration": 2.615868091583252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067731, + "balance_loss_mlp": 1.03947854, + "epoch": 0.3893805309734513, + "flos": 628770663936.0, + "grad_norm": 0.05337785231387105, + "language_loss": 0.90169919, + "learning_rate": 0.0006978703506098102, + "loss": 0.91237652, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.2824707, + "step": 2024, + "time_per_iteration": 2.7487242221832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070747, + "balance_loss_mlp": 1.04292357, + "epoch": 0.3895729126587149, + "flos": 543894234624.0, + "grad_norm": 0.05102180718564601, + "language_loss": 0.87631416, + "learning_rate": 0.00069758420350879, + "loss": 0.88702166, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.27832031, + "step": 2025, + "time_per_iteration": 2.6278607845306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066065, + "balance_loss_mlp": 1.03802657, + "epoch": 0.38976529434397844, + "flos": 617987778048.0, + "grad_norm": 0.05496821729843788, + "language_loss": 0.85941356, + "learning_rate": 0.000697297979698779, + "loss": 0.87007421, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.28051758, + "step": 2026, + "time_per_iteration": 2.773711919784546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072256, + "balance_loss_mlp": 1.0449574, + "epoch": 0.38995767602924203, + "flos": 834518287872.0, + "grad_norm": 0.054849440695872026, + "language_loss": 0.83735013, + "learning_rate": 0.0006970116792908992, + "loss": 0.84807271, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.27368164, + "step": 2027, + "time_per_iteration": 3.1274263858795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071715, + "balance_loss_mlp": 1.04348612, + "epoch": 0.39015005771450556, + "flos": 541343651328.0, + "grad_norm": 0.0501662810644282, + "language_loss": 0.80959415, + "learning_rate": 0.000696725302396302, + "loss": 0.82031131, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.28222656, + "step": 2028, + "time_per_iteration": 2.653289318084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078388, + "balance_loss_mlp": 1.050946, + "epoch": 0.39034243939976915, + "flos": 1007102536704.0, + "grad_norm": 0.053195529027894116, + "language_loss": 0.85790342, + "learning_rate": 0.0006964388491261692, + "loss": 0.86868727, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.2746582, + "step": 2029, + "time_per_iteration": 3.2972047328948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082882, + "balance_loss_mlp": 1.0550828, + "epoch": 0.3905348210850327, + "flos": 678723121152.0, + "grad_norm": 0.06114884672927749, + "language_loss": 0.87352717, + "learning_rate": 0.0006961523195917114, + "loss": 0.88435602, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.27832031, + "step": 2030, + "time_per_iteration": 2.8415944576263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083514, + "balance_loss_mlp": 1.0548079, + "epoch": 0.39072720277029627, + "flos": 548606905344.0, + "grad_norm": 0.056999957489140544, + "language_loss": 0.78065526, + "learning_rate": 0.0006958657139041696, + "loss": 0.79149044, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.28686523, + "step": 2031, + "time_per_iteration": 2.750596761703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027529, + "balance_loss_mlp": 1.01660919, + "epoch": 0.39091958445555985, + "flos": 1546912401408.0, + "grad_norm": 0.015090316928766313, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77740502, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.109375, + "step": 2032, + "time_per_iteration": 4.916932106018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080774, + "balance_loss_mlp": 1.05371356, + "epoch": 0.3911119661408234, + "flos": 503741058048.0, + "grad_norm": 0.058882626995900515, + "language_loss": 0.77978921, + "learning_rate": 0.0006952922745149434, + "loss": 0.7905969, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.27099609, + "step": 2033, + "time_per_iteration": 2.6288254261016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076329, + "balance_loss_mlp": 1.04802871, + "epoch": 0.391304347826087, + "flos": 556967245824.0, + "grad_norm": 0.059683993490508125, + "language_loss": 0.8774389, + "learning_rate": 0.000695005441035888, + "loss": 0.88820225, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.28295898, + "step": 2034, + "time_per_iteration": 2.6451032161712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021075, + "balance_loss_mlp": 1.01001287, + "epoch": 0.3914967295113505, + "flos": 1499309858304.0, + "grad_norm": 0.012767183735830537, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74744511, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.11083984, + "step": 2035, + "time_per_iteration": 4.875540018081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081101, + "balance_loss_mlp": 1.05346835, + "epoch": 0.3916891111966141, + "flos": 706715518464.0, + "grad_norm": 0.05871453648610719, + "language_loss": 0.8120997, + "learning_rate": 0.0006944315470656863, + "loss": 0.82291067, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.27685547, + "step": 2036, + "time_per_iteration": 2.9991486072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079422, + "balance_loss_mlp": 1.05193281, + "epoch": 0.3918814928818776, + "flos": 556085537280.0, + "grad_norm": 0.05954449002694624, + "language_loss": 0.90806162, + "learning_rate": 0.000694144486797345, + "loss": 0.91885585, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.27539062, + "step": 2037, + "time_per_iteration": 2.652540445327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016452, + "balance_loss_mlp": 1.00543678, + "epoch": 0.3920738745671412, + "flos": 1537845032448.0, + "grad_norm": 0.010331538207496795, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80536884, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.11035156, + "step": 2038, + "time_per_iteration": 4.696615695953369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077334, + "balance_loss_mlp": 1.04920101, + "epoch": 0.39226625625240474, + "flos": 498594811392.0, + "grad_norm": 0.05886678367995608, + "language_loss": 0.89078939, + "learning_rate": 0.0006935701402514156, + "loss": 0.90156269, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.28149414, + "step": 2039, + "time_per_iteration": 2.555340051651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013561, + "balance_loss_mlp": 1.00254571, + "epoch": 0.39245863793766833, + "flos": 1346465203200.0, + "grad_norm": 0.009976601144167605, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74048454, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.11035156, + "step": 2040, + "time_per_iteration": 4.91499400138855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.04941869, + "epoch": 0.3926510196229319, + "flos": 1345614474240.0, + "grad_norm": 0.0656092448350418, + "language_loss": 0.84421289, + "learning_rate": 0.0006929954931031422, + "loss": 0.8549906, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.28344727, + "step": 2041, + "time_per_iteration": 3.729060649871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079221, + "balance_loss_mlp": 1.0521127, + "epoch": 0.39284340130819545, + "flos": 499333925376.0, + "grad_norm": 0.05672023255092622, + "language_loss": 0.88579351, + "learning_rate": 0.0006927080570819805, + "loss": 0.8965857, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.27148438, + "step": 2042, + "time_per_iteration": 2.5964105129241943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082321, + "balance_loss_mlp": 1.05557048, + "epoch": 0.39303578299345904, + "flos": 520077625344.0, + "grad_norm": 0.07129276434353096, + "language_loss": 0.81115568, + "learning_rate": 0.0006924205462449161, + "loss": 0.82197881, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.26806641, + "step": 2043, + "time_per_iteration": 2.585873603820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080679, + "balance_loss_mlp": 1.0537734, + "epoch": 0.39322816467872257, + "flos": 907529301504.0, + "grad_norm": 0.07610386660927036, + "language_loss": 0.8177464, + "learning_rate": 0.0006921329607035702, + "loss": 0.8285532, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.26940918, + "step": 2044, + "time_per_iteration": 3.238981246948242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087504, + "balance_loss_mlp": 1.0611347, + "epoch": 0.39342054636398616, + "flos": 517330603008.0, + "grad_norm": 0.0570655681013956, + "language_loss": 0.87757248, + "learning_rate": 0.0006918453005695938, + "loss": 0.88844752, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.26416016, + "step": 2045, + "time_per_iteration": 2.6602108478546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091027, + "balance_loss_mlp": 1.06491971, + "epoch": 0.3936129280492497, + "flos": 547646621184.0, + "grad_norm": 0.055879562404771856, + "language_loss": 0.84307766, + "learning_rate": 0.0006915575659546662, + "loss": 0.85398793, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.26147461, + "step": 2046, + "time_per_iteration": 2.6592600345611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091389, + "balance_loss_mlp": 1.06476951, + "epoch": 0.3938053097345133, + "flos": 525858269184.0, + "grad_norm": 0.06494345942268129, + "language_loss": 0.80426449, + "learning_rate": 0.0006912697569704959, + "loss": 0.81517833, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.26623535, + "step": 2047, + "time_per_iteration": 2.613070011138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080678, + "balance_loss_mlp": 1.0539515, + "epoch": 0.39399769141977686, + "flos": 471390990336.0, + "grad_norm": 0.06871552578761372, + "language_loss": 0.86815077, + "learning_rate": 0.0006909818737288205, + "loss": 0.87895757, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.26745605, + "step": 2048, + "time_per_iteration": 2.5862643718719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086213, + "balance_loss_mlp": 1.05919969, + "epoch": 0.3941900731050404, + "flos": 501490220544.0, + "grad_norm": 0.055462609864315775, + "language_loss": 0.80754077, + "learning_rate": 0.000690693916341406, + "loss": 0.81840289, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.27075195, + "step": 2049, + "time_per_iteration": 2.668114185333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010802, + "balance_loss_mlp": 1.0532347, + "epoch": 0.394382454790304, + "flos": 580577241600.0, + "grad_norm": 0.05123788091691057, + "language_loss": 0.8241666, + "learning_rate": 0.0006904058849200475, + "loss": 0.83496863, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.27001953, + "step": 2050, + "time_per_iteration": 2.7161009311676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084281, + "balance_loss_mlp": 1.05679107, + "epoch": 0.3945748364755675, + "flos": 513563659776.0, + "grad_norm": 0.06391064418382593, + "language_loss": 0.84741384, + "learning_rate": 0.0006901177795765683, + "loss": 0.8582567, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.27514648, + "step": 2051, + "time_per_iteration": 2.6012356281280518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082278, + "balance_loss_mlp": 1.05540872, + "epoch": 0.3947672181608311, + "flos": 593683748352.0, + "grad_norm": 0.059538956745971455, + "language_loss": 0.8114661, + "learning_rate": 0.0006898296004228213, + "loss": 0.82228893, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.26879883, + "step": 2052, + "time_per_iteration": 2.739016056060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091682, + "balance_loss_mlp": 1.07909358, + "epoch": 0.39495959984609463, + "flos": 1546829443584.0, + "grad_norm": 0.0435951911950544, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79218423, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.12597656, + "step": 2053, + "time_per_iteration": 4.853093385696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107698, + "balance_loss_mlp": 1.0498004, + "epoch": 0.3951519815313582, + "flos": 496271190528.0, + "grad_norm": 0.061585922129253, + "language_loss": 0.79790258, + "learning_rate": 0.0006892530211320763, + "loss": 0.80867237, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.2722168, + "step": 2054, + "time_per_iteration": 2.695810317993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077935, + "balance_loss_mlp": 1.05135143, + "epoch": 0.39534436321662175, + "flos": 530934704640.0, + "grad_norm": 0.06739666157176663, + "language_loss": 0.83483803, + "learning_rate": 0.000688964621218926, + "loss": 0.84561741, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.26611328, + "step": 2055, + "time_per_iteration": 2.5957767963409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070974, + "balance_loss_mlp": 1.04496288, + "epoch": 0.39553674490188534, + "flos": 702224017920.0, + "grad_norm": 0.05900978816729325, + "language_loss": 0.79760778, + "learning_rate": 0.0006886761479432037, + "loss": 0.80831754, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.26037598, + "step": 2056, + "time_per_iteration": 2.823195457458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075075, + "balance_loss_mlp": 1.0479672, + "epoch": 0.3957291265871489, + "flos": 409552768512.0, + "grad_norm": 0.06325658180551426, + "language_loss": 0.84495139, + "learning_rate": 0.0006883876014169045, + "loss": 0.85570216, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.27148438, + "step": 2057, + "time_per_iteration": 2.504899263381958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_mlp": 1.05080771, + "epoch": 0.39592150827241246, + "flos": 618204566016.0, + "grad_norm": 0.05952155235087993, + "language_loss": 0.90666497, + "learning_rate": 0.000688098981752052, + "loss": 0.91744673, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.27441406, + "step": 2058, + "time_per_iteration": 2.705845832824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079753, + "balance_loss_mlp": 1.05207229, + "epoch": 0.39611388995767605, + "flos": 820986969600.0, + "grad_norm": 0.057037005783434964, + "language_loss": 0.80068249, + "learning_rate": 0.0006878102890606982, + "loss": 0.81147999, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.27709961, + "step": 2059, + "time_per_iteration": 3.086745500564575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108134, + "balance_loss_mlp": 1.0542556, + "epoch": 0.3963062716429396, + "flos": 491977539072.0, + "grad_norm": 0.07822530462482143, + "language_loss": 0.80866635, + "learning_rate": 0.0006875215234549239, + "loss": 0.8194797, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.27124023, + "step": 2060, + "time_per_iteration": 2.5814599990844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080083, + "balance_loss_mlp": 1.05221188, + "epoch": 0.39649865332820317, + "flos": 584466430464.0, + "grad_norm": 0.06673254145899743, + "language_loss": 0.85142004, + "learning_rate": 0.0006872326850468376, + "loss": 0.86222088, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.27880859, + "step": 2061, + "time_per_iteration": 2.6693742275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081472, + "balance_loss_mlp": 1.05343366, + "epoch": 0.3966910350134667, + "flos": 458328153600.0, + "grad_norm": 0.06184749895138045, + "language_loss": 0.78875667, + "learning_rate": 0.0006869437739485762, + "loss": 0.79957139, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.28051758, + "step": 2062, + "time_per_iteration": 2.612020969390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108316, + "balance_loss_mlp": 1.05493176, + "epoch": 0.3968834166987303, + "flos": 508388299776.0, + "grad_norm": 0.07174128592683177, + "language_loss": 0.92295337, + "learning_rate": 0.0006866547902723053, + "loss": 0.93378496, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.2824707, + "step": 2063, + "time_per_iteration": 2.676013469696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108135, + "balance_loss_mlp": 1.05300224, + "epoch": 0.3970757983839938, + "flos": 572349321216.0, + "grad_norm": 0.05898261192449876, + "language_loss": 0.80094039, + "learning_rate": 0.000686365734130218, + "loss": 0.81175387, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.28369141, + "step": 2064, + "time_per_iteration": 2.7021024227142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071448, + "balance_loss_mlp": 1.0426228, + "epoch": 0.3972681800692574, + "flos": 481391092224.0, + "grad_norm": 0.09101918864834832, + "language_loss": 0.83948302, + "learning_rate": 0.000686076605634536, + "loss": 0.85019755, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.28808594, + "step": 2065, + "time_per_iteration": 2.6558356285095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068247, + "balance_loss_mlp": 1.03963661, + "epoch": 0.397460561754521, + "flos": 487683887616.0, + "grad_norm": 0.05840936356543045, + "language_loss": 0.83999312, + "learning_rate": 0.0006857874048975088, + "loss": 0.85067558, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.28613281, + "step": 2066, + "time_per_iteration": 2.556900978088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068316, + "balance_loss_mlp": 1.04027796, + "epoch": 0.3976529434397845, + "flos": 421768802304.0, + "grad_norm": 0.07585091480167282, + "language_loss": 0.87176585, + "learning_rate": 0.0006854981320314142, + "loss": 0.88244903, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.28027344, + "step": 2067, + "time_per_iteration": 2.445798635482788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107237, + "balance_loss_mlp": 1.04426003, + "epoch": 0.3978453251250481, + "flos": 545331764736.0, + "grad_norm": 0.08763476788371415, + "language_loss": 0.86982906, + "learning_rate": 0.0006852087871485579, + "loss": 0.88055265, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.28125, + "step": 2068, + "time_per_iteration": 2.6390161514282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076434, + "balance_loss_mlp": 1.04861069, + "epoch": 0.39803770681031164, + "flos": 650548841472.0, + "grad_norm": 0.065510260101048, + "language_loss": 0.82088625, + "learning_rate": 0.0006849193703612735, + "loss": 0.83165061, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.27856445, + "step": 2069, + "time_per_iteration": 2.763023614883423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071, + "balance_loss_mlp": 1.04346275, + "epoch": 0.39823008849557523, + "flos": 739734888960.0, + "grad_norm": 0.058439166966186944, + "language_loss": 0.77565378, + "learning_rate": 0.0006846298817819225, + "loss": 0.78636372, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.27563477, + "step": 2070, + "time_per_iteration": 2.948054790496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070331, + "balance_loss_mlp": 1.04296088, + "epoch": 0.39842247018083876, + "flos": 384825337344.0, + "grad_norm": 0.06370866866163034, + "language_loss": 0.80921137, + "learning_rate": 0.0006843403215228945, + "loss": 0.8199147, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.27392578, + "step": 2071, + "time_per_iteration": 2.440274953842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075017, + "balance_loss_mlp": 1.04771829, + "epoch": 0.39861485186610235, + "flos": 533431443456.0, + "grad_norm": 0.05754797735781241, + "language_loss": 0.80491692, + "learning_rate": 0.0006840506896966065, + "loss": 0.81566709, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.2734375, + "step": 2072, + "time_per_iteration": 2.7141849994659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076402, + "balance_loss_mlp": 1.04874492, + "epoch": 0.39880723355136594, + "flos": 642834482688.0, + "grad_norm": 0.06436648215160112, + "language_loss": 0.82351565, + "learning_rate": 0.0006837609864155038, + "loss": 0.83427966, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.27685547, + "step": 2073, + "time_per_iteration": 2.8728160858154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107952, + "balance_loss_mlp": 1.05267441, + "epoch": 0.39899961523662947, + "flos": 515587534848.0, + "grad_norm": 0.06075069456973031, + "language_loss": 0.83255166, + "learning_rate": 0.0006834712117920592, + "loss": 0.84334683, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.26855469, + "step": 2074, + "time_per_iteration": 2.6078460216522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081959, + "balance_loss_mlp": 1.05458879, + "epoch": 0.39919199692189306, + "flos": 464148085248.0, + "grad_norm": 0.08105254072349301, + "language_loss": 0.85028476, + "learning_rate": 0.0006831813659387729, + "loss": 0.86110437, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.27416992, + "step": 2075, + "time_per_iteration": 2.5435502529144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080066, + "balance_loss_mlp": 1.05236197, + "epoch": 0.3993843786071566, + "flos": 531382837248.0, + "grad_norm": 0.05543733258884828, + "language_loss": 0.84105802, + "learning_rate": 0.0006828914489681733, + "loss": 0.85185862, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.27758789, + "step": 2076, + "time_per_iteration": 2.716728687286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079901, + "balance_loss_mlp": 1.05186319, + "epoch": 0.3995767602924202, + "flos": 503701770240.0, + "grad_norm": 0.05894989539880716, + "language_loss": 0.8515023, + "learning_rate": 0.0006826014609928162, + "loss": 0.86230129, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.28027344, + "step": 2077, + "time_per_iteration": 2.740797996520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036252, + "balance_loss_mlp": 1.02490366, + "epoch": 0.3997691419776837, + "flos": 1453780500480.0, + "grad_norm": 0.025465037646940157, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84235638, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.11328125, + "step": 2078, + "time_per_iteration": 4.832703590393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080719, + "balance_loss_mlp": 1.05287147, + "epoch": 0.3999615236629473, + "flos": 530418170880.0, + "grad_norm": 0.11662193334808049, + "language_loss": 0.8017869, + "learning_rate": 0.0006820212724781896, + "loss": 0.81259406, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.27880859, + "step": 2079, + "time_per_iteration": 2.6742663383483887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076717, + "balance_loss_mlp": 1.0488224, + "epoch": 0.4001539053482108, + "flos": 694823961600.0, + "grad_norm": 0.08177152300224107, + "language_loss": 0.83806193, + "learning_rate": 0.0006817310721641694, + "loss": 0.84882903, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.27905273, + "step": 2080, + "time_per_iteration": 2.8349008560180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076923, + "balance_loss_mlp": 1.04929078, + "epoch": 0.4003462870334744, + "flos": 520102356480.0, + "grad_norm": 0.06565277329590896, + "language_loss": 0.84214735, + "learning_rate": 0.00068144080129589, + "loss": 0.8529166, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.27685547, + "step": 2081, + "time_per_iteration": 2.6278159618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084354, + "balance_loss_mlp": 1.05710232, + "epoch": 0.400538668718738, + "flos": 492272902656.0, + "grad_norm": 0.05776018351639151, + "language_loss": 0.82856774, + "learning_rate": 0.0006811504599860441, + "loss": 0.83941126, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.27294922, + "step": 2082, + "time_per_iteration": 2.569265365600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088899, + "balance_loss_mlp": 1.06140924, + "epoch": 0.40073105040400153, + "flos": 490083111936.0, + "grad_norm": 0.07401045054208001, + "language_loss": 0.85797036, + "learning_rate": 0.0006808600483473526, + "loss": 0.86885935, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.27490234, + "step": 2083, + "time_per_iteration": 2.8923354148864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079743, + "balance_loss_mlp": 1.05170512, + "epoch": 0.4009234320892651, + "flos": 562088761344.0, + "grad_norm": 0.06499053200862517, + "language_loss": 0.86023808, + "learning_rate": 0.0006805695664925629, + "loss": 0.87103558, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.28027344, + "step": 2084, + "time_per_iteration": 2.8025314807891846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082967, + "balance_loss_mlp": 1.05461943, + "epoch": 0.40111581377452865, + "flos": 425786029056.0, + "grad_norm": 0.06817943175075042, + "language_loss": 0.8386181, + "learning_rate": 0.0006802790145344506, + "loss": 0.84944773, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.28344727, + "step": 2085, + "time_per_iteration": 2.5035839080810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075393, + "balance_loss_mlp": 1.04725957, + "epoch": 0.40130819545979224, + "flos": 612148907520.0, + "grad_norm": 0.06401081868364573, + "language_loss": 0.87169802, + "learning_rate": 0.0006799883925858176, + "loss": 0.88245201, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.28125, + "step": 2086, + "time_per_iteration": 2.8827152252197266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088527, + "balance_loss_mlp": 1.05989313, + "epoch": 0.40150057714505577, + "flos": 523179648000.0, + "grad_norm": 0.06559731004413262, + "language_loss": 0.85316324, + "learning_rate": 0.0006796977007594933, + "loss": 0.86404848, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.28637695, + "step": 2087, + "time_per_iteration": 2.5959601402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094266, + "balance_loss_mlp": 1.06553721, + "epoch": 0.40169295883031936, + "flos": 561143033856.0, + "grad_norm": 0.12268552055269868, + "language_loss": 0.86342102, + "learning_rate": 0.0006794069391683345, + "loss": 0.87436372, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.28710938, + "step": 2088, + "time_per_iteration": 2.7393155097961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089464, + "balance_loss_mlp": 1.06087732, + "epoch": 0.4018853405155829, + "flos": 518743401984.0, + "grad_norm": 0.0717880154934153, + "language_loss": 0.80560589, + "learning_rate": 0.0006791161079252248, + "loss": 0.81650054, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.28588867, + "step": 2089, + "time_per_iteration": 2.608919858932495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098549, + "balance_loss_mlp": 1.06879497, + "epoch": 0.4020777222008465, + "flos": 525957193728.0, + "grad_norm": 0.06954460778471602, + "language_loss": 0.8248291, + "learning_rate": 0.0006788252071430747, + "loss": 0.83581454, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.29711914, + "step": 2090, + "time_per_iteration": 2.682352304458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103196, + "balance_loss_mlp": 1.07429934, + "epoch": 0.40227010388611006, + "flos": 525494504448.0, + "grad_norm": 0.07587120880411238, + "language_loss": 0.8680824, + "learning_rate": 0.0006785342369348222, + "loss": 0.87911433, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.28857422, + "step": 2091, + "time_per_iteration": 2.7333736419677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104599, + "balance_loss_mlp": 1.07579792, + "epoch": 0.4024624855713736, + "flos": 432074442240.0, + "grad_norm": 0.07069251800195664, + "language_loss": 0.7977879, + "learning_rate": 0.0006782431974134316, + "loss": 0.8088339, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.2878418, + "step": 2092, + "time_per_iteration": 2.541607141494751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105121, + "balance_loss_mlp": 1.0768441, + "epoch": 0.4026548672566372, + "flos": 766304312832.0, + "grad_norm": 0.05426777537327344, + "language_loss": 0.89421535, + "learning_rate": 0.0006779520886918949, + "loss": 0.90526658, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.2824707, + "step": 2093, + "time_per_iteration": 3.035090684890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102418, + "balance_loss_mlp": 1.07378376, + "epoch": 0.4028472489419007, + "flos": 642636633600.0, + "grad_norm": 0.07593649947233896, + "language_loss": 0.81461406, + "learning_rate": 0.0006776609108832301, + "loss": 0.82563823, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.28637695, + "step": 2094, + "time_per_iteration": 2.8035519123077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102, + "balance_loss_mlp": 1.07398582, + "epoch": 0.4030396306271643, + "flos": 491593425408.0, + "grad_norm": 0.07164022458424311, + "language_loss": 0.85034972, + "learning_rate": 0.0006773696641004828, + "loss": 0.86136973, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.28027344, + "step": 2095, + "time_per_iteration": 2.6036603450775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100016, + "balance_loss_mlp": 1.07147717, + "epoch": 0.40323201231242783, + "flos": 901363133952.0, + "grad_norm": 0.07309254376996902, + "language_loss": 0.77576917, + "learning_rate": 0.0006770783484567247, + "loss": 0.78676933, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.28515625, + "step": 2096, + "time_per_iteration": 3.1005897521972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093493, + "balance_loss_mlp": 1.06557441, + "epoch": 0.4034243939976914, + "flos": 570267219456.0, + "grad_norm": 0.04872529153034484, + "language_loss": 0.86118937, + "learning_rate": 0.000676786964065055, + "loss": 0.87212431, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.27978516, + "step": 2097, + "time_per_iteration": 2.78965163230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093986, + "balance_loss_mlp": 1.06680584, + "epoch": 0.403616775682955, + "flos": 507206845440.0, + "grad_norm": 0.06867709967223685, + "language_loss": 0.78839391, + "learning_rate": 0.0006764955110385986, + "loss": 0.79933375, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.2722168, + "step": 2098, + "time_per_iteration": 2.7579219341278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090863, + "balance_loss_mlp": 1.06361151, + "epoch": 0.40380915736821854, + "flos": 519127515648.0, + "grad_norm": 0.0577520756279271, + "language_loss": 0.80600876, + "learning_rate": 0.0006762039894905083, + "loss": 0.81691736, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.27294922, + "step": 2099, + "time_per_iteration": 2.632434129714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083488, + "balance_loss_mlp": 1.05595064, + "epoch": 0.40400153905348213, + "flos": 441686048256.0, + "grad_norm": 0.06925599284799831, + "language_loss": 0.80233157, + "learning_rate": 0.000675912399533962, + "loss": 0.8131665, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.27563477, + "step": 2100, + "time_per_iteration": 2.521758556365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086411, + "balance_loss_mlp": 1.05947018, + "epoch": 0.40419392073874566, + "flos": 771961300992.0, + "grad_norm": 0.05734073179456058, + "language_loss": 0.84850854, + "learning_rate": 0.0006756207412821656, + "loss": 0.85937262, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.26977539, + "step": 2101, + "time_per_iteration": 3.043041944503784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079398, + "balance_loss_mlp": 1.05245721, + "epoch": 0.40438630242400925, + "flos": 766215562752.0, + "grad_norm": 0.07220576126006613, + "language_loss": 0.80240154, + "learning_rate": 0.0006753290148483505, + "loss": 0.81319559, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.27001953, + "step": 2102, + "time_per_iteration": 3.0245606899261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085045, + "balance_loss_mlp": 1.05726886, + "epoch": 0.4045786841092728, + "flos": 415013317632.0, + "grad_norm": 0.06170005058098184, + "language_loss": 0.78875476, + "learning_rate": 0.0006750372203457752, + "loss": 0.79960519, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.27832031, + "step": 2103, + "time_per_iteration": 2.484698534011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078758, + "balance_loss_mlp": 1.05131626, + "epoch": 0.40477106579453637, + "flos": 538941454848.0, + "grad_norm": 0.05090920908511917, + "language_loss": 0.86534655, + "learning_rate": 0.0006747453578877242, + "loss": 0.87613416, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.27490234, + "step": 2104, + "time_per_iteration": 2.69670033454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081019, + "balance_loss_mlp": 1.05281401, + "epoch": 0.4049634474797999, + "flos": 826358768640.0, + "grad_norm": 0.06546748387286302, + "language_loss": 0.8289392, + "learning_rate": 0.0006744534275875085, + "loss": 0.83974934, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.28222656, + "step": 2105, + "time_per_iteration": 2.9919168949127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083118, + "balance_loss_mlp": 1.05620074, + "epoch": 0.4051558291650635, + "flos": 572417722368.0, + "grad_norm": 0.0635527467859112, + "language_loss": 0.8582921, + "learning_rate": 0.0006741614295584657, + "loss": 0.86912322, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.26977539, + "step": 2106, + "time_per_iteration": 2.6488401889801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107849, + "balance_loss_mlp": 1.05073833, + "epoch": 0.4053482108503271, + "flos": 731541874176.0, + "grad_norm": 0.057690605181557136, + "language_loss": 0.78413224, + "learning_rate": 0.0006738693639139595, + "loss": 0.79491717, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.27807617, + "step": 2107, + "time_per_iteration": 2.9652647972106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078123, + "balance_loss_mlp": 1.05015635, + "epoch": 0.4055405925355906, + "flos": 1212588292608.0, + "grad_norm": 0.05945372540383898, + "language_loss": 0.77655667, + "learning_rate": 0.0006735772307673796, + "loss": 0.78733784, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.27978516, + "step": 2108, + "time_per_iteration": 3.5789337158203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079955, + "balance_loss_mlp": 1.05222702, + "epoch": 0.4057329742208542, + "flos": 715553104896.0, + "grad_norm": 0.05752735064114104, + "language_loss": 0.83347392, + "learning_rate": 0.0006732850302321421, + "loss": 0.84427351, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.27783203, + "step": 2109, + "time_per_iteration": 2.869591236114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078846, + "balance_loss_mlp": 1.051476, + "epoch": 0.4059253559061177, + "flos": 564623377920.0, + "grad_norm": 0.06455621073123653, + "language_loss": 0.84327263, + "learning_rate": 0.00067299276242169, + "loss": 0.85406113, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.27441406, + "step": 2110, + "time_per_iteration": 2.673659563064575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082258, + "balance_loss_mlp": 1.07071877, + "epoch": 0.4061177375913813, + "flos": 1592886919680.0, + "grad_norm": 0.036236061846660186, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75464427, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.11523438, + "step": 2111, + "time_per_iteration": 4.886230230331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082274, + "balance_loss_mlp": 1.05490351, + "epoch": 0.40631011927664484, + "flos": 615122892288.0, + "grad_norm": 0.05646906793429633, + "language_loss": 0.77664089, + "learning_rate": 0.0006724080254290395, + "loss": 0.78746361, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.27416992, + "step": 2112, + "time_per_iteration": 2.8506221771240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076539, + "balance_loss_mlp": 1.04847741, + "epoch": 0.40650250096190843, + "flos": 557390647296.0, + "grad_norm": 0.06356712121797842, + "language_loss": 0.89422435, + "learning_rate": 0.0006721155564738566, + "loss": 0.90498972, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.28100586, + "step": 2113, + "time_per_iteration": 2.673015832901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037996, + "balance_loss_mlp": 1.02626586, + "epoch": 0.40669488264717196, + "flos": 1579301756928.0, + "grad_norm": 0.019828324636468348, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79660642, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.1171875, + "step": 2114, + "time_per_iteration": 5.003857851028442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080097, + "balance_loss_mlp": 1.0521065, + "epoch": 0.40688726433243555, + "flos": 507398902272.0, + "grad_norm": 0.07124796283110259, + "language_loss": 0.85397822, + "learning_rate": 0.0006715304182135078, + "loss": 0.86477917, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.2800293, + "step": 2115, + "time_per_iteration": 2.641721248626709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108294, + "balance_loss_mlp": 1.05418694, + "epoch": 0.40707964601769914, + "flos": 588757109760.0, + "grad_norm": 0.08996962933736626, + "language_loss": 0.88862896, + "learning_rate": 0.0006712377491355127, + "loss": 0.89945835, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.28735352, + "step": 2116, + "time_per_iteration": 2.880159616470337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077208, + "balance_loss_mlp": 1.04857373, + "epoch": 0.40727202770296267, + "flos": 580134901248.0, + "grad_norm": 0.046629180459365246, + "language_loss": 0.81631374, + "learning_rate": 0.0006709450135771274, + "loss": 0.82708585, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.28637695, + "step": 2117, + "time_per_iteration": 2.9391822814941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107698, + "balance_loss_mlp": 1.04953849, + "epoch": 0.40746440938822626, + "flos": 503819633664.0, + "grad_norm": 0.05926883506924263, + "language_loss": 0.86382973, + "learning_rate": 0.0006706522116520023, + "loss": 0.87459958, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.27490234, + "step": 2118, + "time_per_iteration": 2.6404170989990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078957, + "balance_loss_mlp": 1.05072808, + "epoch": 0.4076567910734898, + "flos": 605323611648.0, + "grad_norm": 0.06371775766221305, + "language_loss": 0.82902479, + "learning_rate": 0.0006703593434738127, + "loss": 0.83981442, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.28222656, + "step": 2119, + "time_per_iteration": 2.6982903480529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080441, + "balance_loss_mlp": 1.05216455, + "epoch": 0.4078491727587534, + "flos": 479313372672.0, + "grad_norm": 0.05030428863920766, + "language_loss": 0.78137958, + "learning_rate": 0.0006700664091562604, + "loss": 0.792184, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.28271484, + "step": 2120, + "time_per_iteration": 2.5976343154907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081224, + "balance_loss_mlp": 1.05259037, + "epoch": 0.4080415544440169, + "flos": 510126985728.0, + "grad_norm": 0.05481620044617693, + "language_loss": 0.85151196, + "learning_rate": 0.0006697734088130725, + "loss": 0.86232412, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.28637695, + "step": 2121, + "time_per_iteration": 2.613192558288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085667, + "balance_loss_mlp": 1.05665159, + "epoch": 0.4082339361292805, + "flos": 734318009856.0, + "grad_norm": 0.0674188074849357, + "language_loss": 0.85445356, + "learning_rate": 0.0006694803425580018, + "loss": 0.86531019, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.28955078, + "step": 2122, + "time_per_iteration": 2.9808695316314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085803, + "balance_loss_mlp": 1.05585766, + "epoch": 0.4084263178145441, + "flos": 457239831552.0, + "grad_norm": 0.06189748292204317, + "language_loss": 0.8466748, + "learning_rate": 0.0006691872105048268, + "loss": 0.85753286, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.29907227, + "step": 2123, + "time_per_iteration": 2.5712099075317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089254, + "balance_loss_mlp": 1.05992901, + "epoch": 0.4086186994998076, + "flos": 562659139584.0, + "grad_norm": 0.06907127419859461, + "language_loss": 0.84616292, + "learning_rate": 0.0006688940127673513, + "loss": 0.85705543, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.29296875, + "step": 2124, + "time_per_iteration": 2.6865010261535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091737, + "balance_loss_mlp": 1.06181526, + "epoch": 0.4088110811850712, + "flos": 573364859904.0, + "grad_norm": 0.048409192362904495, + "language_loss": 0.85410631, + "learning_rate": 0.0006686007494594049, + "loss": 0.86502367, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.29882812, + "step": 2125, + "time_per_iteration": 2.8982856273651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090612, + "balance_loss_mlp": 1.06085694, + "epoch": 0.40900346287033473, + "flos": 456702948864.0, + "grad_norm": 0.07961338986962259, + "language_loss": 0.80014485, + "learning_rate": 0.0006683074206948425, + "loss": 0.81105095, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.29736328, + "step": 2126, + "time_per_iteration": 2.489884853363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086751, + "balance_loss_mlp": 1.05649602, + "epoch": 0.4091958445555983, + "flos": 617097305088.0, + "grad_norm": 0.06572114620312723, + "language_loss": 0.81335235, + "learning_rate": 0.0006680140265875443, + "loss": 0.82421982, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.30200195, + "step": 2127, + "time_per_iteration": 2.8000454902648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084283, + "balance_loss_mlp": 1.05512488, + "epoch": 0.40938822624086185, + "flos": 472159217664.0, + "grad_norm": 0.054748250322007024, + "language_loss": 0.95437354, + "learning_rate": 0.0006677205672514162, + "loss": 0.9652164, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.29125977, + "step": 2128, + "time_per_iteration": 2.6153228282928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086337, + "balance_loss_mlp": 1.05600977, + "epoch": 0.40958060792612544, + "flos": 569734718976.0, + "grad_norm": 0.05206451104952603, + "language_loss": 0.88892365, + "learning_rate": 0.000667427042800389, + "loss": 0.89978707, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.30273438, + "step": 2129, + "time_per_iteration": 2.772545337677002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080649, + "balance_loss_mlp": 1.0521338, + "epoch": 0.40977298961138897, + "flos": 609065823744.0, + "grad_norm": 0.06928662998118869, + "language_loss": 0.82843542, + "learning_rate": 0.0006671334533484192, + "loss": 0.83924192, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.28515625, + "step": 2130, + "time_per_iteration": 2.7501790523529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077969, + "balance_loss_mlp": 1.04938281, + "epoch": 0.40996537129665256, + "flos": 581463332352.0, + "grad_norm": 0.051614263088568736, + "language_loss": 0.83230782, + "learning_rate": 0.0006668397990094881, + "loss": 0.84308755, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.28613281, + "step": 2131, + "time_per_iteration": 2.7121975421905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083028, + "balance_loss_mlp": 1.05370235, + "epoch": 0.41015775298191615, + "flos": 516296125440.0, + "grad_norm": 0.05828514658280376, + "language_loss": 0.84553468, + "learning_rate": 0.0006665460798976027, + "loss": 0.85636497, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.29296875, + "step": 2132, + "time_per_iteration": 2.7074639797210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082859, + "balance_loss_mlp": 1.05532122, + "epoch": 0.4103501346671797, + "flos": 510083315712.0, + "grad_norm": 0.06450815869750301, + "language_loss": 0.81324267, + "learning_rate": 0.0006662522961267947, + "loss": 0.82407123, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.27563477, + "step": 2133, + "time_per_iteration": 2.676886796951294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084376, + "balance_loss_mlp": 1.05555081, + "epoch": 0.41054251635244327, + "flos": 549459500544.0, + "grad_norm": 0.04843791936563358, + "language_loss": 0.87077558, + "learning_rate": 0.0006659584478111211, + "loss": 0.88161933, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.28833008, + "step": 2134, + "time_per_iteration": 2.8004117012023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096427, + "balance_loss_mlp": 1.06910408, + "epoch": 0.4107348980377068, + "flos": 839549643264.0, + "grad_norm": 0.07835760686868988, + "language_loss": 0.82880664, + "learning_rate": 0.000665664535064664, + "loss": 0.83977091, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.2734375, + "step": 2135, + "time_per_iteration": 3.034134864807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100622, + "balance_loss_mlp": 1.07278681, + "epoch": 0.4109272797229704, + "flos": 503445694464.0, + "grad_norm": 0.05799734322971953, + "language_loss": 0.82382762, + "learning_rate": 0.0006653705580015303, + "loss": 0.8348338, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.27819824, + "step": 2136, + "time_per_iteration": 2.719423770904541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105373, + "balance_loss_mlp": 1.07747769, + "epoch": 0.4111196614082339, + "flos": 610533877248.0, + "grad_norm": 0.05212184008762054, + "language_loss": 0.863967, + "learning_rate": 0.0006650765167358523, + "loss": 0.87502074, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.27905273, + "step": 2137, + "time_per_iteration": 2.7973241806030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110879, + "balance_loss_mlp": 1.08089471, + "epoch": 0.4113120430934975, + "flos": 452931623424.0, + "grad_norm": 0.07588683613844963, + "language_loss": 0.89871359, + "learning_rate": 0.0006647824113817864, + "loss": 0.90980148, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.27929688, + "step": 2138, + "time_per_iteration": 2.520531177520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114294, + "balance_loss_mlp": 1.08768606, + "epoch": 0.41150442477876104, + "flos": 541324712448.0, + "grad_norm": 0.055552110514209885, + "language_loss": 0.81525648, + "learning_rate": 0.000664488242053515, + "loss": 0.82639945, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.26660156, + "step": 2139, + "time_per_iteration": 2.7204349040985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099437, + "balance_loss_mlp": 1.0722574, + "epoch": 0.4116968064640246, + "flos": 576017339904.0, + "grad_norm": 0.05646005524415558, + "language_loss": 0.83858913, + "learning_rate": 0.0006641940088652445, + "loss": 0.84958351, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.27246094, + "step": 2140, + "time_per_iteration": 2.748011827468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101916, + "balance_loss_mlp": 1.07521284, + "epoch": 0.4118891881492882, + "flos": 495857963520.0, + "grad_norm": 0.05970845599818087, + "language_loss": 0.81979877, + "learning_rate": 0.0006638997119312065, + "loss": 0.83081794, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.26757812, + "step": 2141, + "time_per_iteration": 2.723269462585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091619, + "balance_loss_mlp": 1.07826746, + "epoch": 0.41208156983455174, + "flos": 1537604923392.0, + "grad_norm": 0.04300629071925061, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76154923, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.13378906, + "step": 2142, + "time_per_iteration": 4.922248363494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089912, + "balance_loss_mlp": 1.06239891, + "epoch": 0.41227395151981533, + "flos": 584697775104.0, + "grad_norm": 0.06629114096949819, + "language_loss": 0.8462221, + "learning_rate": 0.000663310927282877, + "loss": 0.85712123, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.27563477, + "step": 2143, + "time_per_iteration": 2.8463313579559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109146, + "balance_loss_mlp": 1.06413746, + "epoch": 0.41246633320507886, + "flos": 442685620224.0, + "grad_norm": 0.05519054049820913, + "language_loss": 0.86099815, + "learning_rate": 0.000663016439797172, + "loss": 0.87191272, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.2734375, + "step": 2144, + "time_per_iteration": 2.611057996749878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086947, + "balance_loss_mlp": 1.05917096, + "epoch": 0.41265871489034245, + "flos": 579680976384.0, + "grad_norm": 0.07082455066013048, + "language_loss": 0.80582112, + "learning_rate": 0.0006627218890228724, + "loss": 0.81669062, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.27783203, + "step": 2145, + "time_per_iteration": 2.8047831058502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087273, + "balance_loss_mlp": 1.05859172, + "epoch": 0.412851096575606, + "flos": 760906372608.0, + "grad_norm": 0.08398112437337095, + "language_loss": 0.83330071, + "learning_rate": 0.0006624272750743326, + "loss": 0.84417343, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.28637695, + "step": 2146, + "time_per_iteration": 2.9890313148498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081748, + "balance_loss_mlp": 1.05299461, + "epoch": 0.41304347826086957, + "flos": 555062644224.0, + "grad_norm": 0.12117217429962603, + "language_loss": 0.82466137, + "learning_rate": 0.0006621325980659322, + "loss": 0.83547878, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.2878418, + "step": 2147, + "time_per_iteration": 2.7945189476013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108308, + "balance_loss_mlp": 1.05475557, + "epoch": 0.41323585994613315, + "flos": 665418765312.0, + "grad_norm": 0.05729870278054163, + "language_loss": 0.81810451, + "learning_rate": 0.000661837858112075, + "loss": 0.82893538, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.28320312, + "step": 2148, + "time_per_iteration": 2.833590030670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079327, + "balance_loss_mlp": 1.05102634, + "epoch": 0.4134282416313967, + "flos": 548429405184.0, + "grad_norm": 0.05837233957282785, + "language_loss": 0.88857764, + "learning_rate": 0.0006615430553271888, + "loss": 0.89937091, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.28344727, + "step": 2149, + "time_per_iteration": 2.75384521484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074308, + "balance_loss_mlp": 1.04603195, + "epoch": 0.4136206233166603, + "flos": 645951062016.0, + "grad_norm": 0.06498878822354702, + "language_loss": 0.85069597, + "learning_rate": 0.0006612481898257264, + "loss": 0.86143911, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.28295898, + "step": 2150, + "time_per_iteration": 2.8471391201019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077454, + "balance_loss_mlp": 1.04901028, + "epoch": 0.4138130050019238, + "flos": 517103640576.0, + "grad_norm": 0.06146250241107021, + "language_loss": 0.85024071, + "learning_rate": 0.000660953261722165, + "loss": 0.8610152, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.28442383, + "step": 2151, + "time_per_iteration": 2.650024175643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107559, + "balance_loss_mlp": 1.04643118, + "epoch": 0.4140053866871874, + "flos": 608977073664.0, + "grad_norm": 0.07635609550069686, + "language_loss": 0.82408941, + "learning_rate": 0.0006606582711310055, + "loss": 0.8348453, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.29150391, + "step": 2152, + "time_per_iteration": 2.707353353500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079486, + "balance_loss_mlp": 1.05068457, + "epoch": 0.4141977683724509, + "flos": 579493301760.0, + "grad_norm": 0.05643811624839042, + "language_loss": 0.83234471, + "learning_rate": 0.0006603632181667736, + "loss": 0.84313959, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.2878418, + "step": 2153, + "time_per_iteration": 2.6824803352355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034971, + "balance_loss_mlp": 1.02085698, + "epoch": 0.4143901500577145, + "flos": 1306598777856.0, + "grad_norm": 0.02554992861291058, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.79978293, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.14160156, + "step": 2154, + "time_per_iteration": 4.893488645553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075294, + "balance_loss_mlp": 1.04625416, + "epoch": 0.41458253174297804, + "flos": 459957740544.0, + "grad_norm": 0.06235301652291857, + "language_loss": 0.81530857, + "learning_rate": 0.0006597729255773153, + "loss": 0.82606155, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.2902832, + "step": 2155, + "time_per_iteration": 2.526531934738159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084546, + "balance_loss_mlp": 1.05519629, + "epoch": 0.41477491342824163, + "flos": 553096995840.0, + "grad_norm": 0.06680223734216864, + "language_loss": 0.82554018, + "learning_rate": 0.0006594776861812608, + "loss": 0.83638561, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.29321289, + "step": 2156, + "time_per_iteration": 2.669290065765381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083913, + "balance_loss_mlp": 1.05525446, + "epoch": 0.4149672951135052, + "flos": 697444356096.0, + "grad_norm": 0.05896575190253656, + "language_loss": 0.8669672, + "learning_rate": 0.0006591823848704776, + "loss": 0.87780631, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.28613281, + "step": 2157, + "time_per_iteration": 2.9277596473693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081796, + "balance_loss_mlp": 1.05273294, + "epoch": 0.41515967679876875, + "flos": 565480355328.0, + "grad_norm": 0.07853922010281017, + "language_loss": 0.81488264, + "learning_rate": 0.0006588870217596117, + "loss": 0.82570058, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.29003906, + "step": 2158, + "time_per_iteration": 2.72590970993042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107553, + "balance_loss_mlp": 1.04572749, + "epoch": 0.41535205848403234, + "flos": 500938781184.0, + "grad_norm": 0.06749140584983894, + "language_loss": 0.86219651, + "learning_rate": 0.0006585915969633334, + "loss": 0.87295187, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.29760742, + "step": 2159, + "time_per_iteration": 2.609668731689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068571, + "balance_loss_mlp": 1.03838706, + "epoch": 0.41554444016929587, + "flos": 607268911104.0, + "grad_norm": 0.0643598430263329, + "language_loss": 0.89336061, + "learning_rate": 0.0006582961105963366, + "loss": 0.90404636, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.30151367, + "step": 2160, + "time_per_iteration": 2.814122200012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074283, + "balance_loss_mlp": 1.04409909, + "epoch": 0.41573682185455946, + "flos": 528856985088.0, + "grad_norm": 0.0615363131016327, + "language_loss": 0.77864838, + "learning_rate": 0.0006580005627733395, + "loss": 0.78939116, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.30126953, + "step": 2161, + "time_per_iteration": 2.693002700805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067064, + "balance_loss_mlp": 1.03790569, + "epoch": 0.415929203539823, + "flos": 504686785536.0, + "grad_norm": 0.07091162327263066, + "language_loss": 0.81523043, + "learning_rate": 0.0006577049536090838, + "loss": 0.82590109, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.29125977, + "step": 2162, + "time_per_iteration": 2.6924569606781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010702, + "balance_loss_mlp": 1.04039741, + "epoch": 0.4161215852250866, + "flos": 582467286528.0, + "grad_norm": 0.07952336976051765, + "language_loss": 0.85617888, + "learning_rate": 0.000657409283218335, + "loss": 0.86688089, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.29760742, + "step": 2163, + "time_per_iteration": 2.663069486618042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070228, + "balance_loss_mlp": 1.04075933, + "epoch": 0.4163139669103501, + "flos": 490432320000.0, + "grad_norm": 0.06199265882265987, + "language_loss": 0.81197548, + "learning_rate": 0.0006571135517158829, + "loss": 0.82267773, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.29394531, + "step": 2164, + "time_per_iteration": 2.6750965118408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043486, + "balance_loss_mlp": 1.03042102, + "epoch": 0.4165063485956137, + "flos": 1287445377024.0, + "grad_norm": 0.030179808177232596, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77807546, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.13085938, + "step": 2165, + "time_per_iteration": 4.7519471645355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071906, + "balance_loss_mlp": 1.0417223, + "epoch": 0.4166987302808773, + "flos": 495015542784.0, + "grad_norm": 0.06526247046532782, + "language_loss": 0.83270538, + "learning_rate": 0.0006565219058351444, + "loss": 0.84342444, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.30151367, + "step": 2166, + "time_per_iteration": 2.5784192085266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070965, + "balance_loss_mlp": 1.04080534, + "epoch": 0.4168911119661408, + "flos": 463823608320.0, + "grad_norm": 0.06219532105294632, + "language_loss": 0.82938039, + "learning_rate": 0.0006562259916865553, + "loss": 0.84009004, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.30102539, + "step": 2167, + "time_per_iteration": 2.59431791305542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073926, + "balance_loss_mlp": 1.04369497, + "epoch": 0.4170834936514044, + "flos": 536499970560.0, + "grad_norm": 0.06573475594481314, + "language_loss": 0.7943427, + "learning_rate": 0.0006559300168856573, + "loss": 0.80508196, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.30175781, + "step": 2168, + "time_per_iteration": 2.727644443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070483, + "balance_loss_mlp": 1.04046655, + "epoch": 0.41727587533666793, + "flos": 550418374656.0, + "grad_norm": 0.17889612534981147, + "language_loss": 0.85705924, + "learning_rate": 0.0006556339815473577, + "loss": 0.86776412, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.29980469, + "step": 2169, + "time_per_iteration": 2.6300487518310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072561, + "balance_loss_mlp": 1.04366493, + "epoch": 0.4174682570219315, + "flos": 630795949056.0, + "grad_norm": 0.053042429294564375, + "language_loss": 0.86056256, + "learning_rate": 0.000655337885786588, + "loss": 0.87128818, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.2890625, + "step": 2170, + "time_per_iteration": 2.8887124061584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081102, + "balance_loss_mlp": 1.05139482, + "epoch": 0.41766063870719505, + "flos": 519501454848.0, + "grad_norm": 0.08227745310603136, + "language_loss": 0.84896123, + "learning_rate": 0.0006550417297183025, + "loss": 0.85977226, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.29663086, + "step": 2171, + "time_per_iteration": 2.6285011768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088317, + "balance_loss_mlp": 1.05894339, + "epoch": 0.41785302039245864, + "flos": 557656897536.0, + "grad_norm": 0.05761128029173598, + "language_loss": 0.81863701, + "learning_rate": 0.0006547455134574793, + "loss": 0.82952011, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.29321289, + "step": 2172, + "time_per_iteration": 2.7729623317718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089062, + "balance_loss_mlp": 1.06040442, + "epoch": 0.41804540207772223, + "flos": 788156683776.0, + "grad_norm": 0.06792239619892874, + "language_loss": 0.83893955, + "learning_rate": 0.0006544492371191198, + "loss": 0.84983015, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.28613281, + "step": 2173, + "time_per_iteration": 3.1256158351898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094435, + "balance_loss_mlp": 1.06477547, + "epoch": 0.41823778376298576, + "flos": 903944240640.0, + "grad_norm": 0.05504184984792058, + "language_loss": 0.83198339, + "learning_rate": 0.0006541529008182485, + "loss": 0.84292769, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.29638672, + "step": 2174, + "time_per_iteration": 3.207711696624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093664, + "balance_loss_mlp": 1.0648396, + "epoch": 0.41843016544824935, + "flos": 511308440064.0, + "grad_norm": 0.07199426026259947, + "language_loss": 0.87529659, + "learning_rate": 0.0006538565046699136, + "loss": 0.88623327, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.28808594, + "step": 2175, + "time_per_iteration": 2.5804800987243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090181, + "balance_loss_mlp": 1.06207108, + "epoch": 0.4186225471335129, + "flos": 652774947840.0, + "grad_norm": 0.06367136059390696, + "language_loss": 0.80982441, + "learning_rate": 0.0006535600487891862, + "loss": 0.82072628, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.28149414, + "step": 2176, + "time_per_iteration": 2.7804555892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087535, + "balance_loss_mlp": 1.05870986, + "epoch": 0.41881492881877647, + "flos": 568892298240.0, + "grad_norm": 0.05631892460787088, + "language_loss": 0.89099276, + "learning_rate": 0.0006532635332911603, + "loss": 0.9018681, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.28808594, + "step": 2177, + "time_per_iteration": 2.641392707824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083587, + "balance_loss_mlp": 1.05428553, + "epoch": 0.41900731050404, + "flos": 911478127104.0, + "grad_norm": 0.06086903625614387, + "language_loss": 0.80636132, + "learning_rate": 0.0006529669582909541, + "loss": 0.8171972, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.29296875, + "step": 2178, + "time_per_iteration": 3.2258243560791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079831, + "balance_loss_mlp": 1.0508393, + "epoch": 0.4191996921893036, + "flos": 535498988544.0, + "grad_norm": 0.06798611784395944, + "language_loss": 0.85681045, + "learning_rate": 0.0006526703239037077, + "loss": 0.86760873, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.28955078, + "step": 2179, + "time_per_iteration": 2.66808819770813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077242, + "balance_loss_mlp": 1.0480361, + "epoch": 0.4193920738745671, + "flos": 582363979776.0, + "grad_norm": 0.06231650691948033, + "language_loss": 0.86236274, + "learning_rate": 0.0006523736302445851, + "loss": 0.87313515, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.29174805, + "step": 2180, + "time_per_iteration": 2.768888473510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074827, + "balance_loss_mlp": 1.04490554, + "epoch": 0.4195844555598307, + "flos": 1335279720960.0, + "grad_norm": 0.05646655403971755, + "language_loss": 0.77122605, + "learning_rate": 0.0006520768774287728, + "loss": 0.78197432, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.29882812, + "step": 2181, + "time_per_iteration": 3.7851996421813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077657, + "balance_loss_mlp": 1.04899919, + "epoch": 0.4197768372450943, + "flos": 598480786944.0, + "grad_norm": 0.05195874321999793, + "language_loss": 0.85622293, + "learning_rate": 0.0006517800655714806, + "loss": 0.86699945, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.28686523, + "step": 2182, + "time_per_iteration": 2.8000948429107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083179, + "balance_loss_mlp": 1.05359161, + "epoch": 0.4199692189303578, + "flos": 734929085952.0, + "grad_norm": 0.06393427474455515, + "language_loss": 0.85246432, + "learning_rate": 0.0006514831947879407, + "loss": 0.86329615, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.2956543, + "step": 2183, + "time_per_iteration": 2.946345329284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090824, + "balance_loss_mlp": 1.06164193, + "epoch": 0.4201616006156214, + "flos": 749854264320.0, + "grad_norm": 0.05990675678964555, + "language_loss": 0.78013611, + "learning_rate": 0.0006511862651934091, + "loss": 0.79104435, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.29174805, + "step": 2184, + "time_per_iteration": 3.043314218521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087348, + "balance_loss_mlp": 1.05797458, + "epoch": 0.42035398230088494, + "flos": 546764912640.0, + "grad_norm": 0.05608517861748944, + "language_loss": 0.82263517, + "learning_rate": 0.0006508892769031638, + "loss": 0.83350861, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.29345703, + "step": 2185, + "time_per_iteration": 2.662071704864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090134, + "balance_loss_mlp": 1.06052232, + "epoch": 0.42054636398614853, + "flos": 616628823552.0, + "grad_norm": 0.07931700187887496, + "language_loss": 0.86476076, + "learning_rate": 0.000650592230032506, + "loss": 0.87566209, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.2956543, + "step": 2186, + "time_per_iteration": 2.758989095687866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094562, + "balance_loss_mlp": 1.06464052, + "epoch": 0.42073874567141206, + "flos": 640077285888.0, + "grad_norm": 0.06900651751722174, + "language_loss": 0.84912258, + "learning_rate": 0.0006502951246967595, + "loss": 0.8600682, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.29882812, + "step": 2187, + "time_per_iteration": 2.9305953979492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091948, + "balance_loss_mlp": 1.06274199, + "epoch": 0.42093112735667565, + "flos": 493524168192.0, + "grad_norm": 0.061550495040686125, + "language_loss": 0.86992055, + "learning_rate": 0.0006499979610112706, + "loss": 0.88084006, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.29150391, + "step": 2188, + "time_per_iteration": 2.6826889514923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091259, + "balance_loss_mlp": 1.06205249, + "epoch": 0.4211235090419392, + "flos": 542097321984.0, + "grad_norm": 0.05090003048385584, + "language_loss": 0.84021527, + "learning_rate": 0.000649700739091409, + "loss": 0.85112786, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.29125977, + "step": 2189, + "time_per_iteration": 2.7169277667999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058015, + "balance_loss_mlp": 1.04628468, + "epoch": 0.42131589072720277, + "flos": 1531342651392.0, + "grad_norm": 0.03212522571547254, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.74894285, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.1171875, + "step": 2190, + "time_per_iteration": 4.8044211864471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094227, + "balance_loss_mlp": 1.06645083, + "epoch": 0.42150827241246636, + "flos": 566583234048.0, + "grad_norm": 0.05853660814181512, + "language_loss": 0.85258055, + "learning_rate": 0.0006491061210101557, + "loss": 0.86352277, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.27832031, + "step": 2191, + "time_per_iteration": 2.6850759983062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093463, + "balance_loss_mlp": 1.06554449, + "epoch": 0.4217006540977299, + "flos": 707242226688.0, + "grad_norm": 0.05791259848064641, + "language_loss": 0.84111977, + "learning_rate": 0.0006488087250796157, + "loss": 0.85205436, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.27905273, + "step": 2192, + "time_per_iteration": 2.8877995014190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099215, + "balance_loss_mlp": 1.07148743, + "epoch": 0.4218930357829935, + "flos": 626975161344.0, + "grad_norm": 0.0649444731235166, + "language_loss": 0.81518376, + "learning_rate": 0.0006485112713764049, + "loss": 0.82617593, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.27734375, + "step": 2193, + "time_per_iteration": 2.910949468612671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102268, + "balance_loss_mlp": 1.07523096, + "epoch": 0.422085417468257, + "flos": 460110509568.0, + "grad_norm": 0.07813881123096035, + "language_loss": 0.83433115, + "learning_rate": 0.0006482137600160051, + "loss": 0.84535384, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.27075195, + "step": 2194, + "time_per_iteration": 2.5086262226104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096994, + "balance_loss_mlp": 1.06900394, + "epoch": 0.4222777991535206, + "flos": 473788804608.0, + "grad_norm": 0.07794223585413998, + "language_loss": 0.84987926, + "learning_rate": 0.0006479161911139206, + "loss": 0.86084926, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.2800293, + "step": 2195, + "time_per_iteration": 2.5875346660614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109264, + "balance_loss_mlp": 1.06493604, + "epoch": 0.4224701808387841, + "flos": 470647494144.0, + "grad_norm": 0.07304716613473786, + "language_loss": 0.85472345, + "learning_rate": 0.0006476185647856778, + "loss": 0.86564982, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.27734375, + "step": 2196, + "time_per_iteration": 2.5596694946289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083263, + "balance_loss_mlp": 1.05589223, + "epoch": 0.4226625625240477, + "flos": 677202633216.0, + "grad_norm": 0.0787732151202365, + "language_loss": 0.81599677, + "learning_rate": 0.0006473208811468255, + "loss": 0.82682943, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.27416992, + "step": 2197, + "time_per_iteration": 2.8756632804870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082675, + "balance_loss_mlp": 1.05518579, + "epoch": 0.4228549442093113, + "flos": 503268194304.0, + "grad_norm": 0.05582038208417147, + "language_loss": 0.84304923, + "learning_rate": 0.0006470231403129347, + "loss": 0.85387599, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.27490234, + "step": 2198, + "time_per_iteration": 2.6008548736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082097, + "balance_loss_mlp": 1.05444098, + "epoch": 0.42304732589457483, + "flos": 611543623680.0, + "grad_norm": 0.05486589756973033, + "language_loss": 0.81627637, + "learning_rate": 0.0006467253423995988, + "loss": 0.8270973, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.27685547, + "step": 2199, + "time_per_iteration": 2.8359298706054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085734, + "balance_loss_mlp": 1.05788624, + "epoch": 0.4232397075798384, + "flos": 515302345728.0, + "grad_norm": 0.06443704109820439, + "language_loss": 0.79415488, + "learning_rate": 0.000646427487522433, + "loss": 0.80501223, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.27880859, + "step": 2200, + "time_per_iteration": 2.6884772777557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089933, + "balance_loss_mlp": 1.06251502, + "epoch": 0.42343208926510195, + "flos": 589513752576.0, + "grad_norm": 0.06462007516901433, + "language_loss": 0.83460814, + "learning_rate": 0.0006461295757970749, + "loss": 0.8455075, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.27441406, + "step": 2201, + "time_per_iteration": 2.7960758209228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110052, + "balance_loss_mlp": 1.07140875, + "epoch": 0.42362447095036554, + "flos": 640342126080.0, + "grad_norm": 0.08363319364773283, + "language_loss": 0.81312859, + "learning_rate": 0.0006458316073391839, + "loss": 0.82413375, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.29101562, + "step": 2202, + "time_per_iteration": 2.853297472000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096557, + "balance_loss_mlp": 1.06830478, + "epoch": 0.42381685263562907, + "flos": 512421493248.0, + "grad_norm": 0.0711769658628502, + "language_loss": 0.87750852, + "learning_rate": 0.0006455335822644422, + "loss": 0.88847411, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.28271484, + "step": 2203, + "time_per_iteration": 2.6077048778533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110502, + "balance_loss_mlp": 1.07607579, + "epoch": 0.42400923432089266, + "flos": 546523393536.0, + "grad_norm": 0.061615225293076246, + "language_loss": 0.77729923, + "learning_rate": 0.0006452355006885527, + "loss": 0.78834939, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.28930664, + "step": 2204, + "time_per_iteration": 2.6517252922058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103628, + "balance_loss_mlp": 1.07442212, + "epoch": 0.4242016160061562, + "flos": 621872584704.0, + "grad_norm": 0.1220032897030914, + "language_loss": 0.86957574, + "learning_rate": 0.0006449373627272412, + "loss": 0.88061202, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.29199219, + "step": 2205, + "time_per_iteration": 2.7004148960113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093739, + "balance_loss_mlp": 1.06515288, + "epoch": 0.4243939976914198, + "flos": 571649495040.0, + "grad_norm": 0.07705045910796138, + "language_loss": 0.82556224, + "learning_rate": 0.0006446391684962553, + "loss": 0.83649963, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.28588867, + "step": 2206, + "time_per_iteration": 2.6505441665649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083745, + "balance_loss_mlp": 1.05558801, + "epoch": 0.42458637937668336, + "flos": 448509934080.0, + "grad_norm": 0.0589868983385633, + "language_loss": 0.82958955, + "learning_rate": 0.000644340918111364, + "loss": 0.84042698, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.28149414, + "step": 2207, + "time_per_iteration": 2.6410183906555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079008, + "balance_loss_mlp": 1.05011129, + "epoch": 0.4247787610619469, + "flos": 435176464896.0, + "grad_norm": 0.05680611388250626, + "language_loss": 0.84805965, + "learning_rate": 0.0006440426116883585, + "loss": 0.8588497, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.28857422, + "step": 2208, + "time_per_iteration": 2.5708625316619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074083, + "balance_loss_mlp": 1.04478097, + "epoch": 0.4249711427472105, + "flos": 495818675712.0, + "grad_norm": 0.06224422813064936, + "language_loss": 0.86093891, + "learning_rate": 0.0006437442493430519, + "loss": 0.87167978, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.29248047, + "step": 2209, + "time_per_iteration": 2.70894718170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074378, + "balance_loss_mlp": 1.04481411, + "epoch": 0.425163524432474, + "flos": 655498649088.0, + "grad_norm": 0.07482969618411565, + "language_loss": 0.86115217, + "learning_rate": 0.000643445831191278, + "loss": 0.87189603, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.29492188, + "step": 2210, + "time_per_iteration": 2.924381971359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076507, + "balance_loss_mlp": 1.0465858, + "epoch": 0.4253559061177376, + "flos": 650317496832.0, + "grad_norm": 0.07331466132736943, + "language_loss": 0.81421846, + "learning_rate": 0.0006431473573488937, + "loss": 0.82498354, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.29882812, + "step": 2211, + "time_per_iteration": 2.7787976264953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073299, + "balance_loss_mlp": 1.04380631, + "epoch": 0.42554828780300114, + "flos": 553894336512.0, + "grad_norm": 0.07883329281510759, + "language_loss": 0.84917492, + "learning_rate": 0.0006428488279317765, + "loss": 0.85990787, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.29443359, + "step": 2212, + "time_per_iteration": 2.6664369106292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070733, + "balance_loss_mlp": 1.04052496, + "epoch": 0.4257406694882647, + "flos": 514154386944.0, + "grad_norm": 0.06306745469338368, + "language_loss": 0.87706983, + "learning_rate": 0.0006425502430558259, + "loss": 0.88777709, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.30151367, + "step": 2213, + "time_per_iteration": 2.6229989528656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071107, + "balance_loss_mlp": 1.04106641, + "epoch": 0.42593305117352825, + "flos": 515380921344.0, + "grad_norm": 0.0655798606724697, + "language_loss": 0.84705913, + "learning_rate": 0.0006422516028369628, + "loss": 0.8577702, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.30004883, + "step": 2214, + "time_per_iteration": 2.69012451171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072564, + "balance_loss_mlp": 1.04197454, + "epoch": 0.42612543285879184, + "flos": 587766302208.0, + "grad_norm": 0.08051577462794157, + "language_loss": 0.83543354, + "learning_rate": 0.0006419529073911296, + "loss": 0.84615922, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.30541992, + "step": 2215, + "time_per_iteration": 2.873396873474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070818, + "balance_loss_mlp": 1.03987157, + "epoch": 0.42631781454405543, + "flos": 635153619456.0, + "grad_norm": 0.05918367623789858, + "language_loss": 0.85362011, + "learning_rate": 0.0006416541568342901, + "loss": 0.86432827, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.30908203, + "step": 2216, + "time_per_iteration": 2.870213508605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071511, + "balance_loss_mlp": 1.04161358, + "epoch": 0.42651019622931896, + "flos": 540891136512.0, + "grad_norm": 0.06028802274016953, + "language_loss": 0.8413707, + "learning_rate": 0.0006413553512824297, + "loss": 0.85208583, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.29858398, + "step": 2217, + "time_per_iteration": 2.7570102214813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066011, + "balance_loss_mlp": 1.03599358, + "epoch": 0.42670257791458255, + "flos": 557892624384.0, + "grad_norm": 0.06136950817587928, + "language_loss": 0.8441695, + "learning_rate": 0.0006410564908515549, + "loss": 0.85482961, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.29980469, + "step": 2218, + "time_per_iteration": 2.634636878967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072201, + "balance_loss_mlp": 1.04122996, + "epoch": 0.4268949595998461, + "flos": 621025781760.0, + "grad_norm": 0.05945328981992575, + "language_loss": 0.85267186, + "learning_rate": 0.0006407575756576935, + "loss": 0.8633939, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.30957031, + "step": 2219, + "time_per_iteration": 2.7264437675476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076309, + "balance_loss_mlp": 1.04512346, + "epoch": 0.42708734128510967, + "flos": 537646519296.0, + "grad_norm": 0.08352776642532155, + "language_loss": 0.87413085, + "learning_rate": 0.0006404586058168951, + "loss": 0.88489389, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.31152344, + "step": 2220, + "time_per_iteration": 2.740231513977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070252, + "balance_loss_mlp": 1.03906727, + "epoch": 0.4272797229703732, + "flos": 502617830400.0, + "grad_norm": 0.06337599132559579, + "language_loss": 0.86675316, + "learning_rate": 0.0006401595814452296, + "loss": 0.87745565, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.31152344, + "step": 2221, + "time_per_iteration": 2.595133066177368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010734, + "balance_loss_mlp": 1.04316878, + "epoch": 0.4274721046556368, + "flos": 492208883712.0, + "grad_norm": 0.05998559409639075, + "language_loss": 0.80837309, + "learning_rate": 0.000639860502658789, + "loss": 0.81910712, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.30224609, + "step": 2222, + "time_per_iteration": 2.6363143920898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078431, + "balance_loss_mlp": 1.04805684, + "epoch": 0.4276644863409004, + "flos": 568094957568.0, + "grad_norm": 0.051235249414951084, + "language_loss": 0.85047621, + "learning_rate": 0.0006395613695736853, + "loss": 0.86126053, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.3034668, + "step": 2223, + "time_per_iteration": 2.719651222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088807, + "balance_loss_mlp": 1.0574553, + "epoch": 0.4278568680261639, + "flos": 607155429888.0, + "grad_norm": 0.14370485886555942, + "language_loss": 0.82013905, + "learning_rate": 0.0006392621823060529, + "loss": 0.83102709, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.31347656, + "step": 2224, + "time_per_iteration": 2.707019805908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108025, + "balance_loss_mlp": 1.04968464, + "epoch": 0.4280492497114275, + "flos": 560265707520.0, + "grad_norm": 0.06727581417341866, + "language_loss": 0.84405053, + "learning_rate": 0.0006389629409720465, + "loss": 0.85485303, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.30541992, + "step": 2225, + "time_per_iteration": 2.6877145767211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074744, + "balance_loss_mlp": 1.04415512, + "epoch": 0.428241631396691, + "flos": 720334176768.0, + "grad_norm": 0.06967859590672425, + "language_loss": 0.88595277, + "learning_rate": 0.0006386636456878417, + "loss": 0.89670026, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.30566406, + "step": 2226, + "time_per_iteration": 2.87302827835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073672, + "balance_loss_mlp": 1.04344106, + "epoch": 0.4284340130819546, + "flos": 429243052032.0, + "grad_norm": 0.07126154474787791, + "language_loss": 0.92022073, + "learning_rate": 0.0006383642965696353, + "loss": 0.93095744, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.30175781, + "step": 2227, + "time_per_iteration": 2.4469897747039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075351, + "balance_loss_mlp": 1.04492915, + "epoch": 0.42862639476721814, + "flos": 524732069376.0, + "grad_norm": 0.06843530557124561, + "language_loss": 0.82703793, + "learning_rate": 0.000638064893733645, + "loss": 0.83779144, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.30371094, + "step": 2228, + "time_per_iteration": 2.7728607654571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071747, + "balance_loss_mlp": 1.04256451, + "epoch": 0.42881877645248173, + "flos": 465089430528.0, + "grad_norm": 0.058089035035371744, + "language_loss": 0.89580554, + "learning_rate": 0.000637765437296109, + "loss": 0.90652299, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.29199219, + "step": 2229, + "time_per_iteration": 2.634521484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072661, + "balance_loss_mlp": 1.04252505, + "epoch": 0.42901115813774526, + "flos": 560034362880.0, + "grad_norm": 0.07373798457938027, + "language_loss": 0.85480672, + "learning_rate": 0.000637465927373287, + "loss": 0.86553335, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.30126953, + "step": 2230, + "time_per_iteration": 2.6294057369232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082832, + "balance_loss_mlp": 1.05276728, + "epoch": 0.42920353982300885, + "flos": 561186703872.0, + "grad_norm": 0.08134114280474665, + "language_loss": 0.79152465, + "learning_rate": 0.000637166364081459, + "loss": 0.80235291, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.30004883, + "step": 2231, + "time_per_iteration": 2.651043176651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107691, + "balance_loss_mlp": 1.04837155, + "epoch": 0.42939592150827244, + "flos": 555982230528.0, + "grad_norm": 0.0656552791827552, + "language_loss": 0.83965945, + "learning_rate": 0.0006368667475369256, + "loss": 0.85042852, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.28515625, + "step": 2232, + "time_per_iteration": 2.749769687652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072336, + "balance_loss_mlp": 1.05898428, + "epoch": 0.42958830319353597, + "flos": 1520796902400.0, + "grad_norm": 0.038311067760931045, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79600114, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.13378906, + "step": 2233, + "time_per_iteration": 4.919846773147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010537, + "balance_loss_mlp": 1.04044378, + "epoch": 0.42978068487879956, + "flos": 1495052522496.0, + "grad_norm": 0.026216416348918452, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.79949123, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.1328125, + "step": 2234, + "time_per_iteration": 4.814115285873413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109183, + "balance_loss_mlp": 1.06281483, + "epoch": 0.4299730665640631, + "flos": 546725624832.0, + "grad_norm": 0.052673535005773216, + "language_loss": 0.85474288, + "learning_rate": 0.0006359675795504112, + "loss": 0.86566114, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.29003906, + "step": 2235, + "time_per_iteration": 2.7002832889556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097467, + "balance_loss_mlp": 1.07021558, + "epoch": 0.4301654482493267, + "flos": 1128839473152.0, + "grad_norm": 0.08125384058814748, + "language_loss": 0.74334383, + "learning_rate": 0.0006356677511584775, + "loss": 0.75431848, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.27294922, + "step": 2236, + "time_per_iteration": 3.472095012664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096497, + "balance_loss_mlp": 1.06938839, + "epoch": 0.4303578299345902, + "flos": 495502963200.0, + "grad_norm": 0.06719636161557083, + "language_loss": 0.85933757, + "learning_rate": 0.0006353678700956511, + "loss": 0.8703025, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.27148438, + "step": 2237, + "time_per_iteration": 2.6188535690307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089994, + "balance_loss_mlp": 1.06288612, + "epoch": 0.4305502116198538, + "flos": 615472100352.0, + "grad_norm": 0.09054713742221257, + "language_loss": 0.83597302, + "learning_rate": 0.0006350679364783569, + "loss": 0.84687304, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.27172852, + "step": 2238, + "time_per_iteration": 2.7403035163879395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093799, + "balance_loss_mlp": 1.0661664, + "epoch": 0.4307425933051173, + "flos": 558995503104.0, + "grad_norm": 0.06694912929746479, + "language_loss": 0.85728157, + "learning_rate": 0.0006347679504230393, + "loss": 0.86821961, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.27661133, + "step": 2239, + "time_per_iteration": 2.652348041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087161, + "balance_loss_mlp": 1.05974269, + "epoch": 0.4309349749903809, + "flos": 971755163136.0, + "grad_norm": 0.056527008755361936, + "language_loss": 0.75895661, + "learning_rate": 0.0006344679120461632, + "loss": 0.7698282, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.27416992, + "step": 2240, + "time_per_iteration": 3.334127187728882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091078, + "balance_loss_mlp": 1.06435084, + "epoch": 0.4311273566756445, + "flos": 541663746048.0, + "grad_norm": 0.1917370324350853, + "language_loss": 0.80061769, + "learning_rate": 0.0006341678214642134, + "loss": 0.81152856, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.26782227, + "step": 2241, + "time_per_iteration": 2.6100823879241943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087616, + "balance_loss_mlp": 1.06103277, + "epoch": 0.43131973836090803, + "flos": 761316627456.0, + "grad_norm": 0.06088249389193946, + "language_loss": 0.82893783, + "learning_rate": 0.0006338676787936963, + "loss": 0.83981395, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.26635742, + "step": 2242, + "time_per_iteration": 3.077916383743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109796, + "balance_loss_mlp": 1.07142353, + "epoch": 0.4315121200461716, + "flos": 554263893504.0, + "grad_norm": 0.060062439107852666, + "language_loss": 0.8377043, + "learning_rate": 0.0006335674841511367, + "loss": 0.84868383, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.26586914, + "step": 2243, + "time_per_iteration": 2.665205955505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066457, + "balance_loss_mlp": 1.05415499, + "epoch": 0.43170450173143515, + "flos": 1484499419136.0, + "grad_norm": 0.03077915513708162, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80247629, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.12255859, + "step": 2244, + "time_per_iteration": 5.000265121459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060995, + "balance_loss_mlp": 1.04878819, + "epoch": 0.43189688341669874, + "flos": 1472897433600.0, + "grad_norm": 0.03064763148494063, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.7842654, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.12207031, + "step": 2245, + "time_per_iteration": 4.9160850048065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093506, + "balance_loss_mlp": 1.06594431, + "epoch": 0.43208926510196227, + "flos": 492677365248.0, + "grad_norm": 0.06803490831657065, + "language_loss": 0.82597309, + "learning_rate": 0.0006326665895567652, + "loss": 0.83690816, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.2755127, + "step": 2246, + "time_per_iteration": 2.6449503898620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084672, + "balance_loss_mlp": 1.05649078, + "epoch": 0.43228164678722586, + "flos": 519969936384.0, + "grad_norm": 0.07553831830843152, + "language_loss": 0.87537026, + "learning_rate": 0.0006323661881916976, + "loss": 0.88621694, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.28173828, + "step": 2247, + "time_per_iteration": 2.699899911880493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088894, + "balance_loss_mlp": 1.05983043, + "epoch": 0.4324740284724894, + "flos": 795722655744.0, + "grad_norm": 0.05605692822142187, + "language_loss": 0.80999863, + "learning_rate": 0.0006320657354375179, + "loss": 0.82088757, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.2902832, + "step": 2248, + "time_per_iteration": 2.9737963676452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082946, + "balance_loss_mlp": 1.05374026, + "epoch": 0.432666410157753, + "flos": 481917800448.0, + "grad_norm": 0.1777496827938913, + "language_loss": 0.87151104, + "learning_rate": 0.0006317652314108726, + "loss": 0.88234049, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.29150391, + "step": 2249, + "time_per_iteration": 2.5640759468078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076296, + "balance_loss_mlp": 1.04782867, + "epoch": 0.43285879184301657, + "flos": 499963940352.0, + "grad_norm": 0.059764616303547735, + "language_loss": 0.91275859, + "learning_rate": 0.0006314646762284277, + "loss": 0.92352152, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.28442383, + "step": 2250, + "time_per_iteration": 2.6878976821899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056511, + "balance_loss_mlp": 1.04401791, + "epoch": 0.4330511735282801, + "flos": 1509615346176.0, + "grad_norm": 0.026928771485436313, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76482344, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.125, + "step": 2251, + "time_per_iteration": 4.839360475540161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079121, + "balance_loss_mlp": 1.04931927, + "epoch": 0.4332435552135437, + "flos": 699270382080.0, + "grad_norm": 0.05685438588579276, + "language_loss": 0.77368456, + "learning_rate": 0.0006308634128629022, + "loss": 0.78447574, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.29785156, + "step": 2252, + "time_per_iteration": 2.895348072052002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083422, + "balance_loss_mlp": 1.05426395, + "epoch": 0.4334359368988072, + "flos": 591995934720.0, + "grad_norm": 0.07214959985253801, + "language_loss": 0.87411779, + "learning_rate": 0.0006305627049132531, + "loss": 0.88495201, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.29125977, + "step": 2253, + "time_per_iteration": 2.8069100379943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083541, + "balance_loss_mlp": 1.05440617, + "epoch": 0.4336283185840708, + "flos": 842440670208.0, + "grad_norm": 0.059293193490882155, + "language_loss": 0.85926008, + "learning_rate": 0.0006302619462746662, + "loss": 0.87009549, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.29101562, + "step": 2254, + "time_per_iteration": 3.1606533527374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080071, + "balance_loss_mlp": 1.05193734, + "epoch": 0.43382070026933434, + "flos": 625974179328.0, + "grad_norm": 0.05505451724174187, + "language_loss": 0.89697909, + "learning_rate": 0.0006299611370639069, + "loss": 0.90777981, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.28149414, + "step": 2255, + "time_per_iteration": 2.734578847885132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082321, + "balance_loss_mlp": 1.05368638, + "epoch": 0.4340130819545979, + "flos": 590837801472.0, + "grad_norm": 0.06498253441528982, + "language_loss": 0.79077351, + "learning_rate": 0.0006296602773977593, + "loss": 0.80159676, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.28637695, + "step": 2256, + "time_per_iteration": 2.7210190296173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086863, + "balance_loss_mlp": 1.0577755, + "epoch": 0.4342054636398615, + "flos": 490624376832.0, + "grad_norm": 0.06552918038966793, + "language_loss": 0.87430996, + "learning_rate": 0.0006293593673930277, + "loss": 0.88517857, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.2902832, + "step": 2257, + "time_per_iteration": 2.6526098251342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087005, + "balance_loss_mlp": 1.05851448, + "epoch": 0.43439784532512504, + "flos": 698679654912.0, + "grad_norm": 0.06677812911461618, + "language_loss": 0.78416431, + "learning_rate": 0.0006290584071665358, + "loss": 0.79503441, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.28491211, + "step": 2258, + "time_per_iteration": 2.915259838104248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086079, + "balance_loss_mlp": 1.0575645, + "epoch": 0.43459022701038863, + "flos": 485581436928.0, + "grad_norm": 0.06990053073214272, + "language_loss": 0.81982124, + "learning_rate": 0.0006287573968351266, + "loss": 0.83068204, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.28515625, + "step": 2259, + "time_per_iteration": 2.5836570262908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082362, + "balance_loss_mlp": 1.05432403, + "epoch": 0.43478260869565216, + "flos": 642818515968.0, + "grad_norm": 0.06494033905479386, + "language_loss": 0.82220829, + "learning_rate": 0.0006284563365156626, + "loss": 0.83303189, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.28076172, + "step": 2260, + "time_per_iteration": 2.815223217010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084558, + "balance_loss_mlp": 1.05620956, + "epoch": 0.43497499038091575, + "flos": 425870396928.0, + "grad_norm": 0.07047722124208498, + "language_loss": 0.87564874, + "learning_rate": 0.0006281552263250261, + "loss": 0.88649434, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.28344727, + "step": 2261, + "time_per_iteration": 2.4715116024017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106204, + "balance_loss_mlp": 1.04964256, + "epoch": 0.4351673720661793, + "flos": 1537594748928.0, + "grad_norm": 0.023387556142435376, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81753576, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.12402344, + "step": 2262, + "time_per_iteration": 4.811767101287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084425, + "balance_loss_mlp": 1.05641103, + "epoch": 0.43535975375144287, + "flos": 748828551168.0, + "grad_norm": 0.062970719214795, + "language_loss": 0.81474411, + "learning_rate": 0.0006275528567978593, + "loss": 0.82558835, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.28051758, + "step": 2263, + "time_per_iteration": 2.9182233810424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096573, + "balance_loss_mlp": 1.06877375, + "epoch": 0.4355521354367064, + "flos": 860914593792.0, + "grad_norm": 0.06472545743832298, + "language_loss": 0.82352197, + "learning_rate": 0.0006272515976951898, + "loss": 0.83448768, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.27832031, + "step": 2264, + "time_per_iteration": 3.137770175933838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097325, + "balance_loss_mlp": 1.06852436, + "epoch": 0.43574451712197, + "flos": 734200146432.0, + "grad_norm": 0.055887733519337984, + "language_loss": 0.79332447, + "learning_rate": 0.0006269502891890687, + "loss": 0.8042978, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.28759766, + "step": 2265, + "time_per_iteration": 2.9932398796081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093111, + "balance_loss_mlp": 1.06526363, + "epoch": 0.4359368988072336, + "flos": 570296332800.0, + "grad_norm": 0.06217907852457908, + "language_loss": 0.87852293, + "learning_rate": 0.0006266489313964743, + "loss": 0.88945401, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.27880859, + "step": 2266, + "time_per_iteration": 2.720874547958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090878, + "balance_loss_mlp": 1.06338787, + "epoch": 0.4361292804924971, + "flos": 555244526592.0, + "grad_norm": 0.05517220152754215, + "language_loss": 0.85363281, + "learning_rate": 0.0006263475244344041, + "loss": 0.86454159, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.27514648, + "step": 2267, + "time_per_iteration": 2.8508987426757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089804, + "balance_loss_mlp": 1.06178975, + "epoch": 0.4363216621777607, + "flos": 557021090304.0, + "grad_norm": 0.061658084399303315, + "language_loss": 0.84817886, + "learning_rate": 0.0006260460684198746, + "loss": 0.85907692, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.28027344, + "step": 2268, + "time_per_iteration": 2.6972851753234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091639, + "balance_loss_mlp": 1.06395864, + "epoch": 0.4365140438630242, + "flos": 477979149312.0, + "grad_norm": 0.07163404822705746, + "language_loss": 0.84593827, + "learning_rate": 0.0006257445634699213, + "loss": 0.85685468, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.27734375, + "step": 2269, + "time_per_iteration": 2.562509298324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083836, + "balance_loss_mlp": 1.05565524, + "epoch": 0.4367064255482878, + "flos": 578646498816.0, + "grad_norm": 0.07106993063326117, + "language_loss": 0.82829607, + "learning_rate": 0.0006254430097015993, + "loss": 0.8391344, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.28222656, + "step": 2270, + "time_per_iteration": 2.6713523864746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054528, + "balance_loss_mlp": 1.04203498, + "epoch": 0.43689880723355135, + "flos": 1458117669888.0, + "grad_norm": 0.029151500829202304, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77533615, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.125, + "step": 2271, + "time_per_iteration": 4.761755466461182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086484, + "balance_loss_mlp": 1.05801725, + "epoch": 0.43709118891881493, + "flos": 667295663616.0, + "grad_norm": 0.05590316940209524, + "language_loss": 0.85155964, + "learning_rate": 0.0006248397561781609, + "loss": 0.86242455, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.28491211, + "step": 2272, + "time_per_iteration": 2.8541359901428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091334, + "balance_loss_mlp": 1.06246173, + "epoch": 0.43728357060407846, + "flos": 544612999680.0, + "grad_norm": 0.07335127222093174, + "language_loss": 0.8601104, + "learning_rate": 0.0006245380566572482, + "loss": 0.87102377, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.28857422, + "step": 2273, + "time_per_iteration": 2.6526312828063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090326, + "balance_loss_mlp": 1.06200182, + "epoch": 0.43747595228934205, + "flos": 746504930304.0, + "grad_norm": 0.06592567136619501, + "language_loss": 0.76039565, + "learning_rate": 0.0006242363087863744, + "loss": 0.77129889, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.28344727, + "step": 2274, + "time_per_iteration": 2.9512767791748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089474, + "balance_loss_mlp": 1.06129336, + "epoch": 0.43766833397460564, + "flos": 631060789248.0, + "grad_norm": 0.07045204489750885, + "language_loss": 0.86392975, + "learning_rate": 0.0006239345126826878, + "loss": 0.87482452, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.28198242, + "step": 2275, + "time_per_iteration": 2.818574905395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081719, + "balance_loss_mlp": 1.05236995, + "epoch": 0.43786071565986917, + "flos": 530709152256.0, + "grad_norm": 0.06271142699552738, + "language_loss": 0.8405596, + "learning_rate": 0.0006236326684633561, + "loss": 0.85137677, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.29296875, + "step": 2276, + "time_per_iteration": 2.8501060009002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088499, + "balance_loss_mlp": 1.05972195, + "epoch": 0.43805309734513276, + "flos": 538295473152.0, + "grad_norm": 0.08224081940065299, + "language_loss": 0.75057948, + "learning_rate": 0.0006233307762455658, + "loss": 0.76146448, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.28735352, + "step": 2277, + "time_per_iteration": 2.6692187786102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079787, + "balance_loss_mlp": 1.05098617, + "epoch": 0.4382454790303963, + "flos": 864188324352.0, + "grad_norm": 0.1351794781054828, + "language_loss": 0.83103114, + "learning_rate": 0.0006230288361465216, + "loss": 0.84182906, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.2878418, + "step": 2278, + "time_per_iteration": 3.0566518306732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081672, + "balance_loss_mlp": 1.05389631, + "epoch": 0.4384378607156599, + "flos": 765175292928.0, + "grad_norm": 0.0635725084076576, + "language_loss": 0.85047072, + "learning_rate": 0.0006227268482834473, + "loss": 0.86128747, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.27783203, + "step": 2279, + "time_per_iteration": 2.890195608139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086149, + "balance_loss_mlp": 1.05811095, + "epoch": 0.4386302424009234, + "flos": 668260329984.0, + "grad_norm": 0.06574285370830908, + "language_loss": 0.87371957, + "learning_rate": 0.000622424812773585, + "loss": 0.88458109, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.28076172, + "step": 2280, + "time_per_iteration": 2.820857524871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108464, + "balance_loss_mlp": 1.05698299, + "epoch": 0.438822624086187, + "flos": 484941247488.0, + "grad_norm": 0.08150674529849485, + "language_loss": 0.80050623, + "learning_rate": 0.000622122729734195, + "loss": 0.81135261, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.27685547, + "step": 2281, + "time_per_iteration": 2.5578882694244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090722, + "balance_loss_mlp": 1.06320858, + "epoch": 0.4390150057714506, + "flos": 498959986176.0, + "grad_norm": 0.05652917217777931, + "language_loss": 0.87423271, + "learning_rate": 0.0006218205992825566, + "loss": 0.88513994, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.27539062, + "step": 2282, + "time_per_iteration": 2.6367194652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087621, + "balance_loss_mlp": 1.05989254, + "epoch": 0.4392073874567141, + "flos": 557937704448.0, + "grad_norm": 0.06387466426791162, + "language_loss": 0.81580615, + "learning_rate": 0.0006215184215359671, + "loss": 0.82668239, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.27758789, + "step": 2283, + "time_per_iteration": 2.7550642490386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109022, + "balance_loss_mlp": 1.06254005, + "epoch": 0.4393997691419777, + "flos": 605028248064.0, + "grad_norm": 0.06853375358246538, + "language_loss": 0.86762869, + "learning_rate": 0.0006212161966117425, + "loss": 0.87853086, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.27709961, + "step": 2284, + "time_per_iteration": 2.7315139770507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093132, + "balance_loss_mlp": 1.06492722, + "epoch": 0.43959215082724123, + "flos": 803812363776.0, + "grad_norm": 0.06833018750237568, + "language_loss": 0.81347001, + "learning_rate": 0.0006209139246272164, + "loss": 0.82440132, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.28222656, + "step": 2285, + "time_per_iteration": 2.997727394104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085597, + "balance_loss_mlp": 1.0573678, + "epoch": 0.4397845325125048, + "flos": 487403080704.0, + "grad_norm": 0.0627571888999813, + "language_loss": 0.81454128, + "learning_rate": 0.0006206116056997421, + "loss": 0.82539719, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.28271484, + "step": 2286, + "time_per_iteration": 2.5523786544799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092851, + "balance_loss_mlp": 1.06512272, + "epoch": 0.43997691419776835, + "flos": 480569020416.0, + "grad_norm": 0.0569936252584843, + "language_loss": 0.82580131, + "learning_rate": 0.0006203092399466892, + "loss": 0.83672982, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.27783203, + "step": 2287, + "time_per_iteration": 2.5256903171539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080971, + "balance_loss_mlp": 1.05317175, + "epoch": 0.44016929588303194, + "flos": 482873702400.0, + "grad_norm": 0.052620788715243595, + "language_loss": 0.85130596, + "learning_rate": 0.0006200068274854473, + "loss": 0.86211562, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.27832031, + "step": 2288, + "time_per_iteration": 2.6666431427001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089786, + "balance_loss_mlp": 1.06108057, + "epoch": 0.4403616775682955, + "flos": 571562155008.0, + "grad_norm": 0.05493211856459023, + "language_loss": 0.85969126, + "learning_rate": 0.0006197043684334229, + "loss": 0.87058908, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.28686523, + "step": 2289, + "time_per_iteration": 2.7558815479278564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093604, + "balance_loss_mlp": 1.0652802, + "epoch": 0.44055405925355906, + "flos": 630563194368.0, + "grad_norm": 0.06713172204070075, + "language_loss": 0.7966578, + "learning_rate": 0.0006194018629080411, + "loss": 0.80759388, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.28344727, + "step": 2290, + "time_per_iteration": 2.7641310691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095567, + "balance_loss_mlp": 1.06721866, + "epoch": 0.44074644093882265, + "flos": 536523291648.0, + "grad_norm": 0.06308142018549157, + "language_loss": 0.81759441, + "learning_rate": 0.0006190993110267451, + "loss": 0.8285501, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.28393555, + "step": 2291, + "time_per_iteration": 2.759451389312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087327, + "balance_loss_mlp": 1.05959892, + "epoch": 0.4409388226240862, + "flos": 462995744256.0, + "grad_norm": 0.0663089643389441, + "language_loss": 0.84395695, + "learning_rate": 0.0006187967129069958, + "loss": 0.85483021, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.27758789, + "step": 2292, + "time_per_iteration": 2.5458216667175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108768, + "balance_loss_mlp": 1.06011844, + "epoch": 0.44113120430934977, + "flos": 565717492224.0, + "grad_norm": 0.05260179709926624, + "language_loss": 0.8707509, + "learning_rate": 0.0006184940686662722, + "loss": 0.88162768, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.27612305, + "step": 2293, + "time_per_iteration": 2.7694880962371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082669, + "balance_loss_mlp": 1.05494058, + "epoch": 0.4413235859946133, + "flos": 543313681920.0, + "grad_norm": 0.055518519655343164, + "language_loss": 0.90020764, + "learning_rate": 0.0006181913784220714, + "loss": 0.91103435, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.27758789, + "step": 2294, + "time_per_iteration": 2.6642205715179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047446, + "balance_loss_mlp": 1.03542924, + "epoch": 0.4415159676798769, + "flos": 1569016465920.0, + "grad_norm": 0.024577707308588242, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81601226, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.12011719, + "step": 2295, + "time_per_iteration": 4.874637842178345 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084239, + "balance_loss_mlp": 1.05665421, + "epoch": 0.4417083493651404, + "flos": 658423171584.0, + "grad_norm": 0.06513424306559527, + "language_loss": 0.79833972, + "learning_rate": 0.0006175858603933146, + "loss": 0.80918217, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.27612305, + "step": 2296, + "time_per_iteration": 2.9130241870880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084408, + "balance_loss_mlp": 1.05665636, + "epoch": 0.441900731050404, + "flos": 740119002624.0, + "grad_norm": 0.06251545633736988, + "language_loss": 0.80774343, + "learning_rate": 0.0006172830328438416, + "loss": 0.81858754, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.27783203, + "step": 2297, + "time_per_iteration": 2.953983783721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082645, + "balance_loss_mlp": 1.05460715, + "epoch": 0.44209311273566754, + "flos": 539153860608.0, + "grad_norm": 0.057534365085963636, + "language_loss": 0.86889625, + "learning_rate": 0.0006169801597610572, + "loss": 0.87972271, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.28051758, + "step": 2298, + "time_per_iteration": 2.7841529846191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087234, + "balance_loss_mlp": 1.05986333, + "epoch": 0.4422854944209311, + "flos": 621335702016.0, + "grad_norm": 0.0717755554401909, + "language_loss": 0.89631718, + "learning_rate": 0.0006166772412625469, + "loss": 0.90718955, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.27416992, + "step": 2299, + "time_per_iteration": 2.7750232219696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087463, + "balance_loss_mlp": 1.05983019, + "epoch": 0.4424778761061947, + "flos": 658516303872.0, + "grad_norm": 0.06473860012868299, + "language_loss": 0.81551421, + "learning_rate": 0.0006163742774659141, + "loss": 0.82638884, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.27661133, + "step": 2300, + "time_per_iteration": 2.8384482860565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092146, + "balance_loss_mlp": 1.06446528, + "epoch": 0.44267025779145824, + "flos": 568297188864.0, + "grad_norm": 0.0850959758091444, + "language_loss": 0.85627389, + "learning_rate": 0.0006160712684887801, + "loss": 0.86719531, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.27709961, + "step": 2301, + "time_per_iteration": 2.7603278160095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084767, + "balance_loss_mlp": 1.05813527, + "epoch": 0.44286263947672183, + "flos": 496469039616.0, + "grad_norm": 0.053898588417471735, + "language_loss": 0.81867981, + "learning_rate": 0.0006157682144487832, + "loss": 0.82952744, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.2668457, + "step": 2302, + "time_per_iteration": 2.7585275173187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090771, + "balance_loss_mlp": 1.06347191, + "epoch": 0.44305502116198536, + "flos": 609096347136.0, + "grad_norm": 0.05970343490953875, + "language_loss": 0.82821, + "learning_rate": 0.0006154651154635793, + "loss": 0.83911771, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.2734375, + "step": 2303, + "time_per_iteration": 4.252831697463989 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097367, + "balance_loss_mlp": 1.07040215, + "epoch": 0.44324740284724895, + "flos": 470558744064.0, + "grad_norm": 0.05697892496442649, + "language_loss": 0.8468399, + "learning_rate": 0.0006151619716508421, + "loss": 0.85781354, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.27026367, + "step": 2304, + "time_per_iteration": 2.5882937908172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102202, + "balance_loss_mlp": 1.07442617, + "epoch": 0.4434397845325125, + "flos": 578454441984.0, + "grad_norm": 0.06572201075979017, + "language_loss": 0.86751652, + "learning_rate": 0.0006148587831282625, + "loss": 0.87853855, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.27807617, + "step": 2305, + "time_per_iteration": 2.6605563163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052734, + "balance_loss_mlp": 1.04066956, + "epoch": 0.44363216621777607, + "flos": 1495765343232.0, + "grad_norm": 0.01894914693526954, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.802288, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.12060547, + "step": 2306, + "time_per_iteration": 4.910472631454468 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102108, + "balance_loss_mlp": 1.07342601, + "epoch": 0.44382454790303966, + "flos": 477082884096.0, + "grad_norm": 0.06457533715620843, + "language_loss": 0.87372738, + "learning_rate": 0.0006142522724244255, + "loss": 0.88474846, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.28686523, + "step": 2307, + "time_per_iteration": 2.5184578895568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047197, + "balance_loss_mlp": 1.03508484, + "epoch": 0.4440169295883032, + "flos": 1543321548288.0, + "grad_norm": 0.015440750347127817, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.7753191, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.12109375, + "step": 2308, + "time_per_iteration": 4.880531549453735 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104605, + "balance_loss_mlp": 1.07668638, + "epoch": 0.4442093112735668, + "flos": 590789749248.0, + "grad_norm": 0.0625118895390298, + "language_loss": 0.77304882, + "learning_rate": 0.000613645584293942, + "loss": 0.78409487, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.27954102, + "step": 2309, + "time_per_iteration": 2.888929605484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102667, + "balance_loss_mlp": 1.07522511, + "epoch": 0.4444016929588303, + "flos": 530009326080.0, + "grad_norm": 0.05626484670913178, + "language_loss": 0.82863319, + "learning_rate": 0.0006133421739881185, + "loss": 0.83965981, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.27441406, + "step": 2310, + "time_per_iteration": 2.6770823001861572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098373, + "balance_loss_mlp": 1.06966734, + "epoch": 0.4445940746440939, + "flos": 619947634176.0, + "grad_norm": 0.09114290921538859, + "language_loss": 0.82713985, + "learning_rate": 0.0006130387196789605, + "loss": 0.83812356, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.28686523, + "step": 2311, + "time_per_iteration": 2.7363758087158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110237, + "balance_loss_mlp": 1.07309198, + "epoch": 0.4447864563293574, + "flos": 628782248448.0, + "grad_norm": 0.05056880651601303, + "language_loss": 0.84359384, + "learning_rate": 0.0006127352214842795, + "loss": 0.85461748, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.29272461, + "step": 2312, + "time_per_iteration": 3.0277068614959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095618, + "balance_loss_mlp": 1.06688845, + "epoch": 0.444978838014621, + "flos": 650548841472.0, + "grad_norm": 0.06767648502511064, + "language_loss": 0.85424733, + "learning_rate": 0.0006124316795219041, + "loss": 0.8652035, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.28710938, + "step": 2313, + "time_per_iteration": 2.7824032306671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087884, + "balance_loss_mlp": 1.05996561, + "epoch": 0.44517121969988455, + "flos": 612153289728.0, + "grad_norm": 0.06031488841862457, + "language_loss": 0.8232829, + "learning_rate": 0.0006121280939096794, + "loss": 0.83416176, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.27905273, + "step": 2314, + "time_per_iteration": 2.7414164543151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087621, + "balance_loss_mlp": 1.05901051, + "epoch": 0.44536360138514813, + "flos": 488491402752.0, + "grad_norm": 0.056993316738708576, + "language_loss": 0.8765316, + "learning_rate": 0.000611824464765468, + "loss": 0.88740778, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.28613281, + "step": 2315, + "time_per_iteration": 2.5894503593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020326, + "balance_loss_mlp": 1.00830936, + "epoch": 0.4455559830704117, + "flos": 1515425255424.0, + "grad_norm": 0.018109298143921163, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79615265, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.12011719, + "step": 2316, + "time_per_iteration": 4.654959201812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081165, + "balance_loss_mlp": 1.05322254, + "epoch": 0.44574836475567525, + "flos": 615314949120.0, + "grad_norm": 0.05658516719934989, + "language_loss": 0.85440743, + "learning_rate": 0.000611217076352619, + "loss": 0.86521906, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.27978516, + "step": 2317, + "time_per_iteration": 2.8710198402404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086137, + "balance_loss_mlp": 1.05862343, + "epoch": 0.44594074644093884, + "flos": 506070471168.0, + "grad_norm": 0.062250172980488426, + "language_loss": 0.82876933, + "learning_rate": 0.0006109133173197905, + "loss": 0.8396306, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.27539062, + "step": 2318, + "time_per_iteration": 2.7298824787139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088304, + "balance_loss_mlp": 1.05986071, + "epoch": 0.44613312812620237, + "flos": 726647321088.0, + "grad_norm": 0.0706297628000491, + "language_loss": 0.85633492, + "learning_rate": 0.0006106095152265935, + "loss": 0.8672179, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.28466797, + "step": 2319, + "time_per_iteration": 2.8895695209503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108895, + "balance_loss_mlp": 1.06086433, + "epoch": 0.44632550981146596, + "flos": 635419869696.0, + "grad_norm": 0.04876785494191262, + "language_loss": 0.84747481, + "learning_rate": 0.0006103056701909739, + "loss": 0.85836434, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.28125, + "step": 2320, + "time_per_iteration": 2.9117228984832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108858, + "balance_loss_mlp": 1.05935025, + "epoch": 0.4465178914967295, + "flos": 826690447872.0, + "grad_norm": 0.06765559983355682, + "language_loss": 0.82841372, + "learning_rate": 0.0006100017823308956, + "loss": 0.8392995, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.29199219, + "step": 2321, + "time_per_iteration": 3.19189453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095794, + "balance_loss_mlp": 1.06618226, + "epoch": 0.4467102731819931, + "flos": 665532246528.0, + "grad_norm": 0.07493928757304909, + "language_loss": 0.796121, + "learning_rate": 0.0006096978517643377, + "loss": 0.80707896, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.29589844, + "step": 2322, + "time_per_iteration": 2.7803642749786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088319, + "balance_loss_mlp": 1.05825448, + "epoch": 0.4469026548672566, + "flos": 512692125696.0, + "grad_norm": 0.05979787162997368, + "language_loss": 0.83128643, + "learning_rate": 0.0006093938786092968, + "loss": 0.84216964, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.30029297, + "step": 2323, + "time_per_iteration": 2.6324985027313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084456, + "balance_loss_mlp": 1.05403399, + "epoch": 0.4470950365525202, + "flos": 683774825472.0, + "grad_norm": 0.0696967897289199, + "language_loss": 0.89752465, + "learning_rate": 0.0006090898629837857, + "loss": 0.90836924, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.30395508, + "step": 2324, + "time_per_iteration": 2.833986282348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081367, + "balance_loss_mlp": 1.05073011, + "epoch": 0.4472874182377838, + "flos": 627018831360.0, + "grad_norm": 0.05715713314103227, + "language_loss": 0.87296605, + "learning_rate": 0.0006087858050058337, + "loss": 0.88377976, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.3059082, + "step": 2325, + "time_per_iteration": 2.8220982551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082075, + "balance_loss_mlp": 1.05084252, + "epoch": 0.4474797999230473, + "flos": 546946795008.0, + "grad_norm": 0.06405768205874736, + "language_loss": 0.82704103, + "learning_rate": 0.0006084817047934866, + "loss": 0.83786178, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.31225586, + "step": 2326, + "time_per_iteration": 2.6844918727874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077775, + "balance_loss_mlp": 1.04635119, + "epoch": 0.4476721816083109, + "flos": 455585513472.0, + "grad_norm": 0.06718825176833507, + "language_loss": 0.89515507, + "learning_rate": 0.0006081775624648066, + "loss": 0.90593284, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.31396484, + "step": 2327, + "time_per_iteration": 2.5115904808044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080341, + "balance_loss_mlp": 1.04937041, + "epoch": 0.44786456329357444, + "flos": 481273228800.0, + "grad_norm": 0.06388622036462539, + "language_loss": 0.82659936, + "learning_rate": 0.0006078733781378721, + "loss": 0.83740276, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.30957031, + "step": 2328, + "time_per_iteration": 2.5578174591064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107055, + "balance_loss_mlp": 1.04003251, + "epoch": 0.448056944978838, + "flos": 551822409216.0, + "grad_norm": 0.05909371510774122, + "language_loss": 0.82426572, + "learning_rate": 0.0006075691519307781, + "loss": 0.83497119, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.3046875, + "step": 2329, + "time_per_iteration": 2.9271137714385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071371, + "balance_loss_mlp": 1.04025745, + "epoch": 0.44824932666410156, + "flos": 550571143680.0, + "grad_norm": 0.0899878860138525, + "language_loss": 0.81604564, + "learning_rate": 0.0006072648839616356, + "loss": 0.8267594, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.31103516, + "step": 2330, + "time_per_iteration": 2.642164945602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069213, + "balance_loss_mlp": 1.03805184, + "epoch": 0.44844170834936514, + "flos": 988161541632.0, + "grad_norm": 0.05660389796161562, + "language_loss": 0.82544589, + "learning_rate": 0.0006069605743485718, + "loss": 0.83613807, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.3112793, + "step": 2331, + "time_per_iteration": 3.3559155464172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107864, + "balance_loss_mlp": 1.04945791, + "epoch": 0.44863409003462873, + "flos": 591040032768.0, + "grad_norm": 0.06166347857347268, + "language_loss": 0.83528912, + "learning_rate": 0.0006066562232097303, + "loss": 0.84607553, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.29125977, + "step": 2332, + "time_per_iteration": 2.7531135082244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107678, + "balance_loss_mlp": 1.0468111, + "epoch": 0.44882647171989226, + "flos": 724313525760.0, + "grad_norm": 0.0526351904833897, + "language_loss": 0.86127633, + "learning_rate": 0.0006063518306632708, + "loss": 0.87204421, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.29907227, + "step": 2333, + "time_per_iteration": 2.957057476043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080344, + "balance_loss_mlp": 1.05044627, + "epoch": 0.44901885340515585, + "flos": 534662360064.0, + "grad_norm": 0.07121293699241546, + "language_loss": 0.82098341, + "learning_rate": 0.0006060473968273688, + "loss": 0.83178687, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.29882812, + "step": 2334, + "time_per_iteration": 2.687427043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050724, + "balance_loss_mlp": 1.03756309, + "epoch": 0.4492112350904194, + "flos": 1554456462336.0, + "grad_norm": 0.03308553204338399, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.78930265, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.13183594, + "step": 2335, + "time_per_iteration": 4.873494625091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027356, + "balance_loss_mlp": 1.01476717, + "epoch": 0.44940361677568297, + "flos": 1522525413888.0, + "grad_norm": 0.020404135430742085, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82032573, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.12597656, + "step": 2336, + "time_per_iteration": 4.8493242263793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091959, + "balance_loss_mlp": 1.06327689, + "epoch": 0.4495959984609465, + "flos": 382289310720.0, + "grad_norm": 0.08823378464345366, + "language_loss": 0.8815735, + "learning_rate": 0.0006051338487650047, + "loss": 0.89249313, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.28686523, + "step": 2337, + "time_per_iteration": 2.4994585514068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094323, + "balance_loss_mlp": 1.06595135, + "epoch": 0.4497883801462101, + "flos": 497630145024.0, + "grad_norm": 0.058014135330130424, + "language_loss": 0.82146972, + "learning_rate": 0.0006048292509534095, + "loss": 0.83241296, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.28344727, + "step": 2338, + "time_per_iteration": 2.6184592247009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099625, + "balance_loss_mlp": 1.07211113, + "epoch": 0.4499807618314736, + "flos": 614166990336.0, + "grad_norm": 0.056454767026620875, + "language_loss": 0.77617335, + "learning_rate": 0.0006045246124434895, + "loss": 0.78716958, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.27539062, + "step": 2339, + "time_per_iteration": 2.7225115299224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100795, + "balance_loss_mlp": 1.07309031, + "epoch": 0.4501731435167372, + "flos": 1005122331648.0, + "grad_norm": 0.09896135571333878, + "language_loss": 0.86173731, + "learning_rate": 0.0006042199333535162, + "loss": 0.87274528, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.27709961, + "step": 2340, + "time_per_iteration": 3.274585008621216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104864, + "balance_loss_mlp": 1.07768369, + "epoch": 0.4503655252020008, + "flos": 820519898112.0, + "grad_norm": 0.05749680267159243, + "language_loss": 0.84251344, + "learning_rate": 0.0006039152138017763, + "loss": 0.85356206, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.27246094, + "step": 2341, + "time_per_iteration": 3.060763359069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105096, + "balance_loss_mlp": 1.07796395, + "epoch": 0.4505579068872643, + "flos": 486113937408.0, + "grad_norm": 0.056134576893582644, + "language_loss": 0.83558077, + "learning_rate": 0.0006036104539065726, + "loss": 0.84663171, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.27172852, + "step": 2342, + "time_per_iteration": 2.7406816482543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108201, + "balance_loss_mlp": 1.08054459, + "epoch": 0.4507502885725279, + "flos": 884421282816.0, + "grad_norm": 0.061859527889038764, + "language_loss": 0.84472108, + "learning_rate": 0.000603305653786223, + "loss": 0.85580313, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.27685547, + "step": 2343, + "time_per_iteration": 3.197312355041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100913, + "balance_loss_mlp": 1.07354283, + "epoch": 0.45094267025779144, + "flos": 578070328320.0, + "grad_norm": 0.054371913691722666, + "language_loss": 0.83979696, + "learning_rate": 0.0006030008135590622, + "loss": 0.85080612, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.27416992, + "step": 2344, + "time_per_iteration": 2.7234296798706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097762, + "balance_loss_mlp": 1.07062995, + "epoch": 0.45113505194305503, + "flos": 525124947456.0, + "grad_norm": 0.05301123134364682, + "language_loss": 0.8020395, + "learning_rate": 0.0006026959333434387, + "loss": 0.81301707, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.27172852, + "step": 2345, + "time_per_iteration": 2.7582781314849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109947, + "balance_loss_mlp": 1.0720278, + "epoch": 0.45132743362831856, + "flos": 501791376384.0, + "grad_norm": 0.056237590740745906, + "language_loss": 0.77273649, + "learning_rate": 0.0006023910132577181, + "loss": 0.78373116, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.2746582, + "step": 2346, + "time_per_iteration": 2.658339262008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086046, + "balance_loss_mlp": 1.05915189, + "epoch": 0.45151981531358215, + "flos": 431690328576.0, + "grad_norm": 0.061957652789735564, + "language_loss": 0.84835315, + "learning_rate": 0.0006020860534202806, + "loss": 0.85921359, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.26953125, + "step": 2347, + "time_per_iteration": 2.5046098232269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010926, + "balance_loss_mlp": 1.06475294, + "epoch": 0.4517121969988457, + "flos": 711826859520.0, + "grad_norm": 0.05205934628014934, + "language_loss": 0.80817962, + "learning_rate": 0.0006017810539495224, + "loss": 0.81910563, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.27905273, + "step": 2348, + "time_per_iteration": 2.9269816875457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094292, + "balance_loss_mlp": 1.06642056, + "epoch": 0.45190457868410927, + "flos": 579197938176.0, + "grad_norm": 0.0701488599790333, + "language_loss": 0.82789373, + "learning_rate": 0.0006014760149638547, + "loss": 0.83883661, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.27880859, + "step": 2349, + "time_per_iteration": 2.725395441055298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085621, + "balance_loss_mlp": 1.05837011, + "epoch": 0.45209696036937286, + "flos": 482415395328.0, + "grad_norm": 0.05676126010630497, + "language_loss": 0.88258755, + "learning_rate": 0.000601170936581704, + "loss": 0.89344376, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.27270508, + "step": 2350, + "time_per_iteration": 2.5604915618896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088839, + "balance_loss_mlp": 1.06101537, + "epoch": 0.4522893420546364, + "flos": 539945409024.0, + "grad_norm": 0.07551987134141444, + "language_loss": 0.84626472, + "learning_rate": 0.0006008658189215121, + "loss": 0.85715318, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.27832031, + "step": 2351, + "time_per_iteration": 2.6299045085906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100125, + "balance_loss_mlp": 1.07158601, + "epoch": 0.4524817237399, + "flos": 496423959552.0, + "grad_norm": 0.07553479525673996, + "language_loss": 0.79898262, + "learning_rate": 0.0006005606621017366, + "loss": 0.80998385, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.28540039, + "step": 2352, + "time_per_iteration": 2.58725905418396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095957, + "balance_loss_mlp": 1.06732249, + "epoch": 0.4526741054251635, + "flos": 652229300736.0, + "grad_norm": 0.05769795994016392, + "language_loss": 0.8022939, + "learning_rate": 0.0006002554662408496, + "loss": 0.81325346, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.28637695, + "step": 2353, + "time_per_iteration": 2.9054527282714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089231, + "balance_loss_mlp": 1.06078792, + "epoch": 0.4528664871104271, + "flos": 570674654208.0, + "grad_norm": 0.07238968138349489, + "language_loss": 0.91292691, + "learning_rate": 0.0005999502314573388, + "loss": 0.92381918, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.28393555, + "step": 2354, + "time_per_iteration": 2.6389734745025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085728, + "balance_loss_mlp": 1.05656958, + "epoch": 0.45305886879569063, + "flos": 458480922624.0, + "grad_norm": 0.0719451372015111, + "language_loss": 0.86045247, + "learning_rate": 0.0005996449578697066, + "loss": 0.87130976, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.29174805, + "step": 2355, + "time_per_iteration": 2.6851072311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094315, + "balance_loss_mlp": 1.06634867, + "epoch": 0.4532512504809542, + "flos": 504922512384.0, + "grad_norm": 0.05612545408526447, + "language_loss": 0.81111002, + "learning_rate": 0.0005993396455964709, + "loss": 0.82205319, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.2800293, + "step": 2356, + "time_per_iteration": 2.6760780811309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095343, + "balance_loss_mlp": 1.06754375, + "epoch": 0.4534436321662178, + "flos": 581940578304.0, + "grad_norm": 0.05702970789361519, + "language_loss": 0.81782162, + "learning_rate": 0.0005990342947561647, + "loss": 0.82877505, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.27856445, + "step": 2357, + "time_per_iteration": 2.6935572624206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108513, + "balance_loss_mlp": 1.07949746, + "epoch": 0.45363601385148133, + "flos": 549458090496.0, + "grad_norm": 0.06168719534303639, + "language_loss": 0.77822679, + "learning_rate": 0.0005987289054673351, + "loss": 0.78931195, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.28979492, + "step": 2358, + "time_per_iteration": 2.6254196166992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191784, + "balance_loss_mlp": 1.18038785, + "epoch": 0.4538283955367449, + "flos": 1473754411008.0, + "grad_norm": 0.06020491976481073, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77767521, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.11376953, + "step": 2359, + "time_per_iteration": 4.803730010986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112502, + "balance_loss_mlp": 1.08300948, + "epoch": 0.45402077722200845, + "flos": 584441699328.0, + "grad_norm": 0.06904936924963041, + "language_loss": 0.90802431, + "learning_rate": 0.0005981180120183722, + "loss": 0.91914928, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.29443359, + "step": 2360, + "time_per_iteration": 2.672501564025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115124, + "balance_loss_mlp": 1.08560812, + "epoch": 0.45421315890727204, + "flos": 531462822912.0, + "grad_norm": 0.18994365983189826, + "language_loss": 0.85107553, + "learning_rate": 0.0005978125080954089, + "loss": 0.86222672, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.29492188, + "step": 2361, + "time_per_iteration": 2.7426631450653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111841, + "balance_loss_mlp": 1.0814904, + "epoch": 0.4544055405925356, + "flos": 784890307584.0, + "grad_norm": 0.07946717837388541, + "language_loss": 0.76933616, + "learning_rate": 0.000597506966198262, + "loss": 0.78045452, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.30297852, + "step": 2362, + "time_per_iteration": 2.9498252868652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113617, + "balance_loss_mlp": 1.08438706, + "epoch": 0.45459792227779916, + "flos": 517950443520.0, + "grad_norm": 0.08220053414262748, + "language_loss": 0.83964276, + "learning_rate": 0.0005972013864455536, + "loss": 0.85077894, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.29199219, + "step": 2363, + "time_per_iteration": 2.623084545135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112763, + "balance_loss_mlp": 1.0844152, + "epoch": 0.4547903039630627, + "flos": 537306075648.0, + "grad_norm": 0.07689777421943021, + "language_loss": 0.84891784, + "learning_rate": 0.0005968957689559203, + "loss": 0.86004549, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.28369141, + "step": 2364, + "time_per_iteration": 4.15172266960144 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103083, + "balance_loss_mlp": 1.07492638, + "epoch": 0.4549826856483263, + "flos": 528423409152.0, + "grad_norm": 0.0791653109712497, + "language_loss": 0.88481373, + "learning_rate": 0.0005965901138480131, + "loss": 0.89584458, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.28173828, + "step": 2365, + "time_per_iteration": 2.5800631046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097109, + "balance_loss_mlp": 1.06840384, + "epoch": 0.45517506733358987, + "flos": 520649413632.0, + "grad_norm": 0.06578783357270249, + "language_loss": 0.87197572, + "learning_rate": 0.0005962844212404982, + "loss": 0.88294685, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.28686523, + "step": 2366, + "time_per_iteration": 2.6940040588378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091654, + "balance_loss_mlp": 1.06344962, + "epoch": 0.4553674490188534, + "flos": 450814616064.0, + "grad_norm": 0.05998271622094208, + "language_loss": 0.86890531, + "learning_rate": 0.0005959786912520558, + "loss": 0.87982178, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.2824707, + "step": 2367, + "time_per_iteration": 2.65913987159729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096727, + "balance_loss_mlp": 1.06854558, + "epoch": 0.455559830704117, + "flos": 546308015616.0, + "grad_norm": 0.04792571197867491, + "language_loss": 0.83765805, + "learning_rate": 0.0005956729240013806, + "loss": 0.8486253, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.28173828, + "step": 2368, + "time_per_iteration": 2.8546009063720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108894, + "balance_loss_mlp": 1.08035553, + "epoch": 0.4557522123893805, + "flos": 583491589632.0, + "grad_norm": 0.054790339147135006, + "language_loss": 0.91898453, + "learning_rate": 0.0005953671196071824, + "loss": 0.93007344, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.28540039, + "step": 2369, + "time_per_iteration": 2.7034096717834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115288, + "balance_loss_mlp": 1.08767939, + "epoch": 0.4559445940746441, + "flos": 526149250560.0, + "grad_norm": 0.05736115779957956, + "language_loss": 0.79610699, + "learning_rate": 0.0005950612781881846, + "loss": 0.8072598, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.27636719, + "step": 2370, + "time_per_iteration": 2.707674264907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124856, + "balance_loss_mlp": 1.09662771, + "epoch": 0.45613697575990764, + "flos": 651810281472.0, + "grad_norm": 0.08139155344435882, + "language_loss": 0.75630575, + "learning_rate": 0.0005947553998631259, + "loss": 0.76755428, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.2824707, + "step": 2371, + "time_per_iteration": 2.8811731338500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125619, + "balance_loss_mlp": 1.09770048, + "epoch": 0.4563293574451712, + "flos": 866744699904.0, + "grad_norm": 0.07117752980456016, + "language_loss": 0.79090154, + "learning_rate": 0.000594449484750758, + "loss": 0.80215776, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.27905273, + "step": 2372, + "time_per_iteration": 3.1549901962280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116997, + "balance_loss_mlp": 1.08807683, + "epoch": 0.45652173913043476, + "flos": 497817819648.0, + "grad_norm": 0.061849801440599636, + "language_loss": 0.82697588, + "learning_rate": 0.0005941435329698484, + "loss": 0.83814585, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.2890625, + "step": 2373, + "time_per_iteration": 2.6593072414398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118584, + "balance_loss_mlp": 1.09054554, + "epoch": 0.45671412081569834, + "flos": 560581420032.0, + "grad_norm": 0.06278217801879041, + "language_loss": 0.83130741, + "learning_rate": 0.0005938375446391778, + "loss": 0.8424933, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.28051758, + "step": 2374, + "time_per_iteration": 2.7434608936309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124198, + "balance_loss_mlp": 1.09563541, + "epoch": 0.45690650250096193, + "flos": 502873906176.0, + "grad_norm": 0.06820583935841042, + "language_loss": 0.89043015, + "learning_rate": 0.0005935315198775415, + "loss": 0.90167212, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.28540039, + "step": 2375, + "time_per_iteration": 2.6057205200195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113332, + "balance_loss_mlp": 1.08558059, + "epoch": 0.45709888418622546, + "flos": 430473968640.0, + "grad_norm": 0.07601718344596131, + "language_loss": 0.87262166, + "learning_rate": 0.0005932254588037486, + "loss": 0.88375497, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.27783203, + "step": 2376, + "time_per_iteration": 2.4881751537323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103499, + "balance_loss_mlp": 1.07462692, + "epoch": 0.45729126587148905, + "flos": 525395579904.0, + "grad_norm": 0.07182864232109534, + "language_loss": 0.86405516, + "learning_rate": 0.000592919361536623, + "loss": 0.87509012, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.28857422, + "step": 2377, + "time_per_iteration": 2.6453545093536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101831, + "balance_loss_mlp": 1.07376885, + "epoch": 0.4574836475567526, + "flos": 637717349376.0, + "grad_norm": 0.06032083182665244, + "language_loss": 0.88920552, + "learning_rate": 0.0005926132281950017, + "loss": 0.90022385, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.28076172, + "step": 2378, + "time_per_iteration": 2.7356886863708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096599, + "balance_loss_mlp": 1.0672735, + "epoch": 0.45767602924201617, + "flos": 649288811520.0, + "grad_norm": 0.07556174313152972, + "language_loss": 0.8485238, + "learning_rate": 0.0005923070588977367, + "loss": 0.8594898, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.29248047, + "step": 2379, + "time_per_iteration": 2.812110185623169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095202, + "balance_loss_mlp": 1.0665921, + "epoch": 0.4578684109272797, + "flos": 746356543488.0, + "grad_norm": 0.0597594421207511, + "language_loss": 0.86065739, + "learning_rate": 0.0005920008537636931, + "loss": 0.87160945, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.28613281, + "step": 2380, + "time_per_iteration": 2.8955793380737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094751, + "balance_loss_mlp": 1.06518722, + "epoch": 0.4580607926125433, + "flos": 641155433472.0, + "grad_norm": 0.08202954174104495, + "language_loss": 0.86535549, + "learning_rate": 0.0005916946129117504, + "loss": 0.87630302, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.29516602, + "step": 2381, + "time_per_iteration": 2.8850152492523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089485, + "balance_loss_mlp": 1.05958724, + "epoch": 0.4582531742978069, + "flos": 801513474048.0, + "grad_norm": 0.06022733145419036, + "language_loss": 0.80483937, + "learning_rate": 0.0005913883364608017, + "loss": 0.81573421, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.29833984, + "step": 2382, + "time_per_iteration": 3.0977792739868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092347, + "balance_loss_mlp": 1.06225872, + "epoch": 0.4584455559830704, + "flos": 683991613440.0, + "grad_norm": 0.07912283694355432, + "language_loss": 0.88849449, + "learning_rate": 0.0005910820245297542, + "loss": 0.899418, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.30053711, + "step": 2383, + "time_per_iteration": 2.905977964401245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081098, + "balance_loss_mlp": 1.05055714, + "epoch": 0.458637937668334, + "flos": 517902391296.0, + "grad_norm": 0.06971122212551431, + "language_loss": 0.810808, + "learning_rate": 0.000590775677237529, + "loss": 0.82161897, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.30517578, + "step": 2384, + "time_per_iteration": 2.7233986854553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078055, + "balance_loss_mlp": 1.04810929, + "epoch": 0.4588303193535975, + "flos": 505242607104.0, + "grad_norm": 0.10145803635005178, + "language_loss": 0.79860461, + "learning_rate": 0.0005904692947030601, + "loss": 0.80938518, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.29882812, + "step": 2385, + "time_per_iteration": 2.602890729904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076489, + "balance_loss_mlp": 1.04647207, + "epoch": 0.4590227010388611, + "flos": 495655732224.0, + "grad_norm": 0.08299143875661358, + "language_loss": 0.89372921, + "learning_rate": 0.0005901628770452963, + "loss": 0.90449417, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.29956055, + "step": 2386, + "time_per_iteration": 2.56011700630188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075782, + "balance_loss_mlp": 1.04586029, + "epoch": 0.45921508272412465, + "flos": 493375781376.0, + "grad_norm": 0.05953614440228025, + "language_loss": 0.87499726, + "learning_rate": 0.000589856424383199, + "loss": 0.88575506, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.29882812, + "step": 2387, + "time_per_iteration": 2.622857093811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078237, + "balance_loss_mlp": 1.04762435, + "epoch": 0.45940746440938823, + "flos": 691096306176.0, + "grad_norm": 0.06461384040637212, + "language_loss": 0.8283028, + "learning_rate": 0.000589549936835744, + "loss": 0.83908516, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.30566406, + "step": 2388, + "time_per_iteration": 2.9280176162719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082083, + "balance_loss_mlp": 1.0514698, + "epoch": 0.45959984609465176, + "flos": 503489364480.0, + "grad_norm": 0.07025219360641571, + "language_loss": 0.79160953, + "learning_rate": 0.0005892434145219202, + "loss": 0.80243033, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.30566406, + "step": 2389, + "time_per_iteration": 2.632772207260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081464, + "balance_loss_mlp": 1.050946, + "epoch": 0.45979222777991535, + "flos": 676339863552.0, + "grad_norm": 0.060348492919292666, + "language_loss": 0.82535923, + "learning_rate": 0.0005889368575607303, + "loss": 0.83617389, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.3046875, + "step": 2390, + "time_per_iteration": 2.815487861633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094579, + "balance_loss_mlp": 1.06358492, + "epoch": 0.45998460946517894, + "flos": 777308368896.0, + "grad_norm": 0.05491617941274289, + "language_loss": 0.78348118, + "learning_rate": 0.00058863026607119, + "loss": 0.79442704, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.30957031, + "step": 2391, + "time_per_iteration": 3.0853166580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092193, + "balance_loss_mlp": 1.0620811, + "epoch": 0.46017699115044247, + "flos": 851073053184.0, + "grad_norm": 0.05825671270919626, + "language_loss": 0.79661655, + "learning_rate": 0.0005883236401723287, + "loss": 0.80753851, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.30078125, + "step": 2392, + "time_per_iteration": 3.1643104553222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096169, + "balance_loss_mlp": 1.06536531, + "epoch": 0.46036937283570606, + "flos": 575608495104.0, + "grad_norm": 0.06457998167472197, + "language_loss": 0.84046978, + "learning_rate": 0.0005880169799831893, + "loss": 0.85143149, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.30761719, + "step": 2393, + "time_per_iteration": 2.6935391426086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096173, + "balance_loss_mlp": 1.0654645, + "epoch": 0.4605617545209696, + "flos": 611553798144.0, + "grad_norm": 0.06354744392782355, + "language_loss": 0.81838334, + "learning_rate": 0.0005877102856228278, + "loss": 0.82934511, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.30664062, + "step": 2394, + "time_per_iteration": 2.8314805030822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097821, + "balance_loss_mlp": 1.06713629, + "epoch": 0.4607541362062332, + "flos": 532884386304.0, + "grad_norm": 0.0665210460005036, + "language_loss": 0.84696203, + "learning_rate": 0.0005874035572103133, + "loss": 0.8579402, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.30664062, + "step": 2395, + "time_per_iteration": 2.6893725395202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098408, + "balance_loss_mlp": 1.0673902, + "epoch": 0.4609465178914967, + "flos": 647023417344.0, + "grad_norm": 0.1082823786036068, + "language_loss": 0.82554322, + "learning_rate": 0.0005870967948647288, + "loss": 0.83652729, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.30981445, + "step": 2396, + "time_per_iteration": 2.7625200748443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191183, + "balance_loss_mlp": 1.1745894, + "epoch": 0.4611388995767603, + "flos": 1465487202816.0, + "grad_norm": 0.05861502253959749, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75499487, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.16601562, + "step": 2397, + "time_per_iteration": 5.363407850265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090965, + "balance_loss_mlp": 1.06028056, + "epoch": 0.46133128126202383, + "flos": 722772688896.0, + "grad_norm": 0.08876233940236913, + "language_loss": 0.85477209, + "learning_rate": 0.0005864831688507443, + "loss": 0.86568171, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.30639648, + "step": 2398, + "time_per_iteration": 2.9619805812835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081398, + "balance_loss_mlp": 1.05119061, + "epoch": 0.4615236629472874, + "flos": 547735371264.0, + "grad_norm": 0.06931834879873142, + "language_loss": 0.75342947, + "learning_rate": 0.0005861763054205754, + "loss": 0.76424348, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.30151367, + "step": 2399, + "time_per_iteration": 2.7531988620758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091818, + "balance_loss_mlp": 1.06213522, + "epoch": 0.461716044632551, + "flos": 601942192128.0, + "grad_norm": 0.05751461156756605, + "language_loss": 0.80467141, + "learning_rate": 0.0005858694085337976, + "loss": 0.81558955, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.29614258, + "step": 2400, + "time_per_iteration": 2.814182758331299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083104, + "balance_loss_mlp": 1.05246735, + "epoch": 0.46190842631781454, + "flos": 474236937216.0, + "grad_norm": 0.07664119673877032, + "language_loss": 0.8354007, + "learning_rate": 0.0005855624783095589, + "loss": 0.8462317, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.30615234, + "step": 2401, + "time_per_iteration": 2.57083797454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083538, + "balance_loss_mlp": 1.05414128, + "epoch": 0.4621008080030781, + "flos": 437254184448.0, + "grad_norm": 0.06712435829168825, + "language_loss": 0.85380065, + "learning_rate": 0.00058525551486702, + "loss": 0.864636, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.29370117, + "step": 2402, + "time_per_iteration": 2.554870843887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084683, + "balance_loss_mlp": 1.05476141, + "epoch": 0.46229318968834165, + "flos": 525203523072.0, + "grad_norm": 0.06447976336023753, + "language_loss": 0.80940902, + "learning_rate": 0.0005849485183253548, + "loss": 0.82025588, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.29882812, + "step": 2403, + "time_per_iteration": 2.6398868560791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108489, + "balance_loss_mlp": 1.05546916, + "epoch": 0.46248557137360524, + "flos": 439395922944.0, + "grad_norm": 0.07099246909711197, + "language_loss": 0.87546206, + "learning_rate": 0.0005846414888037501, + "loss": 0.88631094, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.29345703, + "step": 2404, + "time_per_iteration": 2.5056095123291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086728, + "balance_loss_mlp": 1.05725932, + "epoch": 0.4626779530588688, + "flos": 617318475264.0, + "grad_norm": 0.052798237228442416, + "language_loss": 0.82345319, + "learning_rate": 0.0005843344264214049, + "loss": 0.83432049, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.29443359, + "step": 2405, + "time_per_iteration": 2.7549078464508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091326, + "balance_loss_mlp": 1.06176221, + "epoch": 0.46287033474413236, + "flos": 669796784640.0, + "grad_norm": 0.05337180485738099, + "language_loss": 0.84920704, + "learning_rate": 0.0005840273312975317, + "loss": 0.8601203, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.29516602, + "step": 2406, + "time_per_iteration": 2.9058027267456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085122, + "balance_loss_mlp": 1.05577278, + "epoch": 0.46306271642939595, + "flos": 479992849920.0, + "grad_norm": 0.05333458165520064, + "language_loss": 0.89626014, + "learning_rate": 0.0005837202035513555, + "loss": 0.90711135, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.29345703, + "step": 2407, + "time_per_iteration": 2.5721802711486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094311, + "balance_loss_mlp": 1.06531978, + "epoch": 0.4632550981146595, + "flos": 580395359232.0, + "grad_norm": 0.0552743160267319, + "language_loss": 0.81124538, + "learning_rate": 0.0005834130433021136, + "loss": 0.8221885, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.28930664, + "step": 2408, + "time_per_iteration": 2.7402079105377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109161, + "balance_loss_mlp": 1.06166446, + "epoch": 0.46344747979992307, + "flos": 523701974016.0, + "grad_norm": 0.09526074365649402, + "language_loss": 0.73246038, + "learning_rate": 0.0005831058506690563, + "loss": 0.74337649, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.29931641, + "step": 2409, + "time_per_iteration": 2.6229617595672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088655, + "balance_loss_mlp": 1.05875707, + "epoch": 0.4636398614851866, + "flos": 746174661120.0, + "grad_norm": 0.061078353708003665, + "language_loss": 0.85864687, + "learning_rate": 0.0005827986257714464, + "loss": 0.86953342, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.29858398, + "step": 2410, + "time_per_iteration": 2.9352338314056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094131, + "balance_loss_mlp": 1.06404257, + "epoch": 0.4638322431704502, + "flos": 596273619456.0, + "grad_norm": 0.05695764594036898, + "language_loss": 0.88375425, + "learning_rate": 0.0005824913687285591, + "loss": 0.89469558, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.30078125, + "step": 2411, + "time_per_iteration": 2.6807737350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097526, + "balance_loss_mlp": 1.06698477, + "epoch": 0.4640246248557137, + "flos": 539172799488.0, + "grad_norm": 0.0643729084989199, + "language_loss": 0.81849819, + "learning_rate": 0.0005821840796596821, + "loss": 0.82947344, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.30493164, + "step": 2412, + "time_per_iteration": 2.663177967071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096211, + "balance_loss_mlp": 1.0657649, + "epoch": 0.4642170065409773, + "flos": 562330280448.0, + "grad_norm": 0.07601159389817994, + "language_loss": 0.80307502, + "learning_rate": 0.0005818767586841158, + "loss": 0.81403708, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.30419922, + "step": 2413, + "time_per_iteration": 2.7600111961364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092616, + "balance_loss_mlp": 1.06233692, + "epoch": 0.46440938822624084, + "flos": 530684421120.0, + "grad_norm": 0.059484167412089096, + "language_loss": 0.86110759, + "learning_rate": 0.0005815694059211726, + "loss": 0.87203372, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.30249023, + "step": 2414, + "time_per_iteration": 2.65578031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148176, + "balance_loss_mlp": 1.13263142, + "epoch": 0.4646017699115044, + "flos": 1525503780864.0, + "grad_norm": 0.0462911781552321, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.82021809, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.15527344, + "step": 2415, + "time_per_iteration": 4.8046934604644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115513, + "balance_loss_mlp": 1.10092187, + "epoch": 0.464794151596768, + "flos": 1539999765504.0, + "grad_norm": 0.038481348382240925, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78060573, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.14550781, + "step": 2416, + "time_per_iteration": 4.977246999740601 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085994, + "balance_loss_mlp": 1.05554748, + "epoch": 0.46498653328203154, + "flos": 501200649216.0, + "grad_norm": 0.07046148078843767, + "language_loss": 0.85802382, + "learning_rate": 0.0005806471581013931, + "loss": 0.86888373, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.30395508, + "step": 2417, + "time_per_iteration": 2.7680604457855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084029, + "balance_loss_mlp": 1.05363095, + "epoch": 0.46517891496729513, + "flos": 675856825344.0, + "grad_norm": 0.061868019756872866, + "language_loss": 0.78540701, + "learning_rate": 0.0005803396793823146, + "loss": 0.7962473, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.30371094, + "step": 2418, + "time_per_iteration": 2.818821430206299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081583, + "balance_loss_mlp": 1.05213845, + "epoch": 0.46537129665255866, + "flos": 585062949888.0, + "grad_norm": 0.08069009721002836, + "language_loss": 0.8594386, + "learning_rate": 0.0005800321694726065, + "loss": 0.8702544, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.29418945, + "step": 2419, + "time_per_iteration": 2.812563896179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085011, + "balance_loss_mlp": 1.05454159, + "epoch": 0.46556367833782225, + "flos": 587425858560.0, + "grad_norm": 0.061646313113324705, + "language_loss": 0.86883628, + "learning_rate": 0.0005797246284916545, + "loss": 0.87968636, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.30444336, + "step": 2420, + "time_per_iteration": 2.6945559978485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_mlp": 1.02332675, + "epoch": 0.4657560600230858, + "flos": 1484674099200.0, + "grad_norm": 0.024509703594541715, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78539675, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.11181641, + "step": 2421, + "time_per_iteration": 5.001375436782837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089527, + "balance_loss_mlp": 1.06036878, + "epoch": 0.46594844170834937, + "flos": 579961783296.0, + "grad_norm": 0.07023208249232396, + "language_loss": 0.8781141, + "learning_rate": 0.0005791094537936233, + "loss": 0.88900936, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.29150391, + "step": 2422, + "time_per_iteration": 2.703678846359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010888, + "balance_loss_mlp": 1.06028509, + "epoch": 0.4661408233936129, + "flos": 512322568704.0, + "grad_norm": 0.06283657209164231, + "language_loss": 0.817285, + "learning_rate": 0.0005788018203153762, + "loss": 0.82817304, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.28515625, + "step": 2423, + "time_per_iteration": 2.6398653984069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081237, + "balance_loss_mlp": 1.05255485, + "epoch": 0.4663332050788765, + "flos": 490839754752.0, + "grad_norm": 0.0646507393923986, + "language_loss": 0.85720015, + "learning_rate": 0.000578494156243549, + "loss": 0.86801249, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.28686523, + "step": 2424, + "time_per_iteration": 2.6061441898345947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086736, + "balance_loss_mlp": 1.05695724, + "epoch": 0.4665255867641401, + "flos": 512353092096.0, + "grad_norm": 0.05149395612804314, + "language_loss": 0.89174867, + "learning_rate": 0.0005781864616975878, + "loss": 0.90261602, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.29736328, + "step": 2425, + "time_per_iteration": 2.7073817253112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087687, + "balance_loss_mlp": 1.05917215, + "epoch": 0.4667179684494036, + "flos": 424590018048.0, + "grad_norm": 0.0742004751674347, + "language_loss": 0.84101117, + "learning_rate": 0.0005778787367969502, + "loss": 0.85188806, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.28515625, + "step": 2426, + "time_per_iteration": 2.643342971801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082589, + "balance_loss_mlp": 1.05374038, + "epoch": 0.4669103501346672, + "flos": 707640897024.0, + "grad_norm": 0.05195761556147334, + "language_loss": 0.80815637, + "learning_rate": 0.0005775709816611053, + "loss": 0.81898224, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.28857422, + "step": 2427, + "time_per_iteration": 3.0103423595428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085111, + "balance_loss_mlp": 1.05604792, + "epoch": 0.4671027318199307, + "flos": 554554874880.0, + "grad_norm": 0.05192902090033842, + "language_loss": 0.83742678, + "learning_rate": 0.0005772631964095346, + "loss": 0.84827781, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.29003906, + "step": 2428, + "time_per_iteration": 4.2191994190216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010894, + "balance_loss_mlp": 1.06107569, + "epoch": 0.4672951135051943, + "flos": 566839309824.0, + "grad_norm": 0.05894584384100732, + "language_loss": 0.85613596, + "learning_rate": 0.000576955381161731, + "loss": 0.86702996, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.28320312, + "step": 2429, + "time_per_iteration": 2.7035927772521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082394, + "balance_loss_mlp": 1.05297327, + "epoch": 0.46748749519045785, + "flos": 424294654464.0, + "grad_norm": 0.07711305585297333, + "language_loss": 0.8606714, + "learning_rate": 0.0005766475360371985, + "loss": 0.87149525, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.29394531, + "step": 2430, + "time_per_iteration": 2.5702948570251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092231, + "balance_loss_mlp": 1.06292963, + "epoch": 0.46767987687572143, + "flos": 538088859648.0, + "grad_norm": 0.08342834969675962, + "language_loss": 0.84959614, + "learning_rate": 0.0005763396611554536, + "loss": 0.86051846, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.29248047, + "step": 2431, + "time_per_iteration": 2.6236841678619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092277, + "balance_loss_mlp": 1.06383383, + "epoch": 0.467872258560985, + "flos": 823360052736.0, + "grad_norm": 0.06223220956170435, + "language_loss": 0.80269897, + "learning_rate": 0.0005760317566360237, + "loss": 0.81362176, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.28466797, + "step": 2432, + "time_per_iteration": 3.0205023288726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084277, + "balance_loss_mlp": 1.0559535, + "epoch": 0.46806464024624855, + "flos": 661366632960.0, + "grad_norm": 0.058294757950733474, + "language_loss": 0.85130137, + "learning_rate": 0.000575723822598448, + "loss": 0.86214417, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.28295898, + "step": 2433, + "time_per_iteration": 2.79516339302063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086726, + "balance_loss_mlp": 1.05866385, + "epoch": 0.46825702193151214, + "flos": 755362865664.0, + "grad_norm": 0.06256497191901454, + "language_loss": 0.81601393, + "learning_rate": 0.0005754158591622773, + "loss": 0.82688123, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.28076172, + "step": 2434, + "time_per_iteration": 2.963247537612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092504, + "balance_loss_mlp": 1.06365538, + "epoch": 0.4684494036167757, + "flos": 439164578304.0, + "grad_norm": 0.08333045297400817, + "language_loss": 0.8228929, + "learning_rate": 0.0005751078664470732, + "loss": 0.83381796, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.28833008, + "step": 2435, + "time_per_iteration": 2.537179470062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085014, + "balance_loss_mlp": 1.05688024, + "epoch": 0.46864178530203926, + "flos": 532446428160.0, + "grad_norm": 0.08080859282065189, + "language_loss": 0.85670036, + "learning_rate": 0.0005747998445724094, + "loss": 0.86755049, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.28125, + "step": 2436, + "time_per_iteration": 2.6276183128356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083485, + "balance_loss_mlp": 1.05466008, + "epoch": 0.4688341669873028, + "flos": 576328670208.0, + "grad_norm": 0.08810611044699188, + "language_loss": 0.89099967, + "learning_rate": 0.0005744917936578707, + "loss": 0.90183449, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.28808594, + "step": 2437, + "time_per_iteration": 2.784236431121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085755, + "balance_loss_mlp": 1.05690634, + "epoch": 0.4690265486725664, + "flos": 539296455168.0, + "grad_norm": 0.08777270325229546, + "language_loss": 0.83928555, + "learning_rate": 0.0005741837138230526, + "loss": 0.85014307, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.28808594, + "step": 2438, + "time_per_iteration": 2.7139840126037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078469, + "balance_loss_mlp": 1.05014467, + "epoch": 0.4692189303578299, + "flos": 770168770560.0, + "grad_norm": 0.053438427497709357, + "language_loss": 0.86270201, + "learning_rate": 0.0005738756051875627, + "loss": 0.87348676, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.28295898, + "step": 2439, + "time_per_iteration": 3.092337131500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074485, + "balance_loss_mlp": 1.04551697, + "epoch": 0.4694113120430935, + "flos": 571118404608.0, + "grad_norm": 0.056335724754341315, + "language_loss": 0.83459938, + "learning_rate": 0.0005735674678710192, + "loss": 0.84534419, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.28930664, + "step": 2440, + "time_per_iteration": 2.6729819774627686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107755, + "balance_loss_mlp": 1.0473665, + "epoch": 0.4696036937283571, + "flos": 748498281984.0, + "grad_norm": 0.06862136292067082, + "language_loss": 0.80992246, + "learning_rate": 0.0005732593019930517, + "loss": 0.82069802, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.30126953, + "step": 2441, + "time_per_iteration": 2.917332649230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078244, + "balance_loss_mlp": 1.04779828, + "epoch": 0.4697960754136206, + "flos": 493208455680.0, + "grad_norm": 0.06788307957029095, + "language_loss": 0.8767302, + "learning_rate": 0.0005729511076733008, + "loss": 0.88751262, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.30395508, + "step": 2442, + "time_per_iteration": 2.6602578163146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108041, + "balance_loss_mlp": 1.05003536, + "epoch": 0.4699884570988842, + "flos": 724809710592.0, + "grad_norm": 0.08414136163770505, + "language_loss": 0.84802854, + "learning_rate": 0.000572642885031418, + "loss": 0.85883266, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.30322266, + "step": 2443, + "time_per_iteration": 2.924572706222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075591, + "balance_loss_mlp": 1.04516852, + "epoch": 0.47018083878414774, + "flos": 555141219840.0, + "grad_norm": 0.055800438037163856, + "language_loss": 0.80518812, + "learning_rate": 0.0005723346341870662, + "loss": 0.81594402, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.30371094, + "step": 2444, + "time_per_iteration": 2.7203280925750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082669, + "balance_loss_mlp": 1.05217505, + "epoch": 0.4703732204694113, + "flos": 423846521856.0, + "grad_norm": 0.06929087535104682, + "language_loss": 0.86297798, + "learning_rate": 0.0005720263552599188, + "loss": 0.87380457, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.30444336, + "step": 2445, + "time_per_iteration": 2.469621419906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075882, + "balance_loss_mlp": 1.0456984, + "epoch": 0.47056560215467486, + "flos": 703179919872.0, + "grad_norm": 0.06843850090218344, + "language_loss": 0.79142129, + "learning_rate": 0.0005717180483696604, + "loss": 0.80218005, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.30151367, + "step": 2446, + "time_per_iteration": 2.9089763164520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072219, + "balance_loss_mlp": 1.04034209, + "epoch": 0.47075798383993844, + "flos": 554701851648.0, + "grad_norm": 0.07381367232784701, + "language_loss": 0.83118802, + "learning_rate": 0.0005714097136359862, + "loss": 0.84191024, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.31860352, + "step": 2447, + "time_per_iteration": 2.6346585750579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079268, + "balance_loss_mlp": 1.04817808, + "epoch": 0.470950365525202, + "flos": 564009329664.0, + "grad_norm": 0.06979677359463858, + "language_loss": 0.86324209, + "learning_rate": 0.0005711013511786027, + "loss": 0.87403476, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.31054688, + "step": 2448, + "time_per_iteration": 2.765740156173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073046, + "balance_loss_mlp": 1.0426712, + "epoch": 0.47114274721046556, + "flos": 534189496320.0, + "grad_norm": 0.048536468835106476, + "language_loss": 0.84014428, + "learning_rate": 0.0005707929611172263, + "loss": 0.85087478, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.3034668, + "step": 2449, + "time_per_iteration": 2.6891775131225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074493, + "balance_loss_mlp": 1.04349887, + "epoch": 0.47133512889572915, + "flos": 472877982720.0, + "grad_norm": 0.05569215031080998, + "language_loss": 0.83788037, + "learning_rate": 0.000570484543571585, + "loss": 0.84862536, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.30957031, + "step": 2450, + "time_per_iteration": 2.545646905899048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076975, + "balance_loss_mlp": 1.04743469, + "epoch": 0.4715275105809927, + "flos": 458776286208.0, + "grad_norm": 0.06210999897734131, + "language_loss": 0.82771122, + "learning_rate": 0.0005701760986614171, + "loss": 0.83848095, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.29492188, + "step": 2451, + "time_per_iteration": 2.5739784240722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080958, + "balance_loss_mlp": 1.05256283, + "epoch": 0.47171989226625627, + "flos": 421783358976.0, + "grad_norm": 0.06034093462601522, + "language_loss": 0.87343812, + "learning_rate": 0.0005698676265064714, + "loss": 0.88424772, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.28393555, + "step": 2452, + "time_per_iteration": 2.5456669330596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085544, + "balance_loss_mlp": 1.05612302, + "epoch": 0.4719122739515198, + "flos": 457200543744.0, + "grad_norm": 0.12010658803535784, + "language_loss": 0.88854802, + "learning_rate": 0.0005695591272265074, + "loss": 0.89940351, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.29370117, + "step": 2453, + "time_per_iteration": 2.53247332572937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086025, + "balance_loss_mlp": 1.05610394, + "epoch": 0.4721046556367834, + "flos": 514716000768.0, + "grad_norm": 0.06319040539886057, + "language_loss": 0.81670743, + "learning_rate": 0.0005692506009412954, + "loss": 0.8275677, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.29907227, + "step": 2454, + "time_per_iteration": 2.663959503173828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157874, + "balance_loss_mlp": 1.14423668, + "epoch": 0.4722970373220469, + "flos": 1571399723520.0, + "grad_norm": 0.046124065416459865, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78709137, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.13671875, + "step": 2455, + "time_per_iteration": 4.937524795532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085858, + "balance_loss_mlp": 1.05603182, + "epoch": 0.4724894190073105, + "flos": 585919927296.0, + "grad_norm": 0.07174058927835297, + "language_loss": 0.89622641, + "learning_rate": 0.0005686334678342593, + "loss": 0.907085, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.2980957, + "step": 2456, + "time_per_iteration": 2.9060487747192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077496, + "balance_loss_mlp": 1.04824257, + "epoch": 0.4726818006925741, + "flos": 867290347008.0, + "grad_norm": 0.07069871267474889, + "language_loss": 0.81667411, + "learning_rate": 0.0005683248612520274, + "loss": 0.82744908, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.29223633, + "step": 2457, + "time_per_iteration": 3.071544885635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084366, + "balance_loss_mlp": 1.05465865, + "epoch": 0.4728741823778376, + "flos": 752653721088.0, + "grad_norm": 0.07071545002601118, + "language_loss": 0.83683658, + "learning_rate": 0.0005680162281437321, + "loss": 0.84768021, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.296875, + "step": 2458, + "time_per_iteration": 2.931579113006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077685, + "balance_loss_mlp": 1.0476439, + "epoch": 0.4730665640631012, + "flos": 538301265408.0, + "grad_norm": 0.06018673388195985, + "language_loss": 0.84837544, + "learning_rate": 0.000567707568629195, + "loss": 0.85915226, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.30004883, + "step": 2459, + "time_per_iteration": 2.6860852241516113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079226, + "balance_loss_mlp": 1.04968619, + "epoch": 0.47325894574836475, + "flos": 491396986368.0, + "grad_norm": 0.053752412093893094, + "language_loss": 0.82513988, + "learning_rate": 0.0005673988828282486, + "loss": 0.83593214, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.29467773, + "step": 2460, + "time_per_iteration": 2.6679980754852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073438, + "balance_loss_mlp": 1.04320669, + "epoch": 0.47345132743362833, + "flos": 764117494272.0, + "grad_norm": 0.05735836881189746, + "language_loss": 0.80829632, + "learning_rate": 0.0005670901708607352, + "loss": 0.81903076, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.30175781, + "step": 2461, + "time_per_iteration": 2.962364673614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076898, + "balance_loss_mlp": 1.04635668, + "epoch": 0.47364370911889186, + "flos": 539925060096.0, + "grad_norm": 0.06660215000338995, + "language_loss": 0.84026098, + "learning_rate": 0.0005667814328465076, + "loss": 0.85102999, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.30493164, + "step": 2462, + "time_per_iteration": 2.6148030757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077856, + "balance_loss_mlp": 1.04824424, + "epoch": 0.47383609080415545, + "flos": 406002613248.0, + "grad_norm": 0.0820641824195461, + "language_loss": 0.81702316, + "learning_rate": 0.0005664726689054285, + "loss": 0.8278017, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.29541016, + "step": 2463, + "time_per_iteration": 2.46337628364563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079984, + "balance_loss_mlp": 1.04910851, + "epoch": 0.474028472489419, + "flos": 453237161472.0, + "grad_norm": 0.07270387927239072, + "language_loss": 0.81341946, + "learning_rate": 0.0005661638791573704, + "loss": 0.82421935, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.30859375, + "step": 2464, + "time_per_iteration": 2.712188720703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084787, + "balance_loss_mlp": 1.05453193, + "epoch": 0.47422085417468257, + "flos": 491923694592.0, + "grad_norm": 0.05714322793938323, + "language_loss": 0.87222457, + "learning_rate": 0.0005658550637222164, + "loss": 0.88307238, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.30224609, + "step": 2465, + "time_per_iteration": 2.63380765914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082927, + "balance_loss_mlp": 1.05298185, + "epoch": 0.47441323585994616, + "flos": 738537467904.0, + "grad_norm": 0.06339144108901118, + "language_loss": 0.82493532, + "learning_rate": 0.0005655462227198592, + "loss": 0.83576465, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.29907227, + "step": 2466, + "time_per_iteration": 2.910783290863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084848, + "balance_loss_mlp": 1.0547595, + "epoch": 0.4746056175452097, + "flos": 484439270400.0, + "grad_norm": 0.05460968765214119, + "language_loss": 0.83975738, + "learning_rate": 0.0005652373562702016, + "loss": 0.85060585, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.30053711, + "step": 2467, + "time_per_iteration": 2.6101505756378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081, + "balance_loss_mlp": 1.05072081, + "epoch": 0.4747979992304733, + "flos": 460814717952.0, + "grad_norm": 0.06618054462006194, + "language_loss": 0.88145614, + "learning_rate": 0.000564928464493156, + "loss": 0.89226621, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.30249023, + "step": 2468, + "time_per_iteration": 2.55812668800354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081635, + "balance_loss_mlp": 1.05247641, + "epoch": 0.4749903809157368, + "flos": 864070460928.0, + "grad_norm": 0.06741069565287812, + "language_loss": 0.81633413, + "learning_rate": 0.000564619547508645, + "loss": 0.82715052, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.29150391, + "step": 2469, + "time_per_iteration": 3.1341404914855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082878, + "balance_loss_mlp": 1.05252695, + "epoch": 0.4751827626010004, + "flos": 505296451584.0, + "grad_norm": 0.0651779420020333, + "language_loss": 0.83088791, + "learning_rate": 0.0005643106054366008, + "loss": 0.84171665, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.30297852, + "step": 2470, + "time_per_iteration": 2.610891342163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076033, + "balance_loss_mlp": 1.04666018, + "epoch": 0.47537514428626393, + "flos": 559123540992.0, + "grad_norm": 0.0714119485898344, + "language_loss": 0.79053152, + "learning_rate": 0.000564001638396965, + "loss": 0.80129188, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.29321289, + "step": 2471, + "time_per_iteration": 2.7754971981048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083604, + "balance_loss_mlp": 1.05430186, + "epoch": 0.4755675259715275, + "flos": 833907211776.0, + "grad_norm": 0.05565021284268994, + "language_loss": 0.8203246, + "learning_rate": 0.0005636926465096897, + "loss": 0.83116066, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.29248047, + "step": 2472, + "time_per_iteration": 3.028235912322998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079414, + "balance_loss_mlp": 1.05116105, + "epoch": 0.47575990765679105, + "flos": 507989629440.0, + "grad_norm": 0.06838176056824781, + "language_loss": 0.87627274, + "learning_rate": 0.0005633836298947363, + "loss": 0.8870669, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.28271484, + "step": 2473, + "time_per_iteration": 2.609142303466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.04901338, + "epoch": 0.47595228934205464, + "flos": 591566740992.0, + "grad_norm": 0.06111056533479294, + "language_loss": 0.70809621, + "learning_rate": 0.000563074588672075, + "loss": 0.71887386, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.28759766, + "step": 2474, + "time_per_iteration": 2.722593069076538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079389, + "balance_loss_mlp": 1.05080247, + "epoch": 0.4761446710273182, + "flos": 580340104704.0, + "grad_norm": 0.06296236889432077, + "language_loss": 0.85321903, + "learning_rate": 0.0005627655229616868, + "loss": 0.8640129, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.28540039, + "step": 2475, + "time_per_iteration": 2.711296558380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081174, + "balance_loss_mlp": 1.05141973, + "epoch": 0.47633705271258175, + "flos": 672597651456.0, + "grad_norm": 0.06122384611792148, + "language_loss": 0.89890903, + "learning_rate": 0.0005624564328835616, + "loss": 0.90972078, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.29736328, + "step": 2476, + "time_per_iteration": 2.796614408493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079016, + "balance_loss_mlp": 1.05069184, + "epoch": 0.47652943439784534, + "flos": 541580788224.0, + "grad_norm": 0.05962569805242902, + "language_loss": 0.84079456, + "learning_rate": 0.0005621473185576986, + "loss": 0.85158479, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.28344727, + "step": 2477, + "time_per_iteration": 2.7140815258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086205, + "balance_loss_mlp": 1.05709434, + "epoch": 0.4767218160831089, + "flos": 524563333632.0, + "grad_norm": 0.07093607725441804, + "language_loss": 0.87060082, + "learning_rate": 0.0005618381801041068, + "loss": 0.88146281, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.29077148, + "step": 2478, + "time_per_iteration": 2.596932888031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085469, + "balance_loss_mlp": 1.05638218, + "epoch": 0.47691419776837246, + "flos": 567789419520.0, + "grad_norm": 0.07057707739429774, + "language_loss": 0.83022285, + "learning_rate": 0.0005615290176428044, + "loss": 0.84107757, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.29052734, + "step": 2479, + "time_per_iteration": 2.6407430171966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108759, + "balance_loss_mlp": 1.05828834, + "epoch": 0.477106579453636, + "flos": 530659689984.0, + "grad_norm": 0.06449831218896054, + "language_loss": 0.85197705, + "learning_rate": 0.0005612198312938187, + "loss": 0.86285299, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.29296875, + "step": 2480, + "time_per_iteration": 2.7345011234283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108973, + "balance_loss_mlp": 1.06121504, + "epoch": 0.4772989611388996, + "flos": 593980521984.0, + "grad_norm": 0.060218704260060575, + "language_loss": 0.79185855, + "learning_rate": 0.0005609106211771868, + "loss": 0.80275583, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.28540039, + "step": 2481, + "time_per_iteration": 2.8754329681396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088266, + "balance_loss_mlp": 1.05908394, + "epoch": 0.4774913428241631, + "flos": 544352541696.0, + "grad_norm": 0.07327776648741448, + "language_loss": 0.89180911, + "learning_rate": 0.0005606013874129543, + "loss": 0.90269172, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.29199219, + "step": 2482, + "time_per_iteration": 2.7726404666900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090058, + "balance_loss_mlp": 1.06049454, + "epoch": 0.4776837245094267, + "flos": 539817371136.0, + "grad_norm": 0.06456332848164101, + "language_loss": 0.79976207, + "learning_rate": 0.0005602921301211768, + "loss": 0.81066263, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.29516602, + "step": 2483, + "time_per_iteration": 2.715306043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089436, + "balance_loss_mlp": 1.06132603, + "epoch": 0.4778761061946903, + "flos": 471543759360.0, + "grad_norm": 0.07998801300028703, + "language_loss": 0.82180744, + "learning_rate": 0.0005599828494219185, + "loss": 0.83270174, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.28100586, + "step": 2484, + "time_per_iteration": 2.5683019161224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086424, + "balance_loss_mlp": 1.05836201, + "epoch": 0.4780684878799538, + "flos": 725769994752.0, + "grad_norm": 0.06543459725570545, + "language_loss": 0.88914174, + "learning_rate": 0.0005596735454352527, + "loss": 0.90000606, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.28076172, + "step": 2485, + "time_per_iteration": 2.8615424633026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083119, + "balance_loss_mlp": 1.05531943, + "epoch": 0.4782608695652174, + "flos": 548665132032.0, + "grad_norm": 0.07228586186756063, + "language_loss": 0.85170126, + "learning_rate": 0.0005593642182812619, + "loss": 0.8625325, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.27856445, + "step": 2486, + "time_per_iteration": 2.6507115364074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084435, + "balance_loss_mlp": 1.0574224, + "epoch": 0.47845325125048094, + "flos": 829555333632.0, + "grad_norm": 0.06671866930909515, + "language_loss": 0.83972216, + "learning_rate": 0.0005590548680800378, + "loss": 0.85056645, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.27050781, + "step": 2487, + "time_per_iteration": 3.0963587760925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085422, + "balance_loss_mlp": 1.05755091, + "epoch": 0.4786456329357445, + "flos": 513889546752.0, + "grad_norm": 0.0627787894989405, + "language_loss": 0.7639966, + "learning_rate": 0.0005587454949516804, + "loss": 0.77485085, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.27880859, + "step": 2488, + "time_per_iteration": 2.704761266708374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085753, + "balance_loss_mlp": 1.05719018, + "epoch": 0.47883801462100806, + "flos": 564392033280.0, + "grad_norm": 0.07191070894190046, + "language_loss": 0.87996674, + "learning_rate": 0.0005584360990162993, + "loss": 0.89082426, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.28540039, + "step": 2489, + "time_per_iteration": 2.68680477142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108742, + "balance_loss_mlp": 1.05921531, + "epoch": 0.47903039630627164, + "flos": 579296862720.0, + "grad_norm": 0.052754850289178916, + "language_loss": 0.85114515, + "learning_rate": 0.0005581266803940124, + "loss": 0.86201936, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.28222656, + "step": 2490, + "time_per_iteration": 2.7187392711639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091263, + "balance_loss_mlp": 1.06322539, + "epoch": 0.47922277799153523, + "flos": 618667255296.0, + "grad_norm": 0.061347112520969346, + "language_loss": 0.87164974, + "learning_rate": 0.0005578172392049471, + "loss": 0.8825624, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.28051758, + "step": 2491, + "time_per_iteration": 2.7291457653045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089047, + "balance_loss_mlp": 1.06048441, + "epoch": 0.47941515967679876, + "flos": 639352728576.0, + "grad_norm": 0.07263845202824909, + "language_loss": 0.84244549, + "learning_rate": 0.0005575077755692386, + "loss": 0.85333598, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.28564453, + "step": 2492, + "time_per_iteration": 2.8026599884033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080078, + "balance_loss_mlp": 1.05246925, + "epoch": 0.47960754136206235, + "flos": 519561091584.0, + "grad_norm": 0.0504022340685432, + "language_loss": 0.85800493, + "learning_rate": 0.0005571982896070316, + "loss": 0.86880577, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.27612305, + "step": 2493, + "time_per_iteration": 2.655550003051758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080752, + "balance_loss_mlp": 1.05266619, + "epoch": 0.4797999230473259, + "flos": 474798551040.0, + "grad_norm": 0.11668407926682704, + "language_loss": 0.89753431, + "learning_rate": 0.0005568887814384792, + "loss": 0.90834183, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.28100586, + "step": 2494, + "time_per_iteration": 2.5966434478759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080843, + "balance_loss_mlp": 1.05337763, + "epoch": 0.47999230473258947, + "flos": 531766950912.0, + "grad_norm": 0.058142169565221447, + "language_loss": 0.87224984, + "learning_rate": 0.000556579251183743, + "loss": 0.88305831, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.27490234, + "step": 2495, + "time_per_iteration": 2.6536028385162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080101, + "balance_loss_mlp": 1.05089474, + "epoch": 0.480184686417853, + "flos": 601207460352.0, + "grad_norm": 0.06356237967295801, + "language_loss": 0.7994827, + "learning_rate": 0.0005562696989629936, + "loss": 0.81028366, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.29174805, + "step": 2496, + "time_per_iteration": 2.691530466079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082561, + "balance_loss_mlp": 1.05328333, + "epoch": 0.4803770681031166, + "flos": 527931606528.0, + "grad_norm": 0.07544069195311896, + "language_loss": 0.82662058, + "learning_rate": 0.0005559601248964095, + "loss": 0.83744615, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.29223633, + "step": 2497, + "time_per_iteration": 2.687108278274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078067, + "balance_loss_mlp": 1.04931426, + "epoch": 0.4805694497883801, + "flos": 510934500864.0, + "grad_norm": 0.07160134617119021, + "language_loss": 0.85915172, + "learning_rate": 0.0005556505291041783, + "loss": 0.86993241, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.28735352, + "step": 2498, + "time_per_iteration": 2.7002923488616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081443, + "balance_loss_mlp": 1.05264211, + "epoch": 0.4807618314736437, + "flos": 600027416064.0, + "grad_norm": 0.21407023754506424, + "language_loss": 0.84214193, + "learning_rate": 0.0005553409117064954, + "loss": 0.85295641, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.2878418, + "step": 2499, + "time_per_iteration": 2.877713203430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096264, + "balance_loss_mlp": 1.06824946, + "epoch": 0.4809542131589073, + "flos": 568700241408.0, + "grad_norm": 0.06103635462331165, + "language_loss": 0.84855151, + "learning_rate": 0.0005550312728235654, + "loss": 0.85951412, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.28051758, + "step": 2500, + "time_per_iteration": 2.716524362564087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094238, + "balance_loss_mlp": 1.06610465, + "epoch": 0.4811465948441708, + "flos": 575703037440.0, + "grad_norm": 0.07633647670380422, + "language_loss": 0.83599609, + "learning_rate": 0.0005547216125756003, + "loss": 0.84693843, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.28125, + "step": 2501, + "time_per_iteration": 2.8102688789367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097276, + "balance_loss_mlp": 1.06899917, + "epoch": 0.4813389765294344, + "flos": 823508439552.0, + "grad_norm": 0.05816521463755192, + "language_loss": 0.81801546, + "learning_rate": 0.0005544119310828211, + "loss": 0.82898819, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.28295898, + "step": 2502, + "time_per_iteration": 3.09083890914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110256, + "balance_loss_mlp": 1.08162141, + "epoch": 0.48153135821469795, + "flos": 635240959488.0, + "grad_norm": 0.07468975257849066, + "language_loss": 0.84463918, + "learning_rate": 0.0005541022284654568, + "loss": 0.85574174, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.28613281, + "step": 2503, + "time_per_iteration": 2.959812641143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105243, + "balance_loss_mlp": 1.07613182, + "epoch": 0.48172373989996153, + "flos": 503450076672.0, + "grad_norm": 0.06287004960739773, + "language_loss": 0.83878344, + "learning_rate": 0.0005537925048437446, + "loss": 0.84983587, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.29077148, + "step": 2504, + "time_per_iteration": 2.5965919494628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113897, + "balance_loss_mlp": 1.12542796, + "epoch": 0.48191612158522507, + "flos": 1531563821568.0, + "grad_norm": 0.039351692623908835, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76890433, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.13574219, + "step": 2505, + "time_per_iteration": 4.965132713317871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112409, + "balance_loss_mlp": 1.08420432, + "epoch": 0.48210850327048865, + "flos": 702078451200.0, + "grad_norm": 0.06703534425937603, + "language_loss": 0.88412756, + "learning_rate": 0.0005531729950682664, + "loss": 0.89525163, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.28198242, + "step": 2506, + "time_per_iteration": 3.032463550567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107907, + "balance_loss_mlp": 1.07936859, + "epoch": 0.4823008849557522, + "flos": 439548691968.0, + "grad_norm": 0.08139997578259908, + "language_loss": 0.84598732, + "learning_rate": 0.000552863209155015, + "loss": 0.85706639, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.28564453, + "step": 2507, + "time_per_iteration": 2.501650333404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101488, + "balance_loss_mlp": 1.07285357, + "epoch": 0.48249326664101577, + "flos": 471622334976.0, + "grad_norm": 0.06119014713123412, + "language_loss": 0.81909472, + "learning_rate": 0.0005525534027184461, + "loss": 0.83010966, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.28637695, + "step": 2508, + "time_per_iteration": 2.5787370204925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098365, + "balance_loss_mlp": 1.06942117, + "epoch": 0.48268564832627936, + "flos": 562954503168.0, + "grad_norm": 0.05313984540081721, + "language_loss": 0.82654703, + "learning_rate": 0.0005522435758788365, + "loss": 0.83753073, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.28930664, + "step": 2509, + "time_per_iteration": 2.7109761238098145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010953, + "balance_loss_mlp": 1.06730938, + "epoch": 0.4828780300115429, + "flos": 629298782208.0, + "grad_norm": 0.05877851050813853, + "language_loss": 0.80259538, + "learning_rate": 0.0005519337287564721, + "loss": 0.81354833, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.2800293, + "step": 2510, + "time_per_iteration": 2.8329310417175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109601, + "balance_loss_mlp": 1.06759048, + "epoch": 0.4830704116968065, + "flos": 631562766336.0, + "grad_norm": 0.060327319620096846, + "language_loss": 0.83688086, + "learning_rate": 0.000551623861471646, + "loss": 0.84784102, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.28417969, + "step": 2511, + "time_per_iteration": 2.7470946311950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100715, + "balance_loss_mlp": 1.08784056, + "epoch": 0.48326279338207, + "flos": 1568434503168.0, + "grad_norm": 0.03397215547055983, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79919541, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.12890625, + "step": 2512, + "time_per_iteration": 4.837340593338013 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095094, + "balance_loss_mlp": 1.06619751, + "epoch": 0.4834551750673336, + "flos": 508989201408.0, + "grad_norm": 0.059215268588021376, + "language_loss": 0.86540532, + "learning_rate": 0.0005510040668958211, + "loss": 0.87635624, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.2890625, + "step": 2513, + "time_per_iteration": 2.5706045627593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107614, + "balance_loss_mlp": 1.06364644, + "epoch": 0.48364755675259713, + "flos": 1527875453952.0, + "grad_norm": 0.0265804362292035, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78836721, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.12451172, + "step": 2514, + "time_per_iteration": 4.899883508682251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108816, + "balance_loss_mlp": 1.0589062, + "epoch": 0.4838399384378607, + "flos": 564726684672.0, + "grad_norm": 0.05909251781800444, + "language_loss": 0.83435559, + "learning_rate": 0.0005503841931138645, + "loss": 0.84523714, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.29272461, + "step": 2515, + "time_per_iteration": 2.665804386138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089973, + "balance_loss_mlp": 1.06112456, + "epoch": 0.4840323201231243, + "flos": 387479227392.0, + "grad_norm": 0.06787127022085944, + "language_loss": 0.81963372, + "learning_rate": 0.0005500742268214025, + "loss": 0.8305335, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.28833008, + "step": 2516, + "time_per_iteration": 2.5123801231384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084372, + "balance_loss_mlp": 1.05487967, + "epoch": 0.48422470180838784, + "flos": 630701406720.0, + "grad_norm": 0.05799188255481874, + "language_loss": 0.85305762, + "learning_rate": 0.0005497642410884014, + "loss": 0.86390138, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.29492188, + "step": 2517, + "time_per_iteration": 2.818969249725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107799, + "balance_loss_mlp": 1.04907012, + "epoch": 0.4844170834936514, + "flos": 498955603968.0, + "grad_norm": 0.0575391439282783, + "language_loss": 0.85093868, + "learning_rate": 0.0005494542360352085, + "loss": 0.8617186, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.28881836, + "step": 2518, + "time_per_iteration": 2.654691457748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081359, + "balance_loss_mlp": 1.05220056, + "epoch": 0.48460946517891496, + "flos": 550798106112.0, + "grad_norm": 0.06803778984218942, + "language_loss": 0.85824656, + "learning_rate": 0.0005491442117821783, + "loss": 0.86906004, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.29125977, + "step": 2519, + "time_per_iteration": 2.703547954559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081545, + "balance_loss_mlp": 1.0510273, + "epoch": 0.48480184686417854, + "flos": 529123235328.0, + "grad_norm": 0.12066852374350216, + "language_loss": 0.87487119, + "learning_rate": 0.0005488341684496732, + "loss": 0.88568664, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.3046875, + "step": 2520, + "time_per_iteration": 2.6539435386657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107692, + "balance_loss_mlp": 1.04757047, + "epoch": 0.4849942285494421, + "flos": 531630148608.0, + "grad_norm": 0.05745701253476237, + "language_loss": 0.91846752, + "learning_rate": 0.0005485241061580624, + "loss": 0.92923677, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.29296875, + "step": 2521, + "time_per_iteration": 2.775069236755371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080446, + "balance_loss_mlp": 1.04995275, + "epoch": 0.48518661023470566, + "flos": 722231424000.0, + "grad_norm": 0.05822253141450555, + "language_loss": 0.84573066, + "learning_rate": 0.0005482140250277228, + "loss": 0.8565352, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.3046875, + "step": 2522, + "time_per_iteration": 2.9740066528320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082604, + "balance_loss_mlp": 1.05306387, + "epoch": 0.4853789919199692, + "flos": 505843508736.0, + "grad_norm": 0.06368999588379491, + "language_loss": 0.87678063, + "learning_rate": 0.0005479039251790387, + "loss": 0.88760674, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.29492188, + "step": 2523, + "time_per_iteration": 2.6360013484954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086656, + "balance_loss_mlp": 1.05666256, + "epoch": 0.4855713736052328, + "flos": 660185178624.0, + "grad_norm": 0.060153636482772124, + "language_loss": 0.84925246, + "learning_rate": 0.0005475938067324014, + "loss": 0.8601191, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.29956055, + "step": 2524, + "time_per_iteration": 2.8053042888641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084682, + "balance_loss_mlp": 1.05542803, + "epoch": 0.48576375529049637, + "flos": 436727476224.0, + "grad_norm": 0.059684937302366806, + "language_loss": 0.83693206, + "learning_rate": 0.0005472836698082098, + "loss": 0.84777892, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.29199219, + "step": 2525, + "time_per_iteration": 2.513991355895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085056, + "balance_loss_mlp": 1.05587339, + "epoch": 0.4859561369757599, + "flos": 581424044544.0, + "grad_norm": 0.059033754749834536, + "language_loss": 0.84245414, + "learning_rate": 0.0005469735145268694, + "loss": 0.85330468, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.29174805, + "step": 2526, + "time_per_iteration": 2.758964776992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085929, + "balance_loss_mlp": 1.05712819, + "epoch": 0.4861485186610235, + "flos": 487723175424.0, + "grad_norm": 0.05692033512559974, + "language_loss": 0.80668163, + "learning_rate": 0.0005466633410087933, + "loss": 0.81754094, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.28808594, + "step": 2527, + "time_per_iteration": 2.7483773231506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_mlp": 1.01712215, + "epoch": 0.486340900346287, + "flos": 1556893564416.0, + "grad_norm": 0.02025241925229164, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78289819, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.11865234, + "step": 2528, + "time_per_iteration": 4.8671183586120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084286, + "balance_loss_mlp": 1.05558062, + "epoch": 0.4865332820315506, + "flos": 482760221184.0, + "grad_norm": 0.060917910127877034, + "language_loss": 0.88050807, + "learning_rate": 0.0005460429397441214, + "loss": 0.89135092, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.28662109, + "step": 2529, + "time_per_iteration": 2.5488078594207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083512, + "balance_loss_mlp": 1.05416238, + "epoch": 0.48672566371681414, + "flos": 535548450816.0, + "grad_norm": 0.06933582049293556, + "language_loss": 0.86551011, + "learning_rate": 0.0005457327122383866, + "loss": 0.87634516, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.29321289, + "step": 2530, + "time_per_iteration": 2.6199238300323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018983, + "balance_loss_mlp": 1.00711012, + "epoch": 0.4869180454020777, + "flos": 1411876901376.0, + "grad_norm": 0.01657901033031013, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75655472, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.11865234, + "step": 2531, + "time_per_iteration": 4.810813665390015 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086562, + "balance_loss_mlp": 1.05754662, + "epoch": 0.48711042708734126, + "flos": 572836741632.0, + "grad_norm": 0.0731565805542322, + "language_loss": 0.75476754, + "learning_rate": 0.0005451122040823244, + "loss": 0.76563311, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.28979492, + "step": 2532, + "time_per_iteration": 2.7834720611572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083988, + "balance_loss_mlp": 1.0543766, + "epoch": 0.48730280877260485, + "flos": 626231665152.0, + "grad_norm": 0.05844807259880667, + "language_loss": 0.7683785, + "learning_rate": 0.0005448019236728997, + "loss": 0.77921844, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.29589844, + "step": 2533, + "time_per_iteration": 2.9007680416107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108612, + "balance_loss_mlp": 1.05789077, + "epoch": 0.48749519045786843, + "flos": 512233818624.0, + "grad_norm": 0.06352012335970622, + "language_loss": 0.84519851, + "learning_rate": 0.0005444916258698255, + "loss": 0.85605973, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.2824707, + "step": 2534, + "time_per_iteration": 2.6479434967041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083901, + "balance_loss_mlp": 1.05450428, + "epoch": 0.48768757214313196, + "flos": 525149678592.0, + "grad_norm": 0.06527387606118956, + "language_loss": 0.85987055, + "learning_rate": 0.0005441813107935704, + "loss": 0.8707096, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.29370117, + "step": 2535, + "time_per_iteration": 2.657701253890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082228, + "balance_loss_mlp": 1.05359387, + "epoch": 0.48787995382839555, + "flos": 504784300032.0, + "grad_norm": 0.05960574003717953, + "language_loss": 0.85425317, + "learning_rate": 0.0005438709785646091, + "loss": 0.86507541, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.28637695, + "step": 2536, + "time_per_iteration": 2.5686872005462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081582, + "balance_loss_mlp": 1.05197084, + "epoch": 0.4880723355136591, + "flos": 574904286720.0, + "grad_norm": 0.0674154398441342, + "language_loss": 0.86857444, + "learning_rate": 0.0005435606293034234, + "loss": 0.87939024, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.29589844, + "step": 2537, + "time_per_iteration": 2.6792654991149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108176, + "balance_loss_mlp": 1.05334091, + "epoch": 0.48826471719892267, + "flos": 561172147200.0, + "grad_norm": 0.1079718501079392, + "language_loss": 0.85096419, + "learning_rate": 0.0005432502631305016, + "loss": 0.86178184, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.28417969, + "step": 2538, + "time_per_iteration": 2.6790173053741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082462, + "balance_loss_mlp": 1.05366075, + "epoch": 0.4884570988841862, + "flos": 725849980416.0, + "grad_norm": 0.270667674808598, + "language_loss": 0.83102262, + "learning_rate": 0.0005429398801663386, + "loss": 0.84184724, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.28808594, + "step": 2539, + "time_per_iteration": 2.9468812942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074127, + "balance_loss_mlp": 1.04453969, + "epoch": 0.4886494805694498, + "flos": 430794063360.0, + "grad_norm": 0.06499376102514318, + "language_loss": 0.82999051, + "learning_rate": 0.0005426294805314355, + "loss": 0.8407318, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.29541016, + "step": 2540, + "time_per_iteration": 4.142840623855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076947, + "balance_loss_mlp": 1.04685867, + "epoch": 0.4888418622547134, + "flos": 672673254912.0, + "grad_norm": 0.055782244803189183, + "language_loss": 0.80130786, + "learning_rate": 0.0005423190643463003, + "loss": 0.81207728, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.30053711, + "step": 2541, + "time_per_iteration": 2.972822427749634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107237, + "balance_loss_mlp": 1.04237723, + "epoch": 0.4890342439399769, + "flos": 541639014912.0, + "grad_norm": 0.07101662394817357, + "language_loss": 0.83088171, + "learning_rate": 0.0005420086317314473, + "loss": 0.84160542, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.29956055, + "step": 2542, + "time_per_iteration": 2.651425838470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072796, + "balance_loss_mlp": 1.04180098, + "epoch": 0.4892266256252405, + "flos": 590380904448.0, + "grad_norm": 0.06479627692425034, + "language_loss": 0.81022084, + "learning_rate": 0.0005416981828073971, + "loss": 0.82094878, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.30957031, + "step": 2543, + "time_per_iteration": 2.775273323059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111363, + "balance_loss_mlp": 1.09922981, + "epoch": 0.48941900731050403, + "flos": 1515460008960.0, + "grad_norm": 0.045109342737372694, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78228641, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.14355469, + "step": 2544, + "time_per_iteration": 4.819438219070435 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069232, + "balance_loss_mlp": 1.0383091, + "epoch": 0.4896113889957676, + "flos": 470327399424.0, + "grad_norm": 0.07868028775989613, + "language_loss": 0.85065794, + "learning_rate": 0.000541077236513819, + "loss": 0.86135024, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.30883789, + "step": 2545, + "time_per_iteration": 2.5191094875335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071718, + "balance_loss_mlp": 1.03981793, + "epoch": 0.48980377068103115, + "flos": 496310478336.0, + "grad_norm": 0.07130550478628667, + "language_loss": 0.82089663, + "learning_rate": 0.0005407667393853638, + "loss": 0.83161378, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.31884766, + "step": 2546, + "time_per_iteration": 2.617934465408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107245, + "balance_loss_mlp": 1.04043055, + "epoch": 0.48999615236629473, + "flos": 692539628544.0, + "grad_norm": 0.07826700951116618, + "language_loss": 0.8301416, + "learning_rate": 0.0005404562264298569, + "loss": 0.84086609, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.32006836, + "step": 2547, + "time_per_iteration": 2.8667449951171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072033, + "balance_loss_mlp": 1.03946531, + "epoch": 0.49018853405155827, + "flos": 541432401408.0, + "grad_norm": 0.06922547112322346, + "language_loss": 0.83528513, + "learning_rate": 0.0005401456977678498, + "loss": 0.8460055, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.32568359, + "step": 2548, + "time_per_iteration": 2.6317896842956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073611, + "balance_loss_mlp": 1.04216361, + "epoch": 0.49038091573682185, + "flos": 695304027648.0, + "grad_norm": 0.06685231557649787, + "language_loss": 0.77518535, + "learning_rate": 0.0005398351535199008, + "loss": 0.78592145, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.31420898, + "step": 2549, + "time_per_iteration": 3.0532455444335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077464, + "balance_loss_mlp": 1.046422, + "epoch": 0.49057329742208544, + "flos": 596614063104.0, + "grad_norm": 0.058433753989977806, + "language_loss": 0.83942944, + "learning_rate": 0.0005395245938065735, + "loss": 0.85020411, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.31030273, + "step": 2550, + "time_per_iteration": 2.788081169128418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082711, + "balance_loss_mlp": 1.0515734, + "epoch": 0.490765679107349, + "flos": 513154814976.0, + "grad_norm": 0.08029752654472934, + "language_loss": 0.83026552, + "learning_rate": 0.0005392140187484379, + "loss": 0.84109271, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.3112793, + "step": 2551, + "time_per_iteration": 2.619982957839966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076344, + "balance_loss_mlp": 1.04577839, + "epoch": 0.49095806079261256, + "flos": 629298782208.0, + "grad_norm": 0.05951944251734202, + "language_loss": 0.89720619, + "learning_rate": 0.0005389034284660701, + "loss": 0.90796959, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.30541992, + "step": 2552, + "time_per_iteration": 2.811321258544922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084609, + "balance_loss_mlp": 1.05349529, + "epoch": 0.4911504424778761, + "flos": 914938122240.0, + "grad_norm": 0.06813620439924545, + "language_loss": 0.82330388, + "learning_rate": 0.000538592823080052, + "loss": 0.83414996, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.31079102, + "step": 2553, + "time_per_iteration": 3.121729612350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084101, + "balance_loss_mlp": 1.05181932, + "epoch": 0.4913428241631397, + "flos": 438716445696.0, + "grad_norm": 0.10151417402847059, + "language_loss": 0.84795117, + "learning_rate": 0.000538282202710971, + "loss": 0.85879219, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.32275391, + "step": 2554, + "time_per_iteration": 2.5441434383392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089823, + "balance_loss_mlp": 1.05782735, + "epoch": 0.4915352058484032, + "flos": 635806955520.0, + "grad_norm": 0.08391436989004458, + "language_loss": 0.81955588, + "learning_rate": 0.000537971567479421, + "loss": 0.83045411, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.31982422, + "step": 2555, + "time_per_iteration": 2.742913246154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088539, + "balance_loss_mlp": 1.05578029, + "epoch": 0.4917275875336668, + "flos": 504272148480.0, + "grad_norm": 0.0678126955236607, + "language_loss": 0.87735516, + "learning_rate": 0.0005376609175060011, + "loss": 0.88824058, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.32763672, + "step": 2556, + "time_per_iteration": 2.5964388847351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088641, + "balance_loss_mlp": 1.05774164, + "epoch": 0.49191996921893033, + "flos": 654251765760.0, + "grad_norm": 0.06456480219532172, + "language_loss": 0.80659723, + "learning_rate": 0.0005373502529113162, + "loss": 0.81748366, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.30883789, + "step": 2557, + "time_per_iteration": 2.8043599128723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092017, + "balance_loss_mlp": 1.06009305, + "epoch": 0.4921123509041939, + "flos": 492101194752.0, + "grad_norm": 0.08818279105065703, + "language_loss": 0.81143486, + "learning_rate": 0.0005370395738159773, + "loss": 0.82235509, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.3190918, + "step": 2558, + "time_per_iteration": 2.6536951065063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086446, + "balance_loss_mlp": 1.05516589, + "epoch": 0.4923047325894575, + "flos": 545907935232.0, + "grad_norm": 0.0699028851556838, + "language_loss": 0.83194804, + "learning_rate": 0.0005367288803406003, + "loss": 0.84281248, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.3125, + "step": 2559, + "time_per_iteration": 2.6608238220214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092322, + "balance_loss_mlp": 1.06075501, + "epoch": 0.49249711427472104, + "flos": 596195043840.0, + "grad_norm": 0.05624800088650225, + "language_loss": 0.81485915, + "learning_rate": 0.0005364181726058073, + "loss": 0.82578236, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.31542969, + "step": 2560, + "time_per_iteration": 2.7245399951934814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108461, + "balance_loss_mlp": 1.05354452, + "epoch": 0.4926894959599846, + "flos": 497580682752.0, + "grad_norm": 0.0657433103973406, + "language_loss": 0.82255721, + "learning_rate": 0.0005361074507322261, + "loss": 0.83340329, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.31030273, + "step": 2561, + "time_per_iteration": 2.632309913635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085089, + "balance_loss_mlp": 1.05359399, + "epoch": 0.49288187764524816, + "flos": 535868545536.0, + "grad_norm": 0.06588348626271129, + "language_loss": 0.81683809, + "learning_rate": 0.000535796714840489, + "loss": 0.82768893, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.31494141, + "step": 2562, + "time_per_iteration": 2.6455063819885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107827, + "balance_loss_mlp": 1.04686987, + "epoch": 0.49307425933051174, + "flos": 641267504640.0, + "grad_norm": 0.07506734855649709, + "language_loss": 0.84067267, + "learning_rate": 0.0005354859650512348, + "loss": 0.85145533, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.3137207, + "step": 2563, + "time_per_iteration": 2.8065779209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075102, + "balance_loss_mlp": 1.04396451, + "epoch": 0.4932666410157753, + "flos": 516000761856.0, + "grad_norm": 0.06295276436461052, + "language_loss": 0.87103295, + "learning_rate": 0.0005351752014851074, + "loss": 0.88178396, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.31103516, + "step": 2564, + "time_per_iteration": 2.573575019836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078018, + "balance_loss_mlp": 1.04654717, + "epoch": 0.49345902270103886, + "flos": 601217634816.0, + "grad_norm": 0.06464744293940616, + "language_loss": 0.83104938, + "learning_rate": 0.0005348644242627553, + "loss": 0.84182954, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.31445312, + "step": 2565, + "time_per_iteration": 2.730455160140991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059118, + "balance_loss_mlp": 1.0458622, + "epoch": 0.49365140438630245, + "flos": 1492849585152.0, + "grad_norm": 0.030733727476311833, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76345742, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.1328125, + "step": 2566, + "time_per_iteration": 4.939255237579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083828, + "balance_loss_mlp": 1.05290508, + "epoch": 0.493843786071566, + "flos": 629303164416.0, + "grad_norm": 0.06048394989907295, + "language_loss": 0.81127739, + "learning_rate": 0.0005342428293320013, + "loss": 0.82211566, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.30908203, + "step": 2567, + "time_per_iteration": 2.7613086700439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079847, + "balance_loss_mlp": 1.04899621, + "epoch": 0.49403616775682957, + "flos": 617283569664.0, + "grad_norm": 0.0745931351859795, + "language_loss": 0.83762527, + "learning_rate": 0.0005339320118649238, + "loss": 0.84842372, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.30810547, + "step": 2568, + "time_per_iteration": 2.6934940814971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077697, + "balance_loss_mlp": 1.04763281, + "epoch": 0.4942285494420931, + "flos": 577357355520.0, + "grad_norm": 0.16404827309636982, + "language_loss": 0.86383307, + "learning_rate": 0.000533621181224271, + "loss": 0.87461007, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.30053711, + "step": 2569, + "time_per_iteration": 2.7757997512817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078612, + "balance_loss_mlp": 1.04737914, + "epoch": 0.4944209311273567, + "flos": 629899683840.0, + "grad_norm": 0.06859593656518678, + "language_loss": 0.81795698, + "learning_rate": 0.0005333103375307182, + "loss": 0.8287431, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.31201172, + "step": 2570, + "time_per_iteration": 2.8319950103759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074501, + "balance_loss_mlp": 1.043221, + "epoch": 0.4946133128126202, + "flos": 587337108480.0, + "grad_norm": 0.05293986738306163, + "language_loss": 0.86142224, + "learning_rate": 0.0005329994809049451, + "loss": 0.87216723, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.3125, + "step": 2571, + "time_per_iteration": 2.7592415809631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075993, + "balance_loss_mlp": 1.04540396, + "epoch": 0.4948056944978838, + "flos": 583437745152.0, + "grad_norm": 0.05076322771290774, + "language_loss": 0.87883997, + "learning_rate": 0.0005326886114676375, + "loss": 0.88959992, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.30541992, + "step": 2572, + "time_per_iteration": 2.9501779079437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077876, + "balance_loss_mlp": 1.0463568, + "epoch": 0.49499807618314734, + "flos": 481583149056.0, + "grad_norm": 0.06323365720535751, + "language_loss": 0.87792003, + "learning_rate": 0.0005323777293394854, + "loss": 0.8886987, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.31494141, + "step": 2573, + "time_per_iteration": 2.55361008644104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107249, + "balance_loss_mlp": 1.03975475, + "epoch": 0.4951904578684109, + "flos": 518714288640.0, + "grad_norm": 0.05535210432037286, + "language_loss": 0.81776071, + "learning_rate": 0.000532066834641184, + "loss": 0.82848555, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.32739258, + "step": 2574, + "time_per_iteration": 2.6631722450256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070737, + "balance_loss_mlp": 1.03900313, + "epoch": 0.4953828395536745, + "flos": 535238530560.0, + "grad_norm": 0.06817735062049093, + "language_loss": 0.8516283, + "learning_rate": 0.0005317559274934334, + "loss": 0.86233568, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.31713867, + "step": 2575, + "time_per_iteration": 2.7345352172851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072086, + "balance_loss_mlp": 1.03894639, + "epoch": 0.49557522123893805, + "flos": 528305545728.0, + "grad_norm": 0.05802348124776455, + "language_loss": 0.80394173, + "learning_rate": 0.0005314450080169382, + "loss": 0.81466264, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.33154297, + "step": 2576, + "time_per_iteration": 2.6343159675598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076196, + "balance_loss_mlp": 1.04391456, + "epoch": 0.49576760292420163, + "flos": 427780790784.0, + "grad_norm": 0.07974947058861337, + "language_loss": 0.80607754, + "learning_rate": 0.0005311340763324083, + "loss": 0.81683946, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.32275391, + "step": 2577, + "time_per_iteration": 2.557796001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078498, + "balance_loss_mlp": 1.04557252, + "epoch": 0.49595998460946517, + "flos": 564968203776.0, + "grad_norm": 0.05295897633494548, + "language_loss": 0.82240456, + "learning_rate": 0.0005308231325605578, + "loss": 0.83318955, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.32910156, + "step": 2578, + "time_per_iteration": 2.6799750328063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072444, + "balance_loss_mlp": 1.03992367, + "epoch": 0.49615236629472875, + "flos": 702161409024.0, + "grad_norm": 0.05054804003557779, + "language_loss": 0.7645728, + "learning_rate": 0.0005305121768221061, + "loss": 0.77529716, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.32519531, + "step": 2579, + "time_per_iteration": 3.074568748474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025075, + "balance_loss_mlp": 1.01057923, + "epoch": 0.4963447479799923, + "flos": 1440896573952.0, + "grad_norm": 0.02258142627415349, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76063395, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.14453125, + "step": 2580, + "time_per_iteration": 4.807044267654419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079853, + "balance_loss_mlp": 1.04749966, + "epoch": 0.49653712966525587, + "flos": 537370094592.0, + "grad_norm": 0.06889886772880317, + "language_loss": 0.9145242, + "learning_rate": 0.0005298902299282984, + "loss": 0.92532271, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.32348633, + "step": 2581, + "time_per_iteration": 2.6145668029785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077544, + "balance_loss_mlp": 1.04561996, + "epoch": 0.4967295113505194, + "flos": 607002660864.0, + "grad_norm": 0.06407878407439609, + "language_loss": 0.84137404, + "learning_rate": 0.0005295792390144033, + "loss": 0.85214949, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.3190918, + "step": 2582, + "time_per_iteration": 2.71272873878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083171, + "balance_loss_mlp": 1.05103219, + "epoch": 0.496921893035783, + "flos": 474340243968.0, + "grad_norm": 0.07436197165654145, + "language_loss": 0.83241105, + "learning_rate": 0.0005292682366168294, + "loss": 0.84324276, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.32128906, + "step": 2583, + "time_per_iteration": 2.5284125804901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082483, + "balance_loss_mlp": 1.05079746, + "epoch": 0.4971142747210466, + "flos": 597180059136.0, + "grad_norm": 0.07965760723765093, + "language_loss": 0.79750967, + "learning_rate": 0.0005289572228563181, + "loss": 0.80833459, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.31665039, + "step": 2584, + "time_per_iteration": 2.802370548248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080479, + "balance_loss_mlp": 1.04862666, + "epoch": 0.4973066564063101, + "flos": 599321797632.0, + "grad_norm": 0.06536047089469768, + "language_loss": 0.83144403, + "learning_rate": 0.000528646197853616, + "loss": 0.84224886, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.31835938, + "step": 2585, + "time_per_iteration": 2.7075467109680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076886, + "balance_loss_mlp": 1.04748917, + "epoch": 0.4974990380915737, + "flos": 649152009216.0, + "grad_norm": 0.11136041462628715, + "language_loss": 0.85364115, + "learning_rate": 0.0005283351617294735, + "loss": 0.86440998, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.29370117, + "step": 2586, + "time_per_iteration": 2.940826892852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027614, + "balance_loss_mlp": 1.0143584, + "epoch": 0.49769141977683723, + "flos": 1528490912256.0, + "grad_norm": 0.01813039431029953, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.7766428, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.1328125, + "step": 2587, + "time_per_iteration": 4.996971368789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082207, + "balance_loss_mlp": 1.05278599, + "epoch": 0.4978838014621008, + "flos": 536114446848.0, + "grad_norm": 0.05663819997496981, + "language_loss": 0.86729956, + "learning_rate": 0.0005277130565998916, + "loss": 0.87812161, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.29394531, + "step": 2588, + "time_per_iteration": 2.7356040477752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083607, + "balance_loss_mlp": 1.05401921, + "epoch": 0.49807618314736435, + "flos": 539335742976.0, + "grad_norm": 0.07264241635107661, + "language_loss": 0.82111955, + "learning_rate": 0.0005274019878359748, + "loss": 0.83195567, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.29541016, + "step": 2589, + "time_per_iteration": 2.7199792861938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081352, + "balance_loss_mlp": 1.05102515, + "epoch": 0.49826856483262794, + "flos": 542215185408.0, + "grad_norm": 0.07554474334702437, + "language_loss": 0.86675328, + "learning_rate": 0.0005270909084336628, + "loss": 0.87756681, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.30297852, + "step": 2590, + "time_per_iteration": 2.6305181980133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080877, + "balance_loss_mlp": 1.05045462, + "epoch": 0.4984609465178915, + "flos": 522062212608.0, + "grad_norm": 0.06751539177219479, + "language_loss": 0.89032745, + "learning_rate": 0.0005267798185137276, + "loss": 0.90113628, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.30371094, + "step": 2591, + "time_per_iteration": 2.608088254928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088235, + "balance_loss_mlp": 1.05743146, + "epoch": 0.49865332820315506, + "flos": 574255332864.0, + "grad_norm": 0.0633807963563003, + "language_loss": 0.8924402, + "learning_rate": 0.0005264687181969444, + "loss": 0.90332258, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.30786133, + "step": 2592, + "time_per_iteration": 2.729546308517456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088496, + "balance_loss_mlp": 1.05931377, + "epoch": 0.49884570988841864, + "flos": 1013207657472.0, + "grad_norm": 0.06112732681279078, + "language_loss": 0.75084651, + "learning_rate": 0.0005261576076040937, + "loss": 0.76173151, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.29199219, + "step": 2593, + "time_per_iteration": 3.265289783477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082947, + "balance_loss_mlp": 1.05281067, + "epoch": 0.4990380915736822, + "flos": 559315597824.0, + "grad_norm": 0.0783599565062882, + "language_loss": 0.84088343, + "learning_rate": 0.0005258464868559591, + "loss": 0.85171294, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.30078125, + "step": 2594, + "time_per_iteration": 2.657191514968872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080029, + "balance_loss_mlp": 1.04991674, + "epoch": 0.49923047325894576, + "flos": 498708292608.0, + "grad_norm": 0.0699675322535813, + "language_loss": 0.88836402, + "learning_rate": 0.0005255353560733284, + "loss": 0.89916426, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.30102539, + "step": 2595, + "time_per_iteration": 2.570439100265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010588, + "balance_loss_mlp": 1.04640186, + "epoch": 0.4994228549442093, + "flos": 1495851273216.0, + "grad_norm": 0.029272008197333242, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76637447, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.12353516, + "step": 2596, + "time_per_iteration": 4.808587074279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084167, + "balance_loss_mlp": 1.05476975, + "epoch": 0.4996152366294729, + "flos": 557090901504.0, + "grad_norm": 0.052965599041123274, + "language_loss": 0.83342099, + "learning_rate": 0.0005249130648877492, + "loss": 0.84426272, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.29370117, + "step": 2597, + "time_per_iteration": 2.7453384399414062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010849, + "balance_loss_mlp": 1.05524063, + "epoch": 0.4998076183147364, + "flos": 415372700160.0, + "grad_norm": 0.05960347084431116, + "language_loss": 0.84714389, + "learning_rate": 0.0005246019047263953, + "loss": 0.85799289, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.29614258, + "step": 2598, + "time_per_iteration": 2.488004684448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091385, + "balance_loss_mlp": 1.06220269, + "epoch": 0.5, + "flos": 467107513344.0, + "grad_norm": 0.06961248878544336, + "language_loss": 0.8223601, + "learning_rate": 0.0005242907350137353, + "loss": 0.83327389, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.29174805, + "step": 2599, + "time_per_iteration": 2.550495147705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092431, + "balance_loss_mlp": 1.06422567, + "epoch": 0.5001923816852636, + "flos": 482460475392.0, + "grad_norm": 0.06813860338073652, + "language_loss": 0.78928339, + "learning_rate": 0.0005239795558705754, + "loss": 0.80020773, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.28198242, + "step": 2600, + "time_per_iteration": 2.656519889831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094846, + "balance_loss_mlp": 1.06492448, + "epoch": 0.5003847633705272, + "flos": 533534750208.0, + "grad_norm": 0.05508549334218052, + "language_loss": 0.89073658, + "learning_rate": 0.0005236683674177264, + "loss": 0.90168506, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.29907227, + "step": 2601, + "time_per_iteration": 2.63960337638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098261, + "balance_loss_mlp": 1.06886423, + "epoch": 0.5005771450557907, + "flos": 737473876992.0, + "grad_norm": 0.06683201790232274, + "language_loss": 0.82384604, + "learning_rate": 0.0005233571697760021, + "loss": 0.83482862, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.29345703, + "step": 2602, + "time_per_iteration": 2.859165668487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097705, + "balance_loss_mlp": 1.06814075, + "epoch": 0.5007695267410542, + "flos": 778646974464.0, + "grad_norm": 0.06216601268510387, + "language_loss": 0.83124363, + "learning_rate": 0.0005230459630662203, + "loss": 0.84222066, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.29541016, + "step": 2603, + "time_per_iteration": 2.9592032432556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093592, + "balance_loss_mlp": 1.06479144, + "epoch": 0.5009619084263178, + "flos": 623192251392.0, + "grad_norm": 0.0707725537041266, + "language_loss": 0.81070089, + "learning_rate": 0.0005227347474092022, + "loss": 0.8216368, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.2878418, + "step": 2604, + "time_per_iteration": 2.7389962673187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094636, + "balance_loss_mlp": 1.06545365, + "epoch": 0.5011542901115814, + "flos": 530812459008.0, + "grad_norm": 0.05232832672790962, + "language_loss": 0.83514917, + "learning_rate": 0.0005224235229257724, + "loss": 0.84609556, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.29174805, + "step": 2605, + "time_per_iteration": 2.687992811203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088826, + "balance_loss_mlp": 1.05914283, + "epoch": 0.5013466717968449, + "flos": 527262303744.0, + "grad_norm": 0.056206575952308185, + "language_loss": 0.8630116, + "learning_rate": 0.0005221122897367589, + "loss": 0.87389988, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.29614258, + "step": 2606, + "time_per_iteration": 2.787410259246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088135, + "balance_loss_mlp": 1.05861855, + "epoch": 0.5015390534821085, + "flos": 565750987776.0, + "grad_norm": 0.07695466326694751, + "language_loss": 0.81035262, + "learning_rate": 0.0005218010479629932, + "loss": 0.82123399, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.29467773, + "step": 2607, + "time_per_iteration": 2.6562912464141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091646, + "balance_loss_mlp": 1.06177175, + "epoch": 0.5017314351673721, + "flos": 566430465024.0, + "grad_norm": 0.05799380231795743, + "language_loss": 0.81869501, + "learning_rate": 0.0005214897977253102, + "loss": 0.82961148, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.29833984, + "step": 2608, + "time_per_iteration": 2.6560218334198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084679, + "balance_loss_mlp": 1.05454254, + "epoch": 0.5019238168526357, + "flos": 522018542592.0, + "grad_norm": 0.06343008203006618, + "language_loss": 0.84223098, + "learning_rate": 0.0005211785391445473, + "loss": 0.85307777, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.30102539, + "step": 2609, + "time_per_iteration": 2.726686954498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081377, + "balance_loss_mlp": 1.05202734, + "epoch": 0.5021161985378992, + "flos": 641135084544.0, + "grad_norm": 0.06012661278609564, + "language_loss": 0.79186547, + "learning_rate": 0.0005208672723415467, + "loss": 0.80267924, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.29345703, + "step": 2610, + "time_per_iteration": 2.7944774627685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108238, + "balance_loss_mlp": 1.05212474, + "epoch": 0.5023085802231627, + "flos": 591000744960.0, + "grad_norm": 0.06559501481836318, + "language_loss": 0.79065204, + "learning_rate": 0.0005205559974371525, + "loss": 0.80147582, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.30224609, + "step": 2611, + "time_per_iteration": 2.7519257068634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081519, + "balance_loss_mlp": 1.05150175, + "epoch": 0.5025009619084263, + "flos": 472134486528.0, + "grad_norm": 0.05612255210767107, + "language_loss": 0.82192892, + "learning_rate": 0.0005202447145522123, + "loss": 0.83274412, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.29980469, + "step": 2612, + "time_per_iteration": 2.6770236492156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079077, + "balance_loss_mlp": 1.04965591, + "epoch": 0.5026933435936899, + "flos": 454906036224.0, + "grad_norm": 0.05250196134528315, + "language_loss": 0.79193181, + "learning_rate": 0.0005199334238075769, + "loss": 0.80272257, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.29370117, + "step": 2613, + "time_per_iteration": 2.5337562561035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107987, + "balance_loss_mlp": 1.04942441, + "epoch": 0.5028857252789535, + "flos": 491504675328.0, + "grad_norm": 0.0529792440436354, + "language_loss": 0.9204368, + "learning_rate": 0.0005196221253241, + "loss": 0.93123555, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.30419922, + "step": 2614, + "time_per_iteration": 2.564143657684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073952, + "balance_loss_mlp": 1.04276693, + "epoch": 0.503078106964217, + "flos": 625280145408.0, + "grad_norm": 0.06195019445138367, + "language_loss": 0.82918042, + "learning_rate": 0.0005193108192226383, + "loss": 0.83991992, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.31152344, + "step": 2615, + "time_per_iteration": 2.757087230682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080642, + "balance_loss_mlp": 1.04990983, + "epoch": 0.5032704886494805, + "flos": 578774536704.0, + "grad_norm": 0.05317989185447873, + "language_loss": 0.8697142, + "learning_rate": 0.000518999505624052, + "loss": 0.88052064, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.30712891, + "step": 2616, + "time_per_iteration": 2.7251224517822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078998, + "balance_loss_mlp": 1.04759884, + "epoch": 0.5034628703347441, + "flos": 471481150464.0, + "grad_norm": 0.059314577611761586, + "language_loss": 0.83379316, + "learning_rate": 0.000518688184649203, + "loss": 0.84458327, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.3137207, + "step": 2617, + "time_per_iteration": 2.809063673019409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107933, + "balance_loss_mlp": 1.04890776, + "epoch": 0.5036552520200077, + "flos": 489594281472.0, + "grad_norm": 0.08232681701976922, + "language_loss": 0.83759677, + "learning_rate": 0.0005183768564189577, + "loss": 0.8483901, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.30395508, + "step": 2618, + "time_per_iteration": 2.5442681312561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108616, + "balance_loss_mlp": 1.05502236, + "epoch": 0.5038476337052713, + "flos": 493991239680.0, + "grad_norm": 0.10233936422342303, + "language_loss": 0.81248713, + "learning_rate": 0.0005180655210541838, + "loss": 0.8233487, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.31103516, + "step": 2619, + "time_per_iteration": 2.5986533164978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107923, + "balance_loss_mlp": 1.04976153, + "epoch": 0.5040400153905348, + "flos": 600321369600.0, + "grad_norm": 0.10286286455085811, + "language_loss": 0.83096433, + "learning_rate": 0.0005177541786757527, + "loss": 0.84175664, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.29443359, + "step": 2620, + "time_per_iteration": 2.7542781829833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080594, + "balance_loss_mlp": 1.04971933, + "epoch": 0.5042323970757984, + "flos": 811178924544.0, + "grad_norm": 0.062363268760676084, + "language_loss": 0.82867718, + "learning_rate": 0.000517442829404538, + "loss": 0.83948314, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.30834961, + "step": 2621, + "time_per_iteration": 2.9758973121643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080161, + "balance_loss_mlp": 1.05000091, + "epoch": 0.504424778761062, + "flos": 626985335808.0, + "grad_norm": 0.06818258917584033, + "language_loss": 0.8721652, + "learning_rate": 0.0005171314733614166, + "loss": 0.88296676, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.30102539, + "step": 2622, + "time_per_iteration": 2.8933780193328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082583, + "balance_loss_mlp": 1.05235183, + "epoch": 0.5046171604463255, + "flos": 515651553792.0, + "grad_norm": 0.06917321427090362, + "language_loss": 0.78315443, + "learning_rate": 0.0005168201106672671, + "loss": 0.79398024, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.30200195, + "step": 2623, + "time_per_iteration": 2.763855457305908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081188, + "balance_loss_mlp": 1.05093241, + "epoch": 0.504809542131589, + "flos": 527576606208.0, + "grad_norm": 0.06294733427077812, + "language_loss": 0.84776348, + "learning_rate": 0.0005165087414429717, + "loss": 0.85857534, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.30200195, + "step": 2624, + "time_per_iteration": 2.6454148292541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080505, + "balance_loss_mlp": 1.04967785, + "epoch": 0.5050019238168526, + "flos": 553855048704.0, + "grad_norm": 0.07820570667172376, + "language_loss": 0.83597136, + "learning_rate": 0.0005161973658094144, + "loss": 0.84677643, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.30810547, + "step": 2625, + "time_per_iteration": 2.630192756652832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075312, + "balance_loss_mlp": 1.04562938, + "epoch": 0.5051943055021162, + "flos": 574486677504.0, + "grad_norm": 0.10754310805258371, + "language_loss": 0.8215518, + "learning_rate": 0.000515885983887482, + "loss": 0.83230495, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.29614258, + "step": 2626, + "time_per_iteration": 2.762484312057495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082022, + "balance_loss_mlp": 1.05179107, + "epoch": 0.5053866871873798, + "flos": 496438516224.0, + "grad_norm": 0.060931372363222436, + "language_loss": 0.84606075, + "learning_rate": 0.0005155745957980636, + "loss": 0.85688096, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.30175781, + "step": 2627, + "time_per_iteration": 2.597625494003296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075866, + "balance_loss_mlp": 1.04513431, + "epoch": 0.5055790688726434, + "flos": 501963084288.0, + "grad_norm": 0.060140239439456865, + "language_loss": 0.8829447, + "learning_rate": 0.000515263201662051, + "loss": 0.89370334, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.30688477, + "step": 2628, + "time_per_iteration": 2.676429510116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081664, + "balance_loss_mlp": 1.05162382, + "epoch": 0.5057714505579068, + "flos": 844844276736.0, + "grad_norm": 0.05201747216110034, + "language_loss": 0.82525623, + "learning_rate": 0.0005149518016003378, + "loss": 0.83607286, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.30004883, + "step": 2629, + "time_per_iteration": 3.1674108505249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078833, + "balance_loss_mlp": 1.04874492, + "epoch": 0.5059638322431704, + "flos": 497580682752.0, + "grad_norm": 0.12452297981638945, + "language_loss": 0.82290918, + "learning_rate": 0.0005146403957338206, + "loss": 0.83369756, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.30029297, + "step": 2630, + "time_per_iteration": 2.574908494949341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075266, + "balance_loss_mlp": 1.04415226, + "epoch": 0.506156213928434, + "flos": 617526498816.0, + "grad_norm": 0.054026792513587725, + "language_loss": 0.81795335, + "learning_rate": 0.0005143289841833975, + "loss": 0.82870597, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.31079102, + "step": 2631, + "time_per_iteration": 2.8753445148468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071822, + "balance_loss_mlp": 1.04044628, + "epoch": 0.5063485956136976, + "flos": 424624923648.0, + "grad_norm": 0.07665080268010696, + "language_loss": 0.82169271, + "learning_rate": 0.0005140175670699696, + "loss": 0.83241099, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.31347656, + "step": 2632, + "time_per_iteration": 2.606656551361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070677, + "balance_loss_mlp": 1.03989697, + "epoch": 0.5065409772989612, + "flos": 569641586688.0, + "grad_norm": 0.05365826465054309, + "language_loss": 0.82773447, + "learning_rate": 0.0005137061445144395, + "loss": 0.83844125, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.30737305, + "step": 2633, + "time_per_iteration": 2.908146619796753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107465, + "balance_loss_mlp": 1.0429641, + "epoch": 0.5067333589842247, + "flos": 628510205952.0, + "grad_norm": 0.06908817272508659, + "language_loss": 0.87031686, + "learning_rate": 0.000513394716637712, + "loss": 0.88106334, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.31665039, + "step": 2634, + "time_per_iteration": 2.804591417312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049781, + "balance_loss_mlp": 1.03547585, + "epoch": 0.5069257406694883, + "flos": 1447062741504.0, + "grad_norm": 0.027149993512400487, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80241489, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.14257812, + "step": 2635, + "time_per_iteration": 4.903238296508789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071488, + "balance_loss_mlp": 1.03977799, + "epoch": 0.5071181223547518, + "flos": 638530656768.0, + "grad_norm": 0.05829667092367474, + "language_loss": 0.80886006, + "learning_rate": 0.0005127718454042958, + "loss": 0.81957495, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.31689453, + "step": 2636, + "time_per_iteration": 2.81962513923645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076357, + "balance_loss_mlp": 1.04467094, + "epoch": 0.5073105040400154, + "flos": 713239658496.0, + "grad_norm": 0.06782185148260642, + "language_loss": 0.84239292, + "learning_rate": 0.0005124604022894269, + "loss": 0.85315657, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.31665039, + "step": 2637, + "time_per_iteration": 2.933143377304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023059, + "balance_loss_mlp": 1.00932586, + "epoch": 0.5075028857252789, + "flos": 1435658605056.0, + "grad_norm": 0.016037159370544805, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78211284, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.13769531, + "step": 2638, + "time_per_iteration": 4.81339168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080028, + "balance_loss_mlp": 1.04786575, + "epoch": 0.5076952674105425, + "flos": 570857946624.0, + "grad_norm": 0.058900205072543066, + "language_loss": 0.83262694, + "learning_rate": 0.0005118375016679325, + "loss": 0.84342724, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.3215332, + "step": 2639, + "time_per_iteration": 2.7476773262023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076278, + "balance_loss_mlp": 1.04490256, + "epoch": 0.5078876490958061, + "flos": 516463451136.0, + "grad_norm": 0.08436499818571505, + "language_loss": 0.80410182, + "learning_rate": 0.0005115260444031382, + "loss": 0.81486464, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.31347656, + "step": 2640, + "time_per_iteration": 2.579087734222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016776, + "balance_loss_mlp": 1.00361574, + "epoch": 0.5080800307810697, + "flos": 1583378620416.0, + "grad_norm": 0.010326775178219767, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79748595, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.13183594, + "step": 2641, + "time_per_iteration": 4.939114809036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077717, + "balance_loss_mlp": 1.04665077, + "epoch": 0.5082724124663333, + "flos": 484965978624.0, + "grad_norm": 0.06392423646026814, + "language_loss": 0.86441147, + "learning_rate": 0.0005109031165700483, + "loss": 0.87518859, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.31030273, + "step": 2642, + "time_per_iteration": 2.572248935699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078927, + "balance_loss_mlp": 1.04809904, + "epoch": 0.5084647941515967, + "flos": 681928450560.0, + "grad_norm": 0.08514760687851525, + "language_loss": 0.83290648, + "learning_rate": 0.0005105916462435945, + "loss": 0.84369576, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.30786133, + "step": 2643, + "time_per_iteration": 2.832653284072876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081246, + "balance_loss_mlp": 1.05089569, + "epoch": 0.5086571758368603, + "flos": 548468692992.0, + "grad_norm": 0.05584396132467612, + "language_loss": 0.85012162, + "learning_rate": 0.0005102801718050989, + "loss": 0.86093414, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.30322266, + "step": 2644, + "time_per_iteration": 2.6693568229675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077361, + "balance_loss_mlp": 1.04755831, + "epoch": 0.5088495575221239, + "flos": 563751843840.0, + "grad_norm": 0.07396400679887168, + "language_loss": 0.89154196, + "learning_rate": 0.0005099686933754867, + "loss": 0.9023155, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.29785156, + "step": 2645, + "time_per_iteration": 2.688992977142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080157, + "balance_loss_mlp": 1.05016422, + "epoch": 0.5090419392073875, + "flos": 551132757504.0, + "grad_norm": 0.06521042739972126, + "language_loss": 0.84349567, + "learning_rate": 0.0005096572110756845, + "loss": 0.85429722, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.29956055, + "step": 2646, + "time_per_iteration": 2.694018840789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080367, + "balance_loss_mlp": 1.05065989, + "epoch": 0.509234320892651, + "flos": 567504230400.0, + "grad_norm": 0.049776737751643374, + "language_loss": 0.85623205, + "learning_rate": 0.0005093457250266205, + "loss": 0.86703575, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.296875, + "step": 2647, + "time_per_iteration": 2.69240665435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085077, + "balance_loss_mlp": 1.05527472, + "epoch": 0.5094267025779146, + "flos": 582339248640.0, + "grad_norm": 0.0639130152108818, + "language_loss": 0.83146644, + "learning_rate": 0.000509034235349224, + "loss": 0.84231722, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.29760742, + "step": 2648, + "time_per_iteration": 2.69409441947937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084417, + "balance_loss_mlp": 1.05499578, + "epoch": 0.5096190842631781, + "flos": 591704953344.0, + "grad_norm": 0.07990516858852505, + "language_loss": 0.81340408, + "learning_rate": 0.0005087227421644266, + "loss": 0.82424831, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.29345703, + "step": 2649, + "time_per_iteration": 2.7338664531707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087876, + "balance_loss_mlp": 1.05795491, + "epoch": 0.5098114659484417, + "flos": 513307584000.0, + "grad_norm": 0.06481094949829869, + "language_loss": 0.86482179, + "learning_rate": 0.0005084112455931602, + "loss": 0.87570059, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.29907227, + "step": 2650, + "time_per_iteration": 2.5772013664245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085843, + "balance_loss_mlp": 1.05561161, + "epoch": 0.5100038476337053, + "flos": 484389808128.0, + "grad_norm": 0.060404574220966636, + "language_loss": 0.84966755, + "learning_rate": 0.0005080997457563586, + "loss": 0.86052603, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.30200195, + "step": 2651, + "time_per_iteration": 2.5539023876190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089212, + "balance_loss_mlp": 1.05895662, + "epoch": 0.5101962293189688, + "flos": 461366157312.0, + "grad_norm": 0.06895787175374923, + "language_loss": 0.79026747, + "learning_rate": 0.0005077882427749569, + "loss": 0.80115962, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.30224609, + "step": 2652, + "time_per_iteration": 2.5036137104034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093546, + "balance_loss_mlp": 1.06367242, + "epoch": 0.5103886110042324, + "flos": 586760937984.0, + "grad_norm": 0.06232251007114316, + "language_loss": 0.84676695, + "learning_rate": 0.0005074767367698913, + "loss": 0.85770237, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.29833984, + "step": 2653, + "time_per_iteration": 2.6879539489746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088747, + "balance_loss_mlp": 1.05875421, + "epoch": 0.510580992689496, + "flos": 844906885632.0, + "grad_norm": 0.07002300864013745, + "language_loss": 0.83262461, + "learning_rate": 0.0005071652278620988, + "loss": 0.84351206, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.29956055, + "step": 2654, + "time_per_iteration": 3.048330307006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093234, + "balance_loss_mlp": 1.06369376, + "epoch": 0.5107733743747596, + "flos": 658328629248.0, + "grad_norm": 0.077240918193036, + "language_loss": 0.83515394, + "learning_rate": 0.0005068537161725186, + "loss": 0.84608626, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.29492188, + "step": 2655, + "time_per_iteration": 2.7864887714385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088669, + "balance_loss_mlp": 1.05941546, + "epoch": 0.510965756060023, + "flos": 701426677248.0, + "grad_norm": 0.06396168128091786, + "language_loss": 0.84455109, + "learning_rate": 0.0005065422018220893, + "loss": 0.85543782, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.29223633, + "step": 2656, + "time_per_iteration": 2.8869853019714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095041, + "balance_loss_mlp": 1.0650475, + "epoch": 0.5111581377452866, + "flos": 559430489088.0, + "grad_norm": 0.0709037558233959, + "language_loss": 0.7998327, + "learning_rate": 0.0005062306849317521, + "loss": 0.81078309, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.29956055, + "step": 2657, + "time_per_iteration": 2.7980425357818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010852, + "balance_loss_mlp": 1.05484891, + "epoch": 0.5113505194305502, + "flos": 608745729024.0, + "grad_norm": 0.0652959904845647, + "language_loss": 0.83424717, + "learning_rate": 0.0005059191656224487, + "loss": 0.84509915, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.30297852, + "step": 2658, + "time_per_iteration": 2.735557794570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085406, + "balance_loss_mlp": 1.05488813, + "epoch": 0.5115429011158138, + "flos": 534214227456.0, + "grad_norm": 0.05645977889013881, + "language_loss": 0.89198554, + "learning_rate": 0.0005056076440151212, + "loss": 0.90283966, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.3046875, + "step": 2659, + "time_per_iteration": 2.651273012161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136875, + "balance_loss_mlp": 1.12314212, + "epoch": 0.5117352828010774, + "flos": 1361451580416.0, + "grad_norm": 0.05420368374393455, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77424991, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.13769531, + "step": 2660, + "time_per_iteration": 4.8447229862213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085456, + "balance_loss_mlp": 1.05689311, + "epoch": 0.5119276644863409, + "flos": 633444046848.0, + "grad_norm": 0.04523661755748661, + "language_loss": 0.87268543, + "learning_rate": 0.0005049845943901691, + "loss": 0.88354003, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.28515625, + "step": 2661, + "time_per_iteration": 2.855107307434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079963, + "balance_loss_mlp": 1.05092359, + "epoch": 0.5121200461716044, + "flos": 585304468992.0, + "grad_norm": 0.05522645200412479, + "language_loss": 0.86379933, + "learning_rate": 0.0005046730666144338, + "loss": 0.87459898, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.2902832, + "step": 2662, + "time_per_iteration": 2.841339349746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082682, + "balance_loss_mlp": 1.05390453, + "epoch": 0.512312427856868, + "flos": 1032081661440.0, + "grad_norm": 0.05374936854204756, + "language_loss": 0.87915027, + "learning_rate": 0.0005043615370244532, + "loss": 0.8899771, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.2878418, + "step": 2663, + "time_per_iteration": 3.364856004714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049781, + "balance_loss_mlp": 1.03728747, + "epoch": 0.5125048095421316, + "flos": 1537257277440.0, + "grad_norm": 0.022479341124125186, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79294169, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.125, + "step": 2664, + "time_per_iteration": 4.635313510894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080439, + "balance_loss_mlp": 1.05163848, + "epoch": 0.5126971912273951, + "flos": 590814480384.0, + "grad_norm": 0.04479435391735135, + "language_loss": 0.85200715, + "learning_rate": 0.0005037384728855425, + "loss": 0.86281157, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.28808594, + "step": 2665, + "time_per_iteration": 2.7995188236236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083297, + "balance_loss_mlp": 1.05356586, + "epoch": 0.5128895729126587, + "flos": 551393215488.0, + "grad_norm": 0.0801864670549744, + "language_loss": 0.84280151, + "learning_rate": 0.0005034269385785075, + "loss": 0.85363448, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.29711914, + "step": 2666, + "time_per_iteration": 2.673332929611206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090699, + "balance_loss_mlp": 1.0623982, + "epoch": 0.5130819545979223, + "flos": 481031709696.0, + "grad_norm": 0.06501156427369086, + "language_loss": 0.84454274, + "learning_rate": 0.0005031154029410168, + "loss": 0.85544968, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.28344727, + "step": 2667, + "time_per_iteration": 2.5442566871643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086564, + "balance_loss_mlp": 1.0577395, + "epoch": 0.5132743362831859, + "flos": 475556603904.0, + "grad_norm": 0.06480382372099369, + "language_loss": 0.86841118, + "learning_rate": 0.0005028038660940197, + "loss": 0.87927675, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.28808594, + "step": 2668, + "time_per_iteration": 2.62888765335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077032, + "balance_loss_mlp": 1.04832673, + "epoch": 0.5134667179684494, + "flos": 503559175680.0, + "grad_norm": 0.05084400085528349, + "language_loss": 0.84573722, + "learning_rate": 0.0005024923281584648, + "loss": 0.85650754, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.28662109, + "step": 2669, + "time_per_iteration": 2.6316568851470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092041, + "balance_loss_mlp": 1.06312072, + "epoch": 0.5136590996537129, + "flos": 503647925760.0, + "grad_norm": 0.05870793453685439, + "language_loss": 0.82656723, + "learning_rate": 0.0005021807892553026, + "loss": 0.83748764, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.28881836, + "step": 2670, + "time_per_iteration": 2.707345724105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093085, + "balance_loss_mlp": 1.06457078, + "epoch": 0.5138514813389765, + "flos": 624330035712.0, + "grad_norm": 0.08829821247143162, + "language_loss": 0.84517181, + "learning_rate": 0.0005018692495054828, + "loss": 0.85610259, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.28540039, + "step": 2671, + "time_per_iteration": 2.758309841156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092768, + "balance_loss_mlp": 1.06399131, + "epoch": 0.5140438630242401, + "flos": 583274801664.0, + "grad_norm": 0.05555500929459815, + "language_loss": 0.80821186, + "learning_rate": 0.0005015577090299561, + "loss": 0.8191396, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.28735352, + "step": 2672, + "time_per_iteration": 2.6883137226104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090029, + "balance_loss_mlp": 1.06125236, + "epoch": 0.5142362447095037, + "flos": 487683887616.0, + "grad_norm": 0.06705414985084517, + "language_loss": 0.86672199, + "learning_rate": 0.0005012461679496729, + "loss": 0.87762225, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.28759766, + "step": 2673, + "time_per_iteration": 2.5949177742004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092599, + "balance_loss_mlp": 1.0630827, + "epoch": 0.5144286263947672, + "flos": 526601765376.0, + "grad_norm": 0.06054107713253035, + "language_loss": 0.87204134, + "learning_rate": 0.0005009346263855848, + "loss": 0.88296735, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.29467773, + "step": 2674, + "time_per_iteration": 2.6084070205688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093368, + "balance_loss_mlp": 1.06401849, + "epoch": 0.5146210080800308, + "flos": 486252149760.0, + "grad_norm": 0.08912792131396882, + "language_loss": 0.83928424, + "learning_rate": 0.0005006230844586422, + "loss": 0.85021788, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.29345703, + "step": 2675, + "time_per_iteration": 2.8340024948120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094006, + "balance_loss_mlp": 1.06496692, + "epoch": 0.5148133897652943, + "flos": 515622440448.0, + "grad_norm": 0.06185145068902706, + "language_loss": 0.79025733, + "learning_rate": 0.0005003115422897968, + "loss": 0.80119741, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.29052734, + "step": 2676, + "time_per_iteration": 2.7350447177886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088176, + "balance_loss_mlp": 1.05780196, + "epoch": 0.5150057714505579, + "flos": 510963614208.0, + "grad_norm": 0.06610854708750855, + "language_loss": 0.86982405, + "learning_rate": 0.0005, + "loss": 0.88070583, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.30322266, + "step": 2677, + "time_per_iteration": 2.62941837310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082976, + "balance_loss_mlp": 1.0535078, + "epoch": 0.5151981531358215, + "flos": 910541164032.0, + "grad_norm": 0.05650592481949535, + "language_loss": 0.7918483, + "learning_rate": 0.0004996884577102033, + "loss": 0.80267811, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.29418945, + "step": 2678, + "time_per_iteration": 3.1128311157226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085723, + "balance_loss_mlp": 1.05577731, + "epoch": 0.515390534821085, + "flos": 471599013888.0, + "grad_norm": 0.05289591163695072, + "language_loss": 0.84550285, + "learning_rate": 0.000499376915541358, + "loss": 0.85636008, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.29907227, + "step": 2679, + "time_per_iteration": 2.709259271621704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082197, + "balance_loss_mlp": 1.0510838, + "epoch": 0.5155829165063486, + "flos": 649811137536.0, + "grad_norm": 0.05812477607611756, + "language_loss": 0.81116259, + "learning_rate": 0.0004990653736144155, + "loss": 0.82198453, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.31079102, + "step": 2680, + "time_per_iteration": 2.8433125019073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083796, + "balance_loss_mlp": 1.05318332, + "epoch": 0.5157752981916122, + "flos": 414038476800.0, + "grad_norm": 0.06443376303588658, + "language_loss": 0.8582924, + "learning_rate": 0.0004987538320503271, + "loss": 0.86913037, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.30566406, + "step": 2681, + "time_per_iteration": 2.492128372192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079646, + "balance_loss_mlp": 1.04860437, + "epoch": 0.5159676798768758, + "flos": 553569859584.0, + "grad_norm": 0.06119575969443392, + "language_loss": 0.83057904, + "learning_rate": 0.0004984422909700442, + "loss": 0.84137553, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.31005859, + "step": 2682, + "time_per_iteration": 2.6817965507507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107717, + "balance_loss_mlp": 1.04560328, + "epoch": 0.5161600615621393, + "flos": 586234229760.0, + "grad_norm": 0.06357079240733023, + "language_loss": 0.83849651, + "learning_rate": 0.0004981307504945173, + "loss": 0.84926826, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.31542969, + "step": 2683, + "time_per_iteration": 2.6884219646453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107835, + "balance_loss_mlp": 1.04764211, + "epoch": 0.5163524432474028, + "flos": 588568025088.0, + "grad_norm": 0.058627663819765745, + "language_loss": 0.89028186, + "learning_rate": 0.0004978192107446976, + "loss": 0.90106535, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.30664062, + "step": 2684, + "time_per_iteration": 2.7606394290924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074512, + "balance_loss_mlp": 1.04397011, + "epoch": 0.5165448249326664, + "flos": 503642133504.0, + "grad_norm": 0.05338243685455816, + "language_loss": 0.870161, + "learning_rate": 0.0004975076718415353, + "loss": 0.88090611, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.30493164, + "step": 2685, + "time_per_iteration": 2.594937562942505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081075, + "balance_loss_mlp": 1.04991364, + "epoch": 0.51673720661793, + "flos": 416539597824.0, + "grad_norm": 0.06078629774986462, + "language_loss": 0.90568233, + "learning_rate": 0.0004971961339059806, + "loss": 0.91649306, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.3112793, + "step": 2686, + "time_per_iteration": 2.4705729484558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075772, + "balance_loss_mlp": 1.04406273, + "epoch": 0.5169295883031936, + "flos": 598696164864.0, + "grad_norm": 0.067622669815522, + "language_loss": 0.83813852, + "learning_rate": 0.0004968845970589832, + "loss": 0.84889627, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.31689453, + "step": 2687, + "time_per_iteration": 2.6784517765045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108779, + "balance_loss_mlp": 1.05760634, + "epoch": 0.517121969988457, + "flos": 556543844352.0, + "grad_norm": 0.06982295057413529, + "language_loss": 0.84568465, + "learning_rate": 0.0004965730614214926, + "loss": 0.85656255, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.30151367, + "step": 2688, + "time_per_iteration": 2.628742218017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078435, + "balance_loss_mlp": 1.0470829, + "epoch": 0.5173143516737206, + "flos": 469214346240.0, + "grad_norm": 0.06558972316908819, + "language_loss": 0.85422957, + "learning_rate": 0.0004962615271144576, + "loss": 0.86501396, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.31323242, + "step": 2689, + "time_per_iteration": 2.5566818714141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079558, + "balance_loss_mlp": 1.04923093, + "epoch": 0.5175067333589842, + "flos": 719739067392.0, + "grad_norm": 0.32559574880762837, + "language_loss": 0.82639515, + "learning_rate": 0.0004959499942588264, + "loss": 0.83719069, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.30273438, + "step": 2690, + "time_per_iteration": 2.8994317054748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057487, + "balance_loss_mlp": 1.04442203, + "epoch": 0.5176991150442478, + "flos": 1465402834944.0, + "grad_norm": 0.028996752449645728, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79257512, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.13085938, + "step": 2691, + "time_per_iteration": 4.746784687042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109471, + "balance_loss_mlp": 1.07830977, + "epoch": 0.5178914967295114, + "flos": 612345346560.0, + "grad_norm": 0.12339515707636219, + "language_loss": 0.85558736, + "learning_rate": 0.0004953269333855661, + "loss": 0.86668211, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.3112793, + "step": 2692, + "time_per_iteration": 2.8191914558410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109212, + "balance_loss_mlp": 1.07991028, + "epoch": 0.5180838784147749, + "flos": 500663766528.0, + "grad_norm": 0.07785846219337349, + "language_loss": 0.84034789, + "learning_rate": 0.0004950154056098309, + "loss": 0.85143995, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.29272461, + "step": 2693, + "time_per_iteration": 2.686821222305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129818, + "balance_loss_mlp": 1.09963465, + "epoch": 0.5182762601000385, + "flos": 688531166208.0, + "grad_norm": 0.07144537100010277, + "language_loss": 0.83820134, + "learning_rate": 0.0004947038797692867, + "loss": 0.84949952, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.30126953, + "step": 2694, + "time_per_iteration": 2.8041090965270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128051, + "balance_loss_mlp": 1.09741426, + "epoch": 0.518468641785302, + "flos": 665315458560.0, + "grad_norm": 0.06183052783496024, + "language_loss": 0.77540803, + "learning_rate": 0.0004943923559848789, + "loss": 0.78668851, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.3059082, + "step": 2695, + "time_per_iteration": 2.797661781311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127895, + "balance_loss_mlp": 1.09756875, + "epoch": 0.5186610234705656, + "flos": 566440639488.0, + "grad_norm": 0.054443821670517534, + "language_loss": 0.90626478, + "learning_rate": 0.0004940808343775515, + "loss": 0.91754371, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.30297852, + "step": 2696, + "time_per_iteration": 2.708075523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126092, + "balance_loss_mlp": 1.09593177, + "epoch": 0.5188534051558291, + "flos": 428652324864.0, + "grad_norm": 0.08653085411735448, + "language_loss": 0.82187402, + "learning_rate": 0.0004937693150682479, + "loss": 0.83313495, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.30126953, + "step": 2697, + "time_per_iteration": 2.5607407093048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116261, + "balance_loss_mlp": 1.0861007, + "epoch": 0.5190457868410927, + "flos": 546085435392.0, + "grad_norm": 0.07683001308624603, + "language_loss": 0.76774538, + "learning_rate": 0.0004934577981779107, + "loss": 0.77890801, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.30175781, + "step": 2698, + "time_per_iteration": 2.730090618133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112238, + "balance_loss_mlp": 1.0813148, + "epoch": 0.5192381685263563, + "flos": 548321716224.0, + "grad_norm": 0.05605263998280499, + "language_loss": 0.81117129, + "learning_rate": 0.0004931462838274817, + "loss": 0.82229376, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.30883789, + "step": 2699, + "time_per_iteration": 2.847720146179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109944, + "balance_loss_mlp": 1.07957006, + "epoch": 0.5194305502116199, + "flos": 574993036800.0, + "grad_norm": 0.0574424557407856, + "language_loss": 0.84004086, + "learning_rate": 0.0004928347721379011, + "loss": 0.85114038, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.30322266, + "step": 2700, + "time_per_iteration": 2.6999762058258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102806, + "balance_loss_mlp": 1.07185948, + "epoch": 0.5196229318968835, + "flos": 434019741696.0, + "grad_norm": 0.05483286228362013, + "language_loss": 0.82044077, + "learning_rate": 0.0004925232632301089, + "loss": 0.83146882, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.30908203, + "step": 2701, + "time_per_iteration": 2.560593366622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098243, + "balance_loss_mlp": 1.06791615, + "epoch": 0.5198153135821469, + "flos": 558607007232.0, + "grad_norm": 0.06379159996009351, + "language_loss": 0.79575932, + "learning_rate": 0.0004922117572250431, + "loss": 0.80674177, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.30273438, + "step": 2702, + "time_per_iteration": 2.6621010303497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094553, + "balance_loss_mlp": 1.0648458, + "epoch": 0.5200076952674105, + "flos": 565397397504.0, + "grad_norm": 0.06234734694325623, + "language_loss": 0.80990833, + "learning_rate": 0.0004919002542436414, + "loss": 0.82085389, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.296875, + "step": 2703, + "time_per_iteration": 2.8583548069000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098109, + "balance_loss_mlp": 1.06806874, + "epoch": 0.5202000769526741, + "flos": 570916173312.0, + "grad_norm": 0.11086337696641164, + "language_loss": 0.81129456, + "learning_rate": 0.0004915887544068399, + "loss": 0.82227564, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.29980469, + "step": 2704, + "time_per_iteration": 2.6579208374023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097204, + "balance_loss_mlp": 1.06787837, + "epoch": 0.5203924586379377, + "flos": 693898583040.0, + "grad_norm": 0.06500287710368027, + "language_loss": 0.78155613, + "learning_rate": 0.0004912772578355736, + "loss": 0.79252815, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.29296875, + "step": 2705, + "time_per_iteration": 2.93152117729187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094522, + "balance_loss_mlp": 1.06395674, + "epoch": 0.5205848403232012, + "flos": 566215087104.0, + "grad_norm": 0.05937288472032104, + "language_loss": 0.82798421, + "learning_rate": 0.000490965764650776, + "loss": 0.83892947, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.30541992, + "step": 2706, + "time_per_iteration": 2.914069414138794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090504, + "balance_loss_mlp": 1.06048679, + "epoch": 0.5207772220084648, + "flos": 1213775539200.0, + "grad_norm": 0.08994605713309432, + "language_loss": 0.82582623, + "learning_rate": 0.0004906542749733798, + "loss": 0.83673131, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.29980469, + "step": 2707, + "time_per_iteration": 3.632612943649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086422, + "balance_loss_mlp": 1.05647707, + "epoch": 0.5209696036937284, + "flos": 592547374080.0, + "grad_norm": 0.05099864574791971, + "language_loss": 0.85112798, + "learning_rate": 0.0004903427889243156, + "loss": 0.86199224, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.29907227, + "step": 2708, + "time_per_iteration": 2.860605001449585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089192, + "balance_loss_mlp": 1.05898452, + "epoch": 0.5211619853789919, + "flos": 522623826432.0, + "grad_norm": 0.058285600596581014, + "language_loss": 0.85712206, + "learning_rate": 0.0004900313066245134, + "loss": 0.86801398, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.30151367, + "step": 2709, + "time_per_iteration": 2.6910862922668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078824, + "balance_loss_mlp": 1.04873538, + "epoch": 0.5213543670642555, + "flos": 502534872576.0, + "grad_norm": 0.06298998318770882, + "language_loss": 0.81023324, + "learning_rate": 0.0004897198281949012, + "loss": 0.8210215, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.30029297, + "step": 2710, + "time_per_iteration": 2.660783290863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085709, + "balance_loss_mlp": 1.0563364, + "epoch": 0.521546748749519, + "flos": 585682790400.0, + "grad_norm": 0.06559869836216795, + "language_loss": 0.77832824, + "learning_rate": 0.0004894083537564057, + "loss": 0.78918535, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.29345703, + "step": 2711, + "time_per_iteration": 2.7276909351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079715, + "balance_loss_mlp": 1.04965043, + "epoch": 0.5217391304347826, + "flos": 569833643520.0, + "grad_norm": 0.0684248274147048, + "language_loss": 0.80827081, + "learning_rate": 0.0004890968834299519, + "loss": 0.81906796, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.30029297, + "step": 2712, + "time_per_iteration": 2.738229751586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079667, + "balance_loss_mlp": 1.04974508, + "epoch": 0.5219315121200462, + "flos": 542501784576.0, + "grad_norm": 0.061787257592987296, + "language_loss": 0.78808606, + "learning_rate": 0.0004887854173364633, + "loss": 0.79888272, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.29882812, + "step": 2713, + "time_per_iteration": 2.734443426132202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074151, + "balance_loss_mlp": 1.04480171, + "epoch": 0.5221238938053098, + "flos": 550006557696.0, + "grad_norm": 0.05102910961180143, + "language_loss": 0.81491256, + "learning_rate": 0.0004884739555968617, + "loss": 0.82565403, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.29272461, + "step": 2714, + "time_per_iteration": 2.867036819458008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069714, + "balance_loss_mlp": 1.05559933, + "epoch": 0.5223162754905732, + "flos": 1354373028864.0, + "grad_norm": 0.021468860083039186, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80046767, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.14160156, + "step": 2715, + "time_per_iteration": 4.962530851364136 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107621, + "balance_loss_mlp": 1.04559731, + "epoch": 0.5225086571758368, + "flos": 567441621504.0, + "grad_norm": 0.06298546380073215, + "language_loss": 0.86646473, + "learning_rate": 0.0004878510456629992, + "loss": 0.87722689, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.30566406, + "step": 2716, + "time_per_iteration": 2.9603123664855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081784, + "balance_loss_mlp": 1.05110002, + "epoch": 0.5227010388611004, + "flos": 499914478080.0, + "grad_norm": 0.07025764068668285, + "language_loss": 0.85336471, + "learning_rate": 0.00048753959771057314, + "loss": 0.86418259, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.30639648, + "step": 2717, + "time_per_iteration": 2.632622480392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085269, + "balance_loss_mlp": 1.05389357, + "epoch": 0.522893420546364, + "flos": 597372115968.0, + "grad_norm": 0.05729998182106491, + "language_loss": 0.82715809, + "learning_rate": 0.0004872281545957044, + "loss": 0.83801079, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.31347656, + "step": 2718, + "time_per_iteration": 2.7305338382720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078735, + "balance_loss_mlp": 1.04726386, + "epoch": 0.5230858022316276, + "flos": 664278008832.0, + "grad_norm": 0.058019575066879846, + "language_loss": 0.86264348, + "learning_rate": 0.0004869167164393055, + "loss": 0.87343085, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.31445312, + "step": 2719, + "time_per_iteration": 2.9418067932128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075601, + "balance_loss_mlp": 1.04472566, + "epoch": 0.5232781839168911, + "flos": 603547047936.0, + "grad_norm": 0.0640312473735956, + "language_loss": 0.89536262, + "learning_rate": 0.00048660528336228793, + "loss": 0.90611863, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.30834961, + "step": 2720, + "time_per_iteration": 2.8314764499664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075602, + "balance_loss_mlp": 1.04506063, + "epoch": 0.5234705656021547, + "flos": 550438723584.0, + "grad_norm": 0.05104764752581424, + "language_loss": 0.89906192, + "learning_rate": 0.0004862938554855606, + "loss": 0.90981793, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.30517578, + "step": 2721, + "time_per_iteration": 2.7912685871124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077284, + "balance_loss_mlp": 1.04705238, + "epoch": 0.5236629472874182, + "flos": 504026247168.0, + "grad_norm": 0.09225462001304952, + "language_loss": 0.86140561, + "learning_rate": 0.0004859824329300304, + "loss": 0.87217844, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.30200195, + "step": 2722, + "time_per_iteration": 2.5850255489349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081058, + "balance_loss_mlp": 1.0504688, + "epoch": 0.5238553289726818, + "flos": 547394927616.0, + "grad_norm": 0.05217438950511115, + "language_loss": 0.83504456, + "learning_rate": 0.00048567101581660244, + "loss": 0.84585512, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.30541992, + "step": 2723, + "time_per_iteration": 2.6090264320373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077999, + "balance_loss_mlp": 1.04712343, + "epoch": 0.5240477106579453, + "flos": 531702931968.0, + "grad_norm": 0.07777816613104971, + "language_loss": 0.8713702, + "learning_rate": 0.00048535960426617956, + "loss": 0.88215029, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.30834961, + "step": 2724, + "time_per_iteration": 2.6143879890441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079989, + "balance_loss_mlp": 1.04966187, + "epoch": 0.5242400923432089, + "flos": 617653126656.0, + "grad_norm": 0.061907794652793086, + "language_loss": 0.81729943, + "learning_rate": 0.0004850481983996621, + "loss": 0.82809931, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.30273438, + "step": 2725, + "time_per_iteration": 2.7439112663269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081834, + "balance_loss_mlp": 1.05174541, + "epoch": 0.5244324740284725, + "flos": 416461022208.0, + "grad_norm": 0.06296520541747418, + "language_loss": 0.87762207, + "learning_rate": 0.0004847367983379492, + "loss": 0.88844043, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.30053711, + "step": 2726, + "time_per_iteration": 2.497286796569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080055, + "balance_loss_mlp": 1.05056226, + "epoch": 0.5246248557137361, + "flos": 626113801728.0, + "grad_norm": 0.09099502950257793, + "language_loss": 0.78826892, + "learning_rate": 0.00048442540420193643, + "loss": 0.79906946, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.29418945, + "step": 2727, + "time_per_iteration": 2.9191126823425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077698, + "balance_loss_mlp": 1.04751396, + "epoch": 0.5248172373989997, + "flos": 1247980746240.0, + "grad_norm": 0.061166777448516674, + "language_loss": 0.79150236, + "learning_rate": 0.0004841140161125182, + "loss": 0.80227935, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.30126953, + "step": 2728, + "time_per_iteration": 3.5845582485198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082892, + "balance_loss_mlp": 1.05306578, + "epoch": 0.5250096190842631, + "flos": 506616118272.0, + "grad_norm": 0.06421237850995067, + "language_loss": 0.84691751, + "learning_rate": 0.0004838026341905857, + "loss": 0.85774648, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.29785156, + "step": 2729, + "time_per_iteration": 2.75872540473938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080479, + "balance_loss_mlp": 1.05010509, + "epoch": 0.5252020007695267, + "flos": 611021297664.0, + "grad_norm": 0.051610102750965434, + "language_loss": 0.85352898, + "learning_rate": 0.00048349125855702844, + "loss": 0.86433375, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.30322266, + "step": 2730, + "time_per_iteration": 2.7679519653320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108307, + "balance_loss_mlp": 1.05322015, + "epoch": 0.5253943824547903, + "flos": 538970568192.0, + "grad_norm": 0.05904184367240025, + "language_loss": 0.81296933, + "learning_rate": 0.00048317988933273287, + "loss": 0.82380003, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.29785156, + "step": 2731, + "time_per_iteration": 2.7559163570404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079843, + "balance_loss_mlp": 1.0495404, + "epoch": 0.5255867641400539, + "flos": 697714988544.0, + "grad_norm": 0.06321650060381495, + "language_loss": 0.8227402, + "learning_rate": 0.00048286852663858367, + "loss": 0.83353865, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.30273438, + "step": 2732, + "time_per_iteration": 2.9430267810821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077146, + "balance_loss_mlp": 1.04710531, + "epoch": 0.5257791458253175, + "flos": 666975568896.0, + "grad_norm": 0.05929618739033729, + "language_loss": 0.84009433, + "learning_rate": 0.000482557170595462, + "loss": 0.85086572, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.30004883, + "step": 2733, + "time_per_iteration": 2.914397954940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083368, + "balance_loss_mlp": 1.05194473, + "epoch": 0.525971527510581, + "flos": 483375679488.0, + "grad_norm": 0.05379595829627383, + "language_loss": 0.87649244, + "learning_rate": 0.0004822458213242475, + "loss": 0.88732612, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.31396484, + "step": 2734, + "time_per_iteration": 2.533350944519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082101, + "balance_loss_mlp": 1.05215609, + "epoch": 0.5261639091958445, + "flos": 829559715840.0, + "grad_norm": 0.15308762813128413, + "language_loss": 0.85928154, + "learning_rate": 0.00048193447894581627, + "loss": 0.87010252, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.29882812, + "step": 2735, + "time_per_iteration": 3.0971109867095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081636, + "balance_loss_mlp": 1.05190539, + "epoch": 0.5263562908811081, + "flos": 520461739008.0, + "grad_norm": 0.059512944610192846, + "language_loss": 0.88020355, + "learning_rate": 0.00048162314358104243, + "loss": 0.89101994, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.296875, + "step": 2736, + "time_per_iteration": 2.619262456893921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081772, + "balance_loss_mlp": 1.05268502, + "epoch": 0.5265486725663717, + "flos": 574722404352.0, + "grad_norm": 0.05996263826740056, + "language_loss": 0.83247852, + "learning_rate": 0.0004813118153507969, + "loss": 0.84329623, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.29052734, + "step": 2737, + "time_per_iteration": 2.724499464035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079963, + "balance_loss_mlp": 1.06603909, + "epoch": 0.5267410542516352, + "flos": 1546439537664.0, + "grad_norm": 0.02099488410784391, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83527088, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.13964844, + "step": 2738, + "time_per_iteration": 4.7655651569366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109097, + "balance_loss_mlp": 1.06135821, + "epoch": 0.5269334359368988, + "flos": 929576701440.0, + "grad_norm": 0.054521404688675106, + "language_loss": 0.83406657, + "learning_rate": 0.00048068918077736163, + "loss": 0.84497625, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.29541016, + "step": 2739, + "time_per_iteration": 3.2117719650268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087742, + "balance_loss_mlp": 1.05820239, + "epoch": 0.5271258176221624, + "flos": 655079629824.0, + "grad_norm": 0.06027403163408104, + "language_loss": 0.81200749, + "learning_rate": 0.0004803778746759001, + "loss": 0.82288492, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.29492188, + "step": 2740, + "time_per_iteration": 2.883953809738159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085865, + "balance_loss_mlp": 1.05627775, + "epoch": 0.527318199307426, + "flos": 542781181440.0, + "grad_norm": 0.07072803117785999, + "language_loss": 0.81773007, + "learning_rate": 0.00048006657619242317, + "loss": 0.82858872, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.29541016, + "step": 2741, + "time_per_iteration": 2.6289987564086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108813, + "balance_loss_mlp": 1.05959105, + "epoch": 0.5275105809926895, + "flos": 447629635584.0, + "grad_norm": 0.07275993710061575, + "language_loss": 0.78293514, + "learning_rate": 0.00047975528544778775, + "loss": 0.79381645, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.28491211, + "step": 2742, + "time_per_iteration": 2.6370468139648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085684, + "balance_loss_mlp": 1.05685973, + "epoch": 0.527702962677953, + "flos": 578656673280.0, + "grad_norm": 0.08133754904485412, + "language_loss": 0.88532221, + "learning_rate": 0.00047944400256284754, + "loss": 0.89617908, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.28808594, + "step": 2743, + "time_per_iteration": 2.6988437175750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084011, + "balance_loss_mlp": 1.05504286, + "epoch": 0.5278953443632166, + "flos": 652465027584.0, + "grad_norm": 0.061354637447893066, + "language_loss": 0.8008759, + "learning_rate": 0.0004791327276584532, + "loss": 0.81171608, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.28930664, + "step": 2744, + "time_per_iteration": 2.843850612640381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092207, + "balance_loss_mlp": 1.0627383, + "epoch": 0.5280877260484802, + "flos": 513741159936.0, + "grad_norm": 0.06451817982099761, + "language_loss": 0.80512536, + "learning_rate": 0.00047882146085545264, + "loss": 0.81604743, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.29418945, + "step": 2745, + "time_per_iteration": 2.6313765048980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059727, + "balance_loss_mlp": 1.04713857, + "epoch": 0.5282801077337438, + "flos": 1444650370560.0, + "grad_norm": 0.01846816151842821, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76462114, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.12597656, + "step": 2746, + "time_per_iteration": 4.961829662322998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080481, + "balance_loss_mlp": 1.05105972, + "epoch": 0.5284724894190073, + "flos": 604580115456.0, + "grad_norm": 0.06475941859576588, + "language_loss": 0.79224515, + "learning_rate": 0.00047819895203700684, + "loss": 0.80304992, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.29394531, + "step": 2747, + "time_per_iteration": 2.727640151977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048677, + "balance_loss_mlp": 1.03618371, + "epoch": 0.5286648711042709, + "flos": 1494172224000.0, + "grad_norm": 0.01378573653182101, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76561111, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.125, + "step": 2748, + "time_per_iteration": 4.70350980758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074595, + "balance_loss_mlp": 1.04469705, + "epoch": 0.5288572527895344, + "flos": 597313889280.0, + "grad_norm": 0.06074589131451646, + "language_loss": 0.88260013, + "learning_rate": 0.0004775764770742277, + "loss": 0.89334607, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.29907227, + "step": 2749, + "time_per_iteration": 2.8722305297851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080903, + "balance_loss_mlp": 1.05064785, + "epoch": 0.529049634474798, + "flos": 557041439232.0, + "grad_norm": 0.1215004440050613, + "language_loss": 0.86453164, + "learning_rate": 0.00047726525259079777, + "loss": 0.8753407, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.30224609, + "step": 2750, + "time_per_iteration": 2.782618522644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082004, + "balance_loss_mlp": 1.05203521, + "epoch": 0.5292420161600616, + "flos": 580986086400.0, + "grad_norm": 0.07030365944612293, + "language_loss": 0.88707, + "learning_rate": 0.0004769540369337798, + "loss": 0.89789003, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.29931641, + "step": 2751, + "time_per_iteration": 2.7570507526397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078279, + "balance_loss_mlp": 1.04792809, + "epoch": 0.5294343978453251, + "flos": 607989086208.0, + "grad_norm": 0.06134745452443849, + "language_loss": 0.86018121, + "learning_rate": 0.00047664283022399794, + "loss": 0.87096399, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.3034668, + "step": 2752, + "time_per_iteration": 2.8683836460113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070772, + "balance_loss_mlp": 1.04101765, + "epoch": 0.5296267795305887, + "flos": 646226076672.0, + "grad_norm": 0.061305381303338104, + "language_loss": 0.80927074, + "learning_rate": 0.00047633163258227376, + "loss": 0.81997848, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.29711914, + "step": 2753, + "time_per_iteration": 2.889761447906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080468, + "balance_loss_mlp": 1.05040383, + "epoch": 0.5298191612158523, + "flos": 559482923520.0, + "grad_norm": 0.06040690928097006, + "language_loss": 0.85472161, + "learning_rate": 0.0004760204441294247, + "loss": 0.86552632, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.30004883, + "step": 2754, + "time_per_iteration": 2.7022712230682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078457, + "balance_loss_mlp": 1.04736757, + "epoch": 0.5300115429011159, + "flos": 513776065536.0, + "grad_norm": 0.08887078297019954, + "language_loss": 0.85966748, + "learning_rate": 0.00047570926498626486, + "loss": 0.87045205, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.31054688, + "step": 2755, + "time_per_iteration": 2.694779396057129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083154, + "balance_loss_mlp": 1.05130148, + "epoch": 0.5302039245863793, + "flos": 672475405824.0, + "grad_norm": 0.0527518505260492, + "language_loss": 0.8147307, + "learning_rate": 0.00047539809527360474, + "loss": 0.82556224, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.31835938, + "step": 2756, + "time_per_iteration": 2.8726418018341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086344, + "balance_loss_mlp": 1.05418181, + "epoch": 0.5303963062716429, + "flos": 730507396608.0, + "grad_norm": 0.05719732969355854, + "language_loss": 0.82233423, + "learning_rate": 0.0004750869351122511, + "loss": 0.83319771, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.32128906, + "step": 2757, + "time_per_iteration": 2.989522933959961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086301, + "balance_loss_mlp": 1.05397129, + "epoch": 0.5305886879569065, + "flos": 573156836352.0, + "grad_norm": 0.0731965335963944, + "language_loss": 0.81977046, + "learning_rate": 0.00047477578462300685, + "loss": 0.83063352, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.32324219, + "step": 2758, + "time_per_iteration": 2.7154197692871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108253, + "balance_loss_mlp": 1.05153537, + "epoch": 0.5307810696421701, + "flos": 694988315136.0, + "grad_norm": 0.05716072116198451, + "language_loss": 0.79401624, + "learning_rate": 0.0004744646439266718, + "loss": 0.80484152, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.30957031, + "step": 2759, + "time_per_iteration": 3.010188102722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087952, + "balance_loss_mlp": 1.05719638, + "epoch": 0.5309734513274337, + "flos": 648629683200.0, + "grad_norm": 0.06513852008932475, + "language_loss": 0.92120409, + "learning_rate": 0.000474153513144041, + "loss": 0.93208361, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.30712891, + "step": 2760, + "time_per_iteration": 2.9100866317749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090471, + "balance_loss_mlp": 1.05878544, + "epoch": 0.5311658330126972, + "flos": 604517506560.0, + "grad_norm": 0.05916855301127547, + "language_loss": 0.8678081, + "learning_rate": 0.00047384239239590633, + "loss": 0.87871277, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.31665039, + "step": 2761, + "time_per_iteration": 2.8746495246887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108692, + "balance_loss_mlp": 1.05516267, + "epoch": 0.5313582146979607, + "flos": 557995931136.0, + "grad_norm": 0.06020342742423831, + "language_loss": 0.88611233, + "learning_rate": 0.0004735312818030556, + "loss": 0.8969816, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.31738281, + "step": 2762, + "time_per_iteration": 2.670517921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092394, + "balance_loss_mlp": 1.06101847, + "epoch": 0.5315505963832243, + "flos": 508152572928.0, + "grad_norm": 0.05825845223399112, + "language_loss": 0.82783639, + "learning_rate": 0.0004732201814862727, + "loss": 0.83876032, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.31347656, + "step": 2763, + "time_per_iteration": 2.7706046104431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108792, + "balance_loss_mlp": 1.05740237, + "epoch": 0.5317429780684879, + "flos": 626132740608.0, + "grad_norm": 0.056446972258987926, + "language_loss": 0.81703943, + "learning_rate": 0.0004729090915663373, + "loss": 0.82791865, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.3046875, + "step": 2764, + "time_per_iteration": 2.8320751190185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088458, + "balance_loss_mlp": 1.0584892, + "epoch": 0.5319353597537514, + "flos": 476506713600.0, + "grad_norm": 0.06421691072563727, + "language_loss": 0.85022444, + "learning_rate": 0.00047259801216402534, + "loss": 0.86110902, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.29931641, + "step": 2765, + "time_per_iteration": 2.5070557594299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087661, + "balance_loss_mlp": 1.05735779, + "epoch": 0.532127741439015, + "flos": 501386913792.0, + "grad_norm": 0.06743519703895742, + "language_loss": 0.86185229, + "learning_rate": 0.00047228694340010845, + "loss": 0.87272882, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.30249023, + "step": 2766, + "time_per_iteration": 2.5665066242218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089224, + "balance_loss_mlp": 1.05918312, + "epoch": 0.5323201231242786, + "flos": 1164114063360.0, + "grad_norm": 0.057283919540088275, + "language_loss": 0.85907435, + "learning_rate": 0.0004719758853953544, + "loss": 0.86996663, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.29980469, + "step": 2767, + "time_per_iteration": 3.598590850830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093331, + "balance_loss_mlp": 1.06419635, + "epoch": 0.5325125048095422, + "flos": 378493254144.0, + "grad_norm": 0.07956086058885692, + "language_loss": 0.83881301, + "learning_rate": 0.00047166483827052645, + "loss": 0.84974635, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.29125977, + "step": 2768, + "time_per_iteration": 2.4224319458007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105739, + "balance_loss_mlp": 1.04441977, + "epoch": 0.5327048864948057, + "flos": 1540507534848.0, + "grad_norm": 0.033276153146473426, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78135878, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.12988281, + "step": 2769, + "time_per_iteration": 4.992494583129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089679, + "balance_loss_mlp": 1.05961394, + "epoch": 0.5328972681800692, + "flos": 910877225472.0, + "grad_norm": 0.06372002073291465, + "language_loss": 0.8365072, + "learning_rate": 0.000471042777143682, + "loss": 0.84740394, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.30029297, + "step": 2770, + "time_per_iteration": 3.214010715484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091808, + "balance_loss_mlp": 1.06255412, + "epoch": 0.5330896498653328, + "flos": 473660766720.0, + "grad_norm": 0.05770492360265134, + "language_loss": 0.79306901, + "learning_rate": 0.0004707317633831707, + "loss": 0.80398703, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.29223633, + "step": 2771, + "time_per_iteration": 2.5814082622528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090013, + "balance_loss_mlp": 1.06035328, + "epoch": 0.5332820315505964, + "flos": 501386913792.0, + "grad_norm": 0.06429055642690477, + "language_loss": 0.78255731, + "learning_rate": 0.00047042076098559673, + "loss": 0.79345745, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.29614258, + "step": 2772, + "time_per_iteration": 2.626574754714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096839, + "balance_loss_mlp": 1.06763303, + "epoch": 0.53347441323586, + "flos": 924043368960.0, + "grad_norm": 0.06567346515998468, + "language_loss": 0.73814428, + "learning_rate": 0.00047010977007170174, + "loss": 0.74911261, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.29150391, + "step": 2773, + "time_per_iteration": 3.2639098167419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089963, + "balance_loss_mlp": 1.06039929, + "epoch": 0.5336667949211235, + "flos": 574185521664.0, + "grad_norm": 0.06353427502994992, + "language_loss": 0.82705283, + "learning_rate": 0.00046979879076222334, + "loss": 0.83795249, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.29516602, + "step": 2774, + "time_per_iteration": 2.702021360397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094292, + "balance_loss_mlp": 1.0655148, + "epoch": 0.533859176606387, + "flos": 1064233880064.0, + "grad_norm": 0.051161955256212054, + "language_loss": 0.84535086, + "learning_rate": 0.0004694878231778939, + "loss": 0.8562938, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.28759766, + "step": 2775, + "time_per_iteration": 3.37555193901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094093, + "balance_loss_mlp": 1.06471944, + "epoch": 0.5340515582916506, + "flos": 746277967872.0, + "grad_norm": 0.05222814179658164, + "language_loss": 0.8401432, + "learning_rate": 0.0004691768674394423, + "loss": 0.85108411, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.29321289, + "step": 2776, + "time_per_iteration": 2.992685317993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024339, + "balance_loss_mlp": 1.01251328, + "epoch": 0.5342439399769142, + "flos": 1444905036288.0, + "grad_norm": 0.010305238226800423, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85508353, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.11816406, + "step": 2777, + "time_per_iteration": 4.753941059112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021329, + "balance_loss_mlp": 1.00950325, + "epoch": 0.5344363216621778, + "flos": 1426790495232.0, + "grad_norm": 0.008050007723784799, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77674866, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.11816406, + "step": 2778, + "time_per_iteration": 4.980912923812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091368, + "balance_loss_mlp": 1.0625428, + "epoch": 0.5346287033474413, + "flos": 527355436032.0, + "grad_norm": 0.05741424367086941, + "language_loss": 0.79571807, + "learning_rate": 0.00046824407250656676, + "loss": 0.80663168, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.28808594, + "step": 2779, + "time_per_iteration": 2.641680955886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109255, + "balance_loss_mlp": 1.06303382, + "epoch": 0.5348210850327049, + "flos": 510515481600.0, + "grad_norm": 0.05780417685778494, + "language_loss": 0.83320916, + "learning_rate": 0.0004679331653588161, + "loss": 0.84413469, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.29467773, + "step": 2780, + "time_per_iteration": 2.6292784214019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086907, + "balance_loss_mlp": 1.05741477, + "epoch": 0.5350134667179685, + "flos": 462429748224.0, + "grad_norm": 0.07200473336731207, + "language_loss": 0.8539027, + "learning_rate": 0.0004676222706605147, + "loss": 0.86477172, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.29467773, + "step": 2781, + "time_per_iteration": 2.633302927017212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082924, + "balance_loss_mlp": 1.05355036, + "epoch": 0.535205848403232, + "flos": 708566275584.0, + "grad_norm": 0.06052388593462891, + "language_loss": 0.85071301, + "learning_rate": 0.0004673113885323626, + "loss": 0.86154234, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.29321289, + "step": 2782, + "time_per_iteration": 2.8385848999023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108118, + "balance_loss_mlp": 1.05152082, + "epoch": 0.5353982300884956, + "flos": 893855388672.0, + "grad_norm": 0.04759682065371887, + "language_loss": 0.78464407, + "learning_rate": 0.00046700051909505494, + "loss": 0.79545587, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.29638672, + "step": 2783, + "time_per_iteration": 3.17055344581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087683, + "balance_loss_mlp": 1.05730867, + "epoch": 0.5355906117737591, + "flos": 535701219840.0, + "grad_norm": 0.06917760310735488, + "language_loss": 0.83446693, + "learning_rate": 0.000466689662469282, + "loss": 0.84534377, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.3034668, + "step": 2784, + "time_per_iteration": 2.6696882247924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080736, + "balance_loss_mlp": 1.05048084, + "epoch": 0.5357829934590227, + "flos": 868477593600.0, + "grad_norm": 0.0647182284961505, + "language_loss": 0.84010589, + "learning_rate": 0.00046637881877572917, + "loss": 0.85091329, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.30200195, + "step": 2785, + "time_per_iteration": 3.0897059440612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107764, + "balance_loss_mlp": 1.04783738, + "epoch": 0.5359753751442863, + "flos": 552999481344.0, + "grad_norm": 0.2060352755327757, + "language_loss": 0.84354532, + "learning_rate": 0.0004660679881350764, + "loss": 0.85432178, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.29736328, + "step": 2786, + "time_per_iteration": 2.763195753097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036965, + "balance_loss_mlp": 1.0236131, + "epoch": 0.5361677568295499, + "flos": 1479687823872.0, + "grad_norm": 0.018061436986608354, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76645112, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.13378906, + "step": 2787, + "time_per_iteration": 5.074235677719116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082567, + "balance_loss_mlp": 1.05223989, + "epoch": 0.5363601385148133, + "flos": 805910432256.0, + "grad_norm": 0.0731464482403051, + "language_loss": 0.77922016, + "learning_rate": 0.0004654463664951667, + "loss": 0.79004586, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.30273438, + "step": 2788, + "time_per_iteration": 2.9973762035369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086105, + "balance_loss_mlp": 1.05647016, + "epoch": 0.5365525202000769, + "flos": 507630246912.0, + "grad_norm": 0.06405642217776768, + "language_loss": 0.83215284, + "learning_rate": 0.0004651355757372447, + "loss": 0.84301388, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.2956543, + "step": 2789, + "time_per_iteration": 2.677021026611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089856, + "balance_loss_mlp": 1.05955315, + "epoch": 0.5367449018853405, + "flos": 528660546048.0, + "grad_norm": 0.05726084062519834, + "language_loss": 0.85958302, + "learning_rate": 0.00046482479851489274, + "loss": 0.87048161, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.30273438, + "step": 2790, + "time_per_iteration": 2.6652121543884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089898, + "balance_loss_mlp": 1.05933237, + "epoch": 0.5369372835706041, + "flos": 649614698496.0, + "grad_norm": 0.07271669587233448, + "language_loss": 0.77731752, + "learning_rate": 0.00046451403494876525, + "loss": 0.78821647, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.30541992, + "step": 2791, + "time_per_iteration": 2.897798776626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090037, + "balance_loss_mlp": 1.05882847, + "epoch": 0.5371296652558677, + "flos": 584205972480.0, + "grad_norm": 0.06591879115648011, + "language_loss": 0.84175646, + "learning_rate": 0.0004642032851595111, + "loss": 0.8526569, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.31176758, + "step": 2792, + "time_per_iteration": 2.758230209350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086262, + "balance_loss_mlp": 1.05543458, + "epoch": 0.5373220469411312, + "flos": 595570821120.0, + "grad_norm": 0.05973481987913333, + "language_loss": 0.84753001, + "learning_rate": 0.00046389254926777404, + "loss": 0.8583926, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.30810547, + "step": 2793, + "time_per_iteration": 2.7933902740478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086495, + "balance_loss_mlp": 1.05562031, + "epoch": 0.5375144286263948, + "flos": 1113965167104.0, + "grad_norm": 0.05136203618868989, + "language_loss": 0.7824527, + "learning_rate": 0.0004635818273941926, + "loss": 0.79331762, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.30859375, + "step": 2794, + "time_per_iteration": 3.564011335372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088501, + "balance_loss_mlp": 1.05786383, + "epoch": 0.5377068103116583, + "flos": 595319127552.0, + "grad_norm": 0.06685314707582615, + "language_loss": 0.81738025, + "learning_rate": 0.0004632711196593997, + "loss": 0.82826525, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.30639648, + "step": 2795, + "time_per_iteration": 2.7609026432037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089037, + "balance_loss_mlp": 1.05882931, + "epoch": 0.5378991919969219, + "flos": 883839320064.0, + "grad_norm": 0.06695327911218095, + "language_loss": 0.85338485, + "learning_rate": 0.00046296042618402297, + "loss": 0.86427522, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.30175781, + "step": 2796, + "time_per_iteration": 3.079580783843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083991, + "balance_loss_mlp": 1.05344939, + "epoch": 0.5380915736821854, + "flos": 710344249344.0, + "grad_norm": 0.05461778050704968, + "language_loss": 0.79521048, + "learning_rate": 0.0004626497470886839, + "loss": 0.80605042, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.30517578, + "step": 2797, + "time_per_iteration": 2.956915855407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086126, + "balance_loss_mlp": 1.0549171, + "epoch": 0.538283955367449, + "flos": 556721344512.0, + "grad_norm": 0.05348634251654363, + "language_loss": 0.81572765, + "learning_rate": 0.00046233908249399897, + "loss": 0.82658887, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.31176758, + "step": 2798, + "time_per_iteration": 2.7978243827819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087806, + "balance_loss_mlp": 1.05781281, + "epoch": 0.5384763370527126, + "flos": 513218833920.0, + "grad_norm": 0.07296004689367808, + "language_loss": 0.78106725, + "learning_rate": 0.00046202843252057905, + "loss": 0.79194534, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.29956055, + "step": 2799, + "time_per_iteration": 2.615086317062378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085619, + "balance_loss_mlp": 1.05522037, + "epoch": 0.5386687187379762, + "flos": 489490974720.0, + "grad_norm": 0.056459019467486986, + "language_loss": 0.83738667, + "learning_rate": 0.00046171779728902896, + "loss": 0.84824288, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.3034668, + "step": 2800, + "time_per_iteration": 2.613084077835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081602, + "balance_loss_mlp": 1.05025029, + "epoch": 0.5388611004232398, + "flos": 482415395328.0, + "grad_norm": 0.07411133953793157, + "language_loss": 0.86239338, + "learning_rate": 0.000461407176919948, + "loss": 0.87320936, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.31323242, + "step": 2801, + "time_per_iteration": 2.5331709384918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078309, + "balance_loss_mlp": 1.04838777, + "epoch": 0.5390534821085032, + "flos": 560709457920.0, + "grad_norm": 0.07244428600451569, + "language_loss": 0.85469061, + "learning_rate": 0.00046109657153392997, + "loss": 0.86547375, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.29858398, + "step": 2802, + "time_per_iteration": 2.7376809120178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081766, + "balance_loss_mlp": 1.05007982, + "epoch": 0.5392458637937668, + "flos": 488132020224.0, + "grad_norm": 0.06487466420670769, + "language_loss": 0.82949483, + "learning_rate": 0.0004607859812515622, + "loss": 0.84031248, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.31665039, + "step": 2803, + "time_per_iteration": 2.601752996444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078317, + "balance_loss_mlp": 1.0476799, + "epoch": 0.5394382454790304, + "flos": 511810417152.0, + "grad_norm": 0.06325281802882306, + "language_loss": 0.87643886, + "learning_rate": 0.00046047540619342667, + "loss": 0.88722193, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.3059082, + "step": 2804, + "time_per_iteration": 2.6036136150360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080625, + "balance_loss_mlp": 1.05056071, + "epoch": 0.539630627164294, + "flos": 567312173568.0, + "grad_norm": 0.0581751577303043, + "language_loss": 0.80008459, + "learning_rate": 0.00046016484648009933, + "loss": 0.81089091, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.30004883, + "step": 2805, + "time_per_iteration": 2.713219165802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080402, + "balance_loss_mlp": 1.05105305, + "epoch": 0.5398230088495575, + "flos": 526203095040.0, + "grad_norm": 0.057792621829283776, + "language_loss": 0.80917501, + "learning_rate": 0.0004598543022321501, + "loss": 0.81997907, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.29296875, + "step": 2806, + "time_per_iteration": 2.631939172744751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082616, + "balance_loss_mlp": 1.05281353, + "epoch": 0.5400153905348211, + "flos": 538493322240.0, + "grad_norm": 0.07612886672081497, + "language_loss": 0.79604518, + "learning_rate": 0.0004595437735701433, + "loss": 0.80687129, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.29736328, + "step": 2807, + "time_per_iteration": 2.701808214187622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081434, + "balance_loss_mlp": 1.0507021, + "epoch": 0.5402077722200846, + "flos": 513259531776.0, + "grad_norm": 0.07694205416949251, + "language_loss": 0.83500147, + "learning_rate": 0.00045923326061463623, + "loss": 0.84581584, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.30688477, + "step": 2808, + "time_per_iteration": 2.7844398021698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078771, + "balance_loss_mlp": 1.04725254, + "epoch": 0.5404001539053482, + "flos": 675932428800.0, + "grad_norm": 0.07660553916433042, + "language_loss": 0.81710881, + "learning_rate": 0.00045892276348618113, + "loss": 0.82789654, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.31494141, + "step": 2809, + "time_per_iteration": 2.982339859008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053757, + "balance_loss_mlp": 1.04088223, + "epoch": 0.5405925355906118, + "flos": 1553998155264.0, + "grad_norm": 0.023591100709610114, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.7931459, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.12890625, + "step": 2810, + "time_per_iteration": 5.077887296676636 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086772, + "balance_loss_mlp": 1.05580163, + "epoch": 0.5407849172758753, + "flos": 647004478464.0, + "grad_norm": 0.07053414384060859, + "language_loss": 0.80792511, + "learning_rate": 0.000458301817192603, + "loss": 0.81879282, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.30957031, + "step": 2811, + "time_per_iteration": 2.8369667530059814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038738, + "balance_loss_mlp": 1.02586305, + "epoch": 0.5409772989611389, + "flos": 1406641904640.0, + "grad_norm": 0.019629272648215536, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81880522, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.12890625, + "step": 2812, + "time_per_iteration": 4.8166663646698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079133, + "balance_loss_mlp": 1.04790044, + "epoch": 0.5411696806464025, + "flos": 554102360064.0, + "grad_norm": 0.05474211885389724, + "language_loss": 0.86781704, + "learning_rate": 0.00045768093565369983, + "loss": 0.87860835, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.31201172, + "step": 2813, + "time_per_iteration": 2.7311370372772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081245, + "balance_loss_mlp": 1.05077481, + "epoch": 0.5413620623316661, + "flos": 527853030912.0, + "grad_norm": 0.05950457911446913, + "language_loss": 0.8158434, + "learning_rate": 0.0004573705194685646, + "loss": 0.82665586, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.30444336, + "step": 2814, + "time_per_iteration": 2.733198404312134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081332, + "balance_loss_mlp": 1.0498848, + "epoch": 0.5415544440169295, + "flos": 598464820224.0, + "grad_norm": 0.06917969261153488, + "language_loss": 0.84880143, + "learning_rate": 0.00045706011983366157, + "loss": 0.85961473, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.31420898, + "step": 2815, + "time_per_iteration": 2.6939895153045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077713, + "balance_loss_mlp": 1.04683733, + "epoch": 0.5417468257021931, + "flos": 470519456256.0, + "grad_norm": 0.08149095023345422, + "language_loss": 0.82716835, + "learning_rate": 0.00045674973686949847, + "loss": 0.83794552, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.30834961, + "step": 2816, + "time_per_iteration": 2.532838821411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076826, + "balance_loss_mlp": 1.045784, + "epoch": 0.5419392073874567, + "flos": 680477773824.0, + "grad_norm": 0.06493873134640445, + "language_loss": 0.85336345, + "learning_rate": 0.0004564393706965766, + "loss": 0.86413169, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.31005859, + "step": 2817, + "time_per_iteration": 3.013608455657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077252, + "balance_loss_mlp": 1.04578137, + "epoch": 0.5421315890727203, + "flos": 462134384640.0, + "grad_norm": 0.06666383117391396, + "language_loss": 0.81068963, + "learning_rate": 0.00045612902143539116, + "loss": 0.82146215, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.31469727, + "step": 2818, + "time_per_iteration": 2.605372428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070647, + "balance_loss_mlp": 1.03998637, + "epoch": 0.5423239707579839, + "flos": 436727476224.0, + "grad_norm": 0.07813750406706815, + "language_loss": 0.81324685, + "learning_rate": 0.00045581868920642986, + "loss": 0.82395327, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.30615234, + "step": 2819, + "time_per_iteration": 2.4960100650787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077852, + "balance_loss_mlp": 1.04709649, + "epoch": 0.5425163524432474, + "flos": 458067695616.0, + "grad_norm": 0.07920473504276467, + "language_loss": 0.79243749, + "learning_rate": 0.00045550837413017457, + "loss": 0.80321598, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.30712891, + "step": 2820, + "time_per_iteration": 2.684987783432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072493, + "balance_loss_mlp": 1.04188037, + "epoch": 0.542708734128511, + "flos": 419267681280.0, + "grad_norm": 0.056801171387635116, + "language_loss": 0.85060829, + "learning_rate": 0.0004551980763271005, + "loss": 0.86133325, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.30566406, + "step": 2821, + "time_per_iteration": 2.6912834644317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075835, + "balance_loss_mlp": 1.04529333, + "epoch": 0.5429011158137745, + "flos": 678142568448.0, + "grad_norm": 0.05882616642734503, + "language_loss": 0.83789319, + "learning_rate": 0.0004548877959176756, + "loss": 0.84865159, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.30493164, + "step": 2822, + "time_per_iteration": 2.8441174030303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080776, + "balance_loss_mlp": 1.04985332, + "epoch": 0.5430934974990381, + "flos": 540664174080.0, + "grad_norm": 0.06945933761570218, + "language_loss": 0.86118329, + "learning_rate": 0.00045457753302236166, + "loss": 0.8719911, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.30908203, + "step": 2823, + "time_per_iteration": 2.6186442375183105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107393, + "balance_loss_mlp": 1.04312599, + "epoch": 0.5432858791843016, + "flos": 658175860224.0, + "grad_norm": 0.07165023342281863, + "language_loss": 0.87164384, + "learning_rate": 0.00045426728776161353, + "loss": 0.88238311, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.30761719, + "step": 2824, + "time_per_iteration": 2.7953178882598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081111, + "balance_loss_mlp": 1.05092704, + "epoch": 0.5434782608695652, + "flos": 531678200832.0, + "grad_norm": 0.05974352124313591, + "language_loss": 0.81803101, + "learning_rate": 0.00045395706025587863, + "loss": 0.8288421, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.30151367, + "step": 2825, + "time_per_iteration": 2.612980604171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076561, + "balance_loss_mlp": 1.04599547, + "epoch": 0.5436706425548288, + "flos": 608219020800.0, + "grad_norm": 0.07443979134593931, + "language_loss": 0.8264693, + "learning_rate": 0.00045364685062559843, + "loss": 0.83723497, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.30541992, + "step": 2826, + "time_per_iteration": 2.828479051589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076945, + "balance_loss_mlp": 1.04630804, + "epoch": 0.5438630242400924, + "flos": 705081549312.0, + "grad_norm": 0.061142502150282975, + "language_loss": 0.91168308, + "learning_rate": 0.0004533366589912067, + "loss": 0.92245257, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.30615234, + "step": 2827, + "time_per_iteration": 2.970296621322632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075368, + "balance_loss_mlp": 1.04599524, + "epoch": 0.544055405925356, + "flos": 856073885184.0, + "grad_norm": 0.07414497131093437, + "language_loss": 0.77502602, + "learning_rate": 0.0004530264854731306, + "loss": 0.78577971, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.29370117, + "step": 2828, + "time_per_iteration": 3.022944450378418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085089, + "balance_loss_mlp": 1.05521488, + "epoch": 0.5442477876106194, + "flos": 571483579392.0, + "grad_norm": 0.048879345895653556, + "language_loss": 0.84054667, + "learning_rate": 0.00045271633019179034, + "loss": 0.85139751, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.29833984, + "step": 2829, + "time_per_iteration": 2.7760679721832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086373, + "balance_loss_mlp": 1.05707121, + "epoch": 0.544440169295883, + "flos": 625246649856.0, + "grad_norm": 0.06402410848819869, + "language_loss": 0.87688053, + "learning_rate": 0.0004524061932675986, + "loss": 0.88774425, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.29248047, + "step": 2830, + "time_per_iteration": 2.830350637435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086958, + "balance_loss_mlp": 1.05691731, + "epoch": 0.5446325509811466, + "flos": 835896181248.0, + "grad_norm": 0.06453180665575306, + "language_loss": 0.86766136, + "learning_rate": 0.00045209607482096125, + "loss": 0.87853098, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.30029297, + "step": 2831, + "time_per_iteration": 3.0085608959198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082113, + "balance_loss_mlp": 1.05192947, + "epoch": 0.5448249326664102, + "flos": 483129778176.0, + "grad_norm": 0.06460698711812493, + "language_loss": 0.84066617, + "learning_rate": 0.0004517859749722772, + "loss": 0.85148734, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.30126953, + "step": 2832, + "time_per_iteration": 2.6471612453460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078289, + "balance_loss_mlp": 1.04803348, + "epoch": 0.5450173143516738, + "flos": 560799618048.0, + "grad_norm": 0.09569427913676506, + "language_loss": 0.78785688, + "learning_rate": 0.0004514758938419376, + "loss": 0.79863977, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.30200195, + "step": 2833, + "time_per_iteration": 2.8068594932556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038153, + "balance_loss_mlp": 1.02627981, + "epoch": 0.5452096960369373, + "flos": 1469632467456.0, + "grad_norm": 0.016706116470577157, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77958739, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.11865234, + "step": 2834, + "time_per_iteration": 4.907236814498901 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079515, + "balance_loss_mlp": 1.04871142, + "epoch": 0.5454020777222008, + "flos": 464827562496.0, + "grad_norm": 0.06561437539450005, + "language_loss": 0.83799005, + "learning_rate": 0.00045085578821782175, + "loss": 0.84878516, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.30761719, + "step": 2835, + "time_per_iteration": 2.538837194442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032792, + "balance_loss_mlp": 1.02082336, + "epoch": 0.5455944594074644, + "flos": 1468921056768.0, + "grad_norm": 0.016611239115941395, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77167535, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.11962891, + "step": 2836, + "time_per_iteration": 4.947264671325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107855, + "balance_loss_mlp": 1.04765117, + "epoch": 0.545786841092728, + "flos": 532900353024.0, + "grad_norm": 0.05618000101860937, + "language_loss": 0.8099249, + "learning_rate": 0.00045023575891159866, + "loss": 0.82071036, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.30859375, + "step": 2837, + "time_per_iteration": 2.7390823364257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025548, + "balance_loss_mlp": 1.01348448, + "epoch": 0.5459792227779915, + "flos": 1351633360896.0, + "grad_norm": 0.010465474292049673, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75789356, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.12060547, + "step": 2838, + "time_per_iteration": 4.913767576217651 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080178, + "balance_loss_mlp": 1.05025697, + "epoch": 0.5461716044632551, + "flos": 637584929280.0, + "grad_norm": 0.053509390521789255, + "language_loss": 0.78084177, + "learning_rate": 0.0004496158068861354, + "loss": 0.7916435, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.29882812, + "step": 2839, + "time_per_iteration": 2.816080331802368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085641, + "balance_loss_mlp": 1.05548143, + "epoch": 0.5463639861485187, + "flos": 602458725888.0, + "grad_norm": 0.05135655646470402, + "language_loss": 0.80302298, + "learning_rate": 0.00044930586015455207, + "loss": 0.81387937, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.30102539, + "step": 2840, + "time_per_iteration": 2.79626727104187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087336, + "balance_loss_mlp": 1.05717611, + "epoch": 0.5465563678337823, + "flos": 642208849920.0, + "grad_norm": 0.05566707414242676, + "language_loss": 0.89057064, + "learning_rate": 0.000448995933104179, + "loss": 0.90144402, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.30102539, + "step": 2841, + "time_per_iteration": 2.8602969646453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080566, + "balance_loss_mlp": 1.0502634, + "epoch": 0.5467487495190458, + "flos": 613852687872.0, + "grad_norm": 0.07080900039808569, + "language_loss": 0.80240697, + "learning_rate": 0.00044868602585534077, + "loss": 0.81321263, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.30297852, + "step": 2842, + "time_per_iteration": 2.9035747051239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078755, + "balance_loss_mlp": 1.04778409, + "epoch": 0.5469411312043093, + "flos": 460957312512.0, + "grad_norm": 0.061738359719804514, + "language_loss": 0.88582397, + "learning_rate": 0.0004483761385283541, + "loss": 0.89661151, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.30932617, + "step": 2843, + "time_per_iteration": 2.5193030834198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074267, + "balance_loss_mlp": 1.04448807, + "epoch": 0.5471335128895729, + "flos": 560930628096.0, + "grad_norm": 0.05447472334615201, + "language_loss": 0.81464523, + "learning_rate": 0.0004480662712435281, + "loss": 0.8253879, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.29736328, + "step": 2844, + "time_per_iteration": 2.731069326400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107206, + "balance_loss_mlp": 1.04254341, + "epoch": 0.5473258945748365, + "flos": 518437863936.0, + "grad_norm": 0.060615817798691185, + "language_loss": 0.8824929, + "learning_rate": 0.0004477564241211635, + "loss": 0.89321351, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.29467773, + "step": 2845, + "time_per_iteration": 2.5875682830810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079224, + "balance_loss_mlp": 1.04880142, + "epoch": 0.5475182762601001, + "flos": 433600722432.0, + "grad_norm": 0.0822753996114188, + "language_loss": 0.86914051, + "learning_rate": 0.0004474465972815541, + "loss": 0.87993276, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.30371094, + "step": 2846, + "time_per_iteration": 2.4777207374572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074275, + "balance_loss_mlp": 1.04406786, + "epoch": 0.5477106579453636, + "flos": 511308440064.0, + "grad_norm": 0.05432348028770475, + "language_loss": 0.87747157, + "learning_rate": 0.000447136790844985, + "loss": 0.88821435, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.30151367, + "step": 2847, + "time_per_iteration": 2.6856186389923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076945, + "balance_loss_mlp": 1.04623675, + "epoch": 0.5479030396306271, + "flos": 675606541824.0, + "grad_norm": 0.055626256163384374, + "language_loss": 0.81023288, + "learning_rate": 0.00044682700493173385, + "loss": 0.8210023, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.30664062, + "step": 2848, + "time_per_iteration": 2.8167617321014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082333, + "balance_loss_mlp": 1.05229259, + "epoch": 0.5480954213158907, + "flos": 875720498688.0, + "grad_norm": 0.06111415202222153, + "language_loss": 0.80075896, + "learning_rate": 0.00044651723966207004, + "loss": 0.81158233, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.29980469, + "step": 2849, + "time_per_iteration": 3.0959999561309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084207, + "balance_loss_mlp": 1.05435705, + "epoch": 0.5482878030011543, + "flos": 621715433472.0, + "grad_norm": 0.05903862339795778, + "language_loss": 0.78441715, + "learning_rate": 0.00044620749515625536, + "loss": 0.79525924, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.2980957, + "step": 2850, + "time_per_iteration": 2.7892706394195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108025, + "balance_loss_mlp": 1.05001831, + "epoch": 0.5484801846864179, + "flos": 496946285568.0, + "grad_norm": 0.0673362889441577, + "language_loss": 0.84918725, + "learning_rate": 0.00044589777153454334, + "loss": 0.85998976, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.30175781, + "step": 2851, + "time_per_iteration": 2.771003007888794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083219, + "balance_loss_mlp": 1.05241561, + "epoch": 0.5486725663716814, + "flos": 442202582016.0, + "grad_norm": 0.05413608872240749, + "language_loss": 0.83428276, + "learning_rate": 0.00044558806891717895, + "loss": 0.84511489, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.30761719, + "step": 2852, + "time_per_iteration": 2.499460220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088115, + "balance_loss_mlp": 1.0584085, + "epoch": 0.548864948056945, + "flos": 654867224064.0, + "grad_norm": 0.06786065051926819, + "language_loss": 0.79808474, + "learning_rate": 0.0004452783874243998, + "loss": 0.80896592, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.29663086, + "step": 2853, + "time_per_iteration": 2.8307228088378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084659, + "balance_loss_mlp": 1.05497599, + "epoch": 0.5490573297422086, + "flos": 545760958464.0, + "grad_norm": 0.06292410009946192, + "language_loss": 0.84795368, + "learning_rate": 0.00044496872717643475, + "loss": 0.85880023, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.29638672, + "step": 2854, + "time_per_iteration": 2.6626110076904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051353, + "balance_loss_mlp": 1.03819215, + "epoch": 0.5492497114274721, + "flos": 1589450245632.0, + "grad_norm": 0.03322747605543158, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.78140646, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.13183594, + "step": 2855, + "time_per_iteration": 4.957303285598755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083951, + "balance_loss_mlp": 1.05448246, + "epoch": 0.5494420931127356, + "flos": 750567237120.0, + "grad_norm": 0.04982994122271322, + "language_loss": 0.81768692, + "learning_rate": 0.0004443494708958217, + "loss": 0.82852638, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.29443359, + "step": 2856, + "time_per_iteration": 3.005343437194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088352, + "balance_loss_mlp": 1.0585736, + "epoch": 0.5496344747979992, + "flos": 625704956928.0, + "grad_norm": 0.04689474861444355, + "language_loss": 0.80522525, + "learning_rate": 0.0004440398751035906, + "loss": 0.8161087, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.29736328, + "step": 2857, + "time_per_iteration": 2.868595838546753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095367, + "balance_loss_mlp": 1.06659007, + "epoch": 0.5498268564832628, + "flos": 522859553280.0, + "grad_norm": 0.07030492887566664, + "language_loss": 0.83409548, + "learning_rate": 0.00044373030103700645, + "loss": 0.8450492, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.28759766, + "step": 2858, + "time_per_iteration": 2.5910122394561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094102, + "balance_loss_mlp": 1.06508696, + "epoch": 0.5500192381685264, + "flos": 604290544128.0, + "grad_norm": 0.06946154028242445, + "language_loss": 0.79413795, + "learning_rate": 0.000443420748816257, + "loss": 0.80507904, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.28979492, + "step": 2859, + "time_per_iteration": 2.825594663619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096344, + "balance_loss_mlp": 1.06706619, + "epoch": 0.55021161985379, + "flos": 520246361088.0, + "grad_norm": 0.06600867884275338, + "language_loss": 0.78576386, + "learning_rate": 0.0004431112185615208, + "loss": 0.79672724, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.29248047, + "step": 2860, + "time_per_iteration": 2.786670446395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090723, + "balance_loss_mlp": 1.06154037, + "epoch": 0.5504040015390534, + "flos": 489426955776.0, + "grad_norm": 0.06889565209263777, + "language_loss": 0.79788846, + "learning_rate": 0.00044280171039296845, + "loss": 0.80879569, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.29174805, + "step": 2861, + "time_per_iteration": 2.634674072265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090878, + "balance_loss_mlp": 1.0620054, + "epoch": 0.550596383224317, + "flos": 575519745024.0, + "grad_norm": 0.05438680375258401, + "language_loss": 0.88480103, + "learning_rate": 0.0004424922244307616, + "loss": 0.89570987, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.28857422, + "step": 2862, + "time_per_iteration": 2.6849331855773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093044, + "balance_loss_mlp": 1.06328964, + "epoch": 0.5507887649095806, + "flos": 642149213184.0, + "grad_norm": 0.06984640427248112, + "language_loss": 0.81865609, + "learning_rate": 0.00044218276079505315, + "loss": 0.82958651, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.29711914, + "step": 2863, + "time_per_iteration": 2.9186837673187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092581, + "balance_loss_mlp": 1.06289792, + "epoch": 0.5509811465948442, + "flos": 531589450752.0, + "grad_norm": 0.06524866768544495, + "language_loss": 0.74926496, + "learning_rate": 0.0004418733196059876, + "loss": 0.76019078, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.29663086, + "step": 2864, + "time_per_iteration": 2.74560546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084987, + "balance_loss_mlp": 1.05635333, + "epoch": 0.5511735282801077, + "flos": 654439440384.0, + "grad_norm": 0.056184402553186, + "language_loss": 0.79785758, + "learning_rate": 0.0004415639009837008, + "loss": 0.80870748, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.28637695, + "step": 2865, + "time_per_iteration": 2.81969952583313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087597, + "balance_loss_mlp": 1.05908251, + "epoch": 0.5513659099653713, + "flos": 529222159872.0, + "grad_norm": 0.061494004909324176, + "language_loss": 0.81620675, + "learning_rate": 0.00044125450504831955, + "loss": 0.82708275, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.28540039, + "step": 2866, + "time_per_iteration": 2.739954948425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085385, + "balance_loss_mlp": 1.05586863, + "epoch": 0.5515582916506349, + "flos": 554594162688.0, + "grad_norm": 0.07127737838687996, + "language_loss": 0.81880403, + "learning_rate": 0.0004409451319199622, + "loss": 0.82965791, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.29467773, + "step": 2867, + "time_per_iteration": 2.6776282787323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077009, + "balance_loss_mlp": 1.0484705, + "epoch": 0.5517506733358984, + "flos": 735067298304.0, + "grad_norm": 0.06535442843844029, + "language_loss": 0.84516299, + "learning_rate": 0.0004406357817187381, + "loss": 0.85593313, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.28540039, + "step": 2868, + "time_per_iteration": 3.002542495727539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081101, + "balance_loss_mlp": 1.05170417, + "epoch": 0.551943055021162, + "flos": 1114861432320.0, + "grad_norm": 0.05667738365358171, + "language_loss": 0.81411439, + "learning_rate": 0.0004403264545647474, + "loss": 0.82492542, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.29370117, + "step": 2869, + "time_per_iteration": 3.523195505142212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080839, + "balance_loss_mlp": 1.05196702, + "epoch": 0.5521354367064255, + "flos": 544092083712.0, + "grad_norm": 0.062383704003679354, + "language_loss": 0.8429901, + "learning_rate": 0.00044001715057808154, + "loss": 0.85379851, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.28808594, + "step": 2870, + "time_per_iteration": 2.759244680404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085176, + "balance_loss_mlp": 1.05496836, + "epoch": 0.5523278183916891, + "flos": 935889845760.0, + "grad_norm": 0.05408626919612749, + "language_loss": 0.81631571, + "learning_rate": 0.0004397078698788232, + "loss": 0.82716751, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.30175781, + "step": 2871, + "time_per_iteration": 3.2238638401031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_mlp": 1.0167197, + "epoch": 0.5525202000769527, + "flos": 1465117645824.0, + "grad_norm": 0.017765030651381717, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81471765, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.12695312, + "step": 2872, + "time_per_iteration": 4.941680431365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084518, + "balance_loss_mlp": 1.05442953, + "epoch": 0.5527125817622163, + "flos": 489554993664.0, + "grad_norm": 0.06021715836391359, + "language_loss": 0.77858603, + "learning_rate": 0.00043908937882281343, + "loss": 0.78943121, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.30029297, + "step": 2873, + "time_per_iteration": 2.6475777626037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079069, + "balance_loss_mlp": 1.04845667, + "epoch": 0.5529049634474797, + "flos": 634606562304.0, + "grad_norm": 0.05779342240658392, + "language_loss": 0.82503784, + "learning_rate": 0.0004387801687061814, + "loss": 0.83582854, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.30566406, + "step": 2874, + "time_per_iteration": 2.8554017543792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078914, + "balance_loss_mlp": 1.04963589, + "epoch": 0.5530973451327433, + "flos": 580986086400.0, + "grad_norm": 0.0636526113513214, + "language_loss": 0.80157411, + "learning_rate": 0.0004384709823571958, + "loss": 0.81236321, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.29223633, + "step": 2875, + "time_per_iteration": 2.749535322189331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076752, + "balance_loss_mlp": 1.04764128, + "epoch": 0.5532897268180069, + "flos": 1122030144000.0, + "grad_norm": 0.06015536663517987, + "language_loss": 0.82898968, + "learning_rate": 0.0004381618198958932, + "loss": 0.8397572, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.29052734, + "step": 2876, + "time_per_iteration": 3.518888235092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078853, + "balance_loss_mlp": 1.0494318, + "epoch": 0.5534821085032705, + "flos": 636965088768.0, + "grad_norm": 0.05611364502947972, + "language_loss": 0.83295852, + "learning_rate": 0.00043785268144230137, + "loss": 0.84374702, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.29418945, + "step": 2877, + "time_per_iteration": 2.8977479934692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078991, + "balance_loss_mlp": 1.04916453, + "epoch": 0.5536744901885341, + "flos": 570837597696.0, + "grad_norm": 0.07334940017367843, + "language_loss": 0.82020825, + "learning_rate": 0.00043754356711643837, + "loss": 0.83099812, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.29785156, + "step": 2878, + "time_per_iteration": 2.6804401874542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080304, + "balance_loss_mlp": 1.04964316, + "epoch": 0.5538668718737976, + "flos": 595418052096.0, + "grad_norm": 0.0625181232423103, + "language_loss": 0.84172422, + "learning_rate": 0.0004372344770383132, + "loss": 0.85252726, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.30615234, + "step": 2879, + "time_per_iteration": 2.80837345123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077442, + "balance_loss_mlp": 1.04766345, + "epoch": 0.5540592535590612, + "flos": 532324182528.0, + "grad_norm": 0.05711228581787917, + "language_loss": 0.82837629, + "learning_rate": 0.00043692541132792507, + "loss": 0.83915067, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.29736328, + "step": 2880, + "time_per_iteration": 2.7545833587646484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076783, + "balance_loss_mlp": 1.04738569, + "epoch": 0.5542516352443247, + "flos": 412398715392.0, + "grad_norm": 0.06446598855551679, + "language_loss": 0.83125883, + "learning_rate": 0.00043661637010526384, + "loss": 0.84202665, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.29370117, + "step": 2881, + "time_per_iteration": 2.4907724857330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072171, + "balance_loss_mlp": 1.04139102, + "epoch": 0.5544440169295883, + "flos": 547341083136.0, + "grad_norm": 0.05841414515956175, + "language_loss": 0.82957321, + "learning_rate": 0.00043630735349031025, + "loss": 0.8402949, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.30737305, + "step": 2882, + "time_per_iteration": 2.6922152042388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071624, + "balance_loss_mlp": 1.04101133, + "epoch": 0.5546363986148518, + "flos": 621518994432.0, + "grad_norm": 0.05422763519754927, + "language_loss": 0.81816816, + "learning_rate": 0.00043599836160303495, + "loss": 0.82888442, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.30566406, + "step": 2883, + "time_per_iteration": 2.861325979232788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069587, + "balance_loss_mlp": 1.03914118, + "epoch": 0.5548287803001154, + "flos": 704972450304.0, + "grad_norm": 0.05987077775612136, + "language_loss": 0.77311337, + "learning_rate": 0.0004356893945633995, + "loss": 0.78380919, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.30395508, + "step": 2884, + "time_per_iteration": 2.964421510696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070587, + "balance_loss_mlp": 1.03930664, + "epoch": 0.555021161985379, + "flos": 503952053760.0, + "grad_norm": 0.16390384373312603, + "language_loss": 0.81600153, + "learning_rate": 0.0004353804524913551, + "loss": 0.82670736, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.3125, + "step": 2885, + "time_per_iteration": 2.6043736934661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068449, + "balance_loss_mlp": 1.03721642, + "epoch": 0.5552135436706426, + "flos": 615782020608.0, + "grad_norm": 0.06199045057720987, + "language_loss": 0.81625175, + "learning_rate": 0.0004350715355068441, + "loss": 0.82693619, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.31225586, + "step": 2886, + "time_per_iteration": 2.7229857444763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072103, + "balance_loss_mlp": 1.04051256, + "epoch": 0.5554059253559062, + "flos": 463635933696.0, + "grad_norm": 0.06868325666686464, + "language_loss": 0.79814357, + "learning_rate": 0.00043476264372979847, + "loss": 0.80886459, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.31567383, + "step": 2887, + "time_per_iteration": 2.5191705226898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071885, + "balance_loss_mlp": 1.0417012, + "epoch": 0.5555983070411696, + "flos": 1561923813888.0, + "grad_norm": 0.07224884026335429, + "language_loss": 0.78504527, + "learning_rate": 0.0004344537772801408, + "loss": 0.79576409, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.30151367, + "step": 2888, + "time_per_iteration": 3.803917646408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032514, + "balance_loss_mlp": 1.02040219, + "epoch": 0.5557906887264332, + "flos": 1467093468672.0, + "grad_norm": 0.021049912274883148, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74454963, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.12109375, + "step": 2889, + "time_per_iteration": 4.967891216278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077464, + "balance_loss_mlp": 1.04613566, + "epoch": 0.5559830704116968, + "flos": 529575750144.0, + "grad_norm": 0.06601593716549485, + "language_loss": 0.83441556, + "learning_rate": 0.0004338361208426298, + "loss": 0.84519023, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.31298828, + "step": 2890, + "time_per_iteration": 2.6076786518096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077476, + "balance_loss_mlp": 1.0466727, + "epoch": 0.5561754520969604, + "flos": 650895077376.0, + "grad_norm": 0.05044338716051736, + "language_loss": 0.81248903, + "learning_rate": 0.00043352733109457164, + "loss": 0.82326382, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.30761719, + "step": 2891, + "time_per_iteration": 2.893113136291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081411, + "balance_loss_mlp": 1.05148911, + "epoch": 0.556367833782224, + "flos": 733968801792.0, + "grad_norm": 0.05185548617134015, + "language_loss": 0.84650671, + "learning_rate": 0.00043321856715349244, + "loss": 0.8573209, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.29907227, + "step": 2892, + "time_per_iteration": 2.9470455646514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080498, + "balance_loss_mlp": 1.05024242, + "epoch": 0.5565602154674875, + "flos": 672120405504.0, + "grad_norm": 0.060968656189677554, + "language_loss": 0.80153251, + "learning_rate": 0.00043290982913926466, + "loss": 0.81233752, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.30249023, + "step": 2893, + "time_per_iteration": 2.801114559173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082896, + "balance_loss_mlp": 1.05283189, + "epoch": 0.556752597152751, + "flos": 585911162880.0, + "grad_norm": 0.06077441603872835, + "language_loss": 0.83792776, + "learning_rate": 0.0004326011171717514, + "loss": 0.84875673, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.30004883, + "step": 2894, + "time_per_iteration": 2.889112710952759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077209, + "balance_loss_mlp": 1.04762125, + "epoch": 0.5569449788380146, + "flos": 437549548032.0, + "grad_norm": 0.06532751979042353, + "language_loss": 0.81112337, + "learning_rate": 0.0004322924313708051, + "loss": 0.82189548, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.29614258, + "step": 2895, + "time_per_iteration": 2.5237138271331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077612, + "balance_loss_mlp": 1.04895401, + "epoch": 0.5571373605232782, + "flos": 502002372096.0, + "grad_norm": 0.06395509577189365, + "language_loss": 0.84357458, + "learning_rate": 0.0004319837718562681, + "loss": 0.85435069, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.28686523, + "step": 2896, + "time_per_iteration": 2.6235451698303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081945, + "balance_loss_mlp": 1.05123627, + "epoch": 0.5573297422085417, + "flos": 577126010880.0, + "grad_norm": 0.07087835610959153, + "language_loss": 0.82998407, + "learning_rate": 0.0004316751387479726, + "loss": 0.8408035, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.30664062, + "step": 2897, + "time_per_iteration": 2.7460193634033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081079, + "balance_loss_mlp": 1.05115747, + "epoch": 0.5575221238938053, + "flos": 1343536754688.0, + "grad_norm": 0.06734561564060734, + "language_loss": 0.82601708, + "learning_rate": 0.0004313665321657409, + "loss": 0.83682787, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.29882812, + "step": 2898, + "time_per_iteration": 3.700585126876831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083979, + "balance_loss_mlp": 1.05393827, + "epoch": 0.5577145055790689, + "flos": 601680324096.0, + "grad_norm": 0.06408348461050545, + "language_loss": 0.79922706, + "learning_rate": 0.00043105795222938436, + "loss": 0.81006682, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.30004883, + "step": 2899, + "time_per_iteration": 2.785468816757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077879, + "balance_loss_mlp": 1.04776657, + "epoch": 0.5579068872643325, + "flos": 562353601536.0, + "grad_norm": 0.056878366734987945, + "language_loss": 0.78559703, + "learning_rate": 0.00043074939905870467, + "loss": 0.79637581, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.30078125, + "step": 2900, + "time_per_iteration": 2.6782429218292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081281, + "balance_loss_mlp": 1.05157411, + "epoch": 0.558099268949596, + "flos": 544292904960.0, + "grad_norm": 0.061480860141572814, + "language_loss": 0.806315, + "learning_rate": 0.0004304408727734927, + "loss": 0.81712782, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.296875, + "step": 2901, + "time_per_iteration": 2.6361851692199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089927, + "balance_loss_mlp": 1.05955291, + "epoch": 0.5582916506348595, + "flos": 552520825344.0, + "grad_norm": 0.045249909626423154, + "language_loss": 0.88812852, + "learning_rate": 0.0004301323734935288, + "loss": 0.89902782, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.3034668, + "step": 2902, + "time_per_iteration": 2.650801181793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085072, + "balance_loss_mlp": 1.05541265, + "epoch": 0.5584840323201231, + "flos": 543126007296.0, + "grad_norm": 0.061039385793722846, + "language_loss": 0.87144208, + "learning_rate": 0.000429823901338583, + "loss": 0.88229275, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.29638672, + "step": 2903, + "time_per_iteration": 2.603729486465454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108106, + "balance_loss_mlp": 1.05128181, + "epoch": 0.5586764140053867, + "flos": 815212118016.0, + "grad_norm": 0.060582508535745275, + "language_loss": 0.86712891, + "learning_rate": 0.00042951545642841513, + "loss": 0.87793946, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.29711914, + "step": 2904, + "time_per_iteration": 3.0844316482543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084583, + "balance_loss_mlp": 1.05437517, + "epoch": 0.5588687956906503, + "flos": 486196895232.0, + "grad_norm": 0.055991570648287706, + "language_loss": 0.86597067, + "learning_rate": 0.0004292070388827737, + "loss": 0.87681645, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.30175781, + "step": 2905, + "time_per_iteration": 2.561948537826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082655, + "balance_loss_mlp": 1.0526619, + "epoch": 0.5590611773759138, + "flos": 451809805824.0, + "grad_norm": 0.06056202554709599, + "language_loss": 0.80913132, + "learning_rate": 0.00042889864882139753, + "loss": 0.81995785, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.29956055, + "step": 2906, + "time_per_iteration": 2.584385871887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088672, + "balance_loss_mlp": 1.05913234, + "epoch": 0.5592535590611774, + "flos": 520693083648.0, + "grad_norm": 0.05654682862292604, + "language_loss": 0.81697655, + "learning_rate": 0.0004285902863640139, + "loss": 0.82786322, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.29516602, + "step": 2907, + "time_per_iteration": 2.598034620285034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084011, + "balance_loss_mlp": 1.05342221, + "epoch": 0.5594459407464409, + "flos": 552250192896.0, + "grad_norm": 0.05788374674587666, + "language_loss": 0.85753977, + "learning_rate": 0.00042828195163033966, + "loss": 0.86837995, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.30566406, + "step": 2908, + "time_per_iteration": 2.654411792755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081373, + "balance_loss_mlp": 1.05099869, + "epoch": 0.5596383224317045, + "flos": 484596421632.0, + "grad_norm": 0.05647224332708591, + "language_loss": 0.79214805, + "learning_rate": 0.0004279736447400812, + "loss": 0.80296183, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.30322266, + "step": 2909, + "time_per_iteration": 2.6054940223693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108459, + "balance_loss_mlp": 1.05421579, + "epoch": 0.5598307041169681, + "flos": 610976217600.0, + "grad_norm": 0.05245180641385236, + "language_loss": 0.78436708, + "learning_rate": 0.00042766536581293385, + "loss": 0.79521292, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.3034668, + "step": 2910, + "time_per_iteration": 2.735391139984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086034, + "balance_loss_mlp": 1.0553261, + "epoch": 0.5600230858022316, + "flos": 488585945088.0, + "grad_norm": 0.07209314448313818, + "language_loss": 0.79203892, + "learning_rate": 0.0004273571149685819, + "loss": 0.80289924, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.30664062, + "step": 2911, + "time_per_iteration": 2.7689387798309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081503, + "balance_loss_mlp": 1.05234432, + "epoch": 0.5602154674874952, + "flos": 598592858112.0, + "grad_norm": 0.05523073387542819, + "language_loss": 0.8391124, + "learning_rate": 0.00042704889232669937, + "loss": 0.84992743, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.29125977, + "step": 2912, + "time_per_iteration": 2.7328362464904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082045, + "balance_loss_mlp": 1.05288625, + "epoch": 0.5604078491727588, + "flos": 585697347072.0, + "grad_norm": 0.0608748772154565, + "language_loss": 0.85180819, + "learning_rate": 0.0004267406980069484, + "loss": 0.8626287, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.29150391, + "step": 2913, + "time_per_iteration": 2.6889522075653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083223, + "balance_loss_mlp": 1.05416012, + "epoch": 0.5606002308580224, + "flos": 540926042112.0, + "grad_norm": 0.0517518520900543, + "language_loss": 0.79621083, + "learning_rate": 0.0004264325321289808, + "loss": 0.80704308, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.2902832, + "step": 2914, + "time_per_iteration": 2.7854018211364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080994, + "balance_loss_mlp": 1.05145359, + "epoch": 0.5607926125432858, + "flos": 583654533120.0, + "grad_norm": 0.05874282962966631, + "language_loss": 0.86178029, + "learning_rate": 0.00042612439481243736, + "loss": 0.87259024, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.29516602, + "step": 2915, + "time_per_iteration": 2.7484261989593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082353, + "balance_loss_mlp": 1.05264628, + "epoch": 0.5609849942285494, + "flos": 627205095936.0, + "grad_norm": 0.06045457404054478, + "language_loss": 0.89827836, + "learning_rate": 0.00042581628617694735, + "loss": 0.90910184, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.29663086, + "step": 2916, + "time_per_iteration": 2.7450428009033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108385, + "balance_loss_mlp": 1.05376196, + "epoch": 0.561177375913813, + "flos": 588095161344.0, + "grad_norm": 0.06174360046329572, + "language_loss": 0.81716877, + "learning_rate": 0.0004255082063421296, + "loss": 0.82800722, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.30078125, + "step": 2917, + "time_per_iteration": 2.681556463241577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080705, + "balance_loss_mlp": 1.0505209, + "epoch": 0.5613697575990766, + "flos": 526774883328.0, + "grad_norm": 0.07215647610626674, + "language_loss": 0.85068524, + "learning_rate": 0.00042520015542759065, + "loss": 0.86149234, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.30151367, + "step": 2918, + "time_per_iteration": 2.838871717453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083881, + "balance_loss_mlp": 1.05379248, + "epoch": 0.5615621392843402, + "flos": 642351444480.0, + "grad_norm": 0.06380613116798055, + "language_loss": 0.88105166, + "learning_rate": 0.00042489213355292687, + "loss": 0.89189053, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.30053711, + "step": 2919, + "time_per_iteration": 2.882988214492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081698, + "balance_loss_mlp": 1.0521102, + "epoch": 0.5617545209696037, + "flos": 427524715008.0, + "grad_norm": 0.05903342570268675, + "language_loss": 0.80986512, + "learning_rate": 0.00042458414083772276, + "loss": 0.82068217, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.29541016, + "step": 2920, + "time_per_iteration": 2.520209550857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107915, + "balance_loss_mlp": 1.04829907, + "epoch": 0.5619469026548672, + "flos": 568140037632.0, + "grad_norm": 0.05182413981421792, + "language_loss": 0.85047603, + "learning_rate": 0.000424276177401552, + "loss": 0.86126757, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.30810547, + "step": 2921, + "time_per_iteration": 2.777956008911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075106, + "balance_loss_mlp": 1.04435039, + "epoch": 0.5621392843401308, + "flos": 504947243520.0, + "grad_norm": 0.05854064719302618, + "language_loss": 0.85700345, + "learning_rate": 0.0004239682433639763, + "loss": 0.86775458, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.30712891, + "step": 2922, + "time_per_iteration": 2.658231019973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074103, + "balance_loss_mlp": 1.04344249, + "epoch": 0.5623316660253944, + "flos": 516744258048.0, + "grad_norm": 0.07532891292065343, + "language_loss": 0.85277867, + "learning_rate": 0.0004236603388445467, + "loss": 0.86351973, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.30639648, + "step": 2923, + "time_per_iteration": 2.5820417404174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073675, + "balance_loss_mlp": 1.04346776, + "epoch": 0.5625240477106579, + "flos": 605732456448.0, + "grad_norm": 0.05777778027932593, + "language_loss": 0.82139969, + "learning_rate": 0.00042335246396280166, + "loss": 0.83213639, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.30151367, + "step": 2924, + "time_per_iteration": 2.7298922538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072793, + "balance_loss_mlp": 1.04198885, + "epoch": 0.5627164293959215, + "flos": 450203539968.0, + "grad_norm": 0.06950178029529624, + "language_loss": 0.90437222, + "learning_rate": 0.0004230446188382693, + "loss": 0.9151001, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.30761719, + "step": 2925, + "time_per_iteration": 2.533452033996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072327, + "balance_loss_mlp": 1.04133308, + "epoch": 0.5629088110811851, + "flos": 741734032896.0, + "grad_norm": 0.061159313769390204, + "language_loss": 0.80411077, + "learning_rate": 0.0004227368035904654, + "loss": 0.81483406, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.30957031, + "step": 2926, + "time_per_iteration": 2.953749895095825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070388, + "balance_loss_mlp": 1.04001379, + "epoch": 0.5631011927664487, + "flos": 496719323136.0, + "grad_norm": 0.05619049718209651, + "language_loss": 0.82702053, + "learning_rate": 0.00042242901833889474, + "loss": 0.83772445, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.30322266, + "step": 2927, + "time_per_iteration": 2.6141388416290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079835, + "balance_loss_mlp": 1.04977047, + "epoch": 0.5632935744517122, + "flos": 885774445056.0, + "grad_norm": 0.06403217415420936, + "language_loss": 0.86264247, + "learning_rate": 0.0004221212632030501, + "loss": 0.8734408, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.30004883, + "step": 2928, + "time_per_iteration": 3.0815889835357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079959, + "balance_loss_mlp": 1.04953694, + "epoch": 0.5634859561369757, + "flos": 604516096512.0, + "grad_norm": 0.0586888061552407, + "language_loss": 0.7995134, + "learning_rate": 0.0004218135383024124, + "loss": 0.81031299, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.30395508, + "step": 2929, + "time_per_iteration": 2.7041475772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074718, + "balance_loss_mlp": 1.04417634, + "epoch": 0.5636783378222393, + "flos": 453670737408.0, + "grad_norm": 0.06027811401713532, + "language_loss": 0.84979665, + "learning_rate": 0.0004215058437564511, + "loss": 0.86054391, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.30493164, + "step": 2930, + "time_per_iteration": 2.5627479553222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074654, + "balance_loss_mlp": 1.04427934, + "epoch": 0.5638707195075029, + "flos": 518206519296.0, + "grad_norm": 0.054381619158741505, + "language_loss": 0.8244099, + "learning_rate": 0.00042119817968462397, + "loss": 0.83515644, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.30322266, + "step": 2931, + "time_per_iteration": 2.5824992656707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076007, + "balance_loss_mlp": 1.04517913, + "epoch": 0.5640631011927665, + "flos": 564632142336.0, + "grad_norm": 0.06458971753482587, + "language_loss": 0.86743045, + "learning_rate": 0.0004208905462063766, + "loss": 0.87819058, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.30786133, + "step": 2932, + "time_per_iteration": 2.6889755725860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075474, + "balance_loss_mlp": 1.04447937, + "epoch": 0.56425548287803, + "flos": 516783545856.0, + "grad_norm": 0.05636003677155103, + "language_loss": 0.84317416, + "learning_rate": 0.00042058294344114315, + "loss": 0.85392892, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.30957031, + "step": 2933, + "time_per_iteration": 2.626492500305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073066, + "balance_loss_mlp": 1.0428108, + "epoch": 0.5644478645632935, + "flos": 853907415552.0, + "grad_norm": 0.05419859074132438, + "language_loss": 0.77552223, + "learning_rate": 0.0004202753715083456, + "loss": 0.78625292, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.30224609, + "step": 2934, + "time_per_iteration": 3.0855889320373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077905, + "balance_loss_mlp": 1.04767334, + "epoch": 0.5646402462485571, + "flos": 553175571456.0, + "grad_norm": 0.0600578906837947, + "language_loss": 0.81160748, + "learning_rate": 0.0004199678305273936, + "loss": 0.8223865, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.30200195, + "step": 2935, + "time_per_iteration": 2.680676221847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072428, + "balance_loss_mlp": 1.04176772, + "epoch": 0.5648326279338207, + "flos": 685661898240.0, + "grad_norm": 0.07403764487671594, + "language_loss": 0.81138289, + "learning_rate": 0.0004196603206176854, + "loss": 0.8221072, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.30615234, + "step": 2936, + "time_per_iteration": 2.930933713912964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084589, + "balance_loss_mlp": 1.05526328, + "epoch": 0.5650250096190843, + "flos": 802990291968.0, + "grad_norm": 0.06763515513860026, + "language_loss": 0.8344292, + "learning_rate": 0.000419352841898607, + "loss": 0.8452751, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.29272461, + "step": 2937, + "time_per_iteration": 2.983389377593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076783, + "balance_loss_mlp": 1.04714775, + "epoch": 0.5652173913043478, + "flos": 581787809280.0, + "grad_norm": 0.06159153322850295, + "language_loss": 0.77355075, + "learning_rate": 0.000419045394489532, + "loss": 0.78431857, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.29589844, + "step": 2938, + "time_per_iteration": 2.7125768661499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082739, + "balance_loss_mlp": 1.05229306, + "epoch": 0.5654097729896114, + "flos": 820269614592.0, + "grad_norm": 0.051986884313783496, + "language_loss": 0.76774859, + "learning_rate": 0.0004187379785098224, + "loss": 0.77857602, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.30395508, + "step": 2939, + "time_per_iteration": 3.127896547317505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078707, + "balance_loss_mlp": 1.04854691, + "epoch": 0.565602154674875, + "flos": 783826716672.0, + "grad_norm": 0.05965997721506439, + "language_loss": 0.83921504, + "learning_rate": 0.00041843059407882744, + "loss": 0.85000205, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.30126953, + "step": 2940, + "time_per_iteration": 2.97220778465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010812, + "balance_loss_mlp": 1.05113554, + "epoch": 0.5657945363601385, + "flos": 549418802688.0, + "grad_norm": 0.05367108270531433, + "language_loss": 0.82534146, + "learning_rate": 0.0004181232413158842, + "loss": 0.83615345, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.30004883, + "step": 2941, + "time_per_iteration": 2.642336368560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108333, + "balance_loss_mlp": 1.05405188, + "epoch": 0.5659869180454021, + "flos": 667826754048.0, + "grad_norm": 0.06412651995290534, + "language_loss": 0.82513189, + "learning_rate": 0.0004178159203403179, + "loss": 0.83596516, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.29272461, + "step": 2942, + "time_per_iteration": 2.856449842453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082217, + "balance_loss_mlp": 1.05260575, + "epoch": 0.5661792997306656, + "flos": 499707864576.0, + "grad_norm": 0.056771241115104176, + "language_loss": 0.81273901, + "learning_rate": 0.0004175086312714409, + "loss": 0.82356119, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.2956543, + "step": 2943, + "time_per_iteration": 2.62709903717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088098, + "balance_loss_mlp": 1.05898714, + "epoch": 0.5663716814159292, + "flos": 600922271232.0, + "grad_norm": 0.050224853353863855, + "language_loss": 0.83679438, + "learning_rate": 0.00041720137422855366, + "loss": 0.84767538, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.29052734, + "step": 2944, + "time_per_iteration": 2.730576515197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086164, + "balance_loss_mlp": 1.05710077, + "epoch": 0.5665640631011928, + "flos": 540728193024.0, + "grad_norm": 0.0578384318096137, + "language_loss": 0.78684467, + "learning_rate": 0.00041689414933094383, + "loss": 0.79770631, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.2902832, + "step": 2945, + "time_per_iteration": 2.6317861080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084966, + "balance_loss_mlp": 1.05483007, + "epoch": 0.5667564447864564, + "flos": 601655592960.0, + "grad_norm": 0.061631419209263724, + "language_loss": 0.80986917, + "learning_rate": 0.00041658695669788653, + "loss": 0.82071877, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.30102539, + "step": 2946, + "time_per_iteration": 2.766889810562134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083037, + "balance_loss_mlp": 1.05352092, + "epoch": 0.5669488264717198, + "flos": 659224894464.0, + "grad_norm": 0.08686938236765575, + "language_loss": 0.81373537, + "learning_rate": 0.00041627979644864453, + "loss": 0.82456571, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.29467773, + "step": 2947, + "time_per_iteration": 2.7937870025634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085685, + "balance_loss_mlp": 1.0563122, + "epoch": 0.5671412081569834, + "flos": 485158035456.0, + "grad_norm": 0.05686002455066826, + "language_loss": 0.81299067, + "learning_rate": 0.0004159726687024683, + "loss": 0.82384753, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.29345703, + "step": 2948, + "time_per_iteration": 2.636784791946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083527, + "balance_loss_mlp": 1.05417752, + "epoch": 0.567333589842247, + "flos": 729487475712.0, + "grad_norm": 0.057207156589959604, + "language_loss": 0.7857877, + "learning_rate": 0.00041566557357859506, + "loss": 0.79662293, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.29321289, + "step": 2949, + "time_per_iteration": 2.8607821464538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080781, + "balance_loss_mlp": 1.05131269, + "epoch": 0.5675259715275106, + "flos": 968471258112.0, + "grad_norm": 0.050618871180039625, + "language_loss": 0.79166919, + "learning_rate": 0.0004153585111962502, + "loss": 0.802477, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.29443359, + "step": 2950, + "time_per_iteration": 3.306715250015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108342, + "balance_loss_mlp": 1.05387974, + "epoch": 0.5677183532127742, + "flos": 564879453696.0, + "grad_norm": 0.08196542197504524, + "language_loss": 0.84189069, + "learning_rate": 0.0004150514816746453, + "loss": 0.85272491, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.29492188, + "step": 2951, + "time_per_iteration": 2.6732659339904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080966, + "balance_loss_mlp": 1.05190265, + "epoch": 0.5679107348980377, + "flos": 551432503296.0, + "grad_norm": 0.06474663434913709, + "language_loss": 0.85581088, + "learning_rate": 0.0004147444851329802, + "loss": 0.86662048, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.29003906, + "step": 2952, + "time_per_iteration": 2.647568941116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079758, + "balance_loss_mlp": 1.05081391, + "epoch": 0.5681031165833013, + "flos": 819115863552.0, + "grad_norm": 0.0574748240063073, + "language_loss": 0.85410154, + "learning_rate": 0.00041443752169044126, + "loss": 0.8648991, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.28955078, + "step": 2953, + "time_per_iteration": 3.018815040588379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081341, + "balance_loss_mlp": 1.05227828, + "epoch": 0.5682954982685648, + "flos": 617731702272.0, + "grad_norm": 0.05380576703697579, + "language_loss": 0.846789, + "learning_rate": 0.0004141305914662025, + "loss": 0.85760248, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.29052734, + "step": 2954, + "time_per_iteration": 2.7356324195861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088016, + "balance_loss_mlp": 1.05807066, + "epoch": 0.5684878799538284, + "flos": 647625729024.0, + "grad_norm": 0.05392421630137883, + "language_loss": 0.80538452, + "learning_rate": 0.0004138236945794246, + "loss": 0.81626463, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.29907227, + "step": 2955, + "time_per_iteration": 2.8904106616973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082907, + "balance_loss_mlp": 1.05439222, + "epoch": 0.5686802616390919, + "flos": 805615068672.0, + "grad_norm": 0.07320613099583566, + "language_loss": 0.83898306, + "learning_rate": 0.00041351683114925576, + "loss": 0.84981215, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.28491211, + "step": 2956, + "time_per_iteration": 3.0756330490112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085945, + "balance_loss_mlp": 1.05683398, + "epoch": 0.5688726433243555, + "flos": 546882776064.0, + "grad_norm": 0.05933823821942172, + "language_loss": 0.86556458, + "learning_rate": 0.0004132100012948308, + "loss": 0.87642407, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.29077148, + "step": 2957, + "time_per_iteration": 2.6803860664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085401, + "balance_loss_mlp": 1.05614674, + "epoch": 0.5690650250096191, + "flos": 486324933120.0, + "grad_norm": 0.06187903851247569, + "language_loss": 0.84050244, + "learning_rate": 0.00041290320513527145, + "loss": 0.85135645, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.29248047, + "step": 2958, + "time_per_iteration": 2.54225754737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084682, + "balance_loss_mlp": 1.05545211, + "epoch": 0.5692574066948827, + "flos": 577184237568.0, + "grad_norm": 0.04955077863713089, + "language_loss": 0.85089266, + "learning_rate": 0.0004125964427896867, + "loss": 0.86173952, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.29199219, + "step": 2959, + "time_per_iteration": 2.716848611831665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083431, + "balance_loss_mlp": 1.0530802, + "epoch": 0.5694497883801463, + "flos": 454005388800.0, + "grad_norm": 0.0635030186812047, + "language_loss": 0.79277623, + "learning_rate": 0.0004122897143771723, + "loss": 0.80361056, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.30297852, + "step": 2960, + "time_per_iteration": 2.53230357170105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086179, + "balance_loss_mlp": 1.05628169, + "epoch": 0.5696421700654097, + "flos": 559251578880.0, + "grad_norm": 0.052407613892641675, + "language_loss": 0.81192493, + "learning_rate": 0.0004119830200168109, + "loss": 0.82278675, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.29858398, + "step": 2961, + "time_per_iteration": 2.684126377105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083551, + "balance_loss_mlp": 1.05355775, + "epoch": 0.5698345517506733, + "flos": 465314982912.0, + "grad_norm": 0.06121192976286501, + "language_loss": 0.88053119, + "learning_rate": 0.0004116763598276714, + "loss": 0.89136672, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.29956055, + "step": 2962, + "time_per_iteration": 2.531313180923462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108181, + "balance_loss_mlp": 1.05138803, + "epoch": 0.5700269334359369, + "flos": 605645116416.0, + "grad_norm": 0.069996546899228, + "language_loss": 0.8081792, + "learning_rate": 0.00041136973392881017, + "loss": 0.81899732, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.30395508, + "step": 2963, + "time_per_iteration": 2.8093085289001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083477, + "balance_loss_mlp": 1.05357933, + "epoch": 0.5702193151212005, + "flos": 562423412736.0, + "grad_norm": 0.06390032386968057, + "language_loss": 0.8227576, + "learning_rate": 0.00041106314243926983, + "loss": 0.8335923, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.29858398, + "step": 2964, + "time_per_iteration": 2.740004062652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080188, + "balance_loss_mlp": 1.05062366, + "epoch": 0.570411696806464, + "flos": 522983208960.0, + "grad_norm": 0.060533570265575896, + "language_loss": 0.87250763, + "learning_rate": 0.0004107565854780798, + "loss": 0.88330954, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.29516602, + "step": 2965, + "time_per_iteration": 2.6749136447906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080245, + "balance_loss_mlp": 1.05111039, + "epoch": 0.5706040784917276, + "flos": 717911631360.0, + "grad_norm": 0.06664541213513904, + "language_loss": 0.80888879, + "learning_rate": 0.000410450063164256, + "loss": 0.81969118, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.29077148, + "step": 2966, + "time_per_iteration": 2.8448963165283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081067, + "balance_loss_mlp": 1.05081153, + "epoch": 0.5707964601769911, + "flos": 476467425792.0, + "grad_norm": 0.06804112412049489, + "language_loss": 0.82108605, + "learning_rate": 0.00041014357561680115, + "loss": 0.83189678, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.30200195, + "step": 2967, + "time_per_iteration": 2.5226550102233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084124, + "balance_loss_mlp": 1.0544889, + "epoch": 0.5709888418622547, + "flos": 579823570944.0, + "grad_norm": 0.059986306134107735, + "language_loss": 0.86107051, + "learning_rate": 0.0004098371229547039, + "loss": 0.87191176, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.29589844, + "step": 2968, + "time_per_iteration": 2.7232651710510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046193, + "balance_loss_mlp": 1.03398585, + "epoch": 0.5711812235475183, + "flos": 1579108290048.0, + "grad_norm": 0.025451731838023718, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81057, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.12207031, + "step": 2969, + "time_per_iteration": 4.785320997238159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082869, + "balance_loss_mlp": 1.05330527, + "epoch": 0.5713736052327818, + "flos": 468259854336.0, + "grad_norm": 0.07178133530641487, + "language_loss": 0.80500889, + "learning_rate": 0.00040922432276247107, + "loss": 0.81583756, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.29516602, + "step": 2970, + "time_per_iteration": 2.5877230167388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086085, + "balance_loss_mlp": 1.05635428, + "epoch": 0.5715659869180454, + "flos": 537390443520.0, + "grad_norm": 0.05561639186548029, + "language_loss": 0.84452176, + "learning_rate": 0.0004089179754702457, + "loss": 0.85538256, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.29663086, + "step": 2971, + "time_per_iteration": 2.759932041168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084469, + "balance_loss_mlp": 1.05469072, + "epoch": 0.571758368603309, + "flos": 655778045952.0, + "grad_norm": 0.05716809371830958, + "language_loss": 0.79499936, + "learning_rate": 0.00040861166353919843, + "loss": 0.80584407, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.29711914, + "step": 2972, + "time_per_iteration": 2.856147050857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080407, + "balance_loss_mlp": 1.05213094, + "epoch": 0.5719507502885726, + "flos": 667609966080.0, + "grad_norm": 0.054720530113361164, + "language_loss": 0.81279707, + "learning_rate": 0.00040830538708824983, + "loss": 0.82360113, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.28295898, + "step": 2973, + "time_per_iteration": 2.9099643230438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083682, + "balance_loss_mlp": 1.05414152, + "epoch": 0.572143131973836, + "flos": 476083312128.0, + "grad_norm": 0.059341772904328634, + "language_loss": 0.81557322, + "learning_rate": 0.000407999146236307, + "loss": 0.82641, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.29492188, + "step": 2974, + "time_per_iteration": 2.5506579875946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087372, + "balance_loss_mlp": 1.05807054, + "epoch": 0.5723355136590996, + "flos": 539255757312.0, + "grad_norm": 0.05823834072467256, + "language_loss": 0.8320694, + "learning_rate": 0.0004076929411022634, + "loss": 0.84294319, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.29248047, + "step": 2975, + "time_per_iteration": 2.6159043312072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081032, + "balance_loss_mlp": 1.05125356, + "epoch": 0.5725278953443632, + "flos": 823784864256.0, + "grad_norm": 0.059359253337435705, + "language_loss": 0.79102635, + "learning_rate": 0.0004073867718049982, + "loss": 0.80183673, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.29736328, + "step": 2976, + "time_per_iteration": 3.104320526123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087781, + "balance_loss_mlp": 1.05745435, + "epoch": 0.5727202770296268, + "flos": 587155226112.0, + "grad_norm": 0.06002278348442279, + "language_loss": 0.82387239, + "learning_rate": 0.00040708063846337704, + "loss": 0.83475018, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.30273438, + "step": 2977, + "time_per_iteration": 2.7141377925872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088437, + "balance_loss_mlp": 1.05906403, + "epoch": 0.5729126587148904, + "flos": 446723195904.0, + "grad_norm": 0.05629415234265891, + "language_loss": 0.81140733, + "learning_rate": 0.00040677454119625143, + "loss": 0.82229173, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.29321289, + "step": 2978, + "time_per_iteration": 2.5579118728637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079501, + "balance_loss_mlp": 1.04967451, + "epoch": 0.5731050404001539, + "flos": 519206091264.0, + "grad_norm": 0.06287623577372331, + "language_loss": 0.82978582, + "learning_rate": 0.0004064684801224587, + "loss": 0.84058082, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.2980957, + "step": 2979, + "time_per_iteration": 2.6184630393981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080607, + "balance_loss_mlp": 1.05047131, + "epoch": 0.5732974220854175, + "flos": 504528224256.0, + "grad_norm": 0.049858532305801305, + "language_loss": 0.80364764, + "learning_rate": 0.00040616245536082224, + "loss": 0.81445372, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.30078125, + "step": 2980, + "time_per_iteration": 2.605652093887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075472, + "balance_loss_mlp": 1.04602742, + "epoch": 0.573489803770681, + "flos": 592187991552.0, + "grad_norm": 0.05649585275193457, + "language_loss": 0.81399214, + "learning_rate": 0.00040585646703015165, + "loss": 0.82474685, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.29418945, + "step": 2981, + "time_per_iteration": 2.8440651893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081482, + "balance_loss_mlp": 1.05103636, + "epoch": 0.5736821854559446, + "flos": 489672857088.0, + "grad_norm": 0.0633133856450646, + "language_loss": 0.78068441, + "learning_rate": 0.0004055505152492419, + "loss": 0.79149926, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.30419922, + "step": 2982, + "time_per_iteration": 2.7125117778778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076312, + "balance_loss_mlp": 1.0467, + "epoch": 0.5738745671412081, + "flos": 457895987712.0, + "grad_norm": 0.057765721767923175, + "language_loss": 0.74208528, + "learning_rate": 0.00040524460013687425, + "loss": 0.75284839, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.29589844, + "step": 2983, + "time_per_iteration": 2.7232775688171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080701, + "balance_loss_mlp": 1.05151832, + "epoch": 0.5740669488264717, + "flos": 580012655616.0, + "grad_norm": 0.049591997410844156, + "language_loss": 0.81157619, + "learning_rate": 0.0004049387218118155, + "loss": 0.82238322, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.29199219, + "step": 2984, + "time_per_iteration": 2.956636428833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080147, + "balance_loss_mlp": 1.04934323, + "epoch": 0.5742593305117353, + "flos": 524155898880.0, + "grad_norm": 0.06847869877575175, + "language_loss": 0.84987867, + "learning_rate": 0.00040463288039281777, + "loss": 0.8606801, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.30761719, + "step": 2985, + "time_per_iteration": 2.7503554821014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013561, + "balance_loss_mlp": 1.00078201, + "epoch": 0.5744517121969989, + "flos": 1553033488896.0, + "grad_norm": 0.012095267017415088, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78889978, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.12792969, + "step": 2986, + "time_per_iteration": 5.030332565307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079255, + "balance_loss_mlp": 1.04981041, + "epoch": 0.5746440938822625, + "flos": 751600304640.0, + "grad_norm": 0.055809040190366505, + "language_loss": 0.82136881, + "learning_rate": 0.0004040213087479444, + "loss": 0.83216131, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.29443359, + "step": 2987, + "time_per_iteration": 2.926941156387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087088, + "balance_loss_mlp": 1.05816782, + "epoch": 0.5748364755675259, + "flos": 501618258432.0, + "grad_norm": 0.06868722002267488, + "language_loss": 0.85331053, + "learning_rate": 0.0004037155787595018, + "loss": 0.8641814, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.28857422, + "step": 2988, + "time_per_iteration": 2.561497211456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085606, + "balance_loss_mlp": 1.05599451, + "epoch": 0.5750288572527895, + "flos": 503757024768.0, + "grad_norm": 0.05119655910511677, + "language_loss": 0.80321741, + "learning_rate": 0.000403409886151987, + "loss": 0.81407344, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.29589844, + "step": 2989, + "time_per_iteration": 2.9114019870758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013296, + "balance_loss_mlp": 1.00061202, + "epoch": 0.5752212389380531, + "flos": 1540541030400.0, + "grad_norm": 0.008836939301122537, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83012402, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.12695312, + "step": 2990, + "time_per_iteration": 4.770756483078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013357, + "balance_loss_mlp": 1.00086439, + "epoch": 0.5754136206233167, + "flos": 1566499378176.0, + "grad_norm": 0.007697309180098509, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79211962, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.125, + "step": 2991, + "time_per_iteration": 4.786288499832153 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083775, + "balance_loss_mlp": 1.05537939, + "epoch": 0.5756060023085803, + "flos": 797806167552.0, + "grad_norm": 0.05348004588160335, + "language_loss": 0.76926208, + "learning_rate": 0.00040249303380173807, + "loss": 0.78009981, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.28369141, + "step": 2992, + "time_per_iteration": 3.0660438537597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010851, + "balance_loss_mlp": 1.05629849, + "epoch": 0.5757983839938438, + "flos": 587588802048.0, + "grad_norm": 0.06048493616630367, + "language_loss": 0.79311389, + "learning_rate": 0.00040218749190459126, + "loss": 0.80396485, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.28808594, + "step": 2993, + "time_per_iteration": 2.7251527309417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084541, + "balance_loss_mlp": 1.05514371, + "epoch": 0.5759907656791073, + "flos": 516576932352.0, + "grad_norm": 0.0697186971943442, + "language_loss": 0.82477212, + "learning_rate": 0.00040188198798162775, + "loss": 0.83561754, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.29370117, + "step": 2994, + "time_per_iteration": 2.6159136295318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081131, + "balance_loss_mlp": 1.05147123, + "epoch": 0.5761831473643709, + "flos": 586845305856.0, + "grad_norm": 0.057556686362034246, + "language_loss": 0.85848254, + "learning_rate": 0.000401576522151455, + "loss": 0.86929381, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.29614258, + "step": 2995, + "time_per_iteration": 2.811438798904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108636, + "balance_loss_mlp": 1.05775023, + "epoch": 0.5763755290496345, + "flos": 543619219968.0, + "grad_norm": 0.04540215088386673, + "language_loss": 0.82446247, + "learning_rate": 0.0004012710945326651, + "loss": 0.83532608, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.28613281, + "step": 2996, + "time_per_iteration": 2.778818368911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108663, + "balance_loss_mlp": 1.05790055, + "epoch": 0.576567910734898, + "flos": 625930509312.0, + "grad_norm": 0.049519109180824444, + "language_loss": 0.81129038, + "learning_rate": 0.0004009657052438355, + "loss": 0.82215673, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.28686523, + "step": 2997, + "time_per_iteration": 2.8787920475006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094954, + "balance_loss_mlp": 1.06612968, + "epoch": 0.5767602924201616, + "flos": 537985552896.0, + "grad_norm": 0.05906428447956742, + "language_loss": 0.85482752, + "learning_rate": 0.00040066035440352904, + "loss": 0.86577708, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.2878418, + "step": 2998, + "time_per_iteration": 2.634565591812134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045759, + "balance_loss_mlp": 1.03379035, + "epoch": 0.5769526741054252, + "flos": 1558969873920.0, + "grad_norm": 0.021537766013807906, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80338895, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.11962891, + "step": 2999, + "time_per_iteration": 4.964475393295288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090784, + "balance_loss_mlp": 1.06248331, + "epoch": 0.5771450557906888, + "flos": 467939759616.0, + "grad_norm": 0.06837432109358414, + "language_loss": 0.75964624, + "learning_rate": 0.00040004976854266145, + "loss": 0.77055407, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.28295898, + "step": 3000, + "time_per_iteration": 2.5489282608032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089485, + "balance_loss_mlp": 1.06006408, + "epoch": 0.5773374374759523, + "flos": 574288828416.0, + "grad_norm": 0.0545980885089623, + "language_loss": 0.81222647, + "learning_rate": 0.0003997445337591505, + "loss": 0.82312131, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.29370117, + "step": 3001, + "time_per_iteration": 2.6890947818756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108546, + "balance_loss_mlp": 1.05680251, + "epoch": 0.5775298191612158, + "flos": 528216795648.0, + "grad_norm": 0.06583721131765849, + "language_loss": 0.74093473, + "learning_rate": 0.0003994393378982635, + "loss": 0.75178933, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.28662109, + "step": 3002, + "time_per_iteration": 2.596644401550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043122, + "balance_loss_mlp": 1.03153443, + "epoch": 0.5777222008464794, + "flos": 1303178070528.0, + "grad_norm": 0.017943105040569007, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80581129, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.11572266, + "step": 3003, + "time_per_iteration": 4.826138257980347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085564, + "balance_loss_mlp": 1.05666792, + "epoch": 0.577914582531743, + "flos": 603344816640.0, + "grad_norm": 0.058273014851323426, + "language_loss": 0.87901747, + "learning_rate": 0.0003988290634182961, + "loss": 0.88987309, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.28881836, + "step": 3004, + "time_per_iteration": 2.7604172229766846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088001, + "balance_loss_mlp": 1.06015372, + "epoch": 0.5781069642170066, + "flos": 486537338880.0, + "grad_norm": 0.06327449394997672, + "language_loss": 0.80677181, + "learning_rate": 0.0003985239850361453, + "loss": 0.81765187, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.27856445, + "step": 3005, + "time_per_iteration": 2.5994105339050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091534, + "balance_loss_mlp": 1.06256592, + "epoch": 0.5782993459022701, + "flos": 506016626688.0, + "grad_norm": 0.057065414052448256, + "language_loss": 0.84621793, + "learning_rate": 0.0003982189460504777, + "loss": 0.85713327, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.28930664, + "step": 3006, + "time_per_iteration": 2.722778797149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091061, + "balance_loss_mlp": 1.06261778, + "epoch": 0.5784917275875336, + "flos": 601872380928.0, + "grad_norm": 0.0654169545720973, + "language_loss": 0.79183024, + "learning_rate": 0.00039791394657971935, + "loss": 0.80274087, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.28442383, + "step": 3007, + "time_per_iteration": 2.7318689823150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089506, + "balance_loss_mlp": 1.06056237, + "epoch": 0.5786841092727972, + "flos": 521279428608.0, + "grad_norm": 0.06429658550493057, + "language_loss": 0.84402883, + "learning_rate": 0.00039760898674228205, + "loss": 0.85492396, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.28930664, + "step": 3008, + "time_per_iteration": 2.6548941135406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087482, + "balance_loss_mlp": 1.05884826, + "epoch": 0.5788764909580608, + "flos": 767047809024.0, + "grad_norm": 0.0525681924040606, + "language_loss": 0.80782068, + "learning_rate": 0.0003973040666565613, + "loss": 0.81869543, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.28588867, + "step": 3009, + "time_per_iteration": 3.065049171447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087663, + "balance_loss_mlp": 1.05972004, + "epoch": 0.5790688726433244, + "flos": 598786324992.0, + "grad_norm": 0.058928126410829465, + "language_loss": 0.81879556, + "learning_rate": 0.000396999186440938, + "loss": 0.82967222, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.27954102, + "step": 3010, + "time_per_iteration": 2.860755205154419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086781, + "balance_loss_mlp": 1.05871928, + "epoch": 0.5792612543285879, + "flos": 522805708800.0, + "grad_norm": 0.06775550082118927, + "language_loss": 0.84739363, + "learning_rate": 0.000396694346213777, + "loss": 0.85826147, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.28076172, + "step": 3011, + "time_per_iteration": 2.591801643371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077556, + "balance_loss_mlp": 1.04815888, + "epoch": 0.5794536360138515, + "flos": 876178805760.0, + "grad_norm": 0.09075774540794283, + "language_loss": 0.83682388, + "learning_rate": 0.0003963895460934276, + "loss": 0.84759945, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.29370117, + "step": 3012, + "time_per_iteration": 3.1549274921417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080769, + "balance_loss_mlp": 1.05242133, + "epoch": 0.5796460176991151, + "flos": 401221541376.0, + "grad_norm": 0.07824771870324425, + "language_loss": 0.85031927, + "learning_rate": 0.00039608478619822376, + "loss": 0.86112702, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.28344727, + "step": 3013, + "time_per_iteration": 2.436859369277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108003, + "balance_loss_mlp": 1.05091906, + "epoch": 0.5798383993843786, + "flos": 618229297152.0, + "grad_norm": 0.07454312954276684, + "language_loss": 0.82720006, + "learning_rate": 0.00039578006664648394, + "loss": 0.83800036, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.29125977, + "step": 3014, + "time_per_iteration": 2.813934326171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082643, + "balance_loss_mlp": 1.05350864, + "epoch": 0.5800307810696421, + "flos": 843966950400.0, + "grad_norm": 0.07429538018047967, + "language_loss": 0.81169355, + "learning_rate": 0.0003954753875565105, + "loss": 0.82251996, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.29101562, + "step": 3015, + "time_per_iteration": 3.089141607284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075928, + "balance_loss_mlp": 1.04674578, + "epoch": 0.5802231627549057, + "flos": 569005779456.0, + "grad_norm": 0.053240000714227444, + "language_loss": 0.8237859, + "learning_rate": 0.00039517074904659057, + "loss": 0.8345452, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.29125977, + "step": 3016, + "time_per_iteration": 2.7315711975097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081141, + "balance_loss_mlp": 1.05217314, + "epoch": 0.5804155444401693, + "flos": 660160447488.0, + "grad_norm": 0.0618256833307492, + "language_loss": 0.84621388, + "learning_rate": 0.00039486615123499535, + "loss": 0.85702527, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.28955078, + "step": 3017, + "time_per_iteration": 2.870152235031128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082579, + "balance_loss_mlp": 1.05342066, + "epoch": 0.5806079261254329, + "flos": 513726603264.0, + "grad_norm": 0.06092979313789558, + "language_loss": 0.85065556, + "learning_rate": 0.00039456159423997996, + "loss": 0.86148143, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.29125977, + "step": 3018, + "time_per_iteration": 2.6494932174682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078264, + "balance_loss_mlp": 1.04867649, + "epoch": 0.5808003078106965, + "flos": 528379739136.0, + "grad_norm": 0.05170574080230249, + "language_loss": 0.89520943, + "learning_rate": 0.00039425707817978406, + "loss": 0.90599209, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.29541016, + "step": 3019, + "time_per_iteration": 2.690485715866089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078627, + "balance_loss_mlp": 1.04894376, + "epoch": 0.58099268949596, + "flos": 476787520512.0, + "grad_norm": 0.06031161665678942, + "language_loss": 0.83372945, + "learning_rate": 0.00039395260317263124, + "loss": 0.84451568, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.29663086, + "step": 3020, + "time_per_iteration": 2.677818775177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076598, + "balance_loss_mlp": 1.0466764, + "epoch": 0.5811850711812235, + "flos": 517340777472.0, + "grad_norm": 0.056782275650517425, + "language_loss": 0.84907949, + "learning_rate": 0.0003936481693367291, + "loss": 0.8598454, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.29882812, + "step": 3021, + "time_per_iteration": 2.647017002105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084791, + "balance_loss_mlp": 1.05491698, + "epoch": 0.5813774528664871, + "flos": 616122464256.0, + "grad_norm": 0.06733027879749674, + "language_loss": 0.87502337, + "learning_rate": 0.0003933437767902697, + "loss": 0.88587123, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.29833984, + "step": 3022, + "time_per_iteration": 2.825965166091919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085273, + "balance_loss_mlp": 1.05706787, + "epoch": 0.5815698345517507, + "flos": 567194310144.0, + "grad_norm": 0.07318564796931465, + "language_loss": 0.78165317, + "learning_rate": 0.00039303942565142825, + "loss": 0.79250592, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.28222656, + "step": 3023, + "time_per_iteration": 2.7315845489501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087422, + "balance_loss_mlp": 1.0569042, + "epoch": 0.5817622162370142, + "flos": 562886102016.0, + "grad_norm": 0.052544940996134284, + "language_loss": 0.76741624, + "learning_rate": 0.0003927351160383644, + "loss": 0.77829051, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.3046875, + "step": 3024, + "time_per_iteration": 2.789477825164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085705, + "balance_loss_mlp": 1.05609322, + "epoch": 0.5819545979222778, + "flos": 458982899712.0, + "grad_norm": 0.07634686348045291, + "language_loss": 0.77796662, + "learning_rate": 0.000392430848069222, + "loss": 0.78882366, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.29589844, + "step": 3025, + "time_per_iteration": 2.5446279048919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085632, + "balance_loss_mlp": 1.05549598, + "epoch": 0.5821469796075414, + "flos": 541215613440.0, + "grad_norm": 0.05528071963535831, + "language_loss": 0.82223105, + "learning_rate": 0.00039212662186212795, + "loss": 0.83308738, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.30078125, + "step": 3026, + "time_per_iteration": 2.60878849029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079586, + "balance_loss_mlp": 1.04883003, + "epoch": 0.582339361292805, + "flos": 551994117120.0, + "grad_norm": 0.05052748911564131, + "language_loss": 0.76906562, + "learning_rate": 0.0003918224375351934, + "loss": 0.77986145, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.30737305, + "step": 3027, + "time_per_iteration": 2.709887742996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083313, + "balance_loss_mlp": 1.05384469, + "epoch": 0.5825317429780685, + "flos": 496138770432.0, + "grad_norm": 0.05874903473435042, + "language_loss": 0.78473544, + "learning_rate": 0.0003915182952065135, + "loss": 0.79556859, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.29418945, + "step": 3028, + "time_per_iteration": 2.6885859966278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082208, + "balance_loss_mlp": 1.05250072, + "epoch": 0.582724124663332, + "flos": 563890056192.0, + "grad_norm": 0.06824855227929012, + "language_loss": 0.8751812, + "learning_rate": 0.0003912141949941664, + "loss": 0.88600326, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.296875, + "step": 3029, + "time_per_iteration": 2.7145774364471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087726, + "balance_loss_mlp": 1.05799532, + "epoch": 0.5829165063485956, + "flos": 491888788992.0, + "grad_norm": 0.07682913079591057, + "language_loss": 0.82808822, + "learning_rate": 0.0003909101370162143, + "loss": 0.83896548, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.29711914, + "step": 3030, + "time_per_iteration": 2.6085238456726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063086, + "balance_loss_mlp": 1.05116475, + "epoch": 0.5831088880338592, + "flos": 1528134501888.0, + "grad_norm": 0.03433679117263603, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73496974, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.11914062, + "step": 3031, + "time_per_iteration": 4.894438028335571 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080641, + "balance_loss_mlp": 1.05076766, + "epoch": 0.5833012697191228, + "flos": 617712763392.0, + "grad_norm": 0.0542485247275347, + "language_loss": 0.8270607, + "learning_rate": 0.0003903021482356622, + "loss": 0.83786714, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.29833984, + "step": 3032, + "time_per_iteration": 2.8060503005981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079071, + "balance_loss_mlp": 1.04924476, + "epoch": 0.5834936514043862, + "flos": 767578899456.0, + "grad_norm": 0.06913224268253564, + "language_loss": 0.8243112, + "learning_rate": 0.00038999821766910465, + "loss": 0.8351019, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.2980957, + "step": 3033, + "time_per_iteration": 3.013117551803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079849, + "balance_loss_mlp": 1.04992783, + "epoch": 0.5836860330896498, + "flos": 458136096768.0, + "grad_norm": 0.06539568057172108, + "language_loss": 0.85596031, + "learning_rate": 0.00038969432980902606, + "loss": 0.86675882, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.29907227, + "step": 3034, + "time_per_iteration": 2.602159261703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047537, + "balance_loss_mlp": 1.03642654, + "epoch": 0.5838784147749134, + "flos": 1360485504000.0, + "grad_norm": 0.02505289654727371, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.8083204, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.11132812, + "step": 3035, + "time_per_iteration": 4.8551225662231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086213, + "balance_loss_mlp": 1.05664897, + "epoch": 0.584070796460177, + "flos": 566942616576.0, + "grad_norm": 0.05971096981290547, + "language_loss": 0.82545829, + "learning_rate": 0.00038908668268020953, + "loss": 0.8363204, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.29516602, + "step": 3036, + "time_per_iteration": 2.6712634563446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084003, + "balance_loss_mlp": 1.05455875, + "epoch": 0.5842631781454406, + "flos": 611188623360.0, + "grad_norm": 0.06020630991976339, + "language_loss": 0.84750116, + "learning_rate": 0.00038878292364738097, + "loss": 0.85834116, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.29418945, + "step": 3037, + "time_per_iteration": 2.774688959121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087202, + "balance_loss_mlp": 1.05785298, + "epoch": 0.5844555598307041, + "flos": 463148513280.0, + "grad_norm": 0.06330434972052289, + "language_loss": 0.87235534, + "learning_rate": 0.0003884792077928508, + "loss": 0.88322735, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.29345703, + "step": 3038, + "time_per_iteration": 2.511212110519409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088952, + "balance_loss_mlp": 1.05957842, + "epoch": 0.5846479415159677, + "flos": 410005283328.0, + "grad_norm": 0.089824175631678, + "language_loss": 0.76556516, + "learning_rate": 0.0003881755352345322, + "loss": 0.77645469, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.29345703, + "step": 3039, + "time_per_iteration": 2.5297422409057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108977, + "balance_loss_mlp": 1.06039691, + "epoch": 0.5848403232012312, + "flos": 491056542720.0, + "grad_norm": 0.05409760120739159, + "language_loss": 0.8652333, + "learning_rate": 0.0003878719060903207, + "loss": 0.87613106, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.29345703, + "step": 3040, + "time_per_iteration": 2.5606369972229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108437, + "balance_loss_mlp": 1.05447245, + "epoch": 0.5850327048864948, + "flos": 584146335744.0, + "grad_norm": 0.07864155094531469, + "language_loss": 0.83092105, + "learning_rate": 0.0003875683204780961, + "loss": 0.84176469, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.29833984, + "step": 3041, + "time_per_iteration": 2.7069876194000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091497, + "balance_loss_mlp": 1.06128943, + "epoch": 0.5852250865717584, + "flos": 651253049856.0, + "grad_norm": 0.07084084705837652, + "language_loss": 0.85393965, + "learning_rate": 0.00038726477851572043, + "loss": 0.86485463, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.30175781, + "step": 3042, + "time_per_iteration": 2.785623788833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086169, + "balance_loss_mlp": 1.0566287, + "epoch": 0.5854174682570219, + "flos": 534332090880.0, + "grad_norm": 0.06883779110535396, + "language_loss": 0.80354905, + "learning_rate": 0.0003869612803210395, + "loss": 0.81441069, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.29541016, + "step": 3043, + "time_per_iteration": 2.635880708694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075998, + "balance_loss_mlp": 1.04643369, + "epoch": 0.5856098499422855, + "flos": 509501352960.0, + "grad_norm": 0.0705585022393511, + "language_loss": 0.83492166, + "learning_rate": 0.0003866578260118817, + "loss": 0.84568161, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.29541016, + "step": 3044, + "time_per_iteration": 2.58337664604187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074571, + "balance_loss_mlp": 1.04491138, + "epoch": 0.5858022316275491, + "flos": 593619729408.0, + "grad_norm": 0.06598081480709424, + "language_loss": 0.83220106, + "learning_rate": 0.0003863544157060581, + "loss": 0.84294677, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.29614258, + "step": 3045, + "time_per_iteration": 2.66916561126709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079474, + "balance_loss_mlp": 1.04998136, + "epoch": 0.5859946133128127, + "flos": 558829587456.0, + "grad_norm": 0.05207738102195899, + "language_loss": 0.82137144, + "learning_rate": 0.0003860510495213634, + "loss": 0.83216619, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.29492188, + "step": 3046, + "time_per_iteration": 2.8170437812805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072174, + "balance_loss_mlp": 1.04256272, + "epoch": 0.5861869949980761, + "flos": 553431647232.0, + "grad_norm": 0.07713217072038757, + "language_loss": 0.78373164, + "learning_rate": 0.0003857477275755746, + "loss": 0.79445338, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.29589844, + "step": 3047, + "time_per_iteration": 2.639801502227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077446, + "balance_loss_mlp": 1.04678559, + "epoch": 0.5863793766833397, + "flos": 718321886208.0, + "grad_norm": 0.05564403415338841, + "language_loss": 0.84011877, + "learning_rate": 0.00038544444998645167, + "loss": 0.8508932, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.30639648, + "step": 3048, + "time_per_iteration": 3.007289409637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076921, + "balance_loss_mlp": 1.04754782, + "epoch": 0.5865717583686033, + "flos": 472041354240.0, + "grad_norm": 0.06801965614795764, + "language_loss": 0.81586641, + "learning_rate": 0.00038514121687173767, + "loss": 0.8266356, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.29345703, + "step": 3049, + "time_per_iteration": 2.637277603149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072965, + "balance_loss_mlp": 1.04397368, + "epoch": 0.5867641400538669, + "flos": 813143162880.0, + "grad_norm": 0.0576990751755922, + "language_loss": 0.81892288, + "learning_rate": 0.00038483802834915807, + "loss": 0.82965243, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.28979492, + "step": 3050, + "time_per_iteration": 2.975592613220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075399, + "balance_loss_mlp": 1.04607356, + "epoch": 0.5869565217391305, + "flos": 486285645312.0, + "grad_norm": 0.09338183491699942, + "language_loss": 0.78599441, + "learning_rate": 0.00038453488453642074, + "loss": 0.79674846, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.29296875, + "step": 3051, + "time_per_iteration": 2.668680429458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107507, + "balance_loss_mlp": 1.04581618, + "epoch": 0.587148903424394, + "flos": 569104704000.0, + "grad_norm": 0.18186948375192843, + "language_loss": 0.86825669, + "learning_rate": 0.00038423178555121697, + "loss": 0.87900746, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.29223633, + "step": 3052, + "time_per_iteration": 2.7119386196136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080518, + "balance_loss_mlp": 1.05202711, + "epoch": 0.5873412851096576, + "flos": 746948680704.0, + "grad_norm": 0.05190046933032045, + "language_loss": 0.85228276, + "learning_rate": 0.00038392873151121994, + "loss": 0.86308795, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.28466797, + "step": 3053, + "time_per_iteration": 3.0532052516937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075316, + "balance_loss_mlp": 1.04615784, + "epoch": 0.5875336667949211, + "flos": 527882144256.0, + "grad_norm": 0.06073215036153007, + "language_loss": 0.830441, + "learning_rate": 0.0003836257225340859, + "loss": 0.84119415, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.29125977, + "step": 3054, + "time_per_iteration": 2.6791739463806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077784, + "balance_loss_mlp": 1.04922152, + "epoch": 0.5877260484801847, + "flos": 823799420928.0, + "grad_norm": 0.053654559033963406, + "language_loss": 0.82283098, + "learning_rate": 0.00038332275873745336, + "loss": 0.83360887, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.28564453, + "step": 3055, + "time_per_iteration": 3.0826737880706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085261, + "balance_loss_mlp": 1.05646038, + "epoch": 0.5879184301654482, + "flos": 591325221888.0, + "grad_norm": 0.07874067829632751, + "language_loss": 0.82649648, + "learning_rate": 0.0003830198402389431, + "loss": 0.83734912, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.28759766, + "step": 3056, + "time_per_iteration": 2.71244215965271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080719, + "balance_loss_mlp": 1.06755841, + "epoch": 0.5881108118507118, + "flos": 1544953955328.0, + "grad_norm": 0.03508304466376378, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78429663, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.13183594, + "step": 3057, + "time_per_iteration": 4.991718053817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087181, + "balance_loss_mlp": 1.05900002, + "epoch": 0.5883031935359754, + "flos": 489348380160.0, + "grad_norm": 0.0604575145753954, + "language_loss": 0.83162987, + "learning_rate": 0.0003824141396066855, + "loss": 0.84250164, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.28198242, + "step": 3058, + "time_per_iteration": 2.62410044670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095213, + "balance_loss_mlp": 1.06605411, + "epoch": 0.588495575221239, + "flos": 582551654400.0, + "grad_norm": 0.05748148757470156, + "language_loss": 0.83195531, + "learning_rate": 0.000382111357708092, + "loss": 0.84290743, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.29125977, + "step": 3059, + "time_per_iteration": 2.741142511367798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099933, + "balance_loss_mlp": 1.07113242, + "epoch": 0.5886879569065026, + "flos": 660751174656.0, + "grad_norm": 0.07210182052791281, + "language_loss": 0.83736324, + "learning_rate": 0.00038180862157792864, + "loss": 0.84836257, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.28808594, + "step": 3060, + "time_per_iteration": 2.8028531074523926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095663, + "balance_loss_mlp": 1.06733847, + "epoch": 0.588880338591766, + "flos": 562392889344.0, + "grad_norm": 0.06185538750618477, + "language_loss": 0.82032192, + "learning_rate": 0.0003815059313337279, + "loss": 0.83127856, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.28295898, + "step": 3061, + "time_per_iteration": 2.661663055419922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092581, + "balance_loss_mlp": 1.0641377, + "epoch": 0.5890727202770296, + "flos": 554451568128.0, + "grad_norm": 0.054152956568787894, + "language_loss": 0.78217703, + "learning_rate": 0.00038120328709300436, + "loss": 0.7931028, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.28466797, + "step": 3062, + "time_per_iteration": 2.8524019718170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110051, + "balance_loss_mlp": 1.0717572, + "epoch": 0.5892651019622932, + "flos": 655226606592.0, + "grad_norm": 0.07045144115382113, + "language_loss": 0.83619386, + "learning_rate": 0.0003809006889732549, + "loss": 0.84719896, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.28759766, + "step": 3063, + "time_per_iteration": 2.818297863006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093698, + "balance_loss_mlp": 1.06554079, + "epoch": 0.5894574836475568, + "flos": 452970911232.0, + "grad_norm": 0.07166208719676233, + "language_loss": 0.87752122, + "learning_rate": 0.0003805981370919589, + "loss": 0.88845825, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.28173828, + "step": 3064, + "time_per_iteration": 2.523397445678711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091821, + "balance_loss_mlp": 1.06352103, + "epoch": 0.5896498653328203, + "flos": 518763750912.0, + "grad_norm": 0.052273370645306905, + "language_loss": 0.83554685, + "learning_rate": 0.0003802956315665771, + "loss": 0.84646511, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.28320312, + "step": 3065, + "time_per_iteration": 2.7017621994018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091683, + "balance_loss_mlp": 1.06428885, + "epoch": 0.5898422470180839, + "flos": 548793169920.0, + "grad_norm": 0.09115739101573021, + "language_loss": 0.81856883, + "learning_rate": 0.0003799931725145529, + "loss": 0.82948571, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.27416992, + "step": 3066, + "time_per_iteration": 2.6396725177764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091771, + "balance_loss_mlp": 1.0635426, + "epoch": 0.5900346287033474, + "flos": 524046799872.0, + "grad_norm": 0.061744960378181175, + "language_loss": 0.85826695, + "learning_rate": 0.00037969076005331083, + "loss": 0.86918467, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.28271484, + "step": 3067, + "time_per_iteration": 2.7665817737579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088445, + "balance_loss_mlp": 1.05947697, + "epoch": 0.590227010388611, + "flos": 566893154304.0, + "grad_norm": 0.062191843713449865, + "language_loss": 0.87458771, + "learning_rate": 0.00037938839430025817, + "loss": 0.88547218, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.28930664, + "step": 3068, + "time_per_iteration": 2.645289897918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080639, + "balance_loss_mlp": 1.0527916, + "epoch": 0.5904193920738746, + "flos": 583053631488.0, + "grad_norm": 0.07692636502028646, + "language_loss": 0.85409123, + "learning_rate": 0.0003790860753727835, + "loss": 0.86489761, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.27856445, + "step": 3069, + "time_per_iteration": 2.831932544708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087722, + "balance_loss_mlp": 1.05966043, + "epoch": 0.5906117737591381, + "flos": 529428773376.0, + "grad_norm": 0.05698566021180351, + "language_loss": 0.82950222, + "learning_rate": 0.00037878380338825766, + "loss": 0.84037948, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.28076172, + "step": 3070, + "time_per_iteration": 2.6856610774993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094092, + "balance_loss_mlp": 1.06655455, + "epoch": 0.5908041554444017, + "flos": 683908655616.0, + "grad_norm": 0.05699607440456078, + "language_loss": 0.81377411, + "learning_rate": 0.00037848157846403287, + "loss": 0.82471496, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.27539062, + "step": 3071, + "time_per_iteration": 2.9222235679626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090999, + "balance_loss_mlp": 1.06291366, + "epoch": 0.5909965371296653, + "flos": 549719958528.0, + "grad_norm": 0.04993960868235579, + "language_loss": 0.8303259, + "learning_rate": 0.0003781794007174435, + "loss": 0.84123588, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.28076172, + "step": 3072, + "time_per_iteration": 2.8049426078796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047655, + "balance_loss_mlp": 1.03702164, + "epoch": 0.5911889188149289, + "flos": 1491544475136.0, + "grad_norm": 0.02139881306535856, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.7512219, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.10644531, + "step": 3073, + "time_per_iteration": 4.860798597335815 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086537, + "balance_loss_mlp": 1.05854619, + "epoch": 0.5913813005001923, + "flos": 487630043136.0, + "grad_norm": 0.0539637393269004, + "language_loss": 0.81219113, + "learning_rate": 0.0003775751872264152, + "loss": 0.8230564, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.28027344, + "step": 3074, + "time_per_iteration": 2.7820684909820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081353, + "balance_loss_mlp": 1.05267119, + "epoch": 0.5915736821854559, + "flos": 573034590720.0, + "grad_norm": 0.057314841017187666, + "language_loss": 0.87226552, + "learning_rate": 0.0003772731517165527, + "loss": 0.88307905, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.28686523, + "step": 3075, + "time_per_iteration": 2.8264849185943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082687, + "balance_loss_mlp": 1.05383801, + "epoch": 0.5917660638707195, + "flos": 789183959040.0, + "grad_norm": 0.06214529816255618, + "language_loss": 0.83813703, + "learning_rate": 0.0003769711638534784, + "loss": 0.84896386, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.28857422, + "step": 3076, + "time_per_iteration": 2.9739084243774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107611, + "balance_loss_mlp": 1.04769087, + "epoch": 0.5919584455559831, + "flos": 528487428096.0, + "grad_norm": 0.06330128127303343, + "language_loss": 0.78904676, + "learning_rate": 0.00037666922375443446, + "loss": 0.79980791, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.28417969, + "step": 3077, + "time_per_iteration": 2.611528158187866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076959, + "balance_loss_mlp": 1.04815805, + "epoch": 0.5921508272412467, + "flos": 560320962048.0, + "grad_norm": 0.0824489675783013, + "language_loss": 0.81633419, + "learning_rate": 0.00037636733153664396, + "loss": 0.82710373, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.2878418, + "step": 3078, + "time_per_iteration": 2.830021619796753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074589, + "balance_loss_mlp": 1.04547811, + "epoch": 0.5923432089265102, + "flos": 563008347648.0, + "grad_norm": 0.07220859459639119, + "language_loss": 0.79744393, + "learning_rate": 0.0003760654873173124, + "loss": 0.80818975, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.29077148, + "step": 3079, + "time_per_iteration": 2.7072205543518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069705, + "balance_loss_mlp": 1.04047441, + "epoch": 0.5925355906117737, + "flos": 495488406528.0, + "grad_norm": 0.0611483797885387, + "language_loss": 0.81661952, + "learning_rate": 0.00037576369121362566, + "loss": 0.82731652, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.29174805, + "step": 3080, + "time_per_iteration": 2.6135458946228027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073309, + "balance_loss_mlp": 1.0437448, + "epoch": 0.5927279722970373, + "flos": 565940072448.0, + "grad_norm": 0.05261928263256693, + "language_loss": 0.81494981, + "learning_rate": 0.0003754619433427516, + "loss": 0.82568288, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.29516602, + "step": 3081, + "time_per_iteration": 2.935394763946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074585, + "balance_loss_mlp": 1.04502153, + "epoch": 0.5929203539823009, + "flos": 666674413056.0, + "grad_norm": 0.07109600442573788, + "language_loss": 0.77291781, + "learning_rate": 0.0003751602438218392, + "loss": 0.78366369, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.29516602, + "step": 3082, + "time_per_iteration": 2.762129306793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107369, + "balance_loss_mlp": 1.04410219, + "epoch": 0.5931127356675644, + "flos": 555484635648.0, + "grad_norm": 0.07081310094320947, + "language_loss": 0.83719951, + "learning_rate": 0.0003748585927680186, + "loss": 0.84793639, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.29589844, + "step": 3083, + "time_per_iteration": 2.6607072353363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072302, + "balance_loss_mlp": 1.04126024, + "epoch": 0.593305117352828, + "flos": 534932992512.0, + "grad_norm": 0.09668658910416093, + "language_loss": 0.82859874, + "learning_rate": 0.00037455699029840086, + "loss": 0.83932179, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.31005859, + "step": 3084, + "time_per_iteration": 2.641989231109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069753, + "balance_loss_mlp": 1.04014122, + "epoch": 0.5934974990380916, + "flos": 593683748352.0, + "grad_norm": 0.04958887884439868, + "language_loss": 0.84485245, + "learning_rate": 0.0003742554365300787, + "loss": 0.85554999, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.2956543, + "step": 3085, + "time_per_iteration": 2.8070170879364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074514, + "balance_loss_mlp": 1.0440923, + "epoch": 0.5936898807233552, + "flos": 712339011072.0, + "grad_norm": 0.06324229056117828, + "language_loss": 0.78341657, + "learning_rate": 0.0003739539315801255, + "loss": 0.79416168, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.30371094, + "step": 3086, + "time_per_iteration": 2.937530755996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076236, + "balance_loss_mlp": 1.04571867, + "epoch": 0.5938822624086187, + "flos": 391684128768.0, + "grad_norm": 0.06251001537840323, + "language_loss": 0.91790974, + "learning_rate": 0.000373652475565596, + "loss": 0.92867219, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.3046875, + "step": 3087, + "time_per_iteration": 2.484830856323242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072731, + "balance_loss_mlp": 1.0422616, + "epoch": 0.5940746440938822, + "flos": 480023373312.0, + "grad_norm": 0.06825336960690286, + "language_loss": 0.81144977, + "learning_rate": 0.00037335106860352587, + "loss": 0.82217705, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.3046875, + "step": 3088, + "time_per_iteration": 2.705796003341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079924, + "balance_loss_mlp": 1.04938293, + "epoch": 0.5942670257791458, + "flos": 483094872576.0, + "grad_norm": 0.05943406802659928, + "language_loss": 0.83409536, + "learning_rate": 0.00037304971081093146, + "loss": 0.84489465, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.30517578, + "step": 3089, + "time_per_iteration": 2.5424582958221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080838, + "balance_loss_mlp": 1.05015349, + "epoch": 0.5944594074644094, + "flos": 547656795648.0, + "grad_norm": 0.06149863143832335, + "language_loss": 0.80616403, + "learning_rate": 0.00037274840230481024, + "loss": 0.81697237, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.30664062, + "step": 3090, + "time_per_iteration": 2.7081451416015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073853, + "balance_loss_mlp": 1.04407477, + "epoch": 0.594651789149673, + "flos": 448943510016.0, + "grad_norm": 0.06332669517454644, + "language_loss": 0.79229522, + "learning_rate": 0.00037244714320214077, + "loss": 0.80303377, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.29736328, + "step": 3091, + "time_per_iteration": 2.5389420986175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080775, + "balance_loss_mlp": 1.05082965, + "epoch": 0.5948441708349365, + "flos": 595969491456.0, + "grad_norm": 0.061471299239273844, + "language_loss": 0.83137572, + "learning_rate": 0.000372145933619882, + "loss": 0.84218347, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.29931641, + "step": 3092, + "time_per_iteration": 2.8748533725738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076811, + "balance_loss_mlp": 1.04657912, + "epoch": 0.5950365525202, + "flos": 548251905024.0, + "grad_norm": 0.05871713315937548, + "language_loss": 0.82114685, + "learning_rate": 0.000371844773674974, + "loss": 0.8319149, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.30224609, + "step": 3093, + "time_per_iteration": 2.6465840339660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082816, + "balance_loss_mlp": 1.05346692, + "epoch": 0.5952289342054636, + "flos": 654385595904.0, + "grad_norm": 0.0642067113719601, + "language_loss": 0.81621695, + "learning_rate": 0.0003715436634843375, + "loss": 0.82704508, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.29345703, + "step": 3094, + "time_per_iteration": 2.9084014892578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079615, + "balance_loss_mlp": 1.05007505, + "epoch": 0.5954213158907272, + "flos": 603055245312.0, + "grad_norm": 0.04814703484993394, + "language_loss": 0.80545932, + "learning_rate": 0.00037124260316487355, + "loss": 0.81625545, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.29516602, + "step": 3095, + "time_per_iteration": 2.8632538318634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075577, + "balance_loss_mlp": 1.04727709, + "epoch": 0.5956136975759908, + "flos": 486097970688.0, + "grad_norm": 0.060441576418101065, + "language_loss": 0.89618301, + "learning_rate": 0.0003709415928334643, + "loss": 0.90693879, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.28344727, + "step": 3096, + "time_per_iteration": 2.6276299953460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077506, + "balance_loss_mlp": 1.04813242, + "epoch": 0.5958060792612543, + "flos": 658462459392.0, + "grad_norm": 0.06311167084488892, + "language_loss": 0.80587751, + "learning_rate": 0.00037064063260697233, + "loss": 0.81665254, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.29345703, + "step": 3097, + "time_per_iteration": 2.893503427505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081151, + "balance_loss_mlp": 1.05151534, + "epoch": 0.5959984609465179, + "flos": 723201882624.0, + "grad_norm": 0.06048648768573219, + "language_loss": 0.78276408, + "learning_rate": 0.0003703397226022407, + "loss": 0.79357558, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.2956543, + "step": 3098, + "time_per_iteration": 3.0289156436920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034311, + "balance_loss_mlp": 1.02305758, + "epoch": 0.5961908426317815, + "flos": 1519010164224.0, + "grad_norm": 0.01734603550218104, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76534188, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.11230469, + "step": 3099, + "time_per_iteration": 4.946389436721802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078376, + "balance_loss_mlp": 1.04978967, + "epoch": 0.596383224317045, + "flos": 532357678080.0, + "grad_norm": 0.05865367248717621, + "language_loss": 0.83124352, + "learning_rate": 0.0003697380537253339, + "loss": 0.84202731, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.28564453, + "step": 3100, + "time_per_iteration": 2.674445152282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083272, + "balance_loss_mlp": 1.05492401, + "epoch": 0.5965756060023086, + "flos": 590922169344.0, + "grad_norm": 0.050984632699602635, + "language_loss": 0.81265384, + "learning_rate": 0.0003694372950867471, + "loss": 0.82348651, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.28369141, + "step": 3101, + "time_per_iteration": 2.787538766860962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075715, + "balance_loss_mlp": 1.04772449, + "epoch": 0.5967679876875721, + "flos": 861701760000.0, + "grad_norm": 0.05184746467501943, + "language_loss": 0.77182555, + "learning_rate": 0.0003691365871370976, + "loss": 0.78258264, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.2800293, + "step": 3102, + "time_per_iteration": 3.016934871673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080662, + "balance_loss_mlp": 1.05271935, + "epoch": 0.5969603693728357, + "flos": 553574241792.0, + "grad_norm": 0.06482068820490762, + "language_loss": 0.85340202, + "learning_rate": 0.00036883592999313093, + "loss": 0.8642087, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.27978516, + "step": 3103, + "time_per_iteration": 2.689819812774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079629, + "balance_loss_mlp": 1.05218673, + "epoch": 0.5971527510580993, + "flos": 718345207296.0, + "grad_norm": 0.06496745505902583, + "language_loss": 0.79311585, + "learning_rate": 0.0003685353237715722, + "loss": 0.8039121, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.27490234, + "step": 3104, + "time_per_iteration": 2.87333083152771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083254, + "balance_loss_mlp": 1.05504966, + "epoch": 0.5973451327433629, + "flos": 647324573184.0, + "grad_norm": 0.051730016495621756, + "language_loss": 0.8144263, + "learning_rate": 0.0003682347685891274, + "loss": 0.82525891, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.28222656, + "step": 3105, + "time_per_iteration": 2.888319730758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080866, + "balance_loss_mlp": 1.05228007, + "epoch": 0.5975375144286263, + "flos": 721374446592.0, + "grad_norm": 0.060164631065922125, + "language_loss": 0.80393469, + "learning_rate": 0.0003679342645624822, + "loss": 0.8147434, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.28564453, + "step": 3106, + "time_per_iteration": 3.0317325592041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079501, + "balance_loss_mlp": 1.0513438, + "epoch": 0.5977298961138899, + "flos": 750616699392.0, + "grad_norm": 0.057913897832382336, + "language_loss": 0.81649029, + "learning_rate": 0.0003676338118083025, + "loss": 0.82728529, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.28198242, + "step": 3107, + "time_per_iteration": 2.9762744903564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083565, + "balance_loss_mlp": 1.05519295, + "epoch": 0.5979222777991535, + "flos": 530703360000.0, + "grad_norm": 0.05706871104479872, + "language_loss": 0.79560876, + "learning_rate": 0.0003673334104432347, + "loss": 0.80644441, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.28393555, + "step": 3108, + "time_per_iteration": 2.5976645946502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081192, + "balance_loss_mlp": 1.0530827, + "epoch": 0.5981146594844171, + "flos": 621459357696.0, + "grad_norm": 0.06092677674045173, + "language_loss": 0.83641863, + "learning_rate": 0.0003670330605839048, + "loss": 0.84723055, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.28125, + "step": 3109, + "time_per_iteration": 2.819420337677002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082632, + "balance_loss_mlp": 1.05480886, + "epoch": 0.5983070411696807, + "flos": 603309911040.0, + "grad_norm": 0.0537112811211955, + "language_loss": 0.76695013, + "learning_rate": 0.0003667327623469191, + "loss": 0.77777648, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.27832031, + "step": 3110, + "time_per_iteration": 2.766671657562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085165, + "balance_loss_mlp": 1.05753255, + "epoch": 0.5984994228549442, + "flos": 633187971072.0, + "grad_norm": 0.058546063064310164, + "language_loss": 0.77618361, + "learning_rate": 0.00036643251584886333, + "loss": 0.78703523, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.27661133, + "step": 3111, + "time_per_iteration": 2.789184808731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077786, + "balance_loss_mlp": 1.05105901, + "epoch": 0.5986918045402078, + "flos": 525026022912.0, + "grad_norm": 0.054896589550954444, + "language_loss": 0.81872785, + "learning_rate": 0.00036613232120630393, + "loss": 0.82950568, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.26782227, + "step": 3112, + "time_per_iteration": 2.5881965160369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081611, + "balance_loss_mlp": 1.05362022, + "epoch": 0.5988841862254713, + "flos": 482942103552.0, + "grad_norm": 0.07437964171487202, + "language_loss": 0.80355418, + "learning_rate": 0.00036583217853578643, + "loss": 0.81437027, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.27978516, + "step": 3113, + "time_per_iteration": 2.5409529209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083019, + "balance_loss_mlp": 1.05457568, + "epoch": 0.5990765679107349, + "flos": 1139658674688.0, + "grad_norm": 0.06261379626444472, + "language_loss": 0.77366924, + "learning_rate": 0.000365532087953837, + "loss": 0.78449941, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.28442383, + "step": 3114, + "time_per_iteration": 3.6426267623901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076465, + "balance_loss_mlp": 1.04842734, + "epoch": 0.5992689495959984, + "flos": 516729701376.0, + "grad_norm": 0.08299057980597005, + "language_loss": 0.88937151, + "learning_rate": 0.00036523204957696065, + "loss": 0.90013611, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.28051758, + "step": 3115, + "time_per_iteration": 2.594581365585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083776, + "balance_loss_mlp": 1.05623841, + "epoch": 0.599461331281262, + "flos": 744288998400.0, + "grad_norm": 0.06140193987839019, + "language_loss": 0.80620509, + "learning_rate": 0.00036493206352164324, + "loss": 0.81704283, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.27612305, + "step": 3116, + "time_per_iteration": 2.922367572784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076912, + "balance_loss_mlp": 1.04942214, + "epoch": 0.5996537129665256, + "flos": 592078892544.0, + "grad_norm": 0.05345315057842072, + "language_loss": 0.85505688, + "learning_rate": 0.000364632129904349, + "loss": 0.86582601, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.27514648, + "step": 3117, + "time_per_iteration": 2.765380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077238, + "balance_loss_mlp": 1.04884195, + "epoch": 0.5998460946517892, + "flos": 558735045120.0, + "grad_norm": 0.05997451129778301, + "language_loss": 0.77705157, + "learning_rate": 0.00036433224884152283, + "loss": 0.78782398, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.28393555, + "step": 3118, + "time_per_iteration": 2.714597225189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078485, + "balance_loss_mlp": 1.05032814, + "epoch": 0.6000384763370528, + "flos": 484325789184.0, + "grad_norm": 0.06439508839737945, + "language_loss": 0.77913392, + "learning_rate": 0.00036403242044958875, + "loss": 0.78991878, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.28173828, + "step": 3119, + "time_per_iteration": 2.5515971183776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107346, + "balance_loss_mlp": 1.04563642, + "epoch": 0.6002308580223162, + "flos": 596490407424.0, + "grad_norm": 0.05980235429893482, + "language_loss": 0.91155994, + "learning_rate": 0.0003637326448449507, + "loss": 0.9222945, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.27832031, + "step": 3120, + "time_per_iteration": 2.7075581550598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075264, + "balance_loss_mlp": 1.04651034, + "epoch": 0.6004232397075798, + "flos": 544879249920.0, + "grad_norm": 0.046913105653204425, + "language_loss": 0.86206967, + "learning_rate": 0.00036343292214399177, + "loss": 0.87282228, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.28735352, + "step": 3121, + "time_per_iteration": 2.8623263835906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076118, + "balance_loss_mlp": 1.04786551, + "epoch": 0.6006156213928434, + "flos": 629647990272.0, + "grad_norm": 0.08364408748252802, + "language_loss": 0.77170986, + "learning_rate": 0.00036313325246307456, + "loss": 0.782471, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.28271484, + "step": 3122, + "time_per_iteration": 2.8064393997192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077233, + "balance_loss_mlp": 1.04845548, + "epoch": 0.600808003078107, + "flos": 582043885056.0, + "grad_norm": 0.05351137159491715, + "language_loss": 0.86973262, + "learning_rate": 0.0003628336359185411, + "loss": 0.88050497, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.28759766, + "step": 3123, + "time_per_iteration": 2.701089859008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074232, + "balance_loss_mlp": 1.04545498, + "epoch": 0.6010003847633705, + "flos": 634984883712.0, + "grad_norm": 0.061635029106804545, + "language_loss": 0.75553113, + "learning_rate": 0.000362534072626713, + "loss": 0.76627344, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.28759766, + "step": 3124, + "time_per_iteration": 2.7586216926574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076514, + "balance_loss_mlp": 1.04830909, + "epoch": 0.6011927664486341, + "flos": 718448514048.0, + "grad_norm": 0.05599212147105787, + "language_loss": 0.81046546, + "learning_rate": 0.00036223456270389093, + "loss": 0.82123059, + "num_input_tokens_seen": 260499616, + "router_z_loss_mlp": 0.2824707, + "step": 3125, + "time_per_iteration": 2.948882818222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074418, + "balance_loss_mlp": 1.04442525, + "epoch": 0.6013851481338977, + "flos": 498782486016.0, + "grad_norm": 0.05186484782469995, + "language_loss": 0.81019723, + "learning_rate": 0.00036193510626635517, + "loss": 0.82094145, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.29980469, + "step": 3126, + "time_per_iteration": 2.671576499938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073529, + "balance_loss_mlp": 1.04410863, + "epoch": 0.6015775298191612, + "flos": 749266509312.0, + "grad_norm": 0.05950376235873218, + "language_loss": 0.81565017, + "learning_rate": 0.0003616357034303649, + "loss": 0.82638544, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.29370117, + "step": 3127, + "time_per_iteration": 2.9371449947357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074144, + "balance_loss_mlp": 1.04541481, + "epoch": 0.6017699115044248, + "flos": 592764162048.0, + "grad_norm": 0.048316094410884414, + "language_loss": 0.78690076, + "learning_rate": 0.0003613363543121584, + "loss": 0.79764223, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.28735352, + "step": 3128, + "time_per_iteration": 2.873584508895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076962, + "balance_loss_mlp": 1.04766035, + "epoch": 0.6019622931896883, + "flos": 514839656448.0, + "grad_norm": 0.05627549899999149, + "language_loss": 0.8521632, + "learning_rate": 0.00036103705902795357, + "loss": 0.8629328, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.29248047, + "step": 3129, + "time_per_iteration": 2.721329689025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074169, + "balance_loss_mlp": 1.04434288, + "epoch": 0.6021546748749519, + "flos": 490219914240.0, + "grad_norm": 0.06933558951012796, + "language_loss": 0.7955035, + "learning_rate": 0.0003607378176939471, + "loss": 0.80624521, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.29785156, + "step": 3130, + "time_per_iteration": 2.672825574874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070174, + "balance_loss_mlp": 1.04118252, + "epoch": 0.6023470565602155, + "flos": 540763098624.0, + "grad_norm": 0.07276264365929157, + "language_loss": 0.82265472, + "learning_rate": 0.00036043863042631465, + "loss": 0.8333565, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.29003906, + "step": 3131, + "time_per_iteration": 2.724228858947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068676, + "balance_loss_mlp": 1.03918386, + "epoch": 0.6025394382454791, + "flos": 844660984320.0, + "grad_norm": 0.06054022798216566, + "language_loss": 0.76351178, + "learning_rate": 0.00036013949734121133, + "loss": 0.77419853, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.29467773, + "step": 3132, + "time_per_iteration": 3.1145389080047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068619, + "balance_loss_mlp": 1.03831553, + "epoch": 0.6027318199307425, + "flos": 576903430656.0, + "grad_norm": 0.061447218218141524, + "language_loss": 0.82303023, + "learning_rate": 0.00035984041855477043, + "loss": 0.83371639, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.30249023, + "step": 3133, + "time_per_iteration": 2.779906749725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024187, + "balance_loss_mlp": 1.01274288, + "epoch": 0.6029242016160061, + "flos": 1470160585728.0, + "grad_norm": 0.015590695702157922, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79734081, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.11425781, + "step": 3134, + "time_per_iteration": 4.933319091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064388, + "balance_loss_mlp": 1.03503895, + "epoch": 0.6031165833012697, + "flos": 480486062592.0, + "grad_norm": 0.05335614021413427, + "language_loss": 0.79509521, + "learning_rate": 0.00035924242434230637, + "loss": 0.80573905, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.29321289, + "step": 3135, + "time_per_iteration": 2.6558902263641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065788, + "balance_loss_mlp": 1.03691578, + "epoch": 0.6033089649865333, + "flos": 499220444160.0, + "grad_norm": 0.07899589356076418, + "language_loss": 0.78020877, + "learning_rate": 0.00035894350914844516, + "loss": 0.79086667, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.28881836, + "step": 3136, + "time_per_iteration": 2.631028175354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068791, + "balance_loss_mlp": 1.03927457, + "epoch": 0.6035013466717969, + "flos": 556337230848.0, + "grad_norm": 0.06724246097152477, + "language_loss": 0.8242653, + "learning_rate": 0.0003586446487175703, + "loss": 0.83495319, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.29516602, + "step": 3137, + "time_per_iteration": 2.6988327503204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068468, + "balance_loss_mlp": 1.03866601, + "epoch": 0.6036937283570604, + "flos": 594536343552.0, + "grad_norm": 0.053597642089091506, + "language_loss": 0.85091925, + "learning_rate": 0.0003583458431657099, + "loss": 0.86160386, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.29760742, + "step": 3138, + "time_per_iteration": 2.761122941970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067135, + "balance_loss_mlp": 1.03735673, + "epoch": 0.603886110042324, + "flos": 540684523008.0, + "grad_norm": 0.06925518043051447, + "language_loss": 0.83323741, + "learning_rate": 0.00035804709260887056, + "loss": 0.84390879, + "num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.29711914, + "step": 3139, + "time_per_iteration": 2.664776563644409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069913, + "balance_loss_mlp": 1.04013443, + "epoch": 0.6040784917275875, + "flos": 518315618304.0, + "grad_norm": 0.05868516129691736, + "language_loss": 0.894665, + "learning_rate": 0.0003577483971630373, + "loss": 0.90536416, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.29760742, + "step": 3140, + "time_per_iteration": 2.659006118774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069941, + "balance_loss_mlp": 1.03982854, + "epoch": 0.6042708734128511, + "flos": 660436872192.0, + "grad_norm": 0.0462994946970423, + "language_loss": 0.85074717, + "learning_rate": 0.00035744975694417414, + "loss": 0.86144656, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.30078125, + "step": 3141, + "time_per_iteration": 2.9323952198028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073401, + "balance_loss_mlp": 1.04438555, + "epoch": 0.6044632550981146, + "flos": 572035018752.0, + "grad_norm": 0.06410322202016926, + "language_loss": 0.82079303, + "learning_rate": 0.00035715117206822344, + "loss": 0.83152711, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.28979492, + "step": 3142, + "time_per_iteration": 2.8329904079437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070447, + "balance_loss_mlp": 1.04145527, + "epoch": 0.6046556367833782, + "flos": 546420086784.0, + "grad_norm": 0.060439068049678774, + "language_loss": 0.80993617, + "learning_rate": 0.0003568526426511065, + "loss": 0.82064068, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.28979492, + "step": 3143, + "time_per_iteration": 2.695185899734497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072248, + "balance_loss_mlp": 1.0432328, + "epoch": 0.6048480184686418, + "flos": 776505235968.0, + "grad_norm": 0.06755719072358204, + "language_loss": 0.82702982, + "learning_rate": 0.000356554168808722, + "loss": 0.83775228, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.29003906, + "step": 3144, + "time_per_iteration": 2.9742469787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073343, + "balance_loss_mlp": 1.04537654, + "epoch": 0.6050404001539054, + "flos": 656837254656.0, + "grad_norm": 0.05422673748867178, + "language_loss": 0.84676063, + "learning_rate": 0.00035625575065694837, + "loss": 0.85749412, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.2800293, + "step": 3145, + "time_per_iteration": 2.8367791175842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077792, + "balance_loss_mlp": 1.04934883, + "epoch": 0.605232781839169, + "flos": 548710212096.0, + "grad_norm": 0.05280732268922785, + "language_loss": 0.77452278, + "learning_rate": 0.0003559573883116415, + "loss": 0.78530073, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.28466797, + "step": 3146, + "time_per_iteration": 2.701388120651245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075301, + "balance_loss_mlp": 1.04702449, + "epoch": 0.6054251635244324, + "flos": 605093677056.0, + "grad_norm": 0.04869973207051341, + "language_loss": 0.85634321, + "learning_rate": 0.00035565908188863604, + "loss": 0.86709619, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.28271484, + "step": 3147, + "time_per_iteration": 2.898590087890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076445, + "balance_loss_mlp": 1.04831183, + "epoch": 0.605617545209696, + "flos": 613398763008.0, + "grad_norm": 0.06327080100476104, + "language_loss": 0.79599166, + "learning_rate": 0.00035536083150374464, + "loss": 0.80675614, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.28149414, + "step": 3148, + "time_per_iteration": 2.771320343017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102059, + "balance_loss_mlp": 1.00905097, + "epoch": 0.6058099268949596, + "flos": 1497477888000.0, + "grad_norm": 0.011512942764516735, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75768542, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.11523438, + "step": 3149, + "time_per_iteration": 4.814287185668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077389, + "balance_loss_mlp": 1.04918396, + "epoch": 0.6060023085802232, + "flos": 670170723840.0, + "grad_norm": 0.05840631409964381, + "language_loss": 0.85528827, + "learning_rate": 0.0003547644993114475, + "loss": 0.86606216, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.28198242, + "step": 3150, + "time_per_iteration": 2.8378889560699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107623, + "balance_loss_mlp": 1.04795372, + "epoch": 0.6061946902654868, + "flos": 605885225472.0, + "grad_norm": 0.06870733473036895, + "language_loss": 0.7981267, + "learning_rate": 0.00035446641773555806, + "loss": 0.80888903, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.28295898, + "step": 3151, + "time_per_iteration": 2.7372798919677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077461, + "balance_loss_mlp": 1.04916036, + "epoch": 0.6063870719507503, + "flos": 557568147456.0, + "grad_norm": 0.05718786699526154, + "language_loss": 0.86853182, + "learning_rate": 0.000354168392660816, + "loss": 0.87930644, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.28344727, + "step": 3152, + "time_per_iteration": 2.7871758937835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073815, + "balance_loss_mlp": 1.04558635, + "epoch": 0.6065794536360138, + "flos": 556874113536.0, + "grad_norm": 0.05898712641381182, + "language_loss": 0.82702786, + "learning_rate": 0.0003538704242029252, + "loss": 0.83776605, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.28222656, + "step": 3153, + "time_per_iteration": 2.700695753097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075106, + "balance_loss_mlp": 1.0467577, + "epoch": 0.6067718353212774, + "flos": 689836276224.0, + "grad_norm": 0.06128602508798912, + "language_loss": 0.7773366, + "learning_rate": 0.0003535725124775672, + "loss": 0.78808761, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.28344727, + "step": 3154, + "time_per_iteration": 2.8570618629455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076573, + "balance_loss_mlp": 1.0478195, + "epoch": 0.606964217006541, + "flos": 521531122176.0, + "grad_norm": 0.055885875690184536, + "language_loss": 0.86403567, + "learning_rate": 0.00035327465760040126, + "loss": 0.8748014, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.28710938, + "step": 3155, + "time_per_iteration": 2.6846063137054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072066, + "balance_loss_mlp": 1.04281223, + "epoch": 0.6071565986918045, + "flos": 641267504640.0, + "grad_norm": 0.06048889768089712, + "language_loss": 0.84499794, + "learning_rate": 0.00035297685968706526, + "loss": 0.85571855, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.29223633, + "step": 3156, + "time_per_iteration": 2.7771387100219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072214, + "balance_loss_mlp": 1.04453337, + "epoch": 0.6073489803770681, + "flos": 560315169792.0, + "grad_norm": 0.06250295268242392, + "language_loss": 0.83014715, + "learning_rate": 0.00035267911885317454, + "loss": 0.84086931, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.27709961, + "step": 3157, + "time_per_iteration": 2.656357765197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074037, + "balance_loss_mlp": 1.0442822, + "epoch": 0.6075413620623317, + "flos": 585810828288.0, + "grad_norm": 0.057378940891661595, + "language_loss": 0.81611866, + "learning_rate": 0.0003523814352143222, + "loss": 0.826859, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.29711914, + "step": 3158, + "time_per_iteration": 2.830617904663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077727, + "balance_loss_mlp": 1.04883063, + "epoch": 0.6077337437475953, + "flos": 630523906560.0, + "grad_norm": 0.0599841254590138, + "language_loss": 0.90816242, + "learning_rate": 0.00035208380888607937, + "loss": 0.91893965, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.28881836, + "step": 3159, + "time_per_iteration": 2.8117706775665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009022, + "balance_loss_mlp": 0.99786437, + "epoch": 0.6079261254328588, + "flos": 1467726455808.0, + "grad_norm": 0.007967889265398313, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80471009, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.11181641, + "step": 3160, + "time_per_iteration": 4.8633644580841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009246, + "balance_loss_mlp": 0.998088, + "epoch": 0.6081185071181223, + "flos": 1522233022464.0, + "grad_norm": 0.00797101191785885, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76701474, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.11181641, + "step": 3161, + "time_per_iteration": 5.046196460723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075203, + "balance_loss_mlp": 1.04611611, + "epoch": 0.6083108888033859, + "flos": 556041867264.0, + "grad_norm": 0.04533613724441275, + "language_loss": 0.81858671, + "learning_rate": 0.00035119127492038446, + "loss": 0.82933867, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.29077148, + "step": 3162, + "time_per_iteration": 2.815852403640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075143, + "balance_loss_mlp": 1.0469625, + "epoch": 0.6085032704886495, + "flos": 840819847680.0, + "grad_norm": 0.053216451363019494, + "language_loss": 0.82787645, + "learning_rate": 0.00035089387898984436, + "loss": 0.83862782, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.28198242, + "step": 3163, + "time_per_iteration": 3.059666156768799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075751, + "balance_loss_mlp": 1.04683065, + "epoch": 0.6086956521739131, + "flos": 684493590528.0, + "grad_norm": 0.06412835192713194, + "language_loss": 0.81799018, + "learning_rate": 0.0003505965409474343, + "loss": 0.82874769, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.28881836, + "step": 3164, + "time_per_iteration": 2.8909780979156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072573, + "balance_loss_mlp": 1.04374802, + "epoch": 0.6088880338591766, + "flos": 535533894144.0, + "grad_norm": 0.050432732030132946, + "language_loss": 0.86329949, + "learning_rate": 0.0003502992609085913, + "loss": 0.87402523, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.28808594, + "step": 3165, + "time_per_iteration": 2.66687273979187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074674, + "balance_loss_mlp": 1.04513407, + "epoch": 0.6090804155444401, + "flos": 731197048320.0, + "grad_norm": 0.053888239650619583, + "language_loss": 0.82507217, + "learning_rate": 0.00035000203898872954, + "loss": 0.83581889, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.29516602, + "step": 3166, + "time_per_iteration": 3.05118989944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072174, + "balance_loss_mlp": 1.04303908, + "epoch": 0.6092727972297037, + "flos": 698708768256.0, + "grad_norm": 0.06623841355558525, + "language_loss": 0.84253997, + "learning_rate": 0.0003497048753032406, + "loss": 0.85326171, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.29125977, + "step": 3167, + "time_per_iteration": 2.87467885017395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074156, + "balance_loss_mlp": 1.04473543, + "epoch": 0.6094651789149673, + "flos": 1051515869184.0, + "grad_norm": 0.05347521996771115, + "language_loss": 0.80754191, + "learning_rate": 0.000349407769967494, + "loss": 0.81828344, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.29394531, + "step": 3168, + "time_per_iteration": 3.3934104442596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074195, + "balance_loss_mlp": 1.04546547, + "epoch": 0.6096575606002309, + "flos": 502834618368.0, + "grad_norm": 0.10902305889023324, + "language_loss": 0.84663367, + "learning_rate": 0.0003491107230968361, + "loss": 0.85737562, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.28710938, + "step": 3169, + "time_per_iteration": 2.6888718605041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072271, + "balance_loss_mlp": 1.04351735, + "epoch": 0.6098499422854944, + "flos": 585339374592.0, + "grad_norm": 0.05661622017927931, + "language_loss": 0.81418574, + "learning_rate": 0.00034881373480659085, + "loss": 0.82490849, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.28735352, + "step": 3170, + "time_per_iteration": 2.820013999938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073009, + "balance_loss_mlp": 1.043993, + "epoch": 0.610042323970758, + "flos": 468968444928.0, + "grad_norm": 0.0573564735722831, + "language_loss": 0.78202963, + "learning_rate": 0.0003485168052120594, + "loss": 0.79275972, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.2902832, + "step": 3171, + "time_per_iteration": 2.5298008918762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108136, + "balance_loss_mlp": 1.05255914, + "epoch": 0.6102347056560216, + "flos": 513923042304.0, + "grad_norm": 0.06128596263952344, + "language_loss": 0.79907572, + "learning_rate": 0.00034821993442851973, + "loss": 0.80988932, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.28808594, + "step": 3172, + "time_per_iteration": 2.5819344520568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075018, + "balance_loss_mlp": 1.0474807, + "epoch": 0.6104270873412851, + "flos": 468776388096.0, + "grad_norm": 0.06156265055034652, + "language_loss": 0.82331789, + "learning_rate": 0.00034792312257122735, + "loss": 0.83406806, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.27612305, + "step": 3173, + "time_per_iteration": 2.621645212173462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070505, + "balance_loss_mlp": 1.04187059, + "epoch": 0.6106194690265486, + "flos": 549610859520.0, + "grad_norm": 0.059872220515584544, + "language_loss": 0.80486125, + "learning_rate": 0.00034762636975541506, + "loss": 0.8155663, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.28613281, + "step": 3174, + "time_per_iteration": 2.6323647499084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074186, + "balance_loss_mlp": 1.0451467, + "epoch": 0.6108118507118122, + "flos": 472602968064.0, + "grad_norm": 0.05798479282712576, + "language_loss": 0.81059682, + "learning_rate": 0.0003473296760962923, + "loss": 0.82133865, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.2902832, + "step": 3175, + "time_per_iteration": 2.679593324661255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018524, + "balance_loss_mlp": 1.007128, + "epoch": 0.6110042323970758, + "flos": 1444416205824.0, + "grad_norm": 0.01318817873369303, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79552263, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.11376953, + "step": 3176, + "time_per_iteration": 4.708170652389526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075937, + "balance_loss_mlp": 1.04811323, + "epoch": 0.6111966140823394, + "flos": 793807879680.0, + "grad_norm": 0.06988374073618883, + "language_loss": 0.81172955, + "learning_rate": 0.00034673646670883976, + "loss": 0.82248896, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.27832031, + "step": 3177, + "time_per_iteration": 3.0760982036590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018443, + "balance_loss_mlp": 1.00714159, + "epoch": 0.611388995767603, + "flos": 1556800432128.0, + "grad_norm": 0.012123406085696703, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76733464, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.11279297, + "step": 3178, + "time_per_iteration": 5.047900199890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077367, + "balance_loss_mlp": 1.04909086, + "epoch": 0.6115813774528664, + "flos": 711841416192.0, + "grad_norm": 0.06496983177026339, + "language_loss": 0.81433582, + "learning_rate": 0.0003461434953300865, + "loss": 0.82510948, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.28271484, + "step": 3179, + "time_per_iteration": 2.934129476547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075057, + "balance_loss_mlp": 1.0462321, + "epoch": 0.61177375913813, + "flos": 683963910144.0, + "grad_norm": 0.054564857541299305, + "language_loss": 0.81309831, + "learning_rate": 0.0003458470991817515, + "loss": 0.82384884, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.28808594, + "step": 3180, + "time_per_iteration": 2.9692420959472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080989, + "balance_loss_mlp": 1.05249786, + "epoch": 0.6119661408233936, + "flos": 511411746816.0, + "grad_norm": 0.056066758208496104, + "language_loss": 0.84904051, + "learning_rate": 0.0003455507628808802, + "loss": 0.85985035, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.28491211, + "step": 3181, + "time_per_iteration": 2.613642692565918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107824, + "balance_loss_mlp": 1.04986787, + "epoch": 0.6121585225086572, + "flos": 556548226560.0, + "grad_norm": 0.07624020954576015, + "language_loss": 0.84440458, + "learning_rate": 0.00034525448654252076, + "loss": 0.855187, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.28369141, + "step": 3182, + "time_per_iteration": 2.6653053760528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074575, + "balance_loss_mlp": 1.04701424, + "epoch": 0.6123509041939207, + "flos": 561585374208.0, + "grad_norm": 0.06355946830094689, + "language_loss": 0.82891977, + "learning_rate": 0.0003449582702816976, + "loss": 0.83966547, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.27587891, + "step": 3183, + "time_per_iteration": 2.6951351165771484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010823, + "balance_loss_mlp": 1.05404711, + "epoch": 0.6125432858791843, + "flos": 557789317632.0, + "grad_norm": 0.056298205322627685, + "language_loss": 0.82360494, + "learning_rate": 0.0003446621142134122, + "loss": 0.83442801, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.28271484, + "step": 3184, + "time_per_iteration": 2.6690409183502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077624, + "balance_loss_mlp": 1.04958582, + "epoch": 0.6127356675644479, + "flos": 414796529664.0, + "grad_norm": 0.06604074574998081, + "language_loss": 0.84192419, + "learning_rate": 0.0003443660184526424, + "loss": 0.85270047, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.28051758, + "step": 3185, + "time_per_iteration": 2.4451961517333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078036, + "balance_loss_mlp": 1.04949737, + "epoch": 0.6129280492497114, + "flos": 603547047936.0, + "grad_norm": 0.0548279179658957, + "language_loss": 0.86286807, + "learning_rate": 0.0003440699831143429, + "loss": 0.87364841, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.28515625, + "step": 3186, + "time_per_iteration": 2.7583630084991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078194, + "balance_loss_mlp": 1.04989386, + "epoch": 0.613120430934975, + "flos": 519492690432.0, + "grad_norm": 0.05592702907616355, + "language_loss": 0.81846583, + "learning_rate": 0.0003437740083134449, + "loss": 0.82924777, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.28344727, + "step": 3187, + "time_per_iteration": 2.6769111156463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107819, + "balance_loss_mlp": 1.05053306, + "epoch": 0.6133128126202385, + "flos": 510835576320.0, + "grad_norm": 0.07534478934925966, + "language_loss": 0.82936466, + "learning_rate": 0.00034347809416485574, + "loss": 0.84014654, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.27709961, + "step": 3188, + "time_per_iteration": 2.579110622406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079083, + "balance_loss_mlp": 1.05052042, + "epoch": 0.6135051943055021, + "flos": 607264528896.0, + "grad_norm": 0.05208625136089098, + "language_loss": 0.8201586, + "learning_rate": 0.0003431822407834597, + "loss": 0.83094943, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.28588867, + "step": 3189, + "time_per_iteration": 2.800846815109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079618, + "balance_loss_mlp": 1.05084062, + "epoch": 0.6136975759907657, + "flos": 1159750600704.0, + "grad_norm": 0.06054576051189374, + "language_loss": 0.84436607, + "learning_rate": 0.00034288644828411706, + "loss": 0.85516232, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 0.28735352, + "step": 3190, + "time_per_iteration": 3.459338426589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084345, + "balance_loss_mlp": 1.05513883, + "epoch": 0.6138899576760293, + "flos": 706631150592.0, + "grad_norm": 0.0818478077901872, + "language_loss": 0.75477004, + "learning_rate": 0.0003425907167816649, + "loss": 0.7656135, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.29150391, + "step": 3191, + "time_per_iteration": 2.874662399291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079884, + "balance_loss_mlp": 1.05148816, + "epoch": 0.6140823393612928, + "flos": 586151271936.0, + "grad_norm": 0.06137447834473829, + "language_loss": 0.84648186, + "learning_rate": 0.00034229504639091623, + "loss": 0.85728073, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.28393555, + "step": 3192, + "time_per_iteration": 2.768174171447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078463, + "balance_loss_mlp": 1.04906654, + "epoch": 0.6142747210465563, + "flos": 803759929344.0, + "grad_norm": 0.05748161960079173, + "language_loss": 0.80287862, + "learning_rate": 0.0003419994372266606, + "loss": 0.81366324, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.29345703, + "step": 3193, + "time_per_iteration": 3.1592228412628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079327, + "balance_loss_mlp": 1.05054975, + "epoch": 0.6144671027318199, + "flos": 529158140928.0, + "grad_norm": 0.04575030988697244, + "language_loss": 0.81596744, + "learning_rate": 0.00034170388940366335, + "loss": 0.82676071, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.2878418, + "step": 3194, + "time_per_iteration": 2.707101345062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078794, + "balance_loss_mlp": 1.05011201, + "epoch": 0.6146594844170835, + "flos": 805054864896.0, + "grad_norm": 0.05557650302359453, + "language_loss": 0.79986775, + "learning_rate": 0.0003414084030366667, + "loss": 0.81065571, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.28686523, + "step": 3195, + "time_per_iteration": 3.086768388748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070986, + "balance_loss_mlp": 1.04118395, + "epoch": 0.6148518661023471, + "flos": 501431993856.0, + "grad_norm": 0.05715110105949097, + "language_loss": 0.82949638, + "learning_rate": 0.0003411129782403883, + "loss": 0.84020627, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.29760742, + "step": 3196, + "time_per_iteration": 2.65775203704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078782, + "balance_loss_mlp": 1.04926562, + "epoch": 0.6150442477876106, + "flos": 510436905984.0, + "grad_norm": 0.06094401033818373, + "language_loss": 0.8473599, + "learning_rate": 0.0003408176151295225, + "loss": 0.8581478, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.29516602, + "step": 3197, + "time_per_iteration": 2.6118876934051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076412, + "balance_loss_mlp": 1.04806376, + "epoch": 0.6152366294728742, + "flos": 526758916608.0, + "grad_norm": 0.056153389528983695, + "language_loss": 0.7719816, + "learning_rate": 0.00034052231381873944, + "loss": 0.78274572, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.28320312, + "step": 3198, + "time_per_iteration": 2.6228411197662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079605, + "balance_loss_mlp": 1.05066109, + "epoch": 0.6154290111581378, + "flos": 473055482880.0, + "grad_norm": 0.07032084774443613, + "language_loss": 0.84981108, + "learning_rate": 0.00034022707442268494, + "loss": 0.86060715, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.28955078, + "step": 3199, + "time_per_iteration": 2.6281561851501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081064, + "balance_loss_mlp": 1.05204892, + "epoch": 0.6156213928434013, + "flos": 550542030336.0, + "grad_norm": 0.04792292414356855, + "language_loss": 0.81849301, + "learning_rate": 0.0003399318970559813, + "loss": 0.82930362, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.28979492, + "step": 3200, + "time_per_iteration": 2.848755121231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083137, + "balance_loss_mlp": 1.05426502, + "epoch": 0.6158137745286649, + "flos": 750587586048.0, + "grad_norm": 0.06290240151644533, + "language_loss": 0.8428275, + "learning_rate": 0.00033963678183322656, + "loss": 0.85365885, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.28833008, + "step": 3201, + "time_per_iteration": 3.027029275894165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083363, + "balance_loss_mlp": 1.05396593, + "epoch": 0.6160061562139284, + "flos": 555544272384.0, + "grad_norm": 0.050860435501305326, + "language_loss": 0.8262167, + "learning_rate": 0.0003393417288689945, + "loss": 0.83705032, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.29370117, + "step": 3202, + "time_per_iteration": 2.6697185039520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108474, + "balance_loss_mlp": 1.05422282, + "epoch": 0.616198537899192, + "flos": 741856278528.0, + "grad_norm": 0.07354923140459588, + "language_loss": 0.75762349, + "learning_rate": 0.00033904673827783504, + "loss": 0.76847088, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.3046875, + "step": 3203, + "time_per_iteration": 2.9294135570526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083321, + "balance_loss_mlp": 1.05423403, + "epoch": 0.6163909195844556, + "flos": 478569876480.0, + "grad_norm": 0.060707114262551334, + "language_loss": 0.8162061, + "learning_rate": 0.00033875181017427357, + "loss": 0.82703936, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 0.2902832, + "step": 3204, + "time_per_iteration": 2.595367193222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078594, + "balance_loss_mlp": 1.04924512, + "epoch": 0.6165833012697192, + "flos": 531231478272.0, + "grad_norm": 0.054344968838841615, + "language_loss": 0.80957687, + "learning_rate": 0.00033845694467281133, + "loss": 0.82036287, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.29321289, + "step": 3205, + "time_per_iteration": 2.846841812133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081783, + "balance_loss_mlp": 1.0531013, + "epoch": 0.6167756829549826, + "flos": 807384278016.0, + "grad_norm": 0.06726799818780427, + "language_loss": 0.83033085, + "learning_rate": 0.00033816214188792516, + "loss": 0.84114861, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.28686523, + "step": 3206, + "time_per_iteration": 3.1646995544433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078791, + "balance_loss_mlp": 1.05008507, + "epoch": 0.6169680646402462, + "flos": 488683459584.0, + "grad_norm": 0.05376278097292006, + "language_loss": 0.8520205, + "learning_rate": 0.00033786740193406784, + "loss": 0.86280841, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.28686523, + "step": 3207, + "time_per_iteration": 2.577228307723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075976, + "balance_loss_mlp": 1.04767549, + "epoch": 0.6171604463255098, + "flos": 618643934208.0, + "grad_norm": 0.056191099229546404, + "language_loss": 0.81319952, + "learning_rate": 0.00033757272492566736, + "loss": 0.82395929, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.28320312, + "step": 3208, + "time_per_iteration": 2.8721108436584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078583, + "balance_loss_mlp": 1.05013978, + "epoch": 0.6173528280107734, + "flos": 528600909312.0, + "grad_norm": 0.04893199519437597, + "language_loss": 0.87034678, + "learning_rate": 0.0003372781109771278, + "loss": 0.8811326, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.28442383, + "step": 3209, + "time_per_iteration": 2.7287070751190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077966, + "balance_loss_mlp": 1.04907, + "epoch": 0.617545209696037, + "flos": 596293968384.0, + "grad_norm": 0.04879640412841063, + "language_loss": 0.76108795, + "learning_rate": 0.0003369835602028281, + "loss": 0.77186757, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.28881836, + "step": 3210, + "time_per_iteration": 2.8439886569976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078621, + "balance_loss_mlp": 1.04924726, + "epoch": 0.6177375913813005, + "flos": 474848013312.0, + "grad_norm": 0.055192186653408186, + "language_loss": 0.79211128, + "learning_rate": 0.0003366890727171232, + "loss": 0.80289745, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.29345703, + "step": 3211, + "time_per_iteration": 2.6932919025421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082394, + "balance_loss_mlp": 1.0535692, + "epoch": 0.617929973066564, + "flos": 529546636800.0, + "grad_norm": 0.07153817197124837, + "language_loss": 0.78408551, + "learning_rate": 0.00033639464863434313, + "loss": 0.79490948, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.2878418, + "step": 3212, + "time_per_iteration": 2.6900713443756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040119, + "balance_loss_mlp": 1.02929533, + "epoch": 0.6181223547518276, + "flos": 1419361477632.0, + "grad_norm": 0.01617816391785494, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79482591, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.10839844, + "step": 3213, + "time_per_iteration": 4.7103211879730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077859, + "balance_loss_mlp": 1.04979765, + "epoch": 0.6183147364370912, + "flos": 739976408064.0, + "grad_norm": 0.0586976807946241, + "language_loss": 0.79730934, + "learning_rate": 0.00033580599113475543, + "loss": 0.80808794, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.28076172, + "step": 3214, + "time_per_iteration": 2.972890853881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076015, + "balance_loss_mlp": 1.04759574, + "epoch": 0.6185071181223547, + "flos": 381442507776.0, + "grad_norm": 0.06601952737269029, + "language_loss": 0.85816491, + "learning_rate": 0.00033551175794648507, + "loss": 0.86892509, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.28417969, + "step": 3215, + "time_per_iteration": 2.456907033920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072721, + "balance_loss_mlp": 1.04439735, + "epoch": 0.6186994998076183, + "flos": 463109225472.0, + "grad_norm": 0.062254504168561625, + "language_loss": 0.8188296, + "learning_rate": 0.00033521758861821365, + "loss": 0.82955682, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.28344727, + "step": 3216, + "time_per_iteration": 2.580777406692505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106911, + "balance_loss_mlp": 1.04071391, + "epoch": 0.6188918814928819, + "flos": 485029997568.0, + "grad_norm": 0.04883960048827372, + "language_loss": 0.88878882, + "learning_rate": 0.0003349234832641479, + "loss": 0.89947987, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.28417969, + "step": 3217, + "time_per_iteration": 2.5541629791259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074942, + "balance_loss_mlp": 1.04635608, + "epoch": 0.6190842631781455, + "flos": 656985641472.0, + "grad_norm": 0.06561076665766134, + "language_loss": 0.80879915, + "learning_rate": 0.00033462944199846975, + "loss": 0.81954861, + "num_input_tokens_seen": 268478512, + "router_z_loss_mlp": 0.28540039, + "step": 3218, + "time_per_iteration": 3.062703847885132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077215, + "balance_loss_mlp": 1.04848528, + "epoch": 0.619276644863409, + "flos": 403388011008.0, + "grad_norm": 0.06502548187197098, + "language_loss": 0.8618629, + "learning_rate": 0.00033433546493533606, + "loss": 0.87263501, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.28710938, + "step": 3219, + "time_per_iteration": 2.4797823429107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072308, + "balance_loss_mlp": 1.04443645, + "epoch": 0.6194690265486725, + "flos": 582807730176.0, + "grad_norm": 0.06173556799123847, + "language_loss": 0.840487, + "learning_rate": 0.00033404155218887897, + "loss": 0.85121012, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.27880859, + "step": 3220, + "time_per_iteration": 2.7182207107543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075139, + "balance_loss_mlp": 1.04733968, + "epoch": 0.6196614082339361, + "flos": 503963638272.0, + "grad_norm": 0.08803961295836986, + "language_loss": 0.87216806, + "learning_rate": 0.00033374770387320534, + "loss": 0.88291949, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.27856445, + "step": 3221, + "time_per_iteration": 2.7941346168518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078924, + "balance_loss_mlp": 1.05095768, + "epoch": 0.6198537899191997, + "flos": 575131249152.0, + "grad_norm": 0.055815039151530264, + "language_loss": 0.84867358, + "learning_rate": 0.00033345392010239737, + "loss": 0.8594628, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.27978516, + "step": 3222, + "time_per_iteration": 2.710803747177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082482, + "balance_loss_mlp": 1.05432487, + "epoch": 0.6200461716044633, + "flos": 592871851008.0, + "grad_norm": 0.05804972472550271, + "language_loss": 0.82259816, + "learning_rate": 0.0003331602009905118, + "loss": 0.83342302, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.28198242, + "step": 3223, + "time_per_iteration": 2.8335556983947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081003, + "balance_loss_mlp": 1.052917, + "epoch": 0.6202385532897268, + "flos": 665765001216.0, + "grad_norm": 0.05452675895151675, + "language_loss": 0.83620667, + "learning_rate": 0.00033286654665158085, + "loss": 0.84701669, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.28100586, + "step": 3224, + "time_per_iteration": 2.929290533065796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077713, + "balance_loss_mlp": 1.05038977, + "epoch": 0.6204309349749904, + "flos": 484709902848.0, + "grad_norm": 0.05879630449885449, + "language_loss": 0.87538344, + "learning_rate": 0.0003325729571996109, + "loss": 0.88616055, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.27368164, + "step": 3225, + "time_per_iteration": 2.6219499111175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078793, + "balance_loss_mlp": 1.04980159, + "epoch": 0.6206233166602539, + "flos": 583768014336.0, + "grad_norm": 0.06449737595715416, + "language_loss": 0.83818585, + "learning_rate": 0.000332279432748584, + "loss": 0.84897381, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.28955078, + "step": 3226, + "time_per_iteration": 2.7298083305358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082841, + "balance_loss_mlp": 1.054636, + "epoch": 0.6208156983455175, + "flos": 476669657088.0, + "grad_norm": 0.05904408165059124, + "language_loss": 0.87270737, + "learning_rate": 0.00033198597341245576, + "loss": 0.88353574, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.28222656, + "step": 3227, + "time_per_iteration": 2.5691256523132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108032, + "balance_loss_mlp": 1.05151939, + "epoch": 0.6210080800307811, + "flos": 788716887552.0, + "grad_norm": 0.053113519370634896, + "language_loss": 0.81682974, + "learning_rate": 0.00033169257930515763, + "loss": 0.82763296, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.2878418, + "step": 3228, + "time_per_iteration": 3.0353121757507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084583, + "balance_loss_mlp": 1.05587709, + "epoch": 0.6212004617160446, + "flos": 607514812416.0, + "grad_norm": 0.059839903219207714, + "language_loss": 0.82242584, + "learning_rate": 0.0003313992505405951, + "loss": 0.83327174, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.28686523, + "step": 3229, + "time_per_iteration": 2.720705270767212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075139, + "balance_loss_mlp": 1.04743469, + "epoch": 0.6213928434013082, + "flos": 586248786432.0, + "grad_norm": 0.0642388463301134, + "language_loss": 0.80858111, + "learning_rate": 0.0003311059872326487, + "loss": 0.81933248, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.27709961, + "step": 3230, + "time_per_iteration": 2.6720995903015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081662, + "balance_loss_mlp": 1.05352879, + "epoch": 0.6215852250865718, + "flos": 535819083264.0, + "grad_norm": 0.049445896607163295, + "language_loss": 0.78987181, + "learning_rate": 0.0003308127894951734, + "loss": 0.80068845, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.28149414, + "step": 3231, + "time_per_iteration": 2.63030743598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107994, + "balance_loss_mlp": 1.05214071, + "epoch": 0.6217776067718354, + "flos": 617884471296.0, + "grad_norm": 0.07248200651444572, + "language_loss": 0.86507577, + "learning_rate": 0.00033051965744199834, + "loss": 0.87587512, + "num_input_tokens_seen": 269498784, + "router_z_loss_mlp": 0.27832031, + "step": 3232, + "time_per_iteration": 2.7564406394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081036, + "balance_loss_mlp": 1.05302238, + "epoch": 0.6219699884570988, + "flos": 545570311680.0, + "grad_norm": 0.05351658478199456, + "language_loss": 0.90184295, + "learning_rate": 0.0003302265911869276, + "loss": 0.91265333, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.28051758, + "step": 3233, + "time_per_iteration": 2.9271633625030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081705, + "balance_loss_mlp": 1.05373812, + "epoch": 0.6221623701423624, + "flos": 480899289600.0, + "grad_norm": 0.056002159029406404, + "language_loss": 0.84084082, + "learning_rate": 0.0003299335908437397, + "loss": 0.85165787, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.2800293, + "step": 3234, + "time_per_iteration": 2.5909643173217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080844, + "balance_loss_mlp": 1.05228114, + "epoch": 0.622354751827626, + "flos": 379812920832.0, + "grad_norm": 0.06942928938800572, + "language_loss": 0.79645211, + "learning_rate": 0.0003296406565261873, + "loss": 0.80726051, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.28564453, + "step": 3235, + "time_per_iteration": 2.5319809913635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107822, + "balance_loss_mlp": 1.04927599, + "epoch": 0.6225471335128896, + "flos": 667570678272.0, + "grad_norm": 0.04882824212942084, + "language_loss": 0.8475616, + "learning_rate": 0.0003293477883479978, + "loss": 0.85834384, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.28955078, + "step": 3236, + "time_per_iteration": 2.8348751068115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079457, + "balance_loss_mlp": 1.05110943, + "epoch": 0.6227395151981532, + "flos": 770995224576.0, + "grad_norm": 0.06517457110491971, + "language_loss": 0.79784298, + "learning_rate": 0.0003290549864228727, + "loss": 0.80863756, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.28369141, + "step": 3237, + "time_per_iteration": 2.9205360412597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078288, + "balance_loss_mlp": 1.04934406, + "epoch": 0.6229318968834167, + "flos": 484104619008.0, + "grad_norm": 0.05190818630751583, + "language_loss": 0.86413801, + "learning_rate": 0.0003287622508644875, + "loss": 0.8749209, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.28930664, + "step": 3238, + "time_per_iteration": 2.7504210472106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107621, + "balance_loss_mlp": 1.04736114, + "epoch": 0.6231242785686802, + "flos": 462700380672.0, + "grad_norm": 0.06410601543922713, + "language_loss": 0.8596704, + "learning_rate": 0.0003284695817864923, + "loss": 0.8704325, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.28808594, + "step": 3239, + "time_per_iteration": 2.487185001373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082288, + "balance_loss_mlp": 1.0541544, + "epoch": 0.6233166602539438, + "flos": 608809747968.0, + "grad_norm": 0.07028564715864687, + "language_loss": 0.83921337, + "learning_rate": 0.0003281769793025116, + "loss": 0.85003626, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.28149414, + "step": 3240, + "time_per_iteration": 2.7399847507476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107903, + "balance_loss_mlp": 1.05106378, + "epoch": 0.6235090419392074, + "flos": 438972521472.0, + "grad_norm": 0.06749958965512537, + "language_loss": 0.89295518, + "learning_rate": 0.00032788444352614346, + "loss": 0.90374541, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.27978516, + "step": 3241, + "time_per_iteration": 2.550497531890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079451, + "balance_loss_mlp": 1.05055451, + "epoch": 0.6237014236244709, + "flos": 504656262144.0, + "grad_norm": 0.05896628136636162, + "language_loss": 0.80561244, + "learning_rate": 0.0003275919745709606, + "loss": 0.81640697, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.28881836, + "step": 3242, + "time_per_iteration": 2.5805697441101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107483, + "balance_loss_mlp": 1.0460763, + "epoch": 0.6238938053097345, + "flos": 512648455680.0, + "grad_norm": 0.058276556279693525, + "language_loss": 0.8216207, + "learning_rate": 0.00032729957255050936, + "loss": 0.83236909, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.28759766, + "step": 3243, + "time_per_iteration": 2.6520867347717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075457, + "balance_loss_mlp": 1.0462271, + "epoch": 0.6240861869949981, + "flos": 736435017216.0, + "grad_norm": 0.0677841364318074, + "language_loss": 0.81232285, + "learning_rate": 0.0003270072375783102, + "loss": 0.82307744, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.29174805, + "step": 3244, + "time_per_iteration": 2.8922722339630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079597, + "balance_loss_mlp": 1.05098701, + "epoch": 0.6242785686802617, + "flos": 494464103424.0, + "grad_norm": 0.055818323982708785, + "language_loss": 0.7931875, + "learning_rate": 0.00032671496976785774, + "loss": 0.80398345, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.28613281, + "step": 3245, + "time_per_iteration": 2.6470372676849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071869, + "balance_loss_mlp": 1.04359281, + "epoch": 0.6244709503655252, + "flos": 745500976128.0, + "grad_norm": 0.04960718098470409, + "language_loss": 0.75533414, + "learning_rate": 0.0003264227692326205, + "loss": 0.76605284, + "num_input_tokens_seen": 270501680, + "router_z_loss_mlp": 0.28295898, + "step": 3246, + "time_per_iteration": 3.0302975177764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079718, + "balance_loss_mlp": 1.05010653, + "epoch": 0.6246633320507887, + "flos": 492366034944.0, + "grad_norm": 0.054579168692914876, + "language_loss": 0.85738158, + "learning_rate": 0.00032613063608604055, + "loss": 0.86817873, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.29589844, + "step": 3247, + "time_per_iteration": 2.529571771621704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080009, + "balance_loss_mlp": 1.05147064, + "epoch": 0.6248557137360523, + "flos": 517142928384.0, + "grad_norm": 0.054889772992989326, + "language_loss": 0.8363654, + "learning_rate": 0.0003258385704415343, + "loss": 0.84716547, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.28540039, + "step": 3248, + "time_per_iteration": 2.590259313583374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076974, + "balance_loss_mlp": 1.04745758, + "epoch": 0.6250480954213159, + "flos": 519098402304.0, + "grad_norm": 0.0554200225727057, + "language_loss": 0.82566541, + "learning_rate": 0.0003255465724124915, + "loss": 0.8364352, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.29492188, + "step": 3249, + "time_per_iteration": 2.6928865909576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079488, + "balance_loss_mlp": 1.05044842, + "epoch": 0.6252404771065795, + "flos": 515808705024.0, + "grad_norm": 0.051820175568143126, + "language_loss": 0.82984078, + "learning_rate": 0.00032525464211227587, + "loss": 0.84063572, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.2902832, + "step": 3250, + "time_per_iteration": 2.5911831855773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080369, + "balance_loss_mlp": 1.0519259, + "epoch": 0.6254328587918431, + "flos": 576647354880.0, + "grad_norm": 0.05767056492483943, + "language_loss": 0.85669184, + "learning_rate": 0.0003249627796542249, + "loss": 0.86749554, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.28442383, + "step": 3251, + "time_per_iteration": 2.6558287143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073928, + "balance_loss_mlp": 1.04481697, + "epoch": 0.6256252404771065, + "flos": 597638366208.0, + "grad_norm": 0.0553994194583659, + "language_loss": 0.84238529, + "learning_rate": 0.00032467098515164943, + "loss": 0.85312456, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.29077148, + "step": 3252, + "time_per_iteration": 2.8710081577301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010798, + "balance_loss_mlp": 1.04992628, + "epoch": 0.6258176221623701, + "flos": 508034709504.0, + "grad_norm": 0.0724295756751151, + "language_loss": 0.83990276, + "learning_rate": 0.00032437925871783456, + "loss": 0.85070074, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.2980957, + "step": 3253, + "time_per_iteration": 2.680757761001587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077647, + "balance_loss_mlp": 1.04824996, + "epoch": 0.6260100038476337, + "flos": 639357110784.0, + "grad_norm": 0.06297548912406484, + "language_loss": 0.84215987, + "learning_rate": 0.00032408760046603803, + "loss": 0.85293639, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.29370117, + "step": 3254, + "time_per_iteration": 2.8605175018310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071718, + "balance_loss_mlp": 1.04308379, + "epoch": 0.6262023855328973, + "flos": 840648139776.0, + "grad_norm": 0.06707664571923276, + "language_loss": 0.77650177, + "learning_rate": 0.00032379601050949193, + "loss": 0.78721887, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.28613281, + "step": 3255, + "time_per_iteration": 3.0878231525421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107032, + "balance_loss_mlp": 1.04125643, + "epoch": 0.6263947672181608, + "flos": 521884712448.0, + "grad_norm": 0.055802614278498724, + "language_loss": 0.8790136, + "learning_rate": 0.0003235044889614013, + "loss": 0.8897168, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.29052734, + "step": 3256, + "time_per_iteration": 2.5939788818359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072592, + "balance_loss_mlp": 1.04302788, + "epoch": 0.6265871489034244, + "flos": 606747995136.0, + "grad_norm": 0.05515134857427489, + "language_loss": 0.83577603, + "learning_rate": 0.0003232130359349451, + "loss": 0.84650195, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.29541016, + "step": 3257, + "time_per_iteration": 2.8894662857055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068118, + "balance_loss_mlp": 1.03752887, + "epoch": 0.626779530588688, + "flos": 588208642560.0, + "grad_norm": 0.05130373708668117, + "language_loss": 0.81576669, + "learning_rate": 0.0003229216515432751, + "loss": 0.82644784, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.30566406, + "step": 3258, + "time_per_iteration": 2.756706476211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076382, + "balance_loss_mlp": 1.04586434, + "epoch": 0.6269719122739515, + "flos": 438381794304.0, + "grad_norm": 0.06660247735864482, + "language_loss": 0.79725903, + "learning_rate": 0.0003226303358995174, + "loss": 0.80802286, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.3046875, + "step": 3259, + "time_per_iteration": 2.67144775390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077975, + "balance_loss_mlp": 1.04760051, + "epoch": 0.6271642939592151, + "flos": 562590738432.0, + "grad_norm": 0.05404958184745656, + "language_loss": 0.88993442, + "learning_rate": 0.00032233908911677, + "loss": 0.90071416, + "num_input_tokens_seen": 271526768, + "router_z_loss_mlp": 0.30322266, + "step": 3260, + "time_per_iteration": 2.863938808441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073635, + "balance_loss_mlp": 1.0435462, + "epoch": 0.6273566756444786, + "flos": 514288217088.0, + "grad_norm": 0.053449532753106085, + "language_loss": 0.80614489, + "learning_rate": 0.0003220479113081053, + "loss": 0.81688124, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.30053711, + "step": 3261, + "time_per_iteration": 2.7604382038116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106913, + "balance_loss_mlp": 1.03846908, + "epoch": 0.6275490573297422, + "flos": 585195369984.0, + "grad_norm": 0.08212493062436176, + "language_loss": 0.78586102, + "learning_rate": 0.00032175680258656836, + "loss": 0.7965523, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.30615234, + "step": 3262, + "time_per_iteration": 2.6967196464538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071974, + "balance_loss_mlp": 1.04190898, + "epoch": 0.6277414390150058, + "flos": 559143889920.0, + "grad_norm": 0.05356215085141381, + "language_loss": 0.79812634, + "learning_rate": 0.00032146576306517794, + "loss": 0.80884606, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.30029297, + "step": 3263, + "time_per_iteration": 2.8093175888061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078395, + "balance_loss_mlp": 1.04866421, + "epoch": 0.6279338207002694, + "flos": 612423922176.0, + "grad_norm": 0.0554541143403023, + "language_loss": 0.80460787, + "learning_rate": 0.0003211747928569255, + "loss": 0.81539178, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.296875, + "step": 3264, + "time_per_iteration": 2.760589122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076242, + "balance_loss_mlp": 1.04741764, + "epoch": 0.6281262023855329, + "flos": 625374687744.0, + "grad_norm": 0.05014640017162604, + "language_loss": 0.81306803, + "learning_rate": 0.0003208838920747754, + "loss": 0.82383049, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.28833008, + "step": 3265, + "time_per_iteration": 2.8798112869262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072039, + "balance_loss_mlp": 1.04342878, + "epoch": 0.6283185840707964, + "flos": 1123147579392.0, + "grad_norm": 0.0653184175681376, + "language_loss": 0.7620573, + "learning_rate": 0.0003205930608316656, + "loss": 0.77277768, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.28588867, + "step": 3266, + "time_per_iteration": 3.571838140487671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106721, + "balance_loss_mlp": 1.03900564, + "epoch": 0.62851096575606, + "flos": 514967694336.0, + "grad_norm": 0.0645756575705021, + "language_loss": 0.84763867, + "learning_rate": 0.00032030229924050673, + "loss": 0.85831082, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.2824707, + "step": 3267, + "time_per_iteration": 2.6483044624328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076916, + "balance_loss_mlp": 1.04732847, + "epoch": 0.6287033474413236, + "flos": 403949624832.0, + "grad_norm": 0.056929311189361634, + "language_loss": 0.79781055, + "learning_rate": 0.00032001160741418247, + "loss": 0.8085798, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.2956543, + "step": 3268, + "time_per_iteration": 2.6264944076538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_mlp": 1.04559875, + "epoch": 0.6288957291265872, + "flos": 525459598848.0, + "grad_norm": 0.06099991776651708, + "language_loss": 0.82100242, + "learning_rate": 0.0003197209854655494, + "loss": 0.83175737, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.29833984, + "step": 3269, + "time_per_iteration": 2.704279661178589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073063, + "balance_loss_mlp": 1.04454803, + "epoch": 0.6290881108118507, + "flos": 603414627840.0, + "grad_norm": 0.06377784920568129, + "language_loss": 0.74516416, + "learning_rate": 0.0003194304335074371, + "loss": 0.75589478, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.28515625, + "step": 3270, + "time_per_iteration": 2.82635235786438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072113, + "balance_loss_mlp": 1.04281116, + "epoch": 0.6292804924971143, + "flos": 437446241280.0, + "grad_norm": 0.054968431789037576, + "language_loss": 0.88535178, + "learning_rate": 0.0003191399516526475, + "loss": 0.89607286, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.29272461, + "step": 3271, + "time_per_iteration": 2.4927825927734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074321, + "balance_loss_mlp": 1.04575849, + "epoch": 0.6294728741823779, + "flos": 606368263680.0, + "grad_norm": 0.05221826851343204, + "language_loss": 0.79470003, + "learning_rate": 0.0003188495400139559, + "loss": 0.80544329, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.28540039, + "step": 3272, + "time_per_iteration": 2.764953851699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071949, + "balance_loss_mlp": 1.04312468, + "epoch": 0.6296652558676414, + "flos": 701220063744.0, + "grad_norm": 0.060799032420417454, + "language_loss": 0.84558678, + "learning_rate": 0.00031855919870411013, + "loss": 0.85630625, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.28808594, + "step": 3273, + "time_per_iteration": 2.823537588119507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071632, + "balance_loss_mlp": 1.04213953, + "epoch": 0.6298576375529049, + "flos": 523652511744.0, + "grad_norm": 0.05430009118151755, + "language_loss": 0.84791374, + "learning_rate": 0.0003182689278358305, + "loss": 0.85863006, + "num_input_tokens_seen": 272562992, + "router_z_loss_mlp": 0.29443359, + "step": 3274, + "time_per_iteration": 2.6649551391601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073347, + "balance_loss_mlp": 1.04416466, + "epoch": 0.6300500192381685, + "flos": 475723929600.0, + "grad_norm": 0.085227141064307, + "language_loss": 0.79910004, + "learning_rate": 0.0003179787275218105, + "loss": 0.80983347, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.29174805, + "step": 3275, + "time_per_iteration": 2.563103437423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074447, + "balance_loss_mlp": 1.0460037, + "epoch": 0.6302424009234321, + "flos": 520629064704.0, + "grad_norm": 0.07197275527111574, + "language_loss": 0.84121722, + "learning_rate": 0.0003176885978747155, + "loss": 0.85196167, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.28466797, + "step": 3276, + "time_per_iteration": 2.634556293487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076833, + "balance_loss_mlp": 1.04807937, + "epoch": 0.6304347826086957, + "flos": 694282696704.0, + "grad_norm": 0.05534578709936448, + "language_loss": 0.82750475, + "learning_rate": 0.0003173985390071839, + "loss": 0.83827305, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.28735352, + "step": 3277, + "time_per_iteration": 2.8998594284057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018796, + "balance_loss_mlp": 1.0069232, + "epoch": 0.6306271642939593, + "flos": 1466067755520.0, + "grad_norm": 0.01138839518784329, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.78919256, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.11865234, + "step": 3278, + "time_per_iteration": 4.791780233383179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076998, + "balance_loss_mlp": 1.04678988, + "epoch": 0.6308195459792227, + "flos": 601444597248.0, + "grad_norm": 0.07347882473000023, + "language_loss": 0.81146979, + "learning_rate": 0.00031681863406122704, + "loss": 0.82223976, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.30151367, + "step": 3279, + "time_per_iteration": 2.7681593894958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077607, + "balance_loss_mlp": 1.0484724, + "epoch": 0.6310119276644863, + "flos": 726514900992.0, + "grad_norm": 0.0604928742924753, + "language_loss": 0.85127562, + "learning_rate": 0.00031652878820794087, + "loss": 0.86205173, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.29101562, + "step": 3280, + "time_per_iteration": 2.9940550327301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078985, + "balance_loss_mlp": 1.04970694, + "epoch": 0.6312043093497499, + "flos": 519482515968.0, + "grad_norm": 0.06373938844251871, + "language_loss": 0.85768282, + "learning_rate": 0.00031623901358449627, + "loss": 0.86847264, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.29223633, + "step": 3281, + "time_per_iteration": 2.637016773223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080918, + "balance_loss_mlp": 1.05206895, + "epoch": 0.6313966910350135, + "flos": 530934704640.0, + "grad_norm": 0.0651224667912018, + "language_loss": 0.88407606, + "learning_rate": 0.0003159493103033936, + "loss": 0.89488524, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.28857422, + "step": 3282, + "time_per_iteration": 2.6074159145355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025661, + "balance_loss_mlp": 1.0136919, + "epoch": 0.631589072720277, + "flos": 1379113606656.0, + "grad_norm": 0.014583316572648261, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.80944717, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.11962891, + "step": 3283, + "time_per_iteration": 4.897862195968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081353, + "balance_loss_mlp": 1.05183721, + "epoch": 0.6317814544055406, + "flos": 624379497984.0, + "grad_norm": 0.07926250214207341, + "language_loss": 0.82117367, + "learning_rate": 0.0003153701182180776, + "loss": 0.83198726, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.29443359, + "step": 3284, + "time_per_iteration": 2.773768186569214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085168, + "balance_loss_mlp": 1.05653346, + "epoch": 0.6319738360908042, + "flos": 497876046336.0, + "grad_norm": 0.06299610541065176, + "language_loss": 0.81832671, + "learning_rate": 0.00031508062963872655, + "loss": 0.82917833, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.28613281, + "step": 3285, + "time_per_iteration": 2.5745344161987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083768, + "balance_loss_mlp": 1.05484831, + "epoch": 0.6321662177760677, + "flos": 579474362880.0, + "grad_norm": 0.0675003916655452, + "language_loss": 0.7940349, + "learning_rate": 0.0003147912128514423, + "loss": 0.80487257, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.2890625, + "step": 3286, + "time_per_iteration": 2.736119508743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088711, + "balance_loss_mlp": 1.05976713, + "epoch": 0.6323585994613313, + "flos": 601207460352.0, + "grad_norm": 0.055334521213686955, + "language_loss": 0.87346876, + "learning_rate": 0.0003145018679685859, + "loss": 0.88435584, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.28881836, + "step": 3287, + "time_per_iteration": 2.747880697250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083794, + "balance_loss_mlp": 1.05515993, + "epoch": 0.6325509811465948, + "flos": 528261875712.0, + "grad_norm": 0.049981399044418943, + "language_loss": 0.8773675, + "learning_rate": 0.00031421259510249134, + "loss": 0.88820541, + "num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.28637695, + "step": 3288, + "time_per_iteration": 2.828601121902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087286, + "balance_loss_mlp": 1.05898595, + "epoch": 0.6327433628318584, + "flos": 573993464832.0, + "grad_norm": 0.05983667283250032, + "language_loss": 0.81054246, + "learning_rate": 0.00031392339436546414, + "loss": 0.82141531, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.28295898, + "step": 3289, + "time_per_iteration": 2.8950355052948 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083488, + "balance_loss_mlp": 1.05599856, + "epoch": 0.632935744517122, + "flos": 516833008128.0, + "grad_norm": 0.08046321176630551, + "language_loss": 0.83522916, + "learning_rate": 0.00031363426586978205, + "loss": 0.84606409, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.27539062, + "step": 3290, + "time_per_iteration": 2.842975378036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079426, + "balance_loss_mlp": 1.05234218, + "epoch": 0.6331281262023856, + "flos": 617180262912.0, + "grad_norm": 0.06320614545402135, + "language_loss": 0.84556788, + "learning_rate": 0.0003133452097276947, + "loss": 0.85636216, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.27148438, + "step": 3291, + "time_per_iteration": 2.7399022579193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079638, + "balance_loss_mlp": 1.05174291, + "epoch": 0.633320507887649, + "flos": 592665237504.0, + "grad_norm": 0.05133484594344534, + "language_loss": 0.83828831, + "learning_rate": 0.0003130562260514238, + "loss": 0.84908473, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.27929688, + "step": 3292, + "time_per_iteration": 2.782712936401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074308, + "balance_loss_mlp": 1.04538822, + "epoch": 0.6335128895729126, + "flos": 582064233984.0, + "grad_norm": 0.05681875015952551, + "language_loss": 0.81639814, + "learning_rate": 0.0003127673149531626, + "loss": 0.82714117, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.28881836, + "step": 3293, + "time_per_iteration": 2.8035476207733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072918, + "balance_loss_mlp": 1.04454613, + "epoch": 0.6337052712581762, + "flos": 452803585536.0, + "grad_norm": 0.24840448660881664, + "language_loss": 0.82970059, + "learning_rate": 0.0003124784765450762, + "loss": 0.84042978, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.28393555, + "step": 3294, + "time_per_iteration": 2.608938694000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077527, + "balance_loss_mlp": 1.04877377, + "epoch": 0.6338976529434398, + "flos": 573132105216.0, + "grad_norm": 0.05797118879251517, + "language_loss": 0.80332613, + "learning_rate": 0.0003121897109393017, + "loss": 0.81410146, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.28759766, + "step": 3295, + "time_per_iteration": 2.806485414505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075453, + "balance_loss_mlp": 1.04710555, + "epoch": 0.6340900346287034, + "flos": 508497398784.0, + "grad_norm": 0.05731717325491985, + "language_loss": 0.89463425, + "learning_rate": 0.0003119010182479481, + "loss": 0.90538877, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.28344727, + "step": 3296, + "time_per_iteration": 2.6082053184509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072414, + "balance_loss_mlp": 1.04430485, + "epoch": 0.6342824163139669, + "flos": 479505429504.0, + "grad_norm": 0.05711828874106615, + "language_loss": 0.82742012, + "learning_rate": 0.00031161239858309563, + "loss": 0.8381443, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.28149414, + "step": 3297, + "time_per_iteration": 2.563567638397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076965, + "balance_loss_mlp": 1.04818797, + "epoch": 0.6344747979992305, + "flos": 571762976256.0, + "grad_norm": 0.06150807271743663, + "language_loss": 0.8330332, + "learning_rate": 0.0003113238520567964, + "loss": 0.84380281, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.28759766, + "step": 3298, + "time_per_iteration": 2.6396591663360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075264, + "balance_loss_mlp": 1.04760718, + "epoch": 0.634667179684494, + "flos": 605629149696.0, + "grad_norm": 0.06211731206435071, + "language_loss": 0.81525218, + "learning_rate": 0.00031103537878107403, + "loss": 0.8260048, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.27709961, + "step": 3299, + "time_per_iteration": 2.7182040214538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076081, + "balance_loss_mlp": 1.04813862, + "epoch": 0.6348595613697576, + "flos": 646649478144.0, + "grad_norm": 0.09008856802474977, + "language_loss": 0.80391061, + "learning_rate": 0.0003107469788679238, + "loss": 0.81467146, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.27978516, + "step": 3300, + "time_per_iteration": 2.7851805686950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075354, + "balance_loss_mlp": 1.04688656, + "epoch": 0.6350519430550212, + "flos": 638776558080.0, + "grad_norm": 0.05422740840370266, + "language_loss": 0.86501485, + "learning_rate": 0.00031045865242931267, + "loss": 0.87576842, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.28466797, + "step": 3301, + "time_per_iteration": 2.810676097869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075733, + "balance_loss_mlp": 1.04755139, + "epoch": 0.6352443247402847, + "flos": 686091091968.0, + "grad_norm": 0.05423287831049679, + "language_loss": 0.82804501, + "learning_rate": 0.00031017039957717877, + "loss": 0.83880234, + "num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.28149414, + "step": 3302, + "time_per_iteration": 3.0281054973602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074651, + "balance_loss_mlp": 1.0450151, + "epoch": 0.6354367064255483, + "flos": 559173003264.0, + "grad_norm": 0.05349883160058106, + "language_loss": 0.88460255, + "learning_rate": 0.0003098822204234318, + "loss": 0.89534903, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.29589844, + "step": 3303, + "time_per_iteration": 2.666997194290161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075345, + "balance_loss_mlp": 1.04713964, + "epoch": 0.6356290881108119, + "flos": 979095582720.0, + "grad_norm": 0.06555082687836872, + "language_loss": 0.87261242, + "learning_rate": 0.00030959411507995273, + "loss": 0.88336587, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.2824707, + "step": 3304, + "time_per_iteration": 3.197598457336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075163, + "balance_loss_mlp": 1.04662395, + "epoch": 0.6358214697960755, + "flos": 528005799936.0, + "grad_norm": 0.0641703169727953, + "language_loss": 0.81063581, + "learning_rate": 0.00030930608365859407, + "loss": 0.82138741, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.28540039, + "step": 3305, + "time_per_iteration": 2.6621649265289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074885, + "balance_loss_mlp": 1.04713345, + "epoch": 0.6360138514813389, + "flos": 516547819008.0, + "grad_norm": 0.049948399084256474, + "language_loss": 0.87610269, + "learning_rate": 0.00030901812627117943, + "loss": 0.88685155, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.27783203, + "step": 3306, + "time_per_iteration": 2.612919807434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077235, + "balance_loss_mlp": 1.04826725, + "epoch": 0.6362062331666025, + "flos": 466289823744.0, + "grad_norm": 0.06317558416619916, + "language_loss": 0.84607321, + "learning_rate": 0.000308730243029504, + "loss": 0.85684562, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.28955078, + "step": 3307, + "time_per_iteration": 2.5705294609069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072567, + "balance_loss_mlp": 1.04307485, + "epoch": 0.6363986148518661, + "flos": 549458090496.0, + "grad_norm": 0.05685632301598214, + "language_loss": 0.79783237, + "learning_rate": 0.0003084424340453339, + "loss": 0.80855805, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.29443359, + "step": 3308, + "time_per_iteration": 2.807271957397461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010765, + "balance_loss_mlp": 1.04784167, + "epoch": 0.6365909965371297, + "flos": 582772824576.0, + "grad_norm": 0.05758668896734757, + "language_loss": 0.81629676, + "learning_rate": 0.0003081546994304064, + "loss": 0.82706171, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.28637695, + "step": 3309, + "time_per_iteration": 2.7554562091827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076484, + "balance_loss_mlp": 1.04794574, + "epoch": 0.6367833782223933, + "flos": 530998723584.0, + "grad_norm": 0.06449450681570038, + "language_loss": 0.81813806, + "learning_rate": 0.0003078670392964298, + "loss": 0.82890296, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.28540039, + "step": 3310, + "time_per_iteration": 2.5969130992889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075995, + "balance_loss_mlp": 1.04721737, + "epoch": 0.6369757599076568, + "flos": 569237124096.0, + "grad_norm": 0.05473972875900602, + "language_loss": 0.82840186, + "learning_rate": 0.00030757945375508406, + "loss": 0.83916187, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.28759766, + "step": 3311, + "time_per_iteration": 2.663797616958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074659, + "balance_loss_mlp": 1.04507077, + "epoch": 0.6371681415929203, + "flos": 539684951040.0, + "grad_norm": 0.0598003061946429, + "language_loss": 0.8103205, + "learning_rate": 0.00030729194291801944, + "loss": 0.82106709, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.2956543, + "step": 3312, + "time_per_iteration": 2.6541266441345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070179, + "balance_loss_mlp": 1.04099667, + "epoch": 0.6373605232781839, + "flos": 483326217216.0, + "grad_norm": 0.06742420261969287, + "language_loss": 0.77177984, + "learning_rate": 0.00030700450689685787, + "loss": 0.78248155, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.29174805, + "step": 3313, + "time_per_iteration": 2.5699706077575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071745, + "balance_loss_mlp": 1.0428009, + "epoch": 0.6375529049634475, + "flos": 578273969664.0, + "grad_norm": 0.04829069116986981, + "language_loss": 0.85252231, + "learning_rate": 0.00030671714580319186, + "loss": 0.86323977, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.28930664, + "step": 3314, + "time_per_iteration": 2.840120553970337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075513, + "balance_loss_mlp": 1.04618776, + "epoch": 0.637745286648711, + "flos": 681953181696.0, + "grad_norm": 0.06110269335032462, + "language_loss": 0.83013022, + "learning_rate": 0.0003064298597485846, + "loss": 0.84088534, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.29296875, + "step": 3315, + "time_per_iteration": 2.852611541748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073342, + "balance_loss_mlp": 1.04463601, + "epoch": 0.6379376683339746, + "flos": 504385629696.0, + "grad_norm": 0.058531862616109036, + "language_loss": 0.83941239, + "learning_rate": 0.00030614264884457054, + "loss": 0.85014582, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.28710938, + "step": 3316, + "time_per_iteration": 2.636786699295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107333, + "balance_loss_mlp": 1.04429102, + "epoch": 0.6381300500192382, + "flos": 501771027456.0, + "grad_norm": 0.06311790142040714, + "language_loss": 0.7747215, + "learning_rate": 0.000305855513202655, + "loss": 0.78545475, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.2902832, + "step": 3317, + "time_per_iteration": 2.572878837585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073954, + "balance_loss_mlp": 1.04491496, + "epoch": 0.6383224317045018, + "flos": 400271431680.0, + "grad_norm": 0.06648512772878035, + "language_loss": 0.77336514, + "learning_rate": 0.0003055684529343138, + "loss": 0.7841047, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.29052734, + "step": 3318, + "time_per_iteration": 2.4436564445495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072959, + "balance_loss_mlp": 1.04427767, + "epoch": 0.6385148133897653, + "flos": 499131694080.0, + "grad_norm": 0.17585576995025723, + "language_loss": 0.78666025, + "learning_rate": 0.00030528146815099374, + "loss": 0.79738986, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.28686523, + "step": 3319, + "time_per_iteration": 2.633169174194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073512, + "balance_loss_mlp": 1.04463935, + "epoch": 0.6387071950750288, + "flos": 527409280512.0, + "grad_norm": 0.05914219973016666, + "language_loss": 0.72023094, + "learning_rate": 0.00030499455896411203, + "loss": 0.73096609, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.28881836, + "step": 3320, + "time_per_iteration": 2.6515796184539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064633, + "balance_loss_mlp": 1.05213952, + "epoch": 0.6388995767602924, + "flos": 1455200501760.0, + "grad_norm": 0.030989551650608328, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77365446, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.125, + "step": 3321, + "time_per_iteration": 4.949177980422974 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077103, + "balance_loss_mlp": 1.04768264, + "epoch": 0.639091958445556, + "flos": 603577571328.0, + "grad_norm": 0.05124764901012802, + "language_loss": 0.76538706, + "learning_rate": 0.0003044209678251865, + "loss": 0.77615809, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.29370117, + "step": 3322, + "time_per_iteration": 2.8691534996032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082016, + "balance_loss_mlp": 1.05257154, + "epoch": 0.6392843401308196, + "flos": 584230703616.0, + "grad_norm": 0.052110264896392484, + "language_loss": 0.84702694, + "learning_rate": 0.0003041342860958306, + "loss": 0.85784709, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.29443359, + "step": 3323, + "time_per_iteration": 2.764293670654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080288, + "balance_loss_mlp": 1.0508672, + "epoch": 0.6394767218160831, + "flos": 514420637184.0, + "grad_norm": 0.06415760622420662, + "language_loss": 0.91791111, + "learning_rate": 0.00030384768040828857, + "loss": 0.92871398, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.29418945, + "step": 3324, + "time_per_iteration": 2.676239252090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083947, + "balance_loss_mlp": 1.05457401, + "epoch": 0.6396691035013466, + "flos": 541471689216.0, + "grad_norm": 0.06537046066409105, + "language_loss": 0.85248572, + "learning_rate": 0.00030356115087383094, + "loss": 0.86332518, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.29321289, + "step": 3325, + "time_per_iteration": 2.6422836780548096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108456, + "balance_loss_mlp": 1.05523491, + "epoch": 0.6398614851866102, + "flos": 525282098688.0, + "grad_norm": 0.07261726527326764, + "language_loss": 0.85094643, + "learning_rate": 0.00030327469760369803, + "loss": 0.86179203, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.29345703, + "step": 3326, + "time_per_iteration": 2.618764877319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078424, + "balance_loss_mlp": 1.04943204, + "epoch": 0.6400538668718738, + "flos": 622704830976.0, + "grad_norm": 0.06406701351791282, + "language_loss": 0.85019833, + "learning_rate": 0.0003029883207091009, + "loss": 0.86098254, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.28979492, + "step": 3327, + "time_per_iteration": 2.699650764465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078628, + "balance_loss_mlp": 1.04961252, + "epoch": 0.6402462485571374, + "flos": 503096486400.0, + "grad_norm": 0.0560194788269582, + "language_loss": 0.77876812, + "learning_rate": 0.00030270202030122095, + "loss": 0.78955448, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.29003906, + "step": 3328, + "time_per_iteration": 2.6756327152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079179, + "balance_loss_mlp": 1.04994857, + "epoch": 0.6404386302424009, + "flos": 818894693376.0, + "grad_norm": 0.07533630521216038, + "language_loss": 0.86165637, + "learning_rate": 0.00030241579649121, + "loss": 0.87244821, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.29199219, + "step": 3329, + "time_per_iteration": 2.988523244857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081549, + "balance_loss_mlp": 1.05286741, + "epoch": 0.6406310119276645, + "flos": 471568490496.0, + "grad_norm": 0.06215732096136448, + "language_loss": 0.79335475, + "learning_rate": 0.00030212964939018994, + "loss": 0.80417025, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.28662109, + "step": 3330, + "time_per_iteration": 2.536287307739258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079251, + "balance_loss_mlp": 1.05035472, + "epoch": 0.6408233936129281, + "flos": 425358245376.0, + "grad_norm": 0.05674161193515711, + "language_loss": 0.85566485, + "learning_rate": 0.0003018435791092527, + "loss": 0.86645734, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.28857422, + "step": 3331, + "time_per_iteration": 2.4944264888763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079855, + "balance_loss_mlp": 1.05191207, + "epoch": 0.6410157752981916, + "flos": 549522109440.0, + "grad_norm": 0.05931339185061419, + "language_loss": 0.80892223, + "learning_rate": 0.00030155758575946083, + "loss": 0.81972075, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.27954102, + "step": 3332, + "time_per_iteration": 2.6625006198883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077272, + "balance_loss_mlp": 1.04797006, + "epoch": 0.6412081569834551, + "flos": 475659910656.0, + "grad_norm": 0.054973078138002, + "language_loss": 0.83676195, + "learning_rate": 0.0003012716694518467, + "loss": 0.84753466, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.29272461, + "step": 3333, + "time_per_iteration": 2.5685575008392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077896, + "balance_loss_mlp": 1.04976213, + "epoch": 0.6414005386687187, + "flos": 540645235200.0, + "grad_norm": 0.06333005970855973, + "language_loss": 0.84833503, + "learning_rate": 0.000300985830297413, + "loss": 0.85911405, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.28149414, + "step": 3334, + "time_per_iteration": 2.7106077671051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077366, + "balance_loss_mlp": 1.04875624, + "epoch": 0.6415929203539823, + "flos": 1040909073408.0, + "grad_norm": 0.05617575604142134, + "language_loss": 0.87391257, + "learning_rate": 0.00030070006840713205, + "loss": 0.88468629, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.28613281, + "step": 3335, + "time_per_iteration": 3.390854835510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076603, + "balance_loss_mlp": 1.04868436, + "epoch": 0.6417853020392459, + "flos": 648028781568.0, + "grad_norm": 0.055765507063515254, + "language_loss": 0.73336351, + "learning_rate": 0.000300414383891947, + "loss": 0.74412954, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.27954102, + "step": 3336, + "time_per_iteration": 2.8184750080108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074814, + "balance_loss_mlp": 1.04713416, + "epoch": 0.6419776837245095, + "flos": 500639035392.0, + "grad_norm": 0.04865343351033758, + "language_loss": 0.88524318, + "learning_rate": 0.00030012877686276973, + "loss": 0.89599127, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.27709961, + "step": 3337, + "time_per_iteration": 2.693716049194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077745, + "balance_loss_mlp": 1.04925418, + "epoch": 0.642170065409773, + "flos": 620331747840.0, + "grad_norm": 0.05071900601819844, + "language_loss": 0.8653757, + "learning_rate": 0.0002998432474304832, + "loss": 0.87615323, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.28540039, + "step": 3338, + "time_per_iteration": 2.785625696182251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014062, + "balance_loss_mlp": 1.00228393, + "epoch": 0.6423624470950365, + "flos": 1422767476224.0, + "grad_norm": 0.008511369807607439, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.80251408, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.11767578, + "step": 3339, + "time_per_iteration": 4.914938688278198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072832, + "balance_loss_mlp": 1.04531896, + "epoch": 0.6425548287803001, + "flos": 562082969088.0, + "grad_norm": 0.04920072731588192, + "language_loss": 0.88676053, + "learning_rate": 0.00029927242179996107, + "loss": 0.89748889, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.27539062, + "step": 3340, + "time_per_iteration": 2.6910037994384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075049, + "balance_loss_mlp": 1.04691517, + "epoch": 0.6427472104655637, + "flos": 585151699968.0, + "grad_norm": 0.050397080981132346, + "language_loss": 0.83332348, + "learning_rate": 0.0002989871258233398, + "loss": 0.84407395, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.28149414, + "step": 3341, + "time_per_iteration": 2.7581868171691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082012, + "balance_loss_mlp": 1.05337822, + "epoch": 0.6429395921508272, + "flos": 404067488256.0, + "grad_norm": 0.07038127558443963, + "language_loss": 0.82547259, + "learning_rate": 0.0002987019078868373, + "loss": 0.83629274, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.28613281, + "step": 3342, + "time_per_iteration": 2.4203991889953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075866, + "balance_loss_mlp": 1.04792297, + "epoch": 0.6431319738360908, + "flos": 548522537472.0, + "grad_norm": 0.05404588481803156, + "language_loss": 0.81465191, + "learning_rate": 0.00029841676810118484, + "loss": 0.8254106, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.27978516, + "step": 3343, + "time_per_iteration": 2.665461778640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073266, + "balance_loss_mlp": 1.04489374, + "epoch": 0.6433243555213544, + "flos": 793044034560.0, + "grad_norm": 0.05709994868865375, + "language_loss": 0.8727839, + "learning_rate": 0.0002981317065770839, + "loss": 0.88351655, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.28344727, + "step": 3344, + "time_per_iteration": 3.0409646034240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074581, + "balance_loss_mlp": 1.04592359, + "epoch": 0.643516737206618, + "flos": 582762650112.0, + "grad_norm": 0.0669931178788996, + "language_loss": 0.80771047, + "learning_rate": 0.00029784672342520493, + "loss": 0.81845629, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.28662109, + "step": 3345, + "time_per_iteration": 2.69077730178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073767, + "balance_loss_mlp": 1.04541922, + "epoch": 0.6437091188918815, + "flos": 518501882880.0, + "grad_norm": 0.058634487951654345, + "language_loss": 0.83929563, + "learning_rate": 0.00029756181875618834, + "loss": 0.85003328, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.28369141, + "step": 3346, + "time_per_iteration": 2.5735673904418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107364, + "balance_loss_mlp": 1.04541159, + "epoch": 0.643901500577145, + "flos": 384736587264.0, + "grad_norm": 0.06920918115326812, + "language_loss": 0.83749354, + "learning_rate": 0.0002972769926806439, + "loss": 0.84823, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.28222656, + "step": 3347, + "time_per_iteration": 2.480320692062378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071427, + "balance_loss_mlp": 1.04248285, + "epoch": 0.6440938822624086, + "flos": 483478986240.0, + "grad_norm": 0.05946244063191617, + "language_loss": 0.88425148, + "learning_rate": 0.0002969922453091508, + "loss": 0.89496571, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.28930664, + "step": 3348, + "time_per_iteration": 2.5937469005584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107372, + "balance_loss_mlp": 1.04441822, + "epoch": 0.6442862639476722, + "flos": 540178163712.0, + "grad_norm": 0.04841561291850138, + "language_loss": 0.84831715, + "learning_rate": 0.00029670757675225777, + "loss": 0.85905439, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.29248047, + "step": 3349, + "time_per_iteration": 2.7379231452941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076606, + "balance_loss_mlp": 1.04754305, + "epoch": 0.6444786456329358, + "flos": 526651227648.0, + "grad_norm": 0.058104314548796505, + "language_loss": 0.79157209, + "learning_rate": 0.0002964229871204831, + "loss": 0.80233824, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.2902832, + "step": 3350, + "time_per_iteration": 2.6757731437683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076273, + "balance_loss_mlp": 1.04663801, + "epoch": 0.6446710273181993, + "flos": 697576776192.0, + "grad_norm": 0.06774074305303925, + "language_loss": 0.83398223, + "learning_rate": 0.00029613847652431403, + "loss": 0.84474498, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.29614258, + "step": 3351, + "time_per_iteration": 2.905512571334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072846, + "balance_loss_mlp": 1.04409289, + "epoch": 0.6448634090034628, + "flos": 624705384960.0, + "grad_norm": 0.05155589011440517, + "language_loss": 0.79040021, + "learning_rate": 0.0002958540450742078, + "loss": 0.80112863, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.28735352, + "step": 3352, + "time_per_iteration": 2.929170846939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070119, + "balance_loss_mlp": 1.04026914, + "epoch": 0.6450557906887264, + "flos": 600647256576.0, + "grad_norm": 0.05063101037277444, + "language_loss": 0.77325773, + "learning_rate": 0.0002955696928805901, + "loss": 0.78395891, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.2980957, + "step": 3353, + "time_per_iteration": 2.881626605987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107236, + "balance_loss_mlp": 1.04229498, + "epoch": 0.64524817237399, + "flos": 645905981952.0, + "grad_norm": 0.059706275301968766, + "language_loss": 0.86282456, + "learning_rate": 0.0002952854200538563, + "loss": 0.87354815, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.30004883, + "step": 3354, + "time_per_iteration": 2.8391265869140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070707, + "balance_loss_mlp": 1.04047608, + "epoch": 0.6454405540592536, + "flos": 473173346304.0, + "grad_norm": 0.08701934847838336, + "language_loss": 0.81666923, + "learning_rate": 0.000295001226704371, + "loss": 0.82737631, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.30175781, + "step": 3355, + "time_per_iteration": 2.598177194595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074307, + "balance_loss_mlp": 1.0440042, + "epoch": 0.6456329357445171, + "flos": 611548005888.0, + "grad_norm": 0.06424201750770815, + "language_loss": 0.82413089, + "learning_rate": 0.00029471711294246783, + "loss": 0.83487391, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.30273438, + "step": 3356, + "time_per_iteration": 2.813361644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069796, + "balance_loss_mlp": 1.03880155, + "epoch": 0.6458253174297807, + "flos": 731373138432.0, + "grad_norm": 0.06119276712520419, + "language_loss": 0.82436061, + "learning_rate": 0.0002944330788784494, + "loss": 0.83505857, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.30957031, + "step": 3357, + "time_per_iteration": 2.8810949325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073631, + "balance_loss_mlp": 1.04399514, + "epoch": 0.6460176991150443, + "flos": 570129007104.0, + "grad_norm": 0.06225888545708514, + "language_loss": 0.84205008, + "learning_rate": 0.00029414912462258786, + "loss": 0.8527863, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.29614258, + "step": 3358, + "time_per_iteration": 2.827125310897827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074894, + "balance_loss_mlp": 1.0442096, + "epoch": 0.6462100808003078, + "flos": 582890688000.0, + "grad_norm": 0.06476670861286221, + "language_loss": 0.81335187, + "learning_rate": 0.00029386525028512366, + "loss": 0.82410085, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.30664062, + "step": 3359, + "time_per_iteration": 2.750802993774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072661, + "balance_loss_mlp": 1.04195285, + "epoch": 0.6464024624855714, + "flos": 483647721984.0, + "grad_norm": 0.05574217129277394, + "language_loss": 0.86898518, + "learning_rate": 0.0002935814559762666, + "loss": 0.87971175, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.30664062, + "step": 3360, + "time_per_iteration": 2.778729200363159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071986, + "balance_loss_mlp": 1.04125416, + "epoch": 0.6465948441708349, + "flos": 527508205056.0, + "grad_norm": 0.05463243527184519, + "language_loss": 0.79309767, + "learning_rate": 0.0002932977418061957, + "loss": 0.80381751, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.30712891, + "step": 3361, + "time_per_iteration": 2.636300563812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072531, + "balance_loss_mlp": 1.04284823, + "epoch": 0.6467872258560985, + "flos": 669121689600.0, + "grad_norm": 0.06447019250914547, + "language_loss": 0.80627209, + "learning_rate": 0.00029301410788505833, + "loss": 0.81699741, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.29638672, + "step": 3362, + "time_per_iteration": 2.7907180786132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071004, + "balance_loss_mlp": 1.04127288, + "epoch": 0.6469796075413621, + "flos": 431867828736.0, + "grad_norm": 0.06442175719622328, + "language_loss": 0.81014264, + "learning_rate": 0.00029273055432297126, + "loss": 0.8208527, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.29711914, + "step": 3363, + "time_per_iteration": 2.5577244758605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068782, + "balance_loss_mlp": 1.03835917, + "epoch": 0.6471719892266257, + "flos": 803413693440.0, + "grad_norm": 0.055871885274250355, + "language_loss": 0.80490357, + "learning_rate": 0.00029244708123001917, + "loss": 0.81559139, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.30395508, + "step": 3364, + "time_per_iteration": 2.938917636871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065549, + "balance_loss_mlp": 1.0347929, + "epoch": 0.6473643709118891, + "flos": 576923779584.0, + "grad_norm": 0.060913516619686706, + "language_loss": 0.84265661, + "learning_rate": 0.0002921636887162565, + "loss": 0.85331213, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.30737305, + "step": 3365, + "time_per_iteration": 2.7420175075531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067245, + "balance_loss_mlp": 1.03718054, + "epoch": 0.6475567525971527, + "flos": 761079490560.0, + "grad_norm": 0.07220364495800281, + "language_loss": 0.84047341, + "learning_rate": 0.00029188037689170595, + "loss": 0.85114586, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.30029297, + "step": 3366, + "time_per_iteration": 2.941958427429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070259, + "balance_loss_mlp": 1.04026556, + "epoch": 0.6477491342824163, + "flos": 842754972672.0, + "grad_norm": 0.0698232037755488, + "language_loss": 0.84047693, + "learning_rate": 0.0002915971458663586, + "loss": 0.85117948, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.29931641, + "step": 3367, + "time_per_iteration": 3.0588743686676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064684, + "balance_loss_mlp": 1.03507257, + "epoch": 0.6479415159676799, + "flos": 884431457280.0, + "grad_norm": 0.048093531739852514, + "language_loss": 0.81804395, + "learning_rate": 0.00029131399575017494, + "loss": 0.82869077, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.2956543, + "step": 3368, + "time_per_iteration": 3.194119691848755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065444, + "balance_loss_mlp": 1.03576088, + "epoch": 0.6481338976529435, + "flos": 615211642368.0, + "grad_norm": 0.05082024761534885, + "language_loss": 0.85855007, + "learning_rate": 0.0002910309266530836, + "loss": 0.86920446, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.29638672, + "step": 3369, + "time_per_iteration": 2.7995903491973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069305, + "balance_loss_mlp": 1.03943157, + "epoch": 0.648326279338207, + "flos": 509757428736.0, + "grad_norm": 0.06123820960940181, + "language_loss": 0.85307527, + "learning_rate": 0.0002907479386849814, + "loss": 0.86376828, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.2980957, + "step": 3370, + "time_per_iteration": 2.6561813354492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070145, + "balance_loss_mlp": 1.03969884, + "epoch": 0.6485186610234706, + "flos": 702157026816.0, + "grad_norm": 0.06023552594522319, + "language_loss": 0.8010959, + "learning_rate": 0.0002904650319557339, + "loss": 0.81179738, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.30395508, + "step": 3371, + "time_per_iteration": 3.0036118030548096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069259, + "balance_loss_mlp": 1.03967094, + "epoch": 0.6487110427087341, + "flos": 560418476544.0, + "grad_norm": 0.06478850515629742, + "language_loss": 0.81106675, + "learning_rate": 0.0002901822065751758, + "loss": 0.82175934, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.29541016, + "step": 3372, + "time_per_iteration": 2.6287784576416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072808, + "balance_loss_mlp": 1.0429343, + "epoch": 0.6489034243939977, + "flos": 679801268736.0, + "grad_norm": 0.0516174175681091, + "language_loss": 0.854002, + "learning_rate": 0.0002898994626531093, + "loss": 0.86473012, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.29833984, + "step": 3373, + "time_per_iteration": 2.84863543510437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071305, + "balance_loss_mlp": 1.04181266, + "epoch": 0.6490958060792612, + "flos": 474172918272.0, + "grad_norm": 0.07661916167941812, + "language_loss": 0.88111019, + "learning_rate": 0.00028961680029930526, + "loss": 0.89182317, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.29443359, + "step": 3374, + "time_per_iteration": 2.5185511112213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068928, + "balance_loss_mlp": 1.03965008, + "epoch": 0.6492881877645248, + "flos": 588563642880.0, + "grad_norm": 0.05286852382904046, + "language_loss": 0.76929349, + "learning_rate": 0.00028933421962350317, + "loss": 0.77998275, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.29248047, + "step": 3375, + "time_per_iteration": 2.7406935691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071172, + "balance_loss_mlp": 1.04020166, + "epoch": 0.6494805694497884, + "flos": 642139038720.0, + "grad_norm": 0.05602089532541189, + "language_loss": 0.84000719, + "learning_rate": 0.0002890517207354104, + "loss": 0.85071886, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.30932617, + "step": 3376, + "time_per_iteration": 2.8145668506622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072679, + "balance_loss_mlp": 1.04263854, + "epoch": 0.649672951135052, + "flos": 531550162944.0, + "grad_norm": 0.05675413090178792, + "language_loss": 0.81828344, + "learning_rate": 0.0002887693037447029, + "loss": 0.82901019, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.30004883, + "step": 3377, + "time_per_iteration": 2.6432199478149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070436, + "balance_loss_mlp": 1.04082441, + "epoch": 0.6498653328203156, + "flos": 547124295168.0, + "grad_norm": 0.05935135112647285, + "language_loss": 0.82021838, + "learning_rate": 0.00028848696876102443, + "loss": 0.83092278, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.29541016, + "step": 3378, + "time_per_iteration": 2.6862215995788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065633, + "balance_loss_mlp": 1.03473437, + "epoch": 0.650057714505579, + "flos": 461996172288.0, + "grad_norm": 0.06179409995476596, + "language_loss": 0.83523512, + "learning_rate": 0.00028820471589398723, + "loss": 0.84589148, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.30859375, + "step": 3379, + "time_per_iteration": 2.5718047618865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070203, + "balance_loss_mlp": 1.03970945, + "epoch": 0.6502500961908426, + "flos": 509905815552.0, + "grad_norm": 0.06289552232740542, + "language_loss": 0.77402478, + "learning_rate": 0.00028792254525317196, + "loss": 0.78472686, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.30493164, + "step": 3380, + "time_per_iteration": 2.779308795928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071743, + "balance_loss_mlp": 1.0420599, + "epoch": 0.6504424778761062, + "flos": 579557320704.0, + "grad_norm": 0.05486106257478186, + "language_loss": 0.81240368, + "learning_rate": 0.00028764045694812645, + "loss": 0.82312119, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.29638672, + "step": 3381, + "time_per_iteration": 2.7430598735809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010701, + "balance_loss_mlp": 1.03936744, + "epoch": 0.6506348595613698, + "flos": 519206091264.0, + "grad_norm": 0.061364553922665516, + "language_loss": 0.76195431, + "learning_rate": 0.0002873584510883671, + "loss": 0.77265531, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.30688477, + "step": 3382, + "time_per_iteration": 2.575998306274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071659, + "balance_loss_mlp": 1.04085565, + "epoch": 0.6508272412466333, + "flos": 510048410112.0, + "grad_norm": 0.0719487575879366, + "language_loss": 0.85928071, + "learning_rate": 0.0002870765277833788, + "loss": 0.86999726, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.30761719, + "step": 3383, + "time_per_iteration": 2.7900807857513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069354, + "balance_loss_mlp": 1.03790629, + "epoch": 0.6510196229318969, + "flos": 625329607680.0, + "grad_norm": 0.06613356509687102, + "language_loss": 0.80323064, + "learning_rate": 0.00028679468714261347, + "loss": 0.81392419, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.31445312, + "step": 3384, + "time_per_iteration": 2.7730093002319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071035, + "balance_loss_mlp": 1.04132867, + "epoch": 0.6512120046171604, + "flos": 474453725184.0, + "grad_norm": 0.06288254960309916, + "language_loss": 0.76734459, + "learning_rate": 0.0002865129292754918, + "loss": 0.77805495, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.29663086, + "step": 3385, + "time_per_iteration": 2.6205520629882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075067, + "balance_loss_mlp": 1.04500234, + "epoch": 0.651404386302424, + "flos": 551561951232.0, + "grad_norm": 0.05411679726730615, + "language_loss": 0.81513727, + "learning_rate": 0.00028623125429140105, + "loss": 0.82588792, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.30004883, + "step": 3386, + "time_per_iteration": 2.88822340965271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067519, + "balance_loss_mlp": 1.03826463, + "epoch": 0.6515967679876876, + "flos": 523047227904.0, + "grad_norm": 0.05765553092239875, + "language_loss": 0.87005818, + "learning_rate": 0.00028594966229969785, + "loss": 0.88073337, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.29223633, + "step": 3387, + "time_per_iteration": 2.6889727115631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074347, + "balance_loss_mlp": 1.04413986, + "epoch": 0.6517891496729511, + "flos": 573590412288.0, + "grad_norm": 0.05935709634506938, + "language_loss": 0.81214345, + "learning_rate": 0.00028566815340970577, + "loss": 0.82288694, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.30151367, + "step": 3388, + "time_per_iteration": 2.7500782012939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107152, + "balance_loss_mlp": 1.04195595, + "epoch": 0.6519815313582147, + "flos": 555662135808.0, + "grad_norm": 0.058132495029724875, + "language_loss": 0.8099978, + "learning_rate": 0.0002853867277307162, + "loss": 0.82071304, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.29516602, + "step": 3389, + "time_per_iteration": 2.628153085708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072178, + "balance_loss_mlp": 1.04399705, + "epoch": 0.6521739130434783, + "flos": 480229986816.0, + "grad_norm": 0.062440592290717876, + "language_loss": 0.82432795, + "learning_rate": 0.00028510538537198824, + "loss": 0.83504969, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.28198242, + "step": 3390, + "time_per_iteration": 2.6273562908172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076807, + "balance_loss_mlp": 1.04805326, + "epoch": 0.6523662947287419, + "flos": 665380887552.0, + "grad_norm": 0.0630008208317628, + "language_loss": 0.86511409, + "learning_rate": 0.00028482412644274867, + "loss": 0.87588215, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.28759766, + "step": 3391, + "time_per_iteration": 2.986837148666382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073216, + "balance_loss_mlp": 1.04479647, + "epoch": 0.6525586764140053, + "flos": 548394499584.0, + "grad_norm": 0.07544653210913753, + "language_loss": 0.74115705, + "learning_rate": 0.00028454295105219207, + "loss": 0.75188923, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.28417969, + "step": 3392, + "time_per_iteration": 2.6882169246673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077343, + "balance_loss_mlp": 1.04837489, + "epoch": 0.6527510580992689, + "flos": 802529012736.0, + "grad_norm": 0.044597775660838994, + "language_loss": 0.79517299, + "learning_rate": 0.0002842618593094802, + "loss": 0.80594641, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.28979492, + "step": 3393, + "time_per_iteration": 3.160513401031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076464, + "balance_loss_mlp": 1.04785347, + "epoch": 0.6529434397845325, + "flos": 670864757760.0, + "grad_norm": 0.0655151623947296, + "language_loss": 0.80225992, + "learning_rate": 0.00028398085132374243, + "loss": 0.81302458, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.28588867, + "step": 3394, + "time_per_iteration": 2.799607753753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076411, + "balance_loss_mlp": 1.04861116, + "epoch": 0.6531358214697961, + "flos": 828043610112.0, + "grad_norm": 0.057447645264245936, + "language_loss": 0.83968282, + "learning_rate": 0.0002836999272040761, + "loss": 0.85044694, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.27832031, + "step": 3395, + "time_per_iteration": 3.1404569149017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076476, + "balance_loss_mlp": 1.04753208, + "epoch": 0.6533282031550597, + "flos": 487157179392.0, + "grad_norm": 0.07221192979592671, + "language_loss": 0.83835298, + "learning_rate": 0.00028341908705954575, + "loss": 0.84911776, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.28955078, + "step": 3396, + "time_per_iteration": 2.586735248565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024797, + "balance_loss_mlp": 1.01340032, + "epoch": 0.6535205848403232, + "flos": 1556908121088.0, + "grad_norm": 0.010103591992015052, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82786608, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.11376953, + "step": 3397, + "time_per_iteration": 4.801388740539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076371, + "balance_loss_mlp": 1.04754591, + "epoch": 0.6537129665255867, + "flos": 493464531456.0, + "grad_norm": 0.06325367812107179, + "language_loss": 0.78003663, + "learning_rate": 0.00028285765913198604, + "loss": 0.79080033, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.2878418, + "step": 3398, + "time_per_iteration": 2.583195209503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073367, + "balance_loss_mlp": 1.04530561, + "epoch": 0.6539053482108503, + "flos": 604718327808.0, + "grad_norm": 0.055960254103937936, + "language_loss": 0.81894422, + "learning_rate": 0.0002825770715669227, + "loss": 0.82967794, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.28076172, + "step": 3399, + "time_per_iteration": 2.706880569458008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075602, + "balance_loss_mlp": 1.04842257, + "epoch": 0.6540977298961139, + "flos": 577504332288.0, + "grad_norm": 0.06150139712068683, + "language_loss": 0.80872452, + "learning_rate": 0.00028229656841292634, + "loss": 0.81948054, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.2722168, + "step": 3400, + "time_per_iteration": 2.6799252033233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075202, + "balance_loss_mlp": 1.04687786, + "epoch": 0.6542901115813774, + "flos": 511500496896.0, + "grad_norm": 0.0638413236687058, + "language_loss": 0.76758403, + "learning_rate": 0.0002820161497788979, + "loss": 0.77833605, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.28320312, + "step": 3401, + "time_per_iteration": 2.6050028800964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074659, + "balance_loss_mlp": 1.04712176, + "epoch": 0.654482493266641, + "flos": 625201569792.0, + "grad_norm": 0.051478933847507014, + "language_loss": 0.87136239, + "learning_rate": 0.00028173581577370545, + "loss": 0.88210893, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.27563477, + "step": 3402, + "time_per_iteration": 2.7428696155548096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107348, + "balance_loss_mlp": 1.04618084, + "epoch": 0.6546748749519046, + "flos": 523712148480.0, + "grad_norm": 0.05196967996925013, + "language_loss": 0.79016143, + "learning_rate": 0.0002814555665061844, + "loss": 0.80089623, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.2734375, + "step": 3403, + "time_per_iteration": 2.68853759765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076544, + "balance_loss_mlp": 1.04914951, + "epoch": 0.6548672566371682, + "flos": 478945225728.0, + "grad_norm": 0.06812490536784549, + "language_loss": 0.77581179, + "learning_rate": 0.00028117540208513715, + "loss": 0.78657722, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.27416992, + "step": 3404, + "time_per_iteration": 2.668957233428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074239, + "balance_loss_mlp": 1.0468924, + "epoch": 0.6550596383224317, + "flos": 615732558336.0, + "grad_norm": 0.06109241421727743, + "language_loss": 0.85329819, + "learning_rate": 0.00028089532261933313, + "loss": 0.86404049, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.27368164, + "step": 3405, + "time_per_iteration": 2.764646053314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077427, + "balance_loss_mlp": 1.04910326, + "epoch": 0.6552520200076952, + "flos": 488594709504.0, + "grad_norm": 0.07801432785219843, + "language_loss": 0.85569102, + "learning_rate": 0.0002806153282175087, + "loss": 0.86646521, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.28369141, + "step": 3406, + "time_per_iteration": 2.612542152404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073707, + "balance_loss_mlp": 1.04547811, + "epoch": 0.6554444016929588, + "flos": 687310424064.0, + "grad_norm": 0.06580250942385472, + "language_loss": 0.82821441, + "learning_rate": 0.0002803354189883679, + "loss": 0.83895147, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.28222656, + "step": 3407, + "time_per_iteration": 2.8573250770568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079984, + "balance_loss_mlp": 1.0526377, + "epoch": 0.6556367833782224, + "flos": 542772417024.0, + "grad_norm": 0.04760286447801195, + "language_loss": 0.8549965, + "learning_rate": 0.00028005559504058053, + "loss": 0.86579633, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.27392578, + "step": 3408, + "time_per_iteration": 2.723130941390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075013, + "balance_loss_mlp": 1.04623616, + "epoch": 0.655829165063486, + "flos": 673237840896.0, + "grad_norm": 0.05982952663886069, + "language_loss": 0.76448226, + "learning_rate": 0.0002797758564827838, + "loss": 0.77523243, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.28759766, + "step": 3409, + "time_per_iteration": 2.8227314949035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077669, + "balance_loss_mlp": 1.04989326, + "epoch": 0.6560215467487496, + "flos": 531550162944.0, + "grad_norm": 0.0665853509575856, + "language_loss": 0.83799911, + "learning_rate": 0.0002794962034235824, + "loss": 0.8487758, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.27783203, + "step": 3410, + "time_per_iteration": 2.6031951904296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071217, + "balance_loss_mlp": 1.04303622, + "epoch": 0.656213928434013, + "flos": 591025476096.0, + "grad_norm": 0.05829437169655771, + "language_loss": 0.74215448, + "learning_rate": 0.00027921663597154695, + "loss": 0.75286669, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.28198242, + "step": 3411, + "time_per_iteration": 2.735642910003662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077612, + "balance_loss_mlp": 1.04981232, + "epoch": 0.6564063101192766, + "flos": 415564756992.0, + "grad_norm": 0.0845273006742278, + "language_loss": 0.8108443, + "learning_rate": 0.00027893715423521525, + "loss": 0.8216204, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.27832031, + "step": 3412, + "time_per_iteration": 2.4407780170440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079935, + "balance_loss_mlp": 1.05134881, + "epoch": 0.6565986918045402, + "flos": 453084392448.0, + "grad_norm": 0.06735556448920854, + "language_loss": 0.83940005, + "learning_rate": 0.00027865775832309163, + "loss": 0.85019946, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.28564453, + "step": 3413, + "time_per_iteration": 2.6473381519317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076667, + "balance_loss_mlp": 1.04870033, + "epoch": 0.6567910734898038, + "flos": 547483677696.0, + "grad_norm": 0.0593593517708546, + "language_loss": 0.85890168, + "learning_rate": 0.00027837844834364733, + "loss": 0.86966836, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.27978516, + "step": 3414, + "time_per_iteration": 2.632337808609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074793, + "balance_loss_mlp": 1.04663622, + "epoch": 0.6569834551750673, + "flos": 655207667712.0, + "grad_norm": 0.056143783747438114, + "language_loss": 0.86344767, + "learning_rate": 0.00027809922440532, + "loss": 0.87419558, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.28173828, + "step": 3415, + "time_per_iteration": 2.8158276081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070852, + "balance_loss_mlp": 1.04152656, + "epoch": 0.6571758368603309, + "flos": 539399761920.0, + "grad_norm": 0.052293686608573205, + "language_loss": 0.80653661, + "learning_rate": 0.00027782008661651406, + "loss": 0.81724513, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.29272461, + "step": 3416, + "time_per_iteration": 2.769740104675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075321, + "balance_loss_mlp": 1.04706836, + "epoch": 0.6573682185455945, + "flos": 497088880128.0, + "grad_norm": 0.047338775202516, + "language_loss": 0.87086004, + "learning_rate": 0.00027754103508560013, + "loss": 0.88161325, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.2824707, + "step": 3417, + "time_per_iteration": 2.5982823371887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070746, + "balance_loss_mlp": 1.04204035, + "epoch": 0.657560600230858, + "flos": 447244111872.0, + "grad_norm": 0.07606703809766882, + "language_loss": 0.82847452, + "learning_rate": 0.0002772620699209163, + "loss": 0.83918196, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.28686523, + "step": 3418, + "time_per_iteration": 2.5715713500976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072273, + "balance_loss_mlp": 1.04387712, + "epoch": 0.6577529819161216, + "flos": 481696630272.0, + "grad_norm": 0.06477726519797523, + "language_loss": 0.79822147, + "learning_rate": 0.0002769831912307658, + "loss": 0.80894423, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.28393555, + "step": 3419, + "time_per_iteration": 2.554229974746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081387, + "balance_loss_mlp": 1.05339622, + "epoch": 0.6579453636013851, + "flos": 530589878784.0, + "grad_norm": 0.06482840979987209, + "language_loss": 0.80168855, + "learning_rate": 0.00027670439912341917, + "loss": 0.81250238, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.2800293, + "step": 3420, + "time_per_iteration": 2.6077942848205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083063, + "balance_loss_mlp": 1.05385685, + "epoch": 0.6581377452866487, + "flos": 627737596416.0, + "grad_norm": 0.062198061395391364, + "language_loss": 0.83608246, + "learning_rate": 0.0002764256937071129, + "loss": 0.8469131, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.29199219, + "step": 3421, + "time_per_iteration": 2.7814555168151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079993, + "balance_loss_mlp": 1.0516932, + "epoch": 0.6583301269719123, + "flos": 548355211776.0, + "grad_norm": 0.06741584728715999, + "language_loss": 0.87078255, + "learning_rate": 0.00027614707509005036, + "loss": 0.8815825, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.28320312, + "step": 3422, + "time_per_iteration": 2.6582610607147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080132, + "balance_loss_mlp": 1.05216599, + "epoch": 0.6585225086571759, + "flos": 427268639232.0, + "grad_norm": 0.05422221992549149, + "language_loss": 0.79046404, + "learning_rate": 0.0002758685433804008, + "loss": 0.80126542, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.2800293, + "step": 3423, + "time_per_iteration": 2.518541097640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080526, + "balance_loss_mlp": 1.05196333, + "epoch": 0.6587148903424394, + "flos": 859264657920.0, + "grad_norm": 0.07879518089190286, + "language_loss": 0.79578894, + "learning_rate": 0.00027559009868630005, + "loss": 0.80659419, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.28564453, + "step": 3424, + "time_per_iteration": 3.0996036529541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079504, + "balance_loss_mlp": 1.0518713, + "epoch": 0.6589072720277029, + "flos": 805280417280.0, + "grad_norm": 0.05918528826128724, + "language_loss": 0.79852736, + "learning_rate": 0.0002753117411158491, + "loss": 0.80932236, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.27661133, + "step": 3425, + "time_per_iteration": 3.0297467708587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082154, + "balance_loss_mlp": 1.05392551, + "epoch": 0.6590996537129665, + "flos": 548355211776.0, + "grad_norm": 0.05414938091888711, + "language_loss": 0.89781225, + "learning_rate": 0.0002750334707771168, + "loss": 0.90863383, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.2824707, + "step": 3426, + "time_per_iteration": 2.639045476913452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082665, + "balance_loss_mlp": 1.05364943, + "epoch": 0.6592920353982301, + "flos": 453931195392.0, + "grad_norm": 0.06850883476210408, + "language_loss": 0.8080318, + "learning_rate": 0.0002747552877781369, + "loss": 0.81885844, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.28979492, + "step": 3427, + "time_per_iteration": 2.49623966217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077382, + "balance_loss_mlp": 1.04967833, + "epoch": 0.6594844170834937, + "flos": 566903328768.0, + "grad_norm": 0.05956339540339285, + "language_loss": 0.81955504, + "learning_rate": 0.0002744771922269097, + "loss": 0.83032882, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.27709961, + "step": 3428, + "time_per_iteration": 2.730713129043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071709, + "balance_loss_mlp": 1.04276502, + "epoch": 0.6596767987687572, + "flos": 1187452016640.0, + "grad_norm": 0.06328482299945191, + "language_loss": 0.82119536, + "learning_rate": 0.0002741991842314015, + "loss": 0.83191252, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.28930664, + "step": 3429, + "time_per_iteration": 3.479928970336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072277, + "balance_loss_mlp": 1.04433429, + "epoch": 0.6598691804540208, + "flos": 503247845376.0, + "grad_norm": 0.05605661810668252, + "language_loss": 0.85796869, + "learning_rate": 0.0002739212638995445, + "loss": 0.86869144, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.27954102, + "step": 3430, + "time_per_iteration": 2.606570243835449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074192, + "balance_loss_mlp": 1.04579639, + "epoch": 0.6600615621392844, + "flos": 531072916992.0, + "grad_norm": 0.06049343964764478, + "language_loss": 0.82845342, + "learning_rate": 0.00027364343133923696, + "loss": 0.83919537, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.28393555, + "step": 3431, + "time_per_iteration": 2.670698642730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010757, + "balance_loss_mlp": 1.04632664, + "epoch": 0.6602539438245479, + "flos": 565170435072.0, + "grad_norm": 0.060306061289427934, + "language_loss": 0.8290168, + "learning_rate": 0.0002733656866583431, + "loss": 0.83977377, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.29321289, + "step": 3432, + "time_per_iteration": 2.6917898654937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107317, + "balance_loss_mlp": 1.04413056, + "epoch": 0.6604463255098114, + "flos": 856802824704.0, + "grad_norm": 0.07899452936934231, + "language_loss": 0.83071327, + "learning_rate": 0.0002730880299646927, + "loss": 0.84144497, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.2902832, + "step": 3433, + "time_per_iteration": 3.028512954711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068262, + "balance_loss_mlp": 1.03898394, + "epoch": 0.660638707195075, + "flos": 674158837248.0, + "grad_norm": 0.05867349384550741, + "language_loss": 0.85263318, + "learning_rate": 0.0002728104613660821, + "loss": 0.86331582, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.29272461, + "step": 3434, + "time_per_iteration": 2.8600428104400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107468, + "balance_loss_mlp": 1.04666591, + "epoch": 0.6608310888803386, + "flos": 888572339712.0, + "grad_norm": 0.08754685065456504, + "language_loss": 0.82922065, + "learning_rate": 0.0002725329809702729, + "loss": 0.83996743, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.28051758, + "step": 3435, + "time_per_iteration": 3.2159268856048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068471, + "balance_loss_mlp": 1.04002786, + "epoch": 0.6610234705656022, + "flos": 1135909260288.0, + "grad_norm": 0.06770839009461412, + "language_loss": 0.76381433, + "learning_rate": 0.0002722555888849921, + "loss": 0.77449906, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.28417969, + "step": 3436, + "time_per_iteration": 3.435774564743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071105, + "balance_loss_mlp": 1.04297185, + "epoch": 0.6612158522508658, + "flos": 467776816128.0, + "grad_norm": 0.05996981510942144, + "language_loss": 0.8029291, + "learning_rate": 0.00027197828521793334, + "loss": 0.81364018, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.28125, + "step": 3437, + "time_per_iteration": 2.5626087188720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010765, + "balance_loss_mlp": 1.04681671, + "epoch": 0.6614082339361292, + "flos": 571374480384.0, + "grad_norm": 0.059440388308285685, + "language_loss": 0.84535551, + "learning_rate": 0.0002717010700767552, + "loss": 0.85612053, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.29614258, + "step": 3438, + "time_per_iteration": 2.74114990234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071656, + "balance_loss_mlp": 1.04254496, + "epoch": 0.6616006156213928, + "flos": 498220872192.0, + "grad_norm": 0.07105561276386183, + "language_loss": 0.7574169, + "learning_rate": 0.00027142394356908226, + "loss": 0.76813346, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.29077148, + "step": 3439, + "time_per_iteration": 2.5588772296905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107167, + "balance_loss_mlp": 1.04289341, + "epoch": 0.6617929973066564, + "flos": 602124074496.0, + "grad_norm": 0.061991918055260026, + "language_loss": 0.84383535, + "learning_rate": 0.00027114690580250456, + "loss": 0.85455203, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.2878418, + "step": 3440, + "time_per_iteration": 2.770521879196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068436, + "balance_loss_mlp": 1.03996921, + "epoch": 0.66198537899192, + "flos": 522731515392.0, + "grad_norm": 0.055271996541099454, + "language_loss": 0.8711971, + "learning_rate": 0.0002708699568845776, + "loss": 0.88188148, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.28466797, + "step": 3441, + "time_per_iteration": 2.634669303894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020343, + "balance_loss_mlp": 1.00923228, + "epoch": 0.6621777606771835, + "flos": 1565421230592.0, + "grad_norm": 0.011806654304651203, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 0.80308127, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.11132812, + "step": 3442, + "time_per_iteration": 4.947353363037109 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074491, + "balance_loss_mlp": 1.04609489, + "epoch": 0.6623701423624471, + "flos": 526409708544.0, + "grad_norm": 0.055374659837301436, + "language_loss": 0.82784879, + "learning_rate": 0.0002703163260247261, + "loss": 0.83859372, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.28369141, + "step": 3443, + "time_per_iteration": 2.664637804031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069476, + "balance_loss_mlp": 1.04041255, + "epoch": 0.6625625240477107, + "flos": 527921432064.0, + "grad_norm": 0.06501168506799739, + "language_loss": 0.81707942, + "learning_rate": 0.0002700396442977399, + "loss": 0.82777417, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.2902832, + "step": 3444, + "time_per_iteration": 2.616928815841675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069297, + "balance_loss_mlp": 1.04049635, + "epoch": 0.6627549057329742, + "flos": 472854661632.0, + "grad_norm": 0.054380463480794276, + "language_loss": 0.84038997, + "learning_rate": 0.0002697630518492817, + "loss": 0.85108292, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.28833008, + "step": 3445, + "time_per_iteration": 2.649937868118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071356, + "balance_loss_mlp": 1.04207826, + "epoch": 0.6629472874182378, + "flos": 527743931904.0, + "grad_norm": 0.06943834744074738, + "language_loss": 0.85656464, + "learning_rate": 0.0002694865487867343, + "loss": 0.86727822, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.29223633, + "step": 3446, + "time_per_iteration": 2.624187707901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072189, + "balance_loss_mlp": 1.04241085, + "epoch": 0.6631396691035013, + "flos": 612906960384.0, + "grad_norm": 0.05377374950460666, + "language_loss": 0.84776872, + "learning_rate": 0.0002692101352174453, + "loss": 0.85849059, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.29736328, + "step": 3447, + "time_per_iteration": 2.786705255508423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066769, + "balance_loss_mlp": 1.03823054, + "epoch": 0.6633320507887649, + "flos": 609041092608.0, + "grad_norm": 0.06088849613608419, + "language_loss": 0.84652716, + "learning_rate": 0.00026893381124872787, + "loss": 0.8571949, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.28515625, + "step": 3448, + "time_per_iteration": 2.8100626468658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072364, + "balance_loss_mlp": 1.04272866, + "epoch": 0.6635244324740285, + "flos": 749342112768.0, + "grad_norm": 0.06845751497679059, + "language_loss": 0.80441087, + "learning_rate": 0.00026865757698786097, + "loss": 0.81513453, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.29589844, + "step": 3449, + "time_per_iteration": 3.046318531036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069481, + "balance_loss_mlp": 1.04065669, + "epoch": 0.6637168141592921, + "flos": 664222754304.0, + "grad_norm": 0.05206136562356657, + "language_loss": 0.81613761, + "learning_rate": 0.000268381432542088, + "loss": 0.82683241, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.28833008, + "step": 3450, + "time_per_iteration": 2.865903854370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107193, + "balance_loss_mlp": 1.04203212, + "epoch": 0.6639091958445555, + "flos": 606500683776.0, + "grad_norm": 0.0645327848257149, + "language_loss": 0.79875302, + "learning_rate": 0.00026810537801861807, + "loss": 0.80947232, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.29882812, + "step": 3451, + "time_per_iteration": 2.8374693393707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071564, + "balance_loss_mlp": 1.04173839, + "epoch": 0.6641015775298191, + "flos": 476452869120.0, + "grad_norm": 0.05151691249818879, + "language_loss": 0.8142612, + "learning_rate": 0.0002678294135246243, + "loss": 0.82497692, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.2980957, + "step": 3452, + "time_per_iteration": 2.839822769165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072033, + "balance_loss_mlp": 1.04313636, + "epoch": 0.6642939592150827, + "flos": 903746391552.0, + "grad_norm": 0.05848171422306997, + "language_loss": 0.86315292, + "learning_rate": 0.0002675535391672463, + "loss": 0.87387323, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.2890625, + "step": 3453, + "time_per_iteration": 3.184783458709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074712, + "balance_loss_mlp": 1.04574442, + "epoch": 0.6644863409003463, + "flos": 581527351296.0, + "grad_norm": 0.06167080451779571, + "language_loss": 0.86087596, + "learning_rate": 0.0002672777550535877, + "loss": 0.8716231, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.28979492, + "step": 3454, + "time_per_iteration": 2.8803153038024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071993, + "balance_loss_mlp": 1.0427866, + "epoch": 0.6646787225856099, + "flos": 478761933312.0, + "grad_norm": 0.05419695506055875, + "language_loss": 0.84890383, + "learning_rate": 0.00026700206129071747, + "loss": 0.85962379, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.29174805, + "step": 3455, + "time_per_iteration": 2.835059881210327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076439, + "balance_loss_mlp": 1.04749477, + "epoch": 0.6648711042708734, + "flos": 449676831744.0, + "grad_norm": 0.06321625044537839, + "language_loss": 0.88953322, + "learning_rate": 0.00026672645798566925, + "loss": 0.90029758, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.28930664, + "step": 3456, + "time_per_iteration": 3.0997443199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071835, + "balance_loss_mlp": 1.04277229, + "epoch": 0.665063485956137, + "flos": 858553095168.0, + "grad_norm": 0.055285478182730885, + "language_loss": 0.79457712, + "learning_rate": 0.00026645094524544225, + "loss": 0.80529541, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.2902832, + "step": 3457, + "time_per_iteration": 3.513991117477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107703, + "balance_loss_mlp": 1.0481813, + "epoch": 0.6652558676414005, + "flos": 604024293888.0, + "grad_norm": 0.045511024743111715, + "language_loss": 0.75222224, + "learning_rate": 0.00026617552317699945, + "loss": 0.7629925, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.28833008, + "step": 3458, + "time_per_iteration": 3.5000369548797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069192, + "balance_loss_mlp": 1.04062915, + "epoch": 0.6654482493266641, + "flos": 510141542400.0, + "grad_norm": 0.0575678465485099, + "language_loss": 0.8684063, + "learning_rate": 0.0002659001918872693, + "loss": 0.87909818, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.28564453, + "step": 3459, + "time_per_iteration": 3.1579606533050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076447, + "balance_loss_mlp": 1.04797983, + "epoch": 0.6656406310119277, + "flos": 565342142976.0, + "grad_norm": 0.057947477452726895, + "language_loss": 0.80655402, + "learning_rate": 0.0002656249514831449, + "loss": 0.8173185, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.28466797, + "step": 3460, + "time_per_iteration": 3.0136172771453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075105, + "balance_loss_mlp": 1.04527879, + "epoch": 0.6658330126971912, + "flos": 1023859533312.0, + "grad_norm": 0.05880599704270715, + "language_loss": 0.86742055, + "learning_rate": 0.00026534980207148416, + "loss": 0.87817168, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.2980957, + "step": 3461, + "time_per_iteration": 3.808920383453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070751, + "balance_loss_mlp": 1.04256988, + "epoch": 0.6660253943824548, + "flos": 816472147968.0, + "grad_norm": 0.06394653558237288, + "language_loss": 0.73634577, + "learning_rate": 0.0002650747437591097, + "loss": 0.74705327, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.28149414, + "step": 3462, + "time_per_iteration": 3.4438018798828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023937, + "balance_loss_mlp": 1.01258874, + "epoch": 0.6662177760677184, + "flos": 1495331767296.0, + "grad_norm": 0.01627441049927099, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.82903516, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.11328125, + "step": 3463, + "time_per_iteration": 5.9989097118377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069258, + "balance_loss_mlp": 1.04091001, + "epoch": 0.666410157752982, + "flos": 499875190272.0, + "grad_norm": 0.05970416842123876, + "language_loss": 0.86439729, + "learning_rate": 0.00026452490085933155, + "loss": 0.87508994, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.28393555, + "step": 3464, + "time_per_iteration": 3.074321985244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069725, + "balance_loss_mlp": 1.04099607, + "epoch": 0.6666025394382454, + "flos": 480928402944.0, + "grad_norm": 0.06389669613772958, + "language_loss": 0.89814323, + "learning_rate": 0.00026425011648539614, + "loss": 0.90884054, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.28735352, + "step": 3465, + "time_per_iteration": 3.163724422454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067748, + "balance_loss_mlp": 1.0391376, + "epoch": 0.666794921123509, + "flos": 546395355648.0, + "grad_norm": 0.05866867334399115, + "language_loss": 0.82531869, + "learning_rate": 0.00026397542363768267, + "loss": 0.83599609, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.28588867, + "step": 3466, + "time_per_iteration": 3.15535044670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107073, + "balance_loss_mlp": 1.04202461, + "epoch": 0.6669873028087726, + "flos": 471750372864.0, + "grad_norm": 0.09718909208334105, + "language_loss": 0.81696969, + "learning_rate": 0.0002637008224228362, + "loss": 0.82767701, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.28710938, + "step": 3467, + "time_per_iteration": 3.1590065956115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072782, + "balance_loss_mlp": 1.04467225, + "epoch": 0.6671796844940362, + "flos": 547119912960.0, + "grad_norm": 0.045698097527158366, + "language_loss": 0.84370708, + "learning_rate": 0.00026342631294746653, + "loss": 0.85443497, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.28100586, + "step": 3468, + "time_per_iteration": 3.2474896907806396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106771, + "balance_loss_mlp": 1.03933835, + "epoch": 0.6673720661792998, + "flos": 1069867547136.0, + "grad_norm": 0.048489338364625344, + "language_loss": 0.80841875, + "learning_rate": 0.0002631518953181476, + "loss": 0.81909585, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.28369141, + "step": 3469, + "time_per_iteration": 3.989240884780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020296, + "balance_loss_mlp": 1.00837493, + "epoch": 0.6675644478645633, + "flos": 1522963372032.0, + "grad_norm": 0.017053008774153198, + "language_loss": 0.76325285, + "learning_rate": 0.000262877569641418, + "loss": 0.7734558, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.11914062, + "step": 3470, + "time_per_iteration": 5.7656426429748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079857, + "balance_loss_mlp": 1.05081761, + "epoch": 0.6677568295498268, + "flos": 579410343936.0, + "grad_norm": 0.06105820471136532, + "language_loss": 0.80315661, + "learning_rate": 0.00026260333602377985, + "loss": 0.81395519, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.29003906, + "step": 3471, + "time_per_iteration": 3.3436222076416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072083, + "balance_loss_mlp": 1.04383063, + "epoch": 0.6679492112350904, + "flos": 383722458624.0, + "grad_norm": 0.05421906937668894, + "language_loss": 0.87085468, + "learning_rate": 0.0002623291945717007, + "loss": 0.88157558, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.28271484, + "step": 3472, + "time_per_iteration": 3.1183881759643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071602, + "balance_loss_mlp": 1.04234779, + "epoch": 0.668141592920354, + "flos": 1150297555968.0, + "grad_norm": 0.04666604751333496, + "language_loss": 0.84075844, + "learning_rate": 0.00026205514539161175, + "loss": 0.85147452, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.29248047, + "step": 3473, + "time_per_iteration": 3.790060043334961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072596, + "balance_loss_mlp": 1.04386711, + "epoch": 0.6683339746056175, + "flos": 560804000256.0, + "grad_norm": 0.05776060177542925, + "language_loss": 0.84147954, + "learning_rate": 0.00026178118858990773, + "loss": 0.85220551, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.28686523, + "step": 3474, + "time_per_iteration": 3.4138669967651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071797, + "balance_loss_mlp": 1.04259038, + "epoch": 0.6685263562908811, + "flos": 514051080192.0, + "grad_norm": 0.05528533566381529, + "language_loss": 0.83995008, + "learning_rate": 0.0002615073242729483, + "loss": 0.85066801, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.29223633, + "step": 3475, + "time_per_iteration": 3.199012279510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070975, + "balance_loss_mlp": 1.0421505, + "epoch": 0.6687187379761447, + "flos": 629466107904.0, + "grad_norm": 0.04758123025754447, + "language_loss": 0.84358716, + "learning_rate": 0.0002612335525470573, + "loss": 0.85429692, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.2878418, + "step": 3476, + "time_per_iteration": 3.4972333908081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074194, + "balance_loss_mlp": 1.04572678, + "epoch": 0.6689111196614083, + "flos": 535312723968.0, + "grad_norm": 0.06222514745321995, + "language_loss": 0.78151464, + "learning_rate": 0.0002609598735185221, + "loss": 0.79225659, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.28466797, + "step": 3477, + "time_per_iteration": 3.1121668815612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070595, + "balance_loss_mlp": 1.04186535, + "epoch": 0.6691035013466718, + "flos": 602758471680.0, + "grad_norm": 0.05831077718695847, + "language_loss": 0.83306509, + "learning_rate": 0.00026068628729359445, + "loss": 0.84377104, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.28686523, + "step": 3478, + "time_per_iteration": 3.4748337268829346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075594, + "balance_loss_mlp": 1.04653037, + "epoch": 0.6692958830319353, + "flos": 632539017216.0, + "grad_norm": 0.053072339735848705, + "language_loss": 0.75823909, + "learning_rate": 0.00026041279397848996, + "loss": 0.76899505, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.29003906, + "step": 3479, + "time_per_iteration": 3.3513095378875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071758, + "balance_loss_mlp": 1.04279053, + "epoch": 0.6694882647171989, + "flos": 645153721344.0, + "grad_norm": 0.11523786601732237, + "language_loss": 0.82653117, + "learning_rate": 0.00026013939367938797, + "loss": 0.83724874, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.28930664, + "step": 3480, + "time_per_iteration": 3.341496467590332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070446, + "balance_loss_mlp": 1.0417881, + "epoch": 0.6696806464024625, + "flos": 569292378624.0, + "grad_norm": 0.05240024743638074, + "language_loss": 0.81095958, + "learning_rate": 0.00025986608650243204, + "loss": 0.82166409, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.28613281, + "step": 3481, + "time_per_iteration": 3.534395933151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073143, + "balance_loss_mlp": 1.04417491, + "epoch": 0.6698730280877261, + "flos": 622386146304.0, + "grad_norm": 0.04897639091923761, + "language_loss": 0.79360926, + "learning_rate": 0.0002595928725537293, + "loss": 0.80434066, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.28930664, + "step": 3482, + "time_per_iteration": 3.4163737297058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070599, + "balance_loss_mlp": 1.04179811, + "epoch": 0.6700654097729896, + "flos": 502258447872.0, + "grad_norm": 0.05847572955345742, + "language_loss": 0.88153374, + "learning_rate": 0.0002593197519393509, + "loss": 0.89223981, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.28833008, + "step": 3483, + "time_per_iteration": 3.162363052368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067677, + "balance_loss_mlp": 1.03851843, + "epoch": 0.6702577914582531, + "flos": 623567600640.0, + "grad_norm": 0.04895962963004684, + "language_loss": 0.79643184, + "learning_rate": 0.00025904672476533165, + "loss": 0.80710858, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.29125977, + "step": 3484, + "time_per_iteration": 3.329540967941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073016, + "balance_loss_mlp": 1.0442394, + "epoch": 0.6704501731435167, + "flos": 456033646080.0, + "grad_norm": 0.055271412051917726, + "language_loss": 0.82509005, + "learning_rate": 0.0002587737911376704, + "loss": 0.8358202, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.28759766, + "step": 3485, + "time_per_iteration": 3.2979683876037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106941, + "balance_loss_mlp": 1.04063249, + "epoch": 0.6706425548287803, + "flos": 542973238272.0, + "grad_norm": 0.05525585278416293, + "language_loss": 0.8399781, + "learning_rate": 0.00025850095116232885, + "loss": 0.85067225, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.28759766, + "step": 3486, + "time_per_iteration": 3.26407790184021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069925, + "balance_loss_mlp": 1.04012239, + "epoch": 0.6708349365140439, + "flos": 633631721472.0, + "grad_norm": 0.05884470939634603, + "language_loss": 0.78008693, + "learning_rate": 0.000258228204945233, + "loss": 0.79078615, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.29760742, + "step": 3487, + "time_per_iteration": 3.2713074684143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069596, + "balance_loss_mlp": 1.04122472, + "epoch": 0.6710273181993074, + "flos": 640459989504.0, + "grad_norm": 0.08825995079793632, + "language_loss": 0.84371996, + "learning_rate": 0.00025795555259227254, + "loss": 0.85441601, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.28369141, + "step": 3488, + "time_per_iteration": 3.2798845767974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107088, + "balance_loss_mlp": 1.04253244, + "epoch": 0.671219699884571, + "flos": 553673166336.0, + "grad_norm": 0.04912618775842026, + "language_loss": 0.8368836, + "learning_rate": 0.00025768299420930046, + "loss": 0.84759241, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.28369141, + "step": 3489, + "time_per_iteration": 3.548513174057007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070862, + "balance_loss_mlp": 1.04191756, + "epoch": 0.6714120815698346, + "flos": 731191256064.0, + "grad_norm": 0.0542630721977733, + "language_loss": 0.83150196, + "learning_rate": 0.0002574105299021332, + "loss": 0.84221053, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.28930664, + "step": 3490, + "time_per_iteration": 3.264068365097046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072429, + "balance_loss_mlp": 1.04398608, + "epoch": 0.6716044632550981, + "flos": 688344901632.0, + "grad_norm": 0.04887866872345111, + "language_loss": 0.84103191, + "learning_rate": 0.00025713815977655084, + "loss": 0.85175616, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.28466797, + "step": 3491, + "time_per_iteration": 3.480595827102661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067719, + "balance_loss_mlp": 1.03848863, + "epoch": 0.6717968449403616, + "flos": 460391316480.0, + "grad_norm": 0.061790986714500215, + "language_loss": 0.84740448, + "learning_rate": 0.0002568658839382969, + "loss": 0.8580817, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.29199219, + "step": 3492, + "time_per_iteration": 3.149390935897827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071869, + "balance_loss_mlp": 1.04366422, + "epoch": 0.6719892266256252, + "flos": 501362182656.0, + "grad_norm": 0.060742623870238814, + "language_loss": 0.84422779, + "learning_rate": 0.00025659370249307814, + "loss": 0.85494649, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 0.28198242, + "step": 3493, + "time_per_iteration": 3.043328285217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067893, + "balance_loss_mlp": 1.03840065, + "epoch": 0.6721816083108888, + "flos": 683223386112.0, + "grad_norm": 0.32090754121455606, + "language_loss": 0.85042375, + "learning_rate": 0.00025632161554656473, + "loss": 0.86110264, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.29492188, + "step": 3494, + "time_per_iteration": 3.370725393295288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071555, + "balance_loss_mlp": 1.04256368, + "epoch": 0.6723739899961524, + "flos": 585544578048.0, + "grad_norm": 0.056395041319593345, + "language_loss": 0.82224226, + "learning_rate": 0.00025604962320439017, + "loss": 0.8329578, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.28955078, + "step": 3495, + "time_per_iteration": 3.1383168697357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069781, + "balance_loss_mlp": 1.04155231, + "epoch": 0.672566371681416, + "flos": 506336721408.0, + "grad_norm": 0.05570764429404915, + "language_loss": 0.82211316, + "learning_rate": 0.0002557777255721516, + "loss": 0.832811, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.2824707, + "step": 3496, + "time_per_iteration": 3.2747058868408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073188, + "balance_loss_mlp": 1.0451498, + "epoch": 0.6727587533666795, + "flos": 535405856256.0, + "grad_norm": 0.06368144256739344, + "language_loss": 0.8063888, + "learning_rate": 0.0002555059227554087, + "loss": 0.81712067, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.28027344, + "step": 3497, + "time_per_iteration": 3.241708278656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078052, + "balance_loss_mlp": 1.04920387, + "epoch": 0.672951135051943, + "flos": 602532919296.0, + "grad_norm": 0.05624574913237251, + "language_loss": 0.77828801, + "learning_rate": 0.00025523421485968453, + "loss": 0.78906852, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.28833008, + "step": 3498, + "time_per_iteration": 3.4185025691986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071507, + "balance_loss_mlp": 1.04327822, + "epoch": 0.6731435167372066, + "flos": 810976693248.0, + "grad_norm": 0.05832714819515366, + "language_loss": 0.85479802, + "learning_rate": 0.00025496260199046585, + "loss": 0.86551309, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.28271484, + "step": 3499, + "time_per_iteration": 3.398684501647949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074374, + "balance_loss_mlp": 1.04531085, + "epoch": 0.6733358984224702, + "flos": 611306486784.0, + "grad_norm": 0.0606160593453579, + "language_loss": 0.84417158, + "learning_rate": 0.000254691084253202, + "loss": 0.85491526, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.29052734, + "step": 3500, + "time_per_iteration": 3.204657554626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075309, + "balance_loss_mlp": 1.04641259, + "epoch": 0.6735282801077337, + "flos": 558636120576.0, + "grad_norm": 0.05651280486547688, + "language_loss": 0.7721619, + "learning_rate": 0.00025441966175330567, + "loss": 0.782915, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.2890625, + "step": 3501, + "time_per_iteration": 3.280398368835449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079946, + "balance_loss_mlp": 1.05078757, + "epoch": 0.6737206617929973, + "flos": 672134962176.0, + "grad_norm": 0.09712144532107508, + "language_loss": 0.79372454, + "learning_rate": 0.00025414833459615183, + "loss": 0.804524, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.29174805, + "step": 3502, + "time_per_iteration": 3.221496343612671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079859, + "balance_loss_mlp": 1.0510819, + "epoch": 0.6739130434782609, + "flos": 633148683264.0, + "grad_norm": 0.05864951358988012, + "language_loss": 0.80395651, + "learning_rate": 0.0002538771028870796, + "loss": 0.81475508, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.28759766, + "step": 3503, + "time_per_iteration": 3.3205838203430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075878, + "balance_loss_mlp": 1.04710114, + "epoch": 0.6741054251635245, + "flos": 531171841536.0, + "grad_norm": 0.060463290728931994, + "language_loss": 0.81723624, + "learning_rate": 0.0002536059667313903, + "loss": 0.827995, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.2878418, + "step": 3504, + "time_per_iteration": 3.39898419380188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107324, + "balance_loss_mlp": 1.04415321, + "epoch": 0.674297806848788, + "flos": 542343223296.0, + "grad_norm": 0.056146401144420426, + "language_loss": 0.89261472, + "learning_rate": 0.0002533349262343483, + "loss": 0.90334713, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.29077148, + "step": 3505, + "time_per_iteration": 3.3431026935577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072084, + "balance_loss_mlp": 1.04342639, + "epoch": 0.6744901885340515, + "flos": 463291107840.0, + "grad_norm": 0.0612472301672692, + "language_loss": 0.82005084, + "learning_rate": 0.0002530639815011807, + "loss": 0.83077168, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.28662109, + "step": 3506, + "time_per_iteration": 2.985283374786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070171, + "balance_loss_mlp": 1.04220426, + "epoch": 0.6746825702193151, + "flos": 631533652992.0, + "grad_norm": 0.059607136715137135, + "language_loss": 0.84537947, + "learning_rate": 0.0002527931326370781, + "loss": 0.85608113, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 0.27978516, + "step": 3507, + "time_per_iteration": 3.1282057762145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071183, + "balance_loss_mlp": 1.04271555, + "epoch": 0.6748749519045787, + "flos": 670835644416.0, + "grad_norm": 0.05533021024656612, + "language_loss": 0.82755983, + "learning_rate": 0.00025252237974719276, + "loss": 0.83827162, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.28491211, + "step": 3508, + "time_per_iteration": 3.260610580444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066579, + "balance_loss_mlp": 1.03813529, + "epoch": 0.6750673335898423, + "flos": 766756827648.0, + "grad_norm": 0.05860673503825768, + "language_loss": 0.80004764, + "learning_rate": 0.00025225172293664056, + "loss": 0.81071347, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.28442383, + "step": 3509, + "time_per_iteration": 3.373530864715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017561, + "balance_loss_mlp": 1.00540209, + "epoch": 0.6752597152751059, + "flos": 1511786198016.0, + "grad_norm": 0.014769475443499856, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.77950692, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.12158203, + "step": 3510, + "time_per_iteration": 6.158355951309204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080364, + "balance_loss_mlp": 1.05111003, + "epoch": 0.6754520969603693, + "flos": 686990329344.0, + "grad_norm": 0.06842841117996161, + "language_loss": 0.84400952, + "learning_rate": 0.00025171069797381106, + "loss": 0.8548131, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.29248047, + "step": 3511, + "time_per_iteration": 3.2980220317840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070527, + "balance_loss_mlp": 1.04234552, + "epoch": 0.6756444786456329, + "flos": 500318940672.0, + "grad_norm": 0.0575194424100886, + "language_loss": 0.81909519, + "learning_rate": 0.00025144033003157864, + "loss": 0.82980049, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.28173828, + "step": 3512, + "time_per_iteration": 3.140373706817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071116, + "balance_loss_mlp": 1.04319715, + "epoch": 0.6758368603308965, + "flos": 492357270528.0, + "grad_norm": 0.07351376561683495, + "language_loss": 0.78513837, + "learning_rate": 0.00025117005858876806, + "loss": 0.7958495, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.27978516, + "step": 3513, + "time_per_iteration": 3.3946895599365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070978, + "balance_loss_mlp": 1.04212952, + "epoch": 0.6760292420161601, + "flos": 555657753600.0, + "grad_norm": 0.056817312971520956, + "language_loss": 0.85350752, + "learning_rate": 0.000250899883750308, + "loss": 0.86421728, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.28881836, + "step": 3514, + "time_per_iteration": 3.2081196308135986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071843, + "balance_loss_mlp": 1.04368556, + "epoch": 0.6762216237014236, + "flos": 607322755584.0, + "grad_norm": 0.05856137084704242, + "language_loss": 0.81469542, + "learning_rate": 0.00025062980562109006, + "loss": 0.82541388, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.28173828, + "step": 3515, + "time_per_iteration": 3.234687566757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070317, + "balance_loss_mlp": 1.04268479, + "epoch": 0.6764140053866872, + "flos": 533501254656.0, + "grad_norm": 0.0684742974897707, + "language_loss": 0.8283475, + "learning_rate": 0.0002503598243059677, + "loss": 0.83905065, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.27685547, + "step": 3516, + "time_per_iteration": 3.276319980621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107498, + "balance_loss_mlp": 1.04684663, + "epoch": 0.6766063870719508, + "flos": 504548573184.0, + "grad_norm": 0.05816726448499056, + "language_loss": 0.80307925, + "learning_rate": 0.0002500899399097568, + "loss": 0.81382906, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.28149414, + "step": 3517, + "time_per_iteration": 3.361901044845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073931, + "balance_loss_mlp": 1.0454638, + "epoch": 0.6767987687572143, + "flos": 512923470336.0, + "grad_norm": 0.06530995059631492, + "language_loss": 0.85096681, + "learning_rate": 0.0002498201525372359, + "loss": 0.86170614, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.28491211, + "step": 3518, + "time_per_iteration": 3.10380220413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010719, + "balance_loss_mlp": 1.04421926, + "epoch": 0.6769911504424779, + "flos": 524780121600.0, + "grad_norm": 0.061284941283787836, + "language_loss": 0.83024853, + "learning_rate": 0.00024955046229314584, + "loss": 0.84096754, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.27709961, + "step": 3519, + "time_per_iteration": 3.1552722454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069226, + "balance_loss_mlp": 1.04195142, + "epoch": 0.6771835321277414, + "flos": 449662275072.0, + "grad_norm": 0.06591388650746736, + "language_loss": 0.87507355, + "learning_rate": 0.00024928086928218947, + "loss": 0.88576579, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.27307129, + "step": 3520, + "time_per_iteration": 3.176281452178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073411, + "balance_loss_mlp": 1.04553986, + "epoch": 0.677375913813005, + "flos": 709020200448.0, + "grad_norm": 0.06204053550598198, + "language_loss": 0.76553816, + "learning_rate": 0.00024901137360903216, + "loss": 0.7762723, + "num_input_tokens_seen": 291998752, + "router_z_loss_mlp": 0.27905273, + "step": 3521, + "time_per_iteration": 3.2491977214813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075413, + "balance_loss_mlp": 1.04773283, + "epoch": 0.6775682954982686, + "flos": 428189635584.0, + "grad_norm": 0.06068405228401802, + "language_loss": 0.80714798, + "learning_rate": 0.00024874197537830115, + "loss": 0.81790209, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.27734375, + "step": 3522, + "time_per_iteration": 3.2800705432891846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069929, + "balance_loss_mlp": 1.04258251, + "epoch": 0.6777606771835322, + "flos": 437677585920.0, + "grad_norm": 0.0705299171766763, + "language_loss": 0.83310688, + "learning_rate": 0.00024847267469458684, + "loss": 0.84380615, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.27392578, + "step": 3523, + "time_per_iteration": 3.044410228729248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072093, + "balance_loss_mlp": 1.04400754, + "epoch": 0.6779530588687956, + "flos": 775106993664.0, + "grad_norm": 0.05514098679922032, + "language_loss": 0.77547973, + "learning_rate": 0.00024820347166244034, + "loss": 0.78620064, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.28100586, + "step": 3524, + "time_per_iteration": 3.3789007663726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074799, + "balance_loss_mlp": 1.04697526, + "epoch": 0.6781454405540592, + "flos": 571502518272.0, + "grad_norm": 0.05352508807919392, + "language_loss": 0.84795761, + "learning_rate": 0.0002479343663863755, + "loss": 0.85870552, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.27856445, + "step": 3525, + "time_per_iteration": 3.242717742919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072571, + "balance_loss_mlp": 1.04462886, + "epoch": 0.6783378222393228, + "flos": 484788478464.0, + "grad_norm": 0.06320153638070183, + "language_loss": 0.76689994, + "learning_rate": 0.00024766535897086876, + "loss": 0.77762568, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.27929688, + "step": 3526, + "time_per_iteration": 3.28702712059021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107187, + "balance_loss_mlp": 1.04366529, + "epoch": 0.6785302039245864, + "flos": 482592895488.0, + "grad_norm": 0.06947465366955115, + "language_loss": 0.79284716, + "learning_rate": 0.0002473964495203578, + "loss": 0.80356586, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.28222656, + "step": 3527, + "time_per_iteration": 3.2413079738616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107552, + "balance_loss_mlp": 1.0474577, + "epoch": 0.67872258560985, + "flos": 524451262464.0, + "grad_norm": 0.05313281252101078, + "language_loss": 0.8542428, + "learning_rate": 0.0002471276381392425, + "loss": 0.86499798, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.28076172, + "step": 3528, + "time_per_iteration": 3.3680808544158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033221, + "balance_loss_mlp": 1.02044225, + "epoch": 0.6789149672951135, + "flos": 1551786605568.0, + "grad_norm": 0.015931191486776266, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.79221857, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.12792969, + "step": 3529, + "time_per_iteration": 5.628952741622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069556, + "balance_loss_mlp": 1.04094601, + "epoch": 0.6791073489803771, + "flos": 741088051200.0, + "grad_norm": 0.06736468086197074, + "language_loss": 0.84283829, + "learning_rate": 0.00024659031000260826, + "loss": 0.85353386, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.28588867, + "step": 3530, + "time_per_iteration": 2.8723843097686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069609, + "balance_loss_mlp": 1.04080772, + "epoch": 0.6792997306656406, + "flos": 576095915520.0, + "grad_norm": 0.0688001707056691, + "language_loss": 0.80604416, + "learning_rate": 0.0002463217934556985, + "loss": 0.81674021, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.28808594, + "step": 3531, + "time_per_iteration": 2.7234296798706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102575, + "balance_loss_mlp": 1.01316202, + "epoch": 0.6794921123509042, + "flos": 1502538356736.0, + "grad_norm": 0.012819798724274224, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.77557838, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.12597656, + "step": 3532, + "time_per_iteration": 4.774993181228638 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069098, + "balance_loss_mlp": 1.04098845, + "epoch": 0.6796844940361677, + "flos": 698620018176.0, + "grad_norm": 0.07494627627994242, + "language_loss": 0.83949304, + "learning_rate": 0.0002457850559259306, + "loss": 0.85018402, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.28125, + "step": 3533, + "time_per_iteration": 2.854862928390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069128, + "balance_loss_mlp": 1.04123271, + "epoch": 0.6798768757214313, + "flos": 552496094208.0, + "grad_norm": 0.05955036314433414, + "language_loss": 0.81432045, + "learning_rate": 0.00024551683515145275, + "loss": 0.82501173, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.27905273, + "step": 3534, + "time_per_iteration": 2.662670612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068932, + "balance_loss_mlp": 1.04084659, + "epoch": 0.6800692574066949, + "flos": 522677670912.0, + "grad_norm": 0.05698546166287553, + "language_loss": 0.86435509, + "learning_rate": 0.0002452487131761014, + "loss": 0.87504447, + "num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.28125, + "step": 3535, + "time_per_iteration": 2.7052507400512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068803, + "balance_loss_mlp": 1.0406456, + "epoch": 0.6802616390919585, + "flos": 573747563520.0, + "grad_norm": 0.2007355544417899, + "language_loss": 0.79636157, + "learning_rate": 0.00024498069010397093, + "loss": 0.80704963, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.28198242, + "step": 3536, + "time_per_iteration": 2.6741490364074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073159, + "balance_loss_mlp": 1.04452467, + "epoch": 0.6804540207772221, + "flos": 487915232256.0, + "grad_norm": 0.06175774783534356, + "language_loss": 0.85386938, + "learning_rate": 0.00024471276603911697, + "loss": 0.86460102, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.28613281, + "step": 3537, + "time_per_iteration": 2.582512378692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106927, + "balance_loss_mlp": 1.04049325, + "epoch": 0.6806464024624855, + "flos": 578307465216.0, + "grad_norm": 0.05665258990060116, + "language_loss": 0.79265833, + "learning_rate": 0.0002444449410855572, + "loss": 0.80335104, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.28759766, + "step": 3538, + "time_per_iteration": 2.7172720432281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075887, + "balance_loss_mlp": 1.04689479, + "epoch": 0.6808387841477491, + "flos": 553456378368.0, + "grad_norm": 0.04143612880488866, + "language_loss": 0.84057069, + "learning_rate": 0.00024417721534727033, + "loss": 0.85132951, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.29003906, + "step": 3539, + "time_per_iteration": 2.6684606075286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072025, + "balance_loss_mlp": 1.04322374, + "epoch": 0.6810311658330127, + "flos": 426613893120.0, + "grad_norm": 0.07425691047539493, + "language_loss": 0.82827783, + "learning_rate": 0.00024390958892819687, + "loss": 0.83899808, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.28759766, + "step": 3540, + "time_per_iteration": 2.4658186435699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107288, + "balance_loss_mlp": 1.04481781, + "epoch": 0.6812235475182763, + "flos": 571956443136.0, + "grad_norm": 0.05780068585896815, + "language_loss": 0.80981314, + "learning_rate": 0.0002436420619322381, + "loss": 0.82054192, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.28100586, + "step": 3541, + "time_per_iteration": 2.8231966495513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077487, + "balance_loss_mlp": 1.04835224, + "epoch": 0.6814159292035398, + "flos": 501648781824.0, + "grad_norm": 0.05333594930296874, + "language_loss": 0.82771194, + "learning_rate": 0.0002433746344632577, + "loss": 0.83848679, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.29101562, + "step": 3542, + "time_per_iteration": 2.6959166526794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071587, + "balance_loss_mlp": 1.04259515, + "epoch": 0.6816083108888034, + "flos": 765176702976.0, + "grad_norm": 0.224573626709811, + "language_loss": 0.80137914, + "learning_rate": 0.00024310730662508006, + "loss": 0.81209499, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.28955078, + "step": 3543, + "time_per_iteration": 3.0683388710021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075151, + "balance_loss_mlp": 1.04639745, + "epoch": 0.681800692574067, + "flos": 479205683712.0, + "grad_norm": 0.05641923702729484, + "language_loss": 0.87227619, + "learning_rate": 0.0002428400785214911, + "loss": 0.88302767, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.28759766, + "step": 3544, + "time_per_iteration": 2.602978467941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075917, + "balance_loss_mlp": 1.04830861, + "epoch": 0.6819930742593305, + "flos": 691298537472.0, + "grad_norm": 0.05415791739342902, + "language_loss": 0.82201838, + "learning_rate": 0.00024257295025623794, + "loss": 0.83277762, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.27636719, + "step": 3545, + "time_per_iteration": 2.8973493576049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079854, + "balance_loss_mlp": 1.05074358, + "epoch": 0.6821854559445941, + "flos": 677783185920.0, + "grad_norm": 0.05879535961793021, + "language_loss": 0.8075946, + "learning_rate": 0.00024230592193302892, + "loss": 0.81839317, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.29077148, + "step": 3546, + "time_per_iteration": 2.8674380779266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079529, + "balance_loss_mlp": 1.0514431, + "epoch": 0.6823778376298576, + "flos": 461956884480.0, + "grad_norm": 0.05930658835110869, + "language_loss": 0.84390098, + "learning_rate": 0.00024203899365553372, + "loss": 0.85469627, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.28100586, + "step": 3547, + "time_per_iteration": 2.570162773132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104935, + "balance_loss_mlp": 1.03785849, + "epoch": 0.6825702193151212, + "flos": 1474582427136.0, + "grad_norm": 0.024142362504210636, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.7778371, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.11474609, + "step": 3548, + "time_per_iteration": 4.54862117767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077697, + "balance_loss_mlp": 1.0492295, + "epoch": 0.6827626010003848, + "flos": 722791627776.0, + "grad_norm": 0.05396480474730288, + "language_loss": 0.82952201, + "learning_rate": 0.00024150543765216848, + "loss": 0.84029901, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.28442383, + "step": 3549, + "time_per_iteration": 2.8922061920166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081348, + "balance_loss_mlp": 1.05261874, + "epoch": 0.6829549826856484, + "flos": 558596832768.0, + "grad_norm": 0.08705135979463063, + "language_loss": 0.83172846, + "learning_rate": 0.00024123881013344352, + "loss": 0.84254193, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.28735352, + "step": 3550, + "time_per_iteration": 2.674441337585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081968, + "balance_loss_mlp": 1.05381048, + "epoch": 0.6831473643709118, + "flos": 624635573760.0, + "grad_norm": 0.052816648102186906, + "language_loss": 0.79533482, + "learning_rate": 0.00024097228307472202, + "loss": 0.80615449, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.28173828, + "step": 3551, + "time_per_iteration": 2.810211181640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108367, + "balance_loss_mlp": 1.0561564, + "epoch": 0.6833397460561754, + "flos": 713553960960.0, + "grad_norm": 0.06537057112409075, + "language_loss": 0.82174456, + "learning_rate": 0.00024070585657947846, + "loss": 0.83258128, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.27563477, + "step": 3552, + "time_per_iteration": 2.903355598449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108401, + "balance_loss_mlp": 1.05487537, + "epoch": 0.683532127741439, + "flos": 464449241088.0, + "grad_norm": 0.04571103673496298, + "language_loss": 0.85090339, + "learning_rate": 0.00024043953075114934, + "loss": 0.86174351, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.29150391, + "step": 3553, + "time_per_iteration": 2.683868169784546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085174, + "balance_loss_mlp": 1.05711174, + "epoch": 0.6837245094267026, + "flos": 581979866112.0, + "grad_norm": 0.06261928817671675, + "language_loss": 0.88604438, + "learning_rate": 0.00024017330569313128, + "loss": 0.89689612, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.28051758, + "step": 3554, + "time_per_iteration": 2.7235445976257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085006, + "balance_loss_mlp": 1.05611026, + "epoch": 0.6839168911119662, + "flos": 793836993024.0, + "grad_norm": 0.05900054168258606, + "language_loss": 0.74906945, + "learning_rate": 0.0002399071815087821, + "loss": 0.75991952, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.28857422, + "step": 3555, + "time_per_iteration": 3.0646519660949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085121, + "balance_loss_mlp": 1.05579519, + "epoch": 0.6841092727972297, + "flos": 579734820864.0, + "grad_norm": 0.06151916899658477, + "language_loss": 0.84067833, + "learning_rate": 0.00023964115830142025, + "loss": 0.85152954, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.29321289, + "step": 3556, + "time_per_iteration": 2.670454740524292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086273, + "balance_loss_mlp": 1.05785322, + "epoch": 0.6843016544824932, + "flos": 383530401792.0, + "grad_norm": 0.07044194962998349, + "language_loss": 0.87372839, + "learning_rate": 0.00023937523617432522, + "loss": 0.8845911, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.28393555, + "step": 3557, + "time_per_iteration": 2.442620038986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079062, + "balance_loss_mlp": 1.05073762, + "epoch": 0.6844940361677568, + "flos": 1438474332672.0, + "grad_norm": 0.11887051887526623, + "language_loss": 0.86776745, + "learning_rate": 0.00023910941523073705, + "loss": 0.8785581, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.28320312, + "step": 3558, + "time_per_iteration": 3.9105570316314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080627, + "balance_loss_mlp": 1.05211186, + "epoch": 0.6846864178530204, + "flos": 520614508032.0, + "grad_norm": 0.05794224336416494, + "language_loss": 0.86635411, + "learning_rate": 0.0002388436955738566, + "loss": 0.87716037, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.28540039, + "step": 3559, + "time_per_iteration": 2.7885656356811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010825, + "balance_loss_mlp": 1.05310321, + "epoch": 0.6848787995382839, + "flos": 717626442240.0, + "grad_norm": 0.06653025521174674, + "language_loss": 0.81589997, + "learning_rate": 0.00023857807730684523, + "loss": 0.82672501, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.29394531, + "step": 3560, + "time_per_iteration": 2.8988590240478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082565, + "balance_loss_mlp": 1.05378819, + "epoch": 0.6850711812235475, + "flos": 510787524096.0, + "grad_norm": 0.07668578233950803, + "language_loss": 0.82023144, + "learning_rate": 0.00023831256053282547, + "loss": 0.83105713, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.2878418, + "step": 3561, + "time_per_iteration": 2.644080877304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082938, + "balance_loss_mlp": 1.05380273, + "epoch": 0.6852635629088111, + "flos": 667832546304.0, + "grad_norm": 0.07104594234153103, + "language_loss": 0.78454512, + "learning_rate": 0.00023804714535488003, + "loss": 0.79537451, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.29150391, + "step": 3562, + "time_per_iteration": 2.8966143131256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042929, + "balance_loss_mlp": 1.03124619, + "epoch": 0.6854559445940747, + "flos": 1522136918016.0, + "grad_norm": 0.023182514695526305, + "language_loss": 0.7980963, + "learning_rate": 0.0002377818318760519, + "loss": 0.80852556, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.11669922, + "step": 3563, + "time_per_iteration": 4.932991027832031 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078302, + "balance_loss_mlp": 1.04947758, + "epoch": 0.6856483262793382, + "flos": 453970483200.0, + "grad_norm": 0.05956770996074772, + "language_loss": 0.8101843, + "learning_rate": 0.00023751662019934488, + "loss": 0.82096732, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.2878418, + "step": 3564, + "time_per_iteration": 2.49049711227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080425, + "balance_loss_mlp": 1.05214906, + "epoch": 0.6858407079646017, + "flos": 615269869056.0, + "grad_norm": 0.05086931810535688, + "language_loss": 0.78869629, + "learning_rate": 0.00023725151042772364, + "loss": 0.79950058, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.28271484, + "step": 3565, + "time_per_iteration": 2.7470548152923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079752, + "balance_loss_mlp": 1.04959226, + "epoch": 0.6860330896498653, + "flos": 465793638912.0, + "grad_norm": 0.07206608311036458, + "language_loss": 0.83451784, + "learning_rate": 0.00023698650266411276, + "loss": 0.8453154, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.30102539, + "step": 3566, + "time_per_iteration": 2.6310577392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079469, + "balance_loss_mlp": 1.04949975, + "epoch": 0.6862254713351289, + "flos": 863879814144.0, + "grad_norm": 0.05434580355598899, + "language_loss": 0.83292013, + "learning_rate": 0.00023672159701139755, + "loss": 0.84371483, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.29931641, + "step": 3567, + "time_per_iteration": 3.2131478786468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081005, + "balance_loss_mlp": 1.05160773, + "epoch": 0.6864178530203925, + "flos": 446905078272.0, + "grad_norm": 0.11905493017863943, + "language_loss": 0.8579241, + "learning_rate": 0.00023645679357242296, + "loss": 0.86873412, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.29370117, + "step": 3568, + "time_per_iteration": 2.536799192428589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079259, + "balance_loss_mlp": 1.04881263, + "epoch": 0.6866102347056561, + "flos": 424034196480.0, + "grad_norm": 0.0572051056650869, + "language_loss": 0.83415657, + "learning_rate": 0.00023619209244999534, + "loss": 0.84494913, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.30395508, + "step": 3569, + "time_per_iteration": 2.6000583171844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071372, + "balance_loss_mlp": 1.0414027, + "epoch": 0.6868026163909196, + "flos": 472134486528.0, + "grad_norm": 0.07852810593031194, + "language_loss": 0.84651816, + "learning_rate": 0.0002359274937468806, + "loss": 0.85723186, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.29931641, + "step": 3570, + "time_per_iteration": 2.57413387298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075523, + "balance_loss_mlp": 1.04479098, + "epoch": 0.6869949980761831, + "flos": 463937089536.0, + "grad_norm": 0.05388106388486604, + "language_loss": 0.77385354, + "learning_rate": 0.00023566299756580512, + "loss": 0.78460878, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.30688477, + "step": 3571, + "time_per_iteration": 2.6366066932678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076128, + "balance_loss_mlp": 1.04491949, + "epoch": 0.6871873797614467, + "flos": 426012991488.0, + "grad_norm": 0.07115585873088184, + "language_loss": 0.78295314, + "learning_rate": 0.0002353986040094551, + "loss": 0.79371446, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 0.31176758, + "step": 3572, + "time_per_iteration": 2.503833532333374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070084, + "balance_loss_mlp": 1.03882694, + "epoch": 0.6873797614467103, + "flos": 443394210816.0, + "grad_norm": 0.06984885351733894, + "language_loss": 0.79368085, + "learning_rate": 0.00023513431318047796, + "loss": 0.80438167, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.31225586, + "step": 3573, + "time_per_iteration": 2.568976879119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107429, + "balance_loss_mlp": 1.04293847, + "epoch": 0.6875721431319738, + "flos": 991927074816.0, + "grad_norm": 0.060417226210131056, + "language_loss": 0.76676512, + "learning_rate": 0.00023487012518147977, + "loss": 0.77750802, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.31323242, + "step": 3574, + "time_per_iteration": 3.229848861694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069454, + "balance_loss_mlp": 1.03836417, + "epoch": 0.6877645248172374, + "flos": 1285031900160.0, + "grad_norm": 0.06028735388663287, + "language_loss": 0.84485316, + "learning_rate": 0.00023460604011502772, + "loss": 0.85554767, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.31054688, + "step": 3575, + "time_per_iteration": 3.6276612281799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068189, + "balance_loss_mlp": 1.03640747, + "epoch": 0.687956906502501, + "flos": 876360688128.0, + "grad_norm": 0.059284706265635014, + "language_loss": 0.85573983, + "learning_rate": 0.00023434205808364845, + "loss": 0.8664217, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.31762695, + "step": 3576, + "time_per_iteration": 3.154609203338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073627, + "balance_loss_mlp": 1.04146445, + "epoch": 0.6881492881877646, + "flos": 563038871040.0, + "grad_norm": 0.06862311945477588, + "language_loss": 0.85635597, + "learning_rate": 0.00023407817918982932, + "loss": 0.86709225, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.3215332, + "step": 3577, + "time_per_iteration": 2.770382881164551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065226, + "balance_loss_mlp": 1.03480327, + "epoch": 0.6883416698730281, + "flos": 794782720512.0, + "grad_norm": 0.05501523594648703, + "language_loss": 0.78652638, + "learning_rate": 0.00023381440353601718, + "loss": 0.79717863, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.30371094, + "step": 3578, + "time_per_iteration": 3.0038936138153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068554, + "balance_loss_mlp": 1.03674912, + "epoch": 0.6885340515582916, + "flos": 723308161536.0, + "grad_norm": 0.07314782332090318, + "language_loss": 0.85671222, + "learning_rate": 0.00023355073122461822, + "loss": 0.86739773, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.31787109, + "step": 3579, + "time_per_iteration": 2.901097059249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068864, + "balance_loss_mlp": 1.03798902, + "epoch": 0.6887264332435552, + "flos": 1010529036288.0, + "grad_norm": 0.05988205540841198, + "language_loss": 0.82838941, + "learning_rate": 0.00023328716235799973, + "loss": 0.83907801, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.30834961, + "step": 3580, + "time_per_iteration": 3.3144712448120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068509, + "balance_loss_mlp": 1.03734803, + "epoch": 0.6889188149288188, + "flos": 584993138688.0, + "grad_norm": 0.05209228569629584, + "language_loss": 0.83578706, + "learning_rate": 0.00023302369703848803, + "loss": 0.84647214, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.3112793, + "step": 3581, + "time_per_iteration": 2.7352983951568604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072888, + "balance_loss_mlp": 1.04153562, + "epoch": 0.6891111966140824, + "flos": 635831686656.0, + "grad_norm": 0.06738914955836864, + "language_loss": 0.80107218, + "learning_rate": 0.00023276033536836937, + "loss": 0.81180108, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.31323242, + "step": 3582, + "time_per_iteration": 2.8315579891204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069685, + "balance_loss_mlp": 1.03849971, + "epoch": 0.6893035782993459, + "flos": 495011160576.0, + "grad_norm": 0.07822330365866909, + "language_loss": 0.84485823, + "learning_rate": 0.00023249707744988984, + "loss": 0.85555506, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.31176758, + "step": 3583, + "time_per_iteration": 2.6693801879882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070485, + "balance_loss_mlp": 1.03927565, + "epoch": 0.6894959599846094, + "flos": 457983327744.0, + "grad_norm": 0.09035135761218806, + "language_loss": 0.82157326, + "learning_rate": 0.00023223392338525529, + "loss": 0.83227813, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.31176758, + "step": 3584, + "time_per_iteration": 2.6018331050872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071087, + "balance_loss_mlp": 1.03997374, + "epoch": 0.689688341669873, + "flos": 504740630016.0, + "grad_norm": 0.07744993578546541, + "language_loss": 0.78292501, + "learning_rate": 0.00023197087327663107, + "loss": 0.79363585, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.31079102, + "step": 3585, + "time_per_iteration": 2.6550607681274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073164, + "balance_loss_mlp": 1.04259896, + "epoch": 0.6898807233551366, + "flos": 763584993792.0, + "grad_norm": 0.06125478015545225, + "language_loss": 0.80901551, + "learning_rate": 0.00023170792722614243, + "loss": 0.81974715, + "num_input_tokens_seen": 297338352, + "router_z_loss_mlp": 0.30541992, + "step": 3586, + "time_per_iteration": 2.9460513591766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071475, + "balance_loss_mlp": 1.04057574, + "epoch": 0.6900731050404002, + "flos": 583030310400.0, + "grad_norm": 0.05047941445610664, + "language_loss": 0.83664584, + "learning_rate": 0.00023144508533587377, + "loss": 0.84736061, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.30859375, + "step": 3587, + "time_per_iteration": 2.856055498123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073863, + "balance_loss_mlp": 1.04320216, + "epoch": 0.6902654867256637, + "flos": 711531495936.0, + "grad_norm": 0.06477764746614291, + "language_loss": 0.78527439, + "learning_rate": 0.0002311823477078698, + "loss": 0.796013, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.30615234, + "step": 3588, + "time_per_iteration": 3.003086805343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075947, + "balance_loss_mlp": 1.04569197, + "epoch": 0.6904578684109273, + "flos": 596816294400.0, + "grad_norm": 0.08587382139418309, + "language_loss": 0.8476119, + "learning_rate": 0.00023091971444413428, + "loss": 0.85837138, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.30224609, + "step": 3589, + "time_per_iteration": 2.81282114982605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080015, + "balance_loss_mlp": 1.04909205, + "epoch": 0.6906502500961909, + "flos": 584757411840.0, + "grad_norm": 0.06247314370450002, + "language_loss": 0.82250512, + "learning_rate": 0.00023065718564663012, + "loss": 0.83330524, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.30883789, + "step": 3590, + "time_per_iteration": 2.7536580562591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038243, + "balance_loss_mlp": 1.02656031, + "epoch": 0.6908426317814544, + "flos": 1587001559040.0, + "grad_norm": 0.017663884765429294, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.74949831, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.11669922, + "step": 3591, + "time_per_iteration": 4.997710704803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076225, + "balance_loss_mlp": 1.04732895, + "epoch": 0.6910350134667179, + "flos": 500525554176.0, + "grad_norm": 0.06051074258589463, + "language_loss": 0.80712819, + "learning_rate": 0.0002301324418579666, + "loss": 0.81789041, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.28881836, + "step": 3592, + "time_per_iteration": 2.6742522716522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034728, + "balance_loss_mlp": 1.02309299, + "epoch": 0.6912273951519815, + "flos": 1408462138368.0, + "grad_norm": 0.018187638305653092, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.79723203, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.11621094, + "step": 3593, + "time_per_iteration": 4.769122123718262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077785, + "balance_loss_mlp": 1.04865015, + "epoch": 0.6914197768372451, + "flos": 634961562624.0, + "grad_norm": 0.06768771188848043, + "language_loss": 0.80975646, + "learning_rate": 0.00022960811715677415, + "loss": 0.82053435, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.29101562, + "step": 3594, + "time_per_iteration": 2.8826262950897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073934, + "balance_loss_mlp": 1.04472804, + "epoch": 0.6916121585225087, + "flos": 557755822080.0, + "grad_norm": 0.06319085560184597, + "language_loss": 0.81575662, + "learning_rate": 0.00022934611221845608, + "loss": 0.82649601, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.29150391, + "step": 3595, + "time_per_iteration": 2.8295226097106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076251, + "balance_loss_mlp": 1.04663992, + "epoch": 0.6918045402077723, + "flos": 528887508480.0, + "grad_norm": 0.06812021191327418, + "language_loss": 0.7816391, + "learning_rate": 0.00022908421235729609, + "loss": 0.79240167, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.29589844, + "step": 3596, + "time_per_iteration": 2.6967883110046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072978, + "balance_loss_mlp": 1.04343832, + "epoch": 0.6919969218930357, + "flos": 570083927040.0, + "grad_norm": 0.05588162703096273, + "language_loss": 0.85190284, + "learning_rate": 0.0002288224176749728, + "loss": 0.86263263, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.29492188, + "step": 3597, + "time_per_iteration": 2.640408515930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076769, + "balance_loss_mlp": 1.04775333, + "epoch": 0.6921893035782993, + "flos": 683006598144.0, + "grad_norm": 0.0641823490668264, + "language_loss": 0.78313982, + "learning_rate": 0.00022856072827312385, + "loss": 0.79390752, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.28979492, + "step": 3598, + "time_per_iteration": 2.840587854385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072445, + "balance_loss_mlp": 1.0432148, + "epoch": 0.6923816852635629, + "flos": 546484105728.0, + "grad_norm": 0.07324523845521881, + "language_loss": 0.76861233, + "learning_rate": 0.00022829914425334598, + "loss": 0.77933681, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.29223633, + "step": 3599, + "time_per_iteration": 2.6705574989318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068561, + "balance_loss_mlp": 1.03871107, + "epoch": 0.6925740669488265, + "flos": 509782159872.0, + "grad_norm": 0.06707330247170458, + "language_loss": 0.80270433, + "learning_rate": 0.0002280376657171956, + "loss": 0.8133899, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.2980957, + "step": 3600, + "time_per_iteration": 2.691218852996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070739, + "balance_loss_mlp": 1.04091287, + "epoch": 0.69276644863409, + "flos": 869053764096.0, + "grad_norm": 0.05961595039117338, + "language_loss": 0.76559889, + "learning_rate": 0.00022777629276618706, + "loss": 0.77630627, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.2980957, + "step": 3601, + "time_per_iteration": 3.166266679763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073223, + "balance_loss_mlp": 1.0433017, + "epoch": 0.6929588303193536, + "flos": 625486758912.0, + "grad_norm": 0.05590734740319096, + "language_loss": 0.7759192, + "learning_rate": 0.0002275150255017947, + "loss": 0.78665143, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.29882812, + "step": 3602, + "time_per_iteration": 2.8251349925994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018234, + "balance_loss_mlp": 1.00593138, + "epoch": 0.6931512120046172, + "flos": 1544530553856.0, + "grad_norm": 0.021195340578823645, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76750904, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.12304688, + "step": 3603, + "time_per_iteration": 4.9793617725372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015265, + "balance_loss_mlp": 1.00286758, + "epoch": 0.6933435936898807, + "flos": 1447460001792.0, + "grad_norm": 0.02110962500083285, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.76142371, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.12353516, + "step": 3604, + "time_per_iteration": 4.700538873672485 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072674, + "balance_loss_mlp": 1.04251432, + "epoch": 0.6935359753751443, + "flos": 540639442944.0, + "grad_norm": 0.0788112373404933, + "language_loss": 0.8439424, + "learning_rate": 0.0002267318588424379, + "loss": 0.85466921, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.30151367, + "step": 3605, + "time_per_iteration": 2.595975160598755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067214, + "balance_loss_mlp": 1.03688753, + "epoch": 0.6937283570604078, + "flos": 719074146816.0, + "grad_norm": 0.060784014113104926, + "language_loss": 0.87543291, + "learning_rate": 0.00022647101533842845, + "loss": 0.88610506, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.30297852, + "step": 3606, + "time_per_iteration": 2.8924877643585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072933, + "balance_loss_mlp": 1.04255819, + "epoch": 0.6939207387456714, + "flos": 521909443584.0, + "grad_norm": 0.06196096561897257, + "language_loss": 0.76276547, + "learning_rate": 0.00022621027802778872, + "loss": 0.77349472, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.30322266, + "step": 3607, + "time_per_iteration": 2.625544309616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064019, + "balance_loss_mlp": 1.03402638, + "epoch": 0.694113120430935, + "flos": 535100318208.0, + "grad_norm": 0.05568531242453984, + "language_loss": 0.78539741, + "learning_rate": 0.00022594964701174586, + "loss": 0.79603761, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.29956055, + "step": 3608, + "time_per_iteration": 2.617882490158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073585, + "balance_loss_mlp": 1.04363918, + "epoch": 0.6943055021161986, + "flos": 523101072384.0, + "grad_norm": 0.06276821144872391, + "language_loss": 0.84534574, + "learning_rate": 0.00022568912239148586, + "loss": 0.8560816, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.29882812, + "step": 3609, + "time_per_iteration": 2.6177947521209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068336, + "balance_loss_mlp": 1.03836668, + "epoch": 0.694497883801462, + "flos": 484637119488.0, + "grad_norm": 0.056081647762310796, + "language_loss": 0.81555855, + "learning_rate": 0.00022542870426815344, + "loss": 0.82624191, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.29907227, + "step": 3610, + "time_per_iteration": 2.7079262733459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065817, + "balance_loss_mlp": 1.03646755, + "epoch": 0.6946902654867256, + "flos": 461238119424.0, + "grad_norm": 0.0593152321810988, + "language_loss": 0.85921854, + "learning_rate": 0.00022516839274285173, + "loss": 0.86987674, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.29321289, + "step": 3611, + "time_per_iteration": 2.5142312049865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068225, + "balance_loss_mlp": 1.03689671, + "epoch": 0.6948826471719892, + "flos": 512603375616.0, + "grad_norm": 0.07495855617451591, + "language_loss": 0.75130123, + "learning_rate": 0.00022490818791664265, + "loss": 0.76198351, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.31298828, + "step": 3612, + "time_per_iteration": 2.6149849891662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067927, + "balance_loss_mlp": 1.03771973, + "epoch": 0.6950750288572528, + "flos": 556917783552.0, + "grad_norm": 0.05072032327743767, + "language_loss": 0.85225737, + "learning_rate": 0.00022464808989054676, + "loss": 0.86293662, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.30151367, + "step": 3613, + "time_per_iteration": 2.6458423137664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062852, + "balance_loss_mlp": 1.03331208, + "epoch": 0.6952674105425164, + "flos": 542215185408.0, + "grad_norm": 0.07224132209133893, + "language_loss": 0.76020145, + "learning_rate": 0.00022438809876554284, + "loss": 0.77082992, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.29516602, + "step": 3614, + "time_per_iteration": 2.6633236408233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106639, + "balance_loss_mlp": 1.03720808, + "epoch": 0.6954597922277799, + "flos": 546465166848.0, + "grad_norm": 0.05675110425477687, + "language_loss": 0.80015868, + "learning_rate": 0.00022412821464256873, + "loss": 0.81082261, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.29174805, + "step": 3615, + "time_per_iteration": 2.726789712905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063431, + "balance_loss_mlp": 1.03396273, + "epoch": 0.6956521739130435, + "flos": 519255553536.0, + "grad_norm": 0.06271109335257424, + "language_loss": 0.82397133, + "learning_rate": 0.00022386843762252023, + "loss": 0.83460569, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.29418945, + "step": 3616, + "time_per_iteration": 2.6123175621032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106886, + "balance_loss_mlp": 1.03781807, + "epoch": 0.695844555598307, + "flos": 466029365760.0, + "grad_norm": 0.06387852157141136, + "language_loss": 0.79405069, + "learning_rate": 0.00022360876780625193, + "loss": 0.8047393, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.31030273, + "step": 3617, + "time_per_iteration": 2.548015832901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067668, + "balance_loss_mlp": 1.03798532, + "epoch": 0.6960369372835706, + "flos": 600347510784.0, + "grad_norm": 0.0476690799196669, + "language_loss": 0.7988438, + "learning_rate": 0.00022334920529457604, + "loss": 0.80952054, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.296875, + "step": 3618, + "time_per_iteration": 2.899250030517578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066336, + "balance_loss_mlp": 1.0357945, + "epoch": 0.6962293189688342, + "flos": 643927186944.0, + "grad_norm": 0.054798101167174096, + "language_loss": 0.87429041, + "learning_rate": 0.00022308975018826423, + "loss": 0.88495374, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.30517578, + "step": 3619, + "time_per_iteration": 2.96332049369812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070043, + "balance_loss_mlp": 1.04016924, + "epoch": 0.6964217006540977, + "flos": 638524864512.0, + "grad_norm": 0.06421164682139191, + "language_loss": 0.85025704, + "learning_rate": 0.00022283040258804564, + "loss": 0.86095744, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.29858398, + "step": 3620, + "time_per_iteration": 2.7818944454193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067101, + "balance_loss_mlp": 1.03703606, + "epoch": 0.6966140823393613, + "flos": 651864125952.0, + "grad_norm": 0.06644285191513807, + "language_loss": 0.83246511, + "learning_rate": 0.00022257116259460802, + "loss": 0.84313607, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.30004883, + "step": 3621, + "time_per_iteration": 2.870532989501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068386, + "balance_loss_mlp": 1.03901291, + "epoch": 0.6968064640246249, + "flos": 704160552960.0, + "grad_norm": 0.06921875901681852, + "language_loss": 0.81326395, + "learning_rate": 0.00022231203030859725, + "loss": 0.82394779, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.29321289, + "step": 3622, + "time_per_iteration": 2.980616807937622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069183, + "balance_loss_mlp": 1.03923714, + "epoch": 0.6969988457098885, + "flos": 492312190464.0, + "grad_norm": 0.06079999883636956, + "language_loss": 0.83173907, + "learning_rate": 0.00022205300583061737, + "loss": 0.84243095, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.29882812, + "step": 3623, + "time_per_iteration": 2.579345226287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040478, + "balance_loss_mlp": 1.02855718, + "epoch": 0.6971912273951519, + "flos": 1351839974400.0, + "grad_norm": 0.01990235236243219, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83878684, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.11914062, + "step": 3624, + "time_per_iteration": 4.92698335647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106745, + "balance_loss_mlp": 1.03705204, + "epoch": 0.6973836090804155, + "flos": 602182301184.0, + "grad_norm": 0.06709425474580019, + "language_loss": 0.77051836, + "learning_rate": 0.00022153528070095735, + "loss": 0.7811929, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.3034668, + "step": 3625, + "time_per_iteration": 2.732236385345459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072165, + "balance_loss_mlp": 1.04262519, + "epoch": 0.6975759907656791, + "flos": 523805280768.0, + "grad_norm": 0.06819853082306866, + "language_loss": 0.88156587, + "learning_rate": 0.00022127658025027568, + "loss": 0.89228755, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.29516602, + "step": 3626, + "time_per_iteration": 2.6894659996032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072819, + "balance_loss_mlp": 1.04275477, + "epoch": 0.6977683724509427, + "flos": 480672327168.0, + "grad_norm": 0.06462671043275556, + "language_loss": 0.84997016, + "learning_rate": 0.00022101798800962258, + "loss": 0.8606984, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.30004883, + "step": 3627, + "time_per_iteration": 2.578765392303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067981, + "balance_loss_mlp": 1.03732049, + "epoch": 0.6979607541362063, + "flos": 522372132864.0, + "grad_norm": 0.07388726632037217, + "language_loss": 0.7899543, + "learning_rate": 0.00022075950407939227, + "loss": 0.80063409, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.30639648, + "step": 3628, + "time_per_iteration": 2.615227699279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074402, + "balance_loss_mlp": 1.04519582, + "epoch": 0.6981531358214698, + "flos": 547818329088.0, + "grad_norm": 0.07136749331855524, + "language_loss": 0.82724559, + "learning_rate": 0.0002205011285599367, + "loss": 0.83798957, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.29150391, + "step": 3629, + "time_per_iteration": 2.623537063598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068631, + "balance_loss_mlp": 1.0383997, + "epoch": 0.6983455175067333, + "flos": 699747628032.0, + "grad_norm": 0.053682643938984226, + "language_loss": 0.80428958, + "learning_rate": 0.00022024286155156658, + "loss": 0.81497598, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.30224609, + "step": 3630, + "time_per_iteration": 2.8577961921691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074555, + "balance_loss_mlp": 1.04472852, + "epoch": 0.6985378991919969, + "flos": 484819001856.0, + "grad_norm": 0.05341661710184385, + "language_loss": 0.85616398, + "learning_rate": 0.00021998470315454994, + "loss": 0.8669095, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.2980957, + "step": 3631, + "time_per_iteration": 2.6452653408050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068304, + "balance_loss_mlp": 1.03902662, + "epoch": 0.6987302808772605, + "flos": 558503700480.0, + "grad_norm": 0.06182978984642289, + "language_loss": 0.86509019, + "learning_rate": 0.00021972665346911275, + "loss": 0.87577331, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.29296875, + "step": 3632, + "time_per_iteration": 2.7207632064819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073072, + "balance_loss_mlp": 1.04400849, + "epoch": 0.698922662562524, + "flos": 483350948352.0, + "grad_norm": 0.05617398494873169, + "language_loss": 0.79707497, + "learning_rate": 0.00021946871259543877, + "loss": 0.80780566, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.29052734, + "step": 3633, + "time_per_iteration": 2.574397325515747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073801, + "balance_loss_mlp": 1.04488051, + "epoch": 0.6991150442477876, + "flos": 718586726400.0, + "grad_norm": 0.05654795894092567, + "language_loss": 0.83115089, + "learning_rate": 0.00021921088063366957, + "loss": 0.8418889, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.28930664, + "step": 3634, + "time_per_iteration": 2.9441816806793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073179, + "balance_loss_mlp": 1.04452109, + "epoch": 0.6993074259330512, + "flos": 488871134208.0, + "grad_norm": 0.05955924970323312, + "language_loss": 0.8162455, + "learning_rate": 0.00021895315768390435, + "loss": 0.82697725, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.28662109, + "step": 3635, + "time_per_iteration": 2.62445068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107784, + "balance_loss_mlp": 1.04932475, + "epoch": 0.6994998076183148, + "flos": 717745715712.0, + "grad_norm": 0.054016227636185014, + "language_loss": 0.88036686, + "learning_rate": 0.00021869554384619999, + "loss": 0.89114523, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.28491211, + "step": 3636, + "time_per_iteration": 3.0029518604278564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107865, + "balance_loss_mlp": 1.05037308, + "epoch": 0.6996921893035783, + "flos": 578730866688.0, + "grad_norm": 0.06391776997203466, + "language_loss": 0.80659258, + "learning_rate": 0.00021843803922057115, + "loss": 0.81737912, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 0.28295898, + "step": 3637, + "time_per_iteration": 2.7211790084838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107883, + "balance_loss_mlp": 1.05110145, + "epoch": 0.6998845709888418, + "flos": 518369462784.0, + "grad_norm": 0.0662212795858457, + "language_loss": 0.81642038, + "learning_rate": 0.00021818064390698977, + "loss": 0.82720864, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.27758789, + "step": 3638, + "time_per_iteration": 2.5884149074554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081934, + "balance_loss_mlp": 1.05303788, + "epoch": 0.7000769526741054, + "flos": 620666399232.0, + "grad_norm": 0.06374773426861974, + "language_loss": 0.86868232, + "learning_rate": 0.0002179233580053861, + "loss": 0.8795017, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.2890625, + "step": 3639, + "time_per_iteration": 2.753732681274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076492, + "balance_loss_mlp": 1.04776227, + "epoch": 0.700269334359369, + "flos": 559670598144.0, + "grad_norm": 0.059265612347706345, + "language_loss": 0.85829276, + "learning_rate": 0.00021766618161564688, + "loss": 0.86905766, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.28710938, + "step": 3640, + "time_per_iteration": 2.7745206356048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084226, + "balance_loss_mlp": 1.05575871, + "epoch": 0.7004617160446326, + "flos": 483090490368.0, + "grad_norm": 0.15690200420977896, + "language_loss": 0.87115562, + "learning_rate": 0.00021740911483761677, + "loss": 0.88199788, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.28417969, + "step": 3641, + "time_per_iteration": 2.563645362854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080559, + "balance_loss_mlp": 1.05292678, + "epoch": 0.7006540977298961, + "flos": 696647015424.0, + "grad_norm": 0.051778810892446146, + "language_loss": 0.92034602, + "learning_rate": 0.00021715215777109837, + "loss": 0.93115163, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.27685547, + "step": 3642, + "time_per_iteration": 2.9448609352111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082689, + "balance_loss_mlp": 1.05481815, + "epoch": 0.7008464794151597, + "flos": 504528224256.0, + "grad_norm": 0.0649670876424198, + "language_loss": 0.84332794, + "learning_rate": 0.00021689531051585103, + "loss": 0.85415483, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.27905273, + "step": 3643, + "time_per_iteration": 2.5947420597076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080325, + "balance_loss_mlp": 1.05185759, + "epoch": 0.7010388611004232, + "flos": 536985980928.0, + "grad_norm": 0.05881899099988506, + "language_loss": 0.80633974, + "learning_rate": 0.00021663857317159196, + "loss": 0.81714302, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.28466797, + "step": 3644, + "time_per_iteration": 2.6077582836151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084664, + "balance_loss_mlp": 1.0568645, + "epoch": 0.7012312427856868, + "flos": 546996257280.0, + "grad_norm": 0.05176536936587348, + "language_loss": 0.81858003, + "learning_rate": 0.00021638194583799487, + "loss": 0.82942665, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.27832031, + "step": 3645, + "time_per_iteration": 2.661813735961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081277, + "balance_loss_mlp": 1.05335796, + "epoch": 0.7014236244709504, + "flos": 941020125696.0, + "grad_norm": 0.06125341159179279, + "language_loss": 0.82837009, + "learning_rate": 0.00021612542861469176, + "loss": 0.83918285, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.27954102, + "step": 3646, + "time_per_iteration": 3.218862771987915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086908, + "balance_loss_mlp": 1.05860782, + "epoch": 0.7016160061562139, + "flos": 524908159488.0, + "grad_norm": 0.06205257588419687, + "language_loss": 0.82430637, + "learning_rate": 0.00021586902160127135, + "loss": 0.83517551, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.28271484, + "step": 3647, + "time_per_iteration": 2.5945966243743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087682, + "balance_loss_mlp": 1.05938208, + "epoch": 0.7018083878414775, + "flos": 373170917376.0, + "grad_norm": 0.07384041678105348, + "language_loss": 0.74226022, + "learning_rate": 0.00021561272489727974, + "loss": 0.75313699, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.28320312, + "step": 3648, + "time_per_iteration": 2.423347234725952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090091, + "balance_loss_mlp": 1.06241107, + "epoch": 0.7020007695267411, + "flos": 527522761728.0, + "grad_norm": 0.0540045704658738, + "language_loss": 0.80522048, + "learning_rate": 0.0002153565386022199, + "loss": 0.8161214, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.27734375, + "step": 3649, + "time_per_iteration": 2.634904623031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089135, + "balance_loss_mlp": 1.06112039, + "epoch": 0.7021931512120047, + "flos": 689850832896.0, + "grad_norm": 0.1599503630973746, + "language_loss": 0.8250525, + "learning_rate": 0.00021510046281555262, + "loss": 0.83594382, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.28027344, + "step": 3650, + "time_per_iteration": 2.824385643005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087214, + "balance_loss_mlp": 1.05922353, + "epoch": 0.7023855328972681, + "flos": 639499705344.0, + "grad_norm": 0.06982952600277435, + "language_loss": 0.81099337, + "learning_rate": 0.0002148444976366949, + "loss": 0.82186544, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 0.27978516, + "step": 3651, + "time_per_iteration": 2.7480077743530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090659, + "balance_loss_mlp": 1.06297851, + "epoch": 0.7025779145825317, + "flos": 560674552320.0, + "grad_norm": 0.06340286287585739, + "language_loss": 0.82626015, + "learning_rate": 0.00021458864316502136, + "loss": 0.83716673, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.27734375, + "step": 3652, + "time_per_iteration": 2.699397087097168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085576, + "balance_loss_mlp": 1.0581578, + "epoch": 0.7027702962677953, + "flos": 447214998528.0, + "grad_norm": 0.06356802688225487, + "language_loss": 0.87087834, + "learning_rate": 0.0002143328994998634, + "loss": 0.88173407, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.2746582, + "step": 3653, + "time_per_iteration": 2.4910500049591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108223, + "balance_loss_mlp": 1.05347681, + "epoch": 0.7029626779530589, + "flos": 622198471680.0, + "grad_norm": 0.1133092603860293, + "language_loss": 0.78451055, + "learning_rate": 0.00021407726674050982, + "loss": 0.79533285, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.28735352, + "step": 3654, + "time_per_iteration": 2.8789288997650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086422, + "balance_loss_mlp": 1.0578599, + "epoch": 0.7031550596383225, + "flos": 629307546624.0, + "grad_norm": 0.054147023301355804, + "language_loss": 0.86789209, + "learning_rate": 0.0002138217449862061, + "loss": 0.87875628, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.28540039, + "step": 3655, + "time_per_iteration": 2.7385337352752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108677, + "balance_loss_mlp": 1.05932784, + "epoch": 0.703347441323586, + "flos": 530589878784.0, + "grad_norm": 0.06738898601128132, + "language_loss": 0.78017962, + "learning_rate": 0.00021356633433615403, + "loss": 0.79104733, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.2746582, + "step": 3656, + "time_per_iteration": 2.5828328132629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086039, + "balance_loss_mlp": 1.05778599, + "epoch": 0.7035398230088495, + "flos": 693264185856.0, + "grad_norm": 0.05385272242156959, + "language_loss": 0.83434522, + "learning_rate": 0.0002133110348895133, + "loss": 0.84520566, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.28271484, + "step": 3657, + "time_per_iteration": 2.978156805038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081393, + "balance_loss_mlp": 1.05316448, + "epoch": 0.7037322046941131, + "flos": 967628837376.0, + "grad_norm": 0.05837559854624073, + "language_loss": 0.84898746, + "learning_rate": 0.0002130558467453999, + "loss": 0.85980141, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.28198242, + "step": 3658, + "time_per_iteration": 3.3442087173461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087911, + "balance_loss_mlp": 1.05875289, + "epoch": 0.7039245863793767, + "flos": 502598891520.0, + "grad_norm": 0.19942638133943547, + "language_loss": 0.84606349, + "learning_rate": 0.0002128007700028865, + "loss": 0.85694265, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.29125977, + "step": 3659, + "time_per_iteration": 2.742828607559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088765, + "balance_loss_mlp": 1.06072712, + "epoch": 0.7041169680646402, + "flos": 465709271040.0, + "grad_norm": 0.06314927243304276, + "language_loss": 0.84402716, + "learning_rate": 0.00021254580476100276, + "loss": 0.85491478, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.28051758, + "step": 3660, + "time_per_iteration": 2.565272569656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087079, + "balance_loss_mlp": 1.0595659, + "epoch": 0.7043093497499038, + "flos": 631897417728.0, + "grad_norm": 0.06296941062799823, + "language_loss": 0.78639442, + "learning_rate": 0.00021229095111873497, + "loss": 0.79726517, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.27539062, + "step": 3661, + "time_per_iteration": 2.842556953430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088789, + "balance_loss_mlp": 1.06072736, + "epoch": 0.7045017314351674, + "flos": 542639996928.0, + "grad_norm": 0.05444300541547984, + "language_loss": 0.86236918, + "learning_rate": 0.0002120362091750261, + "loss": 0.87325704, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.28100586, + "step": 3662, + "time_per_iteration": 2.810499668121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082914, + "balance_loss_mlp": 1.05518591, + "epoch": 0.704694113120431, + "flos": 428012135424.0, + "grad_norm": 0.0593931077751887, + "language_loss": 0.86978149, + "learning_rate": 0.00021178157902877566, + "loss": 0.88061064, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.27758789, + "step": 3663, + "time_per_iteration": 2.4574224948883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092262, + "balance_loss_mlp": 1.06415284, + "epoch": 0.7048864948056945, + "flos": 650253477888.0, + "grad_norm": 0.0751363020635885, + "language_loss": 0.86745709, + "learning_rate": 0.0002115270607788397, + "loss": 0.87837976, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.28125, + "step": 3664, + "time_per_iteration": 2.7495899200439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087732, + "balance_loss_mlp": 1.05981338, + "epoch": 0.705078876490958, + "flos": 412330314240.0, + "grad_norm": 0.07034018625942835, + "language_loss": 0.85685182, + "learning_rate": 0.00021127265452403133, + "loss": 0.86772919, + "num_input_tokens_seen": 303856336, + "router_z_loss_mlp": 0.27954102, + "step": 3665, + "time_per_iteration": 2.5029428005218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046453, + "balance_loss_mlp": 1.03472269, + "epoch": 0.7052712581762216, + "flos": 1419266783232.0, + "grad_norm": 0.01645523461712921, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85138083, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.1171875, + "step": 3666, + "time_per_iteration": 4.882653474807739 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084641, + "balance_loss_mlp": 1.05729461, + "epoch": 0.7054636398614852, + "flos": 492795228672.0, + "grad_norm": 0.05492799595906871, + "language_loss": 0.82834661, + "learning_rate": 0.00021076417839483065, + "loss": 0.83919299, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.27392578, + "step": 3667, + "time_per_iteration": 2.8046011924743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084673, + "balance_loss_mlp": 1.05622983, + "epoch": 0.7056560215467488, + "flos": 450228271104.0, + "grad_norm": 0.057239687513416834, + "language_loss": 0.84952044, + "learning_rate": 0.00021051010871784589, + "loss": 0.86036718, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.28442383, + "step": 3668, + "time_per_iteration": 2.547053098678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084529, + "balance_loss_mlp": 1.05634761, + "epoch": 0.7058484032320124, + "flos": 565426510848.0, + "grad_norm": 0.050223334888513216, + "language_loss": 0.78893518, + "learning_rate": 0.0002102561514308045, + "loss": 0.79978049, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.28173828, + "step": 3669, + "time_per_iteration": 2.752600908279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081831, + "balance_loss_mlp": 1.05446088, + "epoch": 0.7060407849172758, + "flos": 566736003072.0, + "grad_norm": 0.06177474978046869, + "language_loss": 0.82231724, + "learning_rate": 0.00021000230663230135, + "loss": 0.8331356, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.27441406, + "step": 3670, + "time_per_iteration": 2.7295479774475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107972, + "balance_loss_mlp": 1.05213535, + "epoch": 0.7062331666025394, + "flos": 468505755648.0, + "grad_norm": 0.06597526409708185, + "language_loss": 0.82935393, + "learning_rate": 0.00020974857442088762, + "loss": 0.84015119, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.27612305, + "step": 3671, + "time_per_iteration": 2.6223764419555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087916, + "balance_loss_mlp": 1.05999768, + "epoch": 0.706425548287803, + "flos": 595042702848.0, + "grad_norm": 0.061832347037407955, + "language_loss": 0.88995802, + "learning_rate": 0.00020949495489507104, + "loss": 0.90083718, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.27954102, + "step": 3672, + "time_per_iteration": 2.6759605407714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084185, + "balance_loss_mlp": 1.0569576, + "epoch": 0.7066179299730666, + "flos": 475566778368.0, + "grad_norm": 0.08160392795168159, + "language_loss": 0.84611428, + "learning_rate": 0.00020924144815331525, + "loss": 0.85695612, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.27270508, + "step": 3673, + "time_per_iteration": 2.5533270835876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087334, + "balance_loss_mlp": 1.05991554, + "epoch": 0.7068103116583301, + "flos": 506153428992.0, + "grad_norm": 0.06771134911837604, + "language_loss": 0.8321439, + "learning_rate": 0.00020898805429404044, + "loss": 0.84301728, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.2746582, + "step": 3674, + "time_per_iteration": 2.6267168521881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086456, + "balance_loss_mlp": 1.05860853, + "epoch": 0.7070026933435937, + "flos": 679028659200.0, + "grad_norm": 0.074333129961205, + "language_loss": 0.78350407, + "learning_rate": 0.0002087347734156228, + "loss": 0.79436862, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.27880859, + "step": 3675, + "time_per_iteration": 2.879998207092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081334, + "balance_loss_mlp": 1.05415416, + "epoch": 0.7071950750288573, + "flos": 471981717504.0, + "grad_norm": 0.05100324832046891, + "language_loss": 0.79745239, + "learning_rate": 0.00020848160561639452, + "loss": 0.80826575, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.2722168, + "step": 3676, + "time_per_iteration": 2.6603164672851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084996, + "balance_loss_mlp": 1.05733955, + "epoch": 0.7073874567141208, + "flos": 473507997696.0, + "grad_norm": 0.054459225189570165, + "language_loss": 0.85905212, + "learning_rate": 0.0002082285509946445, + "loss": 0.86990213, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.27685547, + "step": 3677, + "time_per_iteration": 2.553056240081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084321, + "balance_loss_mlp": 1.05664098, + "epoch": 0.7075798383993844, + "flos": 545589250560.0, + "grad_norm": 0.062290106460759526, + "language_loss": 0.83324182, + "learning_rate": 0.00020797560964861683, + "loss": 0.84408498, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.27709961, + "step": 3678, + "time_per_iteration": 2.792145013809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087022, + "balance_loss_mlp": 1.05907917, + "epoch": 0.7077722200846479, + "flos": 661766713344.0, + "grad_norm": 0.06608494347958908, + "language_loss": 0.806409, + "learning_rate": 0.0002077227816765122, + "loss": 0.81727922, + "num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.27954102, + "step": 3679, + "time_per_iteration": 4.414989709854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047559, + "balance_loss_mlp": 1.03525627, + "epoch": 0.7079646017699115, + "flos": 1529128129536.0, + "grad_norm": 0.01304969035368713, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.77495277, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.12255859, + "step": 3680, + "time_per_iteration": 4.77666163444519 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082723, + "balance_loss_mlp": 1.05544841, + "epoch": 0.7081569834551751, + "flos": 621217838592.0, + "grad_norm": 0.07037612396181211, + "language_loss": 0.7852788, + "learning_rate": 0.00020721746624665383, + "loss": 0.7961061, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.27319336, + "step": 3681, + "time_per_iteration": 2.7164971828460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081164, + "balance_loss_mlp": 1.05338836, + "epoch": 0.7083493651404387, + "flos": 794280743424.0, + "grad_norm": 0.047491060798417466, + "language_loss": 0.80214369, + "learning_rate": 0.00020696497898508114, + "loss": 0.81295532, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.27807617, + "step": 3682, + "time_per_iteration": 3.0300755500793457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089218, + "balance_loss_mlp": 1.06165683, + "epoch": 0.7085417468257021, + "flos": 813394856448.0, + "grad_norm": 0.37225594130432843, + "language_loss": 0.77676904, + "learning_rate": 0.00020671260548979316, + "loss": 0.78766119, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.27587891, + "step": 3683, + "time_per_iteration": 3.0000338554382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108452, + "balance_loss_mlp": 1.05715001, + "epoch": 0.7087341285109657, + "flos": 700259779584.0, + "grad_norm": 0.05966278900445413, + "language_loss": 0.84945965, + "learning_rate": 0.00020646034585876982, + "loss": 0.86030483, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.27441406, + "step": 3684, + "time_per_iteration": 2.8507392406463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080546, + "balance_loss_mlp": 1.05243671, + "epoch": 0.7089265101962293, + "flos": 596211010560.0, + "grad_norm": 0.050873107987967195, + "language_loss": 0.84335744, + "learning_rate": 0.00020620820018994718, + "loss": 0.85416293, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.28125, + "step": 3685, + "time_per_iteration": 2.8229713439941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082628, + "balance_loss_mlp": 1.05385077, + "epoch": 0.7091188918814929, + "flos": 486842876928.0, + "grad_norm": 0.07162313361599233, + "language_loss": 0.82926023, + "learning_rate": 0.00020595616858121675, + "loss": 0.84008658, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.2878418, + "step": 3686, + "time_per_iteration": 2.694638967514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079578, + "balance_loss_mlp": 1.05158722, + "epoch": 0.7093112735667565, + "flos": 599833949184.0, + "grad_norm": 0.06190114046391337, + "language_loss": 0.80535042, + "learning_rate": 0.00020570425113042586, + "loss": 0.81614614, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.28027344, + "step": 3687, + "time_per_iteration": 2.7041516304016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079239, + "balance_loss_mlp": 1.05074835, + "epoch": 0.70950365525202, + "flos": 505577258496.0, + "grad_norm": 0.06733246833768769, + "language_loss": 0.85552853, + "learning_rate": 0.0002054524479353776, + "loss": 0.86632097, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.28540039, + "step": 3688, + "time_per_iteration": 2.6622695922851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079477, + "balance_loss_mlp": 1.05122447, + "epoch": 0.7096960369372836, + "flos": 731846002176.0, + "grad_norm": 0.09171480616774523, + "language_loss": 0.81669426, + "learning_rate": 0.00020520075909383063, + "loss": 0.82748902, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.28271484, + "step": 3689, + "time_per_iteration": 2.885802745819092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085524, + "balance_loss_mlp": 1.05684257, + "epoch": 0.7098884186225471, + "flos": 971685351936.0, + "grad_norm": 0.058367776122323904, + "language_loss": 0.80585086, + "learning_rate": 0.00020494918470349916, + "loss": 0.81670618, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.28662109, + "step": 3690, + "time_per_iteration": 3.297044038772583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078519, + "balance_loss_mlp": 1.05038536, + "epoch": 0.7100808003078107, + "flos": 504001516032.0, + "grad_norm": 0.0682429606540151, + "language_loss": 0.85554057, + "learning_rate": 0.00020469772486205297, + "loss": 0.8663258, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.28149414, + "step": 3691, + "time_per_iteration": 2.602031707763672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082342, + "balance_loss_mlp": 1.05354142, + "epoch": 0.7102731819930742, + "flos": 540073446912.0, + "grad_norm": 0.05487079427914329, + "language_loss": 0.81415904, + "learning_rate": 0.0002044463796671177, + "loss": 0.82498252, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.2878418, + "step": 3692, + "time_per_iteration": 2.665280342102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086192, + "balance_loss_mlp": 1.05724823, + "epoch": 0.7104655636783378, + "flos": 620066907648.0, + "grad_norm": 0.06500857460791332, + "language_loss": 0.80369031, + "learning_rate": 0.00020419514921627408, + "loss": 0.81455219, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.28930664, + "step": 3693, + "time_per_iteration": 2.83823299407959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080501, + "balance_loss_mlp": 1.05251122, + "epoch": 0.7106579453636014, + "flos": 557060378112.0, + "grad_norm": 0.05808556039270617, + "language_loss": 0.77408904, + "learning_rate": 0.00020394403360705855, + "loss": 0.78489405, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.2800293, + "step": 3694, + "time_per_iteration": 2.6939644813537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085807, + "balance_loss_mlp": 1.05569434, + "epoch": 0.710850327048865, + "flos": 512795432448.0, + "grad_norm": 0.06287788377881579, + "language_loss": 0.87703514, + "learning_rate": 0.00020369303293696228, + "loss": 0.88789326, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.30078125, + "step": 3695, + "time_per_iteration": 2.588268995285034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083208, + "balance_loss_mlp": 1.05474114, + "epoch": 0.7110427087341286, + "flos": 423398389248.0, + "grad_norm": 0.06448607356035771, + "language_loss": 0.78199911, + "learning_rate": 0.00020344214730343304, + "loss": 0.79283124, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.28466797, + "step": 3696, + "time_per_iteration": 2.6181139945983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073393, + "balance_loss_mlp": 1.04511678, + "epoch": 0.711235090419392, + "flos": 577107072000.0, + "grad_norm": 0.05437568169477665, + "language_loss": 0.79383552, + "learning_rate": 0.00020319137680387296, + "loss": 0.80456948, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.28271484, + "step": 3697, + "time_per_iteration": 2.925847291946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077248, + "balance_loss_mlp": 1.04844677, + "epoch": 0.7114274721046556, + "flos": 447830456832.0, + "grad_norm": 0.07105325547979466, + "language_loss": 0.80237764, + "learning_rate": 0.0002029407215356398, + "loss": 0.81315017, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.28808594, + "step": 3698, + "time_per_iteration": 3.9760594367980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077498, + "balance_loss_mlp": 1.04829144, + "epoch": 0.7116198537899192, + "flos": 621680527872.0, + "grad_norm": 0.06046542117195041, + "language_loss": 0.82863748, + "learning_rate": 0.00020269018159604663, + "loss": 0.83941245, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.29150391, + "step": 3699, + "time_per_iteration": 2.704861640930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071741, + "balance_loss_mlp": 1.04336905, + "epoch": 0.7118122354751828, + "flos": 498476947968.0, + "grad_norm": 0.053095463302870675, + "language_loss": 0.818941, + "learning_rate": 0.00020243975708236162, + "loss": 0.82965839, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.28393555, + "step": 3700, + "time_per_iteration": 2.6019287109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010692, + "balance_loss_mlp": 1.0402801, + "epoch": 0.7120046171604463, + "flos": 572438071296.0, + "grad_norm": 0.06895358170102628, + "language_loss": 0.86096191, + "learning_rate": 0.00020218944809180818, + "loss": 0.87165391, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.2890625, + "step": 3701, + "time_per_iteration": 2.69789719581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066667, + "balance_loss_mlp": 1.0383426, + "epoch": 0.7121969988457099, + "flos": 572388609024.0, + "grad_norm": 0.048938239682891294, + "language_loss": 0.84783876, + "learning_rate": 0.00020193925472156493, + "loss": 0.85850537, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.28320312, + "step": 3702, + "time_per_iteration": 2.7333877086639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105286, + "balance_loss_mlp": 1.04036713, + "epoch": 0.7123893805309734, + "flos": 1522585050624.0, + "grad_norm": 0.026752885046143426, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.75342035, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.125, + "step": 3703, + "time_per_iteration": 4.899750232696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066879, + "balance_loss_mlp": 1.0373385, + "epoch": 0.712581762216237, + "flos": 614779476480.0, + "grad_norm": 0.05613195068078556, + "language_loss": 0.83530253, + "learning_rate": 0.00020143921523049863, + "loss": 0.84597135, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.29467773, + "step": 3704, + "time_per_iteration": 2.9570298194885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067522, + "balance_loss_mlp": 1.03860188, + "epoch": 0.7127741439015006, + "flos": 597504536064.0, + "grad_norm": 0.05853421015843179, + "language_loss": 0.83969504, + "learning_rate": 0.00020118936930380837, + "loss": 0.85037029, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.2890625, + "step": 3705, + "time_per_iteration": 2.750566005706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068067, + "balance_loss_mlp": 1.03876543, + "epoch": 0.7129665255867641, + "flos": 537138749952.0, + "grad_norm": 0.07045372312262692, + "language_loss": 0.80809951, + "learning_rate": 0.0002009396393856932, + "loss": 0.81878018, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.29272461, + "step": 3706, + "time_per_iteration": 2.6755757331848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106429, + "balance_loss_mlp": 1.03560829, + "epoch": 0.7131589072720277, + "flos": 526173981696.0, + "grad_norm": 0.06196520847148758, + "language_loss": 0.82349885, + "learning_rate": 0.00020069002557310673, + "loss": 0.83414185, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.28662109, + "step": 3707, + "time_per_iteration": 2.737092971801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065669, + "balance_loss_mlp": 1.03734505, + "epoch": 0.7133512889572913, + "flos": 530626194432.0, + "grad_norm": 0.06289073454443639, + "language_loss": 0.77148253, + "learning_rate": 0.00020044052796295807, + "loss": 0.78213924, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.28320312, + "step": 3708, + "time_per_iteration": 2.858578681945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066902, + "balance_loss_mlp": 1.03783917, + "epoch": 0.7135436706425549, + "flos": 503282750976.0, + "grad_norm": 0.05709228954993964, + "language_loss": 0.8160665, + "learning_rate": 0.00020019114665211063, + "loss": 0.8267355, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.29052734, + "step": 3709, + "time_per_iteration": 2.6008872985839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070493, + "balance_loss_mlp": 1.04128647, + "epoch": 0.7137360523278183, + "flos": 515719954944.0, + "grad_norm": 0.05827837383265674, + "language_loss": 0.81244481, + "learning_rate": 0.00019994188173738276, + "loss": 0.82314974, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.29174805, + "step": 3710, + "time_per_iteration": 2.6042001247406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068317, + "balance_loss_mlp": 1.03861022, + "epoch": 0.7139284340130819, + "flos": 510103664640.0, + "grad_norm": 0.056315014070009634, + "language_loss": 0.80933827, + "learning_rate": 0.0001996927333155477, + "loss": 0.82002145, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.29663086, + "step": 3711, + "time_per_iteration": 2.748624086380005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010683, + "balance_loss_mlp": 1.03947508, + "epoch": 0.7141208156983455, + "flos": 889896388608.0, + "grad_norm": 0.061443099278046684, + "language_loss": 0.85405827, + "learning_rate": 0.00019944370148333346, + "loss": 0.86474121, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.2878418, + "step": 3712, + "time_per_iteration": 3.1557986736297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072206, + "balance_loss_mlp": 1.04316652, + "epoch": 0.7143131973836091, + "flos": 535504780800.0, + "grad_norm": 0.048833627959222234, + "language_loss": 0.79702485, + "learning_rate": 0.00019919478633742278, + "loss": 0.80774689, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.29052734, + "step": 3713, + "time_per_iteration": 2.667795419692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071876, + "balance_loss_mlp": 1.04252636, + "epoch": 0.7145055790688727, + "flos": 473429422080.0, + "grad_norm": 0.0703082286681538, + "language_loss": 0.85178196, + "learning_rate": 0.00019894598797445302, + "loss": 0.86250067, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.29345703, + "step": 3714, + "time_per_iteration": 2.5345022678375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107178, + "balance_loss_mlp": 1.04333699, + "epoch": 0.7146979607541362, + "flos": 570227931648.0, + "grad_norm": 0.05625862990353456, + "language_loss": 0.8199116, + "learning_rate": 0.00019869730649101615, + "loss": 0.83062935, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.28417969, + "step": 3715, + "time_per_iteration": 2.8149824142456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079135, + "balance_loss_mlp": 1.04988086, + "epoch": 0.7148903424393998, + "flos": 839299359744.0, + "grad_norm": 0.071816789410327, + "language_loss": 0.72405577, + "learning_rate": 0.00019844874198365943, + "loss": 0.73484713, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.29199219, + "step": 3716, + "time_per_iteration": 3.0852138996124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069692, + "balance_loss_mlp": 1.04070067, + "epoch": 0.7150827241246633, + "flos": 541560439296.0, + "grad_norm": 0.05756859715120925, + "language_loss": 0.83796489, + "learning_rate": 0.00019820029454888362, + "loss": 0.84866184, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.28979492, + "step": 3717, + "time_per_iteration": 2.7309763431549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031662, + "balance_loss_mlp": 1.01916921, + "epoch": 0.7152751058099269, + "flos": 1582803859968.0, + "grad_norm": 0.017203742332568887, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.75552928, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.125, + "step": 3718, + "time_per_iteration": 5.044423580169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077558, + "balance_loss_mlp": 1.04777932, + "epoch": 0.7154674874951905, + "flos": 517167659520.0, + "grad_norm": 0.056277438983796696, + "language_loss": 0.79924434, + "learning_rate": 0.0001977037512828529, + "loss": 0.81001997, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.29760742, + "step": 3719, + "time_per_iteration": 2.5888805389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069135, + "balance_loss_mlp": 1.04059625, + "epoch": 0.715659869180454, + "flos": 602246320128.0, + "grad_norm": 0.0550224121073684, + "language_loss": 0.86091673, + "learning_rate": 0.0001974556556443734, + "loss": 0.87160814, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.28540039, + "step": 3720, + "time_per_iteration": 2.7241830825805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074341, + "balance_loss_mlp": 1.04575443, + "epoch": 0.7158522508657176, + "flos": 531403186176.0, + "grad_norm": 0.06173575943164377, + "language_loss": 0.88796955, + "learning_rate": 0.00019720767746402547, + "loss": 0.89871293, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.28564453, + "step": 3721, + "time_per_iteration": 2.721775770187378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075436, + "balance_loss_mlp": 1.04725444, + "epoch": 0.7160446325509812, + "flos": 557301897216.0, + "grad_norm": 0.08488248506445442, + "language_loss": 0.79925454, + "learning_rate": 0.00019695981683808222, + "loss": 0.81000888, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.28173828, + "step": 3722, + "time_per_iteration": 2.7333226203918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077529, + "balance_loss_mlp": 1.04989624, + "epoch": 0.7162370142362448, + "flos": 690664140288.0, + "grad_norm": 0.055390897958499746, + "language_loss": 0.85177088, + "learning_rate": 0.00019671207386277225, + "loss": 0.86254621, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.27636719, + "step": 3723, + "time_per_iteration": 2.924482583999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076762, + "balance_loss_mlp": 1.04800856, + "epoch": 0.7164293959215082, + "flos": 793772974080.0, + "grad_norm": 0.06210467424192018, + "language_loss": 0.78391171, + "learning_rate": 0.0001964644486342777, + "loss": 0.79467928, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.28735352, + "step": 3724, + "time_per_iteration": 2.958444833755493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077347, + "balance_loss_mlp": 1.04926085, + "epoch": 0.7166217776067718, + "flos": 493922838528.0, + "grad_norm": 0.0530875998345761, + "language_loss": 0.86440647, + "learning_rate": 0.00019621694124873524, + "loss": 0.87518001, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.28125, + "step": 3725, + "time_per_iteration": 2.6775362491607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035969, + "balance_loss_mlp": 1.02366674, + "epoch": 0.7168141592920354, + "flos": 1400337524736.0, + "grad_norm": 0.0197496536520254, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.77576053, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.12255859, + "step": 3726, + "time_per_iteration": 4.876794338226318 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079559, + "balance_loss_mlp": 1.05085373, + "epoch": 0.717006540977299, + "flos": 792789368832.0, + "grad_norm": 0.05459811074333738, + "language_loss": 0.77077997, + "learning_rate": 0.00019572228039082428, + "loss": 0.78157556, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.28686523, + "step": 3727, + "time_per_iteration": 3.094959020614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078104, + "balance_loss_mlp": 1.04982781, + "epoch": 0.7171989226625626, + "flos": 554525761536.0, + "grad_norm": 0.05087577266454216, + "language_loss": 0.83556503, + "learning_rate": 0.0001954751271105002, + "loss": 0.84634602, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.28295898, + "step": 3728, + "time_per_iteration": 2.8009090423583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077296, + "balance_loss_mlp": 1.04956806, + "epoch": 0.717391304347826, + "flos": 555628640256.0, + "grad_norm": 0.058127871838067766, + "language_loss": 0.80794644, + "learning_rate": 0.00019522809205721687, + "loss": 0.81871945, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.27758789, + "step": 3729, + "time_per_iteration": 2.7567226886749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070359, + "balance_loss_mlp": 1.0422256, + "epoch": 0.7175836860330896, + "flos": 538582072320.0, + "grad_norm": 0.06552906350513053, + "language_loss": 0.82629025, + "learning_rate": 0.0001949811753268816, + "loss": 0.83699387, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 0.28149414, + "step": 3730, + "time_per_iteration": 2.7015092372894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074245, + "balance_loss_mlp": 1.04594445, + "epoch": 0.7177760677183532, + "flos": 515385303552.0, + "grad_norm": 0.0651237840260159, + "language_loss": 0.82088923, + "learning_rate": 0.00019473437701535634, + "loss": 0.83163166, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.28295898, + "step": 3731, + "time_per_iteration": 2.5865840911865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072269, + "balance_loss_mlp": 1.04425454, + "epoch": 0.7179684494036168, + "flos": 674414913024.0, + "grad_norm": 0.05867613657807477, + "language_loss": 0.89630008, + "learning_rate": 0.00019448769721845677, + "loss": 0.90702283, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.28051758, + "step": 3732, + "time_per_iteration": 2.800302743911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073645, + "balance_loss_mlp": 1.04503512, + "epoch": 0.7181608310888803, + "flos": 469672653312.0, + "grad_norm": 0.07249060183275255, + "language_loss": 0.85536152, + "learning_rate": 0.00019424113603195203, + "loss": 0.86609799, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.28662109, + "step": 3733, + "time_per_iteration": 2.5308837890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074406, + "balance_loss_mlp": 1.04589128, + "epoch": 0.7183532127741439, + "flos": 593645870592.0, + "grad_norm": 0.05588376049508018, + "language_loss": 0.80217636, + "learning_rate": 0.0001939946935515657, + "loss": 0.81292045, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.28515625, + "step": 3734, + "time_per_iteration": 2.8359925746917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077355, + "balance_loss_mlp": 1.04910207, + "epoch": 0.7185455944594075, + "flos": 498669004800.0, + "grad_norm": 0.0705810174200004, + "language_loss": 0.80242217, + "learning_rate": 0.0001937483698729755, + "loss": 0.81319571, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.28271484, + "step": 3735, + "time_per_iteration": 2.64072322845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108005, + "balance_loss_mlp": 1.05070114, + "epoch": 0.718737976144671, + "flos": 814590867456.0, + "grad_norm": 0.04976646958682061, + "language_loss": 0.81962895, + "learning_rate": 0.0001935021650918128, + "loss": 0.83042943, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.29321289, + "step": 3736, + "time_per_iteration": 3.0010826587677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010734, + "balance_loss_mlp": 1.04431319, + "epoch": 0.7189303578299346, + "flos": 438100987392.0, + "grad_norm": 0.062249035117782556, + "language_loss": 0.86910063, + "learning_rate": 0.0001932560793036625, + "loss": 0.87983465, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.29077148, + "step": 3737, + "time_per_iteration": 2.512890577316284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075707, + "balance_loss_mlp": 1.04766941, + "epoch": 0.7191227395151981, + "flos": 549137995776.0, + "grad_norm": 0.09579716691171304, + "language_loss": 0.86528683, + "learning_rate": 0.00019301011260406382, + "loss": 0.87604392, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.28051758, + "step": 3738, + "time_per_iteration": 2.624567985534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077112, + "balance_loss_mlp": 1.04897833, + "epoch": 0.7193151212004617, + "flos": 626653656576.0, + "grad_norm": 0.050336885468814714, + "language_loss": 0.79622293, + "learning_rate": 0.00019276426508850936, + "loss": 0.80699408, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.28149414, + "step": 3739, + "time_per_iteration": 2.719663619995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074558, + "balance_loss_mlp": 1.04597163, + "epoch": 0.7195075028857253, + "flos": 740719904256.0, + "grad_norm": 0.05223198929463843, + "language_loss": 0.80390334, + "learning_rate": 0.00019251853685244564, + "loss": 0.81464887, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.28564453, + "step": 3740, + "time_per_iteration": 3.006769895553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076457, + "balance_loss_mlp": 1.048967, + "epoch": 0.7196998845709889, + "flos": 802523220480.0, + "grad_norm": 0.08129460448533303, + "language_loss": 0.80554307, + "learning_rate": 0.00019227292799127283, + "loss": 0.81630766, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.27539062, + "step": 3741, + "time_per_iteration": 3.0326223373413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073379, + "balance_loss_mlp": 1.04560351, + "epoch": 0.7198922662562524, + "flos": 924786865152.0, + "grad_norm": 0.06791942956347788, + "language_loss": 0.78745782, + "learning_rate": 0.00019202743860034454, + "loss": 0.79819167, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.27807617, + "step": 3742, + "time_per_iteration": 3.2729034423828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072089, + "balance_loss_mlp": 1.04445601, + "epoch": 0.7200846479415159, + "flos": 579838127616.0, + "grad_norm": 0.05486250950239536, + "language_loss": 0.83459806, + "learning_rate": 0.00019178206877496873, + "loss": 0.84531891, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.27636719, + "step": 3743, + "time_per_iteration": 2.7013559341430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070767, + "balance_loss_mlp": 1.04291999, + "epoch": 0.7202770296267795, + "flos": 557410996224.0, + "grad_norm": 0.04899238240269426, + "language_loss": 0.84932864, + "learning_rate": 0.0001915368186104059, + "loss": 0.86003625, + "num_input_tokens_seen": 310479776, + "router_z_loss_mlp": 0.27880859, + "step": 3744, + "time_per_iteration": 2.726893663406372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073873, + "balance_loss_mlp": 1.04621649, + "epoch": 0.7204694113120431, + "flos": 672248443392.0, + "grad_norm": 0.06348773508617375, + "language_loss": 0.80724853, + "learning_rate": 0.0001912916882018706, + "loss": 0.81798726, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.27685547, + "step": 3745, + "time_per_iteration": 2.78125262260437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073398, + "balance_loss_mlp": 1.0459559, + "epoch": 0.7206617929973067, + "flos": 798845027328.0, + "grad_norm": 0.06464144105655711, + "language_loss": 0.79121184, + "learning_rate": 0.00019104667764453125, + "loss": 0.80194581, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.2746582, + "step": 3746, + "time_per_iteration": 3.033304214477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072935, + "balance_loss_mlp": 1.04549301, + "epoch": 0.7208541746825702, + "flos": 531638913024.0, + "grad_norm": 0.050415961986803856, + "language_loss": 0.80573905, + "learning_rate": 0.00019080178703350926, + "loss": 0.81646842, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.2746582, + "step": 3747, + "time_per_iteration": 2.6518349647521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074166, + "balance_loss_mlp": 1.0458895, + "epoch": 0.7210465563678338, + "flos": 534883530240.0, + "grad_norm": 0.07572692948457345, + "language_loss": 0.83004916, + "learning_rate": 0.00019055701646387952, + "loss": 0.84079087, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.28271484, + "step": 3748, + "time_per_iteration": 2.7013447284698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031767, + "balance_loss_mlp": 1.01970267, + "epoch": 0.7212389380530974, + "flos": 1533076955136.0, + "grad_norm": 0.013786087553885988, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.81504452, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.12060547, + "step": 3749, + "time_per_iteration": 4.794643878936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073194, + "balance_loss_mlp": 1.0453701, + "epoch": 0.7214313197383609, + "flos": 461277407232.0, + "grad_norm": 0.05812194439124776, + "language_loss": 0.86448663, + "learning_rate": 0.00019006783582886368, + "loss": 0.87521857, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.27832031, + "step": 3750, + "time_per_iteration": 2.5275614261627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075263, + "balance_loss_mlp": 1.04653358, + "epoch": 0.7216237014236244, + "flos": 1036691025408.0, + "grad_norm": 0.060767017514705764, + "language_loss": 0.82905239, + "learning_rate": 0.00018982342595339437, + "loss": 0.83980501, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.28686523, + "step": 3751, + "time_per_iteration": 3.522578239440918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070907, + "balance_loss_mlp": 1.04239237, + "epoch": 0.721816083108888, + "flos": 895578107904.0, + "grad_norm": 0.05765271863237157, + "language_loss": 0.82075769, + "learning_rate": 0.00018957913649915076, + "loss": 0.83146673, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.28515625, + "step": 3752, + "time_per_iteration": 3.1765124797821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070534, + "balance_loss_mlp": 1.04187584, + "epoch": 0.7220084647941516, + "flos": 523066166784.0, + "grad_norm": 0.07973276687690374, + "language_loss": 0.79905254, + "learning_rate": 0.00018933496756097428, + "loss": 0.80975789, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.28662109, + "step": 3753, + "time_per_iteration": 2.625577926635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074027, + "balance_loss_mlp": 1.04508317, + "epoch": 0.7222008464794152, + "flos": 815757765120.0, + "grad_norm": 0.06908288105531452, + "language_loss": 0.81582409, + "learning_rate": 0.0001890909192336603, + "loss": 0.82656443, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.28930664, + "step": 3754, + "time_per_iteration": 3.0871572494506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072767, + "balance_loss_mlp": 1.04444289, + "epoch": 0.7223932281646788, + "flos": 748725244416.0, + "grad_norm": 0.057964315435078954, + "language_loss": 0.70292032, + "learning_rate": 0.00018884699161195623, + "loss": 0.71364796, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.28320312, + "step": 3755, + "time_per_iteration": 2.9729976654052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072672, + "balance_loss_mlp": 1.0435853, + "epoch": 0.7225856098499422, + "flos": 745132829184.0, + "grad_norm": 0.07379868606686546, + "language_loss": 0.7706269, + "learning_rate": 0.00018860318479056327, + "loss": 0.78135359, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.29077148, + "step": 3756, + "time_per_iteration": 3.15751576423645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073497, + "balance_loss_mlp": 1.04491067, + "epoch": 0.7227779915352058, + "flos": 547055894016.0, + "grad_norm": 0.05587751331143294, + "language_loss": 0.83529603, + "learning_rate": 0.00018835949886413555, + "loss": 0.84603095, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.28588867, + "step": 3757, + "time_per_iteration": 2.6880505084991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074304, + "balance_loss_mlp": 1.04509711, + "epoch": 0.7229703732204694, + "flos": 530230496256.0, + "grad_norm": 0.08262826949591631, + "language_loss": 0.78295088, + "learning_rate": 0.0001881159339272806, + "loss": 0.7936939, + "num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.29150391, + "step": 3758, + "time_per_iteration": 2.636491060256958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107004, + "balance_loss_mlp": 1.04193068, + "epoch": 0.723162754905733, + "flos": 528103314432.0, + "grad_norm": 0.05735396724489517, + "language_loss": 0.78920448, + "learning_rate": 0.00018787249007455858, + "loss": 0.79990494, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.28100586, + "step": 3759, + "time_per_iteration": 2.5969340801239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070444, + "balance_loss_mlp": 1.04140496, + "epoch": 0.7233551365909965, + "flos": 654571860480.0, + "grad_norm": 0.07167982163737877, + "language_loss": 0.71580899, + "learning_rate": 0.00018762916740048302, + "loss": 0.72651339, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.28979492, + "step": 3760, + "time_per_iteration": 2.7852694988250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071982, + "balance_loss_mlp": 1.04332376, + "epoch": 0.7235475182762601, + "flos": 522097118208.0, + "grad_norm": 0.05118431145994858, + "language_loss": 0.8598392, + "learning_rate": 0.0001873859659995195, + "loss": 0.87055904, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.28637695, + "step": 3761, + "time_per_iteration": 2.719606399536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107496, + "balance_loss_mlp": 1.04639769, + "epoch": 0.7237398999615237, + "flos": 608883941376.0, + "grad_norm": 0.051413796044389046, + "language_loss": 0.83093852, + "learning_rate": 0.0001871428859660878, + "loss": 0.84168816, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.28564453, + "step": 3762, + "time_per_iteration": 2.7558627128601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107143, + "balance_loss_mlp": 1.04329658, + "epoch": 0.7239322816467872, + "flos": 658664690688.0, + "grad_norm": 0.057793734831364726, + "language_loss": 0.81882715, + "learning_rate": 0.00018689992739455975, + "loss": 0.82954144, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.28149414, + "step": 3763, + "time_per_iteration": 2.90240740776062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070949, + "balance_loss_mlp": 1.04131389, + "epoch": 0.7241246633320508, + "flos": 968869928448.0, + "grad_norm": 0.047782863980039225, + "language_loss": 0.85763133, + "learning_rate": 0.00018665709037926027, + "loss": 0.86834085, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.29614258, + "step": 3764, + "time_per_iteration": 3.3121178150177 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069943, + "balance_loss_mlp": 1.04157126, + "epoch": 0.7243170450173143, + "flos": 514745114112.0, + "grad_norm": 0.06618029737842872, + "language_loss": 0.84513265, + "learning_rate": 0.00018641437501446694, + "loss": 0.8558321, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.28417969, + "step": 3765, + "time_per_iteration": 2.5711514949798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070577, + "balance_loss_mlp": 1.04172814, + "epoch": 0.7245094267025779, + "flos": 559482923520.0, + "grad_norm": 0.0702086558887849, + "language_loss": 0.82573164, + "learning_rate": 0.0001861717813944104, + "loss": 0.83643746, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.28833008, + "step": 3766, + "time_per_iteration": 2.6380386352539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072686, + "balance_loss_mlp": 1.04386163, + "epoch": 0.7247018083878415, + "flos": 612359903232.0, + "grad_norm": 0.0720480056079547, + "language_loss": 0.79527569, + "learning_rate": 0.00018592930961327365, + "loss": 0.8060025, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.28833008, + "step": 3767, + "time_per_iteration": 2.757380723953247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071618, + "balance_loss_mlp": 1.04238808, + "epoch": 0.7248941900731051, + "flos": 634379599872.0, + "grad_norm": 0.08594162637632567, + "language_loss": 0.87979633, + "learning_rate": 0.00018568695976519273, + "loss": 0.89051247, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.29199219, + "step": 3768, + "time_per_iteration": 2.793536424636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072488, + "balance_loss_mlp": 1.04332972, + "epoch": 0.7250865717583687, + "flos": 424718055936.0, + "grad_norm": 0.06891867665937222, + "language_loss": 0.80339336, + "learning_rate": 0.00018544473194425593, + "loss": 0.81411815, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.29125977, + "step": 3769, + "time_per_iteration": 2.5053606033325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106866, + "balance_loss_mlp": 1.03942966, + "epoch": 0.7252789534436321, + "flos": 634794236928.0, + "grad_norm": 0.0628085761222727, + "language_loss": 0.78636301, + "learning_rate": 0.00018520262624450485, + "loss": 0.79704964, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.29174805, + "step": 3770, + "time_per_iteration": 2.8609566688537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073738, + "balance_loss_mlp": 1.0450325, + "epoch": 0.7254713351288957, + "flos": 616895073792.0, + "grad_norm": 0.04686882151976468, + "language_loss": 0.87040436, + "learning_rate": 0.00018496064275993324, + "loss": 0.88114178, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.28710938, + "step": 3771, + "time_per_iteration": 2.754624605178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067155, + "balance_loss_mlp": 1.03916478, + "epoch": 0.7256637168141593, + "flos": 766662285312.0, + "grad_norm": 0.06312025626452938, + "language_loss": 0.81491023, + "learning_rate": 0.00018471878158448686, + "loss": 0.82558179, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.2800293, + "step": 3772, + "time_per_iteration": 2.9370291233062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074719, + "balance_loss_mlp": 1.04641891, + "epoch": 0.7258560984994229, + "flos": 495268646400.0, + "grad_norm": 0.04821073170159266, + "language_loss": 0.83998889, + "learning_rate": 0.00018447704281206512, + "loss": 0.85073608, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.28344727, + "step": 3773, + "time_per_iteration": 2.8460988998413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073582, + "balance_loss_mlp": 1.04382753, + "epoch": 0.7260484801846864, + "flos": 529802712576.0, + "grad_norm": 0.22097506803040057, + "language_loss": 0.82744718, + "learning_rate": 0.0001842354265365191, + "loss": 0.83818305, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.29711914, + "step": 3774, + "time_per_iteration": 2.728426694869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107092, + "balance_loss_mlp": 1.04281068, + "epoch": 0.72624086186995, + "flos": 624679243776.0, + "grad_norm": 0.06612065150918205, + "language_loss": 0.8084085, + "learning_rate": 0.0001839939328516526, + "loss": 0.81911772, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.28100586, + "step": 3775, + "time_per_iteration": 2.730315923690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074711, + "balance_loss_mlp": 1.04631519, + "epoch": 0.7264332435552135, + "flos": 716203468800.0, + "grad_norm": 0.06548969982492862, + "language_loss": 0.81234205, + "learning_rate": 0.0001837525618512218, + "loss": 0.82308918, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.28369141, + "step": 3776, + "time_per_iteration": 2.8894991874694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069253, + "balance_loss_mlp": 1.04159606, + "epoch": 0.7266256252404771, + "flos": 680736821760.0, + "grad_norm": 0.059408980610910087, + "language_loss": 0.8289094, + "learning_rate": 0.00018351131362893519, + "loss": 0.83960199, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.27685547, + "step": 3777, + "time_per_iteration": 2.829299211502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072659, + "balance_loss_mlp": 1.04423952, + "epoch": 0.7268180069257407, + "flos": 518654651904.0, + "grad_norm": 0.07569647287253554, + "language_loss": 0.8052032, + "learning_rate": 0.00018327018827845364, + "loss": 0.81592977, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.28417969, + "step": 3778, + "time_per_iteration": 2.605602502822876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070858, + "balance_loss_mlp": 1.04279566, + "epoch": 0.7270103886110042, + "flos": 512411318784.0, + "grad_norm": 0.07105004265912586, + "language_loss": 0.87327212, + "learning_rate": 0.00018302918589339036, + "loss": 0.88398075, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.28051758, + "step": 3779, + "time_per_iteration": 2.644178628921509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073648, + "balance_loss_mlp": 1.04506147, + "epoch": 0.7272027702962678, + "flos": 546395355648.0, + "grad_norm": 0.05454287579555899, + "language_loss": 0.89820325, + "learning_rate": 0.00018278830656731054, + "loss": 0.90893972, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.28588867, + "step": 3780, + "time_per_iteration": 2.642853260040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067965, + "balance_loss_mlp": 1.03935504, + "epoch": 0.7273951519815314, + "flos": 592772926464.0, + "grad_norm": 0.049235223582258895, + "language_loss": 0.86383229, + "learning_rate": 0.00018254755039373222, + "loss": 0.87451196, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.28613281, + "step": 3781, + "time_per_iteration": 2.7858738899230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072596, + "balance_loss_mlp": 1.04377079, + "epoch": 0.727587533666795, + "flos": 605732456448.0, + "grad_norm": 0.06238056381578398, + "language_loss": 0.8331604, + "learning_rate": 0.0001823069174661252, + "loss": 0.84388638, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.2878418, + "step": 3782, + "time_per_iteration": 2.7796318531036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075989, + "balance_loss_mlp": 1.0479033, + "epoch": 0.7277799153520584, + "flos": 512770701312.0, + "grad_norm": 0.05705801102125677, + "language_loss": 0.78309739, + "learning_rate": 0.00018206640787791112, + "loss": 0.79385734, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.28125, + "step": 3783, + "time_per_iteration": 2.602808952331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072706, + "balance_loss_mlp": 1.04411936, + "epoch": 0.727972297037322, + "flos": 537498132480.0, + "grad_norm": 0.06294847174499694, + "language_loss": 0.85954249, + "learning_rate": 0.00018182602172246416, + "loss": 0.87026954, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.28588867, + "step": 3784, + "time_per_iteration": 2.6015853881835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076895, + "balance_loss_mlp": 1.04823709, + "epoch": 0.7281646787225856, + "flos": 534780223488.0, + "grad_norm": 0.06092859331592059, + "language_loss": 0.76170594, + "learning_rate": 0.00018158575909311075, + "loss": 0.77247488, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.28637695, + "step": 3785, + "time_per_iteration": 2.646030902862549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010794, + "balance_loss_mlp": 1.05038452, + "epoch": 0.7283570604078492, + "flos": 624767993856.0, + "grad_norm": 0.06146036016272455, + "language_loss": 0.79553497, + "learning_rate": 0.000181345620083129, + "loss": 0.80632889, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.29003906, + "step": 3786, + "time_per_iteration": 2.792757034301758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079884, + "balance_loss_mlp": 1.0520606, + "epoch": 0.7285494420931128, + "flos": 533904307200.0, + "grad_norm": 0.04915125322890423, + "language_loss": 0.86502135, + "learning_rate": 0.00018110560478574927, + "loss": 0.87582016, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.27856445, + "step": 3787, + "time_per_iteration": 2.6800973415374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074424, + "balance_loss_mlp": 1.04538465, + "epoch": 0.7287418237783763, + "flos": 666251011584.0, + "grad_norm": 0.0704647078753348, + "language_loss": 0.80134165, + "learning_rate": 0.0001808657132941533, + "loss": 0.81208593, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.2902832, + "step": 3788, + "time_per_iteration": 2.770371675491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075695, + "balance_loss_mlp": 1.04741848, + "epoch": 0.7289342054636399, + "flos": 550344181248.0, + "grad_norm": 0.07634779758427546, + "language_loss": 0.8289668, + "learning_rate": 0.00018062594570147572, + "loss": 0.83972371, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.28295898, + "step": 3789, + "time_per_iteration": 2.5850260257720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077852, + "balance_loss_mlp": 1.05000448, + "epoch": 0.7291265871489034, + "flos": 687620344320.0, + "grad_norm": 0.05162370165887138, + "language_loss": 0.85260105, + "learning_rate": 0.00018038630210080243, + "loss": 0.8633796, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.27880859, + "step": 3790, + "time_per_iteration": 2.837209701538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075748, + "balance_loss_mlp": 1.04744744, + "epoch": 0.729318968834167, + "flos": 572388609024.0, + "grad_norm": 0.05876653681305703, + "language_loss": 0.849635, + "learning_rate": 0.0001801467825851712, + "loss": 0.86039245, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.28295898, + "step": 3791, + "time_per_iteration": 2.7689332962036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076086, + "balance_loss_mlp": 1.04778624, + "epoch": 0.7295113505194305, + "flos": 585786097152.0, + "grad_norm": 0.058290229022120006, + "language_loss": 0.7850548, + "learning_rate": 0.00017990738724757172, + "loss": 0.79581565, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.28320312, + "step": 3792, + "time_per_iteration": 2.870572090148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078653, + "balance_loss_mlp": 1.05092454, + "epoch": 0.7297037322046941, + "flos": 706872669696.0, + "grad_norm": 0.05184173418469221, + "language_loss": 0.81961739, + "learning_rate": 0.00017966811618094598, + "loss": 0.83040386, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.27758789, + "step": 3793, + "time_per_iteration": 2.9314723014831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078553, + "balance_loss_mlp": 1.05044341, + "epoch": 0.7298961138899577, + "flos": 487039315968.0, + "grad_norm": 0.061838028009129596, + "language_loss": 0.8480593, + "learning_rate": 0.00017942896947818664, + "loss": 0.85884488, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.28125, + "step": 3794, + "time_per_iteration": 2.5791871547698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104707, + "balance_loss_mlp": 1.0351969, + "epoch": 0.7300884955752213, + "flos": 1365102222336.0, + "grad_norm": 0.022620155773541276, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.75872123, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.11865234, + "step": 3795, + "time_per_iteration": 4.875161647796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071538, + "balance_loss_mlp": 1.04383409, + "epoch": 0.7302808772604849, + "flos": 531550162944.0, + "grad_norm": 0.07025171922085349, + "language_loss": 0.85040843, + "learning_rate": 0.00017895104953559947, + "loss": 0.8611238, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.27734375, + "step": 3796, + "time_per_iteration": 2.625335216522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077716, + "balance_loss_mlp": 1.05027366, + "epoch": 0.7304732589457483, + "flos": 435949074432.0, + "grad_norm": 0.07017117998144913, + "language_loss": 0.89488584, + "learning_rate": 0.00017871227648131672, + "loss": 0.90566301, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.27490234, + "step": 3797, + "time_per_iteration": 2.4892690181732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075327, + "balance_loss_mlp": 1.04743159, + "epoch": 0.7306656406310119, + "flos": 451376229888.0, + "grad_norm": 0.0555809148766967, + "language_loss": 0.82792765, + "learning_rate": 0.0001784736281619907, + "loss": 0.83868086, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.27905273, + "step": 3798, + "time_per_iteration": 2.616964101791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077231, + "balance_loss_mlp": 1.04964578, + "epoch": 0.7308580223162755, + "flos": 511756572672.0, + "grad_norm": 0.06137974721906842, + "language_loss": 0.74274546, + "learning_rate": 0.00017823510467027232, + "loss": 0.75351775, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.27636719, + "step": 3799, + "time_per_iteration": 2.744365692138672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074556, + "balance_loss_mlp": 1.04558766, + "epoch": 0.7310504040015391, + "flos": 375209349120.0, + "grad_norm": 0.06884438361049809, + "language_loss": 0.78208685, + "learning_rate": 0.00017799670609876516, + "loss": 0.79283237, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.28930664, + "step": 3800, + "time_per_iteration": 2.505571126937866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072835, + "balance_loss_mlp": 1.04465413, + "epoch": 0.7312427856868026, + "flos": 549073976832.0, + "grad_norm": 0.05034282557889911, + "language_loss": 0.88874984, + "learning_rate": 0.00017775843254002366, + "loss": 0.8994782, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.28222656, + "step": 3801, + "time_per_iteration": 2.7557313442230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076377, + "balance_loss_mlp": 1.04802942, + "epoch": 0.7314351673720662, + "flos": 766880483328.0, + "grad_norm": 0.053157012048244724, + "language_loss": 0.8399632, + "learning_rate": 0.00017752028408655367, + "loss": 0.85072702, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.28344727, + "step": 3802, + "time_per_iteration": 3.03664231300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074125, + "balance_loss_mlp": 1.04551435, + "epoch": 0.7316275490573297, + "flos": 486492258816.0, + "grad_norm": 0.05941466781290568, + "language_loss": 0.85240817, + "learning_rate": 0.00017728226083081272, + "loss": 0.8631494, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.28564453, + "step": 3803, + "time_per_iteration": 2.557260513305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073414, + "balance_loss_mlp": 1.04554248, + "epoch": 0.7318199307425933, + "flos": 473183520768.0, + "grad_norm": 0.0569157917316084, + "language_loss": 0.8142879, + "learning_rate": 0.00017704436286520965, + "loss": 0.8250221, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.27929688, + "step": 3804, + "time_per_iteration": 2.531374454498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073136, + "balance_loss_mlp": 1.04500246, + "epoch": 0.7320123124278569, + "flos": 549202014720.0, + "grad_norm": 0.0615002003094314, + "language_loss": 0.84243524, + "learning_rate": 0.0001768065902821046, + "loss": 0.85316658, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.28149414, + "step": 3805, + "time_per_iteration": 2.7219231128692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070301, + "balance_loss_mlp": 1.04226291, + "epoch": 0.7322046941131204, + "flos": 570502946304.0, + "grad_norm": 0.050852375433721335, + "language_loss": 0.82159758, + "learning_rate": 0.00017656894317380907, + "loss": 0.83230054, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.28051758, + "step": 3806, + "time_per_iteration": 2.7360239028930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019748, + "balance_loss_mlp": 1.00816071, + "epoch": 0.732397075798384, + "flos": 1468334559744.0, + "grad_norm": 0.009321700757662343, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.77051014, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.11572266, + "step": 3807, + "time_per_iteration": 5.0339789390563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075379, + "balance_loss_mlp": 1.04662561, + "epoch": 0.7325894574836476, + "flos": 464620948992.0, + "grad_norm": 0.06770486672009031, + "language_loss": 0.83718252, + "learning_rate": 0.00017609402575064875, + "loss": 0.84793627, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.28710938, + "step": 3808, + "time_per_iteration": 2.5397021770477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073042, + "balance_loss_mlp": 1.04490852, + "epoch": 0.7327818391689112, + "flos": 495246887424.0, + "grad_norm": 0.07767281717141156, + "language_loss": 0.81099665, + "learning_rate": 0.00017585675562016367, + "loss": 0.8217271, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 0.28149414, + "step": 3809, + "time_per_iteration": 2.578652858734131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019398, + "balance_loss_mlp": 1.00781119, + "epoch": 0.7329742208541746, + "flos": 1432694794752.0, + "grad_norm": 0.0100864336281573, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78232253, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.11572266, + "step": 3810, + "time_per_iteration": 4.869556903839111 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069034, + "balance_loss_mlp": 1.04092479, + "epoch": 0.7331666025394382, + "flos": 496645129728.0, + "grad_norm": 0.16551466638387613, + "language_loss": 0.85115767, + "learning_rate": 0.00017538259298196474, + "loss": 0.861848, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.28100586, + "step": 3811, + "time_per_iteration": 2.5746755599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074051, + "balance_loss_mlp": 1.04551268, + "epoch": 0.7333589842247018, + "flos": 538247420928.0, + "grad_norm": 0.05568772928725353, + "language_loss": 0.81749296, + "learning_rate": 0.00017514570065833745, + "loss": 0.82823348, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.28540039, + "step": 3812, + "time_per_iteration": 2.74574613571167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073496, + "balance_loss_mlp": 1.04495704, + "epoch": 0.7335513659099654, + "flos": 490825198080.0, + "grad_norm": 0.06483425891488107, + "language_loss": 0.80511057, + "learning_rate": 0.00017490893445433426, + "loss": 0.81584549, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.28564453, + "step": 3813, + "time_per_iteration": 2.5976309776306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079068, + "balance_loss_mlp": 1.05026746, + "epoch": 0.733743747595229, + "flos": 561876355584.0, + "grad_norm": 0.07334965322780891, + "language_loss": 0.81267703, + "learning_rate": 0.00017467229446187587, + "loss": 0.82346773, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.2878418, + "step": 3814, + "time_per_iteration": 2.6907997131347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078482, + "balance_loss_mlp": 1.05044413, + "epoch": 0.7339361292804925, + "flos": 538315822080.0, + "grad_norm": 0.052639307044854956, + "language_loss": 0.81764507, + "learning_rate": 0.00017443578077283424, + "loss": 0.82842994, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.28027344, + "step": 3815, + "time_per_iteration": 2.65816593170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077176, + "balance_loss_mlp": 1.04882812, + "epoch": 0.734128510965756, + "flos": 548198060544.0, + "grad_norm": 0.062049617931530306, + "language_loss": 0.84998393, + "learning_rate": 0.0001741993934790319, + "loss": 0.86075574, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.28344727, + "step": 3816, + "time_per_iteration": 2.738459348678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074176, + "balance_loss_mlp": 1.04594707, + "epoch": 0.7343208926510196, + "flos": 539783875584.0, + "grad_norm": 0.06367069815606033, + "language_loss": 0.8424527, + "learning_rate": 0.00017396313267224273, + "loss": 0.85319448, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.2824707, + "step": 3817, + "time_per_iteration": 2.7235686779022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079748, + "balance_loss_mlp": 1.05144763, + "epoch": 0.7345132743362832, + "flos": 570827423232.0, + "grad_norm": 0.05690847114233298, + "language_loss": 0.88229644, + "learning_rate": 0.0001737269984441912, + "loss": 0.89309394, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.28320312, + "step": 3818, + "time_per_iteration": 2.664562225341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079058, + "balance_loss_mlp": 1.05140162, + "epoch": 0.7347056560215467, + "flos": 545135325696.0, + "grad_norm": 0.059530599678457814, + "language_loss": 0.85132968, + "learning_rate": 0.00017349099088655263, + "loss": 0.86212027, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.27661133, + "step": 3819, + "time_per_iteration": 2.713716506958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079811, + "balance_loss_mlp": 1.05153477, + "epoch": 0.7348980377068103, + "flos": 595668335616.0, + "grad_norm": 0.07896802475478679, + "language_loss": 0.80594087, + "learning_rate": 0.00017325511009095375, + "loss": 0.81673896, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.28271484, + "step": 3820, + "time_per_iteration": 2.729605197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075678, + "balance_loss_mlp": 1.04766417, + "epoch": 0.7350904193920739, + "flos": 538291090944.0, + "grad_norm": 0.05267126362138293, + "language_loss": 0.83587992, + "learning_rate": 0.00017301935614897113, + "loss": 0.84663677, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.28051758, + "step": 3821, + "time_per_iteration": 2.6848647594451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076838, + "balance_loss_mlp": 1.0488472, + "epoch": 0.7352828010773375, + "flos": 512712474624.0, + "grad_norm": 0.0534844061316339, + "language_loss": 0.81780893, + "learning_rate": 0.00017278372915213274, + "loss": 0.82857728, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.28027344, + "step": 3822, + "time_per_iteration": 2.650430679321289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031331, + "balance_loss_mlp": 1.01945734, + "epoch": 0.735475182762601, + "flos": 1552965087744.0, + "grad_norm": 0.013429842271997025, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.80925179, + "num_input_tokens_seen": 316967104, + "router_z_loss_mlp": 0.11865234, + "step": 3823, + "time_per_iteration": 4.986204385757446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079335, + "balance_loss_mlp": 1.05139256, + "epoch": 0.7356675644478645, + "flos": 680984133120.0, + "grad_norm": 0.05755686388123544, + "language_loss": 0.80487376, + "learning_rate": 0.00017231285635975314, + "loss": 0.81566715, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.27929688, + "step": 3824, + "time_per_iteration": 2.952411413192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107638, + "balance_loss_mlp": 1.04755485, + "epoch": 0.7358599461331281, + "flos": 514961902080.0, + "grad_norm": 0.0735633923389538, + "language_loss": 0.82809317, + "learning_rate": 0.00017207761074702115, + "loss": 0.83885694, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.28808594, + "step": 3825, + "time_per_iteration": 2.6093246936798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080498, + "balance_loss_mlp": 1.05093431, + "epoch": 0.7360523278183917, + "flos": 443739036672.0, + "grad_norm": 0.05450452025217221, + "language_loss": 0.83744037, + "learning_rate": 0.0001718424924450514, + "loss": 0.84824538, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.29516602, + "step": 3826, + "time_per_iteration": 2.625596046447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072132, + "balance_loss_mlp": 1.04387975, + "epoch": 0.7362447095036553, + "flos": 603142585344.0, + "grad_norm": 0.04900180424478287, + "language_loss": 0.85697591, + "learning_rate": 0.00017160750154512482, + "loss": 0.86769724, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.2824707, + "step": 3827, + "time_per_iteration": 4.115647554397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077067, + "balance_loss_mlp": 1.04912448, + "epoch": 0.7364370911889189, + "flos": 552807424512.0, + "grad_norm": 0.04912825481573526, + "language_loss": 0.83176559, + "learning_rate": 0.0001713726381384731, + "loss": 0.84253627, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.27954102, + "step": 3828, + "time_per_iteration": 2.794640302658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070043, + "balance_loss_mlp": 1.04140913, + "epoch": 0.7366294728741823, + "flos": 448830028800.0, + "grad_norm": 0.06936682542859615, + "language_loss": 0.80874848, + "learning_rate": 0.00017113790231627812, + "loss": 0.81944889, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.28637695, + "step": 3829, + "time_per_iteration": 2.5032026767730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023029, + "balance_loss_mlp": 1.01086962, + "epoch": 0.7368218545594459, + "flos": 1534705132032.0, + "grad_norm": 0.00938038964712245, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80281258, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.12158203, + "step": 3830, + "time_per_iteration": 4.790278911590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107551, + "balance_loss_mlp": 1.04701948, + "epoch": 0.7370142362447095, + "flos": 515164133376.0, + "grad_norm": 0.05667126288905575, + "language_loss": 0.81707335, + "learning_rate": 0.00017066881378973936, + "loss": 0.82782841, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.28491211, + "step": 3831, + "time_per_iteration": 2.6234376430511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072767, + "balance_loss_mlp": 1.0442524, + "epoch": 0.7372066179299731, + "flos": 500531346432.0, + "grad_norm": 0.05465479593854143, + "language_loss": 0.82744801, + "learning_rate": 0.00017043446126751189, + "loss": 0.83817565, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.28540039, + "step": 3832, + "time_per_iteration": 2.68343186378479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069311, + "balance_loss_mlp": 1.04089189, + "epoch": 0.7373989996152366, + "flos": 557814048768.0, + "grad_norm": 0.15091194873702685, + "language_loss": 0.76596999, + "learning_rate": 0.00017020023669397376, + "loss": 0.77666306, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.28442383, + "step": 3833, + "time_per_iteration": 2.709726572036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080144, + "balance_loss_mlp": 1.05141497, + "epoch": 0.7375913813005002, + "flos": 506527368192.0, + "grad_norm": 0.054777149599410456, + "language_loss": 0.81358391, + "learning_rate": 0.0001699661401600589, + "loss": 0.82438534, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.28759766, + "step": 3834, + "time_per_iteration": 2.5703024864196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074935, + "balance_loss_mlp": 1.04680145, + "epoch": 0.7377837629857638, + "flos": 485940819456.0, + "grad_norm": 0.05177646885601935, + "language_loss": 0.78090227, + "learning_rate": 0.00016973217175665205, + "loss": 0.79165161, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.28125, + "step": 3835, + "time_per_iteration": 2.567094564437866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033946, + "balance_loss_mlp": 1.02178645, + "epoch": 0.7379761446710273, + "flos": 1413900776448.0, + "grad_norm": 0.015599325923103721, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.8220011, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.12158203, + "step": 3836, + "time_per_iteration": 4.926120281219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079166, + "balance_loss_mlp": 1.05046034, + "epoch": 0.7381685263562909, + "flos": 629445758976.0, + "grad_norm": 0.08209233600612638, + "language_loss": 0.83787167, + "learning_rate": 0.00016926461970465047, + "loss": 0.84866333, + "num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.28710938, + "step": 3837, + "time_per_iteration": 2.8248865604400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079264, + "balance_loss_mlp": 1.0512259, + "epoch": 0.7383609080415544, + "flos": 738869147136.0, + "grad_norm": 0.0447245395908081, + "language_loss": 0.84287, + "learning_rate": 0.00016903103623757516, + "loss": 0.85366273, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.28051758, + "step": 3838, + "time_per_iteration": 3.0732860565185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076489, + "balance_loss_mlp": 1.04818845, + "epoch": 0.738553289726818, + "flos": 549945510912.0, + "grad_norm": 0.060261467227696625, + "language_loss": 0.801202, + "learning_rate": 0.00016879758126404738, + "loss": 0.8119669, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.28295898, + "step": 3839, + "time_per_iteration": 2.6999428272247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081913, + "balance_loss_mlp": 1.05420828, + "epoch": 0.7387456714120816, + "flos": 909925705728.0, + "grad_norm": 0.0717530150127342, + "language_loss": 0.80011249, + "learning_rate": 0.00016856425487470216, + "loss": 0.81093156, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.27758789, + "step": 3840, + "time_per_iteration": 3.0798532962799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075668, + "balance_loss_mlp": 1.047153, + "epoch": 0.7389380530973452, + "flos": 852308352000.0, + "grad_norm": 0.06037669736072389, + "language_loss": 0.79319191, + "learning_rate": 0.00016833105716012486, + "loss": 0.80394864, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.28540039, + "step": 3841, + "time_per_iteration": 3.125180244445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069813, + "balance_loss_mlp": 1.04144144, + "epoch": 0.7391304347826086, + "flos": 816678761472.0, + "grad_norm": 0.05821002881472178, + "language_loss": 0.84839195, + "learning_rate": 0.00016809798821085088, + "loss": 0.85909009, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.28344727, + "step": 3842, + "time_per_iteration": 2.9953746795654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082054, + "balance_loss_mlp": 1.05303824, + "epoch": 0.7393228164678722, + "flos": 572541378048.0, + "grad_norm": 0.054657255359861566, + "language_loss": 0.89063728, + "learning_rate": 0.00016786504811736565, + "loss": 0.90145791, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.28979492, + "step": 3843, + "time_per_iteration": 2.7037930488586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077429, + "balance_loss_mlp": 1.04869962, + "epoch": 0.7395151981531358, + "flos": 684903845376.0, + "grad_norm": 0.06408695288095054, + "language_loss": 0.82701367, + "learning_rate": 0.00016763223697010442, + "loss": 0.83778793, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.28710938, + "step": 3844, + "time_per_iteration": 2.9637320041656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107492, + "balance_loss_mlp": 1.0469532, + "epoch": 0.7397075798383994, + "flos": 556095711744.0, + "grad_norm": 0.05096747285284615, + "language_loss": 0.84036589, + "learning_rate": 0.00016739955485945256, + "loss": 0.85111511, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.2800293, + "step": 3845, + "time_per_iteration": 2.698608160018921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070906, + "balance_loss_mlp": 1.04255807, + "epoch": 0.739899961523663, + "flos": 546523393536.0, + "grad_norm": 0.07070386524494449, + "language_loss": 0.85914421, + "learning_rate": 0.00016716700187574513, + "loss": 0.86985326, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.28369141, + "step": 3846, + "time_per_iteration": 2.686567544937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075336, + "balance_loss_mlp": 1.04787064, + "epoch": 0.7400923432089265, + "flos": 608913054720.0, + "grad_norm": 0.09697778830761983, + "language_loss": 0.83608466, + "learning_rate": 0.0001669345781092675, + "loss": 0.846838, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.27490234, + "step": 3847, + "time_per_iteration": 2.705946445465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075753, + "balance_loss_mlp": 1.04742908, + "epoch": 0.7402847248941901, + "flos": 590715555840.0, + "grad_norm": 0.07758942034588075, + "language_loss": 0.87070894, + "learning_rate": 0.0001667022836502546, + "loss": 0.88146651, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.28320312, + "step": 3848, + "time_per_iteration": 2.727207899093628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074969, + "balance_loss_mlp": 1.04657388, + "epoch": 0.7404771065794536, + "flos": 477136728576.0, + "grad_norm": 0.06324539449596041, + "language_loss": 0.82776666, + "learning_rate": 0.00016647011858889077, + "loss": 0.83851635, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.28369141, + "step": 3849, + "time_per_iteration": 2.552164077758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074172, + "balance_loss_mlp": 1.04577661, + "epoch": 0.7406694882647172, + "flos": 496192614912.0, + "grad_norm": 0.0765277016597007, + "language_loss": 0.86005962, + "learning_rate": 0.00016623808301531056, + "loss": 0.87080133, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.28417969, + "step": 3850, + "time_per_iteration": 2.6483278274536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073128, + "balance_loss_mlp": 1.04551888, + "epoch": 0.7408618699499807, + "flos": 561925817856.0, + "grad_norm": 0.06196174014296942, + "language_loss": 0.79140496, + "learning_rate": 0.00016600617701959842, + "loss": 0.8021363, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.27636719, + "step": 3851, + "time_per_iteration": 2.850390911102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024795, + "balance_loss_mlp": 1.01268303, + "epoch": 0.7410542516352443, + "flos": 1387421512704.0, + "grad_norm": 0.012000469023036765, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.79868609, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.12109375, + "step": 3852, + "time_per_iteration": 5.050019979476929 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074852, + "balance_loss_mlp": 1.04628921, + "epoch": 0.7412466333205079, + "flos": 669697860096.0, + "grad_norm": 0.08114806024349476, + "language_loss": 0.80909729, + "learning_rate": 0.00016554275412186315, + "loss": 0.8198458, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.28564453, + "step": 3853, + "time_per_iteration": 2.866884708404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071217, + "balance_loss_mlp": 1.04265463, + "epoch": 0.7414390150057715, + "flos": 489038459904.0, + "grad_norm": 0.09161546445880692, + "language_loss": 0.80530989, + "learning_rate": 0.0001653112373997568, + "loss": 0.8160221, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.28588867, + "step": 3854, + "time_per_iteration": 2.6828300952911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075016, + "balance_loss_mlp": 1.04712129, + "epoch": 0.7416313966910351, + "flos": 599119566336.0, + "grad_norm": 0.06308625069628188, + "language_loss": 0.74284655, + "learning_rate": 0.0001650798506153517, + "loss": 0.75359672, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.27929688, + "step": 3855, + "time_per_iteration": 2.6935112476348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073152, + "balance_loss_mlp": 1.04473197, + "epoch": 0.7418237783762985, + "flos": 542279204352.0, + "grad_norm": 0.08209880324062359, + "language_loss": 0.84122801, + "learning_rate": 0.00016484859385848023, + "loss": 0.85195947, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.28442383, + "step": 3856, + "time_per_iteration": 2.620311975479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073651, + "balance_loss_mlp": 1.04501677, + "epoch": 0.7420161600615621, + "flos": 543865121280.0, + "grad_norm": 0.06689669498305581, + "language_loss": 0.76970744, + "learning_rate": 0.0001646174672189243, + "loss": 0.78044391, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.28613281, + "step": 3857, + "time_per_iteration": 2.6914920806884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106941, + "balance_loss_mlp": 1.04087138, + "epoch": 0.7422085417468257, + "flos": 526921860096.0, + "grad_norm": 0.07125061218981377, + "language_loss": 0.80480021, + "learning_rate": 0.00016438647078641488, + "loss": 0.8154943, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.28515625, + "step": 3858, + "time_per_iteration": 2.6275553703308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069955, + "balance_loss_mlp": 1.04103458, + "epoch": 0.7424009234320893, + "flos": 508404266496.0, + "grad_norm": 0.0650961492971168, + "language_loss": 0.83072245, + "learning_rate": 0.00016415560465063344, + "loss": 0.84142196, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.28930664, + "step": 3859, + "time_per_iteration": 2.732268810272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068048, + "balance_loss_mlp": 1.03886604, + "epoch": 0.7425933051173528, + "flos": 512347299840.0, + "grad_norm": 0.07578384946449068, + "language_loss": 0.78930503, + "learning_rate": 0.0001639248689012095, + "loss": 0.79998553, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.29101562, + "step": 3860, + "time_per_iteration": 2.571627378463745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071332, + "balance_loss_mlp": 1.04188704, + "epoch": 0.7427856868026164, + "flos": 458034200064.0, + "grad_norm": 0.06018469098837617, + "language_loss": 0.87730241, + "learning_rate": 0.00016369426362772271, + "loss": 0.88801575, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.29394531, + "step": 3861, + "time_per_iteration": 2.803495407104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107102, + "balance_loss_mlp": 1.04219532, + "epoch": 0.74297806848788, + "flos": 604728502272.0, + "grad_norm": 0.05947124800099814, + "language_loss": 0.80541736, + "learning_rate": 0.00016346378891970233, + "loss": 0.81612754, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.28833008, + "step": 3862, + "time_per_iteration": 2.8671751022338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071209, + "balance_loss_mlp": 1.04183578, + "epoch": 0.7431704501731435, + "flos": 890971564032.0, + "grad_norm": 0.05726542490411253, + "language_loss": 0.80970359, + "learning_rate": 0.00016323344486662633, + "loss": 0.82041574, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.29345703, + "step": 3863, + "time_per_iteration": 3.310399055480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067129, + "balance_loss_mlp": 1.03808928, + "epoch": 0.7433628318584071, + "flos": 591867896832.0, + "grad_norm": 0.05550567007056857, + "language_loss": 0.7837103, + "learning_rate": 0.00016300323155792247, + "loss": 0.79438156, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.29003906, + "step": 3864, + "time_per_iteration": 2.9007768630981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065912, + "balance_loss_mlp": 1.03658676, + "epoch": 0.7435552135436706, + "flos": 476896619520.0, + "grad_norm": 0.0566624200483065, + "language_loss": 0.8859086, + "learning_rate": 0.00016277314908296687, + "loss": 0.8965677, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.29296875, + "step": 3865, + "time_per_iteration": 2.6249654293060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066459, + "balance_loss_mlp": 1.03741968, + "epoch": 0.7437475952289342, + "flos": 672874076160.0, + "grad_norm": 0.08514855435260649, + "language_loss": 0.76358485, + "learning_rate": 0.00016254319753108604, + "loss": 0.77424943, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.2902832, + "step": 3866, + "time_per_iteration": 2.816335678100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070215, + "balance_loss_mlp": 1.04029381, + "epoch": 0.7439399769141978, + "flos": 770094577152.0, + "grad_norm": 0.06451588447838245, + "language_loss": 0.76624024, + "learning_rate": 0.00016231337699155492, + "loss": 0.77694237, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.29858398, + "step": 3867, + "time_per_iteration": 2.9624359607696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068647, + "balance_loss_mlp": 1.03965509, + "epoch": 0.7441323585994614, + "flos": 647462785536.0, + "grad_norm": 0.05724025816545972, + "language_loss": 0.78232771, + "learning_rate": 0.0001620836875535977, + "loss": 0.79301417, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.28930664, + "step": 3868, + "time_per_iteration": 2.847935199737549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064683, + "balance_loss_mlp": 1.03502417, + "epoch": 0.7443247402847248, + "flos": 565091859456.0, + "grad_norm": 0.05959682093806377, + "language_loss": 0.8083024, + "learning_rate": 0.00016185412930638766, + "loss": 0.81894922, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.29614258, + "step": 3869, + "time_per_iteration": 2.8403937816619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066357, + "balance_loss_mlp": 1.03738952, + "epoch": 0.7445171219699884, + "flos": 578243446272.0, + "grad_norm": 0.07528663769221765, + "language_loss": 0.82963836, + "learning_rate": 0.00016162470233904765, + "loss": 0.84030193, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.28955078, + "step": 3870, + "time_per_iteration": 2.7301175594329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065915, + "balance_loss_mlp": 1.03685129, + "epoch": 0.744709503655252, + "flos": 618588679680.0, + "grad_norm": 0.055174574386506046, + "language_loss": 0.8203845, + "learning_rate": 0.00016139540674064856, + "loss": 0.83104366, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.2902832, + "step": 3871, + "time_per_iteration": 2.728790760040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070553, + "balance_loss_mlp": 1.0411799, + "epoch": 0.7449018853405156, + "flos": 528355008000.0, + "grad_norm": 0.05299342012379109, + "language_loss": 0.77625883, + "learning_rate": 0.00016116624260021113, + "loss": 0.78696442, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.29321289, + "step": 3872, + "time_per_iteration": 2.7653627395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064492, + "balance_loss_mlp": 1.0351187, + "epoch": 0.7450942670257792, + "flos": 433088570880.0, + "grad_norm": 0.05882503001296847, + "language_loss": 0.8393743, + "learning_rate": 0.0001609372100067046, + "loss": 0.85001922, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.29345703, + "step": 3873, + "time_per_iteration": 2.556082010269165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062558, + "balance_loss_mlp": 1.03318477, + "epoch": 0.7452866487110427, + "flos": 696562647552.0, + "grad_norm": 0.0629532265793869, + "language_loss": 0.84404862, + "learning_rate": 0.0001607083090490475, + "loss": 0.85467416, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.29296875, + "step": 3874, + "time_per_iteration": 2.8703696727752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068049, + "balance_loss_mlp": 1.0391767, + "epoch": 0.7454790303963063, + "flos": 511944247296.0, + "grad_norm": 0.07079518805711353, + "language_loss": 0.79695952, + "learning_rate": 0.00016047953981610714, + "loss": 0.80764002, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.28857422, + "step": 3875, + "time_per_iteration": 2.7114357948303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006736, + "balance_loss_mlp": 0.99467212, + "epoch": 0.7456714120815698, + "flos": 1325221088256.0, + "grad_norm": 0.007120969619793637, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.80736375, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.12060547, + "step": 3876, + "time_per_iteration": 4.9630632400512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061746, + "balance_loss_mlp": 1.03232551, + "epoch": 0.7458637937668334, + "flos": 721397767680.0, + "grad_norm": 0.06112785741663116, + "language_loss": 0.81022239, + "learning_rate": 0.0001600223968795889, + "loss": 0.82083988, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.29394531, + "step": 3877, + "time_per_iteration": 2.8622119426727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006979, + "balance_loss_mlp": 0.99501073, + "epoch": 0.746056175452097, + "flos": 1500761793024.0, + "grad_norm": 0.005911171092350221, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.76703048, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.11962891, + "step": 3878, + "time_per_iteration": 4.92147159576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064327, + "balance_loss_mlp": 1.03521585, + "epoch": 0.7462485571373605, + "flos": 519984493056.0, + "grad_norm": 0.0740832902187226, + "language_loss": 0.81523597, + "learning_rate": 0.00015956578190706483, + "loss": 0.82587922, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.29077148, + "step": 3879, + "time_per_iteration": 2.673748016357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065097, + "balance_loss_mlp": 1.03529429, + "epoch": 0.7464409388226241, + "flos": 480967690752.0, + "grad_norm": 0.05926630999911606, + "language_loss": 0.75906825, + "learning_rate": 0.00015933767262892468, + "loss": 0.76971918, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.29760742, + "step": 3880, + "time_per_iteration": 2.7114145755767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069606, + "balance_loss_mlp": 1.03937459, + "epoch": 0.7466333205078877, + "flos": 486516989952.0, + "grad_norm": 0.07620522972756824, + "language_loss": 0.81981504, + "learning_rate": 0.00015910969560762927, + "loss": 0.83051109, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.30175781, + "step": 3881, + "time_per_iteration": 2.5965123176574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067971, + "balance_loss_mlp": 1.03790677, + "epoch": 0.7468257021931513, + "flos": 611015505408.0, + "grad_norm": 0.05603078059754119, + "language_loss": 0.83325368, + "learning_rate": 0.00015888185093168727, + "loss": 0.84393334, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.30053711, + "step": 3882, + "time_per_iteration": 2.732828378677368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067064, + "balance_loss_mlp": 1.03709519, + "epoch": 0.7470180838784147, + "flos": 533204481024.0, + "grad_norm": 0.06025549136597994, + "language_loss": 0.8122552, + "learning_rate": 0.00015865413868955581, + "loss": 0.82292587, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.29931641, + "step": 3883, + "time_per_iteration": 2.6130521297454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066146, + "balance_loss_mlp": 1.03577161, + "epoch": 0.7472104655636783, + "flos": 739005949440.0, + "grad_norm": 0.0544206071008422, + "language_loss": 0.8260529, + "learning_rate": 0.00015842655896964054, + "loss": 0.83671433, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.30322266, + "step": 3884, + "time_per_iteration": 3.0686898231506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068928, + "balance_loss_mlp": 1.03912604, + "epoch": 0.7474028472489419, + "flos": 640007474688.0, + "grad_norm": 0.07023161322090775, + "language_loss": 0.73560184, + "learning_rate": 0.00015819911186029567, + "loss": 0.7462911, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.29785156, + "step": 3885, + "time_per_iteration": 2.7895405292510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067979, + "balance_loss_mlp": 1.03808117, + "epoch": 0.7475952289342055, + "flos": 589980824064.0, + "grad_norm": 0.059238744927090525, + "language_loss": 0.86428809, + "learning_rate": 0.00015797179744982443, + "loss": 0.87496781, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.29833984, + "step": 3886, + "time_per_iteration": 2.7247395515441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068837, + "balance_loss_mlp": 1.03986931, + "epoch": 0.7477876106194691, + "flos": 487935581184.0, + "grad_norm": 0.04858811748134261, + "language_loss": 0.78711867, + "learning_rate": 0.00015774461582647765, + "loss": 0.79780704, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.28930664, + "step": 3887, + "time_per_iteration": 2.633619785308838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066597, + "balance_loss_mlp": 1.0372951, + "epoch": 0.7479799923047326, + "flos": 554470507008.0, + "grad_norm": 0.06558254439957789, + "language_loss": 0.80900019, + "learning_rate": 0.00015751756707845505, + "loss": 0.81966615, + "num_input_tokens_seen": 322472512, + "router_z_loss_mlp": 0.29272461, + "step": 3888, + "time_per_iteration": 2.606644630432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066336, + "balance_loss_mlp": 1.03703403, + "epoch": 0.7481723739899961, + "flos": 767037634560.0, + "grad_norm": 0.05503127509914209, + "language_loss": 0.88178474, + "learning_rate": 0.00015729065129390502, + "loss": 0.89244807, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.29296875, + "step": 3889, + "time_per_iteration": 2.997523784637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067289, + "balance_loss_mlp": 1.03891718, + "epoch": 0.7483647556752597, + "flos": 495926364672.0, + "grad_norm": 0.06469395023850445, + "language_loss": 0.82209432, + "learning_rate": 0.0001570638685609241, + "loss": 0.83276725, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.28369141, + "step": 3890, + "time_per_iteration": 2.569988250732422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106895, + "balance_loss_mlp": 1.03950548, + "epoch": 0.7485571373605233, + "flos": 472607350272.0, + "grad_norm": 0.06811331087467534, + "language_loss": 0.80319339, + "learning_rate": 0.00015683721896755693, + "loss": 0.81388295, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.29443359, + "step": 3891, + "time_per_iteration": 2.5164339542388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026235, + "balance_loss_mlp": 1.01455247, + "epoch": 0.7487495190457868, + "flos": 1553619833856.0, + "grad_norm": 0.016089611749753062, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.8323673, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.11669922, + "step": 3892, + "time_per_iteration": 4.94329047203064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071321, + "balance_loss_mlp": 1.04273486, + "epoch": 0.7489419007310504, + "flos": 581566639104.0, + "grad_norm": 0.05717636586120892, + "language_loss": 0.85079896, + "learning_rate": 0.00015638431955158528, + "loss": 0.86151218, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.28588867, + "step": 3893, + "time_per_iteration": 2.6895976066589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067708, + "balance_loss_mlp": 1.03823924, + "epoch": 0.749134282416314, + "flos": 567297616896.0, + "grad_norm": 0.05490928633036113, + "language_loss": 0.80953169, + "learning_rate": 0.00015615806990481186, + "loss": 0.82020867, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.29394531, + "step": 3894, + "time_per_iteration": 2.7377114295959473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066836, + "balance_loss_mlp": 1.03796339, + "epoch": 0.7493266641015776, + "flos": 532786871808.0, + "grad_norm": 0.04620973196436286, + "language_loss": 0.843225, + "learning_rate": 0.00015593195374931452, + "loss": 0.8538934, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.28808594, + "step": 3895, + "time_per_iteration": 2.7463459968566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066974, + "balance_loss_mlp": 1.03781486, + "epoch": 0.7495190457868411, + "flos": 523338209280.0, + "grad_norm": 0.06172140758760985, + "language_loss": 0.79870188, + "learning_rate": 0.00015570597117287922, + "loss": 0.80937159, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.29125977, + "step": 3896, + "time_per_iteration": 2.698322057723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065177, + "balance_loss_mlp": 1.03585148, + "epoch": 0.7497114274721046, + "flos": 513937598976.0, + "grad_norm": 0.06184521079833043, + "language_loss": 0.77818131, + "learning_rate": 0.0001554801222632406, + "loss": 0.78883302, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.29296875, + "step": 3897, + "time_per_iteration": 2.5883569717407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067337, + "balance_loss_mlp": 1.03872728, + "epoch": 0.7499038091573682, + "flos": 494759467008.0, + "grad_norm": 0.05373326836284952, + "language_loss": 0.8491286, + "learning_rate": 0.00015525440710808052, + "loss": 0.85980201, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.28588867, + "step": 3898, + "time_per_iteration": 2.628744125366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063318, + "balance_loss_mlp": 1.03415978, + "epoch": 0.7500961908426318, + "flos": 737326900224.0, + "grad_norm": 0.060715179246677825, + "language_loss": 0.77859104, + "learning_rate": 0.00015502882579502953, + "loss": 0.78922421, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.29101562, + "step": 3899, + "time_per_iteration": 2.9461636543273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106545, + "balance_loss_mlp": 1.03576672, + "epoch": 0.7502885725278954, + "flos": 533117140992.0, + "grad_norm": 0.04885018850646455, + "language_loss": 0.84403229, + "learning_rate": 0.00015480337841166592, + "loss": 0.85468674, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.29638672, + "step": 3900, + "time_per_iteration": 2.712470531463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071559, + "balance_loss_mlp": 1.04287767, + "epoch": 0.7504809542131589, + "flos": 589017567744.0, + "grad_norm": 0.062426881340490126, + "language_loss": 0.83192408, + "learning_rate": 0.00015457806504551647, + "loss": 0.84263968, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.28686523, + "step": 3901, + "time_per_iteration": 2.8195760250091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065, + "balance_loss_mlp": 1.0360322, + "epoch": 0.7506733358984224, + "flos": 511293883392.0, + "grad_norm": 0.11477974594715189, + "language_loss": 0.78299713, + "learning_rate": 0.0001543528857840554, + "loss": 0.79364717, + "num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.28955078, + "step": 3902, + "time_per_iteration": 2.630005121231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069882, + "balance_loss_mlp": 1.04155791, + "epoch": 0.750865717583686, + "flos": 538990917120.0, + "grad_norm": 0.06709872205496833, + "language_loss": 0.80052483, + "learning_rate": 0.000154127840714705, + "loss": 0.81122363, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.28320312, + "step": 3903, + "time_per_iteration": 2.7631478309631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064518, + "balance_loss_mlp": 1.03574109, + "epoch": 0.7510580992689496, + "flos": 476339387904.0, + "grad_norm": 0.0656362631946546, + "language_loss": 0.81441653, + "learning_rate": 0.00015390292992483557, + "loss": 0.82506168, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.28759766, + "step": 3904, + "time_per_iteration": 2.5295097827911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069456, + "balance_loss_mlp": 1.0401783, + "epoch": 0.7512504809542132, + "flos": 578755597824.0, + "grad_norm": 0.05357678642302426, + "language_loss": 0.84239411, + "learning_rate": 0.00015367815350176523, + "loss": 0.85308868, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.29223633, + "step": 3905, + "time_per_iteration": 2.774902582168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065573, + "balance_loss_mlp": 1.03674817, + "epoch": 0.7514428626394767, + "flos": 418435435008.0, + "grad_norm": 0.052651193007747205, + "language_loss": 0.82780552, + "learning_rate": 0.00015345351153275987, + "loss": 0.83846122, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.28808594, + "step": 3906, + "time_per_iteration": 2.514157772064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068641, + "balance_loss_mlp": 1.03964877, + "epoch": 0.7516352443247403, + "flos": 640736414208.0, + "grad_norm": 0.05447043379457725, + "language_loss": 0.80753815, + "learning_rate": 0.00015322900410503332, + "loss": 0.81822455, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.28955078, + "step": 3907, + "time_per_iteration": 2.8011515140533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070258, + "balance_loss_mlp": 1.04150474, + "epoch": 0.7518276260100039, + "flos": 580700897280.0, + "grad_norm": 0.13484252880290531, + "language_loss": 0.77137792, + "learning_rate": 0.00015300463130574703, + "loss": 0.78208047, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.28710938, + "step": 3908, + "time_per_iteration": 2.8607709407806396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068663, + "balance_loss_mlp": 1.03983819, + "epoch": 0.7520200076952674, + "flos": 687025234944.0, + "grad_norm": 0.04704882043674688, + "language_loss": 0.82268852, + "learning_rate": 0.00015278039322201033, + "loss": 0.8333751, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.28808594, + "step": 3909, + "time_per_iteration": 2.9650497436523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068127, + "balance_loss_mlp": 1.04047048, + "epoch": 0.7522123893805309, + "flos": 486196895232.0, + "grad_norm": 0.0655524275561889, + "language_loss": 0.79742765, + "learning_rate": 0.00015255628994088004, + "loss": 0.80810893, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.27685547, + "step": 3910, + "time_per_iteration": 2.5476014614105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073189, + "balance_loss_mlp": 1.04410195, + "epoch": 0.7524047710657945, + "flos": 818581800960.0, + "grad_norm": 0.059223553783327845, + "language_loss": 0.74873102, + "learning_rate": 0.00015233232154936082, + "loss": 0.75946289, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.29101562, + "step": 3911, + "time_per_iteration": 3.244593858718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070034, + "balance_loss_mlp": 1.04204392, + "epoch": 0.7525971527510581, + "flos": 699191806464.0, + "grad_norm": 0.05757806259910298, + "language_loss": 0.76233411, + "learning_rate": 0.0001521084881344048, + "loss": 0.77303445, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.27978516, + "step": 3912, + "time_per_iteration": 2.874175548553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068448, + "balance_loss_mlp": 1.03988528, + "epoch": 0.7527895344363217, + "flos": 633497891328.0, + "grad_norm": 0.058305123662607664, + "language_loss": 0.8657366, + "learning_rate": 0.00015188478978291208, + "loss": 0.87642109, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.28564453, + "step": 3913, + "time_per_iteration": 2.76914119720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072508, + "balance_loss_mlp": 1.04387414, + "epoch": 0.7529819161215853, + "flos": 562555832832.0, + "grad_norm": 0.05696914319302461, + "language_loss": 0.8621434, + "learning_rate": 0.00015166122658173014, + "loss": 0.87286842, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.28637695, + "step": 3914, + "time_per_iteration": 2.7666819095611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069582, + "balance_loss_mlp": 1.04121017, + "epoch": 0.7531742978068487, + "flos": 690344045568.0, + "grad_norm": 0.05613078933144466, + "language_loss": 0.88230741, + "learning_rate": 0.00015143779861765332, + "loss": 0.89300323, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.28369141, + "step": 3915, + "time_per_iteration": 2.9440953731536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068792, + "balance_loss_mlp": 1.04058695, + "epoch": 0.7533666794921123, + "flos": 680800840704.0, + "grad_norm": 0.0540096565314657, + "language_loss": 0.81303173, + "learning_rate": 0.00015121450597742458, + "loss": 0.82371962, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.28198242, + "step": 3916, + "time_per_iteration": 2.8476526737213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067282, + "balance_loss_mlp": 1.03871989, + "epoch": 0.7535590611773759, + "flos": 623384308224.0, + "grad_norm": 0.0625846652791648, + "language_loss": 0.78284335, + "learning_rate": 0.00015099134874773369, + "loss": 0.79351616, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.28613281, + "step": 3917, + "time_per_iteration": 2.7236275672912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066499, + "balance_loss_mlp": 1.03791249, + "epoch": 0.7537514428626395, + "flos": 519162421248.0, + "grad_norm": 0.06623718225432344, + "language_loss": 0.80174196, + "learning_rate": 0.00015076832701521793, + "loss": 0.81240696, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.28588867, + "step": 3918, + "time_per_iteration": 2.7410969734191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070943, + "balance_loss_mlp": 1.04238045, + "epoch": 0.753943824547903, + "flos": 723309571584.0, + "grad_norm": 0.06658372042006708, + "language_loss": 0.81702781, + "learning_rate": 0.000150545440866462, + "loss": 0.82773727, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.28540039, + "step": 3919, + "time_per_iteration": 2.9761922359466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069188, + "balance_loss_mlp": 1.04143584, + "epoch": 0.7541362062331666, + "flos": 437318203392.0, + "grad_norm": 0.07410111643216553, + "language_loss": 0.78494799, + "learning_rate": 0.000150322690387998, + "loss": 0.79563987, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.27758789, + "step": 3920, + "time_per_iteration": 2.516460657119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071675, + "balance_loss_mlp": 1.04316044, + "epoch": 0.7543285879184302, + "flos": 565007491584.0, + "grad_norm": 0.05131276366098942, + "language_loss": 0.74961436, + "learning_rate": 0.00015010007566630535, + "loss": 0.76033103, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.28491211, + "step": 3921, + "time_per_iteration": 2.7329115867614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071062, + "balance_loss_mlp": 1.04383469, + "epoch": 0.7545209696036937, + "flos": 520781833728.0, + "grad_norm": 0.07801712247115837, + "language_loss": 0.81558347, + "learning_rate": 0.00014987759678781077, + "loss": 0.82629412, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.27246094, + "step": 3922, + "time_per_iteration": 2.611708641052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071556, + "balance_loss_mlp": 1.04370856, + "epoch": 0.7547133512889573, + "flos": 615782020608.0, + "grad_norm": 0.05153768257221068, + "language_loss": 0.82422328, + "learning_rate": 0.00014965525383888795, + "loss": 0.83493882, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.27856445, + "step": 3923, + "time_per_iteration": 2.7729198932647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072323, + "balance_loss_mlp": 1.04433274, + "epoch": 0.7549057329742208, + "flos": 750522157056.0, + "grad_norm": 0.0575234231525959, + "language_loss": 0.7209577, + "learning_rate": 0.00014943304690585851, + "loss": 0.73168093, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.2800293, + "step": 3924, + "time_per_iteration": 2.9442129135131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071679, + "balance_loss_mlp": 1.04378402, + "epoch": 0.7550981146594844, + "flos": 514193674752.0, + "grad_norm": 0.07421500953939195, + "language_loss": 0.79421008, + "learning_rate": 0.0001492109760749908, + "loss": 0.80492687, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.27905273, + "step": 3925, + "time_per_iteration": 2.643162965774536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071892, + "balance_loss_mlp": 1.04392564, + "epoch": 0.755290496344748, + "flos": 521756674560.0, + "grad_norm": 0.059903848409534166, + "language_loss": 0.79955506, + "learning_rate": 0.00014898904143250002, + "loss": 0.81027395, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.27978516, + "step": 3926, + "time_per_iteration": 2.6683785915374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013114, + "balance_loss_mlp": 1.00157464, + "epoch": 0.7554828780300116, + "flos": 1413845521920.0, + "grad_norm": 0.014723160486699832, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76768315, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.11523438, + "step": 3927, + "time_per_iteration": 4.920205354690552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071852, + "balance_loss_mlp": 1.04331291, + "epoch": 0.7556752597152752, + "flos": 556676264448.0, + "grad_norm": 0.05563270173237852, + "language_loss": 0.80196631, + "learning_rate": 0.0001485455810572474, + "loss": 0.81268483, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.28540039, + "step": 3928, + "time_per_iteration": 2.6541106700897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073533, + "balance_loss_mlp": 1.04499388, + "epoch": 0.7558676414005386, + "flos": 563363347968.0, + "grad_norm": 0.04999178273670638, + "language_loss": 0.84088999, + "learning_rate": 0.00014832405549665236, + "loss": 0.85162532, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.28564453, + "step": 3929, + "time_per_iteration": 2.6799492835998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070648, + "balance_loss_mlp": 1.04203749, + "epoch": 0.7560600230858022, + "flos": 561089189376.0, + "grad_norm": 0.061253165396126415, + "language_loss": 0.78636932, + "learning_rate": 0.00014810266646876746, + "loss": 0.79707581, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.28613281, + "step": 3930, + "time_per_iteration": 2.7644495964050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068729, + "balance_loss_mlp": 1.03980851, + "epoch": 0.7562524047710658, + "flos": 719232708096.0, + "grad_norm": 0.0768252646204266, + "language_loss": 0.77379584, + "learning_rate": 0.00014788141405954364, + "loss": 0.78448313, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.28930664, + "step": 3931, + "time_per_iteration": 2.996284246444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072748, + "balance_loss_mlp": 1.04418492, + "epoch": 0.7564447864563294, + "flos": 543086719488.0, + "grad_norm": 0.07792136157882237, + "language_loss": 0.84719956, + "learning_rate": 0.00014766029835487865, + "loss": 0.85792696, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.28564453, + "step": 3932, + "time_per_iteration": 2.7055630683898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010723, + "balance_loss_mlp": 1.04326117, + "epoch": 0.7566371681415929, + "flos": 725484805632.0, + "grad_norm": 0.0830870815556461, + "language_loss": 0.79488772, + "learning_rate": 0.0001474393194406173, + "loss": 0.80561072, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.29052734, + "step": 3933, + "time_per_iteration": 2.8866286277770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075045, + "balance_loss_mlp": 1.04583836, + "epoch": 0.7568295498268565, + "flos": 576274825728.0, + "grad_norm": 0.06997934005865011, + "language_loss": 0.79262674, + "learning_rate": 0.00014721847740255112, + "loss": 0.80337715, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.29174805, + "step": 3934, + "time_per_iteration": 2.8177120685577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013149, + "balance_loss_mlp": 1.00151432, + "epoch": 0.75702193151212, + "flos": 1519273594368.0, + "grad_norm": 0.018539216642102736, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.74925071, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.11621094, + "step": 3935, + "time_per_iteration": 4.663410186767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070009, + "balance_loss_mlp": 1.04085028, + "epoch": 0.7572143131973836, + "flos": 525218079744.0, + "grad_norm": 0.08081636486404137, + "language_loss": 0.7884202, + "learning_rate": 0.00014677720429790526, + "loss": 0.79912031, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.29125977, + "step": 3936, + "time_per_iteration": 2.5801281929016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106807, + "balance_loss_mlp": 1.03791022, + "epoch": 0.7574066948826472, + "flos": 550467836928.0, + "grad_norm": 0.05183566311050574, + "language_loss": 0.8430894, + "learning_rate": 0.0001465567734026429, + "loss": 0.85377008, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.30126953, + "step": 3937, + "time_per_iteration": 2.711367607116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071543, + "balance_loss_mlp": 1.0420028, + "epoch": 0.7575990765679107, + "flos": 395682416640.0, + "grad_norm": 0.061048992240079196, + "language_loss": 0.82235777, + "learning_rate": 0.00014633647972621034, + "loss": 0.83307326, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.29492188, + "step": 3938, + "time_per_iteration": 2.4616081714630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067246, + "balance_loss_mlp": 1.03861201, + "epoch": 0.7577914582531743, + "flos": 584742855168.0, + "grad_norm": 0.05374365085178841, + "language_loss": 0.86112857, + "learning_rate": 0.00014611632335413354, + "loss": 0.87180108, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.28637695, + "step": 3939, + "time_per_iteration": 2.815455436706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061142, + "balance_loss_mlp": 1.03296053, + "epoch": 0.7579838399384379, + "flos": 820604265984.0, + "grad_norm": 0.05753060969911492, + "language_loss": 0.82291019, + "learning_rate": 0.00014589630437188456, + "loss": 0.8335216, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.28222656, + "step": 3940, + "time_per_iteration": 3.190596580505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065405, + "balance_loss_mlp": 1.03698504, + "epoch": 0.7581762216237015, + "flos": 443664843264.0, + "grad_norm": 0.07206463977261317, + "language_loss": 0.78593653, + "learning_rate": 0.00014567642286488253, + "loss": 0.79659057, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.28466797, + "step": 3941, + "time_per_iteration": 2.5607380867004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071035, + "balance_loss_mlp": 1.04073191, + "epoch": 0.7583686033089649, + "flos": 540624886272.0, + "grad_norm": 0.06381401552287866, + "language_loss": 0.79120469, + "learning_rate": 0.00014545667891849258, + "loss": 0.80191505, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.30249023, + "step": 3942, + "time_per_iteration": 2.6117217540740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071107, + "balance_loss_mlp": 1.04192472, + "epoch": 0.7585609849942285, + "flos": 522332845056.0, + "grad_norm": 0.05226186971292142, + "language_loss": 0.82272542, + "learning_rate": 0.00014523707261802733, + "loss": 0.83343649, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.29174805, + "step": 3943, + "time_per_iteration": 2.665384292602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072852, + "balance_loss_mlp": 1.04448068, + "epoch": 0.7587533666794921, + "flos": 541599727104.0, + "grad_norm": 0.07358446075620559, + "language_loss": 0.81266546, + "learning_rate": 0.00014501760404874527, + "loss": 0.823394, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.28344727, + "step": 3944, + "time_per_iteration": 2.723860263824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076046, + "balance_loss_mlp": 1.04698288, + "epoch": 0.7589457483647557, + "flos": 606131126784.0, + "grad_norm": 0.059139493232711386, + "language_loss": 0.85488701, + "learning_rate": 0.00014479827329585176, + "loss": 0.86564749, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.29052734, + "step": 3945, + "time_per_iteration": 2.6966402530670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070791, + "balance_loss_mlp": 1.04260945, + "epoch": 0.7591381300500193, + "flos": 554821125120.0, + "grad_norm": 0.05454852499248085, + "language_loss": 0.84753144, + "learning_rate": 0.00014457908044449846, + "loss": 0.85823941, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.28173828, + "step": 3946, + "time_per_iteration": 2.751542329788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069083, + "balance_loss_mlp": 1.0412122, + "epoch": 0.7593305117352828, + "flos": 529399660032.0, + "grad_norm": 0.057352771815407315, + "language_loss": 0.82947516, + "learning_rate": 0.00014436002557978371, + "loss": 0.84016603, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.27856445, + "step": 3947, + "time_per_iteration": 2.8199281692504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024854, + "balance_loss_mlp": 1.0139817, + "epoch": 0.7595228934205464, + "flos": 1502020412928.0, + "grad_norm": 0.01569529231199887, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77667999, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.10888672, + "step": 3948, + "time_per_iteration": 4.886767387390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071405, + "balance_loss_mlp": 1.04217458, + "epoch": 0.7597152751058099, + "flos": 455290149888.0, + "grad_norm": 0.052184618076363286, + "language_loss": 0.79761183, + "learning_rate": 0.0001439223301503945, + "loss": 0.80832583, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.29223633, + "step": 3949, + "time_per_iteration": 2.524615526199341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107458, + "balance_loss_mlp": 1.04644656, + "epoch": 0.7599076567910735, + "flos": 685135190016.0, + "grad_norm": 0.06319987538441409, + "language_loss": 0.76281846, + "learning_rate": 0.00014370368975564834, + "loss": 0.77356422, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.28112793, + "step": 3950, + "time_per_iteration": 2.9306294918060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073257, + "balance_loss_mlp": 1.045434, + "epoch": 0.760100038476337, + "flos": 532092837888.0, + "grad_norm": 0.07868227598634299, + "language_loss": 0.83049744, + "learning_rate": 0.00014348518768739766, + "loss": 0.84123003, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.27832031, + "step": 3951, + "time_per_iteration": 2.7313663959503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027286, + "balance_loss_mlp": 1.01646149, + "epoch": 0.7602924201616006, + "flos": 1470952134144.0, + "grad_norm": 0.015467940128204082, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.77755326, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.10839844, + "step": 3952, + "time_per_iteration": 4.869096994400024 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107331, + "balance_loss_mlp": 1.04593956, + "epoch": 0.7604848018468642, + "flos": 774280539648.0, + "grad_norm": 0.05530347415553069, + "language_loss": 0.86385798, + "learning_rate": 0.00014304859886964867, + "loss": 0.87459111, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 0.27441406, + "step": 3953, + "time_per_iteration": 3.04145884513855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073306, + "balance_loss_mlp": 1.04591215, + "epoch": 0.7606771835321278, + "flos": 557917355520.0, + "grad_norm": 0.05036114884340379, + "language_loss": 0.83556843, + "learning_rate": 0.00014283051228964878, + "loss": 0.8463015, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.27416992, + "step": 3954, + "time_per_iteration": 2.694143772125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072565, + "balance_loss_mlp": 1.0455761, + "epoch": 0.7608695652173914, + "flos": 525139504128.0, + "grad_norm": 0.07332559246133831, + "language_loss": 0.82520175, + "learning_rate": 0.00014261256437514197, + "loss": 0.83592749, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.27026367, + "step": 3955, + "time_per_iteration": 2.644928455352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081954, + "balance_loss_mlp": 1.05405927, + "epoch": 0.7610619469026548, + "flos": 614757717504.0, + "grad_norm": 0.0938811683144382, + "language_loss": 0.82110238, + "learning_rate": 0.0001423947552107428, + "loss": 0.83192188, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.27929688, + "step": 3956, + "time_per_iteration": 2.7390809059143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_mlp": 1.0495677, + "epoch": 0.7612543285879184, + "flos": 862992313344.0, + "grad_norm": 0.058156679645763765, + "language_loss": 0.77027428, + "learning_rate": 0.00014217708488101243, + "loss": 0.78105605, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.28637695, + "step": 3957, + "time_per_iteration": 3.068586587905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078853, + "balance_loss_mlp": 1.0505054, + "epoch": 0.761446710273182, + "flos": 553392359424.0, + "grad_norm": 0.051838175229669575, + "language_loss": 0.76812273, + "learning_rate": 0.0001419595534704579, + "loss": 0.77891129, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.28369141, + "step": 3958, + "time_per_iteration": 2.6755166053771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078829, + "balance_loss_mlp": 1.05176806, + "epoch": 0.7616390919584456, + "flos": 467107513344.0, + "grad_norm": 0.08007848421566002, + "language_loss": 0.80974507, + "learning_rate": 0.00014174216106353237, + "loss": 0.82053339, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.27124023, + "step": 3959, + "time_per_iteration": 2.6076533794403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077365, + "balance_loss_mlp": 1.04985189, + "epoch": 0.7618314736437091, + "flos": 498181584384.0, + "grad_norm": 0.05778330536162942, + "language_loss": 0.75894332, + "learning_rate": 0.00014152490774463512, + "loss": 0.76971698, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.27539062, + "step": 3960, + "time_per_iteration": 2.690720558166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079227, + "balance_loss_mlp": 1.05211914, + "epoch": 0.7620238553289727, + "flos": 434319487488.0, + "grad_norm": 0.07078023204432035, + "language_loss": 0.86778873, + "learning_rate": 0.00014130779359811135, + "loss": 0.87858105, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.27148438, + "step": 3961, + "time_per_iteration": 2.479097843170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074576, + "balance_loss_mlp": 1.04672933, + "epoch": 0.7622162370142362, + "flos": 663962296320.0, + "grad_norm": 0.053637952879954945, + "language_loss": 0.85656244, + "learning_rate": 0.0001410908187082521, + "loss": 0.86730814, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.27856445, + "step": 3962, + "time_per_iteration": 2.8493921756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073404, + "balance_loss_mlp": 1.04527116, + "epoch": 0.7624086186994998, + "flos": 557700567552.0, + "grad_norm": 0.06361910700745704, + "language_loss": 0.82962865, + "learning_rate": 0.0001408739831592949, + "loss": 0.84036273, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.28149414, + "step": 3963, + "time_per_iteration": 2.670091152191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072269, + "balance_loss_mlp": 1.04530358, + "epoch": 0.7626010003847634, + "flos": 628844857344.0, + "grad_norm": 0.06318704886131189, + "language_loss": 0.77098757, + "learning_rate": 0.0001406572870354224, + "loss": 0.78171021, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.27001953, + "step": 3964, + "time_per_iteration": 2.8136370182037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076051, + "balance_loss_mlp": 1.04758406, + "epoch": 0.7627933820700269, + "flos": 437716873728.0, + "grad_norm": 0.08123777777865493, + "language_loss": 0.87067986, + "learning_rate": 0.00014044073042076337, + "loss": 0.88144034, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.28491211, + "step": 3965, + "time_per_iteration": 2.601212739944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077118, + "balance_loss_mlp": 1.04948556, + "epoch": 0.7629857637552905, + "flos": 532456602624.0, + "grad_norm": 0.044562098322040423, + "language_loss": 0.88958192, + "learning_rate": 0.00014022431339939302, + "loss": 0.90035319, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.27636719, + "step": 3966, + "time_per_iteration": 2.6651570796966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069045, + "balance_loss_mlp": 1.04119754, + "epoch": 0.7631781454405541, + "flos": 679737249792.0, + "grad_norm": 0.09228261412980937, + "language_loss": 0.77959037, + "learning_rate": 0.00014000803605533163, + "loss": 0.79028082, + "num_input_tokens_seen": 329034960, + "router_z_loss_mlp": 0.27856445, + "step": 3967, + "time_per_iteration": 2.8413825035095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070975, + "balance_loss_mlp": 1.04367566, + "epoch": 0.7633705271258177, + "flos": 507246133248.0, + "grad_norm": 0.08332228620070425, + "language_loss": 0.83150613, + "learning_rate": 0.00013979189847254553, + "loss": 0.8422159, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.27294922, + "step": 3968, + "time_per_iteration": 2.578245162963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071362, + "balance_loss_mlp": 1.04282331, + "epoch": 0.7635629088110811, + "flos": 618574123008.0, + "grad_norm": 0.06392054280336681, + "language_loss": 0.80515426, + "learning_rate": 0.00013957590073494674, + "loss": 0.8158679, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.28540039, + "step": 3969, + "time_per_iteration": 2.7899181842803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069143, + "balance_loss_mlp": 1.04182076, + "epoch": 0.7637552904963447, + "flos": 638140750848.0, + "grad_norm": 0.08725250729100972, + "language_loss": 0.7866261, + "learning_rate": 0.0001393600429263931, + "loss": 0.7973175, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.2734375, + "step": 3970, + "time_per_iteration": 2.7429044246673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010211, + "balance_loss_mlp": 0.99867129, + "epoch": 0.7639476721816083, + "flos": 1562359905792.0, + "grad_norm": 0.0172148744606984, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.75755095, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.11523438, + "step": 3971, + "time_per_iteration": 4.9502363204956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066404, + "balance_loss_mlp": 1.03834224, + "epoch": 0.7641400538668719, + "flos": 495729925632.0, + "grad_norm": 0.05751268278268784, + "language_loss": 0.81411171, + "learning_rate": 0.0001389287474315804, + "loss": 0.8247757, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.28076172, + "step": 3972, + "time_per_iteration": 2.6566832065582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070413, + "balance_loss_mlp": 1.04213631, + "epoch": 0.7643324355521355, + "flos": 578173635072.0, + "grad_norm": 0.05008758615727923, + "language_loss": 0.8002165, + "learning_rate": 0.00013871330991276505, + "loss": 0.81092072, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.28295898, + "step": 3973, + "time_per_iteration": 2.7023086547851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071476, + "balance_loss_mlp": 1.04334247, + "epoch": 0.764524817237399, + "flos": 784472698368.0, + "grad_norm": 0.061481835950818894, + "language_loss": 0.80452615, + "learning_rate": 0.00013849801265788247, + "loss": 0.81524092, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.28149414, + "step": 3974, + "time_per_iteration": 2.997316837310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067079, + "balance_loss_mlp": 1.03861213, + "epoch": 0.7647171989226625, + "flos": 526025594880.0, + "grad_norm": 0.07226378616877399, + "language_loss": 0.82833815, + "learning_rate": 0.00013828285575051818, + "loss": 0.83900893, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.28466797, + "step": 3975, + "time_per_iteration": 2.588979721069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062123, + "balance_loss_mlp": 1.03437066, + "epoch": 0.7649095806079261, + "flos": 554589780480.0, + "grad_norm": 0.06463560472951296, + "language_loss": 0.83791184, + "learning_rate": 0.0001380678392742035, + "loss": 0.84853303, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.27783203, + "step": 3976, + "time_per_iteration": 2.734581708908081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061699, + "balance_loss_mlp": 1.03378069, + "epoch": 0.7651019622931897, + "flos": 648836296704.0, + "grad_norm": 0.05082413379641715, + "language_loss": 0.84568453, + "learning_rate": 0.00013785296331241526, + "loss": 0.85630155, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.27954102, + "step": 3977, + "time_per_iteration": 2.9020192623138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065476, + "balance_loss_mlp": 1.03727102, + "epoch": 0.7652943439784533, + "flos": 1046034971136.0, + "grad_norm": 0.0974531570465959, + "language_loss": 0.86962479, + "learning_rate": 0.00013763822794857583, + "loss": 0.88027954, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.28222656, + "step": 3978, + "time_per_iteration": 3.2940611839294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066351, + "balance_loss_mlp": 1.03847969, + "epoch": 0.7654867256637168, + "flos": 504085883904.0, + "grad_norm": 0.06678664441020601, + "language_loss": 0.89705759, + "learning_rate": 0.00013742363326605278, + "loss": 0.9077211, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.27880859, + "step": 3979, + "time_per_iteration": 2.717656135559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064419, + "balance_loss_mlp": 1.03473556, + "epoch": 0.7656791073489804, + "flos": 574422658560.0, + "grad_norm": 0.10335635669358377, + "language_loss": 0.78531003, + "learning_rate": 0.00013720917934815935, + "loss": 0.79595423, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.296875, + "step": 3980, + "time_per_iteration": 2.7627711296081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067901, + "balance_loss_mlp": 1.03960097, + "epoch": 0.765871489034244, + "flos": 492568266240.0, + "grad_norm": 0.07286561915101249, + "language_loss": 0.82861632, + "learning_rate": 0.00013699486627815344, + "loss": 0.83929539, + "num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.28295898, + "step": 3981, + "time_per_iteration": 2.612478494644165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068523, + "balance_loss_mlp": 1.04024673, + "epoch": 0.7660638707195075, + "flos": 485769111552.0, + "grad_norm": 0.05570598750158071, + "language_loss": 0.82202697, + "learning_rate": 0.00013678069413923928, + "loss": 0.83271217, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.28320312, + "step": 3982, + "time_per_iteration": 2.586998701095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067122, + "balance_loss_mlp": 1.03844047, + "epoch": 0.766256252404771, + "flos": 444059131392.0, + "grad_norm": 0.07121708811283338, + "language_loss": 0.81735259, + "learning_rate": 0.00013656666301456555, + "loss": 0.82802379, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.28662109, + "step": 3983, + "time_per_iteration": 2.574695587158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066984, + "balance_loss_mlp": 1.03906524, + "epoch": 0.7664486340900346, + "flos": 484922308608.0, + "grad_norm": 0.055314975613937604, + "language_loss": 0.83996785, + "learning_rate": 0.0001363527729872267, + "loss": 0.85063773, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.27929688, + "step": 3984, + "time_per_iteration": 2.6829311847686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069719, + "balance_loss_mlp": 1.04191911, + "epoch": 0.7666410157752982, + "flos": 645905981952.0, + "grad_norm": 0.061166263195475266, + "language_loss": 0.76441991, + "learning_rate": 0.00013613902414026207, + "loss": 0.77511704, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.27832031, + "step": 3985, + "time_per_iteration": 2.7802467346191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067154, + "balance_loss_mlp": 1.03947425, + "epoch": 0.7668333974605618, + "flos": 773964827136.0, + "grad_norm": 0.05402447635552578, + "language_loss": 0.82339627, + "learning_rate": 0.00013592541655665642, + "loss": 0.83406782, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.27709961, + "step": 3986, + "time_per_iteration": 2.9866812229156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070906, + "balance_loss_mlp": 1.04255819, + "epoch": 0.7670257791458254, + "flos": 613200913920.0, + "grad_norm": 0.07328879507268711, + "language_loss": 0.85332406, + "learning_rate": 0.00013571195031933947, + "loss": 0.86403316, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.28320312, + "step": 3987, + "time_per_iteration": 2.673912525177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016637, + "balance_loss_mlp": 1.00533557, + "epoch": 0.7672181608310888, + "flos": 1484608670208.0, + "grad_norm": 0.005208486185004438, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.81497979, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.11279297, + "step": 3988, + "time_per_iteration": 4.698279619216919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069975, + "balance_loss_mlp": 1.04217589, + "epoch": 0.7674105425163524, + "flos": 610449509376.0, + "grad_norm": 0.06677874529098146, + "language_loss": 0.85441434, + "learning_rate": 0.00013528544221501655, + "loss": 0.86511409, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.27832031, + "step": 3989, + "time_per_iteration": 2.7262814044952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079413, + "balance_loss_mlp": 1.05132711, + "epoch": 0.767602924201616, + "flos": 844857423360.0, + "grad_norm": 0.06376913662917556, + "language_loss": 0.81445122, + "learning_rate": 0.00013507240051359586, + "loss": 0.82524538, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.28100586, + "step": 3990, + "time_per_iteration": 3.0680136680603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076429, + "balance_loss_mlp": 1.04944038, + "epoch": 0.7677953058868796, + "flos": 526857841152.0, + "grad_norm": 0.06248947721820998, + "language_loss": 0.85939497, + "learning_rate": 0.00013485950048963425, + "loss": 0.87015927, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.27026367, + "step": 3991, + "time_per_iteration": 2.652700424194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072898, + "balance_loss_mlp": 1.04629004, + "epoch": 0.7679876875721431, + "flos": 923161660416.0, + "grad_norm": 0.05838140649114419, + "language_loss": 0.82813108, + "learning_rate": 0.00013464674222578643, + "loss": 0.83886003, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.26660156, + "step": 3992, + "time_per_iteration": 3.199664354324341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078059, + "balance_loss_mlp": 1.05028319, + "epoch": 0.7681800692574067, + "flos": 457855289856.0, + "grad_norm": 0.060819943301615054, + "language_loss": 0.8307544, + "learning_rate": 0.00013443412580465292, + "loss": 0.84153497, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.27783203, + "step": 3993, + "time_per_iteration": 2.6216468811035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077791, + "balance_loss_mlp": 1.04999137, + "epoch": 0.7683724509426703, + "flos": 658113251328.0, + "grad_norm": 0.05683440391019819, + "language_loss": 0.83944607, + "learning_rate": 0.00013422165130877857, + "loss": 0.85022402, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.27807617, + "step": 3994, + "time_per_iteration": 2.8932595252990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077464, + "balance_loss_mlp": 1.05011749, + "epoch": 0.7685648326279338, + "flos": 555021946368.0, + "grad_norm": 0.058104534387139244, + "language_loss": 0.80272782, + "learning_rate": 0.00013400931882065327, + "loss": 0.81350249, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.27392578, + "step": 3995, + "time_per_iteration": 2.6307244300842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107594, + "balance_loss_mlp": 1.04768717, + "epoch": 0.7687572143131974, + "flos": 687070315008.0, + "grad_norm": 0.08323850441020555, + "language_loss": 0.80980253, + "learning_rate": 0.0001337971284227118, + "loss": 0.82056189, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.28222656, + "step": 3996, + "time_per_iteration": 3.022775411605835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025127, + "balance_loss_mlp": 1.01415932, + "epoch": 0.7689495959984609, + "flos": 1488653448192.0, + "grad_norm": 0.008597329334489423, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.7714355, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.10986328, + "step": 3997, + "time_per_iteration": 4.959140777587891 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073396, + "balance_loss_mlp": 1.0458113, + "epoch": 0.7691419776837245, + "flos": 570133389312.0, + "grad_norm": 0.05719845249799778, + "language_loss": 0.80268121, + "learning_rate": 0.0001333731742268438, + "loss": 0.81341517, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.27636719, + "step": 3998, + "time_per_iteration": 2.6925253868103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078887, + "balance_loss_mlp": 1.05142081, + "epoch": 0.7693343593689881, + "flos": 519812785152.0, + "grad_norm": 0.05688018347037518, + "language_loss": 0.85395527, + "learning_rate": 0.0001331614105935109, + "loss": 0.86474419, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.27514648, + "step": 3999, + "time_per_iteration": 2.653233051300049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076072, + "balance_loss_mlp": 1.04843915, + "epoch": 0.7695267410542517, + "flos": 660086254080.0, + "grad_norm": 0.05160358655207702, + "language_loss": 0.84470475, + "learning_rate": 0.00013294978937954883, + "loss": 0.85546547, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.27685547, + "step": 4000, + "time_per_iteration": 2.776451349258423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073445, + "balance_loss_mlp": 1.04562187, + "epoch": 0.7697191227395151, + "flos": 546548124672.0, + "grad_norm": 0.08124921192431957, + "language_loss": 0.8516435, + "learning_rate": 0.00013273831066711655, + "loss": 0.862378, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.27856445, + "step": 4001, + "time_per_iteration": 2.624626874923706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075558, + "balance_loss_mlp": 1.04903352, + "epoch": 0.7699115044247787, + "flos": 540339697152.0, + "grad_norm": 0.06596404445695028, + "language_loss": 0.79911482, + "learning_rate": 0.00013252697453831747, + "loss": 0.80987036, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.26574707, + "step": 4002, + "time_per_iteration": 2.714096784591675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072085, + "balance_loss_mlp": 1.04480982, + "epoch": 0.7701038861100423, + "flos": 562635818496.0, + "grad_norm": 0.05249171180112231, + "language_loss": 0.82409763, + "learning_rate": 0.00013231578107519916, + "loss": 0.83481848, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.27319336, + "step": 4003, + "time_per_iteration": 2.8834095001220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073589, + "balance_loss_mlp": 1.04602814, + "epoch": 0.7702962677953059, + "flos": 481490016768.0, + "grad_norm": 0.06222122285204978, + "language_loss": 0.82945186, + "learning_rate": 0.00013210473035975422, + "loss": 0.84018773, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.27587891, + "step": 4004, + "time_per_iteration": 2.5676841735839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075243, + "balance_loss_mlp": 1.04756224, + "epoch": 0.7704886494805695, + "flos": 770036350464.0, + "grad_norm": 0.09382472586261968, + "language_loss": 0.85468185, + "learning_rate": 0.0001318938224739201, + "loss": 0.86543441, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.27734375, + "step": 4005, + "time_per_iteration": 3.032289743423462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072979, + "balance_loss_mlp": 1.04544115, + "epoch": 0.770681031165833, + "flos": 600912096768.0, + "grad_norm": 0.05515917324758249, + "language_loss": 0.83841556, + "learning_rate": 0.00013168305749957843, + "loss": 0.84914535, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.27587891, + "step": 4006, + "time_per_iteration": 2.739898920059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074884, + "balance_loss_mlp": 1.04765701, + "epoch": 0.7708734128510966, + "flos": 495862345728.0, + "grad_norm": 0.05387672734187661, + "language_loss": 0.8264026, + "learning_rate": 0.00013147243551855532, + "loss": 0.83715147, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.27270508, + "step": 4007, + "time_per_iteration": 2.5597212314605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071212, + "balance_loss_mlp": 1.04398441, + "epoch": 0.7710657945363601, + "flos": 567012427776.0, + "grad_norm": 0.05404718589625755, + "language_loss": 0.80486447, + "learning_rate": 0.00013126195661262148, + "loss": 0.81557661, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.27270508, + "step": 4008, + "time_per_iteration": 2.744112968444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070425, + "balance_loss_mlp": 1.043365, + "epoch": 0.7712581762216237, + "flos": 604251256320.0, + "grad_norm": 0.04619128213129889, + "language_loss": 0.86330914, + "learning_rate": 0.00013105162086349216, + "loss": 0.87401342, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.27099609, + "step": 4009, + "time_per_iteration": 2.801823616027832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072467, + "balance_loss_mlp": 1.04526305, + "epoch": 0.7714505579068872, + "flos": 530620402176.0, + "grad_norm": 0.04727817553621032, + "language_loss": 0.86132288, + "learning_rate": 0.00013084142835282687, + "loss": 0.8720476, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.2722168, + "step": 4010, + "time_per_iteration": 2.6556901931762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020489, + "balance_loss_mlp": 1.00937891, + "epoch": 0.7716429395921508, + "flos": 1421414313984.0, + "grad_norm": 0.005772893743499722, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.80904853, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.11132812, + "step": 4011, + "time_per_iteration": 4.782922744750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073052, + "balance_loss_mlp": 1.04520464, + "epoch": 0.7718353212774144, + "flos": 578140139520.0, + "grad_norm": 0.05569724258515983, + "language_loss": 0.89507568, + "learning_rate": 0.0001304214733732485, + "loss": 0.90580624, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.27832031, + "step": 4012, + "time_per_iteration": 2.715064525604248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075719, + "balance_loss_mlp": 1.04758501, + "epoch": 0.772027702962678, + "flos": 510486368256.0, + "grad_norm": 0.06797042537174566, + "language_loss": 0.82429183, + "learning_rate": 0.00013021171106737672, + "loss": 0.83504903, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.28125, + "step": 4013, + "time_per_iteration": 2.658712863922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076283, + "balance_loss_mlp": 1.04867363, + "epoch": 0.7722200846479416, + "flos": 525391197696.0, + "grad_norm": 0.05000868356723149, + "language_loss": 0.7937907, + "learning_rate": 0.00013000209232605071, + "loss": 0.80455357, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.27636719, + "step": 4014, + "time_per_iteration": 2.6712594032287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073016, + "balance_loss_mlp": 1.04535961, + "epoch": 0.772412466333205, + "flos": 479348278272.0, + "grad_norm": 0.05134661435861922, + "language_loss": 0.79622269, + "learning_rate": 0.0001297926172306519, + "loss": 0.80695289, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.27685547, + "step": 4015, + "time_per_iteration": 2.610283613204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071717, + "balance_loss_mlp": 1.04420376, + "epoch": 0.7726048480184686, + "flos": 905284256256.0, + "grad_norm": 0.05687508890981391, + "language_loss": 0.78788078, + "learning_rate": 0.0001295832858625055, + "loss": 0.79859793, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.27539062, + "step": 4016, + "time_per_iteration": 3.2706351280212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068156, + "balance_loss_mlp": 1.04054761, + "epoch": 0.7727972297037322, + "flos": 631085520384.0, + "grad_norm": 0.052610449581979135, + "language_loss": 0.69848269, + "learning_rate": 0.00012937409830288154, + "loss": 0.70916426, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.27636719, + "step": 4017, + "time_per_iteration": 2.8540306091308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069412, + "balance_loss_mlp": 1.04220808, + "epoch": 0.7729896113889958, + "flos": 414565185024.0, + "grad_norm": 0.0635987545876438, + "language_loss": 0.85103798, + "learning_rate": 0.00012916505463299362, + "loss": 0.86173213, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.27246094, + "step": 4018, + "time_per_iteration": 2.495150089263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070913, + "balance_loss_mlp": 1.0439713, + "epoch": 0.7731819930742593, + "flos": 668609538048.0, + "grad_norm": 0.05824058585066258, + "language_loss": 0.7791152, + "learning_rate": 0.00012895615493399972, + "loss": 0.78982437, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.26977539, + "step": 4019, + "time_per_iteration": 2.813354015350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070454, + "balance_loss_mlp": 1.04334593, + "epoch": 0.7733743747595229, + "flos": 489604455936.0, + "grad_norm": 0.0836786402257782, + "language_loss": 0.82400632, + "learning_rate": 0.00012874739928700192, + "loss": 0.83471084, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.27148438, + "step": 4020, + "time_per_iteration": 2.559576988220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068211, + "balance_loss_mlp": 1.0395534, + "epoch": 0.7735667564447865, + "flos": 659294705664.0, + "grad_norm": 0.06159530150970634, + "language_loss": 0.79701376, + "learning_rate": 0.00012853878777304624, + "loss": 0.80769587, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.28662109, + "step": 4021, + "time_per_iteration": 2.8569796085357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072395, + "balance_loss_mlp": 1.04528701, + "epoch": 0.77375913813005, + "flos": 533106966528.0, + "grad_norm": 0.052906319794948725, + "language_loss": 0.84479654, + "learning_rate": 0.000128330320473123, + "loss": 0.85552055, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.27172852, + "step": 4022, + "time_per_iteration": 2.715498208999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013154, + "balance_loss_mlp": 1.0020442, + "epoch": 0.7739515198153136, + "flos": 1519260447744.0, + "grad_norm": 0.015943225392078396, + "language_loss": 0.783319, + "learning_rate": 0.00012812199746816628, + "loss": 0.79345053, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.11132812, + "step": 4023, + "time_per_iteration": 4.888492107391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073117, + "balance_loss_mlp": 1.04610443, + "epoch": 0.7741439015005771, + "flos": 639819800064.0, + "grad_norm": 0.06091537077025671, + "language_loss": 0.81350756, + "learning_rate": 0.0001279138188390543, + "loss": 0.82423878, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.27050781, + "step": 4024, + "time_per_iteration": 2.766850233078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073924, + "balance_loss_mlp": 1.04622006, + "epoch": 0.7743362831858407, + "flos": 665546803200.0, + "grad_norm": 0.05776515915351722, + "language_loss": 0.86359525, + "learning_rate": 0.00012770578466660915, + "loss": 0.87433445, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.27758789, + "step": 4025, + "time_per_iteration": 2.8906335830688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074784, + "balance_loss_mlp": 1.04703164, + "epoch": 0.7745286648711043, + "flos": 562453936128.0, + "grad_norm": 0.05700523887714171, + "language_loss": 0.81593072, + "learning_rate": 0.0001274978950315968, + "loss": 0.82667857, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.27807617, + "step": 4026, + "time_per_iteration": 2.8301045894622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077832, + "balance_loss_mlp": 1.05058098, + "epoch": 0.7747210465563679, + "flos": 516651125760.0, + "grad_norm": 0.0689539575699981, + "language_loss": 0.82650018, + "learning_rate": 0.00012729015001472716, + "loss": 0.83727849, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.27258301, + "step": 4027, + "time_per_iteration": 2.6426851749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071106, + "balance_loss_mlp": 1.04371142, + "epoch": 0.7749134282416313, + "flos": 633921292800.0, + "grad_norm": 0.05627311162483831, + "language_loss": 0.81452388, + "learning_rate": 0.00012708254969665418, + "loss": 0.82523495, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.27416992, + "step": 4028, + "time_per_iteration": 2.7853105068206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107679, + "balance_loss_mlp": 1.04922891, + "epoch": 0.7751058099268949, + "flos": 495118849536.0, + "grad_norm": 0.06575328123428556, + "language_loss": 0.83176428, + "learning_rate": 0.00012687509415797526, + "loss": 0.84253216, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.27587891, + "step": 4029, + "time_per_iteration": 2.5962271690368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075768, + "balance_loss_mlp": 1.04808736, + "epoch": 0.7752981916121585, + "flos": 510048410112.0, + "grad_norm": 0.0626546531948414, + "language_loss": 0.81091148, + "learning_rate": 0.00012666778347923208, + "loss": 0.82166916, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.27709961, + "step": 4030, + "time_per_iteration": 2.647709369659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078164, + "balance_loss_mlp": 1.04998243, + "epoch": 0.7754905732974221, + "flos": 497295493632.0, + "grad_norm": 0.044509265947171146, + "language_loss": 0.83753759, + "learning_rate": 0.0001264606177409092, + "loss": 0.84831923, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.28198242, + "step": 4031, + "time_per_iteration": 2.6404576301574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071409, + "balance_loss_mlp": 1.04437256, + "epoch": 0.7756829549826857, + "flos": 480486062592.0, + "grad_norm": 0.05920145784509139, + "language_loss": 0.85917544, + "learning_rate": 0.00012625359702343609, + "loss": 0.86988962, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 0.27075195, + "step": 4032, + "time_per_iteration": 2.7071335315704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107336, + "balance_loss_mlp": 1.04641843, + "epoch": 0.7758753366679492, + "flos": 552368056320.0, + "grad_norm": 0.0993215607804505, + "language_loss": 0.84452856, + "learning_rate": 0.00012604672140718504, + "loss": 0.85526216, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.26965332, + "step": 4033, + "time_per_iteration": 2.6153743267059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075256, + "balance_loss_mlp": 1.04738498, + "epoch": 0.7760677183532128, + "flos": 703529127936.0, + "grad_norm": 0.05917686409446163, + "language_loss": 0.77727896, + "learning_rate": 0.00012583999097247233, + "loss": 0.78803158, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.27905273, + "step": 4034, + "time_per_iteration": 2.876141309738159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075852, + "balance_loss_mlp": 1.04836273, + "epoch": 0.7762601000384763, + "flos": 523218935808.0, + "grad_norm": 0.07262055787937163, + "language_loss": 0.80052263, + "learning_rate": 0.0001256334057995578, + "loss": 0.8112812, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.27514648, + "step": 4035, + "time_per_iteration": 2.7490179538726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072491, + "balance_loss_mlp": 1.04526329, + "epoch": 0.7764524817237399, + "flos": 557262609408.0, + "grad_norm": 0.050638434505961206, + "language_loss": 0.8468259, + "learning_rate": 0.000125426965968645, + "loss": 0.8575508, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.27294922, + "step": 4036, + "time_per_iteration": 2.7155818939208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077213, + "balance_loss_mlp": 1.04967546, + "epoch": 0.7766448634090035, + "flos": 579454013952.0, + "grad_norm": 0.06589986489431957, + "language_loss": 0.82292032, + "learning_rate": 0.00012522067155988092, + "loss": 0.83369249, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.27563477, + "step": 4037, + "time_per_iteration": 2.712575912475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072416, + "balance_loss_mlp": 1.0448314, + "epoch": 0.776837245094267, + "flos": 635300596224.0, + "grad_norm": 0.05822255331252486, + "language_loss": 0.75269878, + "learning_rate": 0.00012501452265335617, + "loss": 0.76342297, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.27612305, + "step": 4038, + "time_per_iteration": 2.8041534423828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_mlp": 1.04810321, + "epoch": 0.7770296267795306, + "flos": 614398334976.0, + "grad_norm": 0.05653078531335044, + "language_loss": 0.82581437, + "learning_rate": 0.0001248085193291047, + "loss": 0.83656931, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.27441406, + "step": 4039, + "time_per_iteration": 2.7838690280914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107999, + "balance_loss_mlp": 1.05230999, + "epoch": 0.7772220084647942, + "flos": 878438407680.0, + "grad_norm": 0.05606519790253506, + "language_loss": 0.82265162, + "learning_rate": 0.00012460266166710443, + "loss": 0.83345151, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.27734375, + "step": 4040, + "time_per_iteration": 3.1491823196411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077311, + "balance_loss_mlp": 1.04989266, + "epoch": 0.7774143901500578, + "flos": 839293567488.0, + "grad_norm": 0.05703190402159479, + "language_loss": 0.77674973, + "learning_rate": 0.00012439694974727633, + "loss": 0.78752279, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.27441406, + "step": 4041, + "time_per_iteration": 3.0976173877716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070951, + "balance_loss_mlp": 1.04358041, + "epoch": 0.7776067718353212, + "flos": 567878169600.0, + "grad_norm": 0.05364031630438029, + "language_loss": 0.80233228, + "learning_rate": 0.00012419138364948458, + "loss": 0.81304181, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.27392578, + "step": 4042, + "time_per_iteration": 2.7326791286468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070758, + "balance_loss_mlp": 1.04345894, + "epoch": 0.7777991535205848, + "flos": 745627603968.0, + "grad_norm": 0.0558907311125614, + "language_loss": 0.82470769, + "learning_rate": 0.00012398596345353702, + "loss": 0.83541524, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.2734375, + "step": 4043, + "time_per_iteration": 2.8787214756011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075612, + "balance_loss_mlp": 1.04824162, + "epoch": 0.7779915352058484, + "flos": 537799288320.0, + "grad_norm": 0.06132046127544376, + "language_loss": 0.83480489, + "learning_rate": 0.0001237806892391851, + "loss": 0.84556091, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.27416992, + "step": 4044, + "time_per_iteration": 2.7494754791259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072105, + "balance_loss_mlp": 1.04540193, + "epoch": 0.778183916891112, + "flos": 634497463296.0, + "grad_norm": 0.05685464217024924, + "language_loss": 0.80689287, + "learning_rate": 0.0001235755610861233, + "loss": 0.81761396, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.26757812, + "step": 4045, + "time_per_iteration": 2.812063694000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107437, + "balance_loss_mlp": 1.04711854, + "epoch": 0.7783762985763756, + "flos": 588400699392.0, + "grad_norm": 0.053935102157053175, + "language_loss": 0.85224533, + "learning_rate": 0.0001233705790739893, + "loss": 0.86298895, + "num_input_tokens_seen": 335640512, + "router_z_loss_mlp": 0.27270508, + "step": 4046, + "time_per_iteration": 2.7485461235046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074247, + "balance_loss_mlp": 1.04697168, + "epoch": 0.7785686802616391, + "flos": 930261970944.0, + "grad_norm": 0.0673066847398555, + "language_loss": 0.74977076, + "learning_rate": 0.0001231657432823643, + "loss": 0.76051325, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.27319336, + "step": 4047, + "time_per_iteration": 3.1984071731567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074697, + "balance_loss_mlp": 1.04661131, + "epoch": 0.7787610619469026, + "flos": 497679607296.0, + "grad_norm": 0.06151594222397662, + "language_loss": 0.78487623, + "learning_rate": 0.0001229610537907725, + "loss": 0.79562324, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.28100586, + "step": 4048, + "time_per_iteration": 2.6014962196350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072933, + "balance_loss_mlp": 1.04379785, + "epoch": 0.7789534436321662, + "flos": 515385303552.0, + "grad_norm": 0.0760421254177005, + "language_loss": 0.90244645, + "learning_rate": 0.00012275651067868143, + "loss": 0.91317576, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.29077148, + "step": 4049, + "time_per_iteration": 2.598532199859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069715, + "balance_loss_mlp": 1.04196286, + "epoch": 0.7791458253174298, + "flos": 988081555968.0, + "grad_norm": 0.05867585212414032, + "language_loss": 0.80266809, + "learning_rate": 0.00012255211402550182, + "loss": 0.81336522, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.27807617, + "step": 4050, + "time_per_iteration": 3.223078727722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070769, + "balance_loss_mlp": 1.04299307, + "epoch": 0.7793382070026933, + "flos": 628756107264.0, + "grad_norm": 0.07400928475776686, + "language_loss": 0.76817232, + "learning_rate": 0.00012234786391058727, + "loss": 0.77888, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.27783203, + "step": 4051, + "time_per_iteration": 4.367919683456421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073785, + "balance_loss_mlp": 1.04565179, + "epoch": 0.7795305886879569, + "flos": 531500700672.0, + "grad_norm": 0.08184044182039507, + "language_loss": 0.84765863, + "learning_rate": 0.0001221437604132352, + "loss": 0.85839653, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.28149414, + "step": 4052, + "time_per_iteration": 2.619694948196411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070872, + "balance_loss_mlp": 1.04369259, + "epoch": 0.7797229703732205, + "flos": 611690600448.0, + "grad_norm": 0.061094221003680546, + "language_loss": 0.81091797, + "learning_rate": 0.0001219398036126852, + "loss": 0.82162666, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.2722168, + "step": 4053, + "time_per_iteration": 2.7424631118774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072046, + "balance_loss_mlp": 1.04391217, + "epoch": 0.7799153520584841, + "flos": 871758526464.0, + "grad_norm": 0.051190100857480304, + "language_loss": 0.77992457, + "learning_rate": 0.00012173599358812027, + "loss": 0.790645, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.28149414, + "step": 4054, + "time_per_iteration": 3.277557849884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070645, + "balance_loss_mlp": 1.04303575, + "epoch": 0.7801077337437476, + "flos": 583348995072.0, + "grad_norm": 0.06092142653213725, + "language_loss": 0.82466495, + "learning_rate": 0.0001215323304186668, + "loss": 0.83537143, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.27587891, + "step": 4055, + "time_per_iteration": 2.7477025985717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074017, + "balance_loss_mlp": 1.0459547, + "epoch": 0.7803001154290111, + "flos": 600887365632.0, + "grad_norm": 0.06830093744875644, + "language_loss": 0.8764962, + "learning_rate": 0.00012132881418339364, + "loss": 0.88723636, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.28076172, + "step": 4056, + "time_per_iteration": 2.7418453693389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009047, + "balance_loss_mlp": 0.99779409, + "epoch": 0.7804924971142747, + "flos": 1478743506432.0, + "grad_norm": 0.016207473772952577, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.7852661, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.11230469, + "step": 4057, + "time_per_iteration": 4.85454535484314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065459, + "balance_loss_mlp": 1.03661036, + "epoch": 0.7806848787995383, + "flos": 630075773952.0, + "grad_norm": 0.062259886670719244, + "language_loss": 0.77044684, + "learning_rate": 0.00012092222283137944, + "loss": 0.78110135, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.28833008, + "step": 4058, + "time_per_iteration": 2.764766216278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008333, + "balance_loss_mlp": 0.99707937, + "epoch": 0.7808772604848019, + "flos": 1416800567808.0, + "grad_norm": 0.01618194632849119, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.79914641, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.11230469, + "step": 4059, + "time_per_iteration": 4.825545310974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069706, + "balance_loss_mlp": 1.0414772, + "epoch": 0.7810696421700654, + "flos": 731345435136.0, + "grad_norm": 0.07523837399490399, + "language_loss": 0.83462268, + "learning_rate": 0.00012051622016348856, + "loss": 0.84531975, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.2824707, + "step": 4060, + "time_per_iteration": 3.045809507369995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068871, + "balance_loss_mlp": 1.04018903, + "epoch": 0.781262023855329, + "flos": 424718055936.0, + "grad_norm": 0.06174241135408443, + "language_loss": 0.84242803, + "learning_rate": 0.00012031343978315539, + "loss": 0.85311675, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.28662109, + "step": 4061, + "time_per_iteration": 2.4845006465911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068535, + "balance_loss_mlp": 1.04099798, + "epoch": 0.7814544055405925, + "flos": 500767073280.0, + "grad_norm": 0.1392477950837379, + "language_loss": 0.82486379, + "learning_rate": 0.00012011080681021774, + "loss": 0.83554912, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.27563477, + "step": 4062, + "time_per_iteration": 2.6524341106414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070849, + "balance_loss_mlp": 1.04295421, + "epoch": 0.7816467872258561, + "flos": 462212960256.0, + "grad_norm": 0.07233679581194719, + "language_loss": 0.86375731, + "learning_rate": 0.00011990832132334512, + "loss": 0.87446582, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.27954102, + "step": 4063, + "time_per_iteration": 2.519162654876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069214, + "balance_loss_mlp": 1.04112792, + "epoch": 0.7818391689111197, + "flos": 740497324032.0, + "grad_norm": 0.07068900898467687, + "language_loss": 0.82369703, + "learning_rate": 0.00011970598340114897, + "loss": 0.83438915, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.28100586, + "step": 4064, + "time_per_iteration": 2.9242045879364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067385, + "balance_loss_mlp": 1.03875041, + "epoch": 0.7820315505963832, + "flos": 547386163200.0, + "grad_norm": 0.07366274029850052, + "language_loss": 0.83860916, + "learning_rate": 0.00011950379312218396, + "loss": 0.84928298, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.28637695, + "step": 4065, + "time_per_iteration": 2.7022647857666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070383, + "balance_loss_mlp": 1.04191554, + "epoch": 0.7822239322816468, + "flos": 728665403904.0, + "grad_norm": 0.07812712198170087, + "language_loss": 0.86016601, + "learning_rate": 0.00011930175056494719, + "loss": 0.87086987, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.28466797, + "step": 4066, + "time_per_iteration": 2.885648488998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071634, + "balance_loss_mlp": 1.04276156, + "epoch": 0.7824163139669104, + "flos": 451774900224.0, + "grad_norm": 0.0475815127648597, + "language_loss": 0.75548607, + "learning_rate": 0.00011909985580787885, + "loss": 0.76620239, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.28881836, + "step": 4067, + "time_per_iteration": 2.717013120651245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066242, + "balance_loss_mlp": 1.0379895, + "epoch": 0.782608695652174, + "flos": 540207277056.0, + "grad_norm": 0.05385008636564137, + "language_loss": 0.80856502, + "learning_rate": 0.00011889810892936137, + "loss": 0.8192274, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.28295898, + "step": 4068, + "time_per_iteration": 2.7350502014160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069144, + "balance_loss_mlp": 1.04105842, + "epoch": 0.7828010773374374, + "flos": 500029369344.0, + "grad_norm": 0.0661010913051719, + "language_loss": 0.77266741, + "learning_rate": 0.00011869651000771959, + "loss": 0.78335881, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.28100586, + "step": 4069, + "time_per_iteration": 2.8502442836761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065751, + "balance_loss_mlp": 1.03747416, + "epoch": 0.782993459022701, + "flos": 600542539776.0, + "grad_norm": 0.06957531868653906, + "language_loss": 0.82841384, + "learning_rate": 0.00011849505912122117, + "loss": 0.83907133, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.28271484, + "step": 4070, + "time_per_iteration": 2.7242653369903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069163, + "balance_loss_mlp": 1.0401957, + "epoch": 0.7831858407079646, + "flos": 809702106624.0, + "grad_norm": 0.061542243963481506, + "language_loss": 0.77626544, + "learning_rate": 0.00011829375634807654, + "loss": 0.78695703, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.28955078, + "step": 4071, + "time_per_iteration": 3.18316650390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067908, + "balance_loss_mlp": 1.03920245, + "epoch": 0.7833782223932282, + "flos": 806240701440.0, + "grad_norm": 0.06527363578820362, + "language_loss": 0.8108483, + "learning_rate": 0.00011809260176643821, + "loss": 0.82152736, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.28662109, + "step": 4072, + "time_per_iteration": 3.0564231872558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071131, + "balance_loss_mlp": 1.04318857, + "epoch": 0.7835706040784918, + "flos": 520614508032.0, + "grad_norm": 0.0688544484419534, + "language_loss": 0.83763361, + "learning_rate": 0.00011789159545440131, + "loss": 0.84834492, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.27978516, + "step": 4073, + "time_per_iteration": 2.6478123664855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070096, + "balance_loss_mlp": 1.04208159, + "epoch": 0.7837629857637552, + "flos": 505322592768.0, + "grad_norm": 0.05456504974378336, + "language_loss": 0.82081753, + "learning_rate": 0.00011769073749000348, + "loss": 0.83151847, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.2800293, + "step": 4074, + "time_per_iteration": 2.7911314964294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069906, + "balance_loss_mlp": 1.041749, + "epoch": 0.7839553674490188, + "flos": 515872723968.0, + "grad_norm": 0.07358433801147621, + "language_loss": 0.76115894, + "learning_rate": 0.0001174900279512246, + "loss": 0.77185798, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.28149414, + "step": 4075, + "time_per_iteration": 2.593980312347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070027, + "balance_loss_mlp": 1.04110718, + "epoch": 0.7841477491342824, + "flos": 506399330304.0, + "grad_norm": 0.055342987139179775, + "language_loss": 0.81843507, + "learning_rate": 0.00011728946691598707, + "loss": 0.82913536, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.2890625, + "step": 4076, + "time_per_iteration": 2.6213133335113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067587, + "balance_loss_mlp": 1.03902483, + "epoch": 0.784340130819546, + "flos": 719320048128.0, + "grad_norm": 0.06016705026128457, + "language_loss": 0.76231396, + "learning_rate": 0.00011708905446215561, + "loss": 0.77298987, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.28540039, + "step": 4077, + "time_per_iteration": 2.89338755607605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069715, + "balance_loss_mlp": 1.04110491, + "epoch": 0.7845325125048095, + "flos": 514174735872.0, + "grad_norm": 0.052498050136505506, + "language_loss": 0.80255234, + "learning_rate": 0.00011688879066753711, + "loss": 0.81324947, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.28564453, + "step": 4078, + "time_per_iteration": 2.691178560256958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067519, + "balance_loss_mlp": 1.04007649, + "epoch": 0.7847248941900731, + "flos": 465866422272.0, + "grad_norm": 0.06922222458803326, + "language_loss": 0.87530267, + "learning_rate": 0.00011668867560988122, + "loss": 0.88597786, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.2746582, + "step": 4079, + "time_per_iteration": 2.5730109214782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067247, + "balance_loss_mlp": 1.03870857, + "epoch": 0.7849172758753367, + "flos": 502766217216.0, + "grad_norm": 0.07036419305284744, + "language_loss": 0.84369481, + "learning_rate": 0.00011648870936687916, + "loss": 0.85436726, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.28540039, + "step": 4080, + "time_per_iteration": 2.763648271560669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069029, + "balance_loss_mlp": 1.04053807, + "epoch": 0.7851096575606002, + "flos": 531742219776.0, + "grad_norm": 0.07246870648451295, + "language_loss": 0.78439957, + "learning_rate": 0.00011628889201616461, + "loss": 0.79508984, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.28515625, + "step": 4081, + "time_per_iteration": 2.6238608360290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070508, + "balance_loss_mlp": 1.04208827, + "epoch": 0.7853020392458638, + "flos": 569685256704.0, + "grad_norm": 0.05558757362509338, + "language_loss": 0.81841099, + "learning_rate": 0.00011608922363531393, + "loss": 0.82911611, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.28417969, + "step": 4082, + "time_per_iteration": 2.6667022705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074509, + "balance_loss_mlp": 1.04639971, + "epoch": 0.7854944209311273, + "flos": 832228162560.0, + "grad_norm": 0.07344619623899691, + "language_loss": 0.83384395, + "learning_rate": 0.00011588970430184504, + "loss": 0.84458899, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.28100586, + "step": 4083, + "time_per_iteration": 3.0444436073303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069927, + "balance_loss_mlp": 1.04212761, + "epoch": 0.7856868026163909, + "flos": 559660423680.0, + "grad_norm": 0.045313213286836455, + "language_loss": 0.81620705, + "learning_rate": 0.00011569033409321822, + "loss": 0.82690632, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.27807617, + "step": 4084, + "time_per_iteration": 2.7107021808624268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074024, + "balance_loss_mlp": 1.04605722, + "epoch": 0.7858791843016545, + "flos": 544972382208.0, + "grad_norm": 0.06179602249028764, + "language_loss": 0.73075098, + "learning_rate": 0.00011549111308683591, + "loss": 0.7414912, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.2800293, + "step": 4085, + "time_per_iteration": 2.674802780151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.04991984, + "epoch": 0.7860715659869181, + "flos": 380787761664.0, + "grad_norm": 0.06384285931580107, + "language_loss": 0.80674589, + "learning_rate": 0.00011529204136004251, + "loss": 0.8175236, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.27905273, + "step": 4086, + "time_per_iteration": 2.485450029373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073353, + "balance_loss_mlp": 1.04600596, + "epoch": 0.7862639476721817, + "flos": 567173961216.0, + "grad_norm": 0.056474664391545235, + "language_loss": 0.84569514, + "learning_rate": 0.00011509311899012459, + "loss": 0.85642868, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.27392578, + "step": 4087, + "time_per_iteration": 2.6641156673431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072601, + "balance_loss_mlp": 1.04475415, + "epoch": 0.7864563293574451, + "flos": 544968000000.0, + "grad_norm": 0.09344860836240211, + "language_loss": 0.78010523, + "learning_rate": 0.00011489434605431053, + "loss": 0.79083121, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.27880859, + "step": 4088, + "time_per_iteration": 2.646610736846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071704, + "balance_loss_mlp": 1.04390407, + "epoch": 0.7866487110427087, + "flos": 563260041216.0, + "grad_norm": 0.06168893422677419, + "language_loss": 0.81236577, + "learning_rate": 0.0001146957226297708, + "loss": 0.8230828, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.27807617, + "step": 4089, + "time_per_iteration": 2.7216711044311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106999, + "balance_loss_mlp": 1.04147482, + "epoch": 0.7868410927279723, + "flos": 727849124352.0, + "grad_norm": 0.05015677705021027, + "language_loss": 0.76367462, + "learning_rate": 0.00011449724879361827, + "loss": 0.77437449, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.28515625, + "step": 4090, + "time_per_iteration": 2.9962027072906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070207, + "balance_loss_mlp": 1.04212117, + "epoch": 0.7870334744132359, + "flos": 521082989568.0, + "grad_norm": 0.07758144969638558, + "language_loss": 0.73733866, + "learning_rate": 0.00011429892462290687, + "loss": 0.74804068, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.28100586, + "step": 4091, + "time_per_iteration": 2.7208704948425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071413, + "balance_loss_mlp": 1.04413819, + "epoch": 0.7872258560984994, + "flos": 451173998592.0, + "grad_norm": 0.05584477685741542, + "language_loss": 0.83089757, + "learning_rate": 0.00011410075019463295, + "loss": 0.84161168, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.27319336, + "step": 4092, + "time_per_iteration": 2.608442544937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070388, + "balance_loss_mlp": 1.04168272, + "epoch": 0.787418237783763, + "flos": 514932788736.0, + "grad_norm": 0.05394381148222231, + "language_loss": 0.79899406, + "learning_rate": 0.00011390272558573461, + "loss": 0.80969799, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.28710938, + "step": 4093, + "time_per_iteration": 2.6670477390289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070092, + "balance_loss_mlp": 1.04183984, + "epoch": 0.7876106194690266, + "flos": 484837940736.0, + "grad_norm": 0.04973668631858953, + "language_loss": 0.79517233, + "learning_rate": 0.00011370485087309202, + "loss": 0.80587327, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.2824707, + "step": 4094, + "time_per_iteration": 2.651747703552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107091, + "balance_loss_mlp": 1.04229987, + "epoch": 0.7878030011542901, + "flos": 542570185728.0, + "grad_norm": 0.05872791575225344, + "language_loss": 0.78693342, + "learning_rate": 0.00011350712613352688, + "loss": 0.79764247, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.28613281, + "step": 4095, + "time_per_iteration": 2.6549277305603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069142, + "balance_loss_mlp": 1.04072237, + "epoch": 0.7879953828395537, + "flos": 516488182272.0, + "grad_norm": 0.07961293490995022, + "language_loss": 0.79440165, + "learning_rate": 0.00011330955144380283, + "loss": 0.80509305, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.28417969, + "step": 4096, + "time_per_iteration": 2.6206085681915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070121, + "balance_loss_mlp": 1.04217863, + "epoch": 0.7881877645248172, + "flos": 582004597248.0, + "grad_norm": 0.06633225025055933, + "language_loss": 0.86351848, + "learning_rate": 0.00011311212688062483, + "loss": 0.87421972, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.27929688, + "step": 4097, + "time_per_iteration": 2.781184673309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069633, + "balance_loss_mlp": 1.0408082, + "epoch": 0.7883801462100808, + "flos": 588883737600.0, + "grad_norm": 0.07192838384326647, + "language_loss": 0.77839339, + "learning_rate": 0.0001129148525206402, + "loss": 0.78908968, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.28808594, + "step": 4098, + "time_per_iteration": 2.8173389434814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067449, + "balance_loss_mlp": 1.03931606, + "epoch": 0.7885725278953444, + "flos": 481475460096.0, + "grad_norm": 0.11237603320949716, + "language_loss": 0.86339819, + "learning_rate": 0.00011271772844043759, + "loss": 0.87407273, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.28125, + "step": 4099, + "time_per_iteration": 2.7524821758270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069791, + "balance_loss_mlp": 1.04127622, + "epoch": 0.788764909580608, + "flos": 756470126592.0, + "grad_norm": 0.06946640589316219, + "language_loss": 0.75986981, + "learning_rate": 0.00011252075471654727, + "loss": 0.77056766, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.28515625, + "step": 4100, + "time_per_iteration": 2.947204351425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071355, + "balance_loss_mlp": 1.04262543, + "epoch": 0.7889572912658714, + "flos": 702225427968.0, + "grad_norm": 0.05611482280761958, + "language_loss": 0.7798807, + "learning_rate": 0.00011232393142544133, + "loss": 0.79059422, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.28710938, + "step": 4101, + "time_per_iteration": 2.95438551902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068821, + "balance_loss_mlp": 1.04037809, + "epoch": 0.789149672951135, + "flos": 736047931392.0, + "grad_norm": 0.06028554523946094, + "language_loss": 0.83136284, + "learning_rate": 0.00011212725864353323, + "loss": 0.84205109, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.28417969, + "step": 4102, + "time_per_iteration": 3.067315101623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015622, + "balance_loss_mlp": 1.00370073, + "epoch": 0.7893420546363986, + "flos": 1480626349056.0, + "grad_norm": 0.009770361918426226, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.77351552, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.11914062, + "step": 4103, + "time_per_iteration": 4.903147220611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069345, + "balance_loss_mlp": 1.04016232, + "epoch": 0.7895344363216622, + "flos": 508821875712.0, + "grad_norm": 0.06690395183564687, + "language_loss": 0.75603718, + "learning_rate": 0.00011173436491267291, + "loss": 0.76673061, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.29150391, + "step": 4104, + "time_per_iteration": 2.607632637023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064374, + "balance_loss_mlp": 1.0360018, + "epoch": 0.7897268180069258, + "flos": 541727764992.0, + "grad_norm": 0.055969758992029287, + "language_loss": 0.81935525, + "learning_rate": 0.0001115381441162554, + "loss": 0.82999897, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.28393555, + "step": 4105, + "time_per_iteration": 2.6217761039733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014508, + "balance_loss_mlp": 1.00268257, + "epoch": 0.7899191996921893, + "flos": 1411924953600.0, + "grad_norm": 0.0095479570502747, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.74598229, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.11816406, + "step": 4106, + "time_per_iteration": 4.9060986042022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063636, + "balance_loss_mlp": 1.03524053, + "epoch": 0.7901115813774529, + "flos": 622547679744.0, + "grad_norm": 0.04917500811755106, + "language_loss": 0.84986818, + "learning_rate": 0.00011114615504234465, + "loss": 0.86050451, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.28393555, + "step": 4107, + "time_per_iteration": 2.760727882385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068402, + "balance_loss_mlp": 1.03931451, + "epoch": 0.7903039630627164, + "flos": 645232296960.0, + "grad_norm": 0.062643238447281, + "language_loss": 0.81024301, + "learning_rate": 0.00011095038691703468, + "loss": 0.82092702, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.29077148, + "step": 4108, + "time_per_iteration": 2.8416430950164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065528, + "balance_loss_mlp": 1.03758597, + "epoch": 0.79049634474798, + "flos": 594054715392.0, + "grad_norm": 0.059690498019966905, + "language_loss": 0.824301, + "learning_rate": 0.00011075476983417998, + "loss": 0.83495629, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.27978516, + "step": 4109, + "time_per_iteration": 2.879164695739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106677, + "balance_loss_mlp": 1.03742075, + "epoch": 0.7906887264332435, + "flos": 715784449536.0, + "grad_norm": 0.06625307097230863, + "language_loss": 0.77845091, + "learning_rate": 0.00011055930386972579, + "loss": 0.78911859, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.29272461, + "step": 4110, + "time_per_iteration": 2.8940486907958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010668, + "balance_loss_mlp": 1.03761721, + "epoch": 0.7908811081185071, + "flos": 789553516032.0, + "grad_norm": 0.05640022184839657, + "language_loss": 0.78389466, + "learning_rate": 0.00011036398909955863, + "loss": 0.79456264, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 0.29150391, + "step": 4111, + "time_per_iteration": 2.9704418182373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066133, + "balance_loss_mlp": 1.03795147, + "epoch": 0.7910734898037707, + "flos": 641612330496.0, + "grad_norm": 0.05533152430131226, + "language_loss": 0.81315625, + "learning_rate": 0.00011016882559950648, + "loss": 0.82381761, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.28173828, + "step": 4112, + "time_per_iteration": 2.8546900749206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064394, + "balance_loss_mlp": 1.03561699, + "epoch": 0.7912658714890343, + "flos": 669057670656.0, + "grad_norm": 0.06990273723133285, + "language_loss": 0.80328232, + "learning_rate": 0.00010997381344533853, + "loss": 0.81392628, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.28759766, + "step": 4113, + "time_per_iteration": 2.7969515323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069119, + "balance_loss_mlp": 1.04031801, + "epoch": 0.7914582531742979, + "flos": 557504128512.0, + "grad_norm": 0.061948681643476444, + "language_loss": 0.80212009, + "learning_rate": 0.00010977895271276517, + "loss": 0.81281132, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.28808594, + "step": 4114, + "time_per_iteration": 2.7396297454833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064232, + "balance_loss_mlp": 1.03552604, + "epoch": 0.7916506348595613, + "flos": 569784181248.0, + "grad_norm": 0.06188955891536592, + "language_loss": 0.80402255, + "learning_rate": 0.00010958424347743807, + "loss": 0.8146649, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.28710938, + "step": 4115, + "time_per_iteration": 2.7420108318328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071293, + "balance_loss_mlp": 1.04337442, + "epoch": 0.7918430165448249, + "flos": 717966885888.0, + "grad_norm": 0.07461075198544243, + "language_loss": 0.80391407, + "learning_rate": 0.00010938968581494991, + "loss": 0.81462699, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.27929688, + "step": 4116, + "time_per_iteration": 2.941556692123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072505, + "balance_loss_mlp": 1.04418087, + "epoch": 0.7920353982300885, + "flos": 553377802752.0, + "grad_norm": 0.12071106309265658, + "language_loss": 0.78737396, + "learning_rate": 0.000109195279800835, + "loss": 0.79809904, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.28344727, + "step": 4117, + "time_per_iteration": 2.7312655448913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067391, + "balance_loss_mlp": 1.03901899, + "epoch": 0.7922277799153521, + "flos": 809766125568.0, + "grad_norm": 0.06211546650741466, + "language_loss": 0.76734632, + "learning_rate": 0.00010900102551056834, + "loss": 0.77802026, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.28344727, + "step": 4118, + "time_per_iteration": 3.061748504638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073876, + "balance_loss_mlp": 1.04590917, + "epoch": 0.7924201616006156, + "flos": 421128612864.0, + "grad_norm": 0.05658815463494319, + "language_loss": 0.84763014, + "learning_rate": 0.00010880692301956601, + "loss": 0.85836887, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.27978516, + "step": 4119, + "time_per_iteration": 2.504396677017212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070809, + "balance_loss_mlp": 1.04241323, + "epoch": 0.7926125432858792, + "flos": 617541055488.0, + "grad_norm": 0.052435339334051444, + "language_loss": 0.85989153, + "learning_rate": 0.00010861297240318518, + "loss": 0.87059963, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.28393555, + "step": 4120, + "time_per_iteration": 2.851905584335327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107072, + "balance_loss_mlp": 1.04296827, + "epoch": 0.7928049249711427, + "flos": 602207032320.0, + "grad_norm": 0.06531293240023527, + "language_loss": 0.86884111, + "learning_rate": 0.00010841917373672444, + "loss": 0.87954831, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.27783203, + "step": 4121, + "time_per_iteration": 2.72057843208313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074221, + "balance_loss_mlp": 1.04561055, + "epoch": 0.7929973066564063, + "flos": 655724201472.0, + "grad_norm": 0.0659209843425975, + "language_loss": 0.78515911, + "learning_rate": 0.00010822552709542293, + "loss": 0.7959013, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.28588867, + "step": 4122, + "time_per_iteration": 2.8345208168029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068379, + "balance_loss_mlp": 1.04067445, + "epoch": 0.7931896883416699, + "flos": 536139177984.0, + "grad_norm": 0.053977644004353675, + "language_loss": 0.86079139, + "learning_rate": 0.0001080320325544612, + "loss": 0.87147516, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.27734375, + "step": 4123, + "time_per_iteration": 2.734748601913452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073545, + "balance_loss_mlp": 1.04591262, + "epoch": 0.7933820700269334, + "flos": 497836758528.0, + "grad_norm": 0.05342076952837262, + "language_loss": 0.82945108, + "learning_rate": 0.00010783869018895997, + "loss": 0.84018654, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.27661133, + "step": 4124, + "time_per_iteration": 2.5848159790039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071922, + "balance_loss_mlp": 1.04438472, + "epoch": 0.793574451712197, + "flos": 537217325568.0, + "grad_norm": 0.05760976665940277, + "language_loss": 0.84397703, + "learning_rate": 0.00010764550007398189, + "loss": 0.85469627, + "num_input_tokens_seen": 342040496, + "router_z_loss_mlp": 0.27563477, + "step": 4125, + "time_per_iteration": 2.613123655319214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076104, + "balance_loss_mlp": 1.04797053, + "epoch": 0.7937668333974606, + "flos": 488043270144.0, + "grad_norm": 0.05267738869669298, + "language_loss": 0.81016707, + "learning_rate": 0.00010745246228452982, + "loss": 0.82092816, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.28173828, + "step": 4126, + "time_per_iteration": 2.5770304203033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072331, + "balance_loss_mlp": 1.04460263, + "epoch": 0.7939592150827242, + "flos": 527163379200.0, + "grad_norm": 0.053184738741740976, + "language_loss": 0.8170619, + "learning_rate": 0.00010725957689554771, + "loss": 0.82778513, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.27734375, + "step": 4127, + "time_per_iteration": 2.774044990539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073736, + "balance_loss_mlp": 1.04579329, + "epoch": 0.7941515967679876, + "flos": 541428019200.0, + "grad_norm": 0.047011204892956564, + "language_loss": 0.84647489, + "learning_rate": 0.00010706684398192013, + "loss": 0.85721219, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.27978516, + "step": 4128, + "time_per_iteration": 2.74668025970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070127, + "balance_loss_mlp": 1.0423516, + "epoch": 0.7943439784532512, + "flos": 518104622592.0, + "grad_norm": 0.061789852182866596, + "language_loss": 0.82038182, + "learning_rate": 0.00010687426361847313, + "loss": 0.83108312, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.27807617, + "step": 4129, + "time_per_iteration": 2.7684710025787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075571, + "balance_loss_mlp": 1.04777122, + "epoch": 0.7945363601385148, + "flos": 508768031232.0, + "grad_norm": 0.056918102150188964, + "language_loss": 0.85627353, + "learning_rate": 0.00010668183587997254, + "loss": 0.86702919, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.27807617, + "step": 4130, + "time_per_iteration": 2.6196768283843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071124, + "balance_loss_mlp": 1.04289508, + "epoch": 0.7947287418237784, + "flos": 650918398464.0, + "grad_norm": 0.052989144266830976, + "language_loss": 0.77423567, + "learning_rate": 0.0001064895608411256, + "loss": 0.78494692, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.28222656, + "step": 4131, + "time_per_iteration": 2.822084903717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070721, + "balance_loss_mlp": 1.04275465, + "epoch": 0.794921123509042, + "flos": 695726019072.0, + "grad_norm": 0.05398038812171178, + "language_loss": 0.80283594, + "learning_rate": 0.00010629743857657998, + "loss": 0.81354314, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.27954102, + "step": 4132, + "time_per_iteration": 2.9548959732055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018993, + "balance_loss_mlp": 1.00807393, + "epoch": 0.7951135051943055, + "flos": 1402161988608.0, + "grad_norm": 0.012201686903541073, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.71617663, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.109375, + "step": 4133, + "time_per_iteration": 4.596825122833252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077524, + "balance_loss_mlp": 1.04950953, + "epoch": 0.795305886879569, + "flos": 809745776640.0, + "grad_norm": 0.1291273106507343, + "language_loss": 0.82121062, + "learning_rate": 0.00010591365266868802, + "loss": 0.83198583, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.28027344, + "step": 4134, + "time_per_iteration": 2.997457981109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019784, + "balance_loss_mlp": 1.00886476, + "epoch": 0.7954982685648326, + "flos": 1425205988352.0, + "grad_norm": 0.01121858900173578, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76531565, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.109375, + "step": 4135, + "time_per_iteration": 4.933257818222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070322, + "balance_loss_mlp": 1.0421412, + "epoch": 0.7956906502500962, + "flos": 389670428160.0, + "grad_norm": 0.07786925051397248, + "language_loss": 0.78780544, + "learning_rate": 0.00010553047875229166, + "loss": 0.7985087, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.28198242, + "step": 4136, + "time_per_iteration": 2.5145680904388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072473, + "balance_loss_mlp": 1.04522216, + "epoch": 0.7958830319353598, + "flos": 515321284608.0, + "grad_norm": 0.08712242528713769, + "language_loss": 0.83510804, + "learning_rate": 0.00010533912147689328, + "loss": 0.84583282, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.27270508, + "step": 4137, + "time_per_iteration": 2.6298136711120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076228, + "balance_loss_mlp": 1.04814243, + "epoch": 0.7960754136206233, + "flos": 493695876096.0, + "grad_norm": 0.06714788693393858, + "language_loss": 0.82280171, + "learning_rate": 0.00010514791742243656, + "loss": 0.83356392, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.28100586, + "step": 4138, + "time_per_iteration": 2.5997424125671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073026, + "balance_loss_mlp": 1.04553676, + "epoch": 0.7962677953058869, + "flos": 655409899008.0, + "grad_norm": 0.06696972519058896, + "language_loss": 0.82444674, + "learning_rate": 0.00010495686666315341, + "loss": 0.83517706, + "num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.27514648, + "step": 4139, + "time_per_iteration": 2.8953542709350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074691, + "balance_loss_mlp": 1.04662871, + "epoch": 0.7964601769911505, + "flos": 542126435328.0, + "grad_norm": 0.07236671578874358, + "language_loss": 0.77130395, + "learning_rate": 0.00010476596927321635, + "loss": 0.78205085, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.28076172, + "step": 4140, + "time_per_iteration": 2.6313490867614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107407, + "balance_loss_mlp": 1.04591274, + "epoch": 0.796652558676414, + "flos": 537356947968.0, + "grad_norm": 0.07734927138109192, + "language_loss": 0.80230534, + "learning_rate": 0.00010457522532673835, + "loss": 0.81304598, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.28173828, + "step": 4141, + "time_per_iteration": 2.8211119174957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073839, + "balance_loss_mlp": 1.0459199, + "epoch": 0.7968449403616775, + "flos": 474852395520.0, + "grad_norm": 0.05569229872202348, + "language_loss": 0.83232534, + "learning_rate": 0.00010438463489777272, + "loss": 0.84306371, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.27954102, + "step": 4142, + "time_per_iteration": 2.6115970611572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074531, + "balance_loss_mlp": 1.04665971, + "epoch": 0.7970373220469411, + "flos": 567336904704.0, + "grad_norm": 0.06331690376736109, + "language_loss": 0.77703011, + "learning_rate": 0.00010419419806031316, + "loss": 0.7877754, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.27880859, + "step": 4143, + "time_per_iteration": 2.7046220302581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074634, + "balance_loss_mlp": 1.04664397, + "epoch": 0.7972297037322047, + "flos": 555924003840.0, + "grad_norm": 0.04909390704775502, + "language_loss": 0.83792174, + "learning_rate": 0.00010400391488829403, + "loss": 0.8486681, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.2800293, + "step": 4144, + "time_per_iteration": 2.790830612182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076091, + "balance_loss_mlp": 1.04788637, + "epoch": 0.7974220854174683, + "flos": 575899476480.0, + "grad_norm": 0.05483263194538034, + "language_loss": 0.86199546, + "learning_rate": 0.00010381378545558984, + "loss": 0.87275642, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.2824707, + "step": 4145, + "time_per_iteration": 2.7284913063049316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069763, + "balance_loss_mlp": 1.04203475, + "epoch": 0.7976144671027319, + "flos": 482824240128.0, + "grad_norm": 0.05322555202635646, + "language_loss": 0.84398592, + "learning_rate": 0.00010362380983601505, + "loss": 0.85468352, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.27758789, + "step": 4146, + "time_per_iteration": 2.546143054962158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067282, + "balance_loss_mlp": 1.03938699, + "epoch": 0.7978068487879953, + "flos": 1077420372480.0, + "grad_norm": 0.05187096482218071, + "language_loss": 0.78898019, + "learning_rate": 0.00010343398810332477, + "loss": 0.79965299, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.27905273, + "step": 4147, + "time_per_iteration": 3.4553234577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072596, + "balance_loss_mlp": 1.04465318, + "epoch": 0.7979992304732589, + "flos": 733421744640.0, + "grad_norm": 0.0650162065800976, + "language_loss": 0.84200764, + "learning_rate": 0.00010324432033121467, + "loss": 0.85273361, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.2800293, + "step": 4148, + "time_per_iteration": 2.9164648056030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070872, + "balance_loss_mlp": 1.04207134, + "epoch": 0.7981916121585225, + "flos": 415531261440.0, + "grad_norm": 0.06518493190513895, + "language_loss": 0.83341253, + "learning_rate": 0.00010305480659332005, + "loss": 0.84412122, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.28808594, + "step": 4149, + "time_per_iteration": 2.6469006538391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071138, + "balance_loss_mlp": 1.04290879, + "epoch": 0.7983839938437861, + "flos": 465019619328.0, + "grad_norm": 0.06242001263980543, + "language_loss": 0.83330691, + "learning_rate": 0.00010286544696321682, + "loss": 0.84401828, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.28222656, + "step": 4150, + "time_per_iteration": 2.5429742336273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073902, + "balance_loss_mlp": 1.04543519, + "epoch": 0.7985763755290496, + "flos": 510304485888.0, + "grad_norm": 0.06754113423442079, + "language_loss": 0.79446447, + "learning_rate": 0.00010267624151442073, + "loss": 0.80520344, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.28417969, + "step": 4151, + "time_per_iteration": 2.6111056804656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107675, + "balance_loss_mlp": 1.04852068, + "epoch": 0.7987687572143132, + "flos": 1010243847168.0, + "grad_norm": 0.0631421524171095, + "language_loss": 0.80901897, + "learning_rate": 0.000102487190320388, + "loss": 0.81978643, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.2824707, + "step": 4152, + "time_per_iteration": 3.323118209838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068338, + "balance_loss_mlp": 1.04015708, + "epoch": 0.7989611388995768, + "flos": 1020662968320.0, + "grad_norm": 0.0589010586848655, + "language_loss": 0.79593813, + "learning_rate": 0.00010229829345451475, + "loss": 0.80662155, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.28198242, + "step": 4153, + "time_per_iteration": 3.364107370376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071047, + "balance_loss_mlp": 1.04329467, + "epoch": 0.7991535205848403, + "flos": 1100915476992.0, + "grad_norm": 0.06516359919102382, + "language_loss": 0.79660934, + "learning_rate": 0.00010210955099013724, + "loss": 0.80731982, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.27758789, + "step": 4154, + "time_per_iteration": 3.413896322250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070605, + "balance_loss_mlp": 1.04247141, + "epoch": 0.7993459022701039, + "flos": 834454268928.0, + "grad_norm": 0.06322395894070157, + "language_loss": 0.76450896, + "learning_rate": 0.00010192096300053167, + "loss": 0.77521503, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.28149414, + "step": 4155, + "time_per_iteration": 3.1282687187194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069737, + "balance_loss_mlp": 1.04179418, + "epoch": 0.7995382839553674, + "flos": 522417212928.0, + "grad_norm": 0.4084707213419165, + "language_loss": 0.8520155, + "learning_rate": 0.00010173252955891477, + "loss": 0.8627128, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.27929688, + "step": 4156, + "time_per_iteration": 2.78415584564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074464, + "balance_loss_mlp": 1.04685545, + "epoch": 0.799730665640631, + "flos": 537562151424.0, + "grad_norm": 0.06643949206963136, + "language_loss": 0.72880185, + "learning_rate": 0.00010154425073844253, + "loss": 0.73954648, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.27612305, + "step": 4157, + "time_per_iteration": 2.73618221282959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068843, + "balance_loss_mlp": 1.04032815, + "epoch": 0.7999230473258946, + "flos": 504809031168.0, + "grad_norm": 0.05290023006148714, + "language_loss": 0.82135558, + "learning_rate": 0.00010135612661221138, + "loss": 0.83204401, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.28515625, + "step": 4158, + "time_per_iteration": 2.554800510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068433, + "balance_loss_mlp": 1.04008496, + "epoch": 0.8001154290111582, + "flos": 1026935414784.0, + "grad_norm": 0.060322834717302515, + "language_loss": 0.81768221, + "learning_rate": 0.00010116815725325751, + "loss": 0.82836652, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.28344727, + "step": 4159, + "time_per_iteration": 3.2874691486358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077912, + "balance_loss_mlp": 1.04949212, + "epoch": 0.8003078106964217, + "flos": 750567237120.0, + "grad_norm": 0.0534649619029418, + "language_loss": 0.80202901, + "learning_rate": 0.00010098034273455725, + "loss": 0.8128081, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.28417969, + "step": 4160, + "time_per_iteration": 2.9733405113220215 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 344944048, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9383636514111488.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/training_args.bin b/sft_pretrain/Full_smoe_share/checkpoint-4160/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2c6286920da78be894d16b2c1ec77f899cd590e0 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b25bd416aaf59aaeb5c9268446dadaf85f4d00dfc3ac3dfec454141b47f814d1 +size 7992 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-4160/zero_to_fp32.py b/sft_pretrain/Full_smoe_share/checkpoint-4160/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-4160/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/added_tokens.json b/sft_pretrain/Full_smoe_share/checkpoint-5198/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/config.json b/sft_pretrain/Full_smoe_share/checkpoint-5198/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bb9b0c4407eef6bd7d8c22453f95c43fd6ef0981 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_share", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/generation_config.json b/sft_pretrain/Full_smoe_share/checkpoint-5198/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10d92d7e167b591e2118c1ac838fb1a604b23bf1 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3650c9584044eb2dd66f6644ffb46166b94338f74e6565b8d0d71069f7fc72e6 +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f7838debc1a0f11f09736b1f2b16a536820968db --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b938b8cb2474ebadad0bc582949d3db174c4185846d6c320d7db27515c94e696 +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..6acbbcf4886b897ca9f99f1b65b7098f16c8a5a5 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a404ec64189104061ba49dedbffa952a265eb0a24ae6c8ee2d921551892bd5b2 +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..fea3d5c0806e34184aa473b445b5de4252797d23 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdb4b10acf407a82b8bba45c46a460dcb7cf9f8e29f4e776b1df73c4730bfb54 +size 396575120 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..531a098a73a1eb7b46455bfe464742b844ecb4aa --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e716b9aa1a1fe5aca85db5eb21719ce63e7e2aa0e4383c36a51dbfc2b8c9aaa1 +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c7cda31bd4ca6548918b66d58a78cc1fbef3e84 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68366233591d186b4d870e9f205effe05bbc7c9b37a280a7bbe29953c4277d2a +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..34fe8da4d9ae8549b17a11db6440dff464831c11 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17aea391fd94f2cdd4986e71ee8b00ca91f6c09242bcfa921d74df60e519853b +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d007a90149b6cb32aef7371685feee0f2115c0d0 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a32b1c25e35577f36d60f8aeeb60e6e5d1f0aaa68f010ea8902a160b729995ae +size 2117321480 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/latest b/sft_pretrain/Full_smoe_share/checkpoint-5198/latest new file mode 100644 index 0000000000000000000000000000000000000000..c0e63763d1d13a0ca7a3b62ff8f5cd1d69cc4978 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/latest @@ -0,0 +1 @@ +global_step5198 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_share/checkpoint-5198/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_share/checkpoint-5198/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3e2d0c762bdd31468ec17feac5bdd62d38e82ad7 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a4a3130c658203a17c37b38def7719edce7c1fef2b626c71523c71c342ff486 +size 3759020544 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/model.safetensors.index.json b/sft_pretrain/Full_smoe_share/checkpoint-5198/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..90a56cf0153ed9062ff35ee2b372f7ab9e796c6a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731420128 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/rng_state_0.pth b/sft_pretrain/Full_smoe_share/checkpoint-5198/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/rng_state_1.pth b/sft_pretrain/Full_smoe_share/checkpoint-5198/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/rng_state_2.pth b/sft_pretrain/Full_smoe_share/checkpoint-5198/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/rng_state_3.pth b/sft_pretrain/Full_smoe_share/checkpoint-5198/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/special_tokens_map.json b/sft_pretrain/Full_smoe_share/checkpoint-5198/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/tokenizer.model b/sft_pretrain/Full_smoe_share/checkpoint-5198/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/tokenizer_config.json b/sft_pretrain/Full_smoe_share/checkpoint-5198/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/trainer_state.json b/sft_pretrain/Full_smoe_share/checkpoint-5198/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e81fb8fbc9fcb4ce884a441290f2acce53ef1be6 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/trainer_state.json @@ -0,0 +1,78003 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5198, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03958175, + "balance_loss_mlp": 3.00755191, + "epoch": 0.00019238168526356292, + "flos": 470464353792.0, + "grad_norm": 28.914608756113072, + "language_loss": 3.87018156, + "learning_rate": 0.0, + "loss": 2.58113432, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 9.5, + "step": 1, + "time_per_iteration": 23.802019834518433 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01915335, + "balance_loss_mlp": 1.25005209, + "epoch": 0.00038476337052712584, + "flos": 504311436288.0, + "grad_norm": 4.8593923560988435, + "language_loss": 2.35405588, + "learning_rate": 0.00013726078121135892, + "loss": 2.37320924, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.65625, + "step": 2, + "time_per_iteration": 2.8532886505126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01920846, + "balance_loss_mlp": 1.25708926, + "epoch": 0.0005771450557906887, + "flos": 598869282816.0, + "grad_norm": 3.0028031994213777, + "language_loss": 1.96315837, + "learning_rate": 0.00021755319103969496, + "loss": 1.9823668, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.640625, + "step": 3, + "time_per_iteration": 2.841437339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01900548, + "balance_loss_mlp": 1.26196778, + "epoch": 0.0007695267410542517, + "flos": 580133491200.0, + "grad_norm": 1.731178632358193, + "language_loss": 1.51703906, + "learning_rate": 0.00027452156242271784, + "loss": 1.53604448, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.3828125, + "step": 4, + "time_per_iteration": 2.7456114292144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01846218, + "balance_loss_mlp": 1.25188851, + "epoch": 0.0009619084263178145, + "flos": 485857861632.0, + "grad_norm": 2.5417144067747603, + "language_loss": 1.52625787, + "learning_rate": 0.0003187096642208417, + "loss": 1.54472005, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 5.93359375, + "step": 5, + "time_per_iteration": 2.6199026107788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0183984, + "balance_loss_mlp": 1.27068734, + "epoch": 0.0011542901115813775, + "flos": 559744791552.0, + "grad_norm": 1.334824335042464, + "language_loss": 1.40782702, + "learning_rate": 0.0003548139722510539, + "loss": 1.42622542, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 5.69921875, + "step": 6, + "time_per_iteration": 2.747270107269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0199186, + "balance_loss_mlp": 1.44254375, + "epoch": 0.0013466717968449403, + "flos": 533721014784.0, + "grad_norm": 1.092177996343933, + "language_loss": 1.36706996, + "learning_rate": 0.00038533972973918044, + "loss": 1.38698864, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.5, + "step": 7, + "time_per_iteration": 2.6748878955841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02260733, + "balance_loss_mlp": 1.72209811, + "epoch": 0.0015390534821085034, + "flos": 492037175808.0, + "grad_norm": 0.8384078813871362, + "language_loss": 1.30779457, + "learning_rate": 0.0004117823436340768, + "loss": 1.3304019, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.390625, + "step": 8, + "time_per_iteration": 2.6130077838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02549259, + "balance_loss_mlp": 2.01024222, + "epoch": 0.0017314351673720662, + "flos": 564402207744.0, + "grad_norm": 0.9225645938984937, + "language_loss": 1.40127456, + "learning_rate": 0.00043510638207938993, + "loss": 1.42676711, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.39453125, + "step": 9, + "time_per_iteration": 2.8516194820404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02769124, + "balance_loss_mlp": 2.22057033, + "epoch": 0.001923816852635629, + "flos": 593132308992.0, + "grad_norm": 2.3673640139094667, + "language_loss": 1.25222194, + "learning_rate": 0.00045597044543220066, + "loss": 1.27991319, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.4921875, + "step": 10, + "time_per_iteration": 2.6775431632995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02889683, + "balance_loss_mlp": 2.31366348, + "epoch": 0.002116198537899192, + "flos": 609308752896.0, + "grad_norm": 3.9279002976271125, + "language_loss": 1.24874163, + "learning_rate": 0.00047484428652143135, + "loss": 1.27763844, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 5.765625, + "step": 11, + "time_per_iteration": 2.978304386138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0309849, + "balance_loss_mlp": 2.49538684, + "epoch": 0.002308580223162755, + "flos": 544869075456.0, + "grad_norm": 1.4997276509751025, + "language_loss": 1.30425894, + "learning_rate": 0.0004920747534624128, + "loss": 1.33524382, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 6.01953125, + "step": 12, + "time_per_iteration": 2.660757064819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0325611, + "balance_loss_mlp": 2.63698483, + "epoch": 0.002500961908426318, + "flos": 644458277376.0, + "grad_norm": 0.27573519674031227, + "language_loss": 1.29333067, + "learning_rate": 0.0005079252465375872, + "loss": 1.32589173, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 6.1875, + "step": 13, + "time_per_iteration": 2.905634880065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03517619, + "balance_loss_mlp": 2.87789392, + "epoch": 0.0026933435936898806, + "flos": 487605312000.0, + "grad_norm": 0.5949349515444387, + "language_loss": 1.16881835, + "learning_rate": 0.0005226005109505393, + "loss": 1.20399451, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 6.39453125, + "step": 14, + "time_per_iteration": 2.6116466522216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03647219, + "balance_loss_mlp": 2.99872088, + "epoch": 0.0028857252789534437, + "flos": 434368949760.0, + "grad_norm": 0.7718254129229014, + "language_loss": 1.22867727, + "learning_rate": 0.0005362628552605367, + "loss": 1.26514947, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 6.484375, + "step": 15, + "time_per_iteration": 2.80147123336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03485084, + "balance_loss_mlp": 2.81407928, + "epoch": 0.0030781069642170067, + "flos": 596465676288.0, + "grad_norm": 0.7401604798059911, + "language_loss": 1.27103257, + "learning_rate": 0.0005490431248454357, + "loss": 1.30588341, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 6.71484375, + "step": 16, + "time_per_iteration": 2.72638201713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03091961, + "balance_loss_mlp": 2.46329856, + "epoch": 0.0032704886494805694, + "flos": 1537360432128.0, + "grad_norm": 0.30683115050750837, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78797078, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 6.28125, + "step": 17, + "time_per_iteration": 6.094223260879517 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03189654, + "balance_loss_mlp": 2.50453377, + "epoch": 0.0034628703347441324, + "flos": 473720403456.0, + "grad_norm": 0.3045463524910074, + "language_loss": 1.13145232, + "learning_rate": 0.0005723671632907488, + "loss": 1.16334891, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 6.859375, + "step": 18, + "time_per_iteration": 2.6759910583496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03092663, + "balance_loss_mlp": 2.39648056, + "epoch": 0.0036552520200076955, + "flos": 448303320576.0, + "grad_norm": 0.23602477180386344, + "language_loss": 1.18155861, + "learning_rate": 0.0005830738490244919, + "loss": 1.21248519, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 6.97265625, + "step": 19, + "time_per_iteration": 2.505410671234131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03039888, + "balance_loss_mlp": 2.32653999, + "epoch": 0.003847633705271258, + "flos": 635881148928.0, + "grad_norm": 0.24009706761990102, + "language_loss": 1.19359791, + "learning_rate": 0.0005932312266435596, + "loss": 1.22399676, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 7.12890625, + "step": 20, + "time_per_iteration": 2.78657603263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03040938, + "balance_loss_mlp": 2.32339382, + "epoch": 0.004040015390534821, + "flos": 589222771200.0, + "grad_norm": 0.17079239690828452, + "language_loss": 1.14516783, + "learning_rate": 0.0006028929207788754, + "loss": 1.17557728, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 7.171875, + "step": 21, + "time_per_iteration": 2.7249202728271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03095818, + "balance_loss_mlp": 2.35843754, + "epoch": 0.004232397075798384, + "flos": 756253338624.0, + "grad_norm": 0.14242736472953105, + "language_loss": 1.17636526, + "learning_rate": 0.0006121050677327902, + "loss": 1.20732355, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 7.3671875, + "step": 22, + "time_per_iteration": 2.8816165924072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03158898, + "balance_loss_mlp": 2.41388798, + "epoch": 0.004424778761061947, + "flos": 526434439680.0, + "grad_norm": 0.2087285570273359, + "language_loss": 1.07450879, + "learning_rate": 0.0006209076479463684, + "loss": 1.10609782, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 7.44140625, + "step": 23, + "time_per_iteration": 2.6234865188598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03183939, + "balance_loss_mlp": 2.43282533, + "epoch": 0.00461716044632551, + "flos": 547907079168.0, + "grad_norm": 0.1648031444861348, + "language_loss": 1.17208815, + "learning_rate": 0.0006293355346737718, + "loss": 1.20392752, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 7.50390625, + "step": 24, + "time_per_iteration": 2.6747982501983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03230874, + "balance_loss_mlp": 2.47976065, + "epoch": 0.004809542131589073, + "flos": 567293234688.0, + "grad_norm": 0.19727819873357916, + "language_loss": 1.13454294, + "learning_rate": 0.0006374193284416834, + "loss": 1.16685176, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 7.5078125, + "step": 25, + "time_per_iteration": 2.778822898864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0329228, + "balance_loss_mlp": 2.5568068, + "epoch": 0.005001923816852636, + "flos": 470391418368.0, + "grad_norm": 0.1350276315355779, + "language_loss": 1.11706781, + "learning_rate": 0.0006451860277489461, + "loss": 1.14999056, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 7.34765625, + "step": 26, + "time_per_iteration": 2.595344305038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03271905, + "balance_loss_mlp": 2.55016398, + "epoch": 0.005194305502116198, + "flos": 415283950080.0, + "grad_norm": 0.16347516382600882, + "language_loss": 1.19968891, + "learning_rate": 0.0006526595731190848, + "loss": 1.23240781, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 7.21484375, + "step": 27, + "time_per_iteration": 2.4664127826690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03288089, + "balance_loss_mlp": 2.59610367, + "epoch": 0.005386687187379761, + "flos": 628466535936.0, + "grad_norm": 0.1428829159478278, + "language_loss": 1.13108253, + "learning_rate": 0.0006598612921618983, + "loss": 1.16396332, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 6.92578125, + "step": 28, + "time_per_iteration": 2.804295778274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03294075, + "balance_loss_mlp": 2.62612176, + "epoch": 0.005579068872643324, + "flos": 886100332032.0, + "grad_norm": 0.20851883498814452, + "language_loss": 1.0600431, + "learning_rate": 0.0006668102665011454, + "loss": 1.09298372, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 6.68359375, + "step": 29, + "time_per_iteration": 3.255702495574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03199031, + "balance_loss_mlp": 2.59096837, + "epoch": 0.005771450557906887, + "flos": 547287238656.0, + "grad_norm": 0.2979528071454863, + "language_loss": 1.15479767, + "learning_rate": 0.0006735236364718957, + "loss": 1.18678796, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 6.0703125, + "step": 30, + "time_per_iteration": 2.7074596881866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03173184, + "balance_loss_mlp": 2.61356831, + "epoch": 0.00596383224317045, + "flos": 531766950912.0, + "grad_norm": 0.19339065750569648, + "language_loss": 1.13838637, + "learning_rate": 0.0006800168558381346, + "loss": 1.17011821, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 5.60546875, + "step": 31, + "time_per_iteration": 2.6867663860321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03044372, + "balance_loss_mlp": 2.54197669, + "epoch": 0.0061562139284340135, + "flos": 588813926400.0, + "grad_norm": 0.19192711986346297, + "language_loss": 1.17224455, + "learning_rate": 0.0006863039060567947, + "loss": 1.20268822, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 5.01953125, + "step": 32, + "time_per_iteration": 2.7029900550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02954172, + "balance_loss_mlp": 2.48954153, + "epoch": 0.006348595613697576, + "flos": 617929551360.0, + "grad_norm": 0.18120318877382763, + "language_loss": 1.09236336, + "learning_rate": 0.0006923974775611263, + "loss": 1.12190521, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 4.640625, + "step": 33, + "time_per_iteration": 2.7966651916503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02822322, + "balance_loss_mlp": 2.40728283, + "epoch": 0.006540977298961139, + "flos": 777564444672.0, + "grad_norm": 0.145871801521796, + "language_loss": 1.05915022, + "learning_rate": 0.0006983091239737814, + "loss": 1.0873735, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 4.15625, + "step": 34, + "time_per_iteration": 2.9987330436706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02690136, + "balance_loss_mlp": 2.31496024, + "epoch": 0.006733358984224702, + "flos": 666837356544.0, + "grad_norm": 0.3134152992972928, + "language_loss": 1.04935622, + "learning_rate": 0.0007040493939600222, + "loss": 1.07625759, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 3.75, + "step": 35, + "time_per_iteration": 2.8552193641662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02568493, + "balance_loss_mlp": 2.22154617, + "epoch": 0.006925740669488265, + "flos": 564092287488.0, + "grad_norm": 0.17701612022333574, + "language_loss": 1.05792356, + "learning_rate": 0.0007096279445021078, + "loss": 1.08360851, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 3.47070312, + "step": 36, + "time_per_iteration": 2.7224435806274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02489254, + "balance_loss_mlp": 2.16557646, + "epoch": 0.007118122354751828, + "flos": 549583156224.0, + "grad_norm": 0.13856321956275922, + "language_loss": 1.12953377, + "learning_rate": 0.0007150536386503726, + "loss": 1.15442634, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 3.23632812, + "step": 37, + "time_per_iteration": 2.868824005126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02371099, + "balance_loss_mlp": 2.08385229, + "epoch": 0.007310504040015391, + "flos": 702161409024.0, + "grad_norm": 0.1045684718913455, + "language_loss": 1.04885924, + "learning_rate": 0.0007203346302358509, + "loss": 1.0725702, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 2.87304688, + "step": 38, + "time_per_iteration": 2.9964613914489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.022844, + "balance_loss_mlp": 2.01431966, + "epoch": 0.007502885725278953, + "flos": 599022051840.0, + "grad_norm": 0.11457879899925279, + "language_loss": 1.09371829, + "learning_rate": 0.000725478437577282, + "loss": 1.11656225, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 2.703125, + "step": 39, + "time_per_iteration": 2.7697911262512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02209938, + "balance_loss_mlp": 1.9577868, + "epoch": 0.007695267410542516, + "flos": 560000867328.0, + "grad_norm": 0.09741634912607965, + "language_loss": 1.05106318, + "learning_rate": 0.0007304920078549186, + "loss": 1.07316256, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 2.51953125, + "step": 40, + "time_per_iteration": 2.6858811378479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02127988, + "balance_loss_mlp": 1.89738917, + "epoch": 0.007887649095806078, + "flos": 507906671616.0, + "grad_norm": 0.1027173821952558, + "language_loss": 1.0668, + "learning_rate": 0.0007353817735343603, + "loss": 1.08807993, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 2.30273438, + "step": 41, + "time_per_iteration": 2.7466464042663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0203117, + "balance_loss_mlp": 1.82136178, + "epoch": 0.008080030781069641, + "flos": 503642133504.0, + "grad_norm": 0.13433083641106106, + "language_loss": 1.02085233, + "learning_rate": 0.0007401537019902344, + "loss": 1.04116416, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 2.10058594, + "step": 42, + "time_per_iteration": 2.6472368240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01955875, + "balance_loss_mlp": 1.77000403, + "epoch": 0.008272412466333205, + "flos": 517764178944.0, + "grad_norm": 0.1211736659455407, + "language_loss": 1.05737603, + "learning_rate": 0.0007448133392900729, + "loss": 1.07693481, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 1.85742188, + "step": 43, + "time_per_iteration": 2.716550588607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01955604, + "balance_loss_mlp": 1.78737581, + "epoch": 0.008464794151596768, + "flos": 607673373696.0, + "grad_norm": 0.16872872054008078, + "language_loss": 1.01187599, + "learning_rate": 0.0007493658489441491, + "loss": 1.03143215, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 1.68261719, + "step": 44, + "time_per_iteration": 2.875014066696167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01891991, + "balance_loss_mlp": 1.7426461, + "epoch": 0.00865717583686033, + "flos": 537661075968.0, + "grad_norm": 0.13908928982797317, + "language_loss": 1.04866791, + "learning_rate": 0.0007538160463002316, + "loss": 1.06758785, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 1.4921875, + "step": 45, + "time_per_iteration": 2.6579010486602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01770341, + "balance_loss_mlp": 1.64674437, + "epoch": 0.008849557522123894, + "flos": 507758284800.0, + "grad_norm": 0.10189568444589565, + "language_loss": 1.07831812, + "learning_rate": 0.0007581684291577274, + "loss": 1.09602141, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 1.234375, + "step": 46, + "time_per_iteration": 2.640967845916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01721967, + "balance_loss_mlp": 1.61086416, + "epoch": 0.009041939207387457, + "flos": 625048800768.0, + "grad_norm": 0.13316435244960997, + "language_loss": 1.10805786, + "learning_rate": 0.0007624272050891776, + "loss": 1.12527752, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 1.11230469, + "step": 47, + "time_per_iteration": 2.8335459232330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01578117, + "balance_loss_mlp": 1.4876132, + "epoch": 0.00923432089265102, + "flos": 549124849152.0, + "grad_norm": 0.11283146306838601, + "language_loss": 1.0112282, + "learning_rate": 0.0007665963158851307, + "loss": 1.02700949, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.90478516, + "step": 48, + "time_per_iteration": 2.8267853260040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01494271, + "balance_loss_mlp": 1.41659403, + "epoch": 0.009426702577914583, + "flos": 562202242560.0, + "grad_norm": 0.11438710989386189, + "language_loss": 1.09804726, + "learning_rate": 0.0007706794594783609, + "loss": 1.11299002, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.77587891, + "step": 49, + "time_per_iteration": 2.767359495162964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450716, + "balance_loss_mlp": 1.37876153, + "epoch": 0.009619084263178146, + "flos": 616486228992.0, + "grad_norm": 0.12814906604020712, + "language_loss": 1.08643568, + "learning_rate": 0.0007746801096530423, + "loss": 1.10094285, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.71972656, + "step": 50, + "time_per_iteration": 2.8213155269622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0143922, + "balance_loss_mlp": 1.37599134, + "epoch": 0.009811465948441709, + "flos": 541176325632.0, + "grad_norm": 0.19317362931311696, + "language_loss": 1.13336241, + "learning_rate": 0.0007786015338021173, + "loss": 1.14775467, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.63183594, + "step": 51, + "time_per_iteration": 2.670414924621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01421394, + "balance_loss_mlp": 1.36116982, + "epoch": 0.010003847633705272, + "flos": 535608087552.0, + "grad_norm": 0.10636608126159033, + "language_loss": 1.06046486, + "learning_rate": 0.0007824468089603051, + "loss": 1.0746789, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.60205078, + "step": 52, + "time_per_iteration": 2.650749683380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01398771, + "balance_loss_mlp": 1.34627175, + "epoch": 0.010196229318968833, + "flos": 908867907072.0, + "grad_norm": 0.08734537144859746, + "language_loss": 1.05057502, + "learning_rate": 0.0007862188363098669, + "loss": 1.0645628, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.52587891, + "step": 53, + "time_per_iteration": 3.1914114952087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339892, + "balance_loss_mlp": 1.29123116, + "epoch": 0.010388611004232396, + "flos": 585594040320.0, + "grad_norm": 0.12892942806844523, + "language_loss": 1.05977488, + "learning_rate": 0.0007899203543304438, + "loss": 1.07317376, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.48608398, + "step": 54, + "time_per_iteration": 2.7370150089263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129116, + "balance_loss_mlp": 1.24609876, + "epoch": 0.01058099268949596, + "flos": 502233716736.0, + "grad_norm": 0.10351520483586135, + "language_loss": 1.19524932, + "learning_rate": 0.0007935539507422731, + "loss": 1.20816088, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.45068359, + "step": 55, + "time_per_iteration": 2.5938258171081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241218, + "balance_loss_mlp": 1.19842196, + "epoch": 0.010773374374759523, + "flos": 544170659328.0, + "grad_norm": 0.14579553174668378, + "language_loss": 1.11398613, + "learning_rate": 0.0007971220733732573, + "loss": 1.12639832, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.42822266, + "step": 56, + "time_per_iteration": 2.69441556930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214647, + "balance_loss_mlp": 1.1754272, + "epoch": 0.010965756060023086, + "flos": 525874235904.0, + "grad_norm": 0.08690334212617827, + "language_loss": 1.05753016, + "learning_rate": 0.0008006270400641869, + "loss": 1.06967664, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.39208984, + "step": 57, + "time_per_iteration": 2.72200345993042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172174, + "balance_loss_mlp": 1.13638771, + "epoch": 0.011158137745286649, + "flos": 576653147136.0, + "grad_norm": 0.1589230608581115, + "language_loss": 1.07195449, + "learning_rate": 0.0008040710477125043, + "loss": 1.08367622, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.35791016, + "step": 58, + "time_per_iteration": 2.7268636226654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116856, + "balance_loss_mlp": 1.13193893, + "epoch": 0.011350519430550212, + "flos": 529024310784.0, + "grad_norm": 0.10215076611006164, + "language_loss": 1.07557666, + "learning_rate": 0.0008074561805429771, + "loss": 1.08726227, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.36645508, + "step": 59, + "time_per_iteration": 2.6336522102355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116508, + "balance_loss_mlp": 1.13067603, + "epoch": 0.011542901115813775, + "flos": 555608291328.0, + "grad_norm": 0.1141641229712409, + "language_loss": 1.06040812, + "learning_rate": 0.0008107844176832545, + "loss": 1.07205892, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.34399414, + "step": 60, + "time_per_iteration": 2.6922121047973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181661, + "balance_loss_mlp": 1.14883125, + "epoch": 0.011735282801077338, + "flos": 571826995200.0, + "grad_norm": 0.13546354224487772, + "language_loss": 1.07509732, + "learning_rate": 0.0008140576401132568, + "loss": 1.08691382, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.32836914, + "step": 61, + "time_per_iteration": 2.632707357406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183098, + "balance_loss_mlp": 1.15415382, + "epoch": 0.0119276644863409, + "flos": 615309156864.0, + "grad_norm": 0.21921646489667587, + "language_loss": 1.08552384, + "learning_rate": 0.0008172776370494935, + "loss": 1.09735489, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.28955078, + "step": 62, + "time_per_iteration": 2.736295700073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169478, + "balance_loss_mlp": 1.14103436, + "epoch": 0.012120046171604464, + "flos": 500835474432.0, + "grad_norm": 0.08851801033761798, + "language_loss": 1.15278125, + "learning_rate": 0.0008204461118185703, + "loss": 1.16447616, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.28417969, + "step": 63, + "time_per_iteration": 2.6189370155334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164028, + "balance_loss_mlp": 1.13801682, + "epoch": 0.012312427856868027, + "flos": 473109327360.0, + "grad_norm": 0.09949063345381139, + "language_loss": 1.0443747, + "learning_rate": 0.0008235646872681536, + "loss": 1.05601501, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.26025391, + "step": 64, + "time_per_iteration": 2.5901291370391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163729, + "balance_loss_mlp": 1.13857555, + "epoch": 0.012504809542131588, + "flos": 538094651904.0, + "grad_norm": 0.13431360680602436, + "language_loss": 1.04092753, + "learning_rate": 0.0008266349107584288, + "loss": 1.05256474, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.25146484, + "step": 65, + "time_per_iteration": 2.6860554218292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162931, + "balance_loss_mlp": 1.13891053, + "epoch": 0.012697191227395151, + "flos": 608450365440.0, + "grad_norm": 0.1102068865315058, + "language_loss": 1.07257366, + "learning_rate": 0.0008296582587724851, + "loss": 1.08420289, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.24023438, + "step": 66, + "time_per_iteration": 2.7269198894500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160507, + "balance_loss_mlp": 1.1370945, + "epoch": 0.012889572912658714, + "flos": 767750607360.0, + "grad_norm": 0.08100484164865049, + "language_loss": 1.05156851, + "learning_rate": 0.0008326361411800136, + "loss": 1.06317365, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.23400879, + "step": 67, + "time_per_iteration": 2.984511613845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164469, + "balance_loss_mlp": 1.14209354, + "epoch": 0.013081954597922277, + "flos": 533604561408.0, + "grad_norm": 0.7331609098323609, + "language_loss": 1.05716372, + "learning_rate": 0.0008355699051851403, + "loss": 1.06880832, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.22363281, + "step": 68, + "time_per_iteration": 2.7606749534606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236513, + "balance_loss_mlp": 1.21256447, + "epoch": 0.01327433628318584, + "flos": 572826567168.0, + "grad_norm": 0.09768789722348739, + "language_loss": 1.12206995, + "learning_rate": 0.0008384608389860635, + "loss": 1.13443518, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.23950195, + "step": 69, + "time_per_iteration": 2.687361001968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308666, + "balance_loss_mlp": 1.28513408, + "epoch": 0.013466717968449404, + "flos": 497029243392.0, + "grad_norm": 0.20600635395561306, + "language_loss": 1.02831006, + "learning_rate": 0.000841310175171381, + "loss": 1.04139686, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.23510742, + "step": 70, + "time_per_iteration": 2.5935816764831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01326501, + "balance_loss_mlp": 1.30259991, + "epoch": 0.013659099653712967, + "flos": 565234454016.0, + "grad_norm": 0.21749814226597305, + "language_loss": 1.00826097, + "learning_rate": 0.000844119093875517, + "loss": 1.0215261, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.2388916, + "step": 71, + "time_per_iteration": 2.706749439239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327473, + "balance_loss_mlp": 1.30280876, + "epoch": 0.01385148133897653, + "flos": 573540950016.0, + "grad_norm": 0.15663283615990556, + "language_loss": 1.06174731, + "learning_rate": 0.0008468887257134666, + "loss": 1.0750221, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.24682617, + "step": 72, + "time_per_iteration": 2.6893503665924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01307936, + "balance_loss_mlp": 1.28290248, + "epoch": 0.014043863024240093, + "flos": 576539665920.0, + "grad_norm": 0.165113983041647, + "language_loss": 1.08480573, + "learning_rate": 0.0008496201545131264, + "loss": 1.09788513, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.25012207, + "step": 73, + "time_per_iteration": 2.722555637359619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228575, + "balance_loss_mlp": 1.20456624, + "epoch": 0.014236244709503656, + "flos": 938287660032.0, + "grad_norm": 0.08819174949442792, + "language_loss": 1.05711758, + "learning_rate": 0.0008523144198617317, + "loss": 1.06940317, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.2401123, + "step": 74, + "time_per_iteration": 3.1970512866973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197377, + "balance_loss_mlp": 1.17341638, + "epoch": 0.014428626394767219, + "flos": 528231352320.0, + "grad_norm": 0.4509181854760719, + "language_loss": 1.05384588, + "learning_rate": 0.0008549725194813783, + "loss": 1.06581974, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.23962402, + "step": 75, + "time_per_iteration": 2.6595916748046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153799, + "balance_loss_mlp": 1.13126826, + "epoch": 0.014621008080030782, + "flos": 803371433472.0, + "grad_norm": 0.13717241934186405, + "language_loss": 1.0561651, + "learning_rate": 0.0008575954114472099, + "loss": 1.06770301, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.2253418, + "step": 76, + "time_per_iteration": 3.126678943634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146131, + "balance_loss_mlp": 1.12299228, + "epoch": 0.014813389765294343, + "flos": 696588788736.0, + "grad_norm": 0.24880809118993477, + "language_loss": 1.04725742, + "learning_rate": 0.0008601840162606118, + "loss": 1.05871868, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.23132324, + "step": 77, + "time_per_iteration": 3.0479044914245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125538, + "balance_loss_mlp": 1.10244715, + "epoch": 0.015005771450557906, + "flos": 596702813184.0, + "grad_norm": 0.18599993070264256, + "language_loss": 1.10793126, + "learning_rate": 0.000862739218788641, + "loss": 1.11918664, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.23083496, + "step": 78, + "time_per_iteration": 2.8093104362487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206073, + "balance_loss_mlp": 1.18093228, + "epoch": 0.01519815313582147, + "flos": 549148170240.0, + "grad_norm": 0.1007392116308827, + "language_loss": 1.07089067, + "learning_rate": 0.0008652618700799138, + "loss": 1.08295143, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.25146484, + "step": 79, + "time_per_iteration": 2.657278060913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312607, + "balance_loss_mlp": 1.28511751, + "epoch": 0.015390534821085032, + "flos": 430306642944.0, + "grad_norm": 0.10464806869950885, + "language_loss": 1.06340718, + "learning_rate": 0.0008677527890662774, + "loss": 1.07653332, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.27514648, + "step": 80, + "time_per_iteration": 2.541733741760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01403725, + "balance_loss_mlp": 1.37456632, + "epoch": 0.015582916506348595, + "flos": 523854743040.0, + "grad_norm": 0.15378710965831335, + "language_loss": 1.0758636, + "learning_rate": 0.0008702127641587799, + "loss": 1.08990085, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.29125977, + "step": 81, + "time_per_iteration": 2.6628620624542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01387899, + "balance_loss_mlp": 1.36045754, + "epoch": 0.015775298191612157, + "flos": 575151598080.0, + "grad_norm": 0.16587297874586884, + "language_loss": 1.02605438, + "learning_rate": 0.0008726425547457192, + "loss": 1.03993344, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.27490234, + "step": 82, + "time_per_iteration": 2.774146795272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365036, + "balance_loss_mlp": 1.34021688, + "epoch": 0.01596767987687572, + "flos": 610040664576.0, + "grad_norm": 0.16158882984955267, + "language_loss": 1.02648211, + "learning_rate": 0.0008750428925998964, + "loss": 1.04013252, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.24829102, + "step": 83, + "time_per_iteration": 2.745786190032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01321379, + "balance_loss_mlp": 1.29746556, + "epoch": 0.016160061562139283, + "flos": 566864040960.0, + "grad_norm": 0.12210664974135504, + "language_loss": 1.08113122, + "learning_rate": 0.0008774144832015932, + "loss": 1.09434509, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.23937988, + "step": 84, + "time_per_iteration": 2.695239543914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01701738, + "balance_loss_mlp": 1.6791358, + "epoch": 0.016352443247402846, + "flos": 1410557234688.0, + "grad_norm": 0.2213803749296612, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76476049, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.22558594, + "step": 85, + "time_per_iteration": 4.597177982330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228049, + "balance_loss_mlp": 1.20597172, + "epoch": 0.01654482493266641, + "flos": 730177127424.0, + "grad_norm": 0.08119704963525505, + "language_loss": 1.03748381, + "learning_rate": 0.0008820741205014318, + "loss": 1.04976428, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.22070312, + "step": 86, + "time_per_iteration": 2.881804943084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193732, + "balance_loss_mlp": 1.17282319, + "epoch": 0.016737206617929972, + "flos": 536016932352.0, + "grad_norm": 0.06752942516789381, + "language_loss": 1.04735541, + "learning_rate": 0.0008843634575408404, + "loss": 1.05929279, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.20922852, + "step": 87, + "time_per_iteration": 2.681497812271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197059, + "balance_loss_mlp": 1.17523217, + "epoch": 0.016929588303193535, + "flos": 536706584064.0, + "grad_norm": 0.068849585693396, + "language_loss": 1.06270838, + "learning_rate": 0.0008866266301555082, + "loss": 1.0746789, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.21826172, + "step": 88, + "time_per_iteration": 2.7393336296081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188909, + "balance_loss_mlp": 1.16813099, + "epoch": 0.017121969988457098, + "flos": 526498458624.0, + "grad_norm": 0.11163273932728453, + "language_loss": 1.06937528, + "learning_rate": 0.0008888642296509615, + "loss": 1.08126438, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.20776367, + "step": 89, + "time_per_iteration": 2.5859603881835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190142, + "balance_loss_mlp": 1.16919696, + "epoch": 0.01731435167372066, + "flos": 625304876544.0, + "grad_norm": 0.08151329596812326, + "language_loss": 1.11272717, + "learning_rate": 0.0008910768275115906, + "loss": 1.12462866, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.20947266, + "step": 90, + "time_per_iteration": 2.7672746181488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188461, + "balance_loss_mlp": 1.16750431, + "epoch": 0.017506733358984224, + "flos": 496157709312.0, + "grad_norm": 0.10059554630111206, + "language_loss": 1.06862557, + "learning_rate": 0.0008932649762767675, + "loss": 1.08051026, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.20947266, + "step": 91, + "time_per_iteration": 2.5685906410217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164871, + "balance_loss_mlp": 1.14348471, + "epoch": 0.017699115044247787, + "flos": 745613047296.0, + "grad_norm": 0.10996439779682221, + "language_loss": 1.10012543, + "learning_rate": 0.0008954292103690864, + "loss": 1.11177421, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.21398926, + "step": 92, + "time_per_iteration": 2.974438428878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164995, + "balance_loss_mlp": 1.14351392, + "epoch": 0.01789149672951135, + "flos": 515257265664.0, + "grad_norm": 0.07660536936337886, + "language_loss": 1.12072349, + "learning_rate": 0.0008975700468778296, + "loss": 1.13237333, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.21496582, + "step": 93, + "time_per_iteration": 2.5806186199188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162029, + "balance_loss_mlp": 1.14116728, + "epoch": 0.018083878414774913, + "flos": 585850116096.0, + "grad_norm": 0.0766138268717318, + "language_loss": 1.04864383, + "learning_rate": 0.0008996879863005366, + "loss": 1.06026423, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.20874023, + "step": 94, + "time_per_iteration": 2.6688339710235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153951, + "balance_loss_mlp": 1.13311303, + "epoch": 0.018276260100038477, + "flos": 497103436800.0, + "grad_norm": 0.05852633811132637, + "language_loss": 1.05006421, + "learning_rate": 0.0009017835132453337, + "loss": 1.06160367, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.20849609, + "step": 95, + "time_per_iteration": 2.5905888080596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168717, + "balance_loss_mlp": 1.14677107, + "epoch": 0.01846864178530204, + "flos": 639765955584.0, + "grad_norm": 0.10434292302548942, + "language_loss": 1.05011988, + "learning_rate": 0.0009038570970964896, + "loss": 1.06180692, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.21960449, + "step": 96, + "time_per_iteration": 2.819176197052002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143672, + "balance_loss_mlp": 1.12233388, + "epoch": 0.018661023470565603, + "flos": 511411746816.0, + "grad_norm": 0.06578690538752763, + "language_loss": 1.02219808, + "learning_rate": 0.0009059091926454854, + "loss": 1.0336349, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.21362305, + "step": 97, + "time_per_iteration": 2.6332285404205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128399, + "balance_loss_mlp": 1.10685802, + "epoch": 0.018853405155829166, + "flos": 930710103552.0, + "grad_norm": 0.06319745463615938, + "language_loss": 1.01510525, + "learning_rate": 0.0009079402406897198, + "loss": 1.02638912, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.2154541, + "step": 98, + "time_per_iteration": 3.231128454208374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115682, + "balance_loss_mlp": 1.09365261, + "epoch": 0.01904578684109273, + "flos": 576209396736.0, + "grad_norm": 0.08014689887623593, + "language_loss": 1.0309999, + "learning_rate": 0.0009099506686008212, + "loss": 1.0421567, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.22045898, + "step": 99, + "time_per_iteration": 2.7899162769317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108676, + "balance_loss_mlp": 1.08750439, + "epoch": 0.019238168526356292, + "flos": 558173431296.0, + "grad_norm": 0.07479046847477189, + "language_loss": 1.06245041, + "learning_rate": 0.0009119408908644013, + "loss": 1.07353711, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.21179199, + "step": 100, + "time_per_iteration": 2.76654314994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111244, + "balance_loss_mlp": 1.09094632, + "epoch": 0.019430550211619855, + "flos": 723539506176.0, + "grad_norm": 0.1293510891653682, + "language_loss": 1.11089611, + "learning_rate": 0.0009139113095929519, + "loss": 1.12202048, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.21496582, + "step": 101, + "time_per_iteration": 2.9448165893554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113187, + "balance_loss_mlp": 1.09134769, + "epoch": 0.019622931896883418, + "flos": 499235000832.0, + "grad_norm": 0.0662757157914564, + "language_loss": 1.05513644, + "learning_rate": 0.0009158623150134762, + "loss": 1.06626844, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.21838379, + "step": 102, + "time_per_iteration": 2.561089277267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132871, + "balance_loss_mlp": 1.11103153, + "epoch": 0.01981531358214698, + "flos": 508916418048.0, + "grad_norm": 0.12924626158025887, + "language_loss": 1.05462444, + "learning_rate": 0.000917794285931332, + "loss": 1.06595314, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.21850586, + "step": 103, + "time_per_iteration": 2.6556921005249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150426, + "balance_loss_mlp": 1.12918282, + "epoch": 0.020007695267410544, + "flos": 521087371776.0, + "grad_norm": 0.12259017558591545, + "language_loss": 0.9774698, + "learning_rate": 0.0009197075901716639, + "loss": 0.98897398, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.21264648, + "step": 104, + "time_per_iteration": 2.721444845199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141997, + "balance_loss_mlp": 1.12036085, + "epoch": 0.020200076952674107, + "flos": 533013834240.0, + "grad_norm": 0.06848283791602199, + "language_loss": 1.07568073, + "learning_rate": 0.0009216025849997171, + "loss": 1.08710074, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.21655273, + "step": 105, + "time_per_iteration": 2.785515785217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138939, + "balance_loss_mlp": 1.11743319, + "epoch": 0.020392458637937667, + "flos": 684430981632.0, + "grad_norm": 0.05548353541402364, + "language_loss": 1.02272427, + "learning_rate": 0.0009234796175212258, + "loss": 1.03411365, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.21520996, + "step": 106, + "time_per_iteration": 2.917363166809082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131099, + "balance_loss_mlp": 1.10940301, + "epoch": 0.02058484032320123, + "flos": 701791852032.0, + "grad_norm": 0.08012311925806644, + "language_loss": 1.06108189, + "learning_rate": 0.000925339025064007, + "loss": 1.07239294, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.21691895, + "step": 107, + "time_per_iteration": 2.9934780597686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137249, + "balance_loss_mlp": 1.11515951, + "epoch": 0.020777222008464793, + "flos": 638772175872.0, + "grad_norm": 0.050481524705402105, + "language_loss": 0.98984301, + "learning_rate": 0.0009271811355418027, + "loss": 1.00121546, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.2208252, + "step": 108, + "time_per_iteration": 2.8833272457122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119308, + "balance_loss_mlp": 1.09725404, + "epoch": 0.020969603693728356, + "flos": 681785856000.0, + "grad_norm": 0.04498034405706927, + "language_loss": 1.05478954, + "learning_rate": 0.0009290062678013548, + "loss": 1.06598258, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.22058105, + "step": 109, + "time_per_iteration": 2.839287042617798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126832, + "balance_loss_mlp": 1.1043849, + "epoch": 0.02116198537899192, + "flos": 533140462080.0, + "grad_norm": 0.08965534617549129, + "language_loss": 1.03900754, + "learning_rate": 0.0009308147319536321, + "loss": 1.0502758, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.2244873, + "step": 110, + "time_per_iteration": 2.664785385131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127322, + "balance_loss_mlp": 1.10527992, + "epoch": 0.021354367064255482, + "flos": 717168135168.0, + "grad_norm": 0.07991094573250712, + "language_loss": 1.10446882, + "learning_rate": 0.0009326068296900676, + "loss": 1.11574197, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.22045898, + "step": 111, + "time_per_iteration": 2.826704502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118777, + "balance_loss_mlp": 1.09644949, + "epoch": 0.021546748749519045, + "flos": 519290459136.0, + "grad_norm": 0.05764113319631223, + "language_loss": 1.01306438, + "learning_rate": 0.0009343828545846161, + "loss": 1.02425218, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.2232666, + "step": 112, + "time_per_iteration": 2.774557113647461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130616, + "balance_loss_mlp": 1.10844338, + "epoch": 0.021739130434782608, + "flos": 504912337920.0, + "grad_norm": 0.11711254624088742, + "language_loss": 1.04517794, + "learning_rate": 0.0009361430923823841, + "loss": 1.0564841, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.22192383, + "step": 113, + "time_per_iteration": 2.5728189945220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143235, + "balance_loss_mlp": 1.12140775, + "epoch": 0.02193151212004617, + "flos": 463251820032.0, + "grad_norm": 0.09177669908726471, + "language_loss": 1.08950138, + "learning_rate": 0.0009378878212755459, + "loss": 1.10093367, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.21826172, + "step": 114, + "time_per_iteration": 2.4959068298339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118923, + "balance_loss_mlp": 1.09746575, + "epoch": 0.022123893805309734, + "flos": 552008673792.0, + "grad_norm": 0.05600308486582556, + "language_loss": 0.98889154, + "learning_rate": 0.0009396173121672103, + "loss": 1.00008082, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.21472168, + "step": 115, + "time_per_iteration": 2.687619924545288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131221, + "balance_loss_mlp": 1.11031187, + "epoch": 0.022316275490573297, + "flos": 635920436736.0, + "grad_norm": 0.06813536890625224, + "language_loss": 1.0438683, + "learning_rate": 0.0009413318289238633, + "loss": 1.05518055, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.20922852, + "step": 116, + "time_per_iteration": 2.7658987045288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115598, + "balance_loss_mlp": 1.09498656, + "epoch": 0.02250865717583686, + "flos": 798535107072.0, + "grad_norm": 0.10996119273554948, + "language_loss": 0.97187698, + "learning_rate": 0.0009430316286169771, + "loss": 0.98303294, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.20617676, + "step": 117, + "time_per_iteration": 3.027139186859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120875, + "balance_loss_mlp": 1.10050249, + "epoch": 0.022701038861100423, + "flos": 455851763712.0, + "grad_norm": 0.06369887166042827, + "language_loss": 1.02379179, + "learning_rate": 0.0009447169617543361, + "loss": 1.03500056, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.20373535, + "step": 118, + "time_per_iteration": 2.619460344314575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114928, + "balance_loss_mlp": 1.09472179, + "epoch": 0.022893420546363986, + "flos": 582812112384.0, + "grad_norm": 0.07832492020107534, + "language_loss": 1.08849907, + "learning_rate": 0.0009463880725016029, + "loss": 1.09964836, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.20214844, + "step": 119, + "time_per_iteration": 2.689627170562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108375, + "balance_loss_mlp": 1.08852673, + "epoch": 0.02308580223162755, + "flos": 561010613760.0, + "grad_norm": 0.05815728344132157, + "language_loss": 1.03645778, + "learning_rate": 0.0009480451988946134, + "loss": 1.0475415, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.19848633, + "step": 120, + "time_per_iteration": 2.8202247619628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111521, + "balance_loss_mlp": 1.09197092, + "epoch": 0.023278183916891113, + "flos": 770966111232.0, + "grad_norm": 0.09156908943756899, + "language_loss": 1.05033565, + "learning_rate": 0.0009496885730428627, + "loss": 1.06145096, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.1953125, + "step": 121, + "time_per_iteration": 3.060826539993286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111873, + "balance_loss_mlp": 1.09195304, + "epoch": 0.023470565602154676, + "flos": 553111552512.0, + "grad_norm": 0.07227042142752892, + "language_loss": 1.03125668, + "learning_rate": 0.0009513184213246156, + "loss": 1.04237533, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.19909668, + "step": 122, + "time_per_iteration": 2.693777322769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116574, + "balance_loss_mlp": 1.09648705, + "epoch": 0.02366294728741824, + "flos": 559744791552.0, + "grad_norm": 0.10676768106860933, + "language_loss": 1.06918037, + "learning_rate": 0.0009529349645740552, + "loss": 1.08034611, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.20080566, + "step": 123, + "time_per_iteration": 2.7788801193237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108243, + "balance_loss_mlp": 1.0888958, + "epoch": 0.0238553289726818, + "flos": 468313698816.0, + "grad_norm": 0.06448608913203197, + "language_loss": 1.05440235, + "learning_rate": 0.0009545384182608524, + "loss": 1.06548476, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.19335938, + "step": 124, + "time_per_iteration": 2.542592763900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125435, + "balance_loss_mlp": 1.10582459, + "epoch": 0.024047710657945365, + "flos": 559763730432.0, + "grad_norm": 0.07866021425619718, + "language_loss": 1.03027701, + "learning_rate": 0.0009561289926625252, + "loss": 1.04153132, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.19604492, + "step": 125, + "time_per_iteration": 2.790811538696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114447, + "balance_loss_mlp": 1.09582675, + "epoch": 0.024240092343208928, + "flos": 504528224256.0, + "grad_norm": 0.05023162105608455, + "language_loss": 1.0775013, + "learning_rate": 0.0009577068930299292, + "loss": 1.08864582, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.1862793, + "step": 126, + "time_per_iteration": 2.634744882583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131099, + "balance_loss_mlp": 1.11309838, + "epoch": 0.02443247402847249, + "flos": 435516908544.0, + "grad_norm": 0.11313548721486262, + "language_loss": 1.02903807, + "learning_rate": 0.0009592723197462087, + "loss": 1.04034901, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.18017578, + "step": 127, + "time_per_iteration": 2.673091411590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135261, + "balance_loss_mlp": 1.11693859, + "epoch": 0.024624855713736054, + "flos": 683445966336.0, + "grad_norm": 0.09449576280815732, + "language_loss": 0.99720573, + "learning_rate": 0.0009608254684795125, + "loss": 1.00855827, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.18334961, + "step": 128, + "time_per_iteration": 2.9315080642700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125442, + "balance_loss_mlp": 1.10695267, + "epoch": 0.024817237398999614, + "flos": 524721894912.0, + "grad_norm": 0.06510984253988934, + "language_loss": 1.02999425, + "learning_rate": 0.0009623665303297678, + "loss": 1.04124868, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.18493652, + "step": 129, + "time_per_iteration": 2.7419071197509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109401, + "balance_loss_mlp": 1.09171033, + "epoch": 0.025009619084263177, + "flos": 655350262272.0, + "grad_norm": 0.11817944884573778, + "language_loss": 1.06827164, + "learning_rate": 0.0009638956919697878, + "loss": 1.07936561, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.17712402, + "step": 130, + "time_per_iteration": 2.898789405822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109182, + "balance_loss_mlp": 1.09152734, + "epoch": 0.02520200076952674, + "flos": 454187271168.0, + "grad_norm": 0.08339763042198223, + "language_loss": 0.98782563, + "learning_rate": 0.0009654131357809714, + "loss": 0.99891746, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.17663574, + "step": 131, + "time_per_iteration": 2.5997226238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110117, + "balance_loss_mlp": 1.09165168, + "epoch": 0.025394382454790303, + "flos": 839427397632.0, + "grad_norm": 0.07600036723868295, + "language_loss": 1.07807457, + "learning_rate": 0.0009669190399838441, + "loss": 1.08917582, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.18469238, + "step": 132, + "time_per_iteration": 3.099355459213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123354, + "balance_loss_mlp": 1.10540128, + "epoch": 0.025586764140053866, + "flos": 580725628416.0, + "grad_norm": 0.1018451896089413, + "language_loss": 1.01215065, + "learning_rate": 0.0009684135787636724, + "loss": 1.02338421, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.17956543, + "step": 133, + "time_per_iteration": 2.8484303951263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110859, + "balance_loss_mlp": 1.09306097, + "epoch": 0.02577914582531743, + "flos": 789893959680.0, + "grad_norm": 0.0768854449505878, + "language_loss": 1.05274129, + "learning_rate": 0.0009698969223913726, + "loss": 1.06384993, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.17822266, + "step": 134, + "time_per_iteration": 3.0583713054656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099954, + "balance_loss_mlp": 1.08200145, + "epoch": 0.025971527510580992, + "flos": 594683320320.0, + "grad_norm": 0.06563028697143787, + "language_loss": 1.07862437, + "learning_rate": 0.0009713692373399265, + "loss": 1.08962393, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.1796875, + "step": 135, + "time_per_iteration": 2.6854658126831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01638015, + "balance_loss_mlp": 1.62485397, + "epoch": 0.026163909195844555, + "flos": 1576771522560.0, + "grad_norm": 0.19726256755033653, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81094241, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.13183594, + "step": 136, + "time_per_iteration": 5.296766042709351 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01420299, + "balance_loss_mlp": 1.40761507, + "epoch": 0.026356290881108118, + "flos": 1501306030080.0, + "grad_norm": 0.11305854818728235, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.7923134, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.12695312, + "step": 137, + "time_per_iteration": 4.982319355010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113827, + "balance_loss_mlp": 1.12156892, + "epoch": 0.02654867256637168, + "flos": 596841025536.0, + "grad_norm": 0.17869099152539902, + "language_loss": 1.01327038, + "learning_rate": 0.0009757216201974225, + "loss": 1.02465308, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.16699219, + "step": 138, + "time_per_iteration": 2.8622727394104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186505, + "balance_loss_mlp": 1.16889763, + "epoch": 0.026741054251635244, + "flos": 544761386496.0, + "grad_norm": 0.08591345057859309, + "language_loss": 1.05914044, + "learning_rate": 0.0009771514130396581, + "loss": 1.07100558, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.17614746, + "step": 139, + "time_per_iteration": 2.67812442779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120454, + "balance_loss_mlp": 1.18700433, + "epoch": 0.026933435936898807, + "flos": 506591387136.0, + "grad_norm": 0.10724594122721719, + "language_loss": 1.05634308, + "learning_rate": 0.00097857095638274, + "loss": 1.06838858, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.17541504, + "step": 140, + "time_per_iteration": 2.597321033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120509, + "balance_loss_mlp": 1.1880548, + "epoch": 0.02712581762216237, + "flos": 740513290752.0, + "grad_norm": 0.08882077115516282, + "language_loss": 0.97595245, + "learning_rate": 0.0009799803961288726, + "loss": 0.98800337, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.17053223, + "step": 141, + "time_per_iteration": 3.017937421798706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177855, + "balance_loss_mlp": 1.16135645, + "epoch": 0.027318199307425933, + "flos": 848023464960.0, + "grad_norm": 0.07711499257167788, + "language_loss": 1.03052521, + "learning_rate": 0.000981379875086876, + "loss": 1.04230392, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.16491699, + "step": 142, + "time_per_iteration": 3.0336825847625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154055, + "balance_loss_mlp": 1.13728189, + "epoch": 0.027510580992689496, + "flos": 575288400384.0, + "grad_norm": 0.06449204224600169, + "language_loss": 0.98759103, + "learning_rate": 0.0009827695330590185, + "loss": 0.99913156, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.16784668, + "step": 143, + "time_per_iteration": 2.635596990585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131533, + "balance_loss_mlp": 1.11474872, + "epoch": 0.02770296267795306, + "flos": 772079164416.0, + "grad_norm": 0.07528415949234718, + "language_loss": 0.98083055, + "learning_rate": 0.0009841495069248256, + "loss": 0.9921459, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.16796875, + "step": 144, + "time_per_iteration": 2.9648232460021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123928, + "balance_loss_mlp": 1.10686922, + "epoch": 0.027895344363216622, + "flos": 569123642880.0, + "grad_norm": 0.10995634154815045, + "language_loss": 0.97452384, + "learning_rate": 0.0009855199307219871, + "loss": 0.98576319, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.1706543, + "step": 145, + "time_per_iteration": 2.6601850986480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113445, + "balance_loss_mlp": 1.09632671, + "epoch": 0.028087726048480186, + "flos": 547099564032.0, + "grad_norm": 0.09468853295775125, + "language_loss": 0.98972148, + "learning_rate": 0.0009868809357244854, + "loss": 1.00085592, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.17138672, + "step": 146, + "time_per_iteration": 2.7714684009552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109349, + "balance_loss_mlp": 1.09192085, + "epoch": 0.02828010773374375, + "flos": 524519663616.0, + "grad_norm": 0.08177620360389791, + "language_loss": 1.02921426, + "learning_rate": 0.0009882326505180556, + "loss": 1.04030776, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.17443848, + "step": 147, + "time_per_iteration": 2.6678037643432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121816, + "balance_loss_mlp": 1.10459065, + "epoch": 0.02847248941900731, + "flos": 772108277760.0, + "grad_norm": 0.15200564524835, + "language_loss": 1.01768231, + "learning_rate": 0.0009895752010730906, + "loss": 1.02890062, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.17236328, + "step": 148, + "time_per_iteration": 2.944622755050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139991, + "balance_loss_mlp": 1.12333786, + "epoch": 0.028664871104270875, + "flos": 534150208512.0, + "grad_norm": 0.10043611919636293, + "language_loss": 1.0762012, + "learning_rate": 0.0009909087108150867, + "loss": 1.08760118, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.16662598, + "step": 149, + "time_per_iteration": 2.730631113052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123808, + "balance_loss_mlp": 1.10708272, + "epoch": 0.028857252789534438, + "flos": 367557599232.0, + "grad_norm": 0.08772923811196923, + "language_loss": 1.08558857, + "learning_rate": 0.0009922333006927371, + "loss": 1.09682679, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.1673584, + "step": 150, + "time_per_iteration": 2.5662901401519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108027, + "balance_loss_mlp": 1.09107542, + "epoch": 0.029049634474798, + "flos": 515232534528.0, + "grad_norm": 0.10678098958344774, + "language_loss": 1.02281368, + "learning_rate": 0.0009935490892437632, + "loss": 1.03389382, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.16967773, + "step": 151, + "time_per_iteration": 2.573842763900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110516, + "balance_loss_mlp": 1.0892458, + "epoch": 0.029242016160061564, + "flos": 587840495616.0, + "grad_norm": 0.07022496172976629, + "language_loss": 1.00216019, + "learning_rate": 0.0009948561926585687, + "loss": 1.01321173, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.15905762, + "step": 152, + "time_per_iteration": 2.762035608291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101658, + "balance_loss_mlp": 1.08582664, + "epoch": 0.029434397845325123, + "flos": 551816616960.0, + "grad_norm": 0.08132441134663608, + "language_loss": 1.04400539, + "learning_rate": 0.0009961547248418122, + "loss": 1.05502188, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.15820312, + "step": 153, + "time_per_iteration": 2.6525814533233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092159, + "balance_loss_mlp": 1.07619703, + "epoch": 0.029626779530588686, + "flos": 603221160960.0, + "grad_norm": 0.064379562707883, + "language_loss": 1.01020789, + "learning_rate": 0.0009974447974719707, + "loss": 1.02112949, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.1595459, + "step": 154, + "time_per_iteration": 2.814805746078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011076, + "balance_loss_mlp": 1.09136379, + "epoch": 0.02981916121585225, + "flos": 620808993792.0, + "grad_norm": 0.09363682514066085, + "language_loss": 1.02673674, + "learning_rate": 0.0009987265200589763, + "loss": 1.03781271, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.16235352, + "step": 155, + "time_per_iteration": 2.7394251823425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083912, + "balance_loss_mlp": 1.06821227, + "epoch": 0.030011542901115813, + "flos": 661322962944.0, + "grad_norm": 0.05837038305695058, + "language_loss": 1.02287054, + "learning_rate": 0.001, + "loss": 1.03370976, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.15686035, + "step": 156, + "time_per_iteration": 2.875622272491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091789, + "balance_loss_mlp": 1.07507551, + "epoch": 0.030203924586379376, + "flos": 651258842112.0, + "grad_norm": 0.08525763952586639, + "language_loss": 1.00171304, + "learning_rate": 0.0009999999029413921, + "loss": 1.01263094, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.16723633, + "step": 157, + "time_per_iteration": 2.8360915184020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110493, + "balance_loss_mlp": 1.09382772, + "epoch": 0.03039630627164294, + "flos": 531083091456.0, + "grad_norm": 0.08254544257661527, + "language_loss": 1.01840436, + "learning_rate": 0.0009999996117656068, + "loss": 1.02950931, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.16674805, + "step": 158, + "time_per_iteration": 2.801180124282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096617, + "balance_loss_mlp": 1.08086896, + "epoch": 0.030588687956906502, + "flos": 585914135040.0, + "grad_norm": 0.070993780506174, + "language_loss": 0.95558536, + "learning_rate": 0.0009999991264727564, + "loss": 0.96655154, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.15734863, + "step": 159, + "time_per_iteration": 2.818821668624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096163, + "balance_loss_mlp": 1.08046305, + "epoch": 0.030781069642170065, + "flos": 513026777088.0, + "grad_norm": 0.07077353312716703, + "language_loss": 1.06054807, + "learning_rate": 0.0009999984470630296, + "loss": 1.0715096, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.15686035, + "step": 160, + "time_per_iteration": 2.6040687561035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109657, + "balance_loss_mlp": 1.08113289, + "epoch": 0.030973451327433628, + "flos": 717766064640.0, + "grad_norm": 0.055279151578571405, + "language_loss": 0.94481659, + "learning_rate": 0.0009999975735366902, + "loss": 0.95578229, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.1541748, + "step": 161, + "time_per_iteration": 3.1012368202209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096261, + "balance_loss_mlp": 1.08034658, + "epoch": 0.03116583301269719, + "flos": 1109312133120.0, + "grad_norm": 0.0762466753512266, + "language_loss": 0.96279925, + "learning_rate": 0.0009999965058940775, + "loss": 0.97376186, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.15905762, + "step": 162, + "time_per_iteration": 3.5481724739074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092073, + "balance_loss_mlp": 1.07657552, + "epoch": 0.031358214697960754, + "flos": 450676403712.0, + "grad_norm": 0.0783935068916601, + "language_loss": 1.02822053, + "learning_rate": 0.0009999952441356057, + "loss": 1.03914118, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.15490723, + "step": 163, + "time_per_iteration": 2.513836145401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104121, + "balance_loss_mlp": 1.08844459, + "epoch": 0.031550596383224314, + "flos": 1254701325312.0, + "grad_norm": 0.06003254057509557, + "language_loss": 1.03039443, + "learning_rate": 0.000999993788261765, + "loss": 1.04143572, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.15661621, + "step": 164, + "time_per_iteration": 3.625434398651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097956, + "balance_loss_mlp": 1.08191097, + "epoch": 0.03174297806848788, + "flos": 667841310720.0, + "grad_norm": 0.071706058438464, + "language_loss": 1.04424524, + "learning_rate": 0.00099999213827312, + "loss": 1.0552249, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.16040039, + "step": 165, + "time_per_iteration": 2.7834768295288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111157, + "balance_loss_mlp": 1.09566009, + "epoch": 0.03193535975375144, + "flos": 551033832960.0, + "grad_norm": 0.12829100736108065, + "language_loss": 0.99657446, + "learning_rate": 0.000999990294170312, + "loss": 1.00768602, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.15478516, + "step": 166, + "time_per_iteration": 2.637387752532959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101169, + "balance_loss_mlp": 1.08545709, + "epoch": 0.032127741439015006, + "flos": 543377700864.0, + "grad_norm": 0.06852414366650764, + "language_loss": 1.03638864, + "learning_rate": 0.0009999882559540566, + "loss": 1.04740036, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.15698242, + "step": 167, + "time_per_iteration": 2.6875667572021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098336, + "balance_loss_mlp": 1.0833509, + "epoch": 0.032320123124278566, + "flos": 548104928256.0, + "grad_norm": 0.05076681603646914, + "language_loss": 1.00191641, + "learning_rate": 0.000999986023625145, + "loss": 1.01289976, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.14953613, + "step": 168, + "time_per_iteration": 2.7518744468688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03792956, + "balance_loss_mlp": 3.75500011, + "epoch": 0.03251250480954213, + "flos": 1305156865536.0, + "grad_norm": 0.6529032341502935, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.82717371, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 0.37890625, + "step": 169, + "time_per_iteration": 4.917760133743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167126, + "balance_loss_mlp": 1.15130675, + "epoch": 0.03270488649480569, + "flos": 560866609152.0, + "grad_norm": 0.09865002272530259, + "language_loss": 1.00644767, + "learning_rate": 0.0009999809766328958, + "loss": 1.01811886, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.15808105, + "step": 170, + "time_per_iteration": 2.65771746635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120248, + "balance_loss_mlp": 1.18527782, + "epoch": 0.03289726818006926, + "flos": 482120031744.0, + "grad_norm": 0.08799874436989415, + "language_loss": 1.02774751, + "learning_rate": 0.0009999781619715177, + "loss": 1.03977239, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.17211914, + "step": 171, + "time_per_iteration": 2.562926769256592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122766, + "balance_loss_mlp": 1.21033943, + "epoch": 0.03308964986533282, + "flos": 674355276288.0, + "grad_norm": 0.08542539222295185, + "language_loss": 1.02671802, + "learning_rate": 0.000999975153201402, + "loss": 1.03899455, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.17321777, + "step": 172, + "time_per_iteration": 2.8269002437591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267174, + "balance_loss_mlp": 1.24883962, + "epoch": 0.033282031550596385, + "flos": 608937785856.0, + "grad_norm": 0.120181629337785, + "language_loss": 1.00698161, + "learning_rate": 0.0009999719503237174, + "loss": 1.01965332, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.18347168, + "step": 173, + "time_per_iteration": 2.758136749267578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254087, + "balance_loss_mlp": 1.23402381, + "epoch": 0.033474413235859944, + "flos": 467801547264.0, + "grad_norm": 0.13932237496235436, + "language_loss": 1.08850026, + "learning_rate": 0.0009999685533397073, + "loss": 1.10104108, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.20056152, + "step": 174, + "time_per_iteration": 2.6060163974761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268538, + "balance_loss_mlp": 1.24870133, + "epoch": 0.03366679492112351, + "flos": 579365263872.0, + "grad_norm": 0.0855521850526334, + "language_loss": 1.01282525, + "learning_rate": 0.00099996496225069, + "loss": 1.02551055, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.19824219, + "step": 175, + "time_per_iteration": 2.6688973903656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312845, + "balance_loss_mlp": 1.29124486, + "epoch": 0.03385917660638707, + "flos": 637378315776.0, + "grad_norm": 0.0738431594221532, + "language_loss": 1.03378773, + "learning_rate": 0.0009999611770580604, + "loss": 1.04691625, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.21606445, + "step": 176, + "time_per_iteration": 2.8642566204071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01345291, + "balance_loss_mlp": 1.32329679, + "epoch": 0.03405155829165064, + "flos": 441587123712.0, + "grad_norm": 0.09985791713424727, + "language_loss": 1.02061462, + "learning_rate": 0.0009999571977632876, + "loss": 1.03406763, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.21984863, + "step": 177, + "time_per_iteration": 2.620537757873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0133899, + "balance_loss_mlp": 1.31619775, + "epoch": 0.034243939976914196, + "flos": 466097766912.0, + "grad_norm": 0.09257746092300488, + "language_loss": 1.05255055, + "learning_rate": 0.0009999530243679166, + "loss": 1.06594038, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.2277832, + "step": 178, + "time_per_iteration": 2.5526390075683594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01321119, + "balance_loss_mlp": 1.29928029, + "epoch": 0.03443632166217776, + "flos": 778919016960.0, + "grad_norm": 0.07612740556433409, + "language_loss": 1.00229979, + "learning_rate": 0.0009999486568735675, + "loss": 1.0155108, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.21850586, + "step": 179, + "time_per_iteration": 3.084320068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01314096, + "balance_loss_mlp": 1.29238796, + "epoch": 0.03462870334744132, + "flos": 1263284246016.0, + "grad_norm": 0.08380095909791664, + "language_loss": 1.00181103, + "learning_rate": 0.0009999440952819362, + "loss": 1.01495194, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.21716309, + "step": 180, + "time_per_iteration": 3.6467599868774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288371, + "balance_loss_mlp": 1.26746202, + "epoch": 0.03482108503270489, + "flos": 606899354112.0, + "grad_norm": 0.10452638314540276, + "language_loss": 1.00434995, + "learning_rate": 0.0009999393395947935, + "loss": 1.01723361, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.20935059, + "step": 181, + "time_per_iteration": 2.8092122077941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271899, + "balance_loss_mlp": 1.25226557, + "epoch": 0.03501346671796845, + "flos": 538010284032.0, + "grad_norm": 0.1078936362641923, + "language_loss": 1.03725255, + "learning_rate": 0.0009999343898139858, + "loss": 1.04997146, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.19616699, + "step": 182, + "time_per_iteration": 2.6274633407592773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260451, + "balance_loss_mlp": 1.23960137, + "epoch": 0.035205848403232015, + "flos": 518231250432.0, + "grad_norm": 0.13163794074334914, + "language_loss": 1.02352095, + "learning_rate": 0.0009999292459414348, + "loss": 1.03612542, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.20849609, + "step": 183, + "time_per_iteration": 2.5587446689605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241243, + "balance_loss_mlp": 1.22079897, + "epoch": 0.035398230088495575, + "flos": 472134486528.0, + "grad_norm": 0.11087783412260319, + "language_loss": 1.06915629, + "learning_rate": 0.0009999239079791374, + "loss": 1.08156872, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.2043457, + "step": 184, + "time_per_iteration": 2.5720386505126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264671, + "balance_loss_mlp": 1.24370217, + "epoch": 0.03559061177375914, + "flos": 511820591616.0, + "grad_norm": 0.08935796417892215, + "language_loss": 0.99749458, + "learning_rate": 0.0009999183759291659, + "loss": 1.01014113, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.2097168, + "step": 185, + "time_per_iteration": 2.7049641609191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283684, + "balance_loss_mlp": 1.26222682, + "epoch": 0.0357829934590227, + "flos": 477146903040.0, + "grad_norm": 0.1506087846083958, + "language_loss": 1.02522779, + "learning_rate": 0.0009999126497936682, + "loss": 1.03806448, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.21459961, + "step": 186, + "time_per_iteration": 2.5040838718414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266329, + "balance_loss_mlp": 1.24443007, + "epoch": 0.03597537514428627, + "flos": 644350588416.0, + "grad_norm": 0.07597181242921475, + "language_loss": 1.04941225, + "learning_rate": 0.0009999067295748676, + "loss": 1.0620755, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.21899414, + "step": 187, + "time_per_iteration": 2.8635194301605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01276828, + "balance_loss_mlp": 1.25491714, + "epoch": 0.03616775682954983, + "flos": 580916275200.0, + "grad_norm": 0.10348177684206804, + "language_loss": 1.02588224, + "learning_rate": 0.000999900615275062, + "loss": 1.03865051, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.21911621, + "step": 188, + "time_per_iteration": 2.6797780990600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272596, + "balance_loss_mlp": 1.25078082, + "epoch": 0.03636013851481339, + "flos": 382210735104.0, + "grad_norm": 0.11548780673963775, + "language_loss": 1.08482468, + "learning_rate": 0.0009998943068966256, + "loss": 1.09755063, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.21826172, + "step": 189, + "time_per_iteration": 2.446465253829956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282253, + "balance_loss_mlp": 1.25919747, + "epoch": 0.03655252020007695, + "flos": 582954706944.0, + "grad_norm": 0.10548213053156746, + "language_loss": 1.03159523, + "learning_rate": 0.0009998878044420072, + "loss": 1.04441762, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.23071289, + "step": 190, + "time_per_iteration": 2.715607166290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282371, + "balance_loss_mlp": 1.2598052, + "epoch": 0.03674490188534051, + "flos": 471376433664.0, + "grad_norm": 0.11932481378659279, + "language_loss": 0.98991239, + "learning_rate": 0.0009998811079137318, + "loss": 1.00273609, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.22558594, + "step": 191, + "time_per_iteration": 2.6401267051696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260121, + "balance_loss_mlp": 1.2387228, + "epoch": 0.03693728357060408, + "flos": 528113488896.0, + "grad_norm": 0.10247339740719702, + "language_loss": 1.0056088, + "learning_rate": 0.0009998742173143987, + "loss": 1.01821005, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.21411133, + "step": 192, + "time_per_iteration": 2.6355819702148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261897, + "balance_loss_mlp": 1.24107122, + "epoch": 0.03712966525586764, + "flos": 798657352704.0, + "grad_norm": 0.19022984523402262, + "language_loss": 1.00051641, + "learning_rate": 0.0009998671326466833, + "loss": 1.01313543, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.20837402, + "step": 193, + "time_per_iteration": 3.009938955307007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264185, + "balance_loss_mlp": 1.24278712, + "epoch": 0.037322046941131205, + "flos": 829628116992.0, + "grad_norm": 0.16347382701944235, + "language_loss": 1.01202989, + "learning_rate": 0.0009998598539133362, + "loss": 1.02467179, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.21386719, + "step": 194, + "time_per_iteration": 3.032041311264038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320641, + "balance_loss_mlp": 1.29752648, + "epoch": 0.037514428626394765, + "flos": 437460797952.0, + "grad_norm": 0.09447382654807665, + "language_loss": 1.02349281, + "learning_rate": 0.0009998523811171828, + "loss": 1.0366993, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.23132324, + "step": 195, + "time_per_iteration": 2.5140883922576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01385941, + "balance_loss_mlp": 1.36191988, + "epoch": 0.03770681031165833, + "flos": 511372459008.0, + "grad_norm": 0.174477259749112, + "language_loss": 1.02751505, + "learning_rate": 0.0009998447142611248, + "loss": 1.04137444, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.24047852, + "step": 196, + "time_per_iteration": 2.6540584564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01374932, + "balance_loss_mlp": 1.3512454, + "epoch": 0.03789919199692189, + "flos": 807102061056.0, + "grad_norm": 0.19785353386832685, + "language_loss": 0.95925725, + "learning_rate": 0.0009998368533481387, + "loss": 0.97300661, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.23657227, + "step": 197, + "time_per_iteration": 3.0361931324005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132819, + "balance_loss_mlp": 1.30602896, + "epoch": 0.03809157368218546, + "flos": 690274234368.0, + "grad_norm": 0.07201942870831356, + "language_loss": 0.98943031, + "learning_rate": 0.0009998287983812762, + "loss": 1.00271225, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.22155762, + "step": 198, + "time_per_iteration": 2.8737523555755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316145, + "balance_loss_mlp": 1.2943778, + "epoch": 0.03828395536744902, + "flos": 517675428864.0, + "grad_norm": 0.07974969111573339, + "language_loss": 1.04380584, + "learning_rate": 0.0009998205493636646, + "loss": 1.05696738, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.21789551, + "step": 199, + "time_per_iteration": 2.6439247131347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323551, + "balance_loss_mlp": 1.30098474, + "epoch": 0.038476337052712584, + "flos": 581389138944.0, + "grad_norm": 0.08769997267084173, + "language_loss": 0.97346306, + "learning_rate": 0.0009998121062985063, + "loss": 0.98669851, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.22583008, + "step": 200, + "time_per_iteration": 2.738266944885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01342622, + "balance_loss_mlp": 1.3199718, + "epoch": 0.03866871873797614, + "flos": 576791359488.0, + "grad_norm": 0.1288031319123161, + "language_loss": 0.99576765, + "learning_rate": 0.0009998034691890794, + "loss": 1.0091939, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.2265625, + "step": 201, + "time_per_iteration": 2.815068244934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01322045, + "balance_loss_mlp": 1.29940701, + "epoch": 0.03886110042323971, + "flos": 540472117248.0, + "grad_norm": 0.1480539814519598, + "language_loss": 1.04135096, + "learning_rate": 0.0009997946380387369, + "loss": 1.05457139, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.22619629, + "step": 202, + "time_per_iteration": 2.6735482215881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272248, + "balance_loss_mlp": 1.24913371, + "epoch": 0.03905348210850327, + "flos": 717694843392.0, + "grad_norm": 0.10058314649993264, + "language_loss": 1.06271195, + "learning_rate": 0.0009997856128509076, + "loss": 1.07543445, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.23132324, + "step": 203, + "time_per_iteration": 2.858497142791748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238452, + "balance_loss_mlp": 1.21574211, + "epoch": 0.039245863793766836, + "flos": 427268639232.0, + "grad_norm": 0.07713628959924962, + "language_loss": 1.01241136, + "learning_rate": 0.0009997763936290952, + "loss": 1.02479577, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.22705078, + "step": 204, + "time_per_iteration": 2.5389275550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254542, + "balance_loss_mlp": 1.22998452, + "epoch": 0.039438245479030395, + "flos": 662804163072.0, + "grad_norm": 0.10588145989282294, + "language_loss": 1.06408, + "learning_rate": 0.0009997669803768789, + "loss": 1.07662535, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.24560547, + "step": 205, + "time_per_iteration": 2.8234684467315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249653, + "balance_loss_mlp": 1.2262044, + "epoch": 0.03963062716429396, + "flos": 635063459328.0, + "grad_norm": 0.1260931618436919, + "language_loss": 1.01299226, + "learning_rate": 0.0009997573730979134, + "loss": 1.02548885, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.23461914, + "step": 206, + "time_per_iteration": 2.7586512565612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03194186, + "balance_loss_mlp": 2.85391545, + "epoch": 0.03982300884955752, + "flos": 1417813286400.0, + "grad_norm": 0.3208039945146043, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.82387388, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 3.40625, + "step": 207, + "time_per_iteration": 4.668841123580933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287285, + "balance_loss_mlp": 1.26177394, + "epoch": 0.04001539053482109, + "flos": 688769713152.0, + "grad_norm": 0.15196225676568717, + "language_loss": 1.00590456, + "learning_rate": 0.0009997375764747294, + "loss": 1.01877737, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.25512695, + "step": 208, + "time_per_iteration": 3.0460121631622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275833, + "balance_loss_mlp": 1.25076318, + "epoch": 0.04020777222008465, + "flos": 533363042304.0, + "grad_norm": 0.09666220749273949, + "language_loss": 0.97800297, + "learning_rate": 0.0009997273871381967, + "loss": 0.99076128, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.25085449, + "step": 209, + "time_per_iteration": 2.7027134895324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126115, + "balance_loss_mlp": 1.23683095, + "epoch": 0.040400153905348214, + "flos": 567661381632.0, + "grad_norm": 0.09901686865787228, + "language_loss": 1.02878523, + "learning_rate": 0.0009997170037902862, + "loss": 1.04139662, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.2434082, + "step": 210, + "time_per_iteration": 2.7203080654144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228259, + "balance_loss_mlp": 1.20371389, + "epoch": 0.040592535590611774, + "flos": 713130559488.0, + "grad_norm": 0.11653422944125434, + "language_loss": 1.0505805, + "learning_rate": 0.0009997064264350292, + "loss": 1.06286311, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.24536133, + "step": 211, + "time_per_iteration": 2.8774335384368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239303, + "balance_loss_mlp": 1.21149194, + "epoch": 0.04078491727587533, + "flos": 577824427008.0, + "grad_norm": 0.06455145782580095, + "language_loss": 0.99545413, + "learning_rate": 0.0009996956550765317, + "loss": 1.00784707, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.27770996, + "step": 212, + "time_per_iteration": 2.6957452297210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222017, + "balance_loss_mlp": 1.19556475, + "epoch": 0.0409772989611389, + "flos": 552033404928.0, + "grad_norm": 0.1270361519775568, + "language_loss": 0.94278163, + "learning_rate": 0.0009996846897189762, + "loss": 0.95500183, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.26452637, + "step": 213, + "time_per_iteration": 2.6380836963653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223712, + "balance_loss_mlp": 1.19798708, + "epoch": 0.04116968064640246, + "flos": 555347833344.0, + "grad_norm": 0.1000627367739684, + "language_loss": 1.00583601, + "learning_rate": 0.0009996735303666193, + "loss": 1.01807308, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.25720215, + "step": 214, + "time_per_iteration": 2.7703840732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205703, + "balance_loss_mlp": 1.18167019, + "epoch": 0.041362062331666026, + "flos": 578204158464.0, + "grad_norm": 0.10044224354438386, + "language_loss": 1.02544665, + "learning_rate": 0.0009996621770237937, + "loss": 1.0375036, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.24035645, + "step": 215, + "time_per_iteration": 2.747954845428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194773, + "balance_loss_mlp": 1.17049026, + "epoch": 0.041554444016929586, + "flos": 611130396672.0, + "grad_norm": 0.07439915791739656, + "language_loss": 0.98184484, + "learning_rate": 0.0009996506296949073, + "loss": 0.99379259, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.24267578, + "step": 216, + "time_per_iteration": 2.957000732421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178169, + "balance_loss_mlp": 1.15088165, + "epoch": 0.04174682570219315, + "flos": 527857413120.0, + "grad_norm": 0.07228572223559625, + "language_loss": 0.98363817, + "learning_rate": 0.0009996388883844428, + "loss": 0.99541986, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.27294922, + "step": 217, + "time_per_iteration": 2.625004529953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163207, + "balance_loss_mlp": 1.13855505, + "epoch": 0.04193920738745671, + "flos": 511258977792.0, + "grad_norm": 0.0709878545566638, + "language_loss": 1.02471972, + "learning_rate": 0.0009996269530969588, + "loss": 1.0363518, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.24645996, + "step": 218, + "time_per_iteration": 2.577202796936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153651, + "balance_loss_mlp": 1.13201451, + "epoch": 0.04213158907272028, + "flos": 571226093568.0, + "grad_norm": 0.081462998095588, + "language_loss": 1.00934064, + "learning_rate": 0.0009996148238370888, + "loss": 1.02087712, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.21655273, + "step": 219, + "time_per_iteration": 2.75849986076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128401, + "balance_loss_mlp": 1.10447621, + "epoch": 0.04232397075798384, + "flos": 963803667456.0, + "grad_norm": 0.08476688765369866, + "language_loss": 0.96862441, + "learning_rate": 0.0009996025006095421, + "loss": 0.97990847, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.23962402, + "step": 220, + "time_per_iteration": 3.316199541091919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03366003, + "balance_loss_mlp": 3.11881113, + "epoch": 0.042516352443247404, + "flos": 1468814777856.0, + "grad_norm": 0.3512460928075295, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.81149149, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 2.46875, + "step": 221, + "time_per_iteration": 5.585368633270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113685, + "balance_loss_mlp": 1.11290038, + "epoch": 0.042708734128510964, + "flos": 654419091456.0, + "grad_norm": 0.07993960649684186, + "language_loss": 0.97486591, + "learning_rate": 0.0009995772722706307, + "loss": 0.98623443, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.23950195, + "step": 222, + "time_per_iteration": 2.8408098220825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182736, + "balance_loss_mlp": 1.15682042, + "epoch": 0.04290111581377453, + "flos": 431601578496.0, + "grad_norm": 0.11511868264512252, + "language_loss": 1.11370254, + "learning_rate": 0.0009995643671690604, + "loss": 1.12553, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.25927734, + "step": 223, + "time_per_iteration": 2.4770917892456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194194, + "balance_loss_mlp": 1.16939855, + "epoch": 0.04309349749903809, + "flos": 644379701760.0, + "grad_norm": 0.13725027562770867, + "language_loss": 0.98326594, + "learning_rate": 0.0009995512681194023, + "loss": 0.99520785, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.24804688, + "step": 224, + "time_per_iteration": 2.901346445083618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011754, + "balance_loss_mlp": 1.14950812, + "epoch": 0.04328587918430166, + "flos": 830861853696.0, + "grad_norm": 0.06929706927237234, + "language_loss": 0.96731412, + "learning_rate": 0.0009995379751267417, + "loss": 0.97906816, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.25891113, + "step": 225, + "time_per_iteration": 3.238084316253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170568, + "balance_loss_mlp": 1.14375746, + "epoch": 0.043478260869565216, + "flos": 524804852736.0, + "grad_norm": 0.07435013646684872, + "language_loss": 0.98210657, + "learning_rate": 0.0009995244881962398, + "loss": 0.99381226, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.26843262, + "step": 226, + "time_per_iteration": 2.6193530559539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162667, + "balance_loss_mlp": 1.1352731, + "epoch": 0.04367064255482878, + "flos": 439253328384.0, + "grad_norm": 0.08505882003862496, + "language_loss": 0.98532695, + "learning_rate": 0.0009995108073331323, + "loss": 0.99695361, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.27416992, + "step": 227, + "time_per_iteration": 2.621875524520874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167442, + "balance_loss_mlp": 1.13921285, + "epoch": 0.04386302424009234, + "flos": 507109330944.0, + "grad_norm": 0.06754882710561792, + "language_loss": 1.01820612, + "learning_rate": 0.0009994969325427309, + "loss": 1.02988064, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.28222656, + "step": 228, + "time_per_iteration": 2.6876742839813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182366, + "balance_loss_mlp": 1.1523968, + "epoch": 0.04405540592535591, + "flos": 540432829440.0, + "grad_norm": 0.06680156886068128, + "language_loss": 0.97377843, + "learning_rate": 0.0009994828638304218, + "loss": 0.98560202, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.29980469, + "step": 229, + "time_per_iteration": 2.6631240844726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198543, + "balance_loss_mlp": 1.16969442, + "epoch": 0.04424778761061947, + "flos": 446136850944.0, + "grad_norm": 0.08411507650901279, + "language_loss": 1.03665459, + "learning_rate": 0.0009994686012016675, + "loss": 1.04864001, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.28833008, + "step": 230, + "time_per_iteration": 2.499721050262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122651, + "balance_loss_mlp": 1.19675517, + "epoch": 0.044440169295883035, + "flos": 700383435264.0, + "grad_norm": 0.09876086989002084, + "language_loss": 1.02814984, + "learning_rate": 0.000999454144662005, + "loss": 1.04041505, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.29711914, + "step": 231, + "time_per_iteration": 2.911175489425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224486, + "balance_loss_mlp": 1.19466019, + "epoch": 0.044632550981146595, + "flos": 588055873536.0, + "grad_norm": 0.10057378611284366, + "language_loss": 0.96611959, + "learning_rate": 0.0009994394942170468, + "loss": 0.97836453, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.2980957, + "step": 232, + "time_per_iteration": 2.7470107078552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012208, + "balance_loss_mlp": 1.19083118, + "epoch": 0.04482493266641016, + "flos": 554534525952.0, + "grad_norm": 0.06893435559553937, + "language_loss": 0.94648588, + "learning_rate": 0.0009994246498724808, + "loss": 0.95869386, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.29956055, + "step": 233, + "time_per_iteration": 2.7436845302581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206879, + "balance_loss_mlp": 1.17860246, + "epoch": 0.04501731435167372, + "flos": 722500646400.0, + "grad_norm": 0.08371813790363081, + "language_loss": 0.97381985, + "learning_rate": 0.00099940961163407, + "loss": 0.9858886, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.28295898, + "step": 234, + "time_per_iteration": 2.876405715942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119556, + "balance_loss_mlp": 1.16654444, + "epoch": 0.04520969603693728, + "flos": 511539784704.0, + "grad_norm": 0.08201306351282911, + "language_loss": 1.00061524, + "learning_rate": 0.0009993943795076528, + "loss": 1.01257086, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.2902832, + "step": 235, + "time_per_iteration": 2.6432723999023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168701, + "balance_loss_mlp": 1.13873136, + "epoch": 0.04540207772220085, + "flos": 364854246912.0, + "grad_norm": 0.12052684551098608, + "language_loss": 1.01575673, + "learning_rate": 0.0009993789534991427, + "loss": 1.02744377, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.29907227, + "step": 236, + "time_per_iteration": 2.4240100383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136514, + "balance_loss_mlp": 1.10954857, + "epoch": 0.045594459407464406, + "flos": 522407038464.0, + "grad_norm": 0.0561052231541492, + "language_loss": 0.96778214, + "learning_rate": 0.0009993633336145287, + "loss": 0.97914726, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.26977539, + "step": 237, + "time_per_iteration": 2.638850688934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130626, + "balance_loss_mlp": 1.10363674, + "epoch": 0.04578684109272797, + "flos": 671442338304.0, + "grad_norm": 0.06334524880145487, + "language_loss": 1.0125159, + "learning_rate": 0.0009993475198598752, + "loss": 1.02382219, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.26989746, + "step": 238, + "time_per_iteration": 3.0008814334869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109235, + "balance_loss_mlp": 1.08395052, + "epoch": 0.04597922277799153, + "flos": 541387321344.0, + "grad_norm": 0.08922144233736891, + "language_loss": 0.97379184, + "learning_rate": 0.0009993315122413212, + "loss": 0.98488414, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.25305176, + "step": 239, + "time_per_iteration": 2.620474100112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121059, + "balance_loss_mlp": 1.09551263, + "epoch": 0.0461716044632551, + "flos": 458732616192.0, + "grad_norm": 0.09980166654849132, + "language_loss": 0.97848725, + "learning_rate": 0.0009993153107650818, + "loss": 0.98969781, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.25537109, + "step": 240, + "time_per_iteration": 2.5547702312469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111352, + "balance_loss_mlp": 1.08719897, + "epoch": 0.04636398614851866, + "flos": 455009342976.0, + "grad_norm": 0.09180653876933564, + "language_loss": 0.96700346, + "learning_rate": 0.0009992989154374468, + "loss": 0.97813869, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.2635498, + "step": 241, + "time_per_iteration": 2.5366051197052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105107, + "balance_loss_mlp": 1.07833242, + "epoch": 0.046556367833782225, + "flos": 556558401024.0, + "grad_norm": 0.07962621760937992, + "language_loss": 1.03585958, + "learning_rate": 0.0009992823262647817, + "loss": 1.04691052, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26782227, + "step": 242, + "time_per_iteration": 2.726482391357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100596, + "balance_loss_mlp": 1.07384586, + "epoch": 0.046748749519045785, + "flos": 592625949696.0, + "grad_norm": 0.0814561151731407, + "language_loss": 0.97787237, + "learning_rate": 0.0009992655432535264, + "loss": 0.98887837, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.26782227, + "step": 243, + "time_per_iteration": 2.765273332595825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098204, + "balance_loss_mlp": 1.07214487, + "epoch": 0.04694113120430935, + "flos": 569596506624.0, + "grad_norm": 0.0750228199707575, + "language_loss": 0.98452473, + "learning_rate": 0.0009992485664101973, + "loss": 0.99550676, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.26037598, + "step": 244, + "time_per_iteration": 2.696781635284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_mlp": 1.08732188, + "epoch": 0.04713351288957291, + "flos": 863401158144.0, + "grad_norm": 0.08629455000399752, + "language_loss": 1.00806224, + "learning_rate": 0.000999231395741385, + "loss": 1.01922584, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.2902832, + "step": 245, + "time_per_iteration": 3.1403207778930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117221, + "balance_loss_mlp": 1.08958876, + "epoch": 0.04732589457483648, + "flos": 536961249792.0, + "grad_norm": 0.07729478564770192, + "language_loss": 0.986202, + "learning_rate": 0.0009992140312537557, + "loss": 0.99737418, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.27661133, + "step": 246, + "time_per_iteration": 2.7038867473602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111137, + "balance_loss_mlp": 1.08410013, + "epoch": 0.04751827626010004, + "flos": 761566910976.0, + "grad_norm": 0.08592122791377885, + "language_loss": 0.93525487, + "learning_rate": 0.000999196472954051, + "loss": 0.94636625, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.27050781, + "step": 247, + "time_per_iteration": 2.9575722217559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0471772, + "balance_loss_mlp": 4.51020002, + "epoch": 0.0477106579453636, + "flos": 1578961313280.0, + "grad_norm": 0.4683520251238934, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.84142572, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 2.078125, + "step": 248, + "time_per_iteration": 5.452638387680054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200681, + "balance_loss_mlp": 1.17252362, + "epoch": 0.04790303963062716, + "flos": 457535195136.0, + "grad_norm": 0.13106789232715058, + "language_loss": 1.01118052, + "learning_rate": 0.0009991607749457578, + "loss": 1.02318728, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.28173828, + "step": 249, + "time_per_iteration": 2.5066423416137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256525, + "balance_loss_mlp": 1.22541094, + "epoch": 0.04809542131589073, + "flos": 782079266304.0, + "grad_norm": 0.1327983626735717, + "language_loss": 0.98959935, + "learning_rate": 0.0009991426352510286, + "loss": 1.0021646, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.31103516, + "step": 250, + "time_per_iteration": 3.0130999088287354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250303, + "balance_loss_mlp": 1.22023845, + "epoch": 0.04828780300115429, + "flos": 558995503104.0, + "grad_norm": 0.11435576550904086, + "language_loss": 1.00191545, + "learning_rate": 0.0009991243017719422, + "loss": 1.01441836, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.30053711, + "step": 251, + "time_per_iteration": 2.6584134101867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190864, + "balance_loss_mlp": 1.16108572, + "epoch": 0.048480184686417856, + "flos": 501682277376.0, + "grad_norm": 0.08343855539664048, + "language_loss": 0.94829702, + "learning_rate": 0.0009991057745156165, + "loss": 0.96020567, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.29760742, + "step": 252, + "time_per_iteration": 2.6125926971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03043524, + "balance_loss_mlp": 2.97905564, + "epoch": 0.048672566371681415, + "flos": 1535585430528.0, + "grad_norm": 0.48807257564671885, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.84954512, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 0.64453125, + "step": 253, + "time_per_iteration": 5.0318169593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205448, + "balance_loss_mlp": 1.17426276, + "epoch": 0.04886494805694498, + "flos": 537665458176.0, + "grad_norm": 0.15081419889398517, + "language_loss": 1.02692831, + "learning_rate": 0.0009990681387000943, + "loss": 1.03898275, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.31152344, + "step": 254, + "time_per_iteration": 2.7569847106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231579, + "balance_loss_mlp": 1.20053661, + "epoch": 0.04905732974220854, + "flos": 679841966592.0, + "grad_norm": 0.10308088004196624, + "language_loss": 0.98562324, + "learning_rate": 0.0009990490301555093, + "loss": 0.99793905, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.31054688, + "step": 255, + "time_per_iteration": 2.9826152324676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01973911, + "balance_loss_mlp": 1.89609146, + "epoch": 0.04924971142747211, + "flos": 1420408949760.0, + "grad_norm": 0.14603633134579833, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.8118906, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.77734375, + "step": 256, + "time_per_iteration": 4.873262643814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01994546, + "balance_loss_mlp": 1.91596293, + "epoch": 0.04944209311273567, + "flos": 1557202074624.0, + "grad_norm": 0.1290240934598827, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.81237286, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.78515625, + "step": 257, + "time_per_iteration": 4.981585502624512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01945028, + "balance_loss_mlp": 1.87979627, + "epoch": 0.04963447479799923, + "flos": 1569985514496.0, + "grad_norm": 0.10634084131038181, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71920907, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.65234375, + "step": 258, + "time_per_iteration": 4.869063138961792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231874, + "balance_loss_mlp": 1.20192897, + "epoch": 0.049826856483262794, + "flos": 625063357440.0, + "grad_norm": 0.1721871775998346, + "language_loss": 0.93400717, + "learning_rate": 0.0009989706585723202, + "loss": 0.9463259, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.29956055, + "step": 259, + "time_per_iteration": 2.828618049621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226271, + "balance_loss_mlp": 1.1963017, + "epoch": 0.05001923816852635, + "flos": 503912765952.0, + "grad_norm": 0.13941406884376095, + "language_loss": 0.9926306, + "learning_rate": 0.0009989505813633442, + "loss": 1.0048933, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.29931641, + "step": 260, + "time_per_iteration": 2.7033097743988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167993, + "balance_loss_mlp": 1.13833416, + "epoch": 0.05021161985378992, + "flos": 587066476032.0, + "grad_norm": 0.078052738900574, + "language_loss": 0.99695522, + "learning_rate": 0.000998930310444573, + "loss": 1.00863528, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.29663086, + "step": 261, + "time_per_iteration": 2.739182949066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120344, + "balance_loss_mlp": 1.09104276, + "epoch": 0.05040400153905348, + "flos": 633029409792.0, + "grad_norm": 0.10502347912179442, + "language_loss": 0.97120214, + "learning_rate": 0.0009989098458238765, + "loss": 0.98240554, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.29296875, + "step": 262, + "time_per_iteration": 2.81984806060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109888, + "balance_loss_mlp": 1.07910872, + "epoch": 0.050596383224317046, + "flos": 553344307200.0, + "grad_norm": 0.1022419163820973, + "language_loss": 0.96531391, + "learning_rate": 0.0009988891875091998, + "loss": 0.97641277, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.30761719, + "step": 263, + "time_per_iteration": 2.816471576690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119537, + "balance_loss_mlp": 1.08949661, + "epoch": 0.050788764909580605, + "flos": 549389689344.0, + "grad_norm": 0.07930699495869925, + "language_loss": 0.91512978, + "learning_rate": 0.0009988683355085636, + "loss": 0.92632508, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.30004883, + "step": 264, + "time_per_iteration": 2.7963876724243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116935, + "balance_loss_mlp": 1.1386174, + "epoch": 0.05098114659484417, + "flos": 604812870144.0, + "grad_norm": 0.1164382368145933, + "language_loss": 1.00062299, + "learning_rate": 0.000998847289830063, + "loss": 1.01231647, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.30688477, + "step": 265, + "time_per_iteration": 2.8219666481018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180582, + "balance_loss_mlp": 1.14922965, + "epoch": 0.05117352828010773, + "flos": 438317775360.0, + "grad_norm": 0.14769195776656788, + "language_loss": 0.92838919, + "learning_rate": 0.0009988260504818682, + "loss": 0.94019508, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.31323242, + "step": 266, + "time_per_iteration": 2.527987480163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159175, + "balance_loss_mlp": 1.12753642, + "epoch": 0.0513659099653713, + "flos": 504784300032.0, + "grad_norm": 0.1223822648996979, + "language_loss": 0.99088645, + "learning_rate": 0.000998804617472226, + "loss": 1.00247824, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.31616211, + "step": 267, + "time_per_iteration": 2.6469640731811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129085, + "balance_loss_mlp": 1.09735131, + "epoch": 0.05155829165063486, + "flos": 695183344128.0, + "grad_norm": 0.09065118463065669, + "language_loss": 0.94319087, + "learning_rate": 0.0009987829908094568, + "loss": 0.95448172, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.31713867, + "step": 268, + "time_per_iteration": 2.821777105331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131208, + "balance_loss_mlp": 1.10014248, + "epoch": 0.051750673335898424, + "flos": 1347751830528.0, + "grad_norm": 0.11182301329739544, + "language_loss": 1.00247467, + "learning_rate": 0.0009987611705019569, + "loss": 1.01378679, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.31030273, + "step": 269, + "time_per_iteration": 4.288902521133423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117374, + "balance_loss_mlp": 1.08509207, + "epoch": 0.051943055021161984, + "flos": 489362936832.0, + "grad_norm": 0.06856601771993416, + "language_loss": 0.99786204, + "learning_rate": 0.0009987391565581978, + "loss": 1.00903583, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.32275391, + "step": 270, + "time_per_iteration": 2.634683132171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119573, + "balance_loss_mlp": 1.08681393, + "epoch": 0.05213543670642555, + "flos": 545504882688.0, + "grad_norm": 0.08930504281721281, + "language_loss": 0.92515171, + "learning_rate": 0.000998716948986726, + "loss": 0.93634748, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.32763672, + "step": 271, + "time_per_iteration": 2.7899389266967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120606, + "balance_loss_mlp": 1.08970654, + "epoch": 0.05232781839168911, + "flos": 603285179904.0, + "grad_norm": 0.10701715244821809, + "language_loss": 0.94677854, + "learning_rate": 0.0009986945477961633, + "loss": 0.95798463, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.30859375, + "step": 272, + "time_per_iteration": 2.703118324279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108581, + "balance_loss_mlp": 1.07789683, + "epoch": 0.052520200076952676, + "flos": 538218307584.0, + "grad_norm": 0.050944004487463904, + "language_loss": 1.00078344, + "learning_rate": 0.0009986719529952066, + "loss": 1.01186931, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.30639648, + "step": 273, + "time_per_iteration": 2.85548734664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097085, + "balance_loss_mlp": 1.06668699, + "epoch": 0.052712581762216236, + "flos": 463148513280.0, + "grad_norm": 0.06235958359183371, + "language_loss": 0.99016273, + "learning_rate": 0.000998649164592628, + "loss": 1.00113368, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.3034668, + "step": 274, + "time_per_iteration": 2.5841050148010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104755, + "balance_loss_mlp": 1.07507145, + "epoch": 0.0529049634474798, + "flos": 547749927936.0, + "grad_norm": 0.10062534885586208, + "language_loss": 0.96764064, + "learning_rate": 0.0009986261825972748, + "loss": 0.97868812, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.29663086, + "step": 275, + "time_per_iteration": 2.6752514839172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107504, + "balance_loss_mlp": 1.07798743, + "epoch": 0.05309734513274336, + "flos": 617727320064.0, + "grad_norm": 0.08071716286169645, + "language_loss": 0.98941195, + "learning_rate": 0.000998603007018069, + "loss": 1.00048697, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.29541016, + "step": 276, + "time_per_iteration": 2.8236005306243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118213, + "balance_loss_mlp": 1.08767152, + "epoch": 0.05328972681800693, + "flos": 605220304896.0, + "grad_norm": 0.07622563991542974, + "language_loss": 0.96909779, + "learning_rate": 0.0009985796378640089, + "loss": 0.98027998, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.30517578, + "step": 277, + "time_per_iteration": 2.7089598178863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110773, + "balance_loss_mlp": 1.07940567, + "epoch": 0.05348210850327049, + "flos": 604197411840.0, + "grad_norm": 0.07841820465234402, + "language_loss": 0.95740211, + "learning_rate": 0.0009985560751441665, + "loss": 0.96847939, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.28320312, + "step": 278, + "time_per_iteration": 2.834015369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108783, + "balance_loss_mlp": 1.07831299, + "epoch": 0.053674490188534055, + "flos": 630480236544.0, + "grad_norm": 0.07361828218816212, + "language_loss": 0.9799974, + "learning_rate": 0.00099853231886769, + "loss": 0.99108523, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.30444336, + "step": 279, + "time_per_iteration": 2.8200764656066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108628, + "balance_loss_mlp": 1.07937431, + "epoch": 0.053866871873797614, + "flos": 478939433472.0, + "grad_norm": 0.07512382427920342, + "language_loss": 0.98746061, + "learning_rate": 0.0009985083690438024, + "loss": 0.99854696, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.29223633, + "step": 280, + "time_per_iteration": 2.75639271736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113716, + "balance_loss_mlp": 1.08310306, + "epoch": 0.054059253559061174, + "flos": 787673645568.0, + "grad_norm": 0.09326847112688041, + "language_loss": 0.89231437, + "learning_rate": 0.0009984842256818016, + "loss": 0.90345156, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.3059082, + "step": 281, + "time_per_iteration": 3.0839526653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121888, + "balance_loss_mlp": 1.09182298, + "epoch": 0.05425163524432474, + "flos": 628076630016.0, + "grad_norm": 0.062071298051891176, + "language_loss": 0.99695373, + "learning_rate": 0.0009984598887910613, + "loss": 1.00817263, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.30029297, + "step": 282, + "time_per_iteration": 2.7197024822235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123523, + "balance_loss_mlp": 1.09283888, + "epoch": 0.0544440169295883, + "flos": 615453161472.0, + "grad_norm": 0.08448232068887077, + "language_loss": 0.95169044, + "learning_rate": 0.0009984353583810297, + "loss": 0.96292561, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.30664062, + "step": 283, + "time_per_iteration": 2.8440537452697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127605, + "balance_loss_mlp": 1.09811282, + "epoch": 0.05463639861485187, + "flos": 647471549952.0, + "grad_norm": 0.07597313108733957, + "language_loss": 0.97190034, + "learning_rate": 0.0009984106344612302, + "loss": 0.98317641, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.29492188, + "step": 284, + "time_per_iteration": 2.7592926025390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139329, + "balance_loss_mlp": 1.10843039, + "epoch": 0.054828780300115426, + "flos": 796845883392.0, + "grad_norm": 0.08116128158624439, + "language_loss": 0.93187618, + "learning_rate": 0.0009983857170412615, + "loss": 0.94326949, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.30859375, + "step": 285, + "time_per_iteration": 2.99845027923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151704, + "balance_loss_mlp": 1.12080526, + "epoch": 0.05502116198537899, + "flos": 549414420480.0, + "grad_norm": 0.07339397608587311, + "language_loss": 0.92728812, + "learning_rate": 0.000998360606130798, + "loss": 0.93880516, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.30859375, + "step": 286, + "time_per_iteration": 2.835510492324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.020519, + "balance_loss_mlp": 2.03492451, + "epoch": 0.05521354367064255, + "flos": 1406967791616.0, + "grad_norm": 0.132236598943482, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71125019, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.16992188, + "step": 287, + "time_per_iteration": 4.860529184341431 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144586, + "balance_loss_mlp": 1.11304367, + "epoch": 0.05540592535590612, + "flos": 645123197952.0, + "grad_norm": 0.09086643312306038, + "language_loss": 0.98494267, + "learning_rate": 0.0009983098038774552, + "loss": 0.99638855, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.31518555, + "step": 288, + "time_per_iteration": 2.7743642330169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0188948, + "balance_loss_mlp": 1.87336278, + "epoch": 0.05559830704116968, + "flos": 1510293413376.0, + "grad_norm": 0.09551417356683237, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.80059707, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.16113281, + "step": 289, + "time_per_iteration": 4.792251348495483 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132847, + "balance_loss_mlp": 1.10242462, + "epoch": 0.055790688726433245, + "flos": 508078379520.0, + "grad_norm": 0.0647793178171594, + "language_loss": 0.95675349, + "learning_rate": 0.0009982582277800948, + "loss": 0.96808195, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.30371094, + "step": 290, + "time_per_iteration": 2.6280908584594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129981, + "balance_loss_mlp": 1.09931993, + "epoch": 0.055983070411696804, + "flos": 657570576384.0, + "grad_norm": 0.06216394577533418, + "language_loss": 1.02967191, + "learning_rate": 0.0009982321495648908, + "loss": 1.04097176, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.30639648, + "step": 291, + "time_per_iteration": 2.823817491531372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152465, + "balance_loss_mlp": 1.11880052, + "epoch": 0.05617545209696037, + "flos": 587051919360.0, + "grad_norm": 0.0720353654192766, + "language_loss": 0.94905466, + "learning_rate": 0.0009982058779188115, + "loss": 0.96057928, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.33666992, + "step": 292, + "time_per_iteration": 2.716226577758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143466, + "balance_loss_mlp": 1.11175609, + "epoch": 0.05636783378222393, + "flos": 611331217920.0, + "grad_norm": 0.0752196942414692, + "language_loss": 1.02053797, + "learning_rate": 0.0009981794128520567, + "loss": 1.03197265, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.31689453, + "step": 293, + "time_per_iteration": 2.80366587638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140969, + "balance_loss_mlp": 1.10878265, + "epoch": 0.0565602154674875, + "flos": 667847102976.0, + "grad_norm": 0.08694547176554791, + "language_loss": 0.9927811, + "learning_rate": 0.000998152754374901, + "loss": 1.0041908, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.32202148, + "step": 294, + "time_per_iteration": 2.8787214756011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125522, + "balance_loss_mlp": 1.09493268, + "epoch": 0.05675259715275106, + "flos": 616963474944.0, + "grad_norm": 0.06320951422559969, + "language_loss": 0.95261526, + "learning_rate": 0.0009981259024976943, + "loss": 0.96387053, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.30566406, + "step": 295, + "time_per_iteration": 2.709763765335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130922, + "balance_loss_mlp": 1.1013341, + "epoch": 0.05694497883801462, + "flos": 751424214528.0, + "grad_norm": 0.09363516749561916, + "language_loss": 0.92460728, + "learning_rate": 0.0009980988572308612, + "loss": 0.93591654, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.2956543, + "step": 296, + "time_per_iteration": 2.975036859512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106731, + "balance_loss_mlp": 1.07781124, + "epoch": 0.05713736052327818, + "flos": 711669708288.0, + "grad_norm": 0.09684297288520326, + "language_loss": 0.95852935, + "learning_rate": 0.0009980716185849015, + "loss": 0.96959662, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.28881836, + "step": 297, + "time_per_iteration": 2.9913201332092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121697, + "balance_loss_mlp": 1.09196591, + "epoch": 0.05732974220854175, + "flos": 468737100288.0, + "grad_norm": 0.06404931541311756, + "language_loss": 0.92133576, + "learning_rate": 0.0009980441865703904, + "loss": 0.9325527, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.29711914, + "step": 298, + "time_per_iteration": 2.660911798477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118174, + "balance_loss_mlp": 1.08896804, + "epoch": 0.05752212389380531, + "flos": 601143441408.0, + "grad_norm": 0.07725734784298466, + "language_loss": 1.00405884, + "learning_rate": 0.000998016561197978, + "loss": 1.01524067, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.29150391, + "step": 299, + "time_per_iteration": 2.7028987407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115684, + "balance_loss_mlp": 1.0875026, + "epoch": 0.057714505579068875, + "flos": 678344799744.0, + "grad_norm": 0.0924919324941274, + "language_loss": 0.92369866, + "learning_rate": 0.0009979887424783895, + "loss": 0.93485552, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.28173828, + "step": 300, + "time_per_iteration": 2.920323610305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121145, + "balance_loss_mlp": 1.09222448, + "epoch": 0.057906887264332435, + "flos": 595604316672.0, + "grad_norm": 0.08285851214595771, + "language_loss": 0.91748977, + "learning_rate": 0.0009979607304224248, + "loss": 0.92870122, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.2890625, + "step": 301, + "time_per_iteration": 2.725109815597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124265, + "balance_loss_mlp": 1.09512997, + "epoch": 0.058099268949596, + "flos": 551855904768.0, + "grad_norm": 0.08389393001078431, + "language_loss": 0.98122084, + "learning_rate": 0.000997932525040959, + "loss": 0.99246347, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.29101562, + "step": 302, + "time_per_iteration": 2.6472513675689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102086, + "balance_loss_mlp": 1.07419097, + "epoch": 0.05829165063485956, + "flos": 507906671616.0, + "grad_norm": 0.09664842170862178, + "language_loss": 1.00482607, + "learning_rate": 0.000997904126344943, + "loss": 1.01584697, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.27880859, + "step": 303, + "time_per_iteration": 2.6413466930389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108073, + "balance_loss_mlp": 1.07920086, + "epoch": 0.05848403232012313, + "flos": 614949774336.0, + "grad_norm": 0.07742483031734765, + "language_loss": 0.96304786, + "learning_rate": 0.0009978755343454018, + "loss": 0.9741286, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.28881836, + "step": 304, + "time_per_iteration": 2.7825212478637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108843, + "balance_loss_mlp": 1.0789448, + "epoch": 0.05867641400538669, + "flos": 499835902464.0, + "grad_norm": 0.09214287188489759, + "language_loss": 0.97051907, + "learning_rate": 0.0009978467490534355, + "loss": 0.98160744, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.29858398, + "step": 305, + "time_per_iteration": 2.625734806060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105646, + "balance_loss_mlp": 1.0759151, + "epoch": 0.05886879569065025, + "flos": 531019072512.0, + "grad_norm": 0.07804737007565601, + "language_loss": 0.94819117, + "learning_rate": 0.00099781777048022, + "loss": 0.95924759, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.296875, + "step": 306, + "time_per_iteration": 2.71183180809021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095659, + "balance_loss_mlp": 1.06554723, + "epoch": 0.05906117737591381, + "flos": 488811497472.0, + "grad_norm": 0.08882969665455022, + "language_loss": 0.96051329, + "learning_rate": 0.0009977885986370057, + "loss": 0.97146988, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.30126953, + "step": 307, + "time_per_iteration": 2.551680088043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101228, + "balance_loss_mlp": 1.0711869, + "epoch": 0.05925355906117737, + "flos": 591213150720.0, + "grad_norm": 0.07969081592203556, + "language_loss": 0.92546368, + "learning_rate": 0.000997759233535118, + "loss": 0.93647587, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.30029297, + "step": 308, + "time_per_iteration": 2.81258487701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118732, + "balance_loss_mlp": 1.08861959, + "epoch": 0.05944594074644094, + "flos": 563373522432.0, + "grad_norm": 0.08786467203130244, + "language_loss": 0.97749913, + "learning_rate": 0.0009977296751859576, + "loss": 0.98868644, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.30102539, + "step": 309, + "time_per_iteration": 2.7263362407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105319, + "balance_loss_mlp": 1.07611227, + "epoch": 0.0596383224317045, + "flos": 538483147776.0, + "grad_norm": 0.06446924521708428, + "language_loss": 1.00202072, + "learning_rate": 0.0009976999236009998, + "loss": 1.01307392, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.29174805, + "step": 310, + "time_per_iteration": 2.762798309326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104, + "balance_loss_mlp": 1.0751754, + "epoch": 0.059830704116968066, + "flos": 560684726784.0, + "grad_norm": 0.07707725190270151, + "language_loss": 1.00980616, + "learning_rate": 0.0009976699787917955, + "loss": 1.02084613, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.2878418, + "step": 311, + "time_per_iteration": 2.681075096130371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02018517, + "balance_loss_mlp": 1.99772644, + "epoch": 0.060023085802231625, + "flos": 1569759962112.0, + "grad_norm": 0.13809188064678232, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.75461507, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.20800781, + "step": 312, + "time_per_iteration": 4.931787014007568 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115143, + "balance_loss_mlp": 1.08445871, + "epoch": 0.06021546748749519, + "flos": 482415395328.0, + "grad_norm": 0.08749443672960691, + "language_loss": 0.93570709, + "learning_rate": 0.0009976095095472243, + "loss": 0.94685858, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.30688477, + "step": 313, + "time_per_iteration": 2.5869529247283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101349, + "balance_loss_mlp": 1.07152247, + "epoch": 0.06040784917275875, + "flos": 619889407488.0, + "grad_norm": 0.1052711311589574, + "language_loss": 0.94373065, + "learning_rate": 0.0009975789851353334, + "loss": 0.95474416, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.29785156, + "step": 314, + "time_per_iteration": 2.825021505355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091881, + "balance_loss_mlp": 1.06434321, + "epoch": 0.06060023085802232, + "flos": 483292721664.0, + "grad_norm": 0.0790023799752532, + "language_loss": 0.96930784, + "learning_rate": 0.0009975482675461487, + "loss": 0.98022664, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.27563477, + "step": 315, + "time_per_iteration": 2.657176971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092449, + "balance_loss_mlp": 1.06493592, + "epoch": 0.06079261254328588, + "flos": 581620483584.0, + "grad_norm": 0.08103250083402935, + "language_loss": 0.94523442, + "learning_rate": 0.0009975173567915952, + "loss": 0.95615894, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.27502441, + "step": 316, + "time_per_iteration": 2.7485179901123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087945, + "balance_loss_mlp": 1.06031179, + "epoch": 0.060984994228549444, + "flos": 687492306432.0, + "grad_norm": 0.09749512289660646, + "language_loss": 0.88217789, + "learning_rate": 0.000997486252883674, + "loss": 0.89305735, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.27685547, + "step": 317, + "time_per_iteration": 2.848203420639038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083831, + "balance_loss_mlp": 1.05665123, + "epoch": 0.061177375913813004, + "flos": 1314284327424.0, + "grad_norm": 0.0666962391969605, + "language_loss": 0.94262481, + "learning_rate": 0.0009974549558344602, + "loss": 0.95346314, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.27197266, + "step": 318, + "time_per_iteration": 3.6451311111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095985, + "balance_loss_mlp": 1.06921029, + "epoch": 0.06136975759907657, + "flos": 574072040448.0, + "grad_norm": 0.08376464388690433, + "language_loss": 1.02536392, + "learning_rate": 0.000997423465656105, + "loss": 1.03632367, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.26831055, + "step": 319, + "time_per_iteration": 2.7345075607299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091659, + "balance_loss_mlp": 1.06395483, + "epoch": 0.06156213928434013, + "flos": 527281242624.0, + "grad_norm": 0.0893807265100656, + "language_loss": 1.00347686, + "learning_rate": 0.0009973917823608335, + "loss": 1.01439345, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.27734375, + "step": 320, + "time_per_iteration": 2.59576153755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092889, + "balance_loss_mlp": 1.0656141, + "epoch": 0.061754520969603696, + "flos": 495238123008.0, + "grad_norm": 0.0805868867251315, + "language_loss": 0.95831037, + "learning_rate": 0.0009973599059609462, + "loss": 0.96923929, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.27294922, + "step": 321, + "time_per_iteration": 2.7188515663146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098538, + "balance_loss_mlp": 1.07090497, + "epoch": 0.061946902654867256, + "flos": 439839673344.0, + "grad_norm": 0.07327098118113982, + "language_loss": 0.93067813, + "learning_rate": 0.000997327836468819, + "loss": 0.94166344, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.27685547, + "step": 322, + "time_per_iteration": 2.6020476818084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112868, + "balance_loss_mlp": 1.08469939, + "epoch": 0.06213928434013082, + "flos": 598490961408.0, + "grad_norm": 0.08699924077148347, + "language_loss": 0.95677376, + "learning_rate": 0.000997295573896902, + "loss": 0.96790254, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.28137207, + "step": 323, + "time_per_iteration": 2.829726457595825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01600081, + "balance_loss_mlp": 1.58253336, + "epoch": 0.06233166602539438, + "flos": 1449393716736.0, + "grad_norm": 0.0733345350087818, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.82796121, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.17578125, + "step": 324, + "time_per_iteration": 4.711047410964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01522296, + "balance_loss_mlp": 1.50503409, + "epoch": 0.06252404771065795, + "flos": 1462504453632.0, + "grad_norm": 0.05691363452686859, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80094236, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.17285156, + "step": 325, + "time_per_iteration": 4.9186623096466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221563, + "balance_loss_mlp": 1.19023478, + "epoch": 0.06271642939592151, + "flos": 464059335168.0, + "grad_norm": 0.14041524981394118, + "language_loss": 0.90815508, + "learning_rate": 0.000997197627828043, + "loss": 0.9203707, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.31323242, + "step": 326, + "time_per_iteration": 2.5453081130981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200774, + "balance_loss_mlp": 1.17032802, + "epoch": 0.06290881108118507, + "flos": 532111776768.0, + "grad_norm": 0.12119005069833769, + "language_loss": 0.85965139, + "learning_rate": 0.0009971645930629716, + "loss": 0.87165916, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.30419922, + "step": 327, + "time_per_iteration": 2.7031009197235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169691, + "balance_loss_mlp": 1.13969803, + "epoch": 0.06310119276644863, + "flos": 673262572032.0, + "grad_norm": 0.07816671551275867, + "language_loss": 0.99088198, + "learning_rate": 0.0009971313652814872, + "loss": 1.00257885, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.29956055, + "step": 328, + "time_per_iteration": 2.8222203254699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157161, + "balance_loss_mlp": 1.12542796, + "epoch": 0.0632935744517122, + "flos": 770404497408.0, + "grad_norm": 0.09350719298211221, + "language_loss": 0.96469927, + "learning_rate": 0.0009970979444964903, + "loss": 0.97627091, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.31713867, + "step": 329, + "time_per_iteration": 2.965010643005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142135, + "balance_loss_mlp": 1.11214232, + "epoch": 0.06348595613697576, + "flos": 561649393152.0, + "grad_norm": 0.10929900711039164, + "language_loss": 0.9773742, + "learning_rate": 0.0009970643307209556, + "loss": 0.98879552, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.29980469, + "step": 330, + "time_per_iteration": 2.816967248916626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122149, + "balance_loss_mlp": 1.09279943, + "epoch": 0.06367833782223932, + "flos": 675891730944.0, + "grad_norm": 0.09151857562667157, + "language_loss": 0.94555062, + "learning_rate": 0.0009970305239679334, + "loss": 0.95677209, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.29321289, + "step": 331, + "time_per_iteration": 2.8171606063842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103955, + "balance_loss_mlp": 1.07594109, + "epoch": 0.06387071950750288, + "flos": 495035891712.0, + "grad_norm": 0.0852127129346853, + "language_loss": 0.98894572, + "learning_rate": 0.0009969965242505483, + "loss": 0.99998534, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.28027344, + "step": 332, + "time_per_iteration": 2.663892984390259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110112, + "balance_loss_mlp": 1.08111989, + "epoch": 0.06406310119276645, + "flos": 533170985472.0, + "grad_norm": 0.06505292490812643, + "language_loss": 0.94837928, + "learning_rate": 0.0009969623315820007, + "loss": 0.9594804, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.28979492, + "step": 333, + "time_per_iteration": 2.7053513526916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100319, + "balance_loss_mlp": 1.07256722, + "epoch": 0.06425548287803001, + "flos": 455940513792.0, + "grad_norm": 0.09842187194277592, + "language_loss": 0.95016736, + "learning_rate": 0.000996927945975565, + "loss": 0.96117055, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.27758789, + "step": 334, + "time_per_iteration": 2.599308490753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113066, + "balance_loss_mlp": 1.08405077, + "epoch": 0.06444786456329357, + "flos": 559817574912.0, + "grad_norm": 0.0758688902805758, + "language_loss": 0.9173829, + "learning_rate": 0.0009968933674445906, + "loss": 0.92851353, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.29003906, + "step": 335, + "time_per_iteration": 2.6885735988616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117272, + "balance_loss_mlp": 1.08863783, + "epoch": 0.06464024624855713, + "flos": 665769383424.0, + "grad_norm": 0.08483114639707492, + "language_loss": 0.94787967, + "learning_rate": 0.0009968585960025028, + "loss": 0.95905232, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.28613281, + "step": 336, + "time_per_iteration": 3.0145304203033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01664619, + "balance_loss_mlp": 1.64468718, + "epoch": 0.0648326279338207, + "flos": 1520578704384.0, + "grad_norm": 0.07989076612991787, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.79317814, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.19921875, + "step": 337, + "time_per_iteration": 4.812415361404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113857, + "balance_loss_mlp": 1.08729684, + "epoch": 0.06502500961908426, + "flos": 1142872768512.0, + "grad_norm": 0.10710041073234706, + "language_loss": 0.93311036, + "learning_rate": 0.0009967884744390583, + "loss": 0.94424891, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.26611328, + "step": 338, + "time_per_iteration": 3.551198959350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010994, + "balance_loss_mlp": 1.07226825, + "epoch": 0.06521739130434782, + "flos": 582339248640.0, + "grad_norm": 0.09192445713744875, + "language_loss": 0.93620086, + "learning_rate": 0.0009967531243449256, + "loss": 0.94719481, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.27148438, + "step": 339, + "time_per_iteration": 2.659802198410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093825, + "balance_loss_mlp": 1.06592965, + "epoch": 0.06540977298961138, + "flos": 497398800384.0, + "grad_norm": 0.08159898153834201, + "language_loss": 1.01212323, + "learning_rate": 0.000996717581394126, + "loss": 1.02306151, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.27905273, + "step": 340, + "time_per_iteration": 2.570789337158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085985, + "balance_loss_mlp": 1.05887651, + "epoch": 0.06560215467487496, + "flos": 542613855744.0, + "grad_norm": 0.08632134404445381, + "language_loss": 1.01338696, + "learning_rate": 0.000996681845600459, + "loss": 1.02424693, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.27124023, + "step": 341, + "time_per_iteration": 2.676576852798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092801, + "balance_loss_mlp": 1.06526327, + "epoch": 0.06579453636013852, + "flos": 413230961664.0, + "grad_norm": 0.09337377055156564, + "language_loss": 0.93410671, + "learning_rate": 0.0009966459169777982, + "loss": 0.94503474, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.27563477, + "step": 342, + "time_per_iteration": 2.5015692710876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093959, + "balance_loss_mlp": 1.06565928, + "epoch": 0.06598691804540208, + "flos": 560354457600.0, + "grad_norm": 0.06741983677161045, + "language_loss": 1.02151966, + "learning_rate": 0.0009966097955400924, + "loss": 1.03245926, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.28320312, + "step": 343, + "time_per_iteration": 2.679197311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108203, + "balance_loss_mlp": 1.054111, + "epoch": 0.06617929973066564, + "flos": 571789117440.0, + "grad_norm": 0.10243167176705169, + "language_loss": 0.95901835, + "learning_rate": 0.0009965734813013652, + "loss": 0.96983862, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.27954102, + "step": 344, + "time_per_iteration": 2.7926278114318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094237, + "balance_loss_mlp": 1.06638968, + "epoch": 0.06637168141592921, + "flos": 490234470912.0, + "grad_norm": 0.07573309355987462, + "language_loss": 0.97904384, + "learning_rate": 0.0009965369742757151, + "loss": 0.98998624, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.27856445, + "step": 345, + "time_per_iteration": 2.5709216594696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094661, + "balance_loss_mlp": 1.06564522, + "epoch": 0.06656406310119277, + "flos": 1078735656960.0, + "grad_norm": 0.07452264052062355, + "language_loss": 0.94766545, + "learning_rate": 0.0009965002744773152, + "loss": 0.95861208, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.28979492, + "step": 346, + "time_per_iteration": 3.500114679336548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110333, + "balance_loss_mlp": 1.0740993, + "epoch": 0.06675644478645633, + "flos": 513421065216.0, + "grad_norm": 0.06770544307121987, + "language_loss": 0.92343372, + "learning_rate": 0.0009964633819204139, + "loss": 0.93446708, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.29223633, + "step": 347, + "time_per_iteration": 2.660534143447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01495519, + "balance_loss_mlp": 1.47739971, + "epoch": 0.06694882647171989, + "flos": 1446359943168.0, + "grad_norm": 0.07316018638585145, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83296633, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.18164062, + "step": 348, + "time_per_iteration": 4.936125040054321 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01453408, + "balance_loss_mlp": 1.43557465, + "epoch": 0.06714120815698346, + "flos": 1551230784000.0, + "grad_norm": 0.05966333264944154, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76607287, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.17871094, + "step": 349, + "time_per_iteration": 4.916368722915649 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121413, + "balance_loss_mlp": 1.09161115, + "epoch": 0.06733358984224702, + "flos": 879689673216.0, + "grad_norm": 0.09818918049538049, + "language_loss": 0.91932184, + "learning_rate": 0.000996351547842304, + "loss": 0.93053597, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.29760742, + "step": 350, + "time_per_iteration": 3.1482698917388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116118, + "balance_loss_mlp": 1.08686399, + "epoch": 0.06752597152751058, + "flos": 518654651904.0, + "grad_norm": 0.08574695638310478, + "language_loss": 0.9006294, + "learning_rate": 0.0009963138843953744, + "loss": 0.91179061, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.29223633, + "step": 351, + "time_per_iteration": 2.6286985874176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112662, + "balance_loss_mlp": 1.09572136, + "epoch": 0.06771835321277414, + "flos": 539366266368.0, + "grad_norm": 0.062103550545623463, + "language_loss": 0.94588864, + "learning_rate": 0.000996276028262306, + "loss": 0.95715487, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.30859375, + "step": 352, + "time_per_iteration": 2.8076047897338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118319, + "balance_loss_mlp": 1.08899331, + "epoch": 0.0679107348980377, + "flos": 460430604288.0, + "grad_norm": 0.08848881047736162, + "language_loss": 1.00543904, + "learning_rate": 0.0009962379794577964, + "loss": 1.01662219, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.29296875, + "step": 353, + "time_per_iteration": 2.6257643699645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126251, + "balance_loss_mlp": 1.09525669, + "epoch": 0.06810311658330127, + "flos": 635601752064.0, + "grad_norm": 0.07023516682391727, + "language_loss": 0.91387081, + "learning_rate": 0.000996199737996617, + "loss": 0.92513329, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.31005859, + "step": 354, + "time_per_iteration": 2.9115777015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108023, + "balance_loss_mlp": 1.07862616, + "epoch": 0.06829549826856483, + "flos": 464443448832.0, + "grad_norm": 0.10590106261560671, + "language_loss": 0.99111325, + "learning_rate": 0.0009961613038936149, + "loss": 1.00219345, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.29345703, + "step": 355, + "time_per_iteration": 2.632269859313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106903, + "balance_loss_mlp": 1.07848334, + "epoch": 0.06848787995382839, + "flos": 634335929856.0, + "grad_norm": 0.06351615461114794, + "language_loss": 0.92452097, + "learning_rate": 0.000996122677163711, + "loss": 0.93559003, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.28417969, + "step": 356, + "time_per_iteration": 2.8401455879211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116364, + "balance_loss_mlp": 1.08880246, + "epoch": 0.06868026163909195, + "flos": 806023913472.0, + "grad_norm": 0.08494375059258584, + "language_loss": 0.98204505, + "learning_rate": 0.000996083857821902, + "loss": 0.99320877, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.27612305, + "step": 357, + "time_per_iteration": 3.0309877395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123871, + "balance_loss_mlp": 1.09387815, + "epoch": 0.06887264332435553, + "flos": 438997252608.0, + "grad_norm": 0.09643576242322613, + "language_loss": 0.95811963, + "learning_rate": 0.0009960448458832588, + "loss": 0.96935833, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.30004883, + "step": 358, + "time_per_iteration": 2.6974990367889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119311, + "balance_loss_mlp": 1.09053433, + "epoch": 0.06906502500961909, + "flos": 484513463808.0, + "grad_norm": 0.08018524599206517, + "language_loss": 0.95721531, + "learning_rate": 0.000996005641362927, + "loss": 0.96840835, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.28735352, + "step": 359, + "time_per_iteration": 2.589519739151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125244, + "balance_loss_mlp": 1.09663391, + "epoch": 0.06925740669488265, + "flos": 733293706752.0, + "grad_norm": 0.08939873306910956, + "language_loss": 0.98375708, + "learning_rate": 0.0009959662442761274, + "loss": 0.99500948, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.28613281, + "step": 360, + "time_per_iteration": 2.9202845096588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121734, + "balance_loss_mlp": 1.09360027, + "epoch": 0.0694497883801462, + "flos": 552127947264.0, + "grad_norm": 0.08129648248307358, + "language_loss": 0.92418718, + "learning_rate": 0.000995926654638155, + "loss": 0.93540448, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.28149414, + "step": 361, + "time_per_iteration": 2.807333469390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125246, + "balance_loss_mlp": 1.09706521, + "epoch": 0.06964217006540978, + "flos": 677708992512.0, + "grad_norm": 0.09207283388165423, + "language_loss": 0.94086993, + "learning_rate": 0.00099588687246438, + "loss": 0.95212233, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.28222656, + "step": 362, + "time_per_iteration": 2.841938018798828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144139, + "balance_loss_mlp": 1.1155293, + "epoch": 0.06983455175067334, + "flos": 523987163136.0, + "grad_norm": 0.09456174795196681, + "language_loss": 1.01274741, + "learning_rate": 0.0009958468977702471, + "loss": 1.02418876, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.28588867, + "step": 363, + "time_per_iteration": 2.633852958679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01648964, + "balance_loss_mlp": 1.62617075, + "epoch": 0.0700269334359369, + "flos": 1575943658496.0, + "grad_norm": 0.13616610145697036, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81383669, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.22753906, + "step": 364, + "time_per_iteration": 4.863068580627441 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011272, + "balance_loss_mlp": 1.09961534, + "epoch": 0.07021931512120046, + "flos": 1012848274944.0, + "grad_norm": 0.09005148424800312, + "language_loss": 0.90165555, + "learning_rate": 0.0009957663708830612, + "loss": 0.91292757, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.27612305, + "step": 365, + "time_per_iteration": 3.281414031982422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123586, + "balance_loss_mlp": 1.09442711, + "epoch": 0.07041169680646403, + "flos": 822622348800.0, + "grad_norm": 0.09334468540758137, + "language_loss": 0.91653895, + "learning_rate": 0.0009957258187212714, + "loss": 0.92777479, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.29174805, + "step": 366, + "time_per_iteration": 3.038696050643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01445219, + "balance_loss_mlp": 1.42652738, + "epoch": 0.07060407849172759, + "flos": 1413670993920.0, + "grad_norm": 0.06427367616648676, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80640084, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.18652344, + "step": 367, + "time_per_iteration": 4.7983925342559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116044, + "balance_loss_mlp": 1.08788657, + "epoch": 0.07079646017699115, + "flos": 512652837888.0, + "grad_norm": 0.13146714334583684, + "language_loss": 0.89768213, + "learning_rate": 0.0009956441370400167, + "loss": 0.90884256, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.28173828, + "step": 368, + "time_per_iteration": 2.6321308612823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119626, + "balance_loss_mlp": 1.09201741, + "epoch": 0.07098884186225471, + "flos": 540240772608.0, + "grad_norm": 0.12272393932614807, + "language_loss": 0.9541142, + "learning_rate": 0.0009956030075522636, + "loss": 0.96531045, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.27636719, + "step": 369, + "time_per_iteration": 2.772404909133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114007, + "balance_loss_mlp": 1.08685124, + "epoch": 0.07118122354751828, + "flos": 548419230720.0, + "grad_norm": 0.09366652552108264, + "language_loss": 0.95805156, + "learning_rate": 0.0009955616856543587, + "loss": 0.96919167, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.2722168, + "step": 370, + "time_per_iteration": 2.628877878189087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113669, + "balance_loss_mlp": 1.08608413, + "epoch": 0.07137360523278184, + "flos": 620612554752.0, + "grad_norm": 0.08609469252939483, + "language_loss": 0.88399851, + "learning_rate": 0.0009955201713623448, + "loss": 0.89513522, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.27612305, + "step": 371, + "time_per_iteration": 2.7591450214385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328242, + "balance_loss_mlp": 1.31155288, + "epoch": 0.0715659869180454, + "flos": 1501850115072.0, + "grad_norm": 0.05190160953718325, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78000963, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.16699219, + "step": 372, + "time_per_iteration": 4.995140552520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101786, + "balance_loss_mlp": 1.07563186, + "epoch": 0.07175836860330896, + "flos": 495246887424.0, + "grad_norm": 0.13457072532657127, + "language_loss": 1.02136469, + "learning_rate": 0.0009954365656605333, + "loss": 1.03238261, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.26184082, + "step": 373, + "time_per_iteration": 2.56646990776062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106565, + "balance_loss_mlp": 1.07979035, + "epoch": 0.07195075028857253, + "flos": 785387902464.0, + "grad_norm": 0.08663326270818063, + "language_loss": 0.94899744, + "learning_rate": 0.0009953944742831947, + "loss": 0.96006304, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.26831055, + "step": 374, + "time_per_iteration": 2.9695053100585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102573, + "balance_loss_mlp": 1.07596529, + "epoch": 0.0721431319738361, + "flos": 592799067648.0, + "grad_norm": 0.09289035836035217, + "language_loss": 0.97933537, + "learning_rate": 0.0009953521905766642, + "loss": 0.99036103, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.26647949, + "step": 375, + "time_per_iteration": 2.942178249359131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113342, + "balance_loss_mlp": 1.08630502, + "epoch": 0.07233551365909965, + "flos": 547981272576.0, + "grad_norm": 0.10463311528366259, + "language_loss": 0.97135454, + "learning_rate": 0.0009953097145573577, + "loss": 0.98248798, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.27075195, + "step": 376, + "time_per_iteration": 2.6447842121124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115303, + "balance_loss_mlp": 1.08645439, + "epoch": 0.07252789534436321, + "flos": 957170428416.0, + "grad_norm": 0.10778381820568583, + "language_loss": 0.93408906, + "learning_rate": 0.000995267046241766, + "loss": 0.94524205, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.28808594, + "step": 377, + "time_per_iteration": 3.281200647354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106472, + "balance_loss_mlp": 1.07807684, + "epoch": 0.07272027702962677, + "flos": 507398902272.0, + "grad_norm": 0.08395054735439604, + "language_loss": 0.93929148, + "learning_rate": 0.0009952241856464547, + "loss": 0.95035625, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.28393555, + "step": 378, + "time_per_iteration": 2.6047444343566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011332, + "balance_loss_mlp": 1.10265875, + "epoch": 0.07291265871489035, + "flos": 612128558592.0, + "grad_norm": 0.10390894184481733, + "language_loss": 0.9941417, + "learning_rate": 0.0009951811327880632, + "loss": 1.00547373, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.30541992, + "step": 379, + "time_per_iteration": 2.726473331451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142545, + "balance_loss_mlp": 1.11162257, + "epoch": 0.0731050404001539, + "flos": 495502963200.0, + "grad_norm": 0.10097597522795056, + "language_loss": 0.93640876, + "learning_rate": 0.0009951378876833063, + "loss": 0.94783425, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.30908203, + "step": 380, + "time_per_iteration": 2.5623717308044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113686, + "balance_loss_mlp": 1.10598469, + "epoch": 0.07329742208541747, + "flos": 639677205504.0, + "grad_norm": 0.09709945532148136, + "language_loss": 1.0008266, + "learning_rate": 0.0009950944503489736, + "loss": 1.01219511, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.30834961, + "step": 381, + "time_per_iteration": 2.7424862384796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125905, + "balance_loss_mlp": 1.0951966, + "epoch": 0.07348980377068103, + "flos": 815999284224.0, + "grad_norm": 0.08729931882910318, + "language_loss": 0.94688666, + "learning_rate": 0.0009950508208019285, + "loss": 0.95814574, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.30664062, + "step": 382, + "time_per_iteration": 3.011807441711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115566, + "balance_loss_mlp": 1.08612156, + "epoch": 0.0736821854559446, + "flos": 508383917568.0, + "grad_norm": 0.09192641530722392, + "language_loss": 0.98937929, + "learning_rate": 0.0009950069990591096, + "loss": 1.00053501, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.29418945, + "step": 383, + "time_per_iteration": 2.634744644165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266456, + "balance_loss_mlp": 1.25081599, + "epoch": 0.07387456714120816, + "flos": 1553801716224.0, + "grad_norm": 0.07157218635827683, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77667826, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.15625, + "step": 384, + "time_per_iteration": 4.909826993942261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123479, + "balance_loss_mlp": 1.093009, + "epoch": 0.07406694882647172, + "flos": 525219489792.0, + "grad_norm": 0.09152581134979716, + "language_loss": 0.9216727, + "learning_rate": 0.0009949187790542777, + "loss": 0.93290746, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.30419922, + "step": 385, + "time_per_iteration": 2.7053511142730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126669, + "balance_loss_mlp": 1.09605598, + "epoch": 0.07425933051173528, + "flos": 497468611584.0, + "grad_norm": 0.0847962235917395, + "language_loss": 0.87653643, + "learning_rate": 0.0009948743808265148, + "loss": 0.88780314, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.30566406, + "step": 386, + "time_per_iteration": 2.678089141845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138501, + "balance_loss_mlp": 1.10865068, + "epoch": 0.07445171219699885, + "flos": 504740630016.0, + "grad_norm": 0.08492617281736899, + "language_loss": 0.97336739, + "learning_rate": 0.0009948297904714782, + "loss": 0.98475236, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.29833984, + "step": 387, + "time_per_iteration": 2.7185778617858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146366, + "balance_loss_mlp": 1.11620593, + "epoch": 0.07464409388226241, + "flos": 553693515264.0, + "grad_norm": 0.07151378861674496, + "language_loss": 0.90523744, + "learning_rate": 0.0009947850080064796, + "loss": 0.91670114, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.30151367, + "step": 388, + "time_per_iteration": 2.799166202545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158036, + "balance_loss_mlp": 1.12689841, + "epoch": 0.07483647556752597, + "flos": 776511028224.0, + "grad_norm": 0.11664332596196766, + "language_loss": 0.94951898, + "learning_rate": 0.0009947400334489047, + "loss": 0.96109939, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.31103516, + "step": 389, + "time_per_iteration": 3.0231211185455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146122, + "balance_loss_mlp": 1.11646235, + "epoch": 0.07502885725278953, + "flos": 612256596480.0, + "grad_norm": 0.09913116245985863, + "language_loss": 0.85822582, + "learning_rate": 0.0009946948668162145, + "loss": 0.86968708, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.29638672, + "step": 390, + "time_per_iteration": 2.8080904483795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129912, + "balance_loss_mlp": 1.09910846, + "epoch": 0.0752212389380531, + "flos": 688324552704.0, + "grad_norm": 0.1060751216039937, + "language_loss": 0.91006148, + "learning_rate": 0.0009946495081259441, + "loss": 0.92136061, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.30786133, + "step": 391, + "time_per_iteration": 2.853335380554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125709, + "balance_loss_mlp": 1.09528649, + "epoch": 0.07541362062331666, + "flos": 765362967552.0, + "grad_norm": 0.10996734320487103, + "language_loss": 0.93701887, + "learning_rate": 0.0009946039573957035, + "loss": 0.94827592, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.30371094, + "step": 392, + "time_per_iteration": 2.926420211791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107845, + "balance_loss_mlp": 1.07887673, + "epoch": 0.07560600230858022, + "flos": 588460336128.0, + "grad_norm": 0.10253812696642157, + "language_loss": 0.91059798, + "learning_rate": 0.000994558214643177, + "loss": 0.92167646, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.28979492, + "step": 393, + "time_per_iteration": 2.783536434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103583, + "balance_loss_mlp": 1.07344699, + "epoch": 0.07579838399384378, + "flos": 749508028416.0, + "grad_norm": 0.08274248346409746, + "language_loss": 0.91916323, + "learning_rate": 0.000994512279886123, + "loss": 0.93019903, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.30078125, + "step": 394, + "time_per_iteration": 3.0799474716186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099135, + "balance_loss_mlp": 1.06902301, + "epoch": 0.07599076567910736, + "flos": 523185440256.0, + "grad_norm": 0.06927054930208885, + "language_loss": 0.93251747, + "learning_rate": 0.0009944661531423758, + "loss": 0.9435088, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.30078125, + "step": 395, + "time_per_iteration": 2.6641883850097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103492, + "balance_loss_mlp": 1.07383251, + "epoch": 0.07618314736437092, + "flos": 550812662784.0, + "grad_norm": 0.09904896099194287, + "language_loss": 0.91404933, + "learning_rate": 0.000994419834429843, + "loss": 0.92508423, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.29638672, + "step": 396, + "time_per_iteration": 2.661850690841675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114049, + "balance_loss_mlp": 1.08257747, + "epoch": 0.07637552904963447, + "flos": 697901253120.0, + "grad_norm": 0.10979610845710805, + "language_loss": 0.93416023, + "learning_rate": 0.0009943733237665069, + "loss": 0.94530076, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.31445312, + "step": 397, + "time_per_iteration": 2.854339361190796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111663, + "balance_loss_mlp": 1.08561158, + "epoch": 0.07656791073489803, + "flos": 579066928128.0, + "grad_norm": 0.07380051857889673, + "language_loss": 0.9521122, + "learning_rate": 0.0009943266211704248, + "loss": 0.96327847, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.30981445, + "step": 398, + "time_per_iteration": 2.958059787750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110413, + "balance_loss_mlp": 1.0786798, + "epoch": 0.0767602924201616, + "flos": 416923711488.0, + "grad_norm": 0.09100164928673704, + "language_loss": 0.97291386, + "learning_rate": 0.000994279726659728, + "loss": 0.98401797, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.31713867, + "step": 399, + "time_per_iteration": 2.5242953300476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128703, + "balance_loss_mlp": 1.09413218, + "epoch": 0.07695267410542517, + "flos": 482671471104.0, + "grad_norm": 0.09258616119375639, + "language_loss": 0.92782032, + "learning_rate": 0.0009942326402526231, + "loss": 0.93910736, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.34594727, + "step": 400, + "time_per_iteration": 2.5207695960998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143337, + "balance_loss_mlp": 1.10955346, + "epoch": 0.07714505579068873, + "flos": 530742647808.0, + "grad_norm": 0.07710774358121592, + "language_loss": 0.92332727, + "learning_rate": 0.0009941853619673902, + "loss": 0.93476063, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.33789062, + "step": 401, + "time_per_iteration": 2.6304752826690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142379, + "balance_loss_mlp": 1.10947704, + "epoch": 0.07733743747595229, + "flos": 804635845632.0, + "grad_norm": 0.09709488616354546, + "language_loss": 0.95104444, + "learning_rate": 0.0009941378918223844, + "loss": 0.96246827, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.32885742, + "step": 402, + "time_per_iteration": 3.0903730392456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136269, + "balance_loss_mlp": 1.10186553, + "epoch": 0.07752981916121585, + "flos": 622192679424.0, + "grad_norm": 0.09176808059924663, + "language_loss": 0.88839906, + "learning_rate": 0.0009940902298360354, + "loss": 0.89976174, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.34423828, + "step": 403, + "time_per_iteration": 2.7252347469329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129357, + "balance_loss_mlp": 1.09478593, + "epoch": 0.07772220084647942, + "flos": 727961195520.0, + "grad_norm": 0.08094022735558755, + "language_loss": 0.96807957, + "learning_rate": 0.0009940423760268473, + "loss": 0.9793731, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.34619141, + "step": 404, + "time_per_iteration": 2.912560224533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136255, + "balance_loss_mlp": 1.0998956, + "epoch": 0.07791458253174298, + "flos": 555149984256.0, + "grad_norm": 0.1131644160055788, + "language_loss": 0.90535253, + "learning_rate": 0.0009939943304133982, + "loss": 0.91671515, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.36352539, + "step": 405, + "time_per_iteration": 2.691524028778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128492, + "balance_loss_mlp": 1.09301567, + "epoch": 0.07810696421700654, + "flos": 552919495680.0, + "grad_norm": 0.0877419108538044, + "language_loss": 0.97356665, + "learning_rate": 0.0009939460930143416, + "loss": 0.9848516, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.35522461, + "step": 406, + "time_per_iteration": 2.624150276184082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130223, + "balance_loss_mlp": 1.09484172, + "epoch": 0.0782993459022701, + "flos": 650323289088.0, + "grad_norm": 0.0945833964014614, + "language_loss": 0.92588282, + "learning_rate": 0.0009938976638484043, + "loss": 0.93718511, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.35400391, + "step": 407, + "time_per_iteration": 2.943443775177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132372, + "balance_loss_mlp": 1.09625125, + "epoch": 0.07849172758753367, + "flos": 495926364672.0, + "grad_norm": 0.11302097827133319, + "language_loss": 0.90334702, + "learning_rate": 0.0009938490429343887, + "loss": 0.91467071, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.36157227, + "step": 408, + "time_per_iteration": 2.5614538192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156856, + "balance_loss_mlp": 1.11870956, + "epoch": 0.07868410927279723, + "flos": 577696389120.0, + "grad_norm": 0.08706398753077066, + "language_loss": 0.9151262, + "learning_rate": 0.0009938002302911709, + "loss": 0.92669487, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.38134766, + "step": 409, + "time_per_iteration": 2.7606911659240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185423, + "balance_loss_mlp": 1.14758611, + "epoch": 0.07887649095806079, + "flos": 522698019840.0, + "grad_norm": 0.11763043112663725, + "language_loss": 0.93195748, + "learning_rate": 0.0009937512259377015, + "loss": 0.94381177, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.37841797, + "step": 410, + "time_per_iteration": 2.664318323135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187972, + "balance_loss_mlp": 1.15199518, + "epoch": 0.07906887264332435, + "flos": 556958481408.0, + "grad_norm": 0.10450629225071802, + "language_loss": 0.93972069, + "learning_rate": 0.000993702029893006, + "loss": 0.95160043, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.359375, + "step": 411, + "time_per_iteration": 2.78944730758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182604, + "balance_loss_mlp": 1.14679348, + "epoch": 0.07926125432858792, + "flos": 821641715712.0, + "grad_norm": 0.0999267349206771, + "language_loss": 0.93036819, + "learning_rate": 0.0009936526421761838, + "loss": 0.94219422, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.3581543, + "step": 412, + "time_per_iteration": 3.070317268371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138713, + "balance_loss_mlp": 1.1031884, + "epoch": 0.07945363601385148, + "flos": 562072794624.0, + "grad_norm": 0.103699157973277, + "language_loss": 0.95454085, + "learning_rate": 0.000993603062806409, + "loss": 0.96592796, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.35546875, + "step": 413, + "time_per_iteration": 2.6778509616851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111475, + "balance_loss_mlp": 1.080966, + "epoch": 0.07964601769911504, + "flos": 517615792128.0, + "grad_norm": 0.1031900517026183, + "language_loss": 0.96687901, + "learning_rate": 0.0009935532918029298, + "loss": 0.97802651, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.33813477, + "step": 414, + "time_per_iteration": 2.598691701889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115143, + "balance_loss_mlp": 1.08016729, + "epoch": 0.0798383993843786, + "flos": 538956011520.0, + "grad_norm": 0.10374121868926973, + "language_loss": 0.91896659, + "learning_rate": 0.0009935033291850694, + "loss": 0.93011802, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.35009766, + "step": 415, + "time_per_iteration": 2.6626100540161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136571, + "balance_loss_mlp": 1.10121322, + "epoch": 0.08003078106964218, + "flos": 484901959680.0, + "grad_norm": 0.1007950470797911, + "language_loss": 0.94399852, + "learning_rate": 0.0009934531749722247, + "loss": 0.95536423, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.35351562, + "step": 416, + "time_per_iteration": 2.6062543392181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161455, + "balance_loss_mlp": 1.12671685, + "epoch": 0.08022316275490574, + "flos": 517999905792.0, + "grad_norm": 0.14193661609984684, + "language_loss": 0.91743952, + "learning_rate": 0.0009934028291838672, + "loss": 0.92905408, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.34790039, + "step": 417, + "time_per_iteration": 2.7159759998321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170119, + "balance_loss_mlp": 1.134166, + "epoch": 0.0804155444401693, + "flos": 493755512832.0, + "grad_norm": 0.12060272101738621, + "language_loss": 0.87969685, + "learning_rate": 0.0009933522918395433, + "loss": 0.89139807, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.35961914, + "step": 418, + "time_per_iteration": 2.6525259017944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288605, + "balance_loss_mlp": 1.26361907, + "epoch": 0.08060792612543285, + "flos": 1580567579136.0, + "grad_norm": 0.05680606480361405, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79539704, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.24902344, + "step": 419, + "time_per_iteration": 4.8565216064453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147061, + "balance_loss_mlp": 1.11074984, + "epoch": 0.08080030781069643, + "flos": 525090041856.0, + "grad_norm": 0.12828879348175987, + "language_loss": 1.03302395, + "learning_rate": 0.000993250642561551, + "loss": 1.04449451, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.36279297, + "step": 420, + "time_per_iteration": 2.6118712425231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139372, + "balance_loss_mlp": 1.10284615, + "epoch": 0.08099268949595999, + "flos": 546459374592.0, + "grad_norm": 0.09279765906948532, + "language_loss": 0.90646845, + "learning_rate": 0.0009931995306673466, + "loss": 0.91786218, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.36499023, + "step": 421, + "time_per_iteration": 2.7097063064575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137947, + "balance_loss_mlp": 1.10170722, + "epoch": 0.08118507118122355, + "flos": 510116811264.0, + "grad_norm": 0.12264346802799699, + "language_loss": 0.9584164, + "learning_rate": 0.000993148227296103, + "loss": 0.96979594, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.36254883, + "step": 422, + "time_per_iteration": 2.6224865913391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112855, + "balance_loss_mlp": 1.093431, + "epoch": 0.08137745286648711, + "flos": 720339969024.0, + "grad_norm": 0.09272021371299098, + "language_loss": 0.85445499, + "learning_rate": 0.000993096732467738, + "loss": 0.86574042, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.3515625, + "step": 423, + "time_per_iteration": 2.9733965396881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140481, + "balance_loss_mlp": 1.10407472, + "epoch": 0.08156983455175067, + "flos": 679313848320.0, + "grad_norm": 0.12206645659912072, + "language_loss": 0.90398526, + "learning_rate": 0.0009930450462022435, + "loss": 0.91539013, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.36376953, + "step": 424, + "time_per_iteration": 2.8079323768615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300116, + "balance_loss_mlp": 1.2751298, + "epoch": 0.08176221623701424, + "flos": 1452577135104.0, + "grad_norm": 0.07506497844528874, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80489922, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.24902344, + "step": 425, + "time_per_iteration": 4.905512809753418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121946, + "balance_loss_mlp": 1.08668423, + "epoch": 0.0819545979222778, + "flos": 1556034071040.0, + "grad_norm": 0.10499242287280508, + "language_loss": 0.89529157, + "learning_rate": 0.0009929410994402065, + "loss": 0.90651101, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.35327148, + "step": 426, + "time_per_iteration": 3.7398970127105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141941, + "balance_loss_mlp": 1.1045804, + "epoch": 0.08214697960754136, + "flos": 512456398848.0, + "grad_norm": 0.10023640482449404, + "language_loss": 0.93921095, + "learning_rate": 0.0009928888389840196, + "loss": 0.95063031, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.3737793, + "step": 427, + "time_per_iteration": 2.71114182472229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119992, + "balance_loss_mlp": 1.08430111, + "epoch": 0.08233936129280492, + "flos": 594850646016.0, + "grad_norm": 0.11276239209208863, + "language_loss": 0.96473306, + "learning_rate": 0.0009928363871714147, + "loss": 0.97593296, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.35742188, + "step": 428, + "time_per_iteration": 2.719052314758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118917, + "balance_loss_mlp": 1.0826056, + "epoch": 0.08253174297806849, + "flos": 571758594048.0, + "grad_norm": 0.08720961611908505, + "language_loss": 0.91275012, + "learning_rate": 0.0009927837440227556, + "loss": 0.92393929, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.36303711, + "step": 429, + "time_per_iteration": 2.854044198989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098875, + "balance_loss_mlp": 1.06418514, + "epoch": 0.08272412466333205, + "flos": 623065623552.0, + "grad_norm": 0.07075242488451733, + "language_loss": 0.87952864, + "learning_rate": 0.0009927309095584798, + "loss": 0.89051735, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.34692383, + "step": 430, + "time_per_iteration": 2.9898674488067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102101, + "balance_loss_mlp": 1.06748247, + "epoch": 0.08291650634859561, + "flos": 513745542144.0, + "grad_norm": 0.11797379038125863, + "language_loss": 0.97102249, + "learning_rate": 0.0009926778837991, + "loss": 0.9820435, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.34643555, + "step": 431, + "time_per_iteration": 2.577531099319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111279, + "balance_loss_mlp": 1.07582581, + "epoch": 0.08310888803385917, + "flos": 667073083392.0, + "grad_norm": 0.09137951270996447, + "language_loss": 0.95161557, + "learning_rate": 0.000992624666765202, + "loss": 0.96272832, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.35498047, + "step": 432, + "time_per_iteration": 2.841384172439575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141665, + "balance_loss_mlp": 1.10540199, + "epoch": 0.08330126971912274, + "flos": 582995404800.0, + "grad_norm": 0.1226792169188856, + "language_loss": 0.92907685, + "learning_rate": 0.000992571258477447, + "loss": 0.94049346, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.36279297, + "step": 433, + "time_per_iteration": 2.810683250427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131869, + "balance_loss_mlp": 1.0957005, + "epoch": 0.0834936514043863, + "flos": 561064458240.0, + "grad_norm": 0.09107414958413955, + "language_loss": 0.88094407, + "learning_rate": 0.0009925176589565695, + "loss": 0.8922627, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.36206055, + "step": 434, + "time_per_iteration": 2.7925446033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112767, + "balance_loss_mlp": 1.09081006, + "epoch": 0.08368603308964986, + "flos": 494272046592.0, + "grad_norm": 0.12869710653201102, + "language_loss": 0.96048987, + "learning_rate": 0.0009924638682233791, + "loss": 0.97176659, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.36865234, + "step": 435, + "time_per_iteration": 2.578301191329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293384, + "balance_loss_mlp": 1.26963747, + "epoch": 0.08387841477491342, + "flos": 1388322312192.0, + "grad_norm": 0.05787730041443156, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80857974, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.23730469, + "step": 436, + "time_per_iteration": 4.577009201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105613, + "balance_loss_mlp": 1.07092249, + "epoch": 0.084070796460177, + "flos": 798642796032.0, + "grad_norm": 0.09893423016048233, + "language_loss": 0.86262441, + "learning_rate": 0.0009923557132036668, + "loss": 0.87368047, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.34716797, + "step": 437, + "time_per_iteration": 3.0512332916259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111937, + "balance_loss_mlp": 1.07641208, + "epoch": 0.08426317814544056, + "flos": 558681200640.0, + "grad_norm": 0.08022134137003532, + "language_loss": 0.92201281, + "learning_rate": 0.0009923013489591345, + "loss": 0.93313217, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.35571289, + "step": 438, + "time_per_iteration": 2.74950909614563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101532, + "balance_loss_mlp": 1.06724763, + "epoch": 0.08445555983070412, + "flos": 810057106944.0, + "grad_norm": 0.100162941065544, + "language_loss": 0.90520388, + "learning_rate": 0.0009922467935862681, + "loss": 0.91621923, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.34326172, + "step": 439, + "time_per_iteration": 3.0904464721679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117546, + "balance_loss_mlp": 1.08307123, + "epoch": 0.08464794151596768, + "flos": 509939311104.0, + "grad_norm": 0.0868598025723284, + "language_loss": 0.93269211, + "learning_rate": 0.0009921920471062478, + "loss": 0.94386756, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.34521484, + "step": 440, + "time_per_iteration": 2.5794718265533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129423, + "balance_loss_mlp": 1.09458995, + "epoch": 0.08484032320123125, + "flos": 556149556224.0, + "grad_norm": 0.08760481485615552, + "language_loss": 0.90004873, + "learning_rate": 0.0009921371095403281, + "loss": 0.91134298, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.34863281, + "step": 441, + "time_per_iteration": 2.6602251529693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146723, + "balance_loss_mlp": 1.11010158, + "epoch": 0.08503270488649481, + "flos": 527103742464.0, + "grad_norm": 0.0774335957746243, + "language_loss": 0.93349928, + "learning_rate": 0.0009920819809098379, + "loss": 0.9449665, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.3659668, + "step": 442, + "time_per_iteration": 2.601776123046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154219, + "balance_loss_mlp": 1.11693072, + "epoch": 0.08522508657175837, + "flos": 613989490176.0, + "grad_norm": 0.07362842569129122, + "language_loss": 0.88841242, + "learning_rate": 0.0009920266612361798, + "loss": 0.89995468, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.37255859, + "step": 443, + "time_per_iteration": 2.730400800704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134029, + "balance_loss_mlp": 1.09712195, + "epoch": 0.08541746825702193, + "flos": 619495119360.0, + "grad_norm": 0.07691784169579122, + "language_loss": 0.90311241, + "learning_rate": 0.0009919711505408308, + "loss": 0.91445279, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.36889648, + "step": 444, + "time_per_iteration": 2.784175395965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136596, + "balance_loss_mlp": 1.0992831, + "epoch": 0.08560984994228549, + "flos": 482671471104.0, + "grad_norm": 0.10632405925705127, + "language_loss": 0.87768185, + "learning_rate": 0.000991915448845342, + "loss": 0.8890478, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.37329102, + "step": 445, + "time_per_iteration": 2.5208120346069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131693, + "balance_loss_mlp": 1.09581065, + "epoch": 0.08580223162754906, + "flos": 516897027072.0, + "grad_norm": 0.08773057765175464, + "language_loss": 0.96764338, + "learning_rate": 0.000991859556171339, + "loss": 0.97896028, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.35888672, + "step": 446, + "time_per_iteration": 2.62111759185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121751, + "balance_loss_mlp": 1.08582091, + "epoch": 0.08599461331281262, + "flos": 531215511552.0, + "grad_norm": 0.09700121256693707, + "language_loss": 0.97393352, + "learning_rate": 0.000991803472540521, + "loss": 0.98515099, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.359375, + "step": 447, + "time_per_iteration": 2.6023707389831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106051, + "balance_loss_mlp": 1.07155204, + "epoch": 0.08618699499807618, + "flos": 789966743040.0, + "grad_norm": 0.08203891217845936, + "language_loss": 0.9339667, + "learning_rate": 0.0009917471979746615, + "loss": 0.94502723, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.34521484, + "step": 448, + "time_per_iteration": 3.032045841217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108032, + "balance_loss_mlp": 1.07288861, + "epoch": 0.08637937668333974, + "flos": 565707317760.0, + "grad_norm": 0.07141468257554369, + "language_loss": 0.93266523, + "learning_rate": 0.0009916907324956086, + "loss": 0.94374555, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.35180664, + "step": 449, + "time_per_iteration": 2.7145769596099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124616, + "balance_loss_mlp": 1.08820987, + "epoch": 0.08657175836860331, + "flos": 444930665472.0, + "grad_norm": 0.07969277456361384, + "language_loss": 0.88546509, + "learning_rate": 0.0009916340761252837, + "loss": 0.89671123, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.36376953, + "step": 450, + "time_per_iteration": 2.623152017593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137489, + "balance_loss_mlp": 1.10108209, + "epoch": 0.08676414005386687, + "flos": 843789450240.0, + "grad_norm": 0.11402885145068274, + "language_loss": 0.86408567, + "learning_rate": 0.0009915772288856832, + "loss": 0.87546057, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.36474609, + "step": 451, + "time_per_iteration": 3.069053888320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137973, + "balance_loss_mlp": 1.10178065, + "epoch": 0.08695652173913043, + "flos": 602995608576.0, + "grad_norm": 0.09443027615205003, + "language_loss": 0.88496101, + "learning_rate": 0.000991520190798877, + "loss": 0.89634073, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.36206055, + "step": 452, + "time_per_iteration": 2.8196520805358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146235, + "balance_loss_mlp": 1.10906577, + "epoch": 0.08714890342439399, + "flos": 730423028736.0, + "grad_norm": 0.10286670415776202, + "language_loss": 0.95532084, + "learning_rate": 0.0009914629618870089, + "loss": 0.96678317, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.37158203, + "step": 453, + "time_per_iteration": 2.8787243366241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247018, + "balance_loss_mlp": 1.22422564, + "epoch": 0.08734128510965757, + "flos": 1481518232064.0, + "grad_norm": 0.049899161357568285, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79922891, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.22753906, + "step": 454, + "time_per_iteration": 4.787290811538696 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212398, + "balance_loss_mlp": 1.19036818, + "epoch": 0.08753366679492113, + "flos": 1522214083584.0, + "grad_norm": 0.0324381166824538, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82640362, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.22070312, + "step": 455, + "time_per_iteration": 4.818731784820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120368, + "balance_loss_mlp": 1.08324623, + "epoch": 0.08772604848018468, + "flos": 720935078400.0, + "grad_norm": 0.09487211541236003, + "language_loss": 0.89355373, + "learning_rate": 0.0009912901304235883, + "loss": 0.90475744, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.37133789, + "step": 456, + "time_per_iteration": 2.8851993083953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011174, + "balance_loss_mlp": 1.08108902, + "epoch": 0.08791843016544824, + "flos": 707926086144.0, + "grad_norm": 0.09303414624011808, + "language_loss": 0.85744059, + "learning_rate": 0.000991232138434397, + "loss": 0.86861455, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.36352539, + "step": 457, + "time_per_iteration": 2.8450586795806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118359, + "balance_loss_mlp": 1.08126163, + "epoch": 0.08811081185071182, + "flos": 472799407104.0, + "grad_norm": 0.11356405017629323, + "language_loss": 0.91543031, + "learning_rate": 0.000991173955731976, + "loss": 0.92661393, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.37084961, + "step": 458, + "time_per_iteration": 2.6324169635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117714, + "balance_loss_mlp": 1.08190393, + "epoch": 0.08830319353597538, + "flos": 684647769600.0, + "grad_norm": 0.08091220448679284, + "language_loss": 0.98039645, + "learning_rate": 0.0009911155823389137, + "loss": 0.99157357, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.3581543, + "step": 459, + "time_per_iteration": 2.9783670902252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121069, + "balance_loss_mlp": 1.08451915, + "epoch": 0.08849557522123894, + "flos": 573235411968.0, + "grad_norm": 0.0940583187075056, + "language_loss": 0.93095994, + "learning_rate": 0.000991057018277873, + "loss": 0.94217062, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.36499023, + "step": 460, + "time_per_iteration": 2.742830276489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112002, + "balance_loss_mlp": 1.08380461, + "epoch": 0.0886879569065025, + "flos": 564303283200.0, + "grad_norm": 0.10556048763009983, + "language_loss": 0.92411214, + "learning_rate": 0.0009909982635715898, + "loss": 0.93531239, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.36279297, + "step": 461, + "time_per_iteration": 2.613490581512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111287, + "balance_loss_mlp": 1.07595301, + "epoch": 0.08888033859176607, + "flos": 563609249280.0, + "grad_norm": 0.07908948831956038, + "language_loss": 0.92236221, + "learning_rate": 0.0009909393182428751, + "loss": 0.93347514, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.35351562, + "step": 462, + "time_per_iteration": 2.654144048690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109293, + "balance_loss_mlp": 1.07331538, + "epoch": 0.08907272027702963, + "flos": 465517214208.0, + "grad_norm": 0.06646518051532449, + "language_loss": 0.87202108, + "learning_rate": 0.000990880182314614, + "loss": 0.88311398, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.359375, + "step": 463, + "time_per_iteration": 2.705138921737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108897, + "balance_loss_mlp": 1.07473207, + "epoch": 0.08926510196229319, + "flos": 681200921088.0, + "grad_norm": 0.06803924695737752, + "language_loss": 0.88676465, + "learning_rate": 0.0009908208558097643, + "loss": 0.89785367, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.34204102, + "step": 464, + "time_per_iteration": 2.971322536468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120032, + "balance_loss_mlp": 1.08412576, + "epoch": 0.08945748364755675, + "flos": 596411831808.0, + "grad_norm": 0.15708102336048957, + "language_loss": 0.90012753, + "learning_rate": 0.000990761338751359, + "loss": 0.91132784, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.35913086, + "step": 465, + "time_per_iteration": 2.7719008922576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01301625, + "balance_loss_mlp": 1.28073931, + "epoch": 0.08964986533282032, + "flos": 1585082400768.0, + "grad_norm": 0.06799997970585842, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74961245, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.20898438, + "step": 466, + "time_per_iteration": 4.991540193557739 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131475, + "balance_loss_mlp": 1.09637952, + "epoch": 0.08984224701808388, + "flos": 533268499968.0, + "grad_norm": 0.10779867371948758, + "language_loss": 0.9214865, + "learning_rate": 0.0009906417330663815, + "loss": 0.93280125, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.35131836, + "step": 467, + "time_per_iteration": 2.7089412212371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124394, + "balance_loss_mlp": 1.08917928, + "epoch": 0.09003462870334744, + "flos": 478702296576.0, + "grad_norm": 0.08471126953208015, + "language_loss": 0.88495421, + "learning_rate": 0.0009905816444862442, + "loss": 0.89619815, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.35253906, + "step": 468, + "time_per_iteration": 2.616262435913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129147, + "balance_loss_mlp": 1.09371758, + "epoch": 0.090227010388611, + "flos": 653307448320.0, + "grad_norm": 0.07702844129808738, + "language_loss": 0.87126988, + "learning_rate": 0.0009905213654454216, + "loss": 0.88256133, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.35473633, + "step": 469, + "time_per_iteration": 2.9097750186920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143119, + "balance_loss_mlp": 1.10678387, + "epoch": 0.09041939207387456, + "flos": 617894645760.0, + "grad_norm": 0.09194049655048094, + "language_loss": 0.92914081, + "learning_rate": 0.0009904608959673158, + "loss": 0.9405719, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.36328125, + "step": 470, + "time_per_iteration": 2.8030929565429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141965, + "balance_loss_mlp": 1.10491443, + "epoch": 0.09061177375913813, + "flos": 454137808896.0, + "grad_norm": 0.10933441897375067, + "language_loss": 0.92262268, + "learning_rate": 0.000990400236075403, + "loss": 0.93404239, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.37036133, + "step": 471, + "time_per_iteration": 2.4859976768493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117092, + "balance_loss_mlp": 1.08183014, + "epoch": 0.0908041554444017, + "flos": 543982984704.0, + "grad_norm": 0.08808088949589198, + "language_loss": 0.90884256, + "learning_rate": 0.0009903393857932338, + "loss": 0.92001355, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.35302734, + "step": 472, + "time_per_iteration": 2.6540582180023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115458, + "balance_loss_mlp": 1.07933736, + "epoch": 0.09099653712966525, + "flos": 564052999680.0, + "grad_norm": 0.08261940405294126, + "language_loss": 0.88272375, + "learning_rate": 0.0009902783451444317, + "loss": 0.89387828, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.36108398, + "step": 473, + "time_per_iteration": 2.7061197757720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116197, + "balance_loss_mlp": 1.0812211, + "epoch": 0.09118891881492881, + "flos": 474300956160.0, + "grad_norm": 0.11656166861680099, + "language_loss": 0.93563545, + "learning_rate": 0.0009902171141526956, + "loss": 0.94679749, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.34960938, + "step": 474, + "time_per_iteration": 2.524653911590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111201, + "balance_loss_mlp": 1.0760566, + "epoch": 0.09138130050019239, + "flos": 545579076096.0, + "grad_norm": 0.07692578036886621, + "language_loss": 0.81933677, + "learning_rate": 0.000990155692841797, + "loss": 0.83045685, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.35961914, + "step": 475, + "time_per_iteration": 2.9645543098449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106767, + "balance_loss_mlp": 1.07281613, + "epoch": 0.09157368218545595, + "flos": 732397441536.0, + "grad_norm": 0.08052092373184025, + "language_loss": 0.93009984, + "learning_rate": 0.0009900940812355818, + "loss": 0.94116753, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.33959961, + "step": 476, + "time_per_iteration": 2.8816893100738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107557, + "balance_loss_mlp": 1.07289076, + "epoch": 0.0917660638707195, + "flos": 610709967360.0, + "grad_norm": 0.14442514829584613, + "language_loss": 0.87309504, + "learning_rate": 0.00099003227935797, + "loss": 0.88417065, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.34716797, + "step": 477, + "time_per_iteration": 2.760615825653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122629, + "balance_loss_mlp": 1.08827257, + "epoch": 0.09195844555598306, + "flos": 655561257984.0, + "grad_norm": 0.12539398809889843, + "language_loss": 0.9113583, + "learning_rate": 0.000989970287232955, + "loss": 0.92258459, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.34399414, + "step": 478, + "time_per_iteration": 2.826150894165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119533, + "balance_loss_mlp": 1.08720374, + "epoch": 0.09215082724124664, + "flos": 476339387904.0, + "grad_norm": 0.06731886459053077, + "language_loss": 0.89701962, + "learning_rate": 0.0009899081048846043, + "loss": 0.90821493, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.32324219, + "step": 479, + "time_per_iteration": 2.580028772354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143549, + "balance_loss_mlp": 1.1092639, + "epoch": 0.0923432089265102, + "flos": 524051182080.0, + "grad_norm": 0.1155425244176876, + "language_loss": 0.9372611, + "learning_rate": 0.0009898457323370593, + "loss": 0.94869661, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.34301758, + "step": 480, + "time_per_iteration": 2.6090288162231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135277, + "balance_loss_mlp": 1.10132647, + "epoch": 0.09253559061177376, + "flos": 545302651392.0, + "grad_norm": 0.08946460297910715, + "language_loss": 0.92488086, + "learning_rate": 0.000989783169614535, + "loss": 0.93623364, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.33984375, + "step": 481, + "time_per_iteration": 2.6434848308563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130787, + "balance_loss_mlp": 1.28212094, + "epoch": 0.09272797229703732, + "flos": 1537222219776.0, + "grad_norm": 0.06384431456169105, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80060625, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.2578125, + "step": 482, + "time_per_iteration": 4.903714656829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120119, + "balance_loss_mlp": 1.08695483, + "epoch": 0.09292035398230089, + "flos": 689501624832.0, + "grad_norm": 0.0974321715773629, + "language_loss": 0.90389109, + "learning_rate": 0.000989657473741779, + "loss": 0.91509223, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.33178711, + "step": 483, + "time_per_iteration": 2.841749668121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132007, + "balance_loss_mlp": 1.09858036, + "epoch": 0.09311273566756445, + "flos": 509482414080.0, + "grad_norm": 0.07196755449742197, + "language_loss": 0.91361248, + "learning_rate": 0.0009895943406403465, + "loss": 0.9249326, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.33447266, + "step": 484, + "time_per_iteration": 2.728733539581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146209, + "balance_loss_mlp": 1.11137581, + "epoch": 0.09330511735282801, + "flos": 659111413248.0, + "grad_norm": 0.10097789553078372, + "language_loss": 0.84299308, + "learning_rate": 0.0009895310174615338, + "loss": 0.85445517, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.34863281, + "step": 485, + "time_per_iteration": 2.74460506439209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214233, + "balance_loss_mlp": 1.19239426, + "epoch": 0.09349749903809157, + "flos": 1452054809088.0, + "grad_norm": 0.04007792490845654, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76932752, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.21875, + "step": 486, + "time_per_iteration": 4.653090715408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135045, + "balance_loss_mlp": 1.10161829, + "epoch": 0.09368988072335514, + "flos": 520614508032.0, + "grad_norm": 0.07938978312310574, + "language_loss": 0.89514428, + "learning_rate": 0.0009894038009701782, + "loss": 0.90649474, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.33447266, + "step": 487, + "time_per_iteration": 2.6534616947174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145632, + "balance_loss_mlp": 1.1106087, + "epoch": 0.0938822624086187, + "flos": 497502107136.0, + "grad_norm": 0.09344776572677456, + "language_loss": 0.87733328, + "learning_rate": 0.0009893399077070253, + "loss": 0.88878953, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.35083008, + "step": 488, + "time_per_iteration": 2.5616586208343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130599, + "balance_loss_mlp": 1.09702933, + "epoch": 0.09407464409388226, + "flos": 532948405248.0, + "grad_norm": 0.08887912188605798, + "language_loss": 0.87485397, + "learning_rate": 0.0009892758244652718, + "loss": 0.8861599, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.3359375, + "step": 489, + "time_per_iteration": 2.6878652572631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114876, + "balance_loss_mlp": 1.08078194, + "epoch": 0.09426702577914582, + "flos": 585736634880.0, + "grad_norm": 0.08770205653150476, + "language_loss": 0.91117108, + "learning_rate": 0.0009892115512697968, + "loss": 0.92231989, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.34130859, + "step": 490, + "time_per_iteration": 2.67647123336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113824, + "balance_loss_mlp": 1.0808506, + "epoch": 0.0944594074644094, + "flos": 503081929728.0, + "grad_norm": 0.06826247830552083, + "language_loss": 0.94586283, + "learning_rate": 0.0009891470881455537, + "loss": 0.95700109, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.32983398, + "step": 491, + "time_per_iteration": 2.7388105392456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109252, + "balance_loss_mlp": 1.07627821, + "epoch": 0.09465178914967295, + "flos": 570748847616.0, + "grad_norm": 0.08083030362482532, + "language_loss": 0.90903842, + "learning_rate": 0.0009890824351175692, + "loss": 0.92013097, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.32983398, + "step": 492, + "time_per_iteration": 2.710557222366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108934, + "balance_loss_mlp": 1.07586551, + "epoch": 0.09484417083493651, + "flos": 549098707968.0, + "grad_norm": 0.07986708443523517, + "language_loss": 0.96040058, + "learning_rate": 0.0009890175922109435, + "loss": 0.97148991, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.33081055, + "step": 493, + "time_per_iteration": 2.748145341873169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119937, + "balance_loss_mlp": 1.08686852, + "epoch": 0.09503655252020007, + "flos": 823552109568.0, + "grad_norm": 0.1003982234968368, + "language_loss": 0.93827844, + "learning_rate": 0.0009889525594508513, + "loss": 0.94947779, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.33081055, + "step": 494, + "time_per_iteration": 2.9940547943115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113416, + "balance_loss_mlp": 1.08037138, + "epoch": 0.09522893420546363, + "flos": 404397757440.0, + "grad_norm": 0.06206488721584602, + "language_loss": 0.88783181, + "learning_rate": 0.0009888873368625404, + "loss": 0.89896601, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.33056641, + "step": 495, + "time_per_iteration": 2.5122387409210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129, + "balance_loss_mlp": 1.09557426, + "epoch": 0.0954213158907272, + "flos": 690707810304.0, + "grad_norm": 0.08099902604416225, + "language_loss": 0.9180485, + "learning_rate": 0.0009888219244713326, + "loss": 0.92933846, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.33447266, + "step": 496, + "time_per_iteration": 2.8516368865966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145932, + "balance_loss_mlp": 1.11152768, + "epoch": 0.09561369757599077, + "flos": 518739019776.0, + "grad_norm": 0.09295440988952328, + "language_loss": 0.91113585, + "learning_rate": 0.0009887563223026229, + "loss": 0.92259514, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.34423828, + "step": 497, + "time_per_iteration": 2.7165610790252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226975, + "balance_loss_mlp": 1.20780587, + "epoch": 0.09580607926125433, + "flos": 1384825849344.0, + "grad_norm": 0.04473280554485948, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80295134, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.19140625, + "step": 498, + "time_per_iteration": 4.9133830070495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158093, + "balance_loss_mlp": 1.12261629, + "epoch": 0.09599846094651789, + "flos": 717090969600.0, + "grad_norm": 0.0716278208231272, + "language_loss": 0.91129965, + "learning_rate": 0.0009886245487346482, + "loss": 0.92288053, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.35522461, + "step": 499, + "time_per_iteration": 3.074453353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151408, + "balance_loss_mlp": 1.1164794, + "epoch": 0.09619084263178146, + "flos": 385824909312.0, + "grad_norm": 0.09258819117654143, + "language_loss": 0.93041325, + "learning_rate": 0.0009885583773865422, + "loss": 0.94192737, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.34912109, + "step": 500, + "time_per_iteration": 2.4206974506378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129637, + "balance_loss_mlp": 1.09482849, + "epoch": 0.09638322431704502, + "flos": 533869401600.0, + "grad_norm": 0.08421486249996342, + "language_loss": 0.90840685, + "learning_rate": 0.0009884920163632524, + "loss": 0.9197033, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.34814453, + "step": 501, + "time_per_iteration": 2.653083324432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133899, + "balance_loss_mlp": 1.09875655, + "epoch": 0.09657560600230858, + "flos": 500426629632.0, + "grad_norm": 0.08831216016047307, + "language_loss": 0.92406952, + "learning_rate": 0.000988425465690543, + "loss": 0.93540847, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.35180664, + "step": 502, + "time_per_iteration": 2.5902318954467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129227, + "balance_loss_mlp": 1.09363079, + "epoch": 0.09676798768757214, + "flos": 528995197440.0, + "grad_norm": 0.08884204924947281, + "language_loss": 0.89819443, + "learning_rate": 0.0009883587253942505, + "loss": 0.90948665, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.35595703, + "step": 503, + "time_per_iteration": 2.7927231788635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134871, + "balance_loss_mlp": 1.09956098, + "epoch": 0.09696036937283571, + "flos": 463379857920.0, + "grad_norm": 0.08422879575374595, + "language_loss": 0.96091402, + "learning_rate": 0.0009882917955002862, + "loss": 0.97226262, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.35302734, + "step": 504, + "time_per_iteration": 2.538280963897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117737, + "balance_loss_mlp": 1.08297515, + "epoch": 0.09715275105809927, + "flos": 534716204544.0, + "grad_norm": 0.07639016770494517, + "language_loss": 0.89420688, + "learning_rate": 0.0009882246760346343, + "loss": 0.9053843, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.34790039, + "step": 505, + "time_per_iteration": 2.6242942810058594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124058, + "balance_loss_mlp": 1.08834267, + "epoch": 0.09734513274336283, + "flos": 454713979392.0, + "grad_norm": 0.11518068103281653, + "language_loss": 0.92468822, + "learning_rate": 0.0009881573670233533, + "loss": 0.93592882, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.35742188, + "step": 506, + "time_per_iteration": 2.516587734222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114247, + "balance_loss_mlp": 1.08074903, + "epoch": 0.09753751442862639, + "flos": 508551243264.0, + "grad_norm": 0.07574597822432369, + "language_loss": 0.8811729, + "learning_rate": 0.0009880898684925747, + "loss": 0.89231527, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.33520508, + "step": 507, + "time_per_iteration": 2.693880081176758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107378, + "balance_loss_mlp": 1.07402313, + "epoch": 0.09772989611388996, + "flos": 484030425600.0, + "grad_norm": 0.07603441014422499, + "language_loss": 0.86951101, + "learning_rate": 0.0009880221804685037, + "loss": 0.88058472, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.33374023, + "step": 508, + "time_per_iteration": 2.5847270488739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01468428, + "balance_loss_mlp": 1.44983101, + "epoch": 0.09792227779915352, + "flos": 1565306339328.0, + "grad_norm": 0.12348847609036423, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80812848, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.18554688, + "step": 509, + "time_per_iteration": 4.756307601928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123963, + "balance_loss_mlp": 1.09103727, + "epoch": 0.09811465948441708, + "flos": 587529165312.0, + "grad_norm": 0.08757433726580034, + "language_loss": 0.93106389, + "learning_rate": 0.0009878862360456733, + "loss": 0.9423036, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.32910156, + "step": 510, + "time_per_iteration": 2.6813509464263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110714, + "balance_loss_mlp": 1.07759809, + "epoch": 0.09830704116968064, + "flos": 612719285760.0, + "grad_norm": 0.08240718915912659, + "language_loss": 0.86918676, + "learning_rate": 0.0009878179796996922, + "loss": 0.88029397, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.33129883, + "step": 511, + "time_per_iteration": 2.7128310203552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113196, + "balance_loss_mlp": 1.08112836, + "epoch": 0.09849942285494422, + "flos": 538528227840.0, + "grad_norm": 0.07802243599022093, + "language_loss": 0.90101254, + "learning_rate": 0.0009877495339659754, + "loss": 0.91214454, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.32055664, + "step": 512, + "time_per_iteration": 2.8097684383392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102205, + "balance_loss_mlp": 1.07035255, + "epoch": 0.09869180454020778, + "flos": 620193535488.0, + "grad_norm": 0.09144065810451378, + "language_loss": 0.850245, + "learning_rate": 0.000987680898871096, + "loss": 0.86126709, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.31835938, + "step": 513, + "time_per_iteration": 2.7469441890716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108259, + "balance_loss_mlp": 1.07502341, + "epoch": 0.09888418622547133, + "flos": 811375363584.0, + "grad_norm": 0.10540688433367246, + "language_loss": 0.85520494, + "learning_rate": 0.0009876120744417, + "loss": 0.86628759, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.33251953, + "step": 514, + "time_per_iteration": 2.9515652656555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101037, + "balance_loss_mlp": 1.06818295, + "epoch": 0.0990765679107349, + "flos": 535548450816.0, + "grad_norm": 0.09508855922632749, + "language_loss": 0.93521011, + "learning_rate": 0.0009875430607045078, + "loss": 0.94622052, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.32861328, + "step": 515, + "time_per_iteration": 2.7193381786346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109471, + "balance_loss_mlp": 1.06164145, + "epoch": 0.09926894959599845, + "flos": 587607740928.0, + "grad_norm": 0.07449645219133615, + "language_loss": 0.90591514, + "learning_rate": 0.000987473857686313, + "loss": 0.91686225, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.33081055, + "step": 516, + "time_per_iteration": 2.7179975509643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115382, + "balance_loss_mlp": 1.08100188, + "epoch": 0.09946133128126203, + "flos": 640947409920.0, + "grad_norm": 0.10856360121839106, + "language_loss": 0.92182052, + "learning_rate": 0.0009874044654139824, + "loss": 0.9329744, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.34423828, + "step": 517, + "time_per_iteration": 2.7596991062164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135092, + "balance_loss_mlp": 1.10104585, + "epoch": 0.09965371296652559, + "flos": 465546327552.0, + "grad_norm": 0.10414801938878855, + "language_loss": 0.9130857, + "learning_rate": 0.0009873348839144563, + "loss": 0.92443669, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.34082031, + "step": 518, + "time_per_iteration": 2.5228705406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117267, + "balance_loss_mlp": 1.1381228, + "epoch": 0.09984609465178915, + "flos": 483365505024.0, + "grad_norm": 0.09626367264756285, + "language_loss": 0.94683075, + "learning_rate": 0.000987265113214749, + "loss": 0.95855749, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.34545898, + "step": 519, + "time_per_iteration": 2.5458812713623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118966, + "balance_loss_mlp": 1.15339625, + "epoch": 0.1000384763370527, + "flos": 568764260352.0, + "grad_norm": 0.12320854939875277, + "language_loss": 0.94298297, + "learning_rate": 0.0009871951533419476, + "loss": 0.95487958, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.36279297, + "step": 520, + "time_per_iteration": 2.663461208343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156318, + "balance_loss_mlp": 1.12010193, + "epoch": 0.10023085802231628, + "flos": 545515057152.0, + "grad_norm": 0.08720896475780489, + "language_loss": 0.86881042, + "learning_rate": 0.0009871250043232132, + "loss": 0.8803736, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.36206055, + "step": 521, + "time_per_iteration": 2.7820796966552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140376, + "balance_loss_mlp": 1.1049943, + "epoch": 0.10042323970757984, + "flos": 503208557568.0, + "grad_norm": 0.08876661910472074, + "language_loss": 0.85204661, + "learning_rate": 0.0009870546661857797, + "loss": 0.86345041, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.35375977, + "step": 522, + "time_per_iteration": 2.634274482727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152268, + "balance_loss_mlp": 1.11583781, + "epoch": 0.1006156213928434, + "flos": 770084402688.0, + "grad_norm": 0.08623162465623763, + "language_loss": 0.92886114, + "learning_rate": 0.0009869841389569553, + "loss": 0.94038385, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.36401367, + "step": 523, + "time_per_iteration": 3.0027353763580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151068, + "balance_loss_mlp": 1.11571026, + "epoch": 0.10080800307810696, + "flos": 489786338304.0, + "grad_norm": 0.07820731611640971, + "language_loss": 0.86882633, + "learning_rate": 0.0009869134226641206, + "loss": 0.880337, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.35424805, + "step": 524, + "time_per_iteration": 2.5850446224212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159546, + "balance_loss_mlp": 1.12330627, + "epoch": 0.10100038476337053, + "flos": 454478252544.0, + "grad_norm": 0.07931950894681525, + "language_loss": 0.86448371, + "learning_rate": 0.0009868425173347303, + "loss": 0.8760792, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.36254883, + "step": 525, + "time_per_iteration": 2.6873726844787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171885, + "balance_loss_mlp": 1.13617015, + "epoch": 0.10119276644863409, + "flos": 556155348480.0, + "grad_norm": 0.09671662269899156, + "language_loss": 0.94872439, + "learning_rate": 0.0009867714229963125, + "loss": 0.96044326, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.35717773, + "step": 526, + "time_per_iteration": 2.697547197341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155786, + "balance_loss_mlp": 1.12083411, + "epoch": 0.10138514813389765, + "flos": 515990587392.0, + "grad_norm": 0.10324452979849556, + "language_loss": 0.9236598, + "learning_rate": 0.000986700139676468, + "loss": 0.93521762, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.34960938, + "step": 527, + "time_per_iteration": 2.5702626705169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171342, + "balance_loss_mlp": 1.1346494, + "epoch": 0.10157752981916121, + "flos": 500323322880.0, + "grad_norm": 0.08227699709590157, + "language_loss": 0.89510548, + "learning_rate": 0.0009866286674028717, + "loss": 0.90681893, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.36694336, + "step": 528, + "time_per_iteration": 2.699542284011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141961, + "balance_loss_mlp": 1.1081537, + "epoch": 0.10176991150442478, + "flos": 656444376576.0, + "grad_norm": 0.0843490367773928, + "language_loss": 0.8638742, + "learning_rate": 0.0009865570062032717, + "loss": 0.87529385, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.33837891, + "step": 529, + "time_per_iteration": 2.941728353500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114882, + "balance_loss_mlp": 1.11420166, + "epoch": 0.10196229318968834, + "flos": 572974953984.0, + "grad_norm": 0.07671472850746988, + "language_loss": 0.9148134, + "learning_rate": 0.0009864851561054893, + "loss": 0.9263016, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.34643555, + "step": 530, + "time_per_iteration": 2.7894959449768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147452, + "balance_loss_mlp": 1.1134541, + "epoch": 0.1021546748749519, + "flos": 517946061312.0, + "grad_norm": 0.08702044825545475, + "language_loss": 0.90471494, + "learning_rate": 0.0009864131171374191, + "loss": 0.91618943, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.34033203, + "step": 531, + "time_per_iteration": 2.6681158542633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144724, + "balance_loss_mlp": 1.11139297, + "epoch": 0.10234705656021546, + "flos": 609470286336.0, + "grad_norm": 0.0664826941787488, + "language_loss": 0.89538574, + "learning_rate": 0.0009863408893270292, + "loss": 0.90683293, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.33349609, + "step": 532, + "time_per_iteration": 2.7965428829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129812, + "balance_loss_mlp": 1.09576535, + "epoch": 0.10253943824547904, + "flos": 601473710592.0, + "grad_norm": 0.08878024025613328, + "language_loss": 0.84706688, + "learning_rate": 0.0009862684727023605, + "loss": 0.858365, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.34082031, + "step": 533, + "time_per_iteration": 2.7238268852233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116987, + "balance_loss_mlp": 1.08453798, + "epoch": 0.1027318199307426, + "flos": 662647011840.0, + "grad_norm": 0.1682383439962665, + "language_loss": 0.87668955, + "learning_rate": 0.0009861958672915283, + "loss": 0.8878594, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.32446289, + "step": 534, + "time_per_iteration": 2.7945988178253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096267, + "balance_loss_mlp": 1.06415248, + "epoch": 0.10292420161600616, + "flos": 682962928128.0, + "grad_norm": 0.0654465541126679, + "language_loss": 0.88598454, + "learning_rate": 0.0009861230731227201, + "loss": 0.89694726, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.32104492, + "step": 535, + "time_per_iteration": 2.8504462242126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094849, + "balance_loss_mlp": 1.06180418, + "epoch": 0.10311658330126972, + "flos": 490042414080.0, + "grad_norm": 0.09703481929017231, + "language_loss": 0.90092826, + "learning_rate": 0.0009860500902241973, + "loss": 0.91187674, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.33056641, + "step": 536, + "time_per_iteration": 2.6230618953704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093921, + "balance_loss_mlp": 1.06028032, + "epoch": 0.10330896498653329, + "flos": 431508446208.0, + "grad_norm": 0.07541190921269121, + "language_loss": 0.94890571, + "learning_rate": 0.0009859769186242942, + "loss": 0.95984495, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.33642578, + "step": 537, + "time_per_iteration": 2.5023155212402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090493, + "balance_loss_mlp": 1.05802083, + "epoch": 0.10350134667179685, + "flos": 549330052608.0, + "grad_norm": 0.08038513642950565, + "language_loss": 0.87629044, + "learning_rate": 0.0009859035583514187, + "loss": 0.88719535, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.32470703, + "step": 538, + "time_per_iteration": 2.617408514022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102021, + "balance_loss_mlp": 1.06885695, + "epoch": 0.10369372835706041, + "flos": 640327569408.0, + "grad_norm": 0.08463096218018039, + "language_loss": 0.88947332, + "learning_rate": 0.0009858300094340517, + "loss": 0.9004935, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.33178711, + "step": 539, + "time_per_iteration": 2.7788918018341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102321, + "balance_loss_mlp": 1.06989646, + "epoch": 0.10388611004232397, + "flos": 521500598784.0, + "grad_norm": 0.08363201697238119, + "language_loss": 0.84166092, + "learning_rate": 0.0009857562719007473, + "loss": 0.85268414, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.32421875, + "step": 540, + "time_per_iteration": 2.6021273136138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106344, + "balance_loss_mlp": 1.07349014, + "epoch": 0.10407849172758753, + "flos": 702111946752.0, + "grad_norm": 0.07699058030721453, + "language_loss": 0.86313522, + "learning_rate": 0.0009856823457801331, + "loss": 0.87419868, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.32861328, + "step": 541, + "time_per_iteration": 2.898247003555298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121037, + "balance_loss_mlp": 1.0881114, + "epoch": 0.1042708734128511, + "flos": 502652736000.0, + "grad_norm": 0.09427475874312204, + "language_loss": 0.92884254, + "learning_rate": 0.00098560823110091, + "loss": 0.94005299, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.3293457, + "step": 542, + "time_per_iteration": 2.628246784210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117484, + "balance_loss_mlp": 1.08441556, + "epoch": 0.10446325509811466, + "flos": 485331153408.0, + "grad_norm": 0.09038961872332987, + "language_loss": 0.93836176, + "learning_rate": 0.000985533927891851, + "loss": 0.94953668, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.33081055, + "step": 543, + "time_per_iteration": 2.6802377700805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104521, + "balance_loss_mlp": 1.07114232, + "epoch": 0.10465563678337822, + "flos": 568365590016.0, + "grad_norm": 0.07979198382497373, + "language_loss": 0.91847962, + "learning_rate": 0.0009854594361818044, + "loss": 0.9295249, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.33398438, + "step": 544, + "time_per_iteration": 2.6934244632720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097218, + "balance_loss_mlp": 1.06372046, + "epoch": 0.10484801846864178, + "flos": 625806853632.0, + "grad_norm": 0.070981397623147, + "language_loss": 0.91175914, + "learning_rate": 0.0009853847559996897, + "loss": 0.92273128, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.33520508, + "step": 545, + "time_per_iteration": 2.7615010738372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121637, + "balance_loss_mlp": 1.08713746, + "epoch": 0.10504040015390535, + "flos": 743063874048.0, + "grad_norm": 0.07225830349373973, + "language_loss": 0.90024251, + "learning_rate": 0.0009853098873745, + "loss": 0.91145885, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.34545898, + "step": 546, + "time_per_iteration": 2.995853900909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128427, + "balance_loss_mlp": 1.09407067, + "epoch": 0.10523278183916891, + "flos": 586382616576.0, + "grad_norm": 0.08430865527250554, + "language_loss": 0.89361405, + "learning_rate": 0.0009852348303353027, + "loss": 0.90489835, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.34399414, + "step": 547, + "time_per_iteration": 2.7888100147247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141582, + "balance_loss_mlp": 1.106511, + "epoch": 0.10542516352443247, + "flos": 869270552064.0, + "grad_norm": 0.07123259169118071, + "language_loss": 0.82929194, + "learning_rate": 0.000985159584911237, + "loss": 0.84070778, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.35107422, + "step": 548, + "time_per_iteration": 3.11181902885437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141867, + "balance_loss_mlp": 1.10658062, + "epoch": 0.10561754520969603, + "flos": 505182970368.0, + "grad_norm": 0.1040806422735416, + "language_loss": 0.89825702, + "learning_rate": 0.0009850841511315162, + "loss": 0.90967572, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.35327148, + "step": 549, + "time_per_iteration": 2.638000726699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130362, + "balance_loss_mlp": 1.09493339, + "epoch": 0.1058099268949596, + "flos": 559690947072.0, + "grad_norm": 0.07056487851665215, + "language_loss": 0.9078036, + "learning_rate": 0.0009850085290254256, + "loss": 0.9191072, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.35424805, + "step": 550, + "time_per_iteration": 2.774028778076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117666, + "balance_loss_mlp": 1.08273757, + "epoch": 0.10600230858022316, + "flos": 561773048832.0, + "grad_norm": 0.06745406591759516, + "language_loss": 0.87385082, + "learning_rate": 0.0009849327186223246, + "loss": 0.88502753, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.34936523, + "step": 551, + "time_per_iteration": 2.7669272422790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110145, + "balance_loss_mlp": 1.06845236, + "epoch": 0.10619469026548672, + "flos": 494079989760.0, + "grad_norm": 0.0691737715515626, + "language_loss": 0.94504517, + "learning_rate": 0.000984856719951646, + "loss": 0.95605963, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.33007812, + "step": 552, + "time_per_iteration": 2.5428550243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111854, + "balance_loss_mlp": 1.07747412, + "epoch": 0.10638707195075028, + "flos": 675843678720.0, + "grad_norm": 0.09712099675981889, + "language_loss": 0.91101605, + "learning_rate": 0.0009847805330428943, + "loss": 0.92213452, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.34399414, + "step": 553, + "time_per_iteration": 2.9055614471435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122894, + "balance_loss_mlp": 1.08846664, + "epoch": 0.10657945363601386, + "flos": 487811925504.0, + "grad_norm": 0.09294887941398464, + "language_loss": 0.92195344, + "learning_rate": 0.0009847041579256481, + "loss": 0.93318236, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.34448242, + "step": 554, + "time_per_iteration": 2.5995588302612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124167, + "balance_loss_mlp": 1.08859539, + "epoch": 0.10677183532127742, + "flos": 482706376704.0, + "grad_norm": 0.08058010800108027, + "language_loss": 0.94049567, + "learning_rate": 0.0009846275946295592, + "loss": 0.9517374, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.35595703, + "step": 555, + "time_per_iteration": 2.6564345359802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114817, + "balance_loss_mlp": 1.07919669, + "epoch": 0.10696421700654098, + "flos": 655917668352.0, + "grad_norm": 0.06398894491712905, + "language_loss": 0.86843902, + "learning_rate": 0.0009845508431843518, + "loss": 0.87958717, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.35620117, + "step": 556, + "time_per_iteration": 3.0014877319335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112252, + "balance_loss_mlp": 1.07675159, + "epoch": 0.10715659869180454, + "flos": 567483881472.0, + "grad_norm": 0.06905237280169106, + "language_loss": 0.87712479, + "learning_rate": 0.0009844739036198233, + "loss": 0.88824731, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.35522461, + "step": 557, + "time_per_iteration": 2.6663765907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126883, + "balance_loss_mlp": 1.09026217, + "epoch": 0.10734898037706811, + "flos": 540432829440.0, + "grad_norm": 0.08117667522677224, + "language_loss": 0.94649851, + "learning_rate": 0.0009843967759658448, + "loss": 0.95776731, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.36621094, + "step": 558, + "time_per_iteration": 2.6776351928710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01325803, + "balance_loss_mlp": 1.29795551, + "epoch": 0.10754136206233167, + "flos": 1475870008320.0, + "grad_norm": 0.07702272040631068, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74093556, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.27929688, + "step": 559, + "time_per_iteration": 4.862372398376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112585, + "balance_loss_mlp": 1.08906162, + "epoch": 0.10773374374759523, + "flos": 512155243008.0, + "grad_norm": 0.07411063690195181, + "language_loss": 0.94592023, + "learning_rate": 0.000984241956509384, + "loss": 0.95717871, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.36767578, + "step": 560, + "time_per_iteration": 2.6602537631988525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152944, + "balance_loss_mlp": 1.11455846, + "epoch": 0.10792612543285879, + "flos": 496261016064.0, + "grad_norm": 0.08630165838839422, + "language_loss": 0.89956963, + "learning_rate": 0.0009841642647670078, + "loss": 0.91109908, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.38378906, + "step": 561, + "time_per_iteration": 2.5539767742156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153249, + "balance_loss_mlp": 1.11433935, + "epoch": 0.10811850711812235, + "flos": 735131317248.0, + "grad_norm": 0.09499730641116207, + "language_loss": 0.84606594, + "learning_rate": 0.0009840863850553944, + "loss": 0.85759842, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.38867188, + "step": 562, + "time_per_iteration": 2.972862720489502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139333, + "balance_loss_mlp": 1.10261655, + "epoch": 0.10831088880338592, + "flos": 611257024512.0, + "grad_norm": 0.08740431235801023, + "language_loss": 0.90812922, + "learning_rate": 0.0009840083174047782, + "loss": 0.91952258, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.3671875, + "step": 563, + "time_per_iteration": 2.728081464767456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133161, + "balance_loss_mlp": 1.09739876, + "epoch": 0.10850327048864948, + "flos": 556022928384.0, + "grad_norm": 0.09202985623691126, + "language_loss": 0.85552108, + "learning_rate": 0.0009839300618454685, + "loss": 0.8668527, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.35791016, + "step": 564, + "time_per_iteration": 2.833817958831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130452, + "balance_loss_mlp": 1.09538078, + "epoch": 0.10869565217391304, + "flos": 602902476288.0, + "grad_norm": 0.06834466327041812, + "language_loss": 0.90596354, + "learning_rate": 0.0009838516184078466, + "loss": 0.91726804, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.35131836, + "step": 565, + "time_per_iteration": 2.8160781860351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154605, + "balance_loss_mlp": 1.1185081, + "epoch": 0.1088880338591766, + "flos": 525922288128.0, + "grad_norm": 0.07188227567019471, + "language_loss": 0.87634718, + "learning_rate": 0.0009837729871223669, + "loss": 0.88789332, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.36083984, + "step": 566, + "time_per_iteration": 2.62117600440979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177195, + "balance_loss_mlp": 1.1406219, + "epoch": 0.10908041554444017, + "flos": 619986921984.0, + "grad_norm": 0.08533641778088655, + "language_loss": 0.88115579, + "learning_rate": 0.0009836941680195568, + "loss": 0.89292771, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.36547852, + "step": 567, + "time_per_iteration": 2.828911542892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165998, + "balance_loss_mlp": 1.12994933, + "epoch": 0.10927279722970373, + "flos": 897740195328.0, + "grad_norm": 0.08003102464580239, + "language_loss": 0.83622086, + "learning_rate": 0.0009836151611300166, + "loss": 0.84788084, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.3605957, + "step": 568, + "time_per_iteration": 3.2273471355438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114699, + "balance_loss_mlp": 1.11177564, + "epoch": 0.10946517891496729, + "flos": 528408852480.0, + "grad_norm": 0.13762061821089808, + "language_loss": 0.94344527, + "learning_rate": 0.0009835359664844194, + "loss": 0.95491517, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.35253906, + "step": 569, + "time_per_iteration": 2.61690616607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424326, + "balance_loss_mlp": 1.39514339, + "epoch": 0.10965756060023085, + "flos": 1559944714752.0, + "grad_norm": 0.09677893451051751, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82461131, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.29101562, + "step": 570, + "time_per_iteration": 4.929012298583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129446, + "balance_loss_mlp": 1.09449339, + "epoch": 0.10984994228549443, + "flos": 512820163584.0, + "grad_norm": 0.10645850756285262, + "language_loss": 0.9142105, + "learning_rate": 0.0009833770140481118, + "loss": 0.92550498, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.34985352, + "step": 571, + "time_per_iteration": 2.6662757396698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122373, + "balance_loss_mlp": 1.08689654, + "epoch": 0.11004232397075799, + "flos": 954314307072.0, + "grad_norm": 0.12031633973381815, + "language_loss": 0.82440388, + "learning_rate": 0.000983297256319112, + "loss": 0.83562756, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.35522461, + "step": 572, + "time_per_iteration": 3.218076467514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134932, + "balance_loss_mlp": 1.09850204, + "epoch": 0.11023470565602154, + "flos": 487921024512.0, + "grad_norm": 0.08427819288291502, + "language_loss": 0.86899912, + "learning_rate": 0.000983217310957477, + "loss": 0.88034844, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.36425781, + "step": 573, + "time_per_iteration": 2.778571128845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144342, + "balance_loss_mlp": 1.10803151, + "epoch": 0.1104270873412851, + "flos": 655521970176.0, + "grad_norm": 0.06509507329480971, + "language_loss": 0.90168923, + "learning_rate": 0.000983137177994244, + "loss": 0.91313267, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.36352539, + "step": 574, + "time_per_iteration": 2.872412919998169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137496, + "balance_loss_mlp": 1.10221016, + "epoch": 0.11061946902654868, + "flos": 723097165824.0, + "grad_norm": 0.06653120926816534, + "language_loss": 0.85785711, + "learning_rate": 0.0009830568574605235, + "loss": 0.86923206, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.35302734, + "step": 575, + "time_per_iteration": 2.923383951187134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145913, + "balance_loss_mlp": 1.10984039, + "epoch": 0.11081185071181224, + "flos": 835113397248.0, + "grad_norm": 0.0865486301410286, + "language_loss": 0.87525302, + "learning_rate": 0.0009829763493874992, + "loss": 0.88671219, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.36083984, + "step": 576, + "time_per_iteration": 3.032942056655884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134136, + "balance_loss_mlp": 1.09753847, + "epoch": 0.1110042323970758, + "flos": 608776252416.0, + "grad_norm": 0.08630194081372794, + "language_loss": 0.93183506, + "learning_rate": 0.0009828956538064264, + "loss": 0.94317639, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.36621094, + "step": 577, + "time_per_iteration": 2.8152406215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125176, + "balance_loss_mlp": 1.0888648, + "epoch": 0.11119661408233936, + "flos": 595643604480.0, + "grad_norm": 0.07101537919866721, + "language_loss": 0.90824157, + "learning_rate": 0.0009828147707486344, + "loss": 0.91949332, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.36328125, + "step": 578, + "time_per_iteration": 2.724550485610962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118046, + "balance_loss_mlp": 1.08209252, + "epoch": 0.11138899576760293, + "flos": 555573385728.0, + "grad_norm": 0.08130034202286071, + "language_loss": 0.86348194, + "learning_rate": 0.0009827337002455245, + "loss": 0.8746624, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.35961914, + "step": 579, + "time_per_iteration": 2.652369976043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111199, + "balance_loss_mlp": 1.07579851, + "epoch": 0.11158137745286649, + "flos": 689418667008.0, + "grad_norm": 0.06366605788409145, + "language_loss": 0.88115346, + "learning_rate": 0.0009826524423285712, + "loss": 0.89227337, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.36181641, + "step": 580, + "time_per_iteration": 2.947925567626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108192, + "balance_loss_mlp": 1.07192874, + "epoch": 0.11177375913813005, + "flos": 762688728576.0, + "grad_norm": 0.08930617061108917, + "language_loss": 0.88938302, + "learning_rate": 0.0009825709970293218, + "loss": 0.90046495, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.36303711, + "step": 581, + "time_per_iteration": 2.8744056224823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103655, + "balance_loss_mlp": 1.06731987, + "epoch": 0.11196614082339361, + "flos": 806211588096.0, + "grad_norm": 0.07222891797599594, + "language_loss": 0.95056951, + "learning_rate": 0.0009824893643793956, + "loss": 0.96160614, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.36328125, + "step": 582, + "time_per_iteration": 3.051945209503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104834, + "balance_loss_mlp": 1.06811786, + "epoch": 0.11215852250865718, + "flos": 558350931456.0, + "grad_norm": 0.0803498647914251, + "language_loss": 0.88078201, + "learning_rate": 0.0009824075444104857, + "loss": 0.89183033, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.3671875, + "step": 583, + "time_per_iteration": 2.6833813190460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111764, + "balance_loss_mlp": 1.07507193, + "epoch": 0.11235090419392074, + "flos": 513322140672.0, + "grad_norm": 0.08148632832875594, + "language_loss": 0.93207705, + "learning_rate": 0.000982325537154357, + "loss": 0.94319463, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.36694336, + "step": 584, + "time_per_iteration": 2.582225799560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113364, + "balance_loss_mlp": 1.07574129, + "epoch": 0.1125432858791843, + "flos": 491209311744.0, + "grad_norm": 0.08313203670373176, + "language_loss": 0.93823397, + "learning_rate": 0.0009822433426428484, + "loss": 0.94936764, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.37597656, + "step": 585, + "time_per_iteration": 2.568070888519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113305, + "balance_loss_mlp": 1.07594514, + "epoch": 0.11273566756444786, + "flos": 510476193792.0, + "grad_norm": 0.07694998173228458, + "language_loss": 0.86627567, + "learning_rate": 0.0009821609609078697, + "loss": 0.87740874, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.37304688, + "step": 586, + "time_per_iteration": 2.658702850341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103349, + "balance_loss_mlp": 1.06775331, + "epoch": 0.11292804924971142, + "flos": 622149009408.0, + "grad_norm": 0.10421690738013599, + "language_loss": 0.89634144, + "learning_rate": 0.0009820783919814045, + "loss": 0.90737498, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.35620117, + "step": 587, + "time_per_iteration": 2.803866386413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109643, + "balance_loss_mlp": 1.07295036, + "epoch": 0.113120430934975, + "flos": 477811823616.0, + "grad_norm": 0.07979925286699333, + "language_loss": 0.82699567, + "learning_rate": 0.0009819956358955095, + "loss": 0.83809209, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.36669922, + "step": 588, + "time_per_iteration": 2.5929653644561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110362, + "balance_loss_mlp": 1.07433677, + "epoch": 0.11331281262023855, + "flos": 466801975296.0, + "grad_norm": 0.07216149622243874, + "language_loss": 0.83354205, + "learning_rate": 0.0009819126926823127, + "loss": 0.84464574, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.36035156, + "step": 589, + "time_per_iteration": 2.5229685306549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122798, + "balance_loss_mlp": 1.08658195, + "epoch": 0.11350519430550211, + "flos": 650164727808.0, + "grad_norm": 0.08255396626581768, + "language_loss": 0.86631322, + "learning_rate": 0.000981829562374016, + "loss": 0.87754118, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.36279297, + "step": 590, + "time_per_iteration": 2.7939858436584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124668, + "balance_loss_mlp": 1.08804727, + "epoch": 0.11369757599076567, + "flos": 557547798528.0, + "grad_norm": 0.07763031144810686, + "language_loss": 0.97565413, + "learning_rate": 0.0009817462450028933, + "loss": 0.98690081, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.3659668, + "step": 591, + "time_per_iteration": 2.651886224746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115476, + "balance_loss_mlp": 1.07918823, + "epoch": 0.11388995767602925, + "flos": 570774988800.0, + "grad_norm": 0.0679599519530346, + "language_loss": 0.85396111, + "learning_rate": 0.0009816627406012916, + "loss": 0.86511576, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.36303711, + "step": 592, + "time_per_iteration": 2.8203041553497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117009, + "balance_loss_mlp": 1.08079314, + "epoch": 0.1140823393612928, + "flos": 740069540352.0, + "grad_norm": 0.07941270182617734, + "language_loss": 0.84330916, + "learning_rate": 0.0009815790492016295, + "loss": 0.85447925, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.36254883, + "step": 593, + "time_per_iteration": 2.952115058898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111293, + "balance_loss_mlp": 1.07529223, + "epoch": 0.11427472104655637, + "flos": 698694211584.0, + "grad_norm": 0.08575724683449225, + "language_loss": 0.86948562, + "learning_rate": 0.0009814951708363993, + "loss": 0.88059855, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.35986328, + "step": 594, + "time_per_iteration": 2.851818084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259601, + "balance_loss_mlp": 1.23633182, + "epoch": 0.11446710273181993, + "flos": 1476387952128.0, + "grad_norm": 0.04120161092279284, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79250586, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.23242188, + "step": 595, + "time_per_iteration": 4.775157928466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107308, + "balance_loss_mlp": 1.07159305, + "epoch": 0.1146594844170835, + "flos": 494641603584.0, + "grad_norm": 0.06441778711855077, + "language_loss": 0.87857854, + "learning_rate": 0.0009813268533395648, + "loss": 0.8896516, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.35717773, + "step": 596, + "time_per_iteration": 2.5812032222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117343, + "balance_loss_mlp": 1.08096087, + "epoch": 0.11485186610234706, + "flos": 474596319744.0, + "grad_norm": 0.07680000680618568, + "language_loss": 0.87010378, + "learning_rate": 0.0009812424142733073, + "loss": 0.8812772, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.36401367, + "step": 597, + "time_per_iteration": 2.5546822547912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108076, + "balance_loss_mlp": 1.07212269, + "epoch": 0.11504424778761062, + "flos": 730858014720.0, + "grad_norm": 0.05681390422854521, + "language_loss": 0.8607024, + "learning_rate": 0.000981157788372175, + "loss": 0.87178314, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.35961914, + "step": 598, + "time_per_iteration": 3.0337140560150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111428, + "balance_loss_mlp": 1.07851696, + "epoch": 0.11523662947287418, + "flos": 545539788288.0, + "grad_norm": 0.06941688855783729, + "language_loss": 0.89018178, + "learning_rate": 0.0009810729756690223, + "loss": 0.90132457, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.35791016, + "step": 599, + "time_per_iteration": 2.7217423915863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105745, + "balance_loss_mlp": 1.06981504, + "epoch": 0.11542901115813775, + "flos": 774737436672.0, + "grad_norm": 0.06146114558588388, + "language_loss": 0.91738331, + "learning_rate": 0.0009809879761967766, + "loss": 0.92844075, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.35961914, + "step": 600, + "time_per_iteration": 2.9604732990264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111848, + "balance_loss_mlp": 1.08178735, + "epoch": 0.11562139284340131, + "flos": 730585972224.0, + "grad_norm": 0.09570347165582511, + "language_loss": 0.86368775, + "learning_rate": 0.0009809027899884378, + "loss": 0.87487245, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.36669922, + "step": 601, + "time_per_iteration": 2.9237759113311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114897, + "balance_loss_mlp": 1.07787061, + "epoch": 0.11581377452866487, + "flos": 535589148672.0, + "grad_norm": 0.05752007897304988, + "language_loss": 0.88791043, + "learning_rate": 0.0009808174170770779, + "loss": 0.89905941, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.37036133, + "step": 602, + "time_per_iteration": 2.8171939849853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192093, + "balance_loss_mlp": 1.1680603, + "epoch": 0.11600615621392843, + "flos": 1554968613888.0, + "grad_norm": 0.017614530082332158, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86090338, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.24023438, + "step": 603, + "time_per_iteration": 4.935450077056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109032, + "balance_loss_mlp": 1.07360268, + "epoch": 0.116198537899192, + "flos": 537178037760.0, + "grad_norm": 0.08737735767926022, + "language_loss": 0.93595141, + "learning_rate": 0.0009806461112779462, + "loss": 0.94704169, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.35449219, + "step": 604, + "time_per_iteration": 2.644521951675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110331, + "balance_loss_mlp": 1.07454431, + "epoch": 0.11639091958445556, + "flos": 453970483200.0, + "grad_norm": 0.09922875403821595, + "language_loss": 0.8811909, + "learning_rate": 0.0009805601784566814, + "loss": 0.89229423, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.3581543, + "step": 605, + "time_per_iteration": 2.4736506938934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107193, + "balance_loss_mlp": 1.07209802, + "epoch": 0.11658330126971912, + "flos": 554815332864.0, + "grad_norm": 0.08013857685507157, + "language_loss": 0.95075512, + "learning_rate": 0.0009804740590654089, + "loss": 0.9618271, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.35131836, + "step": 606, + "time_per_iteration": 2.665424346923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121356, + "balance_loss_mlp": 1.08540201, + "epoch": 0.11677568295498268, + "flos": 716025968640.0, + "grad_norm": 0.09308217257663119, + "language_loss": 0.89792109, + "learning_rate": 0.0009803877531375635, + "loss": 0.90913463, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.359375, + "step": 607, + "time_per_iteration": 2.854362964630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123257, + "balance_loss_mlp": 1.08725595, + "epoch": 0.11696806464024626, + "flos": 609474668544.0, + "grad_norm": 0.12019278373574431, + "language_loss": 0.90837669, + "learning_rate": 0.0009803012607066523, + "loss": 0.91960925, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.36035156, + "step": 608, + "time_per_iteration": 2.7351131439208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132428, + "balance_loss_mlp": 1.0963558, + "epoch": 0.11716044632550981, + "flos": 520127087616.0, + "grad_norm": 0.06325710240785508, + "language_loss": 0.89651906, + "learning_rate": 0.0009802145818062543, + "loss": 0.90784335, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.36083984, + "step": 609, + "time_per_iteration": 2.706399440765381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126801, + "balance_loss_mlp": 1.09060943, + "epoch": 0.11735282801077337, + "flos": 507246133248.0, + "grad_norm": 0.08665503616765245, + "language_loss": 0.91646838, + "learning_rate": 0.0009801277164700212, + "loss": 0.9277364, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.36230469, + "step": 610, + "time_per_iteration": 2.591233730316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116515, + "balance_loss_mlp": 1.08137226, + "epoch": 0.11754520969603693, + "flos": 686339965440.0, + "grad_norm": 0.07536960859650275, + "language_loss": 0.8969053, + "learning_rate": 0.0009800406647316776, + "loss": 0.90807045, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.35180664, + "step": 611, + "time_per_iteration": 2.8590939044952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199931, + "balance_loss_mlp": 1.17360973, + "epoch": 0.1177375913813005, + "flos": 1541673022464.0, + "grad_norm": 0.02828241364524735, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.7811439, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.26367188, + "step": 612, + "time_per_iteration": 4.794836759567261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126093, + "balance_loss_mlp": 1.08999705, + "epoch": 0.11792997306656407, + "flos": 520269682176.0, + "grad_norm": 0.07086643363198573, + "language_loss": 0.88838685, + "learning_rate": 0.000979866002183916, + "loss": 0.89964771, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.36132812, + "step": 613, + "time_per_iteration": 2.6570141315460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113543, + "balance_loss_mlp": 1.07711244, + "epoch": 0.11812235475182763, + "flos": 665980379136.0, + "grad_norm": 0.0718552990374983, + "language_loss": 0.89756042, + "learning_rate": 0.0009797783914423082, + "loss": 0.90869588, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.36425781, + "step": 614, + "time_per_iteration": 2.8077588081359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104148, + "balance_loss_mlp": 1.06867135, + "epoch": 0.11831473643709119, + "flos": 621021399552.0, + "grad_norm": 0.06673690234795807, + "language_loss": 0.84267712, + "learning_rate": 0.0009796905944342094, + "loss": 0.85371858, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.35498047, + "step": 615, + "time_per_iteration": 2.848975896835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109381, + "balance_loss_mlp": 1.07271254, + "epoch": 0.11850711812235475, + "flos": 456438108672.0, + "grad_norm": 0.05638104592328917, + "language_loss": 0.88746947, + "learning_rate": 0.0009796026111937057, + "loss": 0.89856327, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.36645508, + "step": 616, + "time_per_iteration": 2.6446924209594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099952, + "balance_loss_mlp": 1.06347418, + "epoch": 0.11869949980761832, + "flos": 513598565376.0, + "grad_norm": 0.0626967176734064, + "language_loss": 0.88544255, + "learning_rate": 0.0009795144417549552, + "loss": 0.89644206, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.36474609, + "step": 617, + "time_per_iteration": 2.69419527053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103508, + "balance_loss_mlp": 1.0669111, + "epoch": 0.11889188149288188, + "flos": 534732171264.0, + "grad_norm": 0.05994069078035177, + "language_loss": 0.89591199, + "learning_rate": 0.0009794260861521883, + "loss": 0.90694714, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.36621094, + "step": 618, + "time_per_iteration": 2.771303653717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098344, + "balance_loss_mlp": 1.06262898, + "epoch": 0.11908426317814544, + "flos": 498344527872.0, + "grad_norm": 0.09079788596459537, + "language_loss": 0.86586368, + "learning_rate": 0.0009793375444197075, + "loss": 0.87684715, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.35742188, + "step": 619, + "time_per_iteration": 2.6239778995513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103508, + "balance_loss_mlp": 1.06724489, + "epoch": 0.119276644863409, + "flos": 659598833664.0, + "grad_norm": 0.07776663130635876, + "language_loss": 0.84681749, + "learning_rate": 0.000979248816591888, + "loss": 0.85785258, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.36254883, + "step": 620, + "time_per_iteration": 2.7932288646698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106745, + "balance_loss_mlp": 1.07043433, + "epoch": 0.11946902654867257, + "flos": 758396487168.0, + "grad_norm": 0.06665125523581683, + "language_loss": 0.85644066, + "learning_rate": 0.0009791599027031766, + "loss": 0.86750811, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.36303711, + "step": 621, + "time_per_iteration": 3.0138871669769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108028, + "balance_loss_mlp": 1.0721699, + "epoch": 0.11966140823393613, + "flos": 680697533952.0, + "grad_norm": 0.06722173914854768, + "language_loss": 0.85452718, + "learning_rate": 0.0009790708027880932, + "loss": 0.86560744, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.359375, + "step": 622, + "time_per_iteration": 2.8520967960357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217123, + "balance_loss_mlp": 1.192518, + "epoch": 0.11985378991919969, + "flos": 1450268070912.0, + "grad_norm": 0.04692620020290901, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78644413, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.24511719, + "step": 623, + "time_per_iteration": 4.820342302322388 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117536, + "balance_loss_mlp": 1.08251202, + "epoch": 0.12004617160446325, + "flos": 527586780672.0, + "grad_norm": 0.0795104629545964, + "language_loss": 0.93134129, + "learning_rate": 0.0009788920450172487, + "loss": 0.94251657, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.35058594, + "step": 624, + "time_per_iteration": 2.617030143737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112825, + "balance_loss_mlp": 1.09265435, + "epoch": 0.12023855328972682, + "flos": 473980861440.0, + "grad_norm": 0.07884849751459712, + "language_loss": 0.90174961, + "learning_rate": 0.0009788023872308875, + "loss": 0.91303217, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.35620117, + "step": 625, + "time_per_iteration": 2.5254392623901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218941, + "balance_loss_mlp": 1.19519401, + "epoch": 0.12043093497499038, + "flos": 1530954155520.0, + "grad_norm": 0.02704118444179952, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76647937, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.23730469, + "step": 626, + "time_per_iteration": 4.7286646366119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114575, + "balance_loss_mlp": 1.07936025, + "epoch": 0.12062331666025394, + "flos": 539571469824.0, + "grad_norm": 0.06954804859514781, + "language_loss": 0.9379338, + "learning_rate": 0.0009786225140303285, + "loss": 0.94907951, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.35253906, + "step": 627, + "time_per_iteration": 2.648557424545288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117384, + "balance_loss_mlp": 1.08155024, + "epoch": 0.1208156983455175, + "flos": 511634327040.0, + "grad_norm": 0.07877419782543724, + "language_loss": 0.91490531, + "learning_rate": 0.0009785322986859634, + "loss": 0.92607915, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.35864258, + "step": 628, + "time_per_iteration": 2.7282159328460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125787, + "balance_loss_mlp": 1.09014332, + "epoch": 0.12100808003078108, + "flos": 596195043840.0, + "grad_norm": 0.07794762914430453, + "language_loss": 0.92512405, + "learning_rate": 0.0009784418975588838, + "loss": 0.936382, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.35668945, + "step": 629, + "time_per_iteration": 2.709716320037842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117597, + "balance_loss_mlp": 1.08099949, + "epoch": 0.12120046171604464, + "flos": 522698019840.0, + "grad_norm": 0.06704717834334661, + "language_loss": 0.92910212, + "learning_rate": 0.0009783513106841862, + "loss": 0.94027811, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.3659668, + "step": 630, + "time_per_iteration": 2.7247745990753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268595, + "balance_loss_mlp": 1.24303675, + "epoch": 0.1213928434013082, + "flos": 1553605277184.0, + "grad_norm": 0.050831706918094084, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.78001297, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.25585938, + "step": 631, + "time_per_iteration": 4.973435163497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108872, + "balance_loss_mlp": 1.07263255, + "epoch": 0.12158522508657175, + "flos": 495143580672.0, + "grad_norm": 0.05936012058015608, + "language_loss": 0.87115383, + "learning_rate": 0.0009781695798326854, + "loss": 0.88224256, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.36303711, + "step": 632, + "time_per_iteration": 2.6066951751708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107124, + "balance_loss_mlp": 1.07109857, + "epoch": 0.12177760677183531, + "flos": 475335433728.0, + "grad_norm": 0.07579280109985519, + "language_loss": 0.87447512, + "learning_rate": 0.0009780784359264365, + "loss": 0.88554639, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.3605957, + "step": 633, + "time_per_iteration": 2.6627957820892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232879, + "balance_loss_mlp": 1.20541322, + "epoch": 0.12196998845709889, + "flos": 1467630351360.0, + "grad_norm": 0.035928730821781295, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75421578, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.27539062, + "step": 634, + "time_per_iteration": 4.774393796920776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097947, + "balance_loss_mlp": 1.06185055, + "epoch": 0.12216237014236245, + "flos": 586279309824.0, + "grad_norm": 0.06269897945868624, + "language_loss": 0.87202692, + "learning_rate": 0.000977895591329867, + "loss": 0.88300645, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.36108398, + "step": 635, + "time_per_iteration": 2.805889129638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108158, + "balance_loss_mlp": 1.0710839, + "epoch": 0.12235475182762601, + "flos": 597721324032.0, + "grad_norm": 0.0813284132777598, + "language_loss": 0.86332333, + "learning_rate": 0.000977803890710533, + "loss": 0.87440491, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.37060547, + "step": 636, + "time_per_iteration": 2.740208864212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104442, + "balance_loss_mlp": 1.06927526, + "epoch": 0.12254713351288957, + "flos": 497487550464.0, + "grad_norm": 0.05990721463683031, + "language_loss": 0.92840338, + "learning_rate": 0.0009777120045912774, + "loss": 0.93944776, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.35205078, + "step": 637, + "time_per_iteration": 2.599487543106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099731, + "balance_loss_mlp": 1.06246591, + "epoch": 0.12273951519815314, + "flos": 605565130752.0, + "grad_norm": 0.06926890859373311, + "language_loss": 0.89462954, + "learning_rate": 0.0009776199330077736, + "loss": 0.90562689, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.37231445, + "step": 638, + "time_per_iteration": 2.7127702236175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109285, + "balance_loss_mlp": 1.07263994, + "epoch": 0.1229318968834167, + "flos": 597578729472.0, + "grad_norm": 0.06829584029278382, + "language_loss": 0.91875821, + "learning_rate": 0.0009775276759957667, + "loss": 0.92985106, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.36645508, + "step": 639, + "time_per_iteration": 2.7092959880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109534, + "balance_loss_mlp": 1.07269859, + "epoch": 0.12312427856868026, + "flos": 678082931712.0, + "grad_norm": 0.08396579350539743, + "language_loss": 0.8972953, + "learning_rate": 0.0009774352335910745, + "loss": 0.90839064, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.36816406, + "step": 640, + "time_per_iteration": 2.810391664505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104708, + "balance_loss_mlp": 1.067729, + "epoch": 0.12331666025394382, + "flos": 608656978944.0, + "grad_norm": 0.07323302973942612, + "language_loss": 0.94222069, + "learning_rate": 0.000977342605829586, + "loss": 0.95326775, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.36962891, + "step": 641, + "time_per_iteration": 2.7107834815979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112179, + "balance_loss_mlp": 1.07624888, + "epoch": 0.12350904193920739, + "flos": 762172194816.0, + "grad_norm": 0.07665420533577341, + "language_loss": 0.85291827, + "learning_rate": 0.0009772497927472623, + "loss": 0.86404008, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.359375, + "step": 642, + "time_per_iteration": 3.0403058528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116924, + "balance_loss_mlp": 1.08006442, + "epoch": 0.12370142362447095, + "flos": 540699079680.0, + "grad_norm": 0.07222690714452404, + "language_loss": 0.84284675, + "learning_rate": 0.0009771567943801368, + "loss": 0.85401607, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.3684082, + "step": 643, + "time_per_iteration": 2.684351682662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112565, + "balance_loss_mlp": 1.07615817, + "epoch": 0.12389380530973451, + "flos": 547848852480.0, + "grad_norm": 0.07333206449495522, + "language_loss": 0.88927472, + "learning_rate": 0.0009770636107643152, + "loss": 0.9004004, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.36450195, + "step": 644, + "time_per_iteration": 2.697791337966919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124284, + "balance_loss_mlp": 1.0884738, + "epoch": 0.12408618699499807, + "flos": 540048715776.0, + "grad_norm": 0.07501614361753556, + "language_loss": 0.87213039, + "learning_rate": 0.0009769702419359738, + "loss": 0.88337326, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.35864258, + "step": 645, + "time_per_iteration": 2.614753246307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132185, + "balance_loss_mlp": 1.09604049, + "epoch": 0.12427856868026164, + "flos": 745451513856.0, + "grad_norm": 0.08258832766371556, + "language_loss": 0.88905025, + "learning_rate": 0.000976876687931362, + "loss": 0.90037215, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.36181641, + "step": 646, + "time_per_iteration": 2.9785215854644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125302, + "balance_loss_mlp": 1.08853781, + "epoch": 0.1244709503655252, + "flos": 533460556800.0, + "grad_norm": 0.0911173559535341, + "language_loss": 0.84276652, + "learning_rate": 0.0009767829487868005, + "loss": 0.85401952, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.36767578, + "step": 647, + "time_per_iteration": 2.578190326690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115883, + "balance_loss_mlp": 1.07911873, + "epoch": 0.12466333205078876, + "flos": 507847034880.0, + "grad_norm": 0.07020857762254842, + "language_loss": 0.88315135, + "learning_rate": 0.000976689024538682, + "loss": 0.89431018, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.36743164, + "step": 648, + "time_per_iteration": 2.6223652362823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111573, + "balance_loss_mlp": 1.07841754, + "epoch": 0.12485571373605232, + "flos": 681023420928.0, + "grad_norm": 0.08555408637061691, + "language_loss": 0.86419356, + "learning_rate": 0.0009765949152234716, + "loss": 0.87535083, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.37280273, + "step": 649, + "time_per_iteration": 2.882483959197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304556, + "balance_loss_mlp": 1.27480125, + "epoch": 0.1250480954213159, + "flos": 1329402668544.0, + "grad_norm": 0.07016402939707722, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79990637, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.296875, + "step": 650, + "time_per_iteration": 4.66938042640686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109485, + "balance_loss_mlp": 1.05882525, + "epoch": 0.12524047710657946, + "flos": 938140683264.0, + "grad_norm": 0.06927891842453628, + "language_loss": 0.81679136, + "learning_rate": 0.0009764061415379919, + "loss": 0.82773983, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.36035156, + "step": 651, + "time_per_iteration": 3.2698771953582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096413, + "balance_loss_mlp": 1.05874252, + "epoch": 0.12543285879184302, + "flos": 513642235392.0, + "grad_norm": 0.07412805631018828, + "language_loss": 0.88318801, + "learning_rate": 0.0009763114772410109, + "loss": 0.89415216, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.37646484, + "step": 652, + "time_per_iteration": 2.5928001403808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115398, + "balance_loss_mlp": 1.0775615, + "epoch": 0.12562524047710658, + "flos": 717991617024.0, + "grad_norm": 0.06901346528680578, + "language_loss": 0.85726613, + "learning_rate": 0.0009762166280235146, + "loss": 0.86842012, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.37817383, + "step": 653, + "time_per_iteration": 2.954763412475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135328, + "balance_loss_mlp": 1.0974437, + "epoch": 0.12581762216237014, + "flos": 563441923584.0, + "grad_norm": 0.10573688852470094, + "language_loss": 0.86465615, + "learning_rate": 0.0009761215939223267, + "loss": 0.87600946, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.37866211, + "step": 654, + "time_per_iteration": 2.715884208679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132858, + "balance_loss_mlp": 1.09599805, + "epoch": 0.1260100038476337, + "flos": 481642785792.0, + "grad_norm": 0.09937756240260763, + "language_loss": 0.85917866, + "learning_rate": 0.0009760263749743428, + "loss": 0.87050724, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.3684082, + "step": 655, + "time_per_iteration": 2.565927505493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115454, + "balance_loss_mlp": 1.07847536, + "epoch": 0.12620238553289725, + "flos": 575269461504.0, + "grad_norm": 0.07472608136964497, + "language_loss": 0.89487195, + "learning_rate": 0.0009759309712165299, + "loss": 0.90602648, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.36962891, + "step": 656, + "time_per_iteration": 2.721547842025757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095241, + "balance_loss_mlp": 1.06002665, + "epoch": 0.12639476721816084, + "flos": 530909973504.0, + "grad_norm": 0.06565081457641837, + "language_loss": 0.92494375, + "learning_rate": 0.0009758353826859272, + "loss": 0.9358961, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.3527832, + "step": 657, + "time_per_iteration": 2.6744871139526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095397, + "balance_loss_mlp": 1.05891895, + "epoch": 0.1265871489034244, + "flos": 689654393856.0, + "grad_norm": 0.09523432489761414, + "language_loss": 0.88095021, + "learning_rate": 0.0009757396094196456, + "loss": 0.89190418, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36499023, + "step": 658, + "time_per_iteration": 2.909353256225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103392, + "balance_loss_mlp": 1.06801057, + "epoch": 0.12677953058868796, + "flos": 536863735296.0, + "grad_norm": 0.06690202483268812, + "language_loss": 0.8320483, + "learning_rate": 0.0009756436514548673, + "loss": 0.84308219, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.35449219, + "step": 659, + "time_per_iteration": 2.865816831588745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096361, + "balance_loss_mlp": 1.06143236, + "epoch": 0.12697191227395152, + "flos": 518749194240.0, + "grad_norm": 0.06842887259152383, + "language_loss": 0.87790155, + "learning_rate": 0.0009755475088288466, + "loss": 0.88886517, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.34985352, + "step": 660, + "time_per_iteration": 2.727024793624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095382, + "balance_loss_mlp": 1.06145549, + "epoch": 0.12716429395921508, + "flos": 566341714944.0, + "grad_norm": 0.09688683984474739, + "language_loss": 0.89628965, + "learning_rate": 0.0009754511815789095, + "loss": 0.90724349, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.33959961, + "step": 661, + "time_per_iteration": 2.857279062271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099651, + "balance_loss_mlp": 1.06441295, + "epoch": 0.12735667564447864, + "flos": 513844466688.0, + "grad_norm": 0.0675215866547423, + "language_loss": 0.85062414, + "learning_rate": 0.0009753546697424533, + "loss": 0.86162066, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.3527832, + "step": 662, + "time_per_iteration": 2.670924425125122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112335, + "balance_loss_mlp": 1.07750201, + "epoch": 0.1275490573297422, + "flos": 541023556608.0, + "grad_norm": 0.0877117205425541, + "language_loss": 0.89430654, + "learning_rate": 0.0009752579733569475, + "loss": 0.90542984, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.34887695, + "step": 663, + "time_per_iteration": 2.708876609802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270721, + "balance_loss_mlp": 1.24678338, + "epoch": 0.12774143901500576, + "flos": 1557872787456.0, + "grad_norm": 0.04579657173262409, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7615211, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.23925781, + "step": 664, + "time_per_iteration": 4.956411123275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112296, + "balance_loss_mlp": 1.07724893, + "epoch": 0.12793382070026935, + "flos": 613462781952.0, + "grad_norm": 0.07589772420679435, + "language_loss": 0.88920283, + "learning_rate": 0.0009750640270890217, + "loss": 0.90032578, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.35083008, + "step": 665, + "time_per_iteration": 2.7128844261169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111829, + "balance_loss_mlp": 1.08357668, + "epoch": 0.1281262023855329, + "flos": 707386231296.0, + "grad_norm": 0.09170618066625874, + "language_loss": 0.9529534, + "learning_rate": 0.0009749667772818983, + "loss": 0.9641363, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.34765625, + "step": 666, + "time_per_iteration": 3.001779794692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119074, + "balance_loss_mlp": 1.16718388, + "epoch": 0.12831858407079647, + "flos": 1424250086400.0, + "grad_norm": 0.026171542208985103, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78126681, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.23535156, + "step": 667, + "time_per_iteration": 4.816860914230347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097707, + "balance_loss_mlp": 1.06239688, + "epoch": 0.12851096575606002, + "flos": 448869316608.0, + "grad_norm": 0.08174433959814813, + "language_loss": 0.94348264, + "learning_rate": 0.0009747717245101093, + "loss": 0.95445979, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.35351562, + "step": 668, + "time_per_iteration": 2.5237252712249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092064, + "balance_loss_mlp": 1.05851901, + "epoch": 0.12870334744132358, + "flos": 479697486336.0, + "grad_norm": 0.09843416488997592, + "language_loss": 0.84683162, + "learning_rate": 0.00097467392162117, + "loss": 0.85775226, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.33544922, + "step": 669, + "time_per_iteration": 2.6030120849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103326, + "balance_loss_mlp": 1.06987596, + "epoch": 0.12889572912658714, + "flos": 638633963520.0, + "grad_norm": 0.06975318327908253, + "language_loss": 0.90683615, + "learning_rate": 0.0009745759344474708, + "loss": 0.91786939, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.3347168, + "step": 670, + "time_per_iteration": 2.81622576713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112132, + "balance_loss_mlp": 1.08779824, + "epoch": 0.1290881108118507, + "flos": 509693409792.0, + "grad_norm": 0.09191121702256037, + "language_loss": 0.88668084, + "learning_rate": 0.0009744777630270536, + "loss": 0.89789402, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.33544922, + "step": 671, + "time_per_iteration": 2.573746681213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131545, + "balance_loss_mlp": 1.09673548, + "epoch": 0.12928049249711426, + "flos": 670746894336.0, + "grad_norm": 0.0798229463492689, + "language_loss": 0.92632008, + "learning_rate": 0.000974379407398032, + "loss": 0.93763554, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.34863281, + "step": 672, + "time_per_iteration": 2.8804330825805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128596, + "balance_loss_mlp": 1.09471667, + "epoch": 0.12947287418237785, + "flos": 793158925824.0, + "grad_norm": 0.060594592327224854, + "language_loss": 0.81539643, + "learning_rate": 0.0009742808675985913, + "loss": 0.82668233, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.33911133, + "step": 673, + "time_per_iteration": 3.093003273010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144697, + "balance_loss_mlp": 1.11019778, + "epoch": 0.1296652558676414, + "flos": 485222054400.0, + "grad_norm": 0.09187527541403225, + "language_loss": 0.90132761, + "learning_rate": 0.0009741821436669876, + "loss": 0.91277468, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.34521484, + "step": 674, + "time_per_iteration": 2.585315227508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122846, + "balance_loss_mlp": 1.08925223, + "epoch": 0.12985763755290497, + "flos": 453226987008.0, + "grad_norm": 0.08498532425721701, + "language_loss": 0.91794449, + "learning_rate": 0.0009740832356415492, + "loss": 0.92917299, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.3359375, + "step": 675, + "time_per_iteration": 2.4971120357513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112081, + "balance_loss_mlp": 1.08714533, + "epoch": 0.13005001923816853, + "flos": 824719007232.0, + "grad_norm": 0.07677288344190451, + "language_loss": 0.87289226, + "learning_rate": 0.0009739841435606756, + "loss": 0.88410038, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.33691406, + "step": 676, + "time_per_iteration": 3.04789137840271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110492, + "balance_loss_mlp": 1.07670832, + "epoch": 0.1302424009234321, + "flos": 531107822592.0, + "grad_norm": 0.05631932912809994, + "language_loss": 0.89408028, + "learning_rate": 0.0009738848674628377, + "loss": 0.90518522, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.33789062, + "step": 677, + "time_per_iteration": 2.7033560276031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115712, + "balance_loss_mlp": 1.08161807, + "epoch": 0.13043478260869565, + "flos": 525626924544.0, + "grad_norm": 0.06061927769746001, + "language_loss": 0.88112855, + "learning_rate": 0.000973785407386578, + "loss": 0.8922857, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.34130859, + "step": 678, + "time_per_iteration": 2.7593955993652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111886, + "balance_loss_mlp": 1.07671893, + "epoch": 0.1306271642939592, + "flos": 625862108160.0, + "grad_norm": 0.0561156652888081, + "language_loss": 0.86748564, + "learning_rate": 0.0009736857633705103, + "loss": 0.87860453, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.35180664, + "step": 679, + "time_per_iteration": 2.859600067138672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104313, + "balance_loss_mlp": 1.07002795, + "epoch": 0.13081954597922277, + "flos": 550438723584.0, + "grad_norm": 0.058910355701146846, + "language_loss": 0.92178285, + "learning_rate": 0.0009735859354533196, + "loss": 0.93282604, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.34301758, + "step": 680, + "time_per_iteration": 2.7124130725860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097833, + "balance_loss_mlp": 1.06321418, + "epoch": 0.13101192766448633, + "flos": 536651329536.0, + "grad_norm": 0.0839399897160516, + "language_loss": 0.91048056, + "learning_rate": 0.0009734859236737628, + "loss": 0.92145896, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.34643555, + "step": 681, + "time_per_iteration": 2.627246856689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097126, + "balance_loss_mlp": 1.06102967, + "epoch": 0.13120430934974991, + "flos": 503258019840.0, + "grad_norm": 0.07457249787820815, + "language_loss": 0.92922121, + "learning_rate": 0.0009733857280706678, + "loss": 0.94019246, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.36108398, + "step": 682, + "time_per_iteration": 2.656088352203369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011015, + "balance_loss_mlp": 1.06669104, + "epoch": 0.13139669103501347, + "flos": 614014221312.0, + "grad_norm": 0.08799075641073119, + "language_loss": 0.83452725, + "learning_rate": 0.000973285348682934, + "loss": 0.84554225, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.34838867, + "step": 683, + "time_per_iteration": 2.714932441711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250838, + "balance_loss_mlp": 1.22547078, + "epoch": 0.13158907272027703, + "flos": 1484163357696.0, + "grad_norm": 0.05910904833943088, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.7914921, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.25390625, + "step": 684, + "time_per_iteration": 4.823149681091309 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102069, + "balance_loss_mlp": 1.06754637, + "epoch": 0.1317814544055406, + "flos": 985049344512.0, + "grad_norm": 0.06093749611395137, + "language_loss": 0.84928876, + "learning_rate": 0.0009730840387095046, + "loss": 0.86030942, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.34570312, + "step": 685, + "time_per_iteration": 3.2810635566711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112887, + "balance_loss_mlp": 1.07876921, + "epoch": 0.13197383609080415, + "flos": 611163892224.0, + "grad_norm": 0.0719979787644836, + "language_loss": 0.90753949, + "learning_rate": 0.0009729831082019642, + "loss": 0.91866839, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.34155273, + "step": 686, + "time_per_iteration": 2.8016421794891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121765, + "balance_loss_mlp": 1.08740878, + "epoch": 0.1321662177760677, + "flos": 494116305408.0, + "grad_norm": 0.06743381273529321, + "language_loss": 0.88199198, + "learning_rate": 0.0009728819940660958, + "loss": 0.89320958, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.34375, + "step": 687, + "time_per_iteration": 2.753110885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123642, + "balance_loss_mlp": 1.08966768, + "epoch": 0.13235859946133127, + "flos": 495591713280.0, + "grad_norm": 0.07411002639607889, + "language_loss": 0.84702134, + "learning_rate": 0.0009727806963411557, + "loss": 0.85825777, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.34008789, + "step": 688, + "time_per_iteration": 2.638277292251587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118088, + "balance_loss_mlp": 1.08342147, + "epoch": 0.13255098114659483, + "flos": 511417539072.0, + "grad_norm": 0.07589947069642403, + "language_loss": 0.86972356, + "learning_rate": 0.000972679215066471, + "loss": 0.88090444, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.34692383, + "step": 689, + "time_per_iteration": 2.6977994441986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102626, + "balance_loss_mlp": 1.06865191, + "epoch": 0.13274336283185842, + "flos": 547114120704.0, + "grad_norm": 0.07819243817703804, + "language_loss": 0.98617494, + "learning_rate": 0.0009725775502814401, + "loss": 0.99720132, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.33984375, + "step": 690, + "time_per_iteration": 2.648946523666382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094739, + "balance_loss_mlp": 1.05864239, + "epoch": 0.13293574451712198, + "flos": 640465781760.0, + "grad_norm": 0.059114915842817355, + "language_loss": 0.84878647, + "learning_rate": 0.0009724757020255327, + "loss": 0.85973388, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.36108398, + "step": 691, + "time_per_iteration": 2.8732690811157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082897, + "balance_loss_mlp": 1.04782593, + "epoch": 0.13312812620238554, + "flos": 491234042880.0, + "grad_norm": 0.07438205452368939, + "language_loss": 0.87005877, + "learning_rate": 0.0009723736703382902, + "loss": 0.88088775, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.35107422, + "step": 692, + "time_per_iteration": 2.554645299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107941, + "balance_loss_mlp": 1.04352796, + "epoch": 0.1333205078876491, + "flos": 508693837824.0, + "grad_norm": 0.08618570028449021, + "language_loss": 0.82726276, + "learning_rate": 0.0009722714552593244, + "loss": 0.8380568, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.35888672, + "step": 693, + "time_per_iteration": 2.6300699710845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108349, + "balance_loss_mlp": 1.04763222, + "epoch": 0.13351288957291266, + "flos": 418474722816.0, + "grad_norm": 0.09336455895373029, + "language_loss": 0.93701726, + "learning_rate": 0.000972169056828319, + "loss": 0.94785213, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.35864258, + "step": 694, + "time_per_iteration": 2.4744653701782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089355, + "balance_loss_mlp": 1.05309105, + "epoch": 0.13370527125817622, + "flos": 615614694912.0, + "grad_norm": 0.09775538219544704, + "language_loss": 0.87267971, + "learning_rate": 0.0009720664750850283, + "loss": 0.88357329, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.36279297, + "step": 695, + "time_per_iteration": 2.819199562072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087558, + "balance_loss_mlp": 1.05196249, + "epoch": 0.13389765294343978, + "flos": 625757391360.0, + "grad_norm": 0.08995446617022443, + "language_loss": 0.92670894, + "learning_rate": 0.0009719637100692784, + "loss": 0.93758452, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.35644531, + "step": 696, + "time_per_iteration": 2.710566997528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089346, + "balance_loss_mlp": 1.05460882, + "epoch": 0.13409003462870334, + "flos": 609391710720.0, + "grad_norm": 0.07471473065547057, + "language_loss": 0.82606006, + "learning_rate": 0.0009718607618209661, + "loss": 0.83695352, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.34765625, + "step": 697, + "time_per_iteration": 2.860895872116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100417, + "balance_loss_mlp": 1.06677604, + "epoch": 0.13428241631396692, + "flos": 683499810816.0, + "grad_norm": 0.06757273414028586, + "language_loss": 0.87573737, + "learning_rate": 0.0009717576303800595, + "loss": 0.88674152, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.33666992, + "step": 698, + "time_per_iteration": 3.044128894805908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105218, + "balance_loss_mlp": 1.07102871, + "epoch": 0.13447479799923048, + "flos": 508565799936.0, + "grad_norm": 0.06392403589518669, + "language_loss": 0.85563833, + "learning_rate": 0.0009716543157865975, + "loss": 0.86669052, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.34228516, + "step": 699, + "time_per_iteration": 2.6879220008850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124277, + "balance_loss_mlp": 1.08968258, + "epoch": 0.13466717968449404, + "flos": 897124737024.0, + "grad_norm": 0.10281325358067626, + "language_loss": 0.83577156, + "learning_rate": 0.0009715508180806907, + "loss": 0.84701437, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.34643555, + "step": 700, + "time_per_iteration": 3.1908302307128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132528, + "balance_loss_mlp": 1.09848189, + "epoch": 0.1348595613697576, + "flos": 989501557248.0, + "grad_norm": 0.07337445630948206, + "language_loss": 0.89328271, + "learning_rate": 0.0009714471373025202, + "loss": 0.90460801, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.34082031, + "step": 701, + "time_per_iteration": 3.438918113708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121396, + "balance_loss_mlp": 1.08704007, + "epoch": 0.13505194305502116, + "flos": 487580580864.0, + "grad_norm": 0.06971370423164719, + "language_loss": 0.88653499, + "learning_rate": 0.0009713432734923386, + "loss": 0.89774895, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.34399414, + "step": 702, + "time_per_iteration": 2.640204668045044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118965, + "balance_loss_mlp": 1.08372688, + "epoch": 0.13524432474028472, + "flos": 613103399424.0, + "grad_norm": 0.06937758634579687, + "language_loss": 0.8635335, + "learning_rate": 0.0009712392266904696, + "loss": 0.87472308, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.3527832, + "step": 703, + "time_per_iteration": 2.7081639766693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107456, + "balance_loss_mlp": 1.07381546, + "epoch": 0.13543670642554828, + "flos": 904425868800.0, + "grad_norm": 0.059624368341773884, + "language_loss": 0.8470363, + "learning_rate": 0.0009711349969373076, + "loss": 0.8581109, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.33666992, + "step": 704, + "time_per_iteration": 3.185788154602051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120367, + "balance_loss_mlp": 1.08629751, + "epoch": 0.13562908811081184, + "flos": 550335416832.0, + "grad_norm": 0.06837289886431508, + "language_loss": 0.80139232, + "learning_rate": 0.0009710305842733178, + "loss": 0.81259602, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.34106445, + "step": 705, + "time_per_iteration": 2.7622249126434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119894, + "balance_loss_mlp": 1.08534753, + "epoch": 0.1358214697960754, + "flos": 507797572608.0, + "grad_norm": 0.07938339172549091, + "language_loss": 0.89516854, + "learning_rate": 0.0009709259887390373, + "loss": 0.90636754, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.34570312, + "step": 706, + "time_per_iteration": 2.5919415950775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112775, + "balance_loss_mlp": 1.09141469, + "epoch": 0.136013851481339, + "flos": 528640197120.0, + "grad_norm": 0.10398540964391637, + "language_loss": 0.90775406, + "learning_rate": 0.0009708212103750737, + "loss": 0.9190315, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.36328125, + "step": 707, + "time_per_iteration": 2.601414680480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118489, + "balance_loss_mlp": 1.0827502, + "epoch": 0.13620623316660255, + "flos": 658772379648.0, + "grad_norm": 0.10289617102375577, + "language_loss": 0.87215245, + "learning_rate": 0.0009707162492221051, + "loss": 0.88333738, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.35766602, + "step": 708, + "time_per_iteration": 2.9150781631469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107244, + "balance_loss_mlp": 1.07193458, + "epoch": 0.1363986148518661, + "flos": 671583522816.0, + "grad_norm": 0.07053364895365258, + "language_loss": 0.88057113, + "learning_rate": 0.0009706111053208815, + "loss": 0.89164358, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.35375977, + "step": 709, + "time_per_iteration": 2.8282904624938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104491, + "balance_loss_mlp": 1.06801295, + "epoch": 0.13659099653712967, + "flos": 472828520448.0, + "grad_norm": 0.06130049777218646, + "language_loss": 0.85717642, + "learning_rate": 0.0009705057787122232, + "loss": 0.86822134, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.36499023, + "step": 710, + "time_per_iteration": 2.577875852584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115861, + "balance_loss_mlp": 1.07890666, + "epoch": 0.13678337822239323, + "flos": 452483490816.0, + "grad_norm": 0.06671527486676954, + "language_loss": 0.91032815, + "learning_rate": 0.0009704002694370216, + "loss": 0.92148674, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.36962891, + "step": 711, + "time_per_iteration": 2.5226385593414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113129, + "balance_loss_mlp": 1.09509826, + "epoch": 0.13697575990765679, + "flos": 519373416960.0, + "grad_norm": 0.06767720569390717, + "language_loss": 0.8601349, + "learning_rate": 0.0009702945775362388, + "loss": 0.8714478, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.36206055, + "step": 712, + "time_per_iteration": 2.6134419441223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129624, + "balance_loss_mlp": 1.09214449, + "epoch": 0.13716814159292035, + "flos": 480145618944.0, + "grad_norm": 0.06923332159298135, + "language_loss": 0.86543357, + "learning_rate": 0.0009701887030509086, + "loss": 0.87672985, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.37426758, + "step": 713, + "time_per_iteration": 2.6801493167877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123971, + "balance_loss_mlp": 1.08735013, + "epoch": 0.1373605232781839, + "flos": 545376844800.0, + "grad_norm": 0.08447530320779993, + "language_loss": 0.90941691, + "learning_rate": 0.0009700826460221346, + "loss": 0.92065662, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.36645508, + "step": 714, + "time_per_iteration": 2.6499831676483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124692, + "balance_loss_mlp": 1.0878799, + "epoch": 0.1375529049634475, + "flos": 708473143296.0, + "grad_norm": 0.08158263793675288, + "language_loss": 0.92094153, + "learning_rate": 0.0009699764064910921, + "loss": 0.93218845, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.36816406, + "step": 715, + "time_per_iteration": 2.8663330078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100593, + "balance_loss_mlp": 1.0652591, + "epoch": 0.13774528664871105, + "flos": 486452971008.0, + "grad_norm": 0.0638700652453299, + "language_loss": 0.86489999, + "learning_rate": 0.0009698699844990268, + "loss": 0.87590599, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.35351562, + "step": 716, + "time_per_iteration": 2.680769443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097936, + "balance_loss_mlp": 1.06236374, + "epoch": 0.1379376683339746, + "flos": 679885636608.0, + "grad_norm": 0.06268585455781102, + "language_loss": 0.87917447, + "learning_rate": 0.0009697633800872555, + "loss": 0.89015377, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.35595703, + "step": 717, + "time_per_iteration": 2.965280532836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095044, + "balance_loss_mlp": 1.05956769, + "epoch": 0.13813005001923817, + "flos": 610628419584.0, + "grad_norm": 0.06824665625382514, + "language_loss": 0.9079777, + "learning_rate": 0.0009696565932971655, + "loss": 0.91892809, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.35498047, + "step": 718, + "time_per_iteration": 2.896911144256592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089845, + "balance_loss_mlp": 1.05451119, + "epoch": 0.13832243170450173, + "flos": 588431222784.0, + "grad_norm": 0.09498294885790176, + "language_loss": 0.89284754, + "learning_rate": 0.0009695496241702153, + "loss": 0.90374601, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.35375977, + "step": 719, + "time_per_iteration": 2.7762036323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100053, + "balance_loss_mlp": 1.0647912, + "epoch": 0.1385148133897653, + "flos": 699674844672.0, + "grad_norm": 0.06645840883514359, + "language_loss": 0.85660797, + "learning_rate": 0.0009694424727479339, + "loss": 0.86760849, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.3527832, + "step": 720, + "time_per_iteration": 2.899481773376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105877, + "balance_loss_mlp": 1.06997156, + "epoch": 0.13870719507502885, + "flos": 597977399808.0, + "grad_norm": 0.0836580120862117, + "language_loss": 0.88687581, + "learning_rate": 0.0009693351390719213, + "loss": 0.89793456, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.35913086, + "step": 721, + "time_per_iteration": 2.7242469787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116308, + "balance_loss_mlp": 1.08071184, + "epoch": 0.1388995767602924, + "flos": 586279309824.0, + "grad_norm": 0.0677561083547336, + "language_loss": 0.90886325, + "learning_rate": 0.000969227623183848, + "loss": 0.9200263, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.35595703, + "step": 722, + "time_per_iteration": 2.819762706756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122367, + "balance_loss_mlp": 1.08719993, + "epoch": 0.139091958445556, + "flos": 650810709504.0, + "grad_norm": 0.06096675577850975, + "language_loss": 0.9079504, + "learning_rate": 0.0009691199251254554, + "loss": 0.91917408, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.35180664, + "step": 723, + "time_per_iteration": 2.9057154655456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111876, + "balance_loss_mlp": 1.08368921, + "epoch": 0.13928434013081956, + "flos": 575446961664.0, + "grad_norm": 0.07869545166834224, + "language_loss": 0.86502081, + "learning_rate": 0.0009690120449385555, + "loss": 0.87620842, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.35107422, + "step": 724, + "time_per_iteration": 2.753779411315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117404, + "balance_loss_mlp": 1.08164096, + "epoch": 0.13947672181608312, + "flos": 562954503168.0, + "grad_norm": 0.05745765153927115, + "language_loss": 0.92949581, + "learning_rate": 0.0009689039826650312, + "loss": 0.94066983, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.35791016, + "step": 725, + "time_per_iteration": 2.7707176208496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01358579, + "balance_loss_mlp": 1.33788455, + "epoch": 0.13966910350134668, + "flos": 1520699387904.0, + "grad_norm": 0.08980106345901108, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77881646, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.20703125, + "step": 726, + "time_per_iteration": 4.990100383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122131, + "balance_loss_mlp": 1.08632064, + "epoch": 0.13986148518661023, + "flos": 499604557824.0, + "grad_norm": 0.08882129772973828, + "language_loss": 0.8687858, + "learning_rate": 0.0009686873120259941, + "loss": 0.88000709, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.35839844, + "step": 727, + "time_per_iteration": 2.598994255065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124101, + "balance_loss_mlp": 1.08914924, + "epoch": 0.1400538668718738, + "flos": 598381862400.0, + "grad_norm": 0.060515823337661194, + "language_loss": 0.86860693, + "learning_rate": 0.0009685787037446004, + "loss": 0.879848, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.34985352, + "step": 728, + "time_per_iteration": 2.818753957748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117164, + "balance_loss_mlp": 1.08252215, + "epoch": 0.14024624855713735, + "flos": 593757941760.0, + "grad_norm": 0.07103959200550099, + "language_loss": 0.86954272, + "learning_rate": 0.0009684699135448201, + "loss": 0.88071442, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.34667969, + "step": 729, + "time_per_iteration": 2.7140605449676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117139, + "balance_loss_mlp": 1.08190084, + "epoch": 0.1404386302424009, + "flos": 506335311360.0, + "grad_norm": 0.05207553557344927, + "language_loss": 0.91554511, + "learning_rate": 0.0009683609414688895, + "loss": 0.92671645, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.3527832, + "step": 730, + "time_per_iteration": 2.700392961502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115792, + "balance_loss_mlp": 1.08076811, + "epoch": 0.14063101192766447, + "flos": 573132105216.0, + "grad_norm": 0.0649489891311747, + "language_loss": 0.85963869, + "learning_rate": 0.0009682517875591154, + "loss": 0.87079668, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.35058594, + "step": 731, + "time_per_iteration": 2.7288033962249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108316, + "balance_loss_mlp": 1.07329249, + "epoch": 0.14082339361292806, + "flos": 564333806592.0, + "grad_norm": 0.08055333626892905, + "language_loss": 0.8568505, + "learning_rate": 0.0009681424518578749, + "loss": 0.86793363, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.35058594, + "step": 732, + "time_per_iteration": 2.7607100009918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098228, + "balance_loss_mlp": 1.06337106, + "epoch": 0.14101577529819162, + "flos": 463336187904.0, + "grad_norm": 0.057006483972196494, + "language_loss": 0.87377727, + "learning_rate": 0.000968032934407616, + "loss": 0.8847596, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.34912109, + "step": 733, + "time_per_iteration": 2.5924746990203857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109135, + "balance_loss_mlp": 1.05708933, + "epoch": 0.14120815698345518, + "flos": 595791991296.0, + "grad_norm": 0.06839942690263572, + "language_loss": 0.81019294, + "learning_rate": 0.0009679232352508571, + "loss": 0.82110655, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.34301758, + "step": 734, + "time_per_iteration": 2.7993721961975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100792, + "balance_loss_mlp": 1.06455231, + "epoch": 0.14140053866871874, + "flos": 534864591360.0, + "grad_norm": 0.05863508932167985, + "language_loss": 0.80278933, + "learning_rate": 0.0009678133544301871, + "loss": 0.8137973, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.36254883, + "step": 735, + "time_per_iteration": 2.673874855041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094272, + "balance_loss_mlp": 1.05881953, + "epoch": 0.1415929203539823, + "flos": 520013606400.0, + "grad_norm": 0.05551108490857041, + "language_loss": 0.91367602, + "learning_rate": 0.0009677032919882658, + "loss": 0.92461878, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.35473633, + "step": 736, + "time_per_iteration": 2.654606580734253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096366, + "balance_loss_mlp": 1.06012654, + "epoch": 0.14178530203924586, + "flos": 482095300608.0, + "grad_norm": 0.07346959128329188, + "language_loss": 0.91181809, + "learning_rate": 0.000967593047967823, + "loss": 0.92278177, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.36230469, + "step": 737, + "time_per_iteration": 2.559713125228882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096247, + "balance_loss_mlp": 1.06096137, + "epoch": 0.14197768372450942, + "flos": 676339863552.0, + "grad_norm": 0.08415375039396082, + "language_loss": 0.86267197, + "learning_rate": 0.0009674826224116593, + "loss": 0.87363446, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.35302734, + "step": 738, + "time_per_iteration": 2.809206485748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097639, + "balance_loss_mlp": 1.06197131, + "epoch": 0.14217006540977298, + "flos": 445802199552.0, + "grad_norm": 0.07057178035488912, + "language_loss": 0.86339009, + "learning_rate": 0.0009673720153626455, + "loss": 0.87436646, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.35668945, + "step": 739, + "time_per_iteration": 2.612968683242798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104371, + "balance_loss_mlp": 1.06848931, + "epoch": 0.14236244709503657, + "flos": 496261016064.0, + "grad_norm": 0.07271668848978735, + "language_loss": 0.87052834, + "learning_rate": 0.0009672612268637235, + "loss": 0.88157207, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.35913086, + "step": 740, + "time_per_iteration": 2.61069393157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110664, + "balance_loss_mlp": 1.0753777, + "epoch": 0.14255482878030012, + "flos": 648022989312.0, + "grad_norm": 0.0891355718419961, + "language_loss": 0.84501529, + "learning_rate": 0.0009671502569579048, + "loss": 0.85612196, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.35302734, + "step": 741, + "time_per_iteration": 2.735647201538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110487, + "balance_loss_mlp": 1.07122874, + "epoch": 0.14274721046556368, + "flos": 535888894464.0, + "grad_norm": 0.08695556970227908, + "language_loss": 0.89623845, + "learning_rate": 0.0009670391056882719, + "loss": 0.90728712, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.33666992, + "step": 742, + "time_per_iteration": 2.7107605934143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112128, + "balance_loss_mlp": 1.07879674, + "epoch": 0.14293959215082724, + "flos": 956677215744.0, + "grad_norm": 0.07027307452403737, + "language_loss": 0.88442421, + "learning_rate": 0.0009669277730979776, + "loss": 0.89554548, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.33349609, + "step": 743, + "time_per_iteration": 3.188511371612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106141, + "balance_loss_mlp": 1.07295275, + "epoch": 0.1431319738360908, + "flos": 692766590976.0, + "grad_norm": 0.060274127994165407, + "language_loss": 0.85487998, + "learning_rate": 0.0009668162592302449, + "loss": 0.86594141, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.33203125, + "step": 744, + "time_per_iteration": 2.912363290786743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111089, + "balance_loss_mlp": 1.07715416, + "epoch": 0.14332435552135436, + "flos": 565174817280.0, + "grad_norm": 0.05989361998422495, + "language_loss": 0.86368543, + "learning_rate": 0.0009667045641283676, + "loss": 0.8747943, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.33764648, + "step": 745, + "time_per_iteration": 2.705873489379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105291, + "balance_loss_mlp": 1.07246089, + "epoch": 0.14351673720661792, + "flos": 738045665280.0, + "grad_norm": 0.07442691981713179, + "language_loss": 0.94493437, + "learning_rate": 0.0009665926878357092, + "loss": 0.95598727, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.32836914, + "step": 746, + "time_per_iteration": 2.941594362258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112013, + "balance_loss_mlp": 1.07865858, + "epoch": 0.14370911889188148, + "flos": 548951731200.0, + "grad_norm": 0.0692560914525881, + "language_loss": 0.91247988, + "learning_rate": 0.0009664806303957043, + "loss": 0.92359996, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.33374023, + "step": 747, + "time_per_iteration": 2.70877742767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112762, + "balance_loss_mlp": 1.0790261, + "epoch": 0.14390150057714507, + "flos": 589973469696.0, + "grad_norm": 0.06347995643195156, + "language_loss": 0.87284487, + "learning_rate": 0.0009663683918518571, + "loss": 0.88397241, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.33764648, + "step": 748, + "time_per_iteration": 2.9173765182495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128804, + "balance_loss_mlp": 1.09583056, + "epoch": 0.14409388226240863, + "flos": 590773782528.0, + "grad_norm": 0.07165520049303264, + "language_loss": 0.85690349, + "learning_rate": 0.0009662559722477428, + "loss": 0.8681916, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.32983398, + "step": 749, + "time_per_iteration": 2.6703925132751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293618, + "balance_loss_mlp": 1.26653337, + "epoch": 0.1442862639476722, + "flos": 1510418479104.0, + "grad_norm": 0.05750783583060037, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77456594, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.27148438, + "step": 750, + "time_per_iteration": 5.001406192779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114826, + "balance_loss_mlp": 1.11492896, + "epoch": 0.14447864563293575, + "flos": 496493770752.0, + "grad_norm": 0.0903406164143912, + "language_loss": 0.88906193, + "learning_rate": 0.0009660305900333632, + "loss": 0.90054452, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.33349609, + "step": 751, + "time_per_iteration": 2.6897666454315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151429, + "balance_loss_mlp": 1.11859906, + "epoch": 0.1446710273181993, + "flos": 589400271360.0, + "grad_norm": 0.07731756572669998, + "language_loss": 0.82109559, + "learning_rate": 0.0009659176275105992, + "loss": 0.83260989, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.32836914, + "step": 752, + "time_per_iteration": 2.7144923210144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156541, + "balance_loss_mlp": 1.12294829, + "epoch": 0.14486340900346287, + "flos": 585521256960.0, + "grad_norm": 0.08104938710710845, + "language_loss": 0.8584373, + "learning_rate": 0.0009658044841025701, + "loss": 0.87000269, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.33618164, + "step": 753, + "time_per_iteration": 2.7651891708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134874, + "balance_loss_mlp": 1.10116172, + "epoch": 0.14505579068872643, + "flos": 504405978624.0, + "grad_norm": 0.06446620792536047, + "language_loss": 0.80912805, + "learning_rate": 0.0009656911598532021, + "loss": 0.82047671, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.33740234, + "step": 754, + "time_per_iteration": 2.6575491428375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113793, + "balance_loss_mlp": 1.10345459, + "epoch": 0.14524817237399, + "flos": 486566452224.0, + "grad_norm": 0.0617560649750725, + "language_loss": 0.89835, + "learning_rate": 0.0009655776548064917, + "loss": 0.90972924, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.3449707, + "step": 755, + "time_per_iteration": 2.684224843978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133332, + "balance_loss_mlp": 1.100263, + "epoch": 0.14544055405925355, + "flos": 727857888768.0, + "grad_norm": 0.0723196770544797, + "language_loss": 0.88265425, + "learning_rate": 0.0009654639690065054, + "loss": 0.89398754, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.33081055, + "step": 756, + "time_per_iteration": 2.8975589275360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133271, + "balance_loss_mlp": 1.10063124, + "epoch": 0.14563293574451713, + "flos": 593359271424.0, + "grad_norm": 0.0666179485403068, + "language_loss": 0.87639153, + "learning_rate": 0.00096535010249738, + "loss": 0.88772416, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.32641602, + "step": 757, + "time_per_iteration": 2.7852935791015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118669, + "balance_loss_mlp": 1.08555305, + "epoch": 0.1458253174297807, + "flos": 560192924160.0, + "grad_norm": 0.06671579144124269, + "language_loss": 0.82458985, + "learning_rate": 0.0009652360553233224, + "loss": 0.83577645, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.33129883, + "step": 758, + "time_per_iteration": 2.790372610092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231318, + "balance_loss_mlp": 1.20690441, + "epoch": 0.14601769911504425, + "flos": 1557025984512.0, + "grad_norm": 0.06334391267713868, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.75005066, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.24414062, + "step": 759, + "time_per_iteration": 4.9441094398498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113265, + "balance_loss_mlp": 1.08062565, + "epoch": 0.1462100808003078, + "flos": 865922628096.0, + "grad_norm": 0.06716213865762054, + "language_loss": 0.81441242, + "learning_rate": 0.0009650074191575883, + "loss": 0.82554507, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.32641602, + "step": 760, + "time_per_iteration": 3.2887775897979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109667, + "balance_loss_mlp": 1.07664585, + "epoch": 0.14640246248557137, + "flos": 522673288704.0, + "grad_norm": 0.06510043774355635, + "language_loss": 0.85560381, + "learning_rate": 0.0009648928302546766, + "loss": 0.86670047, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.33032227, + "step": 761, + "time_per_iteration": 2.6996572017669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095513, + "balance_loss_mlp": 1.06308818, + "epoch": 0.14659484417083493, + "flos": 1030121805312.0, + "grad_norm": 0.06592560206527708, + "language_loss": 0.85148716, + "learning_rate": 0.0009647780608643613, + "loss": 0.86244226, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.32421875, + "step": 762, + "time_per_iteration": 3.3860111236572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101334, + "balance_loss_mlp": 1.06843269, + "epoch": 0.1467872258560985, + "flos": 500426629632.0, + "grad_norm": 0.08422515931666542, + "language_loss": 0.87252343, + "learning_rate": 0.0009646631110312001, + "loss": 0.88353688, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.32910156, + "step": 763, + "time_per_iteration": 2.62996506690979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097579, + "balance_loss_mlp": 1.06455803, + "epoch": 0.14697960754136205, + "flos": 547514201088.0, + "grad_norm": 0.05843071383105212, + "language_loss": 0.88439989, + "learning_rate": 0.0009645479807998203, + "loss": 0.89537567, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.33032227, + "step": 764, + "time_per_iteration": 2.7762649059295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091998, + "balance_loss_mlp": 1.059955, + "epoch": 0.14717198922662564, + "flos": 517586678784.0, + "grad_norm": 0.06085607876830046, + "language_loss": 0.92027354, + "learning_rate": 0.0009644326702149196, + "loss": 0.93119353, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.3203125, + "step": 765, + "time_per_iteration": 2.7927489280700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093686, + "balance_loss_mlp": 1.0607841, + "epoch": 0.1473643709118892, + "flos": 731661147648.0, + "grad_norm": 0.07854715386493856, + "language_loss": 0.84577298, + "learning_rate": 0.0009643171793212653, + "loss": 0.85670984, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.32910156, + "step": 766, + "time_per_iteration": 3.1133480072021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092266, + "balance_loss_mlp": 1.05976951, + "epoch": 0.14755675259715276, + "flos": 620257554432.0, + "grad_norm": 0.102413583922894, + "language_loss": 0.89411926, + "learning_rate": 0.0009642015081636952, + "loss": 0.90504193, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.32495117, + "step": 767, + "time_per_iteration": 2.715090274810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098654, + "balance_loss_mlp": 1.06658697, + "epoch": 0.14774913428241632, + "flos": 451981513728.0, + "grad_norm": 0.07135930824346515, + "language_loss": 0.8782866, + "learning_rate": 0.0009640856567871166, + "loss": 0.88927317, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.32055664, + "step": 768, + "time_per_iteration": 2.550196409225464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105008, + "balance_loss_mlp": 1.07258272, + "epoch": 0.14794151596767988, + "flos": 836881196544.0, + "grad_norm": 0.05799185647214189, + "language_loss": 0.8870768, + "learning_rate": 0.0009639696252365072, + "loss": 0.8981269, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.32421875, + "step": 769, + "time_per_iteration": 3.0786449909210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100829, + "balance_loss_mlp": 1.06869006, + "epoch": 0.14813389765294344, + "flos": 685765204992.0, + "grad_norm": 0.05886019056348146, + "language_loss": 0.81861567, + "learning_rate": 0.0009638534135569144, + "loss": 0.82962394, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.32128906, + "step": 770, + "time_per_iteration": 2.9026055335998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109058, + "balance_loss_mlp": 1.07641852, + "epoch": 0.148326279338207, + "flos": 509625008640.0, + "grad_norm": 0.061687073411883335, + "language_loss": 0.89819336, + "learning_rate": 0.0009637370217934554, + "loss": 0.909284, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.32641602, + "step": 771, + "time_per_iteration": 2.651155471801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102766, + "balance_loss_mlp": 1.07062733, + "epoch": 0.14851866102347056, + "flos": 587869608960.0, + "grad_norm": 0.06890537390791286, + "language_loss": 0.82949096, + "learning_rate": 0.0009636204499913175, + "loss": 0.84051859, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.32128906, + "step": 772, + "time_per_iteration": 2.8484935760498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109887, + "balance_loss_mlp": 1.06713676, + "epoch": 0.14871104270873411, + "flos": 690722366976.0, + "grad_norm": 0.05724303399039588, + "language_loss": 0.88008785, + "learning_rate": 0.0009635036981957581, + "loss": 0.89107656, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.31713867, + "step": 773, + "time_per_iteration": 2.875896453857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098837, + "balance_loss_mlp": 1.06586373, + "epoch": 0.1489034243939977, + "flos": 654803205120.0, + "grad_norm": 0.06792329386178385, + "language_loss": 0.90737289, + "learning_rate": 0.0009633867664521043, + "loss": 0.91836131, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.32983398, + "step": 774, + "time_per_iteration": 2.8590240478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105117, + "balance_loss_mlp": 1.07202482, + "epoch": 0.14909580607926126, + "flos": 475595891712.0, + "grad_norm": 0.07543072164382301, + "language_loss": 0.86562771, + "learning_rate": 0.0009632696548057527, + "loss": 0.87667894, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.33105469, + "step": 775, + "time_per_iteration": 2.598287343978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103717, + "balance_loss_mlp": 1.07136405, + "epoch": 0.14928818776452482, + "flos": 610789953024.0, + "grad_norm": 0.06953515395492163, + "language_loss": 0.8490293, + "learning_rate": 0.0009631523633021704, + "loss": 0.86006653, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.32348633, + "step": 776, + "time_per_iteration": 2.842017650604248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097387, + "balance_loss_mlp": 1.0640794, + "epoch": 0.14948056944978838, + "flos": 561487859712.0, + "grad_norm": 0.0785359858255581, + "language_loss": 0.87875742, + "learning_rate": 0.0009630348919868936, + "loss": 0.88973129, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.33325195, + "step": 777, + "time_per_iteration": 2.693345308303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096515, + "balance_loss_mlp": 1.06244552, + "epoch": 0.14967295113505194, + "flos": 448972623360.0, + "grad_norm": 0.0986803150049228, + "language_loss": 0.81203282, + "learning_rate": 0.0009629172409055293, + "loss": 0.82299805, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.34106445, + "step": 778, + "time_per_iteration": 2.50610613822937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100631, + "balance_loss_mlp": 1.06780052, + "epoch": 0.1498653328203155, + "flos": 571000541184.0, + "grad_norm": 0.06451123510709528, + "language_loss": 0.872877, + "learning_rate": 0.0009627994101037531, + "loss": 0.88388336, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.32836914, + "step": 779, + "time_per_iteration": 2.735919713973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093349, + "balance_loss_mlp": 1.06016171, + "epoch": 0.15005771450557906, + "flos": 630918194688.0, + "grad_norm": 0.06921626087658436, + "language_loss": 0.89007759, + "learning_rate": 0.0009626813996273114, + "loss": 0.90101105, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.33203125, + "step": 780, + "time_per_iteration": 2.8758528232574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089076, + "balance_loss_mlp": 1.05646062, + "epoch": 0.15025009619084262, + "flos": 577633780224.0, + "grad_norm": 0.07846674622794232, + "language_loss": 0.88800216, + "learning_rate": 0.0009625632095220198, + "loss": 0.89889288, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.32617188, + "step": 781, + "time_per_iteration": 2.822981357574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091834, + "balance_loss_mlp": 1.05874181, + "epoch": 0.1504424778761062, + "flos": 483646311936.0, + "grad_norm": 0.06496680151927305, + "language_loss": 0.86870086, + "learning_rate": 0.0009624448398337637, + "loss": 0.87961924, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.33105469, + "step": 782, + "time_per_iteration": 2.5370984077453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093814, + "balance_loss_mlp": 1.06022096, + "epoch": 0.15063485956136977, + "flos": 762167812608.0, + "grad_norm": 0.05765358341264215, + "language_loss": 0.89159006, + "learning_rate": 0.0009623262906084984, + "loss": 0.90252817, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.33618164, + "step": 783, + "time_per_iteration": 3.005157709121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099941, + "balance_loss_mlp": 1.06773031, + "epoch": 0.15082724124663333, + "flos": 497369687040.0, + "grad_norm": 0.06003141928684199, + "language_loss": 0.90186155, + "learning_rate": 0.0009622075618922486, + "loss": 0.91286093, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.32202148, + "step": 784, + "time_per_iteration": 2.660804510116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093154, + "balance_loss_mlp": 1.06142032, + "epoch": 0.15101962293189689, + "flos": 509476621824.0, + "grad_norm": 0.06057287359381707, + "language_loss": 0.86789852, + "learning_rate": 0.0009620886537311091, + "loss": 0.87883008, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.31713867, + "step": 785, + "time_per_iteration": 2.6273694038391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095369, + "balance_loss_mlp": 1.06210947, + "epoch": 0.15121200461716044, + "flos": 457520638464.0, + "grad_norm": 0.08138425523138582, + "language_loss": 0.84774673, + "learning_rate": 0.000961969566171244, + "loss": 0.85870039, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.33276367, + "step": 786, + "time_per_iteration": 2.5942111015319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095064, + "balance_loss_mlp": 1.06223416, + "epoch": 0.151404386302424, + "flos": 537729477120.0, + "grad_norm": 0.07863928657369654, + "language_loss": 0.90186292, + "learning_rate": 0.0009618502992588873, + "loss": 0.9128136, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.32836914, + "step": 787, + "time_per_iteration": 2.619929790496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091478, + "balance_loss_mlp": 1.05955386, + "epoch": 0.15159676798768756, + "flos": 687858891264.0, + "grad_norm": 0.0744293727729202, + "language_loss": 0.88114512, + "learning_rate": 0.0009617308530403424, + "loss": 0.89205992, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.3190918, + "step": 788, + "time_per_iteration": 2.9888041019439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093086, + "balance_loss_mlp": 1.0604943, + "epoch": 0.15178914967295112, + "flos": 545042193408.0, + "grad_norm": 0.06582928588586826, + "language_loss": 0.87262332, + "learning_rate": 0.0009616112275619825, + "loss": 0.8835541, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.32592773, + "step": 789, + "time_per_iteration": 2.7160654067993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099005, + "balance_loss_mlp": 1.0666275, + "epoch": 0.1519815313582147, + "flos": 511510671360.0, + "grad_norm": 0.05890477263154721, + "language_loss": 0.83453441, + "learning_rate": 0.0009614914228702503, + "loss": 0.84552449, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.32373047, + "step": 790, + "time_per_iteration": 2.67269229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106158, + "balance_loss_mlp": 1.07342279, + "epoch": 0.15217391304347827, + "flos": 683747122176.0, + "grad_norm": 0.05177473030839046, + "language_loss": 0.88909948, + "learning_rate": 0.0009613714390116581, + "loss": 0.90016103, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.32739258, + "step": 791, + "time_per_iteration": 2.978431224822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104946, + "balance_loss_mlp": 1.07304585, + "epoch": 0.15236629472874183, + "flos": 643873342464.0, + "grad_norm": 0.07017768347884551, + "language_loss": 0.8558737, + "learning_rate": 0.0009612512760327879, + "loss": 0.86692309, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.31884766, + "step": 792, + "time_per_iteration": 2.854128837585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108697, + "balance_loss_mlp": 1.07562804, + "epoch": 0.1525586764140054, + "flos": 412654791168.0, + "grad_norm": 0.06359759833531073, + "language_loss": 0.84205759, + "learning_rate": 0.0009611309339802909, + "loss": 0.85314453, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.33081055, + "step": 793, + "time_per_iteration": 2.46451997756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108319, + "balance_loss_mlp": 1.07510698, + "epoch": 0.15275105809926895, + "flos": 802444644864.0, + "grad_norm": 0.051071876240168755, + "language_loss": 0.84049302, + "learning_rate": 0.0009610104129008881, + "loss": 0.85157621, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.33227539, + "step": 794, + "time_per_iteration": 3.111494541168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011005, + "balance_loss_mlp": 1.06836164, + "epoch": 0.1529434397845325, + "flos": 612143115264.0, + "grad_norm": 0.06279651541206067, + "language_loss": 0.88408649, + "learning_rate": 0.0009608897128413701, + "loss": 0.89509147, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.32128906, + "step": 795, + "time_per_iteration": 2.7248153686523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103807, + "balance_loss_mlp": 1.07121563, + "epoch": 0.15313582146979607, + "flos": 614941009920.0, + "grad_norm": 0.04889604688954522, + "language_loss": 0.85449052, + "learning_rate": 0.0009607688338485965, + "loss": 0.86552852, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.32592773, + "step": 796, + "time_per_iteration": 2.8646762371063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100645, + "balance_loss_mlp": 1.06731439, + "epoch": 0.15332820315505963, + "flos": 793256440320.0, + "grad_norm": 0.057433682914461805, + "language_loss": 0.90353924, + "learning_rate": 0.0009606477759694969, + "loss": 0.91454566, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.33349609, + "step": 797, + "time_per_iteration": 3.0346486568450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108023, + "balance_loss_mlp": 1.0744772, + "epoch": 0.1535205848403232, + "flos": 549945510912.0, + "grad_norm": 0.08021572729531513, + "language_loss": 0.87206727, + "learning_rate": 0.0009605265392510703, + "loss": 0.88314748, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.33544922, + "step": 798, + "time_per_iteration": 2.6084530353546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097421, + "balance_loss_mlp": 1.065521, + "epoch": 0.15371296652558677, + "flos": 535691045376.0, + "grad_norm": 0.06650858832922667, + "language_loss": 0.91961598, + "learning_rate": 0.0009604051237403846, + "loss": 0.93059021, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.31884766, + "step": 799, + "time_per_iteration": 2.629930019378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111802, + "balance_loss_mlp": 1.07951975, + "epoch": 0.15390534821085033, + "flos": 395002939392.0, + "grad_norm": 0.12724142526344331, + "language_loss": 0.85673767, + "learning_rate": 0.0009602835294845776, + "loss": 0.86785567, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.32275391, + "step": 800, + "time_per_iteration": 2.4388976097106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116786, + "balance_loss_mlp": 1.08374119, + "epoch": 0.1540977298961139, + "flos": 535587738624.0, + "grad_norm": 0.06962057985754792, + "language_loss": 0.9036696, + "learning_rate": 0.0009601617565308565, + "loss": 0.91483742, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.33056641, + "step": 801, + "time_per_iteration": 2.6220815181732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112115, + "balance_loss_mlp": 1.08829629, + "epoch": 0.15429011158137745, + "flos": 723388147200.0, + "grad_norm": 0.07662224573984003, + "language_loss": 0.86584908, + "learning_rate": 0.0009600398049264977, + "loss": 0.87706065, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.32861328, + "step": 802, + "time_per_iteration": 2.9767894744873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122446, + "balance_loss_mlp": 1.08870947, + "epoch": 0.154482493266641, + "flos": 620209502208.0, + "grad_norm": 0.07007784052810237, + "language_loss": 0.91261709, + "learning_rate": 0.0009599176747188469, + "loss": 0.9238416, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.33764648, + "step": 803, + "time_per_iteration": 2.8329989910125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105402, + "balance_loss_mlp": 1.07242846, + "epoch": 0.15467487495190457, + "flos": 525351909888.0, + "grad_norm": 0.06284855896117353, + "language_loss": 0.82565022, + "learning_rate": 0.0009597953659553196, + "loss": 0.83670425, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.32983398, + "step": 804, + "time_per_iteration": 2.6918182373046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101021, + "balance_loss_mlp": 1.06814265, + "epoch": 0.15486725663716813, + "flos": 527473299456.0, + "grad_norm": 0.06479523616705579, + "language_loss": 0.88566583, + "learning_rate": 0.0009596728786833997, + "loss": 0.89667606, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.32885742, + "step": 805, + "time_per_iteration": 2.609287977218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110234, + "balance_loss_mlp": 1.06829393, + "epoch": 0.1550596383224317, + "flos": 1048118482944.0, + "grad_norm": 0.07111390229237131, + "language_loss": 0.89488924, + "learning_rate": 0.0009595502129506415, + "loss": 0.90591264, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.34082031, + "step": 806, + "time_per_iteration": 3.403404951095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096653, + "balance_loss_mlp": 1.0634892, + "epoch": 0.15525202000769528, + "flos": 613438050816.0, + "grad_norm": 0.08216570532607727, + "language_loss": 0.82236785, + "learning_rate": 0.0009594273688046678, + "loss": 0.83333433, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.33178711, + "step": 807, + "time_per_iteration": 2.7215962409973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093065, + "balance_loss_mlp": 1.05968678, + "epoch": 0.15544440169295884, + "flos": 532805810688.0, + "grad_norm": 0.06904253720821768, + "language_loss": 0.85279024, + "learning_rate": 0.000959304346293171, + "loss": 0.86372089, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.33398438, + "step": 808, + "time_per_iteration": 2.6801698207855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098465, + "balance_loss_mlp": 1.06661189, + "epoch": 0.1556367833782224, + "flos": 644433546240.0, + "grad_norm": 0.09111957868284204, + "language_loss": 0.87858826, + "learning_rate": 0.0009591811454639125, + "loss": 0.88957286, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.31835938, + "step": 809, + "time_per_iteration": 2.7565882205963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094758, + "balance_loss_mlp": 1.06187963, + "epoch": 0.15582916506348596, + "flos": 543540644352.0, + "grad_norm": 0.06649225570292959, + "language_loss": 0.87746191, + "learning_rate": 0.0009590577663647234, + "loss": 0.8884095, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.32885742, + "step": 810, + "time_per_iteration": 2.7346410751342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106143, + "balance_loss_mlp": 1.07233548, + "epoch": 0.15602154674874952, + "flos": 579740613120.0, + "grad_norm": 0.0619187082363415, + "language_loss": 0.85968214, + "learning_rate": 0.0009589342090435036, + "loss": 0.87074351, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.33837891, + "step": 811, + "time_per_iteration": 2.771869421005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114592, + "balance_loss_mlp": 1.08226287, + "epoch": 0.15621392843401308, + "flos": 534982454784.0, + "grad_norm": 0.07419416671079432, + "language_loss": 0.87060148, + "learning_rate": 0.0009588104735482223, + "loss": 0.88174742, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.32324219, + "step": 812, + "time_per_iteration": 2.6792666912078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122998, + "balance_loss_mlp": 1.09007227, + "epoch": 0.15640631011927664, + "flos": 550635162624.0, + "grad_norm": 0.08530784328603107, + "language_loss": 0.83981705, + "learning_rate": 0.0009586865599269177, + "loss": 0.85104704, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.3293457, + "step": 813, + "time_per_iteration": 2.6273813247680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122852, + "balance_loss_mlp": 1.09109521, + "epoch": 0.1565986918045402, + "flos": 637190641152.0, + "grad_norm": 0.09596754940168085, + "language_loss": 0.88191104, + "learning_rate": 0.0009585624682276977, + "loss": 0.8931396, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.31738281, + "step": 814, + "time_per_iteration": 2.7389183044433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114804, + "balance_loss_mlp": 1.08361948, + "epoch": 0.15679107348980378, + "flos": 490569122304.0, + "grad_norm": 0.07403121037751308, + "language_loss": 0.87196732, + "learning_rate": 0.0009584381984987386, + "loss": 0.88311541, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.31152344, + "step": 815, + "time_per_iteration": 2.588653802871704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118789, + "balance_loss_mlp": 1.0867933, + "epoch": 0.15698345517506734, + "flos": 529689231360.0, + "grad_norm": 0.05796420471157715, + "language_loss": 0.89563668, + "learning_rate": 0.0009583137507882864, + "loss": 0.90682459, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.31982422, + "step": 816, + "time_per_iteration": 2.6771223545074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120947, + "balance_loss_mlp": 1.08945227, + "epoch": 0.1571758368603309, + "flos": 545779897344.0, + "grad_norm": 0.06695321751464198, + "language_loss": 0.80875123, + "learning_rate": 0.000958189125144656, + "loss": 0.81996059, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.31469727, + "step": 817, + "time_per_iteration": 2.648407220840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142778, + "balance_loss_mlp": 1.11125922, + "epoch": 0.15736821854559446, + "flos": 565377048576.0, + "grad_norm": 0.07474790639920047, + "language_loss": 0.87800574, + "learning_rate": 0.0009580643216162313, + "loss": 0.8894335, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.31494141, + "step": 818, + "time_per_iteration": 2.663799285888672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140784, + "balance_loss_mlp": 1.10940814, + "epoch": 0.15756060023085802, + "flos": 500707436544.0, + "grad_norm": 0.10531827445817923, + "language_loss": 0.79636216, + "learning_rate": 0.0009579393402514652, + "loss": 0.80777001, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.31347656, + "step": 819, + "time_per_iteration": 2.5795977115631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128459, + "balance_loss_mlp": 1.09617746, + "epoch": 0.15775298191612158, + "flos": 519014034432.0, + "grad_norm": 0.06561760213255555, + "language_loss": 0.90222132, + "learning_rate": 0.0009578141810988801, + "loss": 0.91350597, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.32275391, + "step": 820, + "time_per_iteration": 2.6019015312194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120432, + "balance_loss_mlp": 1.08807814, + "epoch": 0.15794536360138514, + "flos": 465891153408.0, + "grad_norm": 0.07003821866302876, + "language_loss": 0.90498698, + "learning_rate": 0.0009576888442070668, + "loss": 0.91619134, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.32348633, + "step": 821, + "time_per_iteration": 2.5933666229248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109361, + "balance_loss_mlp": 1.07614923, + "epoch": 0.1581377452866487, + "flos": 516911583744.0, + "grad_norm": 0.06959801001512317, + "language_loss": 0.92461467, + "learning_rate": 0.0009575633296246854, + "loss": 0.93570817, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.33227539, + "step": 822, + "time_per_iteration": 2.584195375442505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104383, + "balance_loss_mlp": 1.07198191, + "epoch": 0.15833012697191226, + "flos": 549522109440.0, + "grad_norm": 0.0738821286657961, + "language_loss": 0.82797432, + "learning_rate": 0.0009574376374004652, + "loss": 0.83901811, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.32397461, + "step": 823, + "time_per_iteration": 2.6445696353912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099566, + "balance_loss_mlp": 1.0669024, + "epoch": 0.15852250865717585, + "flos": 487206641664.0, + "grad_norm": 0.07930768625104477, + "language_loss": 0.8015238, + "learning_rate": 0.000957311767583204, + "loss": 0.81251943, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.32666016, + "step": 824, + "time_per_iteration": 2.590190887451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284074, + "balance_loss_mlp": 1.26194882, + "epoch": 0.1587148903424394, + "flos": 1309041672192.0, + "grad_norm": 0.06857459467376774, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83355665, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.22167969, + "step": 825, + "time_per_iteration": 4.729644060134888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091191, + "balance_loss_mlp": 1.05766964, + "epoch": 0.15890727202770297, + "flos": 466634649600.0, + "grad_norm": 0.10530356830759573, + "language_loss": 0.91383988, + "learning_rate": 0.0009570594953650961, + "loss": 0.92475176, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.33544922, + "step": 826, + "time_per_iteration": 2.5222439765930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099421, + "balance_loss_mlp": 1.06580353, + "epoch": 0.15909965371296653, + "flos": 776733608448.0, + "grad_norm": 0.07312615216486826, + "language_loss": 0.80215907, + "learning_rate": 0.00095693309306219, + "loss": 0.81315327, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.33642578, + "step": 827, + "time_per_iteration": 3.104602098464966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091547, + "balance_loss_mlp": 1.0577873, + "epoch": 0.1592920353982301, + "flos": 1077852538368.0, + "grad_norm": 0.06629059991756085, + "language_loss": 0.87921345, + "learning_rate": 0.0009568065133621244, + "loss": 0.89012897, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.33789062, + "step": 828, + "time_per_iteration": 3.349937915802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088059, + "balance_loss_mlp": 1.05324984, + "epoch": 0.15948441708349365, + "flos": 725307305472.0, + "grad_norm": 0.06785059542129762, + "language_loss": 0.84638405, + "learning_rate": 0.0009566797563140422, + "loss": 0.85726464, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.34863281, + "step": 829, + "time_per_iteration": 2.883561849594116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096047, + "balance_loss_mlp": 1.06085658, + "epoch": 0.1596767987687572, + "flos": 578447087616.0, + "grad_norm": 0.06369088806732512, + "language_loss": 0.87693489, + "learning_rate": 0.0009565528219671547, + "loss": 0.88789535, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.35229492, + "step": 830, + "time_per_iteration": 2.929800510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098337, + "balance_loss_mlp": 1.06412435, + "epoch": 0.15986918045402077, + "flos": 528728947200.0, + "grad_norm": 0.06081537703934319, + "language_loss": 0.84958434, + "learning_rate": 0.0009564257103707418, + "loss": 0.86056769, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.3425293, + "step": 831, + "time_per_iteration": 2.631542444229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105374, + "balance_loss_mlp": 1.0715903, + "epoch": 0.16006156213928435, + "flos": 574313559552.0, + "grad_norm": 0.06950481232518824, + "language_loss": 0.91362834, + "learning_rate": 0.0009562984215741533, + "loss": 0.92468208, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.33789062, + "step": 832, + "time_per_iteration": 2.669194459915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093997, + "balance_loss_mlp": 1.05973649, + "epoch": 0.1602539438245479, + "flos": 515258675712.0, + "grad_norm": 0.06093058452920847, + "language_loss": 0.82276815, + "learning_rate": 0.0009561709556268065, + "loss": 0.83370817, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.34301758, + "step": 833, + "time_per_iteration": 2.747171401977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096504, + "balance_loss_mlp": 1.06298196, + "epoch": 0.16044632550981147, + "flos": 620730418176.0, + "grad_norm": 0.09598386402958035, + "language_loss": 0.93858409, + "learning_rate": 0.0009560433125781884, + "loss": 0.9495492, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.33544922, + "step": 834, + "time_per_iteration": 2.7381722927093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090718, + "balance_loss_mlp": 1.05645716, + "epoch": 0.16063870719507503, + "flos": 560817146880.0, + "grad_norm": 0.06748577773497036, + "language_loss": 0.92278147, + "learning_rate": 0.0009559154924778544, + "loss": 0.93368864, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.34301758, + "step": 835, + "time_per_iteration": 2.7790255546569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079826, + "balance_loss_mlp": 1.04625726, + "epoch": 0.1608310888803386, + "flos": 804778440192.0, + "grad_norm": 0.07378429569225692, + "language_loss": 0.85029173, + "learning_rate": 0.0009557874953754284, + "loss": 0.86109, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.33569336, + "step": 836, + "time_per_iteration": 3.0223195552825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082807, + "balance_loss_mlp": 1.04883218, + "epoch": 0.16102347056560215, + "flos": 600311195136.0, + "grad_norm": 0.08025480036652383, + "language_loss": 0.83386606, + "learning_rate": 0.0009556593213206038, + "loss": 0.84469414, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.34008789, + "step": 837, + "time_per_iteration": 2.7436904907226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086966, + "balance_loss_mlp": 1.05318165, + "epoch": 0.1612158522508657, + "flos": 553235208192.0, + "grad_norm": 0.0690426934286745, + "language_loss": 0.87355983, + "learning_rate": 0.0009555309703631414, + "loss": 0.88442945, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.33813477, + "step": 838, + "time_per_iteration": 2.6828954219818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097306, + "balance_loss_mlp": 1.06364167, + "epoch": 0.16140823393612927, + "flos": 555701423616.0, + "grad_norm": 0.07092577785176474, + "language_loss": 0.87526888, + "learning_rate": 0.0009554024425528722, + "loss": 0.88624191, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.33691406, + "step": 839, + "time_per_iteration": 2.6739652156829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110874, + "balance_loss_mlp": 1.07797241, + "epoch": 0.16160061562139286, + "flos": 543613427712.0, + "grad_norm": 0.09046955561085915, + "language_loss": 0.88719451, + "learning_rate": 0.0009552737379396948, + "loss": 0.89830327, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.32910156, + "step": 840, + "time_per_iteration": 2.63122820854187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110692, + "balance_loss_mlp": 1.07757533, + "epoch": 0.16179299730665642, + "flos": 603590717952.0, + "grad_norm": 0.06735134703819705, + "language_loss": 0.88063818, + "learning_rate": 0.0009551448565735767, + "loss": 0.89174509, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.33129883, + "step": 841, + "time_per_iteration": 2.741941452026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121097, + "balance_loss_mlp": 1.08790874, + "epoch": 0.16198537899191998, + "flos": 786821050368.0, + "grad_norm": 0.06426805463858033, + "language_loss": 0.84472924, + "learning_rate": 0.0009550157985045543, + "loss": 0.85594022, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.33203125, + "step": 842, + "time_per_iteration": 3.045841693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102839, + "balance_loss_mlp": 1.07041371, + "epoch": 0.16217776067718354, + "flos": 519550917120.0, + "grad_norm": 0.06545460719380305, + "language_loss": 0.89229876, + "learning_rate": 0.0009548865637827321, + "loss": 0.90332717, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.32421875, + "step": 843, + "time_per_iteration": 2.6820054054260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100372, + "balance_loss_mlp": 1.06701708, + "epoch": 0.1623701423624471, + "flos": 505015644672.0, + "grad_norm": 0.09211303705947127, + "language_loss": 0.89927554, + "learning_rate": 0.0009547571524582838, + "loss": 0.91027921, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.33374023, + "step": 844, + "time_per_iteration": 2.592280149459839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097994, + "balance_loss_mlp": 1.06587958, + "epoch": 0.16256252404771065, + "flos": 496940493312.0, + "grad_norm": 0.07125004392928289, + "language_loss": 0.91891497, + "learning_rate": 0.0009546275645814512, + "loss": 0.92989492, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.32104492, + "step": 845, + "time_per_iteration": 2.6273765563964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097571, + "balance_loss_mlp": 1.06531262, + "epoch": 0.16275490573297421, + "flos": 502110061056.0, + "grad_norm": 0.07293740056217544, + "language_loss": 0.89635444, + "learning_rate": 0.0009544978002025446, + "loss": 0.90733016, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.32250977, + "step": 846, + "time_per_iteration": 2.5906271934509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091646, + "balance_loss_mlp": 1.05821955, + "epoch": 0.16294728741823777, + "flos": 506952179712.0, + "grad_norm": 0.052168896342380144, + "language_loss": 0.86807543, + "learning_rate": 0.0009543678593719434, + "loss": 0.8789919, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.33447266, + "step": 847, + "time_per_iteration": 2.747469186782837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098148, + "balance_loss_mlp": 1.06510353, + "epoch": 0.16313966910350133, + "flos": 509418395136.0, + "grad_norm": 0.05056297173362441, + "language_loss": 0.87167078, + "learning_rate": 0.0009542377421400945, + "loss": 0.88265228, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.33056641, + "step": 848, + "time_per_iteration": 2.7777974605560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102364, + "balance_loss_mlp": 1.06950974, + "epoch": 0.16333205078876492, + "flos": 543712352256.0, + "grad_norm": 0.06627324615029867, + "language_loss": 0.83542728, + "learning_rate": 0.0009541074485575145, + "loss": 0.84645092, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.32861328, + "step": 849, + "time_per_iteration": 2.7575085163116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105099, + "balance_loss_mlp": 1.07288873, + "epoch": 0.16352443247402848, + "flos": 507477477888.0, + "grad_norm": 0.05751037996071174, + "language_loss": 0.9190414, + "learning_rate": 0.0009539769786747874, + "loss": 0.93009233, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.32202148, + "step": 850, + "time_per_iteration": 2.6389074325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109592, + "balance_loss_mlp": 1.06261301, + "epoch": 0.16371681415929204, + "flos": 541851420672.0, + "grad_norm": 0.07235435681682932, + "language_loss": 0.81106341, + "learning_rate": 0.0009538463325425665, + "loss": 0.82202262, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.33325195, + "step": 851, + "time_per_iteration": 2.7013468742370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109998, + "balance_loss_mlp": 1.06695926, + "epoch": 0.1639091958445556, + "flos": 520501026816.0, + "grad_norm": 0.07286475265539226, + "language_loss": 0.86075503, + "learning_rate": 0.0009537155102115728, + "loss": 0.87175477, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.33032227, + "step": 852, + "time_per_iteration": 2.5927765369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089138, + "balance_loss_mlp": 1.05668926, + "epoch": 0.16410157752981916, + "flos": 547149026304.0, + "grad_norm": 0.07079739805294577, + "language_loss": 0.83340597, + "learning_rate": 0.0009535845117325961, + "loss": 0.84429741, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.32446289, + "step": 853, + "time_per_iteration": 2.6400251388549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090734, + "balance_loss_mlp": 1.05780828, + "epoch": 0.16429395921508272, + "flos": 582561828864.0, + "grad_norm": 0.055390341552487656, + "language_loss": 0.93137228, + "learning_rate": 0.0009534533371564946, + "loss": 0.9422797, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.3293457, + "step": 854, + "time_per_iteration": 2.794569492340088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097604, + "balance_loss_mlp": 1.06424975, + "epoch": 0.16448634090034628, + "flos": 530678628864.0, + "grad_norm": 0.07789269087805807, + "language_loss": 0.88390946, + "learning_rate": 0.0009533219865341949, + "loss": 0.89488548, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.33374023, + "step": 855, + "time_per_iteration": 2.5882935523986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110169, + "balance_loss_mlp": 1.07721937, + "epoch": 0.16467872258560984, + "flos": 491623948800.0, + "grad_norm": 0.07176827599451206, + "language_loss": 0.85993397, + "learning_rate": 0.0009531904599166916, + "loss": 0.87103564, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.32958984, + "step": 856, + "time_per_iteration": 2.6384060382843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108589, + "balance_loss_mlp": 1.07585454, + "epoch": 0.16487110427087343, + "flos": 506015216640.0, + "grad_norm": 0.08966352124388614, + "language_loss": 0.84823519, + "learning_rate": 0.0009530587573550478, + "loss": 0.85932112, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.32739258, + "step": 857, + "time_per_iteration": 2.6009740829467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139199, + "balance_loss_mlp": 1.11554801, + "epoch": 0.16506348595613698, + "flos": 1432006553088.0, + "grad_norm": 0.0480168233011906, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75458586, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.23632812, + "step": 858, + "time_per_iteration": 5.006503105163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108022, + "balance_loss_mlp": 1.07712269, + "epoch": 0.16525586764140054, + "flos": 476890827264.0, + "grad_norm": 0.08332018813054971, + "language_loss": 0.89907712, + "learning_rate": 0.0009527948246039337, + "loss": 0.91015732, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.30859375, + "step": 859, + "time_per_iteration": 2.5502097606658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113676, + "balance_loss_mlp": 1.08313441, + "epoch": 0.1654482493266641, + "flos": 880737297408.0, + "grad_norm": 0.06488618871597049, + "language_loss": 0.87213862, + "learning_rate": 0.000952662594516931, + "loss": 0.88327539, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.30493164, + "step": 860, + "time_per_iteration": 3.091632604598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112348, + "balance_loss_mlp": 1.08059049, + "epoch": 0.16564063101192766, + "flos": 626527028736.0, + "grad_norm": 0.18119016536128274, + "language_loss": 0.86193782, + "learning_rate": 0.0009525301886907234, + "loss": 0.8730613, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.31738281, + "step": 861, + "time_per_iteration": 2.8586955070495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115106, + "balance_loss_mlp": 1.08372974, + "epoch": 0.16583301269719122, + "flos": 561250722816.0, + "grad_norm": 0.06494583254435107, + "language_loss": 0.87565315, + "learning_rate": 0.0009523976071767155, + "loss": 0.88680422, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.31347656, + "step": 862, + "time_per_iteration": 2.6474006175994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113562, + "balance_loss_mlp": 1.08228135, + "epoch": 0.16602539438245478, + "flos": 567510022656.0, + "grad_norm": 0.05844730537287504, + "language_loss": 0.87850058, + "learning_rate": 0.00095226485002638, + "loss": 0.88963622, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.3125, + "step": 863, + "time_per_iteration": 2.7738211154937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101227, + "balance_loss_mlp": 1.06894565, + "epoch": 0.16621777606771834, + "flos": 574589984256.0, + "grad_norm": 0.05720313452307963, + "language_loss": 0.88969022, + "learning_rate": 0.0009521319172912576, + "loss": 0.90070248, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.32275391, + "step": 864, + "time_per_iteration": 2.762932538986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108698, + "balance_loss_mlp": 1.07624936, + "epoch": 0.16641015775298193, + "flos": 514292599296.0, + "grad_norm": 0.0631928299213439, + "language_loss": 0.94547617, + "learning_rate": 0.0009519988090229579, + "loss": 0.95656317, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.32446289, + "step": 865, + "time_per_iteration": 2.672088384628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105777, + "balance_loss_mlp": 1.07332826, + "epoch": 0.1666025394382455, + "flos": 621395338752.0, + "grad_norm": 0.06928181027356142, + "language_loss": 0.87572587, + "learning_rate": 0.0009518655252731576, + "loss": 0.8867836, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.32446289, + "step": 866, + "time_per_iteration": 2.754418134689331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104109, + "balance_loss_mlp": 1.07049167, + "epoch": 0.16679492112350905, + "flos": 548528329728.0, + "grad_norm": 0.059497633162238536, + "language_loss": 0.90014684, + "learning_rate": 0.0009517320660936022, + "loss": 0.91118789, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.33642578, + "step": 867, + "time_per_iteration": 2.73579478263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103888, + "balance_loss_mlp": 1.07117677, + "epoch": 0.1669873028087726, + "flos": 665379477504.0, + "grad_norm": 0.06138762269806642, + "language_loss": 0.82812411, + "learning_rate": 0.0009515984315361051, + "loss": 0.83916301, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.32714844, + "step": 868, + "time_per_iteration": 2.7929019927978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102635, + "balance_loss_mlp": 1.07016206, + "epoch": 0.16717968449403617, + "flos": 538305647616.0, + "grad_norm": 0.07711570113555911, + "language_loss": 0.8657794, + "learning_rate": 0.000951464621652548, + "loss": 0.87680572, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.32470703, + "step": 869, + "time_per_iteration": 2.6135518550872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105976, + "balance_loss_mlp": 1.07381344, + "epoch": 0.16737206617929973, + "flos": 529833235968.0, + "grad_norm": 0.07032317085354448, + "language_loss": 0.78791183, + "learning_rate": 0.0009513306364948804, + "loss": 0.79897159, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.3215332, + "step": 870, + "time_per_iteration": 2.7745420932769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101249, + "balance_loss_mlp": 1.06949186, + "epoch": 0.1675644478645633, + "flos": 480529732608.0, + "grad_norm": 0.0706094790942469, + "language_loss": 0.88557035, + "learning_rate": 0.0009511964761151197, + "loss": 0.89658284, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.31738281, + "step": 871, + "time_per_iteration": 2.570082902908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112728, + "balance_loss_mlp": 1.08147156, + "epoch": 0.16775682954982685, + "flos": 494311334400.0, + "grad_norm": 0.06741449701936619, + "language_loss": 0.90011156, + "learning_rate": 0.0009510621405653521, + "loss": 0.91123885, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.31225586, + "step": 872, + "time_per_iteration": 2.5378525257110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098432, + "balance_loss_mlp": 1.06860542, + "epoch": 0.1679492112350904, + "flos": 751694846976.0, + "grad_norm": 0.07031527693840728, + "language_loss": 0.8401826, + "learning_rate": 0.0009509276298977309, + "loss": 0.85116696, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.29760742, + "step": 873, + "time_per_iteration": 2.9614696502685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102853, + "balance_loss_mlp": 1.07121444, + "epoch": 0.168141592920354, + "flos": 1135413075456.0, + "grad_norm": 0.07037881289732177, + "language_loss": 0.8146044, + "learning_rate": 0.0009507929441644778, + "loss": 0.82563293, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.31616211, + "step": 874, + "time_per_iteration": 3.5029537677764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105025, + "balance_loss_mlp": 1.07403064, + "epoch": 0.16833397460561755, + "flos": 632114205696.0, + "grad_norm": 0.07204378854359271, + "language_loss": 0.8568964, + "learning_rate": 0.0009506580834178826, + "loss": 0.86794662, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.30957031, + "step": 875, + "time_per_iteration": 2.738445281982422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105989, + "balance_loss_mlp": 1.07420754, + "epoch": 0.1685263562908811, + "flos": 541171943424.0, + "grad_norm": 0.06279104396907492, + "language_loss": 0.91300583, + "learning_rate": 0.0009505230477103028, + "loss": 0.92406577, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.31762695, + "step": 876, + "time_per_iteration": 2.7304844856262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121558, + "balance_loss_mlp": 1.0900147, + "epoch": 0.16871873797614467, + "flos": 619036812288.0, + "grad_norm": 0.07749651336428325, + "language_loss": 0.81126654, + "learning_rate": 0.0009503878370941641, + "loss": 0.82248211, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.31518555, + "step": 877, + "time_per_iteration": 2.7332048416137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121651, + "balance_loss_mlp": 1.09063232, + "epoch": 0.16891111966140823, + "flos": 606067107840.0, + "grad_norm": 0.08158970109830238, + "language_loss": 0.88660848, + "learning_rate": 0.0009502524516219595, + "loss": 0.897825, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.30981445, + "step": 878, + "time_per_iteration": 2.810194730758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120277, + "balance_loss_mlp": 1.08942604, + "epoch": 0.1691035013466718, + "flos": 552058136064.0, + "grad_norm": 0.08439254905993104, + "language_loss": 0.89592326, + "learning_rate": 0.0009501168913462506, + "loss": 0.90712607, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.30810547, + "step": 879, + "time_per_iteration": 2.6550278663635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181395, + "balance_loss_mlp": 1.15822113, + "epoch": 0.16929588303193535, + "flos": 1475544121344.0, + "grad_norm": 0.05511344701971209, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80303323, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.23144531, + "step": 880, + "time_per_iteration": 4.798918962478638 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120202, + "balance_loss_mlp": 1.08894515, + "epoch": 0.1694882647171989, + "flos": 925850456064.0, + "grad_norm": 0.05479331137197536, + "language_loss": 0.85038209, + "learning_rate": 0.0009498452465949042, + "loss": 0.86158419, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.31225586, + "step": 881, + "time_per_iteration": 3.2795042991638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114024, + "balance_loss_mlp": 1.08295763, + "epoch": 0.1696806464024625, + "flos": 545829359616.0, + "grad_norm": 0.06005284109203957, + "language_loss": 0.91010857, + "learning_rate": 0.0009497091622247285, + "loss": 0.92124879, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.31030273, + "step": 882, + "time_per_iteration": 2.741497755050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114536, + "balance_loss_mlp": 1.0833751, + "epoch": 0.16987302808772606, + "flos": 528970466304.0, + "grad_norm": 0.08668021784836823, + "language_loss": 0.9325586, + "learning_rate": 0.0009495729032619723, + "loss": 0.94370389, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.3112793, + "step": 883, + "time_per_iteration": 2.6621923446655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102424, + "balance_loss_mlp": 1.07035685, + "epoch": 0.17006540977298962, + "flos": 754855096320.0, + "grad_norm": 0.06301404020698688, + "language_loss": 0.84119958, + "learning_rate": 0.0009494364697595354, + "loss": 0.85222387, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.32055664, + "step": 884, + "time_per_iteration": 2.8904953002929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102364, + "balance_loss_mlp": 1.07022548, + "epoch": 0.17025779145825318, + "flos": 558532813824.0, + "grad_norm": 0.06367673921209963, + "language_loss": 0.89062482, + "learning_rate": 0.0009492998617703867, + "loss": 0.9016484, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.32128906, + "step": 885, + "time_per_iteration": 2.65053129196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088714, + "balance_loss_mlp": 1.05779076, + "epoch": 0.17045017314351674, + "flos": 511963186176.0, + "grad_norm": 0.06771442044112419, + "language_loss": 0.87296236, + "learning_rate": 0.0009491630793475619, + "loss": 0.88384956, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.30908203, + "step": 886, + "time_per_iteration": 2.601238965988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095605, + "balance_loss_mlp": 1.06346607, + "epoch": 0.1706425548287803, + "flos": 508674898944.0, + "grad_norm": 0.064396115452368, + "language_loss": 0.85120332, + "learning_rate": 0.0009490261225441643, + "loss": 0.86215937, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.32128906, + "step": 887, + "time_per_iteration": 2.865694999694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089924, + "balance_loss_mlp": 1.05797613, + "epoch": 0.17083493651404386, + "flos": 717016776192.0, + "grad_norm": 0.06834327453619109, + "language_loss": 0.90091348, + "learning_rate": 0.0009488889914133656, + "loss": 0.91181278, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.31933594, + "step": 888, + "time_per_iteration": 3.0129144191741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092844, + "balance_loss_mlp": 1.06077635, + "epoch": 0.17102731819930742, + "flos": 558852908544.0, + "grad_norm": 0.06591248507341309, + "language_loss": 0.88667148, + "learning_rate": 0.0009487516860084047, + "loss": 0.89759994, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.32055664, + "step": 889, + "time_per_iteration": 2.738736867904663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087561, + "balance_loss_mlp": 1.05644727, + "epoch": 0.17121969988457098, + "flos": 494542679040.0, + "grad_norm": 0.07350534216298948, + "language_loss": 0.88845301, + "learning_rate": 0.0009486142063825884, + "loss": 0.89932865, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.31079102, + "step": 890, + "time_per_iteration": 2.5697011947631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117181, + "balance_loss_mlp": 1.15197396, + "epoch": 0.17141208156983456, + "flos": 1548088063488.0, + "grad_norm": 0.0550236747402086, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73598027, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.19824219, + "step": 891, + "time_per_iteration": 4.955617904663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092709, + "balance_loss_mlp": 1.06119013, + "epoch": 0.17160446325509812, + "flos": 619282713600.0, + "grad_norm": 0.06911805131577382, + "language_loss": 0.9061746, + "learning_rate": 0.0009483387246819542, + "loss": 0.91710162, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.31494141, + "step": 892, + "time_per_iteration": 2.725799798965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121181, + "balance_loss_mlp": 1.10153532, + "epoch": 0.17179684494036168, + "flos": 1381026972672.0, + "grad_norm": 0.032113973586073014, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83406758, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.19628906, + "step": 893, + "time_per_iteration": 4.664165735244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089705, + "balance_loss_mlp": 1.05813849, + "epoch": 0.17198922662562524, + "flos": 492386383872.0, + "grad_norm": 0.0574582553480054, + "language_loss": 0.89272118, + "learning_rate": 0.0009480625467392688, + "loss": 0.90361822, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.31542969, + "step": 894, + "time_per_iteration": 2.637554883956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109509, + "balance_loss_mlp": 1.08910024, + "epoch": 0.1721816083108888, + "flos": 1457529914880.0, + "grad_norm": 0.027611634873128267, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79104185, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.20410156, + "step": 895, + "time_per_iteration": 4.76848030090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088509, + "balance_loss_mlp": 1.05822968, + "epoch": 0.17237398999615236, + "flos": 527853030912.0, + "grad_norm": 0.05350045539937067, + "language_loss": 0.87532026, + "learning_rate": 0.0009477856729834196, + "loss": 0.88620532, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.30249023, + "step": 896, + "time_per_iteration": 2.7219061851501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093646, + "balance_loss_mlp": 1.06267512, + "epoch": 0.17256637168141592, + "flos": 603644562432.0, + "grad_norm": 0.06021872133739316, + "language_loss": 0.89942896, + "learning_rate": 0.0009476469753098809, + "loss": 0.9103654, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.30932617, + "step": 897, + "time_per_iteration": 2.6990017890930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109486, + "balance_loss_mlp": 1.06398487, + "epoch": 0.17275875336667948, + "flos": 509437334016.0, + "grad_norm": 0.072864012804074, + "language_loss": 0.86893761, + "learning_rate": 0.0009475081038443738, + "loss": 0.87988615, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.30834961, + "step": 898, + "time_per_iteration": 2.5972931385040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091914, + "balance_loss_mlp": 1.06030011, + "epoch": 0.17295113505194307, + "flos": 664951693824.0, + "grad_norm": 0.07073516416365672, + "language_loss": 0.85445154, + "learning_rate": 0.0009473690586408124, + "loss": 0.86537069, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.31591797, + "step": 899, + "time_per_iteration": 2.821336507797241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085642, + "balance_loss_mlp": 1.05421829, + "epoch": 0.17314351673720663, + "flos": 555125253120.0, + "grad_norm": 0.061416888012907525, + "language_loss": 0.86083823, + "learning_rate": 0.0009472298397531792, + "loss": 0.87169468, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.31396484, + "step": 900, + "time_per_iteration": 2.7345612049102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090461, + "balance_loss_mlp": 1.058918, + "epoch": 0.17333589842247019, + "flos": 503361326592.0, + "grad_norm": 0.060849230911096945, + "language_loss": 0.86217213, + "learning_rate": 0.0009470904472355235, + "loss": 0.87307668, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.31518555, + "step": 901, + "time_per_iteration": 2.637425661087036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089284, + "balance_loss_mlp": 1.05755067, + "epoch": 0.17352828010773375, + "flos": 555924003840.0, + "grad_norm": 0.07830588235472731, + "language_loss": 0.79847336, + "learning_rate": 0.0009469508811419626, + "loss": 0.80936623, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.31713867, + "step": 902, + "time_per_iteration": 2.70833683013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149006, + "balance_loss_mlp": 1.12678576, + "epoch": 0.1737206617929973, + "flos": 1553711556096.0, + "grad_norm": 0.05917050619752012, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72762835, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.22265625, + "step": 903, + "time_per_iteration": 4.776138782501221 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088102, + "balance_loss_mlp": 1.05562961, + "epoch": 0.17391304347826086, + "flos": 516390667776.0, + "grad_norm": 0.07262085456902109, + "language_loss": 0.83503735, + "learning_rate": 0.0009466712284439292, + "loss": 0.84591836, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.32470703, + "step": 904, + "time_per_iteration": 2.7372140884399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086515, + "balance_loss_mlp": 1.05385172, + "epoch": 0.17410542516352442, + "flos": 540773273088.0, + "grad_norm": 0.09192064511302059, + "language_loss": 0.88356638, + "learning_rate": 0.0009465311419480276, + "loss": 0.89443153, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.32666016, + "step": 905, + "time_per_iteration": 2.677032470703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109277, + "balance_loss_mlp": 1.06036901, + "epoch": 0.17429780684878798, + "flos": 623542869504.0, + "grad_norm": 0.07898220644020008, + "language_loss": 0.88434756, + "learning_rate": 0.0009463908820933622, + "loss": 0.89527524, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.32397461, + "step": 906, + "time_per_iteration": 2.8139841556549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097047, + "balance_loss_mlp": 1.06505144, + "epoch": 0.17449018853405157, + "flos": 575368386048.0, + "grad_norm": 0.0868003192310251, + "language_loss": 0.82122958, + "learning_rate": 0.0009462504489343868, + "loss": 0.83220005, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.31982422, + "step": 907, + "time_per_iteration": 2.8445968627929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103323, + "balance_loss_mlp": 1.07106495, + "epoch": 0.17468257021931513, + "flos": 533499844608.0, + "grad_norm": 0.09920963499058721, + "language_loss": 0.88653374, + "learning_rate": 0.0009461098425256222, + "loss": 0.89756691, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.32250977, + "step": 908, + "time_per_iteration": 2.5947494506835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109512, + "balance_loss_mlp": 1.07784963, + "epoch": 0.1748749519045787, + "flos": 540496848384.0, + "grad_norm": 0.09355765751058653, + "language_loss": 0.86340624, + "learning_rate": 0.0009459690629216567, + "loss": 0.87450135, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.31640625, + "step": 909, + "time_per_iteration": 2.621044874191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112453, + "balance_loss_mlp": 1.08155417, + "epoch": 0.17506733358984225, + "flos": 498373641216.0, + "grad_norm": 0.07034154505215827, + "language_loss": 0.8701601, + "learning_rate": 0.0009458281101771457, + "loss": 0.88128459, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.30859375, + "step": 910, + "time_per_iteration": 2.674091100692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115198, + "balance_loss_mlp": 1.08508539, + "epoch": 0.1752597152751058, + "flos": 622621873152.0, + "grad_norm": 0.09036058743894539, + "language_loss": 0.82642829, + "learning_rate": 0.0009456869843468122, + "loss": 0.83758032, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.30053711, + "step": 911, + "time_per_iteration": 2.830397605895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105257, + "balance_loss_mlp": 1.07378554, + "epoch": 0.17545209696036937, + "flos": 520717814784.0, + "grad_norm": 0.0879185530474863, + "language_loss": 0.78465313, + "learning_rate": 0.0009455456854854459, + "loss": 0.79570568, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.31445312, + "step": 912, + "time_per_iteration": 2.621293067932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102771, + "balance_loss_mlp": 1.07196748, + "epoch": 0.17564447864563293, + "flos": 461750270976.0, + "grad_norm": 0.0647038307980506, + "language_loss": 0.8401655, + "learning_rate": 0.0009454042136479039, + "loss": 0.85119313, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.30786133, + "step": 913, + "time_per_iteration": 2.5675978660583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095649, + "balance_loss_mlp": 1.0655843, + "epoch": 0.1758368603308965, + "flos": 480416251392.0, + "grad_norm": 0.06520052548040499, + "language_loss": 0.82717437, + "learning_rate": 0.0009452625688891103, + "loss": 0.83813089, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.30004883, + "step": 914, + "time_per_iteration": 2.5443828105926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156407, + "balance_loss_mlp": 1.13332844, + "epoch": 0.17602924201616005, + "flos": 1478160133632.0, + "grad_norm": 0.06121421634548094, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79891145, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.23046875, + "step": 915, + "time_per_iteration": 4.5826005935668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117447, + "balance_loss_mlp": 1.08642912, + "epoch": 0.17622162370142364, + "flos": 602010593280.0, + "grad_norm": 0.07309570223890104, + "language_loss": 0.93135887, + "learning_rate": 0.0009449787608278015, + "loss": 0.94253331, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.30981445, + "step": 916, + "time_per_iteration": 2.7787418365478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120355, + "balance_loss_mlp": 1.08926511, + "epoch": 0.1764140053866872, + "flos": 442473214464.0, + "grad_norm": 0.10226900865330964, + "language_loss": 0.92397296, + "learning_rate": 0.0009448365976354704, + "loss": 0.93517655, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.31054688, + "step": 917, + "time_per_iteration": 2.5531399250030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124705, + "balance_loss_mlp": 1.09247112, + "epoch": 0.17660638707195075, + "flos": 500362610688.0, + "grad_norm": 0.07454694115091837, + "language_loss": 0.89785659, + "learning_rate": 0.0009446942617422558, + "loss": 0.90910363, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.32226562, + "step": 918, + "time_per_iteration": 2.583489418029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123972, + "balance_loss_mlp": 1.09250093, + "epoch": 0.17679876875721431, + "flos": 538621360128.0, + "grad_norm": 0.06638545773718021, + "language_loss": 0.85658622, + "learning_rate": 0.0009445517532034176, + "loss": 0.86782598, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.31445312, + "step": 919, + "time_per_iteration": 2.6830263137817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122334, + "balance_loss_mlp": 1.09107733, + "epoch": 0.17699115044247787, + "flos": 497477376000.0, + "grad_norm": 0.09547651267352689, + "language_loss": 0.88907313, + "learning_rate": 0.0009444090720742824, + "loss": 0.90029645, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.31225586, + "step": 920, + "time_per_iteration": 2.5984437465667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123289, + "balance_loss_mlp": 1.09181738, + "epoch": 0.17718353212774143, + "flos": 662444780544.0, + "grad_norm": 0.10483808909193337, + "language_loss": 0.87128365, + "learning_rate": 0.0009442662184102439, + "loss": 0.8825165, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.31445312, + "step": 921, + "time_per_iteration": 2.772568941116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097725, + "balance_loss_mlp": 1.06737399, + "epoch": 0.177375913813005, + "flos": 582340658688.0, + "grad_norm": 0.057071439682559955, + "language_loss": 0.87210095, + "learning_rate": 0.000944123192266763, + "loss": 0.88307822, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.30297852, + "step": 922, + "time_per_iteration": 2.8091742992401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122306, + "balance_loss_mlp": 1.09004784, + "epoch": 0.17756829549826855, + "flos": 552285098496.0, + "grad_norm": 0.07267069192247201, + "language_loss": 0.83557594, + "learning_rate": 0.0009439799936993671, + "loss": 0.84679902, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.32250977, + "step": 923, + "time_per_iteration": 2.7226145267486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147891, + "balance_loss_mlp": 1.11494136, + "epoch": 0.17776067718353214, + "flos": 556060806144.0, + "grad_norm": 0.14883746036090706, + "language_loss": 0.88219315, + "learning_rate": 0.0009438366227636511, + "loss": 0.89367205, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.32958984, + "step": 924, + "time_per_iteration": 2.6409950256347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121758, + "balance_loss_mlp": 1.08950043, + "epoch": 0.1779530588687957, + "flos": 658161303552.0, + "grad_norm": 0.07347120708699749, + "language_loss": 0.85914218, + "learning_rate": 0.0009436930795152763, + "loss": 0.87035978, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.32250977, + "step": 925, + "time_per_iteration": 2.811595916748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107553, + "balance_loss_mlp": 1.07419825, + "epoch": 0.17814544055405926, + "flos": 644187644928.0, + "grad_norm": 0.07224950530739313, + "language_loss": 0.86246336, + "learning_rate": 0.0009435493640099713, + "loss": 0.87353885, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.33374023, + "step": 926, + "time_per_iteration": 2.775090456008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098588, + "balance_loss_mlp": 1.06513751, + "epoch": 0.17833782223932282, + "flos": 460672123392.0, + "grad_norm": 0.06608942550370576, + "language_loss": 0.83981788, + "learning_rate": 0.0009434054763035314, + "loss": 0.85080379, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.3347168, + "step": 927, + "time_per_iteration": 2.621683359146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089168, + "balance_loss_mlp": 1.05559874, + "epoch": 0.17853020392458638, + "flos": 759212766720.0, + "grad_norm": 0.06566794669431841, + "language_loss": 0.85671836, + "learning_rate": 0.0009432614164518185, + "loss": 0.86761004, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.33569336, + "step": 928, + "time_per_iteration": 3.011759042739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108639, + "balance_loss_mlp": 1.05320191, + "epoch": 0.17872258560984994, + "flos": 782320785408.0, + "grad_norm": 0.06622036101375141, + "language_loss": 0.84125841, + "learning_rate": 0.000943117184510762, + "loss": 0.85212231, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.33203125, + "step": 929, + "time_per_iteration": 2.9782960414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166041, + "balance_loss_mlp": 1.14010072, + "epoch": 0.1789149672951135, + "flos": 1459095482880.0, + "grad_norm": 0.044814265222739694, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79956007, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.25976562, + "step": 930, + "time_per_iteration": 5.011061906814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086461, + "balance_loss_mlp": 1.0529635, + "epoch": 0.17910734898037706, + "flos": 503598463488.0, + "grad_norm": 0.09835801245739735, + "language_loss": 0.88482547, + "learning_rate": 0.0009428282045846674, + "loss": 0.89569014, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.33520508, + "step": 931, + "time_per_iteration": 2.700901508331299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084257, + "balance_loss_mlp": 1.04899526, + "epoch": 0.17929973066564064, + "flos": 745895264256.0, + "grad_norm": 0.0790312068568768, + "language_loss": 0.88828444, + "learning_rate": 0.0009426834567118214, + "loss": 0.89912701, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.3527832, + "step": 932, + "time_per_iteration": 3.0847127437591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108852, + "balance_loss_mlp": 1.05557072, + "epoch": 0.1794921123509042, + "flos": 712875893760.0, + "grad_norm": 0.05851377965258845, + "language_loss": 0.80669105, + "learning_rate": 0.0009425385369740155, + "loss": 0.81757629, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.32958984, + "step": 933, + "time_per_iteration": 3.0405056476593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089624, + "balance_loss_mlp": 1.05517268, + "epoch": 0.17968449403616776, + "flos": 632838763008.0, + "grad_norm": 0.08098153489662575, + "language_loss": 0.86808264, + "learning_rate": 0.0009423934454275125, + "loss": 0.87897891, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.34472656, + "step": 934, + "time_per_iteration": 2.832589626312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090759, + "balance_loss_mlp": 1.05683184, + "epoch": 0.17987687572143132, + "flos": 536060602368.0, + "grad_norm": 0.0889712704970151, + "language_loss": 0.91607213, + "learning_rate": 0.0009422481821286418, + "loss": 0.92697972, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.33935547, + "step": 935, + "time_per_iteration": 2.739004611968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099753, + "balance_loss_mlp": 1.06589735, + "epoch": 0.18006925740669488, + "flos": 537818227200.0, + "grad_norm": 0.11621731552094582, + "language_loss": 0.87764728, + "learning_rate": 0.0009421027471337998, + "loss": 0.88864481, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.33886719, + "step": 936, + "time_per_iteration": 2.663978099822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093554, + "balance_loss_mlp": 1.06100953, + "epoch": 0.18026163909195844, + "flos": 539255757312.0, + "grad_norm": 0.08193839025260119, + "language_loss": 0.8197844, + "learning_rate": 0.0009419571404994493, + "loss": 0.83071995, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.32543945, + "step": 937, + "time_per_iteration": 2.680880308151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087323, + "balance_loss_mlp": 1.05427766, + "epoch": 0.180454020777222, + "flos": 500382959616.0, + "grad_norm": 0.08083617156557357, + "language_loss": 0.90250957, + "learning_rate": 0.00094181136228212, + "loss": 0.91338283, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.33056641, + "step": 938, + "time_per_iteration": 2.635734796524048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083703, + "balance_loss_mlp": 1.05134988, + "epoch": 0.18064640246248556, + "flos": 498689353728.0, + "grad_norm": 0.0738614516115471, + "language_loss": 0.85650909, + "learning_rate": 0.0009416654125384077, + "loss": 0.86734617, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.32348633, + "step": 939, + "time_per_iteration": 2.713120460510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092435, + "balance_loss_mlp": 1.06744874, + "epoch": 0.18083878414774912, + "flos": 1518572358144.0, + "grad_norm": 0.04310930319536216, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80864811, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.25, + "step": 940, + "time_per_iteration": 4.928712606430054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084719, + "balance_loss_mlp": 1.05372477, + "epoch": 0.1810311658330127, + "flos": 727006703616.0, + "grad_norm": 0.06379600043785322, + "language_loss": 0.83724225, + "learning_rate": 0.000941372998698552, + "loss": 0.84808946, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.30957031, + "step": 941, + "time_per_iteration": 2.9594616889953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092193, + "balance_loss_mlp": 1.0600785, + "epoch": 0.18122354751827627, + "flos": 564643726848.0, + "grad_norm": 0.07993905082854055, + "language_loss": 0.81844771, + "learning_rate": 0.0009412265347159336, + "loss": 0.82936954, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.32104492, + "step": 942, + "time_per_iteration": 2.705883741378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089966, + "balance_loss_mlp": 1.05847049, + "epoch": 0.18141592920353983, + "flos": 519024208896.0, + "grad_norm": 0.08204750484488939, + "language_loss": 0.84816301, + "learning_rate": 0.0009410798994339829, + "loss": 0.85906267, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.31469727, + "step": 943, + "time_per_iteration": 2.606898546218872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086355, + "balance_loss_mlp": 1.0538584, + "epoch": 0.1816083108888034, + "flos": 512219261952.0, + "grad_norm": 0.06564936273566103, + "language_loss": 0.88176167, + "learning_rate": 0.000940933092909628, + "loss": 0.89262521, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.32495117, + "step": 944, + "time_per_iteration": 2.568862199783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089679, + "balance_loss_mlp": 1.058375, + "epoch": 0.18180069257406695, + "flos": 492144864768.0, + "grad_norm": 0.06967818448900699, + "language_loss": 0.83546078, + "learning_rate": 0.0009407861151998649, + "loss": 0.84635758, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.31274414, + "step": 945, + "time_per_iteration": 2.5752463340759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108574, + "balance_loss_mlp": 1.05338621, + "epoch": 0.1819930742593305, + "flos": 569891870208.0, + "grad_norm": 0.07045774982796042, + "language_loss": 0.86168265, + "learning_rate": 0.0009406389663617552, + "loss": 0.87254012, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.32348633, + "step": 946, + "time_per_iteration": 2.6582529544830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084969, + "balance_loss_mlp": 1.05299747, + "epoch": 0.18218545594459407, + "flos": 605693168640.0, + "grad_norm": 0.08074656744529311, + "language_loss": 0.8540619, + "learning_rate": 0.000940491646452427, + "loss": 0.86491156, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.31958008, + "step": 947, + "time_per_iteration": 2.7117488384246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080922, + "balance_loss_mlp": 1.04914129, + "epoch": 0.18237783762985763, + "flos": 548419230720.0, + "grad_norm": 0.0614528539730692, + "language_loss": 0.90478814, + "learning_rate": 0.000940344155529075, + "loss": 0.91559744, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.31762695, + "step": 948, + "time_per_iteration": 2.675457239151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086239, + "balance_loss_mlp": 1.05472016, + "epoch": 0.1825702193151212, + "flos": 450509078016.0, + "grad_norm": 0.06480396750006864, + "language_loss": 0.8689037, + "learning_rate": 0.0009401964936489605, + "loss": 0.87976611, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.31494141, + "step": 949, + "time_per_iteration": 2.5517518520355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086362, + "balance_loss_mlp": 1.05446136, + "epoch": 0.18276260100038477, + "flos": 588962313216.0, + "grad_norm": 0.07386346522147075, + "language_loss": 0.84915626, + "learning_rate": 0.0009400486608694108, + "loss": 0.86001992, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.31884766, + "step": 950, + "time_per_iteration": 2.744371175765991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089338, + "balance_loss_mlp": 1.05769992, + "epoch": 0.18295498268564833, + "flos": 786988376064.0, + "grad_norm": 0.07193745080732644, + "language_loss": 0.86961377, + "learning_rate": 0.0009399006572478195, + "loss": 0.88050711, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.31616211, + "step": 951, + "time_per_iteration": 3.0956904888153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108625, + "balance_loss_mlp": 1.05427814, + "epoch": 0.1831473643709119, + "flos": 577878271488.0, + "grad_norm": 0.06892976413128309, + "language_loss": 0.90901303, + "learning_rate": 0.0009397524828416468, + "loss": 0.9198755, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.31958008, + "step": 952, + "time_per_iteration": 2.7130446434020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093503, + "balance_loss_mlp": 1.06179333, + "epoch": 0.18333974605617545, + "flos": 566622521856.0, + "grad_norm": 0.06752223069443862, + "language_loss": 0.96249408, + "learning_rate": 0.0009396041377084192, + "loss": 0.97342908, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.31689453, + "step": 953, + "time_per_iteration": 2.66972279548645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101927, + "balance_loss_mlp": 1.07043195, + "epoch": 0.183532127741439, + "flos": 526725421056.0, + "grad_norm": 0.07502219242723109, + "language_loss": 0.87290752, + "learning_rate": 0.0009394556219057295, + "loss": 0.88392681, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.31469727, + "step": 954, + "time_per_iteration": 2.659264326095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109814, + "balance_loss_mlp": 1.07810426, + "epoch": 0.18372450942670257, + "flos": 594259918848.0, + "grad_norm": 0.08651848853121004, + "language_loss": 0.8329587, + "learning_rate": 0.0009393069354912362, + "loss": 0.84405684, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.31689453, + "step": 955, + "time_per_iteration": 2.77437686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111165, + "balance_loss_mlp": 1.080266, + "epoch": 0.18391689111196613, + "flos": 644720145408.0, + "grad_norm": 0.07817657388257933, + "language_loss": 0.82119787, + "learning_rate": 0.0009391580785226649, + "loss": 0.83230954, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.30859375, + "step": 956, + "time_per_iteration": 2.867492437362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094598, + "balance_loss_mlp": 1.06903911, + "epoch": 0.18410927279722972, + "flos": 1456246563840.0, + "grad_norm": 0.05003344342080426, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.8043505, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.25585938, + "step": 957, + "time_per_iteration": 4.762399196624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108233, + "balance_loss_mlp": 1.07757246, + "epoch": 0.18430165448249328, + "flos": 658437728256.0, + "grad_norm": 0.06311489935861506, + "language_loss": 0.86409998, + "learning_rate": 0.0009388598531545196, + "loss": 0.87518233, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.30615234, + "step": 958, + "time_per_iteration": 2.8768551349639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102291, + "balance_loss_mlp": 1.07160664, + "epoch": 0.18449403616775684, + "flos": 517679811072.0, + "grad_norm": 0.07254101069499316, + "language_loss": 0.85046387, + "learning_rate": 0.000938710484870727, + "loss": 0.86148679, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.30639648, + "step": 959, + "time_per_iteration": 2.569592237472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123147, + "balance_loss_mlp": 1.09262919, + "epoch": 0.1846864178530204, + "flos": 552481537536.0, + "grad_norm": 0.07612110690317586, + "language_loss": 0.85695219, + "learning_rate": 0.0009385609462644189, + "loss": 0.86818361, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.3046875, + "step": 960, + "time_per_iteration": 2.6880924701690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127108, + "balance_loss_mlp": 1.09682918, + "epoch": 0.18487879953828396, + "flos": 465930441216.0, + "grad_norm": 0.08874671943740564, + "language_loss": 0.85532272, + "learning_rate": 0.0009384112373936514, + "loss": 0.86659384, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.30249023, + "step": 961, + "time_per_iteration": 2.6328110694885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117939, + "balance_loss_mlp": 1.08651531, + "epoch": 0.18507118122354752, + "flos": 648200489472.0, + "grad_norm": 0.0643111022382676, + "language_loss": 0.91187119, + "learning_rate": 0.0009382613583165467, + "loss": 0.92305064, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.31396484, + "step": 962, + "time_per_iteration": 2.7885348796844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116435, + "balance_loss_mlp": 1.08522642, + "epoch": 0.18526356290881107, + "flos": 626486330880.0, + "grad_norm": 0.08357757161984174, + "language_loss": 0.89136612, + "learning_rate": 0.0009381113090912928, + "loss": 0.90253055, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.31176758, + "step": 963, + "time_per_iteration": 2.7291858196258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109665, + "balance_loss_mlp": 1.07812214, + "epoch": 0.18545594459407463, + "flos": 432497843712.0, + "grad_norm": 0.08435952646587867, + "language_loss": 0.89444733, + "learning_rate": 0.000937961089776144, + "loss": 0.90554392, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.31518555, + "step": 964, + "time_per_iteration": 2.5736470222473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102378, + "balance_loss_mlp": 1.07090628, + "epoch": 0.1856483262793382, + "flos": 748720862208.0, + "grad_norm": 0.0989838613647617, + "language_loss": 0.82349026, + "learning_rate": 0.0009378107004294208, + "loss": 0.83451402, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.31445312, + "step": 965, + "time_per_iteration": 2.980569362640381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112627, + "balance_loss_mlp": 1.07982063, + "epoch": 0.18584070796460178, + "flos": 530058788352.0, + "grad_norm": 0.07592153009574268, + "language_loss": 0.91147316, + "learning_rate": 0.0009376601411095096, + "loss": 0.92259943, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.328125, + "step": 966, + "time_per_iteration": 2.6635591983795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136435, + "balance_loss_mlp": 1.10331881, + "epoch": 0.18603308964986534, + "flos": 482863527936.0, + "grad_norm": 0.16243248674453353, + "language_loss": 0.86357069, + "learning_rate": 0.0009375094118748622, + "loss": 0.87493503, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.33129883, + "step": 967, + "time_per_iteration": 2.522481679916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157231, + "balance_loss_mlp": 1.12368488, + "epoch": 0.1862254713351289, + "flos": 800976591360.0, + "grad_norm": 0.09362045292578998, + "language_loss": 0.90268016, + "learning_rate": 0.0009373585127839976, + "loss": 0.9142524, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.33544922, + "step": 968, + "time_per_iteration": 2.97210693359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152937, + "balance_loss_mlp": 1.1203692, + "epoch": 0.18641785302039246, + "flos": 478082456064.0, + "grad_norm": 0.0858654394488603, + "language_loss": 0.90605009, + "learning_rate": 0.0009372074438954994, + "loss": 0.91757941, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.32568359, + "step": 969, + "time_per_iteration": 2.541006088256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143226, + "balance_loss_mlp": 1.11030006, + "epoch": 0.18661023470565602, + "flos": 388695587328.0, + "grad_norm": 0.08996217866854661, + "language_loss": 0.91142356, + "learning_rate": 0.0009370562052680181, + "loss": 0.92285585, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.3293457, + "step": 970, + "time_per_iteration": 2.4985642433166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113885, + "balance_loss_mlp": 1.0805068, + "epoch": 0.18680261639091958, + "flos": 564402207744.0, + "grad_norm": 0.07707645065684006, + "language_loss": 0.88999593, + "learning_rate": 0.0009369047969602695, + "loss": 0.90113479, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.33398438, + "step": 971, + "time_per_iteration": 2.7079591751098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109357, + "balance_loss_mlp": 1.05985761, + "epoch": 0.18699499807618314, + "flos": 479018009088.0, + "grad_norm": 0.28998936625974164, + "language_loss": 0.86178541, + "learning_rate": 0.0009367532190310357, + "loss": 0.87272114, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.33740234, + "step": 972, + "time_per_iteration": 2.5647881031036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090596, + "balance_loss_mlp": 1.05535769, + "epoch": 0.1871873797614467, + "flos": 553022802432.0, + "grad_norm": 0.12045660132436305, + "language_loss": 0.89086068, + "learning_rate": 0.0009366014715391644, + "loss": 0.90176666, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.3527832, + "step": 973, + "time_per_iteration": 2.670271396636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098789, + "balance_loss_mlp": 1.06316936, + "epoch": 0.18737976144671029, + "flos": 552526617600.0, + "grad_norm": 0.06161121065256625, + "language_loss": 0.83607596, + "learning_rate": 0.0009364495545435693, + "loss": 0.84706378, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.35644531, + "step": 974, + "time_per_iteration": 2.7562968730926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115355, + "balance_loss_mlp": 1.08068919, + "epoch": 0.18757214313197385, + "flos": 502002372096.0, + "grad_norm": 0.0775906753320085, + "language_loss": 0.88572645, + "learning_rate": 0.0009362974681032297, + "loss": 0.89688003, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.34692383, + "step": 975, + "time_per_iteration": 2.618015766143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115715, + "balance_loss_mlp": 1.08102489, + "epoch": 0.1877645248172374, + "flos": 674691337728.0, + "grad_norm": 0.0743374582836454, + "language_loss": 0.87880743, + "learning_rate": 0.0009361452122771907, + "loss": 0.88996458, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.34716797, + "step": 976, + "time_per_iteration": 2.8973281383514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111997, + "balance_loss_mlp": 1.07754576, + "epoch": 0.18795690650250096, + "flos": 404771696640.0, + "grad_norm": 0.09294234225416288, + "language_loss": 0.83035111, + "learning_rate": 0.0009359927871245635, + "loss": 0.84147108, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.34472656, + "step": 977, + "time_per_iteration": 2.4868054389953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113361, + "balance_loss_mlp": 1.079983, + "epoch": 0.18814928818776452, + "flos": 637599485952.0, + "grad_norm": 0.08516170058225998, + "language_loss": 0.86584175, + "learning_rate": 0.0009358401927045246, + "loss": 0.87697542, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.33398438, + "step": 978, + "time_per_iteration": 2.8482747077941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110398, + "balance_loss_mlp": 1.07160234, + "epoch": 0.18834166987302808, + "flos": 1137825446400.0, + "grad_norm": 0.09204359799181126, + "language_loss": 0.88258326, + "learning_rate": 0.0009356874290763166, + "loss": 0.89362299, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.32373047, + "step": 979, + "time_per_iteration": 3.4733643531799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097908, + "balance_loss_mlp": 1.06529236, + "epoch": 0.18853405155829164, + "flos": 504538398720.0, + "grad_norm": 0.0915662715535259, + "language_loss": 0.88419032, + "learning_rate": 0.0009355344962992474, + "loss": 0.89516938, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.32617188, + "step": 980, + "time_per_iteration": 2.650907039642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098514, + "balance_loss_mlp": 1.06606519, + "epoch": 0.1887264332435552, + "flos": 607879987200.0, + "grad_norm": 0.13079327807375027, + "language_loss": 0.87520993, + "learning_rate": 0.0009353813944326908, + "loss": 0.88619506, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.32446289, + "step": 981, + "time_per_iteration": 2.937286138534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090956, + "balance_loss_mlp": 1.05845952, + "epoch": 0.1889188149288188, + "flos": 552264749568.0, + "grad_norm": 0.0755425770798311, + "language_loss": 0.82502437, + "learning_rate": 0.0009352281235360863, + "loss": 0.83593392, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.32495117, + "step": 982, + "time_per_iteration": 2.6979949474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096297, + "balance_loss_mlp": 1.06380093, + "epoch": 0.18911119661408235, + "flos": 418332128256.0, + "grad_norm": 0.0751009418062393, + "language_loss": 0.8470037, + "learning_rate": 0.0009350746836689389, + "loss": 0.85796672, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.32495117, + "step": 983, + "time_per_iteration": 2.538175582885742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131236, + "balance_loss_mlp": 1.10624993, + "epoch": 0.1893035782993459, + "flos": 1481141320704.0, + "grad_norm": 0.036870034223354546, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82570457, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.24902344, + "step": 984, + "time_per_iteration": 4.979044198989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097045, + "balance_loss_mlp": 1.0640955, + "epoch": 0.18949595998460947, + "flos": 508220974080.0, + "grad_norm": 0.0642225711410905, + "language_loss": 0.82250404, + "learning_rate": 0.0009347672972613634, + "loss": 0.83347452, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.3293457, + "step": 985, + "time_per_iteration": 2.593069553375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086897, + "balance_loss_mlp": 1.05339909, + "epoch": 0.18968834166987303, + "flos": 530812459008.0, + "grad_norm": 0.0802805585104316, + "language_loss": 0.85205728, + "learning_rate": 0.0009346133508402735, + "loss": 0.86292624, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.33520508, + "step": 986, + "time_per_iteration": 2.68485426902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095707, + "balance_loss_mlp": 1.06216192, + "epoch": 0.1898807233551366, + "flos": 499515807744.0, + "grad_norm": 0.09481546728284458, + "language_loss": 0.84014487, + "learning_rate": 0.0009344592356873166, + "loss": 0.85110188, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.33544922, + "step": 987, + "time_per_iteration": 2.6432511806488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105999, + "balance_loss_mlp": 1.07068968, + "epoch": 0.19007310504040015, + "flos": 601936399872.0, + "grad_norm": 0.06245857415063817, + "language_loss": 0.78166318, + "learning_rate": 0.0009343049518623255, + "loss": 0.79272318, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.35327148, + "step": 988, + "time_per_iteration": 2.7121620178222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120949, + "balance_loss_mlp": 1.085639, + "epoch": 0.1902654867256637, + "flos": 601374786048.0, + "grad_norm": 0.05952536728335112, + "language_loss": 0.83312774, + "learning_rate": 0.0009341504994251985, + "loss": 0.84433722, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.35327148, + "step": 989, + "time_per_iteration": 2.852208375930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107971, + "balance_loss_mlp": 1.05224383, + "epoch": 0.19045786841092727, + "flos": 1574925147648.0, + "grad_norm": 0.03692041129742979, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74600208, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.27539062, + "step": 990, + "time_per_iteration": 4.994582414627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137443, + "balance_loss_mlp": 1.09991539, + "epoch": 0.19065025009619085, + "flos": 681280906752.0, + "grad_norm": 0.056855766240422066, + "language_loss": 0.81516898, + "learning_rate": 0.0009338410889544574, + "loss": 0.82654339, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.37524414, + "step": 991, + "time_per_iteration": 3.017310380935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011468, + "balance_loss_mlp": 1.10831964, + "epoch": 0.1908426317814544, + "flos": 601971305472.0, + "grad_norm": 0.07195285392178245, + "language_loss": 0.87761319, + "learning_rate": 0.000933686131040967, + "loss": 0.88908118, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.38427734, + "step": 992, + "time_per_iteration": 2.8278160095214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144526, + "balance_loss_mlp": 1.10726154, + "epoch": 0.19103501346671797, + "flos": 586027616256.0, + "grad_norm": 0.07034922378143431, + "language_loss": 0.90235877, + "learning_rate": 0.0009335310047555883, + "loss": 0.91380405, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.37255859, + "step": 993, + "time_per_iteration": 2.8100597858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114172, + "balance_loss_mlp": 1.1050992, + "epoch": 0.19122739515198153, + "flos": 545494708224.0, + "grad_norm": 0.06860817272021875, + "language_loss": 0.88542485, + "learning_rate": 0.0009333757101585467, + "loss": 0.896842, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.36621094, + "step": 994, + "time_per_iteration": 2.6616106033325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131089, + "balance_loss_mlp": 1.0961132, + "epoch": 0.1914197768372451, + "flos": 521171739648.0, + "grad_norm": 0.0687364291234037, + "language_loss": 0.9324351, + "learning_rate": 0.0009332202473101329, + "loss": 0.94374597, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.3503418, + "step": 995, + "time_per_iteration": 2.7158730030059814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128597, + "balance_loss_mlp": 1.09400272, + "epoch": 0.19161215852250865, + "flos": 610961660928.0, + "grad_norm": 0.07471533178048465, + "language_loss": 0.82843316, + "learning_rate": 0.0009330646162707028, + "loss": 0.83971918, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.34619141, + "step": 996, + "time_per_iteration": 2.7293272018432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111164, + "balance_loss_mlp": 1.07730889, + "epoch": 0.1918045402077722, + "flos": 846281806848.0, + "grad_norm": 0.05994533952598048, + "language_loss": 0.84315574, + "learning_rate": 0.0009329088171006779, + "loss": 0.85426736, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.33886719, + "step": 997, + "time_per_iteration": 3.140655517578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110059, + "balance_loss_mlp": 1.07672858, + "epoch": 0.19199692189303577, + "flos": 465699096576.0, + "grad_norm": 0.06034276327327584, + "language_loss": 0.85438752, + "learning_rate": 0.0009327528498605446, + "loss": 0.86548805, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.33349609, + "step": 998, + "time_per_iteration": 2.5440673828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111119, + "balance_loss_mlp": 1.0778836, + "epoch": 0.19218930357829936, + "flos": 531318818304.0, + "grad_norm": 0.07596013514481052, + "language_loss": 0.89179873, + "learning_rate": 0.0009325967146108548, + "loss": 0.90290987, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.33251953, + "step": 999, + "time_per_iteration": 2.658561944961548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110413, + "balance_loss_mlp": 1.07856011, + "epoch": 0.19238168526356292, + "flos": 601350054912.0, + "grad_norm": 0.07750808981236326, + "language_loss": 0.8717553, + "learning_rate": 0.0009324404114122258, + "loss": 0.88285947, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.31835938, + "step": 1000, + "time_per_iteration": 2.7275264263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107767, + "balance_loss_mlp": 1.07667685, + "epoch": 0.19257406694882648, + "flos": 571690192896.0, + "grad_norm": 0.11937061799335263, + "language_loss": 0.86227536, + "learning_rate": 0.0009322839403253397, + "loss": 0.873353, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.31054688, + "step": 1001, + "time_per_iteration": 2.788405656814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110663, + "balance_loss_mlp": 1.0798831, + "epoch": 0.19276644863409004, + "flos": 801478568448.0, + "grad_norm": 0.07054171225662055, + "language_loss": 0.84055525, + "learning_rate": 0.0009321273014109439, + "loss": 0.85166192, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.30737305, + "step": 1002, + "time_per_iteration": 2.942535877227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110952, + "balance_loss_mlp": 1.0799818, + "epoch": 0.1929588303193536, + "flos": 563024314368.0, + "grad_norm": 0.057550289991663166, + "language_loss": 0.84200853, + "learning_rate": 0.0009319704947298513, + "loss": 0.85311806, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.30932617, + "step": 1003, + "time_per_iteration": 2.919499158859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110832, + "balance_loss_mlp": 1.07933664, + "epoch": 0.19315121200461716, + "flos": 626550349824.0, + "grad_norm": 0.07245253176429253, + "language_loss": 0.88662004, + "learning_rate": 0.0009318135203429393, + "loss": 0.89772838, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.31469727, + "step": 1004, + "time_per_iteration": 2.7168095111846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118727, + "balance_loss_mlp": 1.08770871, + "epoch": 0.19334359368988072, + "flos": 517169069568.0, + "grad_norm": 0.17670411464250102, + "language_loss": 0.8771624, + "learning_rate": 0.0009316563783111511, + "loss": 0.88834965, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.30981445, + "step": 1005, + "time_per_iteration": 2.7140395641326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116793, + "balance_loss_mlp": 1.08501196, + "epoch": 0.19353597537514428, + "flos": 693751606272.0, + "grad_norm": 0.08689807004334223, + "language_loss": 0.81857723, + "learning_rate": 0.0009314990686954943, + "loss": 0.82974517, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.31762695, + "step": 1006, + "time_per_iteration": 2.904555320739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106968, + "balance_loss_mlp": 1.07482958, + "epoch": 0.19372835706040784, + "flos": 1209665180160.0, + "grad_norm": 0.05703714693088015, + "language_loss": 0.80953801, + "learning_rate": 0.000931341591557042, + "loss": 0.82060766, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.32128906, + "step": 1007, + "time_per_iteration": 3.6937167644500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092779, + "balance_loss_mlp": 1.06078339, + "epoch": 0.19392073874567142, + "flos": 520368606720.0, + "grad_norm": 0.08309123344760973, + "language_loss": 0.87180555, + "learning_rate": 0.0009311839469569325, + "loss": 0.88273335, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.31982422, + "step": 1008, + "time_per_iteration": 2.6189959049224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099869, + "balance_loss_mlp": 1.06746829, + "epoch": 0.19411312043093498, + "flos": 588543293952.0, + "grad_norm": 0.10100018073420348, + "language_loss": 0.8730033, + "learning_rate": 0.0009310261349563687, + "loss": 0.88400197, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.32397461, + "step": 1009, + "time_per_iteration": 2.6890206336975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108302, + "balance_loss_mlp": 1.07516217, + "epoch": 0.19430550211619854, + "flos": 579085867008.0, + "grad_norm": 0.08933629042911205, + "language_loss": 0.85340321, + "learning_rate": 0.0009308681556166186, + "loss": 0.86448622, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.33154297, + "step": 1010, + "time_per_iteration": 2.824448585510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098531, + "balance_loss_mlp": 1.06546259, + "epoch": 0.1944978838014621, + "flos": 620848281600.0, + "grad_norm": 0.16096270434238172, + "language_loss": 0.87149101, + "learning_rate": 0.0009307100089990152, + "loss": 0.88247633, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.33081055, + "step": 1011, + "time_per_iteration": 2.74092173576355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105966, + "balance_loss_mlp": 1.07070398, + "epoch": 0.19469026548672566, + "flos": 598440089088.0, + "grad_norm": 0.08074644620093238, + "language_loss": 0.83646113, + "learning_rate": 0.0009305516951649568, + "loss": 0.84752083, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.3527832, + "step": 1012, + "time_per_iteration": 2.7069194316864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110286, + "balance_loss_mlp": 1.06726432, + "epoch": 0.19488264717198922, + "flos": 551890810368.0, + "grad_norm": 0.06954368088501534, + "language_loss": 0.86469871, + "learning_rate": 0.0009303932141759057, + "loss": 0.8757273, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.35595703, + "step": 1013, + "time_per_iteration": 2.7547597885131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109212, + "balance_loss_mlp": 1.07352042, + "epoch": 0.19507502885725278, + "flos": 665842166784.0, + "grad_norm": 0.08663105683367789, + "language_loss": 0.83731425, + "learning_rate": 0.0009302345660933902, + "loss": 0.84840637, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.35742188, + "step": 1014, + "time_per_iteration": 2.789421319961548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120247, + "balance_loss_mlp": 1.0850327, + "epoch": 0.19526741054251634, + "flos": 670771625472.0, + "grad_norm": 0.07248055996229082, + "language_loss": 0.85224003, + "learning_rate": 0.0009300757509790026, + "loss": 0.86344242, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.35229492, + "step": 1015, + "time_per_iteration": 2.8293235301971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138983, + "balance_loss_mlp": 1.10412574, + "epoch": 0.19545979222777993, + "flos": 446983653888.0, + "grad_norm": 0.08486300836715333, + "language_loss": 0.90133542, + "learning_rate": 0.0009299167688944005, + "loss": 0.91272521, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.34912109, + "step": 1016, + "time_per_iteration": 2.5042884349823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130604, + "balance_loss_mlp": 1.09453082, + "epoch": 0.1956521739130435, + "flos": 568813722624.0, + "grad_norm": 0.08182270058547457, + "language_loss": 0.86074531, + "learning_rate": 0.0009297576199013063, + "loss": 0.87205136, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.36108398, + "step": 1017, + "time_per_iteration": 2.678986072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01400492, + "balance_loss_mlp": 1.36921108, + "epoch": 0.19584455559830705, + "flos": 1454969157120.0, + "grad_norm": 0.11724614930420041, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74402618, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.3125, + "step": 1018, + "time_per_iteration": 4.915104627609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214569, + "balance_loss_mlp": 1.18538666, + "epoch": 0.1960369372835706, + "flos": 1590320369664.0, + "grad_norm": 0.08011150215373515, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.8064087, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.29101562, + "step": 1019, + "time_per_iteration": 5.440853834152222 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100562, + "balance_loss_mlp": 1.06565762, + "epoch": 0.19622931896883417, + "flos": 615709237248.0, + "grad_norm": 0.05949147024105531, + "language_loss": 0.86637676, + "learning_rate": 0.0009292791720892659, + "loss": 0.8773824, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.34960938, + "step": 1020, + "time_per_iteration": 2.8909873962402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110075, + "balance_loss_mlp": 1.06534433, + "epoch": 0.19642170065409773, + "flos": 465950790144.0, + "grad_norm": 0.08017401986968183, + "language_loss": 0.8851831, + "learning_rate": 0.0009291193560807218, + "loss": 0.89619064, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.35424805, + "step": 1021, + "time_per_iteration": 2.5876846313476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108676, + "balance_loss_mlp": 1.07329464, + "epoch": 0.19661408233936128, + "flos": 515040477696.0, + "grad_norm": 0.061421548763730266, + "language_loss": 0.86832839, + "learning_rate": 0.0009289593734732688, + "loss": 0.87941515, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.35400391, + "step": 1022, + "time_per_iteration": 2.5790083408355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116018, + "balance_loss_mlp": 1.08097017, + "epoch": 0.19680646402462484, + "flos": 392427624960.0, + "grad_norm": 0.06446420344630455, + "language_loss": 0.93862659, + "learning_rate": 0.0009287992243290175, + "loss": 0.94978678, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.35083008, + "step": 1023, + "time_per_iteration": 2.474393844604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126064, + "balance_loss_mlp": 1.09120703, + "epoch": 0.19699884570988843, + "flos": 626122566144.0, + "grad_norm": 0.06850198630338038, + "language_loss": 0.90312016, + "learning_rate": 0.0009286389087101435, + "loss": 0.91438079, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.34887695, + "step": 1024, + "time_per_iteration": 2.835756540298462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143856, + "balance_loss_mlp": 1.10885596, + "epoch": 0.197191227395152, + "flos": 557710742016.0, + "grad_norm": 0.06824019021489727, + "language_loss": 0.88388735, + "learning_rate": 0.0009284784266788864, + "loss": 0.8953259, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.3503418, + "step": 1025, + "time_per_iteration": 2.702479839324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144786, + "balance_loss_mlp": 1.11033428, + "epoch": 0.19738360908041555, + "flos": 664681061376.0, + "grad_norm": 0.08832519553576638, + "language_loss": 0.92221844, + "learning_rate": 0.0009283177782975512, + "loss": 0.93366635, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.34472656, + "step": 1026, + "time_per_iteration": 2.9851789474487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132889, + "balance_loss_mlp": 1.09850955, + "epoch": 0.1975759907656791, + "flos": 522244094976.0, + "grad_norm": 0.07134152927872167, + "language_loss": 0.87642545, + "learning_rate": 0.000928156963628507, + "loss": 0.88775432, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.34423828, + "step": 1027, + "time_per_iteration": 2.61114239692688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131535, + "balance_loss_mlp": 1.09686899, + "epoch": 0.19776837245094267, + "flos": 462233309184.0, + "grad_norm": 0.0723355054215018, + "language_loss": 0.88370252, + "learning_rate": 0.0009279959827341877, + "loss": 0.8950178, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.34692383, + "step": 1028, + "time_per_iteration": 2.7794618606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118297, + "balance_loss_mlp": 1.08248627, + "epoch": 0.19796075413620623, + "flos": 502809887232.0, + "grad_norm": 0.08314527790784168, + "language_loss": 0.87832725, + "learning_rate": 0.0009278348356770915, + "loss": 0.88951027, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.3581543, + "step": 1029, + "time_per_iteration": 2.5507349967956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111156, + "balance_loss_mlp": 1.07565451, + "epoch": 0.1981531358214698, + "flos": 507281038848.0, + "grad_norm": 0.08630189211983, + "language_loss": 0.85379845, + "learning_rate": 0.0009276735225197814, + "loss": 0.864914, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.359375, + "step": 1030, + "time_per_iteration": 2.597379207611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102081, + "balance_loss_mlp": 1.06650949, + "epoch": 0.19834551750673335, + "flos": 531275148288.0, + "grad_norm": 0.0907652175310469, + "language_loss": 0.85545719, + "learning_rate": 0.0009275120433248847, + "loss": 0.86647797, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.35571289, + "step": 1031, + "time_per_iteration": 2.687185287475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110602, + "balance_loss_mlp": 1.07545948, + "epoch": 0.1985378991919969, + "flos": 775147691520.0, + "grad_norm": 0.07461022440082729, + "language_loss": 0.85621846, + "learning_rate": 0.0009273503981550931, + "loss": 0.86732447, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.35205078, + "step": 1032, + "time_per_iteration": 3.0693371295928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101575, + "balance_loss_mlp": 1.06586027, + "epoch": 0.1987302808772605, + "flos": 434063411712.0, + "grad_norm": 0.15106160662845974, + "language_loss": 0.86904788, + "learning_rate": 0.0009271885870731626, + "loss": 0.88006359, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.35717773, + "step": 1033, + "time_per_iteration": 2.506413459777832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111392, + "balance_loss_mlp": 1.07536733, + "epoch": 0.19892266256252406, + "flos": 553342897152.0, + "grad_norm": 0.08761306204685197, + "language_loss": 0.88616383, + "learning_rate": 0.0009270266101419143, + "loss": 0.89727777, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.36035156, + "step": 1034, + "time_per_iteration": 2.6036899089813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098535, + "balance_loss_mlp": 1.06425047, + "epoch": 0.19911504424778761, + "flos": 549596302848.0, + "grad_norm": 0.06384965023316368, + "language_loss": 0.84987146, + "learning_rate": 0.0009268644674242328, + "loss": 0.86085683, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.34301758, + "step": 1035, + "time_per_iteration": 2.7015764713287354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113397, + "balance_loss_mlp": 1.07806361, + "epoch": 0.19930742593305117, + "flos": 518024636928.0, + "grad_norm": 0.07882877348480413, + "language_loss": 0.80515361, + "learning_rate": 0.0009267021589830678, + "loss": 0.81628758, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.35327148, + "step": 1036, + "time_per_iteration": 2.643951892852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01709033, + "balance_loss_mlp": 1.66611803, + "epoch": 0.19949980761831473, + "flos": 1508516849664.0, + "grad_norm": 0.11391778300632174, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.79336113, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.4296875, + "step": 1037, + "time_per_iteration": 4.949443101882935 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103323, + "balance_loss_mlp": 1.0683465, + "epoch": 0.1996921893035783, + "flos": 697803738624.0, + "grad_norm": 0.08774205983796875, + "language_loss": 0.92838657, + "learning_rate": 0.000926377045182406, + "loss": 0.93941981, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.35009766, + "step": 1038, + "time_per_iteration": 2.9512856006622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112905, + "balance_loss_mlp": 1.07821524, + "epoch": 0.19988457098884185, + "flos": 726682226688.0, + "grad_norm": 0.06255968137292814, + "language_loss": 0.87761998, + "learning_rate": 0.0009262142399491296, + "loss": 0.888749, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.34716797, + "step": 1039, + "time_per_iteration": 3.0552709102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112668, + "balance_loss_mlp": 1.09187126, + "epoch": 0.2000769526741054, + "flos": 560275881984.0, + "grad_norm": 0.06862779420362043, + "language_loss": 0.87532222, + "learning_rate": 0.0009260512692448105, + "loss": 0.88658899, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.34863281, + "step": 1040, + "time_per_iteration": 2.6962392330169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140867, + "balance_loss_mlp": 1.10615349, + "epoch": 0.200269334359369, + "flos": 571758594048.0, + "grad_norm": 0.07166596959521815, + "language_loss": 0.84091032, + "learning_rate": 0.000925888133132719, + "loss": 0.852319, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.34741211, + "step": 1041, + "time_per_iteration": 2.791015148162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01724521, + "balance_loss_mlp": 1.67225933, + "epoch": 0.20046171604463256, + "flos": 1485362340864.0, + "grad_norm": 0.16089622263247963, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.8133496, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.5234375, + "step": 1042, + "time_per_iteration": 4.978717565536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116458, + "balance_loss_mlp": 1.08169639, + "epoch": 0.20065409772989612, + "flos": 496266808320.0, + "grad_norm": 0.06766738281342395, + "language_loss": 0.80769098, + "learning_rate": 0.0009255613649386244, + "loss": 0.81885552, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.34790039, + "step": 1043, + "time_per_iteration": 2.6604766845703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122505, + "balance_loss_mlp": 1.08709943, + "epoch": 0.20084647941515968, + "flos": 579094631424.0, + "grad_norm": 0.07361728486384381, + "language_loss": 0.78999138, + "learning_rate": 0.0009253977329834838, + "loss": 0.80121642, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.35449219, + "step": 1044, + "time_per_iteration": 2.7036681175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108398, + "balance_loss_mlp": 1.07227719, + "epoch": 0.20103886110042324, + "flos": 641775273984.0, + "grad_norm": 0.08623717161971375, + "language_loss": 0.86596096, + "learning_rate": 0.0009252339358742965, + "loss": 0.87704492, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.36108398, + "step": 1045, + "time_per_iteration": 2.874620199203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118791, + "balance_loss_mlp": 1.08369565, + "epoch": 0.2012312427856868, + "flos": 441720953856.0, + "grad_norm": 0.06963930913543727, + "language_loss": 0.82984746, + "learning_rate": 0.000925069973674654, + "loss": 0.84103537, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.35107422, + "step": 1046, + "time_per_iteration": 2.628878116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106078, + "balance_loss_mlp": 1.07017231, + "epoch": 0.20142362447095036, + "flos": 554135855616.0, + "grad_norm": 0.07870556033127275, + "language_loss": 0.88610631, + "learning_rate": 0.000924905846448212, + "loss": 0.89716709, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.35913086, + "step": 1047, + "time_per_iteration": 2.747220754623413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109682, + "balance_loss_mlp": 1.0750165, + "epoch": 0.20161600615621392, + "flos": 669988841472.0, + "grad_norm": 0.10747792176710873, + "language_loss": 0.85372317, + "learning_rate": 0.0009247415542586906, + "loss": 0.86482, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.34667969, + "step": 1048, + "time_per_iteration": 2.8556973934173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119311, + "balance_loss_mlp": 1.08285666, + "epoch": 0.2018083878414775, + "flos": 572788689408.0, + "grad_norm": 0.2214820598260846, + "language_loss": 0.83177209, + "learning_rate": 0.0009245770971698735, + "loss": 0.84296525, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.36450195, + "step": 1049, + "time_per_iteration": 2.9050869941711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132964, + "balance_loss_mlp": 1.09798741, + "epoch": 0.20200076952674106, + "flos": 425624495616.0, + "grad_norm": 0.08175342307012821, + "language_loss": 0.88327754, + "learning_rate": 0.0009244124752456087, + "loss": 0.89460719, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.34985352, + "step": 1050, + "time_per_iteration": 2.5141613483428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151097, + "balance_loss_mlp": 1.11557305, + "epoch": 0.20219315121200462, + "flos": 536326852608.0, + "grad_norm": 0.06393011823673703, + "language_loss": 0.85371649, + "learning_rate": 0.0009242476885498081, + "loss": 0.86522746, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.35522461, + "step": 1051, + "time_per_iteration": 2.727687358856201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176333, + "balance_loss_mlp": 1.14171457, + "epoch": 0.20238553289726818, + "flos": 477634323456.0, + "grad_norm": 0.09914193731013146, + "language_loss": 0.80802011, + "learning_rate": 0.0009240827371464474, + "loss": 0.81978351, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.34643555, + "step": 1052, + "time_per_iteration": 2.552121877670288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191475, + "balance_loss_mlp": 1.15521157, + "epoch": 0.20257791458253174, + "flos": 1151611430400.0, + "grad_norm": 0.1023503287046967, + "language_loss": 0.83863074, + "learning_rate": 0.0009239176210995666, + "loss": 0.85054547, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.36230469, + "step": 1053, + "time_per_iteration": 3.47882342338562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190284, + "balance_loss_mlp": 1.15561819, + "epoch": 0.2027702962677953, + "flos": 666606011904.0, + "grad_norm": 0.09115683042396579, + "language_loss": 0.93677175, + "learning_rate": 0.0009237523404732695, + "loss": 0.94867456, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.34692383, + "step": 1054, + "time_per_iteration": 2.8701720237731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173476, + "balance_loss_mlp": 1.13838029, + "epoch": 0.20296267795305886, + "flos": 641011428864.0, + "grad_norm": 0.10782024136876088, + "language_loss": 0.8421399, + "learning_rate": 0.0009235868953317235, + "loss": 0.85387468, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.3515625, + "step": 1055, + "time_per_iteration": 2.8210723400115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161281, + "balance_loss_mlp": 1.12682986, + "epoch": 0.20315505963832242, + "flos": 930187777536.0, + "grad_norm": 0.07346272336072437, + "language_loss": 0.85227096, + "learning_rate": 0.0009234212857391602, + "loss": 0.86388373, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.3449707, + "step": 1056, + "time_per_iteration": 3.2212936878204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153084, + "balance_loss_mlp": 1.11727369, + "epoch": 0.20334744132358598, + "flos": 561818128896.0, + "grad_norm": 0.054845505201833546, + "language_loss": 0.89240777, + "learning_rate": 0.000923255511759875, + "loss": 0.90393853, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.3581543, + "step": 1057, + "time_per_iteration": 2.834444522857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156175, + "balance_loss_mlp": 1.12146115, + "epoch": 0.20353982300884957, + "flos": 643902455808.0, + "grad_norm": 0.10969304378799022, + "language_loss": 0.84913409, + "learning_rate": 0.000923089573458227, + "loss": 0.86069584, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.34716797, + "step": 1058, + "time_per_iteration": 2.8832740783691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115137, + "balance_loss_mlp": 1.1168946, + "epoch": 0.20373220469411313, + "flos": 651101690880.0, + "grad_norm": 0.24205150411640483, + "language_loss": 0.83790255, + "learning_rate": 0.0009229234708986392, + "loss": 0.84941626, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.3449707, + "step": 1059, + "time_per_iteration": 2.8837289810180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01633401, + "balance_loss_mlp": 1.57885134, + "epoch": 0.2039245863793767, + "flos": 1436939136000.0, + "grad_norm": 0.08953482343612705, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.83300292, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.546875, + "step": 1060, + "time_per_iteration": 4.667459011077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158699, + "balance_loss_mlp": 1.1247009, + "epoch": 0.20411696806464025, + "flos": 596678082048.0, + "grad_norm": 0.0736942782322193, + "language_loss": 0.84963936, + "learning_rate": 0.0009225907732636548, + "loss": 0.86122632, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.34033203, + "step": 1061, + "time_per_iteration": 2.7532095909118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164545, + "balance_loss_mlp": 1.12954497, + "epoch": 0.2043093497499038, + "flos": 573530775552.0, + "grad_norm": 0.09512005659435491, + "language_loss": 0.8641578, + "learning_rate": 0.0009224241783174227, + "loss": 0.87580323, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.35009766, + "step": 1062, + "time_per_iteration": 2.683047294616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147761, + "balance_loss_mlp": 1.11347604, + "epoch": 0.20450173143516737, + "flos": 630061217280.0, + "grad_norm": 0.07955707081408017, + "language_loss": 0.85456479, + "learning_rate": 0.0009222574193715802, + "loss": 0.86604244, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.34326172, + "step": 1063, + "time_per_iteration": 2.8293161392211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139319, + "balance_loss_mlp": 1.10474837, + "epoch": 0.20469411312043093, + "flos": 573718450176.0, + "grad_norm": 0.08617592440024102, + "language_loss": 0.85715151, + "learning_rate": 0.000922090496490869, + "loss": 0.8685447, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.34619141, + "step": 1064, + "time_per_iteration": 2.749298334121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124198, + "balance_loss_mlp": 1.08865011, + "epoch": 0.20488649480569449, + "flos": 636748300800.0, + "grad_norm": 0.06572729358097257, + "language_loss": 0.89767212, + "learning_rate": 0.0009219234097400937, + "loss": 0.90891409, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.35595703, + "step": 1065, + "time_per_iteration": 2.8508355617523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107138, + "balance_loss_mlp": 1.07175696, + "epoch": 0.20507887649095807, + "flos": 975383894016.0, + "grad_norm": 0.05918330788086957, + "language_loss": 0.82970631, + "learning_rate": 0.0009217561591841237, + "loss": 0.8407777, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.35400391, + "step": 1066, + "time_per_iteration": 3.3216452598571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102073, + "balance_loss_mlp": 1.06566656, + "epoch": 0.20527125817622163, + "flos": 485940819456.0, + "grad_norm": 0.09526156176010836, + "language_loss": 0.81088316, + "learning_rate": 0.0009215887448878913, + "loss": 0.82190394, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.36401367, + "step": 1067, + "time_per_iteration": 2.596022129058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099777, + "balance_loss_mlp": 1.06191611, + "epoch": 0.2054636398614852, + "flos": 526921860096.0, + "grad_norm": 0.072135210200994, + "language_loss": 0.84963661, + "learning_rate": 0.0009214211669163922, + "loss": 0.86063439, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.37841797, + "step": 1068, + "time_per_iteration": 4.440082311630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096187, + "balance_loss_mlp": 1.05923223, + "epoch": 0.20565602154674875, + "flos": 557898416640.0, + "grad_norm": 0.07010547570027807, + "language_loss": 0.93398243, + "learning_rate": 0.0009212534253346862, + "loss": 0.94494426, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.36938477, + "step": 1069, + "time_per_iteration": 2.699843406677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096083, + "balance_loss_mlp": 1.05912852, + "epoch": 0.2058484032320123, + "flos": 503976784896.0, + "grad_norm": 0.07799270520419531, + "language_loss": 0.83685625, + "learning_rate": 0.0009210855202078964, + "loss": 0.84781706, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.36962891, + "step": 1070, + "time_per_iteration": 2.5999720096588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010932, + "balance_loss_mlp": 1.05810475, + "epoch": 0.20604078491727587, + "flos": 432950358528.0, + "grad_norm": 0.0723710550133871, + "language_loss": 0.86933672, + "learning_rate": 0.0009209174516012091, + "loss": 0.88026869, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.35131836, + "step": 1071, + "time_per_iteration": 2.503551483154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092567, + "balance_loss_mlp": 1.05794883, + "epoch": 0.20623316660253943, + "flos": 608421252096.0, + "grad_norm": 0.05962541016594441, + "language_loss": 0.88928151, + "learning_rate": 0.0009207492195798747, + "loss": 0.90020716, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.34667969, + "step": 1072, + "time_per_iteration": 2.8607378005981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094226, + "balance_loss_mlp": 1.05972731, + "epoch": 0.206425548287803, + "flos": 480184906752.0, + "grad_norm": 0.06398863953592046, + "language_loss": 0.84846818, + "learning_rate": 0.0009205808242092061, + "loss": 0.85941041, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.34521484, + "step": 1073, + "time_per_iteration": 2.644134044647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095063, + "balance_loss_mlp": 1.06080186, + "epoch": 0.20661792997306658, + "flos": 949007937024.0, + "grad_norm": 0.06666861242543158, + "language_loss": 0.82488537, + "learning_rate": 0.0009204122655545808, + "loss": 0.83583593, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.34277344, + "step": 1074, + "time_per_iteration": 3.3254919052124023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110011, + "balance_loss_mlp": 1.07582152, + "epoch": 0.20681031165833014, + "flos": 603206604288.0, + "grad_norm": 0.0719401545163873, + "language_loss": 0.81125832, + "learning_rate": 0.0009202435436814388, + "loss": 0.82235849, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.34228516, + "step": 1075, + "time_per_iteration": 2.704252243041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105303, + "balance_loss_mlp": 1.0707798, + "epoch": 0.2070026933435937, + "flos": 708665200128.0, + "grad_norm": 0.06775779875999222, + "language_loss": 0.89715004, + "learning_rate": 0.0009200746586552836, + "loss": 0.90820301, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.34545898, + "step": 1076, + "time_per_iteration": 2.897177219390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102531, + "balance_loss_mlp": 1.06869972, + "epoch": 0.20719507502885726, + "flos": 829456409088.0, + "grad_norm": 0.12065235325240355, + "language_loss": 0.83624744, + "learning_rate": 0.0009199056105416825, + "loss": 0.84727275, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.33862305, + "step": 1077, + "time_per_iteration": 3.0771028995513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106086, + "balance_loss_mlp": 1.07218289, + "epoch": 0.20738745671412082, + "flos": 637993774080.0, + "grad_norm": 0.06486814220319007, + "language_loss": 0.8622663, + "learning_rate": 0.0009197363994062654, + "loss": 0.8733272, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.33935547, + "step": 1078, + "time_per_iteration": 2.807009696960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112785, + "balance_loss_mlp": 1.07914448, + "epoch": 0.20757983839938438, + "flos": 685258845696.0, + "grad_norm": 0.06985523034062016, + "language_loss": 0.84313667, + "learning_rate": 0.0009195670253147262, + "loss": 0.85426456, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.33642578, + "step": 1079, + "time_per_iteration": 2.9738564491271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114515, + "balance_loss_mlp": 1.0817802, + "epoch": 0.20777222008464794, + "flos": 519024208896.0, + "grad_norm": 0.09202653272357895, + "language_loss": 0.81912923, + "learning_rate": 0.0009193974883328216, + "loss": 0.8302744, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.32739258, + "step": 1080, + "time_per_iteration": 2.639878511428833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121501, + "balance_loss_mlp": 1.08721614, + "epoch": 0.2079646017699115, + "flos": 511136732160.0, + "grad_norm": 0.059797822691547486, + "language_loss": 0.86745334, + "learning_rate": 0.0009192277885263718, + "loss": 0.87866837, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.34326172, + "step": 1081, + "time_per_iteration": 4.060026407241821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120473, + "balance_loss_mlp": 1.08671248, + "epoch": 0.20815698345517505, + "flos": 931409929728.0, + "grad_norm": 0.0682125291941454, + "language_loss": 0.86169523, + "learning_rate": 0.0009190579259612602, + "loss": 0.87289995, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.33789062, + "step": 1082, + "time_per_iteration": 3.2795815467834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134326, + "balance_loss_mlp": 1.10132933, + "epoch": 0.20834936514043864, + "flos": 632114205696.0, + "grad_norm": 0.06852391956291448, + "language_loss": 0.86675245, + "learning_rate": 0.000918887900703433, + "loss": 0.87809569, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.33007812, + "step": 1083, + "time_per_iteration": 2.813777208328247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137242, + "balance_loss_mlp": 1.1025995, + "epoch": 0.2085417468257022, + "flos": 394170693120.0, + "grad_norm": 0.07184608102087402, + "language_loss": 0.90139276, + "learning_rate": 0.0009187177128188999, + "loss": 0.91276515, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.34667969, + "step": 1084, + "time_per_iteration": 2.4950854778289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01361857, + "balance_loss_mlp": 1.30883229, + "epoch": 0.20873412851096576, + "flos": 1401387969024.0, + "grad_norm": 0.057507491560350586, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78518397, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.53125, + "step": 1085, + "time_per_iteration": 4.9323132038116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116843, + "balance_loss_mlp": 1.08279717, + "epoch": 0.20892651019622932, + "flos": 447599112192.0, + "grad_norm": 0.0734883897044225, + "language_loss": 0.85634506, + "learning_rate": 0.000918376849434071, + "loss": 0.86751348, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.34057617, + "step": 1086, + "time_per_iteration": 2.504467487335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110856, + "balance_loss_mlp": 1.07680964, + "epoch": 0.20911889188149288, + "flos": 492863629824.0, + "grad_norm": 0.07305298195252904, + "language_loss": 0.90630972, + "learning_rate": 0.0009182061740661098, + "loss": 0.91741836, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.34057617, + "step": 1087, + "time_per_iteration": 2.5760254859924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111346, + "balance_loss_mlp": 1.0785315, + "epoch": 0.20931127356675644, + "flos": 840928946688.0, + "grad_norm": 0.05349746945174757, + "language_loss": 0.84760422, + "learning_rate": 0.0009180353363361127, + "loss": 0.85873878, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.34912109, + "step": 1088, + "time_per_iteration": 3.0988333225250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111767, + "balance_loss_mlp": 1.07593286, + "epoch": 0.20950365525202, + "flos": 756796013568.0, + "grad_norm": 0.0658577902216117, + "language_loss": 0.81715566, + "learning_rate": 0.0009178643363104044, + "loss": 0.82827336, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.35864258, + "step": 1089, + "time_per_iteration": 3.1410629749298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106557, + "balance_loss_mlp": 1.07155704, + "epoch": 0.20969603693728356, + "flos": 472301812224.0, + "grad_norm": 0.10460691940838339, + "language_loss": 0.90569937, + "learning_rate": 0.0009176931740553735, + "loss": 0.91676497, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.35009766, + "step": 1090, + "time_per_iteration": 2.529330253601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112911, + "balance_loss_mlp": 1.07698107, + "epoch": 0.20988841862254715, + "flos": 976507121664.0, + "grad_norm": 0.07113631656774884, + "language_loss": 0.82557011, + "learning_rate": 0.0009175218496374708, + "loss": 0.83669925, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.359375, + "step": 1091, + "time_per_iteration": 3.347742795944214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110472, + "balance_loss_mlp": 1.07356465, + "epoch": 0.2100808003078107, + "flos": 1092697731072.0, + "grad_norm": 0.08284412758413852, + "language_loss": 0.85813856, + "learning_rate": 0.0009173503631232103, + "loss": 0.86924326, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.36914062, + "step": 1092, + "time_per_iteration": 3.378859758377075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103445, + "balance_loss_mlp": 1.06684804, + "epoch": 0.21027318199307427, + "flos": 1012567468032.0, + "grad_norm": 0.09413161778101656, + "language_loss": 0.81595004, + "learning_rate": 0.0009171787145791691, + "loss": 0.82698447, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.36621094, + "step": 1093, + "time_per_iteration": 3.215574026107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099602, + "balance_loss_mlp": 1.06214595, + "epoch": 0.21046556367833782, + "flos": 521141216256.0, + "grad_norm": 0.0806437411167059, + "language_loss": 0.80327773, + "learning_rate": 0.000917006904071987, + "loss": 0.81427377, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.37451172, + "step": 1094, + "time_per_iteration": 2.6117537021636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100105, + "balance_loss_mlp": 1.06377053, + "epoch": 0.21065794536360138, + "flos": 603437948928.0, + "grad_norm": 0.08991830585001004, + "language_loss": 0.87576157, + "learning_rate": 0.0009168349316683669, + "loss": 0.88676262, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.36352539, + "step": 1095, + "time_per_iteration": 2.740950345993042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101819, + "balance_loss_mlp": 1.06650949, + "epoch": 0.21085032704886494, + "flos": 603045070848.0, + "grad_norm": 0.06267137937039592, + "language_loss": 0.8218863, + "learning_rate": 0.0009166627974350741, + "loss": 0.83290446, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.35327148, + "step": 1096, + "time_per_iteration": 2.887326240539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098665, + "balance_loss_mlp": 1.06206763, + "epoch": 0.2110427087341285, + "flos": 637382697984.0, + "grad_norm": 0.07019696164219995, + "language_loss": 0.89238816, + "learning_rate": 0.0009164905014389373, + "loss": 0.90337479, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.3659668, + "step": 1097, + "time_per_iteration": 2.7609455585479736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105326, + "balance_loss_mlp": 1.06908655, + "epoch": 0.21123509041939206, + "flos": 522667496448.0, + "grad_norm": 0.06528725154368942, + "language_loss": 0.8638711, + "learning_rate": 0.0009163180437468476, + "loss": 0.87492442, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.36254883, + "step": 1098, + "time_per_iteration": 2.5998973846435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096402, + "balance_loss_mlp": 1.06009042, + "epoch": 0.21142747210465565, + "flos": 450938271744.0, + "grad_norm": 0.06547964129234486, + "language_loss": 0.85908926, + "learning_rate": 0.000916145424425759, + "loss": 0.87005323, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.36303711, + "step": 1099, + "time_per_iteration": 2.6804425716400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101989, + "balance_loss_mlp": 1.06601155, + "epoch": 0.2116198537899192, + "flos": 875813630976.0, + "grad_norm": 0.08063804967749887, + "language_loss": 0.90475744, + "learning_rate": 0.0009159726435426885, + "loss": 0.91577733, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.35986328, + "step": 1100, + "time_per_iteration": 3.1017394065856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100921, + "balance_loss_mlp": 1.06499124, + "epoch": 0.21181223547518277, + "flos": 523410992640.0, + "grad_norm": 0.08023517310436831, + "language_loss": 0.90250683, + "learning_rate": 0.0009157997011647154, + "loss": 0.9135161, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.359375, + "step": 1101, + "time_per_iteration": 2.5878560543060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096622, + "balance_loss_mlp": 1.06045425, + "epoch": 0.21200461716044633, + "flos": 572014669824.0, + "grad_norm": 0.05508329212621071, + "language_loss": 0.86001104, + "learning_rate": 0.0009156265973589817, + "loss": 0.87097728, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.36206055, + "step": 1102, + "time_per_iteration": 2.7933261394500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097006, + "balance_loss_mlp": 1.06121981, + "epoch": 0.2121969988457099, + "flos": 544869075456.0, + "grad_norm": 0.06583201442001711, + "language_loss": 0.89802408, + "learning_rate": 0.0009154533321926926, + "loss": 0.90899414, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.35791016, + "step": 1103, + "time_per_iteration": 2.647494316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096343, + "balance_loss_mlp": 1.0598892, + "epoch": 0.21238938053097345, + "flos": 843489704448.0, + "grad_norm": 0.06603869229078199, + "language_loss": 0.87027407, + "learning_rate": 0.0009152799057331156, + "loss": 0.88123751, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.36499023, + "step": 1104, + "time_per_iteration": 3.1623916625976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097231, + "balance_loss_mlp": 1.06134939, + "epoch": 0.212581762216237, + "flos": 445984081920.0, + "grad_norm": 0.07161611233178561, + "language_loss": 0.90831178, + "learning_rate": 0.0009151063180475805, + "loss": 0.91928405, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.35913086, + "step": 1105, + "time_per_iteration": 2.5515594482421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099591, + "balance_loss_mlp": 1.06516361, + "epoch": 0.21277414390150057, + "flos": 514129655808.0, + "grad_norm": 0.08899576142412509, + "language_loss": 0.83941323, + "learning_rate": 0.0009149325692034803, + "loss": 0.85040915, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.34472656, + "step": 1106, + "time_per_iteration": 2.561875343322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300575, + "balance_loss_mlp": 1.25708735, + "epoch": 0.21296652558676413, + "flos": 1484790552576.0, + "grad_norm": 0.05662804479307553, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80504, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.43554688, + "step": 1107, + "time_per_iteration": 4.880220174789429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104038, + "balance_loss_mlp": 1.06870413, + "epoch": 0.21315890727202771, + "flos": 845689669632.0, + "grad_norm": 0.06711298172071122, + "language_loss": 0.87037283, + "learning_rate": 0.0009145845883094678, + "loss": 0.88141322, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.35375977, + "step": 1108, + "time_per_iteration": 3.0598409175872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105103, + "balance_loss_mlp": 1.06931639, + "epoch": 0.21335128895729127, + "flos": 629086376448.0, + "grad_norm": 0.06803775359788228, + "language_loss": 0.8464098, + "learning_rate": 0.000914410356394654, + "loss": 0.85746086, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.35839844, + "step": 1109, + "time_per_iteration": 2.776258945465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103906, + "balance_loss_mlp": 1.06799972, + "epoch": 0.21354367064255483, + "flos": 710649787392.0, + "grad_norm": 0.052025780444459935, + "language_loss": 0.84733951, + "learning_rate": 0.0009142359635914709, + "loss": 0.85837853, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.35913086, + "step": 1110, + "time_per_iteration": 3.057307243347168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096278, + "balance_loss_mlp": 1.05996692, + "epoch": 0.2137360523278184, + "flos": 455950688256.0, + "grad_norm": 0.10914443694781037, + "language_loss": 0.84286684, + "learning_rate": 0.0009140614099676245, + "loss": 0.85382962, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.36328125, + "step": 1111, + "time_per_iteration": 2.6110692024230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087884, + "balance_loss_mlp": 1.0517633, + "epoch": 0.21392843401308195, + "flos": 665749034496.0, + "grad_norm": 0.09545242357915729, + "language_loss": 0.82540983, + "learning_rate": 0.0009138866955908821, + "loss": 0.83628869, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.36132812, + "step": 1112, + "time_per_iteration": 2.870765209197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100291, + "balance_loss_mlp": 1.06445658, + "epoch": 0.2141208156983455, + "flos": 748656843264.0, + "grad_norm": 0.06321568237144509, + "language_loss": 0.8048408, + "learning_rate": 0.0009137118205290738, + "loss": 0.8158437, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.35864258, + "step": 1113, + "time_per_iteration": 4.381570100784302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097039, + "balance_loss_mlp": 1.06091869, + "epoch": 0.21431319738360907, + "flos": 418898124288.0, + "grad_norm": 0.06328361159326604, + "language_loss": 0.89779603, + "learning_rate": 0.0009135367848500924, + "loss": 0.90876651, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.36157227, + "step": 1114, + "time_per_iteration": 2.511164665222168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096629, + "balance_loss_mlp": 1.06034184, + "epoch": 0.21450557906887263, + "flos": 608849035776.0, + "grad_norm": 0.08987717155463379, + "language_loss": 0.86417669, + "learning_rate": 0.0009133615886218927, + "loss": 0.87514299, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.36303711, + "step": 1115, + "time_per_iteration": 2.7101125717163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089806, + "balance_loss_mlp": 1.05337584, + "epoch": 0.21469796075413622, + "flos": 561649393152.0, + "grad_norm": 0.07119429557645003, + "language_loss": 0.87869287, + "learning_rate": 0.0009131862319124917, + "loss": 0.88959092, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.36425781, + "step": 1116, + "time_per_iteration": 2.6387155055999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092992, + "balance_loss_mlp": 1.05648971, + "epoch": 0.21489034243939978, + "flos": 594363225600.0, + "grad_norm": 0.06965010238630005, + "language_loss": 0.83447617, + "learning_rate": 0.0009130107147899691, + "loss": 0.84540606, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.36499023, + "step": 1117, + "time_per_iteration": 2.723092794418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094347, + "balance_loss_mlp": 1.05805993, + "epoch": 0.21508272412466334, + "flos": 441661317120.0, + "grad_norm": 0.055087901571477416, + "language_loss": 0.84983969, + "learning_rate": 0.0009128350373224665, + "loss": 0.8607831, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.36352539, + "step": 1118, + "time_per_iteration": 2.5449509620666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178954, + "balance_loss_mlp": 1.14500344, + "epoch": 0.2152751058099269, + "flos": 1495397348352.0, + "grad_norm": 0.021865185871831474, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82635385, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.33984375, + "step": 1119, + "time_per_iteration": 4.641271114349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103175, + "balance_loss_mlp": 1.06648207, + "epoch": 0.21546748749519046, + "flos": 493759895040.0, + "grad_norm": 0.07523243301623007, + "language_loss": 0.85678464, + "learning_rate": 0.0009124832016254005, + "loss": 0.86781639, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.36694336, + "step": 1120, + "time_per_iteration": 2.655371904373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109795, + "balance_loss_mlp": 1.06163859, + "epoch": 0.21565986918045402, + "flos": 634241387520.0, + "grad_norm": 0.07092227494936269, + "language_loss": 0.87677884, + "learning_rate": 0.0009123070435324316, + "loss": 0.88775837, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.36352539, + "step": 1121, + "time_per_iteration": 2.777632236480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166186, + "balance_loss_mlp": 1.13337982, + "epoch": 0.21585225086571758, + "flos": 1582502704128.0, + "grad_norm": 0.01899876446696313, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.7904197, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.328125, + "step": 1122, + "time_per_iteration": 4.966520547866821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089218, + "balance_loss_mlp": 1.0522635, + "epoch": 0.21604463255098114, + "flos": 683799556608.0, + "grad_norm": 0.060329223802114536, + "language_loss": 0.86415493, + "learning_rate": 0.0009119542471995752, + "loss": 0.87504709, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.36938477, + "step": 1123, + "time_per_iteration": 2.8373889923095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090311, + "balance_loss_mlp": 1.05438125, + "epoch": 0.2162370142362447, + "flos": 780660675072.0, + "grad_norm": 0.06176848453484022, + "language_loss": 0.81323773, + "learning_rate": 0.0009117776090966554, + "loss": 0.82414079, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.359375, + "step": 1124, + "time_per_iteration": 2.999127149581909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087355, + "balance_loss_mlp": 1.0507102, + "epoch": 0.21642939592150828, + "flos": 1001745294336.0, + "grad_norm": 0.07470238986110685, + "language_loss": 0.86757743, + "learning_rate": 0.0009116008111274899, + "loss": 0.87845105, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.36669922, + "step": 1125, + "time_per_iteration": 3.3534371852874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160744, + "balance_loss_mlp": 1.13022673, + "epoch": 0.21662177760677184, + "flos": 1481867440128.0, + "grad_norm": 0.021433456679081614, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80267668, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.3046875, + "step": 1126, + "time_per_iteration": 4.8522608280181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086571, + "balance_loss_mlp": 1.04975939, + "epoch": 0.2168141592920354, + "flos": 887030092800.0, + "grad_norm": 0.07895568764354688, + "language_loss": 0.85050654, + "learning_rate": 0.0009112467358650396, + "loss": 0.86137229, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.36816406, + "step": 1127, + "time_per_iteration": 3.157684803009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090796, + "balance_loss_mlp": 1.05472374, + "epoch": 0.21700654097729896, + "flos": 545682382848.0, + "grad_norm": 0.05660039583272807, + "language_loss": 0.86175025, + "learning_rate": 0.0009110694587092192, + "loss": 0.87265825, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.36108398, + "step": 1128, + "time_per_iteration": 2.755575656890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088206, + "balance_loss_mlp": 1.052562, + "epoch": 0.21719892266256252, + "flos": 509270008320.0, + "grad_norm": 0.077592311143443, + "language_loss": 0.81304091, + "learning_rate": 0.0009108920219620815, + "loss": 0.82392299, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.35693359, + "step": 1129, + "time_per_iteration": 2.639261484146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091101, + "balance_loss_mlp": 1.05548096, + "epoch": 0.21739130434782608, + "flos": 543150738432.0, + "grad_norm": 0.06998872933736075, + "language_loss": 0.8949976, + "learning_rate": 0.0009107144256925133, + "loss": 0.90590858, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.35620117, + "step": 1130, + "time_per_iteration": 2.685058832168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096157, + "balance_loss_mlp": 1.0606091, + "epoch": 0.21758368603308964, + "flos": 616564804608.0, + "grad_norm": 0.08228743876345572, + "language_loss": 0.81527102, + "learning_rate": 0.0009105366699694638, + "loss": 0.82623267, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.35546875, + "step": 1131, + "time_per_iteration": 2.726532220840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087405, + "balance_loss_mlp": 1.0526911, + "epoch": 0.2177760677183532, + "flos": 634813175808.0, + "grad_norm": 0.05363867293402688, + "language_loss": 0.81731898, + "learning_rate": 0.0009103587548619439, + "loss": 0.82819301, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.34741211, + "step": 1132, + "time_per_iteration": 2.856782913208008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096099, + "balance_loss_mlp": 1.05978799, + "epoch": 0.2179684494036168, + "flos": 532181587968.0, + "grad_norm": 0.0659512575968049, + "language_loss": 0.85836411, + "learning_rate": 0.0009101806804390261, + "loss": 0.8693251, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.36328125, + "step": 1133, + "time_per_iteration": 2.789860725402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093043, + "balance_loss_mlp": 1.056494, + "epoch": 0.21816083108888035, + "flos": 474980433408.0, + "grad_norm": 0.06887538910693401, + "language_loss": 0.90261114, + "learning_rate": 0.0009100024467698453, + "loss": 0.91354156, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.3659668, + "step": 1134, + "time_per_iteration": 2.6074166297912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095464, + "balance_loss_mlp": 1.05786586, + "epoch": 0.2183532127741439, + "flos": 577198794240.0, + "grad_norm": 0.07516267041517319, + "language_loss": 0.82424915, + "learning_rate": 0.0009098240539235981, + "loss": 0.83520383, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.37573242, + "step": 1135, + "time_per_iteration": 2.6695401668548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095721, + "balance_loss_mlp": 1.05809808, + "epoch": 0.21854559445940747, + "flos": 593832135168.0, + "grad_norm": 0.07818229339121877, + "language_loss": 0.87811279, + "learning_rate": 0.0009096455019695423, + "loss": 0.88907003, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.3762207, + "step": 1136, + "time_per_iteration": 4.259606838226318 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088969, + "balance_loss_mlp": 1.05180001, + "epoch": 0.21873797614467103, + "flos": 408464446464.0, + "grad_norm": 0.07138569527580692, + "language_loss": 0.89539087, + "learning_rate": 0.000909466790976998, + "loss": 0.90628058, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.37182617, + "step": 1137, + "time_per_iteration": 2.4586610794067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086709, + "balance_loss_mlp": 1.0483948, + "epoch": 0.21893035782993459, + "flos": 893824865280.0, + "grad_norm": 0.07428895088203294, + "language_loss": 0.82083362, + "learning_rate": 0.0009092879210153473, + "loss": 0.83170068, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.38305664, + "step": 1138, + "time_per_iteration": 3.097928524017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087285, + "balance_loss_mlp": 1.04944801, + "epoch": 0.21912273951519814, + "flos": 467392702464.0, + "grad_norm": 0.07001266476470332, + "language_loss": 0.88581419, + "learning_rate": 0.0009091088921540333, + "loss": 0.89668703, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.37817383, + "step": 1139, + "time_per_iteration": 2.5904369354248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138075, + "balance_loss_mlp": 1.11270714, + "epoch": 0.2193151212004617, + "flos": 1531262665728.0, + "grad_norm": 0.032290681216211516, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76646751, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.25390625, + "step": 1140, + "time_per_iteration": 4.913591623306274 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090399, + "balance_loss_mlp": 1.05353999, + "epoch": 0.2195075028857253, + "flos": 590901820416.0, + "grad_norm": 0.1397659602768512, + "language_loss": 0.84288347, + "learning_rate": 0.0009087503580104985, + "loss": 0.85378748, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.36865234, + "step": 1141, + "time_per_iteration": 2.6825575828552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103171, + "balance_loss_mlp": 1.06602514, + "epoch": 0.21969988457098885, + "flos": 636033917952.0, + "grad_norm": 0.0722566511462073, + "language_loss": 0.79141879, + "learning_rate": 0.0009085708528674728, + "loss": 0.80245048, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.37133789, + "step": 1142, + "time_per_iteration": 2.8078551292419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104306, + "balance_loss_mlp": 1.06551528, + "epoch": 0.2198922662562524, + "flos": 911974311936.0, + "grad_norm": 0.06720954872782575, + "language_loss": 0.8638975, + "learning_rate": 0.0009083911891031745, + "loss": 0.87494051, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.38793945, + "step": 1143, + "time_per_iteration": 3.1356892585754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110101, + "balance_loss_mlp": 1.07328963, + "epoch": 0.22008464794151597, + "flos": 822603409920.0, + "grad_norm": 0.08162422903338651, + "language_loss": 0.91253042, + "learning_rate": 0.0009082113667873553, + "loss": 0.92363143, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.3684082, + "step": 1144, + "time_per_iteration": 3.1446304321289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112165, + "balance_loss_mlp": 1.07387483, + "epoch": 0.22027702962677953, + "flos": 459416475648.0, + "grad_norm": 0.0676762249982335, + "language_loss": 0.90471655, + "learning_rate": 0.0009080313859898283, + "loss": 0.91583818, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.38256836, + "step": 1145, + "time_per_iteration": 2.5298025608062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110814, + "balance_loss_mlp": 1.07082736, + "epoch": 0.2204694113120431, + "flos": 530998723584.0, + "grad_norm": 0.13336101787368373, + "language_loss": 0.91929018, + "learning_rate": 0.0009078512467804684, + "loss": 0.93037164, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.37304688, + "step": 1146, + "time_per_iteration": 2.6156158447265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105973, + "balance_loss_mlp": 1.06882787, + "epoch": 0.22066179299730665, + "flos": 522382307328.0, + "grad_norm": 0.06165136945539885, + "language_loss": 0.89993024, + "learning_rate": 0.0009076709492292119, + "loss": 0.91098994, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.37133789, + "step": 1147, + "time_per_iteration": 2.617534875869751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095299, + "balance_loss_mlp": 1.06032324, + "epoch": 0.2208541746825702, + "flos": 546188742144.0, + "grad_norm": 0.11177878536303132, + "language_loss": 0.88637269, + "learning_rate": 0.0009074904934060562, + "loss": 0.89732569, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.34985352, + "step": 1148, + "time_per_iteration": 2.6782190799713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086783, + "balance_loss_mlp": 1.05237889, + "epoch": 0.22104655636783377, + "flos": 708404742144.0, + "grad_norm": 0.0637571078176039, + "language_loss": 0.84905714, + "learning_rate": 0.0009073098793810607, + "loss": 0.85992491, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.34423828, + "step": 1149, + "time_per_iteration": 2.956638813018799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085311, + "balance_loss_mlp": 1.04969168, + "epoch": 0.22123893805309736, + "flos": 584594468352.0, + "grad_norm": 0.07731387173425769, + "language_loss": 0.8803097, + "learning_rate": 0.000907129107224346, + "loss": 0.89116287, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.35595703, + "step": 1150, + "time_per_iteration": 2.724456548690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081381, + "balance_loss_mlp": 1.04623771, + "epoch": 0.22143131973836092, + "flos": 492002270208.0, + "grad_norm": 0.0527541061714234, + "language_loss": 0.88156152, + "learning_rate": 0.0009069481770060939, + "loss": 0.89237529, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.35180664, + "step": 1151, + "time_per_iteration": 2.6539950370788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084376, + "balance_loss_mlp": 1.04811299, + "epoch": 0.22162370142362448, + "flos": 1079227459584.0, + "grad_norm": 0.06610336138884995, + "language_loss": 0.83768857, + "learning_rate": 0.000906767088796548, + "loss": 0.84853232, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.36279297, + "step": 1152, + "time_per_iteration": 3.4304041862487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087423, + "balance_loss_mlp": 1.05147004, + "epoch": 0.22181608310888803, + "flos": 492258345984.0, + "grad_norm": 0.06692160227790218, + "language_loss": 0.87012255, + "learning_rate": 0.0009065858426660127, + "loss": 0.88099682, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.35986328, + "step": 1153, + "time_per_iteration": 2.639326333999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089801, + "balance_loss_mlp": 1.05480099, + "epoch": 0.2220084647941516, + "flos": 723687892992.0, + "grad_norm": 0.07963844060104928, + "language_loss": 0.84658396, + "learning_rate": 0.0009064044386848543, + "loss": 0.85748196, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.3503418, + "step": 1154, + "time_per_iteration": 2.904387950897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094705, + "balance_loss_mlp": 1.05992007, + "epoch": 0.22220084647941515, + "flos": 488988997632.0, + "grad_norm": 0.07985092329826342, + "language_loss": 0.88786525, + "learning_rate": 0.0009062228769234997, + "loss": 0.89881229, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.34838867, + "step": 1155, + "time_per_iteration": 2.547041416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095087, + "balance_loss_mlp": 1.05977738, + "epoch": 0.2223932281646787, + "flos": 536025696768.0, + "grad_norm": 0.067267193175655, + "language_loss": 0.80872244, + "learning_rate": 0.0009060411574524376, + "loss": 0.81967336, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.35327148, + "step": 1156, + "time_per_iteration": 2.647101402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100356, + "balance_loss_mlp": 1.06561852, + "epoch": 0.22258560984994227, + "flos": 931034580480.0, + "grad_norm": 0.07018019580992392, + "language_loss": 0.87947989, + "learning_rate": 0.0009058592803422178, + "loss": 0.8904835, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.34765625, + "step": 1157, + "time_per_iteration": 3.161827564239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087783, + "balance_loss_mlp": 1.05688405, + "epoch": 0.22277799153520586, + "flos": 1198998443520.0, + "grad_norm": 0.0269537140509509, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79798073, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.30859375, + "step": 1158, + "time_per_iteration": 4.827271223068237 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100633, + "balance_loss_mlp": 1.06608617, + "epoch": 0.22297037322046942, + "flos": 501052262400.0, + "grad_norm": 0.10870396219255896, + "language_loss": 0.89957273, + "learning_rate": 0.00090549505348681, + "loss": 0.91057909, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.34594727, + "step": 1159, + "time_per_iteration": 2.5724213123321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115899, + "balance_loss_mlp": 1.08144796, + "epoch": 0.22316275490573298, + "flos": 752413612032.0, + "grad_norm": 0.06607938149323832, + "language_loss": 0.83976638, + "learning_rate": 0.0009053127038830275, + "loss": 0.85092539, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.3449707, + "step": 1160, + "time_per_iteration": 2.979442834854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108838, + "balance_loss_mlp": 1.07538772, + "epoch": 0.22335513659099654, + "flos": 514553057280.0, + "grad_norm": 0.07010640296313479, + "language_loss": 0.86946774, + "learning_rate": 0.000905130196922898, + "loss": 0.88055611, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.3347168, + "step": 1161, + "time_per_iteration": 2.582780361175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114166, + "balance_loss_mlp": 1.0797379, + "epoch": 0.2235475182762601, + "flos": 484286501376.0, + "grad_norm": 0.056850955952103474, + "language_loss": 0.86954904, + "learning_rate": 0.0009049475326772769, + "loss": 0.88069069, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.34472656, + "step": 1162, + "time_per_iteration": 2.572434902191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116085, + "balance_loss_mlp": 1.08270645, + "epoch": 0.22373989996152366, + "flos": 469698794496.0, + "grad_norm": 0.07142312953148652, + "language_loss": 0.82233834, + "learning_rate": 0.0009047647112170811, + "loss": 0.83349919, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.33398438, + "step": 1163, + "time_per_iteration": 2.7467033863067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104962, + "balance_loss_mlp": 1.07115388, + "epoch": 0.22393228164678722, + "flos": 1270512594432.0, + "grad_norm": 0.07009650422776509, + "language_loss": 0.87291974, + "learning_rate": 0.0009045817326132876, + "loss": 0.88396937, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.33837891, + "step": 1164, + "time_per_iteration": 3.6699986457824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096597, + "balance_loss_mlp": 1.06150198, + "epoch": 0.22412466333205078, + "flos": 596052449280.0, + "grad_norm": 0.07687995911666942, + "language_loss": 0.8312459, + "learning_rate": 0.0009043985969369357, + "loss": 0.84221184, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.35131836, + "step": 1165, + "time_per_iteration": 2.8716225624084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099545, + "balance_loss_mlp": 1.06461644, + "epoch": 0.22431704501731436, + "flos": 608136062976.0, + "grad_norm": 0.062241931717823204, + "language_loss": 0.84419966, + "learning_rate": 0.0009042153042591245, + "loss": 0.85519511, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.34960938, + "step": 1166, + "time_per_iteration": 2.8038439750671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094194, + "balance_loss_mlp": 1.05971861, + "epoch": 0.22450942670257792, + "flos": 906203842560.0, + "grad_norm": 0.05754676867835885, + "language_loss": 0.85229421, + "learning_rate": 0.0009040318546510146, + "loss": 0.86323619, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.3449707, + "step": 1167, + "time_per_iteration": 3.166391372680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101578, + "balance_loss_mlp": 1.06672144, + "epoch": 0.22470180838784148, + "flos": 565032222720.0, + "grad_norm": 0.06328547350255756, + "language_loss": 0.84822267, + "learning_rate": 0.0009038482481838275, + "loss": 0.85923845, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.34887695, + "step": 1168, + "time_per_iteration": 2.6582534313201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092575, + "balance_loss_mlp": 1.05726552, + "epoch": 0.22489419007310504, + "flos": 834109443072.0, + "grad_norm": 0.05398415615287821, + "language_loss": 0.8685748, + "learning_rate": 0.0009036644849288455, + "loss": 0.87950051, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.35327148, + "step": 1169, + "time_per_iteration": 3.131391763687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102043, + "balance_loss_mlp": 1.06735337, + "epoch": 0.2250865717583686, + "flos": 580788237312.0, + "grad_norm": 0.06156740204868492, + "language_loss": 0.85189641, + "learning_rate": 0.0009034805649574118, + "loss": 0.86291689, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.34716797, + "step": 1170, + "time_per_iteration": 2.662177801132202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093313, + "balance_loss_mlp": 1.05991113, + "epoch": 0.22527895344363216, + "flos": 600091435008.0, + "grad_norm": 0.07489985201842045, + "language_loss": 0.85256809, + "learning_rate": 0.0009032964883409308, + "loss": 0.86350119, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.33422852, + "step": 1171, + "time_per_iteration": 2.872305393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102332, + "balance_loss_mlp": 0.9971894, + "epoch": 0.22547133512889572, + "flos": 1440009073152.0, + "grad_norm": 0.01784679187957182, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74073857, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.26171875, + "step": 1172, + "time_per_iteration": 4.968618154525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090705, + "balance_loss_mlp": 1.05649197, + "epoch": 0.22566371681415928, + "flos": 490377065472.0, + "grad_norm": 0.05674331384718379, + "language_loss": 0.87210125, + "learning_rate": 0.0009029278654587462, + "loss": 0.88300836, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.3425293, + "step": 1173, + "time_per_iteration": 2.5812408924102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085168, + "balance_loss_mlp": 1.05043077, + "epoch": 0.22585609849942284, + "flos": 604334214144.0, + "grad_norm": 0.06970392839419266, + "language_loss": 0.82089472, + "learning_rate": 0.0009027433193361548, + "loss": 0.83174634, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.34765625, + "step": 1174, + "time_per_iteration": 2.7284860610961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090288, + "balance_loss_mlp": 1.0550499, + "epoch": 0.22604848018468643, + "flos": 635280247296.0, + "grad_norm": 0.05615396633220104, + "language_loss": 0.86867499, + "learning_rate": 0.00090255861685474, + "loss": 0.87957788, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.3527832, + "step": 1175, + "time_per_iteration": 2.7265548706054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085885, + "balance_loss_mlp": 1.05040812, + "epoch": 0.22624086186995, + "flos": 479633467392.0, + "grad_norm": 0.06159717434172949, + "language_loss": 0.91109395, + "learning_rate": 0.0009023737580862095, + "loss": 0.92195278, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.35473633, + "step": 1176, + "time_per_iteration": 2.5320050716400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089039, + "balance_loss_mlp": 1.05468273, + "epoch": 0.22643324355521355, + "flos": 495566982144.0, + "grad_norm": 0.05820331342721636, + "language_loss": 0.82901466, + "learning_rate": 0.0009021887431023321, + "loss": 0.83990508, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.34399414, + "step": 1177, + "time_per_iteration": 2.619271755218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094278, + "balance_loss_mlp": 1.05939722, + "epoch": 0.2266256252404771, + "flos": 561271071744.0, + "grad_norm": 0.05650773027793175, + "language_loss": 0.86773884, + "learning_rate": 0.0009020035719749369, + "loss": 0.8786816, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.34912109, + "step": 1178, + "time_per_iteration": 2.7209300994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010885, + "balance_loss_mlp": 1.05536032, + "epoch": 0.22681800692574067, + "flos": 579353527296.0, + "grad_norm": 0.07505314575513819, + "language_loss": 0.77450001, + "learning_rate": 0.0009018182447759136, + "loss": 0.78538495, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.33154297, + "step": 1179, + "time_per_iteration": 2.957627534866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092549, + "balance_loss_mlp": 1.05793107, + "epoch": 0.22701038861100423, + "flos": 739842577920.0, + "grad_norm": 0.0724719412784609, + "language_loss": 0.79327267, + "learning_rate": 0.0009016327615772126, + "loss": 0.80419827, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.34619141, + "step": 1180, + "time_per_iteration": 2.9636237621307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098683, + "balance_loss_mlp": 1.06425512, + "epoch": 0.2272027702962678, + "flos": 576996562944.0, + "grad_norm": 0.06868963719018656, + "language_loss": 0.87725425, + "learning_rate": 0.0009014471224508451, + "loss": 0.88824105, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.34448242, + "step": 1181, + "time_per_iteration": 2.6756978034973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101065, + "balance_loss_mlp": 1.06725717, + "epoch": 0.22739515198153135, + "flos": 544012098048.0, + "grad_norm": 0.08625014316755293, + "language_loss": 0.8279528, + "learning_rate": 0.0009012613274688823, + "loss": 0.83896345, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.33837891, + "step": 1182, + "time_per_iteration": 2.679690361022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106597, + "balance_loss_mlp": 1.0716213, + "epoch": 0.22758753366679493, + "flos": 439932805632.0, + "grad_norm": 0.07160666852762332, + "language_loss": 0.87420428, + "learning_rate": 0.0009010753767034565, + "loss": 0.8852703, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.35009766, + "step": 1183, + "time_per_iteration": 2.56422758102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110957, + "balance_loss_mlp": 1.07514668, + "epoch": 0.2277799153520585, + "flos": 729104772096.0, + "grad_norm": 0.07593119142071596, + "language_loss": 0.7905606, + "learning_rate": 0.0009008892702267599, + "loss": 0.80167019, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.35839844, + "step": 1184, + "time_per_iteration": 2.96954345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138099, + "balance_loss_mlp": 1.10255075, + "epoch": 0.22797229703732205, + "flos": 526641053184.0, + "grad_norm": 0.08993468677273868, + "language_loss": 0.88719535, + "learning_rate": 0.0009007030081110457, + "loss": 0.89857626, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.35571289, + "step": 1185, + "time_per_iteration": 2.639239549636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124853, + "balance_loss_mlp": 1.08923352, + "epoch": 0.2281646787225856, + "flos": 535159954944.0, + "grad_norm": 0.08461110053036625, + "language_loss": 0.84618473, + "learning_rate": 0.000900516590428627, + "loss": 0.85743326, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.35668945, + "step": 1186, + "time_per_iteration": 2.6506764888763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120731, + "balance_loss_mlp": 1.08637488, + "epoch": 0.22835706040784917, + "flos": 541107924480.0, + "grad_norm": 0.07299458038970587, + "language_loss": 0.89267749, + "learning_rate": 0.0009003300172518778, + "loss": 0.90388483, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.34399414, + "step": 1187, + "time_per_iteration": 2.6919267177581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107415, + "balance_loss_mlp": 1.07291603, + "epoch": 0.22854944209311273, + "flos": 790297012224.0, + "grad_norm": 0.06786881834878318, + "language_loss": 0.83963048, + "learning_rate": 0.0009001432886532321, + "loss": 0.85070467, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.34521484, + "step": 1188, + "time_per_iteration": 2.9668681621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103209, + "balance_loss_mlp": 1.07002091, + "epoch": 0.2287418237783763, + "flos": 469047020544.0, + "grad_norm": 0.06096375157572686, + "language_loss": 0.86560941, + "learning_rate": 0.0008999564047051843, + "loss": 0.87664151, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.33203125, + "step": 1189, + "time_per_iteration": 2.520157814025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103848, + "balance_loss_mlp": 1.07070816, + "epoch": 0.22893420546363985, + "flos": 467786990592.0, + "grad_norm": 0.07257222459915597, + "language_loss": 0.84934878, + "learning_rate": 0.0008997693654802894, + "loss": 0.86038733, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.33154297, + "step": 1190, + "time_per_iteration": 2.6376004219055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117207, + "balance_loss_mlp": 1.08375657, + "epoch": 0.22912658714890344, + "flos": 625974179328.0, + "grad_norm": 0.056681488577390256, + "language_loss": 0.86392069, + "learning_rate": 0.0008995821710511625, + "loss": 0.87509274, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.3347168, + "step": 1191, + "time_per_iteration": 2.727444887161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116667, + "balance_loss_mlp": 1.08369398, + "epoch": 0.229318968834167, + "flos": 502785156096.0, + "grad_norm": 0.06323137320540088, + "language_loss": 0.85004956, + "learning_rate": 0.0008993948214904786, + "loss": 0.86121625, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.32983398, + "step": 1192, + "time_per_iteration": 2.5774295330047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086531, + "balance_loss_mlp": 1.06097257, + "epoch": 0.22951135051943056, + "flos": 1374108544512.0, + "grad_norm": 0.030992800338245956, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79508746, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.25585938, + "step": 1193, + "time_per_iteration": 4.854384422302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122219, + "balance_loss_mlp": 1.08934152, + "epoch": 0.22970373220469412, + "flos": 644045050368.0, + "grad_norm": 0.06852039575110529, + "language_loss": 0.7808823, + "learning_rate": 0.0008990196572654427, + "loss": 0.79210448, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.32861328, + "step": 1194, + "time_per_iteration": 2.873081684112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112553, + "balance_loss_mlp": 1.07943714, + "epoch": 0.22989611388995768, + "flos": 499945001472.0, + "grad_norm": 0.05701230798072306, + "language_loss": 0.87415946, + "learning_rate": 0.0008988318427467426, + "loss": 0.88528502, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.33105469, + "step": 1195, + "time_per_iteration": 2.702685832977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109796, + "balance_loss_mlp": 1.06522477, + "epoch": 0.23008849557522124, + "flos": 1096071796224.0, + "grad_norm": 0.06940657308766013, + "language_loss": 0.85968834, + "learning_rate": 0.0008986438733877887, + "loss": 0.87066793, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.32739258, + "step": 1196, + "time_per_iteration": 3.4571969509124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096888, + "balance_loss_mlp": 1.06482017, + "epoch": 0.2302808772604848, + "flos": 683313546240.0, + "grad_norm": 0.04726997036122248, + "language_loss": 0.83756924, + "learning_rate": 0.0008984557492615576, + "loss": 0.8485381, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.32055664, + "step": 1197, + "time_per_iteration": 2.9306819438934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090156, + "balance_loss_mlp": 1.05718327, + "epoch": 0.23047325894574835, + "flos": 528664928256.0, + "grad_norm": 0.05994921168989351, + "language_loss": 0.89349306, + "learning_rate": 0.0008982674704410854, + "loss": 0.90439463, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.32983398, + "step": 1198, + "time_per_iteration": 2.706496238708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089604, + "balance_loss_mlp": 1.05648804, + "epoch": 0.23066564063101191, + "flos": 682427455488.0, + "grad_norm": 0.06548245075345789, + "language_loss": 0.7739616, + "learning_rate": 0.0008980790369994682, + "loss": 0.78485769, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.33129883, + "step": 1199, + "time_per_iteration": 2.962169647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109754, + "balance_loss_mlp": 1.06375623, + "epoch": 0.2308580223162755, + "flos": 558247624704.0, + "grad_norm": 0.06722903582933262, + "language_loss": 0.86851013, + "learning_rate": 0.000897890449009863, + "loss": 0.87948549, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.33813477, + "step": 1200, + "time_per_iteration": 2.6820433139801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092921, + "balance_loss_mlp": 1.05877972, + "epoch": 0.23105040400153906, + "flos": 555406060032.0, + "grad_norm": 0.051980143810921, + "language_loss": 0.89933294, + "learning_rate": 0.0008977017065454853, + "loss": 0.91026211, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.34179688, + "step": 1201, + "time_per_iteration": 2.6699435710906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098988, + "balance_loss_mlp": 1.0640595, + "epoch": 0.23124278568680262, + "flos": 704474855424.0, + "grad_norm": 0.0699249838794834, + "language_loss": 0.80333388, + "learning_rate": 0.0008975128096796121, + "loss": 0.81432372, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.34936523, + "step": 1202, + "time_per_iteration": 2.891552448272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096208, + "balance_loss_mlp": 1.0627346, + "epoch": 0.23143516737206618, + "flos": 612469002240.0, + "grad_norm": 0.08096245126913681, + "language_loss": 0.85447264, + "learning_rate": 0.0008973237584855794, + "loss": 0.86543471, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.33496094, + "step": 1203, + "time_per_iteration": 2.897143840789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109386, + "balance_loss_mlp": 1.06007552, + "epoch": 0.23162754905732974, + "flos": 389030238720.0, + "grad_norm": 0.07003086272099243, + "language_loss": 0.82261837, + "learning_rate": 0.0008971345530367832, + "loss": 0.83355689, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.33789062, + "step": 1204, + "time_per_iteration": 2.4648683071136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090052, + "balance_loss_mlp": 1.05619669, + "epoch": 0.2318199307425933, + "flos": 667481928192.0, + "grad_norm": 0.0706025487590865, + "language_loss": 0.84670615, + "learning_rate": 0.0008969451934066799, + "loss": 0.85760665, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.33862305, + "step": 1205, + "time_per_iteration": 2.7628865242004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096032, + "balance_loss_mlp": 1.06274843, + "epoch": 0.23201231242785686, + "flos": 666093860352.0, + "grad_norm": 0.07866862210425928, + "language_loss": 0.79702371, + "learning_rate": 0.0008967556796687854, + "loss": 0.80798399, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.33276367, + "step": 1206, + "time_per_iteration": 2.8876569271087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099743, + "balance_loss_mlp": 1.06746101, + "epoch": 0.23220469411312042, + "flos": 748498281984.0, + "grad_norm": 0.05955020850576899, + "language_loss": 0.83383894, + "learning_rate": 0.0008965660118966752, + "loss": 0.84483635, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.32275391, + "step": 1207, + "time_per_iteration": 2.8915722370147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092945, + "balance_loss_mlp": 1.06087792, + "epoch": 0.232397075798384, + "flos": 666763163136.0, + "grad_norm": 0.05733195861059391, + "language_loss": 0.89860612, + "learning_rate": 0.0008963761901639851, + "loss": 0.90953553, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.32055664, + "step": 1208, + "time_per_iteration": 2.839872121810913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100551, + "balance_loss_mlp": 1.06843603, + "epoch": 0.23258945748364757, + "flos": 609937357824.0, + "grad_norm": 0.0677808606719883, + "language_loss": 0.83122128, + "learning_rate": 0.0008961862145444103, + "loss": 0.84222686, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.32104492, + "step": 1209, + "time_per_iteration": 2.723395824432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109998, + "balance_loss_mlp": 1.07726288, + "epoch": 0.23278183916891113, + "flos": 489397842432.0, + "grad_norm": 0.06757554355714504, + "language_loss": 0.8539983, + "learning_rate": 0.0008959960851117059, + "loss": 0.86509824, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.32739258, + "step": 1210, + "time_per_iteration": 2.5843160152435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112763, + "balance_loss_mlp": 1.08055305, + "epoch": 0.23297422085417469, + "flos": 511314232320.0, + "grad_norm": 0.06719057665627333, + "language_loss": 0.83744979, + "learning_rate": 0.0008958058019396868, + "loss": 0.84857744, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.32202148, + "step": 1211, + "time_per_iteration": 2.790137529373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110465, + "balance_loss_mlp": 1.07865953, + "epoch": 0.23316660253943824, + "flos": 546145072128.0, + "grad_norm": 0.061561154104104274, + "language_loss": 0.86634141, + "learning_rate": 0.0008956153651022274, + "loss": 0.877446, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.31787109, + "step": 1212, + "time_per_iteration": 2.6943769454956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107151, + "balance_loss_mlp": 1.07506013, + "epoch": 0.2333589842247018, + "flos": 509998947840.0, + "grad_norm": 0.056352889191353187, + "language_loss": 0.84060359, + "learning_rate": 0.0008954247746732618, + "loss": 0.85167515, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.32080078, + "step": 1213, + "time_per_iteration": 2.635540723800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105894, + "balance_loss_mlp": 1.07504261, + "epoch": 0.23355136590996536, + "flos": 662834686464.0, + "grad_norm": 0.059598265922157306, + "language_loss": 0.90450746, + "learning_rate": 0.0008952340307267837, + "loss": 0.91556644, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.30810547, + "step": 1214, + "time_per_iteration": 2.8842196464538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098908, + "balance_loss_mlp": 1.06817579, + "epoch": 0.23374374759522892, + "flos": 508206417408.0, + "grad_norm": 0.059513387141436946, + "language_loss": 0.83485198, + "learning_rate": 0.0008950431333368468, + "loss": 0.84584105, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.30688477, + "step": 1215, + "time_per_iteration": 2.606269121170044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098575, + "balance_loss_mlp": 1.06662679, + "epoch": 0.2339361292804925, + "flos": 1293964028928.0, + "grad_norm": 0.05495395288746111, + "language_loss": 0.84313607, + "learning_rate": 0.0008948520825775634, + "loss": 0.85412186, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.31933594, + "step": 1216, + "time_per_iteration": 3.6454994678497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099032, + "balance_loss_mlp": 1.06782317, + "epoch": 0.23412851096575607, + "flos": 705617021952.0, + "grad_norm": 0.06066187191945671, + "language_loss": 0.83935732, + "learning_rate": 0.0008946608785231067, + "loss": 0.85034764, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.31176758, + "step": 1217, + "time_per_iteration": 2.9157872200012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098088, + "balance_loss_mlp": 1.06599677, + "epoch": 0.23432089265101963, + "flos": 438036968448.0, + "grad_norm": 0.058216777953853424, + "language_loss": 0.84654021, + "learning_rate": 0.0008944695212477084, + "loss": 0.85752106, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.32080078, + "step": 1218, + "time_per_iteration": 2.473067045211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103602, + "balance_loss_mlp": 1.07158232, + "epoch": 0.2345132743362832, + "flos": 480697058304.0, + "grad_norm": 0.06075167680795146, + "language_loss": 0.86133409, + "learning_rate": 0.0008942780108256599, + "loss": 0.87237012, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.32006836, + "step": 1219, + "time_per_iteration": 2.581594705581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100646, + "balance_loss_mlp": 1.06819737, + "epoch": 0.23470565602154675, + "flos": 411231817728.0, + "grad_norm": 0.07971641299609675, + "language_loss": 0.86269408, + "learning_rate": 0.0008940863473313121, + "loss": 0.87370056, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.32446289, + "step": 1220, + "time_per_iteration": 2.453798532485962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108448, + "balance_loss_mlp": 1.0764761, + "epoch": 0.2348980377068103, + "flos": 545189170176.0, + "grad_norm": 0.07248436265958902, + "language_loss": 0.87226778, + "learning_rate": 0.0008938945308390756, + "loss": 0.88335222, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.31958008, + "step": 1221, + "time_per_iteration": 2.6299164295196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092799, + "balance_loss_mlp": 1.06099391, + "epoch": 0.23509041939207387, + "flos": 575465900544.0, + "grad_norm": 0.0746326386118845, + "language_loss": 0.86801684, + "learning_rate": 0.00089370256142342, + "loss": 0.87894481, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.31787109, + "step": 1222, + "time_per_iteration": 2.7373716831207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099884, + "balance_loss_mlp": 1.0675782, + "epoch": 0.23528280107733743, + "flos": 588568025088.0, + "grad_norm": 0.06792905088784162, + "language_loss": 0.84961808, + "learning_rate": 0.0008935104391588746, + "loss": 0.86061692, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.32299805, + "step": 1223, + "time_per_iteration": 2.786801338195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101357, + "balance_loss_mlp": 1.06850326, + "epoch": 0.235475182762601, + "flos": 822948235776.0, + "grad_norm": 0.053660170998325075, + "language_loss": 0.8281433, + "learning_rate": 0.0008933181641200276, + "loss": 0.83915687, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.32861328, + "step": 1224, + "time_per_iteration": 3.1502432823181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102432, + "balance_loss_mlp": 1.06948209, + "epoch": 0.23566756444786457, + "flos": 679865287680.0, + "grad_norm": 0.06465671729424353, + "language_loss": 0.85675979, + "learning_rate": 0.0008931257363815271, + "loss": 0.86778408, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.32958984, + "step": 1225, + "time_per_iteration": 2.9370880126953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110561, + "balance_loss_mlp": 1.07370961, + "epoch": 0.23585994613312813, + "flos": 701481931776.0, + "grad_norm": 0.07282820073226746, + "language_loss": 0.89753437, + "learning_rate": 0.0008929331560180798, + "loss": 0.9085905, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.31884766, + "step": 1226, + "time_per_iteration": 2.977869749069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122954, + "balance_loss_mlp": 1.09045768, + "epoch": 0.2360523278183917, + "flos": 523923144192.0, + "grad_norm": 0.053569811561680475, + "language_loss": 0.90818799, + "learning_rate": 0.0008927404231044525, + "loss": 0.91941756, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.32495117, + "step": 1227, + "time_per_iteration": 2.683979034423828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111641, + "balance_loss_mlp": 1.07909656, + "epoch": 0.23624470950365525, + "flos": 524027860992.0, + "grad_norm": 0.06109587035495086, + "language_loss": 0.81612283, + "learning_rate": 0.0008925475377154703, + "loss": 0.82723922, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.32543945, + "step": 1228, + "time_per_iteration": 2.734614610671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119771, + "balance_loss_mlp": 1.08577275, + "epoch": 0.2364370911889188, + "flos": 596525313024.0, + "grad_norm": 0.06451716518904643, + "language_loss": 0.82344091, + "learning_rate": 0.0008923544999260183, + "loss": 0.83463866, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.34033203, + "step": 1229, + "time_per_iteration": 2.740309000015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108587, + "balance_loss_mlp": 1.07561386, + "epoch": 0.23662947287418237, + "flos": 756519588864.0, + "grad_norm": 0.0665465772726836, + "language_loss": 0.91460836, + "learning_rate": 0.00089216130981104, + "loss": 0.92569423, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.32983398, + "step": 1230, + "time_per_iteration": 3.1343088150024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103991, + "balance_loss_mlp": 1.07120848, + "epoch": 0.23682185455944593, + "flos": 545907935232.0, + "grad_norm": 0.061759964990198334, + "language_loss": 0.81970417, + "learning_rate": 0.000891967967445539, + "loss": 0.83074409, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.32788086, + "step": 1231, + "time_per_iteration": 2.67669677734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100144, + "balance_loss_mlp": 1.06829166, + "epoch": 0.2370142362447095, + "flos": 661977709056.0, + "grad_norm": 0.04660382532121484, + "language_loss": 0.88927996, + "learning_rate": 0.0008917744729045772, + "loss": 0.90028143, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.31835938, + "step": 1232, + "time_per_iteration": 2.87488055229187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098328, + "balance_loss_mlp": 1.06695223, + "epoch": 0.23720661792997308, + "flos": 683361598464.0, + "grad_norm": 0.054845027384176535, + "language_loss": 0.83439517, + "learning_rate": 0.0008915808262632757, + "loss": 0.84537846, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.31347656, + "step": 1233, + "time_per_iteration": 2.884615659713745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111142, + "balance_loss_mlp": 1.0800519, + "epoch": 0.23739899961523664, + "flos": 558631738368.0, + "grad_norm": 0.058607558308664987, + "language_loss": 0.93242431, + "learning_rate": 0.0008913870275968148, + "loss": 0.94353569, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.31054688, + "step": 1234, + "time_per_iteration": 2.7355458736419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110869, + "balance_loss_mlp": 1.07740974, + "epoch": 0.2375913813005002, + "flos": 889144128000.0, + "grad_norm": 0.0661901036623414, + "language_loss": 0.87537754, + "learning_rate": 0.0008911930769804342, + "loss": 0.88646448, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.3125, + "step": 1235, + "time_per_iteration": 3.247985363006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115104, + "balance_loss_mlp": 1.08396649, + "epoch": 0.23778376298576376, + "flos": 640810607616.0, + "grad_norm": 0.053926277509791044, + "language_loss": 0.90842855, + "learning_rate": 0.0008909989744894318, + "loss": 0.91957957, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.31103516, + "step": 1236, + "time_per_iteration": 2.8457424640655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116546, + "balance_loss_mlp": 1.08598089, + "epoch": 0.23797614467102732, + "flos": 616540073472.0, + "grad_norm": 0.07410834458794652, + "language_loss": 0.81166267, + "learning_rate": 0.0008908047201991649, + "loss": 0.82282805, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.30517578, + "step": 1237, + "time_per_iteration": 2.743232011795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103348, + "balance_loss_mlp": 1.07218719, + "epoch": 0.23816852635629088, + "flos": 623941539840.0, + "grad_norm": 0.0897055957170317, + "language_loss": 0.8615526, + "learning_rate": 0.0008906103141850502, + "loss": 0.87258613, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.3112793, + "step": 1238, + "time_per_iteration": 2.8931751251220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103231, + "balance_loss_mlp": 1.07164085, + "epoch": 0.23836090804155444, + "flos": 521180504064.0, + "grad_norm": 0.0595559706342315, + "language_loss": 0.87583494, + "learning_rate": 0.0008904157565225621, + "loss": 0.88686728, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.31567383, + "step": 1239, + "time_per_iteration": 2.681567430496216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096601, + "balance_loss_mlp": 1.06546402, + "epoch": 0.238553289726818, + "flos": 1153527616512.0, + "grad_norm": 0.07926394914951292, + "language_loss": 0.81636947, + "learning_rate": 0.000890221047287235, + "loss": 0.82733548, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.31103516, + "step": 1240, + "time_per_iteration": 3.5042829513549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096214, + "balance_loss_mlp": 1.06450391, + "epoch": 0.23874567141208156, + "flos": 499600175616.0, + "grad_norm": 0.06383986480013222, + "language_loss": 0.90398014, + "learning_rate": 0.0008900261865546615, + "loss": 0.91494226, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.31689453, + "step": 1241, + "time_per_iteration": 2.656243324279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098327, + "balance_loss_mlp": 1.06533027, + "epoch": 0.23893805309734514, + "flos": 556657325568.0, + "grad_norm": 0.07463092576288201, + "language_loss": 0.84907639, + "learning_rate": 0.0008898311744004936, + "loss": 0.86005968, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.33007812, + "step": 1242, + "time_per_iteration": 2.7337045669555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089408, + "balance_loss_mlp": 1.05583906, + "epoch": 0.2391304347826087, + "flos": 549009957888.0, + "grad_norm": 0.057670085451747476, + "language_loss": 0.86718595, + "learning_rate": 0.0008896360109004414, + "loss": 0.87808001, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.3359375, + "step": 1243, + "time_per_iteration": 2.6334750652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090579, + "balance_loss_mlp": 1.05667567, + "epoch": 0.23932281646787226, + "flos": 515794148352.0, + "grad_norm": 0.055695642571784755, + "language_loss": 0.84363699, + "learning_rate": 0.0008894406961302742, + "loss": 0.85454273, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.33935547, + "step": 1244, + "time_per_iteration": 2.612278699874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092282, + "balance_loss_mlp": 1.05840266, + "epoch": 0.23951519815313582, + "flos": 743353445376.0, + "grad_norm": 0.053835846346086756, + "language_loss": 0.83682489, + "learning_rate": 0.0008892452301658201, + "loss": 0.84774774, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.33911133, + "step": 1245, + "time_per_iteration": 2.999476432800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095498, + "balance_loss_mlp": 1.06169045, + "epoch": 0.23970757983839938, + "flos": 553855048704.0, + "grad_norm": 0.07830491582761978, + "language_loss": 0.83242297, + "learning_rate": 0.0008890496130829653, + "loss": 0.84337801, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.33837891, + "step": 1246, + "time_per_iteration": 2.6750991344451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093391, + "balance_loss_mlp": 1.05913019, + "epoch": 0.23989996152366294, + "flos": 480416251392.0, + "grad_norm": 0.06104300334873528, + "language_loss": 0.85340333, + "learning_rate": 0.0008888538449576555, + "loss": 0.86433721, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.34301758, + "step": 1247, + "time_per_iteration": 2.5646800994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095388, + "balance_loss_mlp": 1.06131816, + "epoch": 0.2400923432089265, + "flos": 485069285376.0, + "grad_norm": 0.05789610317969602, + "language_loss": 0.82348001, + "learning_rate": 0.0008886579258658944, + "loss": 0.83443391, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.34082031, + "step": 1248, + "time_per_iteration": 2.562016487121582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087072, + "balance_loss_mlp": 1.05283499, + "epoch": 0.24028472489419006, + "flos": 623247505920.0, + "grad_norm": 0.05381401206887855, + "language_loss": 0.84731787, + "learning_rate": 0.0008884618558837446, + "loss": 0.85818857, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.34277344, + "step": 1249, + "time_per_iteration": 2.8163750171661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093014, + "balance_loss_mlp": 1.05927801, + "epoch": 0.24047710657945365, + "flos": 601302002688.0, + "grad_norm": 0.06053052424994898, + "language_loss": 0.86413568, + "learning_rate": 0.0008882656350873273, + "loss": 0.8750658, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.33764648, + "step": 1250, + "time_per_iteration": 2.844723701477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088368, + "balance_loss_mlp": 1.05546594, + "epoch": 0.2406694882647172, + "flos": 841199579136.0, + "grad_norm": 0.06849099956300345, + "language_loss": 0.87088066, + "learning_rate": 0.0008880692635528219, + "loss": 0.88176429, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.32910156, + "step": 1251, + "time_per_iteration": 3.0528526306152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108089, + "balance_loss_mlp": 1.048823, + "epoch": 0.24086186994998077, + "flos": 526789440000.0, + "grad_norm": 0.06290905233547327, + "language_loss": 0.88876319, + "learning_rate": 0.0008878727413564669, + "loss": 0.89957213, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.32055664, + "step": 1252, + "time_per_iteration": 2.758507251739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077691, + "balance_loss_mlp": 1.05194211, + "epoch": 0.24105425163524433, + "flos": 1337464673280.0, + "grad_norm": 0.04466256972049361, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81213295, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.2578125, + "step": 1253, + "time_per_iteration": 4.847649097442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088304, + "balance_loss_mlp": 1.05616474, + "epoch": 0.24124663332050789, + "flos": 613822164480.0, + "grad_norm": 0.059681429897919615, + "language_loss": 0.78408957, + "learning_rate": 0.0008874792452834528, + "loss": 0.79497254, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.32128906, + "step": 1254, + "time_per_iteration": 2.754746198654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091821, + "balance_loss_mlp": 1.06061172, + "epoch": 0.24143901500577145, + "flos": 575278225920.0, + "grad_norm": 0.07362958371245172, + "language_loss": 0.87187612, + "learning_rate": 0.0008872822715595626, + "loss": 0.88279426, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.31176758, + "step": 1255, + "time_per_iteration": 2.662929058074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109326, + "balance_loss_mlp": 1.06200314, + "epoch": 0.241631396691035, + "flos": 494941349376.0, + "grad_norm": 0.08064600620778418, + "language_loss": 0.86789644, + "learning_rate": 0.0008870851474793598, + "loss": 0.87882906, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.31225586, + "step": 1256, + "time_per_iteration": 2.550830841064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096341, + "balance_loss_mlp": 1.06434524, + "epoch": 0.24182377837629856, + "flos": 635891323392.0, + "grad_norm": 0.05836545436632832, + "language_loss": 0.89218223, + "learning_rate": 0.0008868878731193752, + "loss": 0.90314561, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.31982422, + "step": 1257, + "time_per_iteration": 2.850184440612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095001, + "balance_loss_mlp": 1.06400657, + "epoch": 0.24201616006156215, + "flos": 514938580992.0, + "grad_norm": 0.05536217997614851, + "language_loss": 0.89056414, + "learning_rate": 0.0008866904485561973, + "loss": 0.90151417, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.30957031, + "step": 1258, + "time_per_iteration": 2.7176461219787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107248, + "balance_loss_mlp": 1.0765636, + "epoch": 0.2422085417468257, + "flos": 614837703168.0, + "grad_norm": 0.0620425495695956, + "language_loss": 0.82697642, + "learning_rate": 0.000886492873866473, + "loss": 0.83804893, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.30639648, + "step": 1259, + "time_per_iteration": 2.881246328353882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106717, + "balance_loss_mlp": 1.07631803, + "epoch": 0.24240092343208927, + "flos": 585515464704.0, + "grad_norm": 0.0764912621319216, + "language_loss": 0.84458697, + "learning_rate": 0.000886295149126908, + "loss": 0.85565412, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.3034668, + "step": 1260, + "time_per_iteration": 2.711789846420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102095, + "balance_loss_mlp": 1.07148254, + "epoch": 0.24259330511735283, + "flos": 761930675712.0, + "grad_norm": 0.05050860424869067, + "language_loss": 0.85437667, + "learning_rate": 0.0008860972744142655, + "loss": 0.86539763, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.30566406, + "step": 1261, + "time_per_iteration": 2.924192190170288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101503, + "balance_loss_mlp": 1.07146263, + "epoch": 0.2427856868026164, + "flos": 626566316544.0, + "grad_norm": 0.05198228858732316, + "language_loss": 0.81767958, + "learning_rate": 0.0008858992498053671, + "loss": 0.82869458, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.30004883, + "step": 1262, + "time_per_iteration": 2.8300395011901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069733, + "balance_loss_mlp": 1.04455626, + "epoch": 0.24297806848787995, + "flos": 1510840470528.0, + "grad_norm": 0.04093384265265131, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77658486, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.25195312, + "step": 1263, + "time_per_iteration": 4.837641716003418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103166, + "balance_loss_mlp": 1.07217157, + "epoch": 0.2431704501731435, + "flos": 541669538304.0, + "grad_norm": 0.05948216339756903, + "language_loss": 0.83247912, + "learning_rate": 0.0008855027512063817, + "loss": 0.84351087, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.30957031, + "step": 1264, + "time_per_iteration": 2.7277276515960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102812, + "balance_loss_mlp": 1.07191277, + "epoch": 0.24336283185840707, + "flos": 523588492800.0, + "grad_norm": 0.06194442365761257, + "language_loss": 0.8589493, + "learning_rate": 0.0008853042773702292, + "loss": 0.86997747, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.30859375, + "step": 1265, + "time_per_iteration": 2.7305567264556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103281, + "balance_loss_mlp": 1.07197642, + "epoch": 0.24355521354367063, + "flos": 536839004160.0, + "grad_norm": 0.0568893751116151, + "language_loss": 0.87145638, + "learning_rate": 0.0008851056539456896, + "loss": 0.88248914, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.31274414, + "step": 1266, + "time_per_iteration": 2.6886072158813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109953, + "balance_loss_mlp": 1.06767774, + "epoch": 0.24374759522893422, + "flos": 930050975232.0, + "grad_norm": 0.06669847345827673, + "language_loss": 0.81623918, + "learning_rate": 0.0008849068810098755, + "loss": 0.82723451, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.31835938, + "step": 1267, + "time_per_iteration": 3.302135705947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092019, + "balance_loss_mlp": 1.06049967, + "epoch": 0.24393997691419778, + "flos": 427564002816.0, + "grad_norm": 0.06302829877877653, + "language_loss": 0.82764143, + "learning_rate": 0.0008847079586399575, + "loss": 0.83856159, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.31494141, + "step": 1268, + "time_per_iteration": 2.469602584838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088761, + "balance_loss_mlp": 1.05755162, + "epoch": 0.24413235859946134, + "flos": 578582479872.0, + "grad_norm": 0.062034835544456234, + "language_loss": 0.85665154, + "learning_rate": 0.0008845088869131641, + "loss": 0.86753917, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.31176758, + "step": 1269, + "time_per_iteration": 2.6822941303253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090407, + "balance_loss_mlp": 1.05864954, + "epoch": 0.2443247402847249, + "flos": 529600481280.0, + "grad_norm": 0.06778965234687388, + "language_loss": 0.88905638, + "learning_rate": 0.0008843096659067818, + "loss": 0.8999604, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.31738281, + "step": 1270, + "time_per_iteration": 2.594064235687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087142, + "balance_loss_mlp": 1.05555153, + "epoch": 0.24451712196998845, + "flos": 695996651520.0, + "grad_norm": 0.05697237066827103, + "language_loss": 0.85987377, + "learning_rate": 0.000884110295698155, + "loss": 0.87074518, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.31567383, + "step": 1271, + "time_per_iteration": 2.974696636199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083849, + "balance_loss_mlp": 1.0512805, + "epoch": 0.24470950365525201, + "flos": 529575750144.0, + "grad_norm": 0.06068289501227115, + "language_loss": 0.85902673, + "learning_rate": 0.0008839107763646861, + "loss": 0.86986518, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.32568359, + "step": 1272, + "time_per_iteration": 2.607771158218384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085636, + "balance_loss_mlp": 1.0507555, + "epoch": 0.24490188534051557, + "flos": 491091448320.0, + "grad_norm": 0.061464799303267155, + "language_loss": 0.9008882, + "learning_rate": 0.0008837111079838353, + "loss": 0.91174459, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.34912109, + "step": 1273, + "time_per_iteration": 2.708512306213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107918, + "balance_loss_mlp": 1.0463264, + "epoch": 0.24509426702577913, + "flos": 473916842496.0, + "grad_norm": 0.06335862765515422, + "language_loss": 0.89847112, + "learning_rate": 0.000883511290633121, + "loss": 0.9092629, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.32861328, + "step": 1274, + "time_per_iteration": 2.5415730476379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077904, + "balance_loss_mlp": 1.04423904, + "epoch": 0.24528664871104272, + "flos": 550329624576.0, + "grad_norm": 0.04937694398035677, + "language_loss": 0.92408085, + "learning_rate": 0.000883311324390119, + "loss": 0.93485993, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.33691406, + "step": 1275, + "time_per_iteration": 2.734423875808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108035, + "balance_loss_mlp": 1.0457077, + "epoch": 0.24547903039630628, + "flos": 825546871296.0, + "grad_norm": 0.07292672859625873, + "language_loss": 0.80929816, + "learning_rate": 0.0008831112093324629, + "loss": 0.82010162, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.34667969, + "step": 1276, + "time_per_iteration": 3.0507287979125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076455, + "balance_loss_mlp": 1.04209912, + "epoch": 0.24567141208156984, + "flos": 591325221888.0, + "grad_norm": 0.0707858001482728, + "language_loss": 0.88982868, + "learning_rate": 0.0008829109455378444, + "loss": 0.90059322, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.34375, + "step": 1277, + "time_per_iteration": 2.6684513092041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076963, + "balance_loss_mlp": 1.04284549, + "epoch": 0.2458637937668334, + "flos": 547611715584.0, + "grad_norm": 0.05561589900472309, + "language_loss": 0.86233819, + "learning_rate": 0.000882710533084013, + "loss": 0.87310779, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.34155273, + "step": 1278, + "time_per_iteration": 2.623353958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074564, + "balance_loss_mlp": 1.04013681, + "epoch": 0.24605617545209696, + "flos": 515641379328.0, + "grad_norm": 0.04936271772538766, + "language_loss": 0.89139968, + "learning_rate": 0.0008825099720487755, + "loss": 0.90214527, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.34448242, + "step": 1279, + "time_per_iteration": 2.6549813747406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069233, + "balance_loss_mlp": 1.04853857, + "epoch": 0.24624855713736052, + "flos": 1510953951744.0, + "grad_norm": 0.028817901818472227, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76330376, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.20703125, + "step": 1280, + "time_per_iteration": 4.85357141494751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066002, + "balance_loss_mlp": 1.04521215, + "epoch": 0.24644093882262408, + "flos": 1526826419712.0, + "grad_norm": 0.026145975527968417, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79010111, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.20800781, + "step": 1281, + "time_per_iteration": 4.780989408493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083115, + "balance_loss_mlp": 1.04983163, + "epoch": 0.24663332050788764, + "flos": 658811667456.0, + "grad_norm": 0.06975718656823436, + "language_loss": 0.89050984, + "learning_rate": 0.0008819073982335619, + "loss": 0.90134096, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.33300781, + "step": 1282, + "time_per_iteration": 2.8345205783843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085728, + "balance_loss_mlp": 1.05361331, + "epoch": 0.24682570219315123, + "flos": 541510977024.0, + "grad_norm": 0.062337694406813374, + "language_loss": 0.84269708, + "learning_rate": 0.0008817062436519235, + "loss": 0.85355437, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.32104492, + "step": 1283, + "time_per_iteration": 2.6846866607666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089504, + "balance_loss_mlp": 1.05612516, + "epoch": 0.24701808387841478, + "flos": 440455131648.0, + "grad_norm": 0.06365108043104846, + "language_loss": 0.89943874, + "learning_rate": 0.0008815049408787788, + "loss": 0.91033375, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.33398438, + "step": 1284, + "time_per_iteration": 2.5116872787475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081689, + "balance_loss_mlp": 1.04916823, + "epoch": 0.24721046556367834, + "flos": 467826278400.0, + "grad_norm": 0.059551230096427064, + "language_loss": 0.85302055, + "learning_rate": 0.0008813034899922805, + "loss": 0.86383736, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.32519531, + "step": 1285, + "time_per_iteration": 2.5286993980407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080955, + "balance_loss_mlp": 1.04931688, + "epoch": 0.2474028472489419, + "flos": 504183398400.0, + "grad_norm": 0.06660544793665324, + "language_loss": 0.89506048, + "learning_rate": 0.0008811018910706387, + "loss": 0.90586996, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.31616211, + "step": 1286, + "time_per_iteration": 2.552616834640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079345, + "balance_loss_mlp": 1.04756403, + "epoch": 0.24759522893420546, + "flos": 479707660800.0, + "grad_norm": 0.07038813341767636, + "language_loss": 0.81879961, + "learning_rate": 0.0008809001441921211, + "loss": 0.82959306, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.31762695, + "step": 1287, + "time_per_iteration": 2.704249143600464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082412, + "balance_loss_mlp": 1.05132163, + "epoch": 0.24778761061946902, + "flos": 533446000128.0, + "grad_norm": 0.054805193397824324, + "language_loss": 0.85345185, + "learning_rate": 0.0008806982494350528, + "loss": 0.86427593, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.31054688, + "step": 1288, + "time_per_iteration": 2.65993070602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084976, + "balance_loss_mlp": 1.05359983, + "epoch": 0.24797999230473258, + "flos": 559513446912.0, + "grad_norm": 0.05430799794632807, + "language_loss": 0.90285796, + "learning_rate": 0.0008804962068778161, + "loss": 0.91370773, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.31347656, + "step": 1289, + "time_per_iteration": 2.8633711338043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086217, + "balance_loss_mlp": 1.05515075, + "epoch": 0.24817237398999614, + "flos": 623912426496.0, + "grad_norm": 0.06485439157304855, + "language_loss": 0.81069577, + "learning_rate": 0.0008802940165988511, + "loss": 0.82155788, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.31030273, + "step": 1290, + "time_per_iteration": 2.877063274383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084315, + "balance_loss_mlp": 1.05341625, + "epoch": 0.2483647556752597, + "flos": 611981581824.0, + "grad_norm": 0.058113292585204916, + "language_loss": 0.88358063, + "learning_rate": 0.000880091678676655, + "loss": 0.89442384, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.30859375, + "step": 1291, + "time_per_iteration": 2.800182342529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088307, + "balance_loss_mlp": 1.05814719, + "epoch": 0.2485571373605233, + "flos": 583270419456.0, + "grad_norm": 0.05744202885681841, + "language_loss": 0.88709044, + "learning_rate": 0.0008798891931897821, + "loss": 0.89797354, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.30126953, + "step": 1292, + "time_per_iteration": 2.8186981678009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090758, + "balance_loss_mlp": 1.06009781, + "epoch": 0.24874951904578685, + "flos": 494503391232.0, + "grad_norm": 0.06335011869227863, + "language_loss": 0.84085584, + "learning_rate": 0.0008796865602168447, + "loss": 0.85176343, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.30615234, + "step": 1293, + "time_per_iteration": 2.5642354488372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093183, + "balance_loss_mlp": 1.06218874, + "epoch": 0.2489419007310504, + "flos": 455925957120.0, + "grad_norm": 0.055204532335327836, + "language_loss": 0.88449144, + "learning_rate": 0.0008794837798365115, + "loss": 0.89542329, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.30957031, + "step": 1294, + "time_per_iteration": 2.640967607498169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103465, + "balance_loss_mlp": 1.07256651, + "epoch": 0.24913428241631397, + "flos": 485198733312.0, + "grad_norm": 0.05342912575045942, + "language_loss": 0.88282919, + "learning_rate": 0.0008792808521275089, + "loss": 0.8938638, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.30859375, + "step": 1295, + "time_per_iteration": 2.743216037750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106969, + "balance_loss_mlp": 1.07638037, + "epoch": 0.24932666410157753, + "flos": 518654651904.0, + "grad_norm": 0.05542201073335728, + "language_loss": 0.87427896, + "learning_rate": 0.0008790777771686206, + "loss": 0.88534868, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.30541992, + "step": 1296, + "time_per_iteration": 2.5764553546905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109723, + "balance_loss_mlp": 1.07934809, + "epoch": 0.2495190457868411, + "flos": 472365831168.0, + "grad_norm": 0.061211557913471215, + "language_loss": 0.85332036, + "learning_rate": 0.0008788745550386872, + "loss": 0.86441755, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.30322266, + "step": 1297, + "time_per_iteration": 2.635064125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111335, + "balance_loss_mlp": 1.08226037, + "epoch": 0.24971142747210465, + "flos": 745559202816.0, + "grad_norm": 0.055423812451341224, + "language_loss": 0.79893327, + "learning_rate": 0.0008786711858166063, + "loss": 0.81006682, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.31054688, + "step": 1298, + "time_per_iteration": 3.002070903778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113917, + "balance_loss_mlp": 1.08387578, + "epoch": 0.2499038091573682, + "flos": 749222839296.0, + "grad_norm": 0.06342841372026603, + "language_loss": 0.8358891, + "learning_rate": 0.0008784676695813332, + "loss": 0.84702826, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.29980469, + "step": 1299, + "time_per_iteration": 2.941793918609619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116177, + "balance_loss_mlp": 1.08573055, + "epoch": 0.2500961908426318, + "flos": 744741513216.0, + "grad_norm": 0.05313888632052142, + "language_loss": 0.84205985, + "learning_rate": 0.0008782640064118796, + "loss": 0.85322165, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.30395508, + "step": 1300, + "time_per_iteration": 2.9038445949554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113921, + "balance_loss_mlp": 1.11441469, + "epoch": 0.2502885725278953, + "flos": 1416652180992.0, + "grad_norm": 0.03742785755303804, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77323961, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.24804688, + "step": 1301, + "time_per_iteration": 4.97193169593811 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108076, + "balance_loss_mlp": 1.0781548, + "epoch": 0.2504809542131589, + "flos": 514961902080.0, + "grad_norm": 0.06725713094725487, + "language_loss": 0.86707664, + "learning_rate": 0.0008778562395867648, + "loss": 0.87815738, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.29882812, + "step": 1302, + "time_per_iteration": 2.6434335708618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109494, + "balance_loss_mlp": 1.064852, + "epoch": 0.25067333589842244, + "flos": 525562905600.0, + "grad_norm": 0.0573305289073435, + "language_loss": 0.83713615, + "learning_rate": 0.0008776521360894127, + "loss": 0.84808552, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.30029297, + "step": 1303, + "time_per_iteration": 2.664281129837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087429, + "balance_loss_mlp": 1.06206167, + "epoch": 0.25086571758368603, + "flos": 1473085108224.0, + "grad_norm": 0.030879512397293623, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80049491, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.25390625, + "step": 1304, + "time_per_iteration": 4.7838218212127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096771, + "balance_loss_mlp": 1.06682515, + "epoch": 0.2510580992689496, + "flos": 528128045568.0, + "grad_norm": 0.05889583885024225, + "language_loss": 0.90380585, + "learning_rate": 0.0008772434893213186, + "loss": 0.91477358, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.29882812, + "step": 1305, + "time_per_iteration": 2.619591236114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092992, + "balance_loss_mlp": 1.06228364, + "epoch": 0.25125048095421315, + "flos": 517192390656.0, + "grad_norm": 0.05643683756415757, + "language_loss": 0.84055364, + "learning_rate": 0.0008770389462092276, + "loss": 0.85148358, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.30664062, + "step": 1306, + "time_per_iteration": 2.646378517150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090271, + "balance_loss_mlp": 1.05860949, + "epoch": 0.25144286263947674, + "flos": 620160039936.0, + "grad_norm": 0.07421628365380602, + "language_loss": 0.86343837, + "learning_rate": 0.0008768342567176357, + "loss": 0.87434107, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.31640625, + "step": 1307, + "time_per_iteration": 2.807349681854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089217, + "balance_loss_mlp": 1.0562675, + "epoch": 0.25163524432474027, + "flos": 503534444544.0, + "grad_norm": 0.06024308313144323, + "language_loss": 0.90521109, + "learning_rate": 0.0008766294209260107, + "loss": 0.91610324, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.32958984, + "step": 1308, + "time_per_iteration": 2.652209758758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087002, + "balance_loss_mlp": 1.05510211, + "epoch": 0.25182762601000386, + "flos": 508821875712.0, + "grad_norm": 0.07044022402077256, + "language_loss": 0.90948963, + "learning_rate": 0.0008764244389138767, + "loss": 0.92035961, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.31884766, + "step": 1309, + "time_per_iteration": 2.583214044570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086884, + "balance_loss_mlp": 1.05386305, + "epoch": 0.2520200076952674, + "flos": 633596815872.0, + "grad_norm": 0.07007920023055086, + "language_loss": 0.82157373, + "learning_rate": 0.000876219310760815, + "loss": 0.83244258, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.33032227, + "step": 1310, + "time_per_iteration": 2.8652145862579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010922, + "balance_loss_mlp": 1.05956042, + "epoch": 0.252212389380531, + "flos": 494385527808.0, + "grad_norm": 0.05921747328918915, + "language_loss": 0.81032491, + "learning_rate": 0.0008760140365464631, + "loss": 0.82124686, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.32641602, + "step": 1311, + "time_per_iteration": 2.620199203491211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090848, + "balance_loss_mlp": 1.05799365, + "epoch": 0.2524047710657945, + "flos": 490298489856.0, + "grad_norm": 0.06933033432447253, + "language_loss": 0.87204492, + "learning_rate": 0.0008758086163505156, + "loss": 0.88295335, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.32861328, + "step": 1312, + "time_per_iteration": 2.5809056758880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085438, + "balance_loss_mlp": 1.05253649, + "epoch": 0.2525971527510581, + "flos": 647136898560.0, + "grad_norm": 0.05785086559723577, + "language_loss": 0.89221275, + "learning_rate": 0.0008756030502527239, + "loss": 0.90306717, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.32910156, + "step": 1313, + "time_per_iteration": 2.8305885791778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084056, + "balance_loss_mlp": 1.05201209, + "epoch": 0.2527895344363217, + "flos": 568991222784.0, + "grad_norm": 0.05540107069612798, + "language_loss": 0.90540659, + "learning_rate": 0.0008753973383328954, + "loss": 0.91624713, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.3203125, + "step": 1314, + "time_per_iteration": 2.8095338344573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084266, + "balance_loss_mlp": 1.0518887, + "epoch": 0.2529819161215852, + "flos": 513795004416.0, + "grad_norm": 0.06960735937341114, + "language_loss": 0.83534479, + "learning_rate": 0.0008751914806708952, + "loss": 0.84618747, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.32373047, + "step": 1315, + "time_per_iteration": 2.6356046199798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084811, + "balance_loss_mlp": 1.05357838, + "epoch": 0.2531742978068488, + "flos": 530979784704.0, + "grad_norm": 0.05966295966929829, + "language_loss": 0.82178831, + "learning_rate": 0.0008749854773466439, + "loss": 0.83263648, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.31201172, + "step": 1316, + "time_per_iteration": 2.673590898513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083614, + "balance_loss_mlp": 1.05199969, + "epoch": 0.25336667949211233, + "flos": 596362369536.0, + "grad_norm": 0.060440864571565875, + "language_loss": 0.84378719, + "learning_rate": 0.0008747793284401192, + "loss": 0.85462332, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.31591797, + "step": 1317, + "time_per_iteration": 2.672581195831299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078628, + "balance_loss_mlp": 1.04701352, + "epoch": 0.2535590611773759, + "flos": 601764691968.0, + "grad_norm": 0.06760844062466466, + "language_loss": 0.85858786, + "learning_rate": 0.0008745730340313551, + "loss": 0.8693741, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.31591797, + "step": 1318, + "time_per_iteration": 2.7483184337615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088603, + "balance_loss_mlp": 1.05775118, + "epoch": 0.25375144286263945, + "flos": 495079561728.0, + "grad_norm": 0.06356165501521222, + "language_loss": 0.84280074, + "learning_rate": 0.0008743665942004422, + "loss": 0.85368681, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.30834961, + "step": 1319, + "time_per_iteration": 2.659477472305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094218, + "balance_loss_mlp": 1.06362879, + "epoch": 0.25394382454790304, + "flos": 512219261952.0, + "grad_norm": 0.06511177952096096, + "language_loss": 0.92719352, + "learning_rate": 0.0008741600090275277, + "loss": 0.93813574, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.30541992, + "step": 1320, + "time_per_iteration": 2.6192221641540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088577, + "balance_loss_mlp": 1.05758274, + "epoch": 0.25413620623316663, + "flos": 958586047488.0, + "grad_norm": 0.06459884228420558, + "language_loss": 0.84290528, + "learning_rate": 0.0008739532785928151, + "loss": 0.853791, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.30957031, + "step": 1321, + "time_per_iteration": 3.438142776489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166929, + "balance_loss_mlp": 1.14528096, + "epoch": 0.25432858791843016, + "flos": 1576445635584.0, + "grad_norm": 0.062216562760273944, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.7606051, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.21679688, + "step": 1322, + "time_per_iteration": 4.881207466125488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109523, + "balance_loss_mlp": 1.06502271, + "epoch": 0.25452096960369375, + "flos": 583530877440.0, + "grad_norm": 0.0660267567978659, + "language_loss": 0.8296389, + "learning_rate": 0.0008735393822590908, + "loss": 0.84059119, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.30151367, + "step": 1323, + "time_per_iteration": 2.7254581451416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097272, + "balance_loss_mlp": 1.06723142, + "epoch": 0.2547133512889573, + "flos": 508344629760.0, + "grad_norm": 0.07409821223339019, + "language_loss": 0.87412238, + "learning_rate": 0.0008733322165207681, + "loss": 0.88509512, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.30029297, + "step": 1324, + "time_per_iteration": 2.6910648345947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102689, + "balance_loss_mlp": 1.07295775, + "epoch": 0.25490573297422087, + "flos": 782266940928.0, + "grad_norm": 0.06686348955430095, + "language_loss": 0.83012944, + "learning_rate": 0.0008731249058420247, + "loss": 0.84115636, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.29663086, + "step": 1325, + "time_per_iteration": 3.0301432609558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105708, + "balance_loss_mlp": 1.07499993, + "epoch": 0.2550981146594844, + "flos": 509610451968.0, + "grad_norm": 0.057218587703981125, + "language_loss": 0.90547103, + "learning_rate": 0.0008729174503033459, + "loss": 0.91652811, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.30664062, + "step": 1326, + "time_per_iteration": 2.668544292449951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107604, + "balance_loss_mlp": 1.07706285, + "epoch": 0.255290496344748, + "flos": 676360212480.0, + "grad_norm": 0.08872727493885958, + "language_loss": 0.82430828, + "learning_rate": 0.0008727098499852728, + "loss": 0.83538437, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.30493164, + "step": 1327, + "time_per_iteration": 2.8206427097320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102439, + "balance_loss_mlp": 1.07175469, + "epoch": 0.2554828780300115, + "flos": 537524273664.0, + "grad_norm": 0.05995612334517853, + "language_loss": 0.8945381, + "learning_rate": 0.0008725021049684034, + "loss": 0.90556252, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.30639648, + "step": 1328, + "time_per_iteration": 2.7788021564483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110018, + "balance_loss_mlp": 1.06906641, + "epoch": 0.2556752597152751, + "flos": 823828534272.0, + "grad_norm": 0.07693053452424695, + "language_loss": 0.82675111, + "learning_rate": 0.000872294215333391, + "loss": 0.83775294, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.31079102, + "step": 1329, + "time_per_iteration": 3.208423137664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089607, + "balance_loss_mlp": 1.05820751, + "epoch": 0.2558676414005387, + "flos": 570517502976.0, + "grad_norm": 0.05833009001407562, + "language_loss": 0.83099753, + "learning_rate": 0.0008720861811609457, + "loss": 0.84189361, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.3137207, + "step": 1330, + "time_per_iteration": 2.723451614379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082701, + "balance_loss_mlp": 1.05122948, + "epoch": 0.2560600230858022, + "flos": 486419475456.0, + "grad_norm": 0.06841234134213905, + "language_loss": 0.83759737, + "learning_rate": 0.0008718780025318338, + "loss": 0.84842432, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.31445312, + "step": 1331, + "time_per_iteration": 2.7594637870788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084055, + "balance_loss_mlp": 1.05244088, + "epoch": 0.2562524047710658, + "flos": 512874008064.0, + "grad_norm": 0.059488371229756976, + "language_loss": 0.83890998, + "learning_rate": 0.0008716696795268771, + "loss": 0.84975058, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.31591797, + "step": 1332, + "time_per_iteration": 2.719435453414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086447, + "balance_loss_mlp": 1.05516648, + "epoch": 0.25644478645632934, + "flos": 634498873344.0, + "grad_norm": 0.09040651922247907, + "language_loss": 0.85621184, + "learning_rate": 0.0008714612122269538, + "loss": 0.86707628, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.3125, + "step": 1333, + "time_per_iteration": 2.846071481704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087221, + "balance_loss_mlp": 1.05517721, + "epoch": 0.25663716814159293, + "flos": 436353537024.0, + "grad_norm": 0.06079891504044088, + "language_loss": 0.8881824, + "learning_rate": 0.0008712526007129982, + "loss": 0.89905459, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.3203125, + "step": 1334, + "time_per_iteration": 2.5539238452911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084143, + "balance_loss_mlp": 1.05226636, + "epoch": 0.25682954982685646, + "flos": 497892013056.0, + "grad_norm": 0.06135189476637687, + "language_loss": 0.90600282, + "learning_rate": 0.0008710438450660003, + "loss": 0.91684425, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.31835938, + "step": 1335, + "time_per_iteration": 2.6957638263702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081701, + "balance_loss_mlp": 1.04984844, + "epoch": 0.25702193151212005, + "flos": 457471176192.0, + "grad_norm": 0.09152684925001835, + "language_loss": 0.86861122, + "learning_rate": 0.0008708349453670064, + "loss": 0.87942821, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.31835938, + "step": 1336, + "time_per_iteration": 2.569918632507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080209, + "balance_loss_mlp": 1.04854655, + "epoch": 0.2572143131973836, + "flos": 598002130944.0, + "grad_norm": 0.055029840901202824, + "language_loss": 0.91123867, + "learning_rate": 0.0008706259016971185, + "loss": 0.92204076, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.31640625, + "step": 1337, + "time_per_iteration": 2.7755186557769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077567, + "balance_loss_mlp": 1.04554725, + "epoch": 0.25740669488264717, + "flos": 698004559872.0, + "grad_norm": 0.08019888390454845, + "language_loss": 0.82668757, + "learning_rate": 0.0008704167141374944, + "loss": 0.83746326, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.32006836, + "step": 1338, + "time_per_iteration": 2.8559322357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073148, + "balance_loss_mlp": 1.04184318, + "epoch": 0.25759907656791076, + "flos": 502130409984.0, + "grad_norm": 0.06412343972447931, + "language_loss": 0.88389909, + "learning_rate": 0.0008702073827693482, + "loss": 0.89463055, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.31274414, + "step": 1339, + "time_per_iteration": 2.725090265274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077981, + "balance_loss_mlp": 1.04662943, + "epoch": 0.2577914582531743, + "flos": 773541425664.0, + "grad_norm": 0.06471871877048396, + "language_loss": 0.88798392, + "learning_rate": 0.0008699979076739494, + "loss": 0.89876378, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.31323242, + "step": 1340, + "time_per_iteration": 2.9663493633270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074686, + "balance_loss_mlp": 1.04354882, + "epoch": 0.2579838399384379, + "flos": 459431032320.0, + "grad_norm": 0.0844279622703065, + "language_loss": 0.88438749, + "learning_rate": 0.0008697882889326234, + "loss": 0.89513433, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.31103516, + "step": 1341, + "time_per_iteration": 2.5622262954711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081353, + "balance_loss_mlp": 1.05047798, + "epoch": 0.2581762216237014, + "flos": 568917029376.0, + "grad_norm": 0.07114901487039385, + "language_loss": 0.86560714, + "learning_rate": 0.0008695785266267515, + "loss": 0.87642074, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.30834961, + "step": 1342, + "time_per_iteration": 2.7169957160949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083185, + "balance_loss_mlp": 1.05309629, + "epoch": 0.258368603308965, + "flos": 603906430464.0, + "grad_norm": 0.06303738321086937, + "language_loss": 0.82804394, + "learning_rate": 0.0008693686208377704, + "loss": 0.83887577, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.30053711, + "step": 1343, + "time_per_iteration": 2.8591935634613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090607, + "balance_loss_mlp": 1.06142426, + "epoch": 0.2585609849942285, + "flos": 491204929536.0, + "grad_norm": 0.06465186244058573, + "language_loss": 0.88812125, + "learning_rate": 0.0008691585716471733, + "loss": 0.89902723, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.29150391, + "step": 1344, + "time_per_iteration": 2.6713430881500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099449, + "balance_loss_mlp": 1.07119632, + "epoch": 0.2587533666794921, + "flos": 640455607296.0, + "grad_norm": 0.0588719911399204, + "language_loss": 0.85261089, + "learning_rate": 0.0008689483791365079, + "loss": 0.86360538, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.28271484, + "step": 1345, + "time_per_iteration": 2.820528030395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112565, + "balance_loss_mlp": 1.08457518, + "epoch": 0.2589457483647557, + "flos": 576564397056.0, + "grad_norm": 0.06280839806958106, + "language_loss": 0.89176255, + "learning_rate": 0.0008687380433873786, + "loss": 0.90288818, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.28027344, + "step": 1346, + "time_per_iteration": 2.8161351680755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122151, + "balance_loss_mlp": 1.09442306, + "epoch": 0.25913813005001923, + "flos": 535164337152.0, + "grad_norm": 0.09019918884346267, + "language_loss": 0.82469404, + "learning_rate": 0.0008685275644814448, + "loss": 0.83591551, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.27734375, + "step": 1347, + "time_per_iteration": 2.693267822265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122406, + "balance_loss_mlp": 1.09403384, + "epoch": 0.2593305117352828, + "flos": 720713908224.0, + "grad_norm": 0.0763626786758855, + "language_loss": 0.83996952, + "learning_rate": 0.0008683169425004216, + "loss": 0.85119361, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.28393555, + "step": 1348, + "time_per_iteration": 2.9267332553863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104312, + "balance_loss_mlp": 1.07582057, + "epoch": 0.25952289342054635, + "flos": 709782635520.0, + "grad_norm": 0.0999879699530973, + "language_loss": 0.82942533, + "learning_rate": 0.0008681061775260799, + "loss": 0.84046841, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.28491211, + "step": 1349, + "time_per_iteration": 2.8389806747436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104623, + "balance_loss_mlp": 1.0761795, + "epoch": 0.25971527510580994, + "flos": 455688820224.0, + "grad_norm": 0.06848449496170159, + "language_loss": 0.9182089, + "learning_rate": 0.0008678952696402458, + "loss": 0.92925513, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.28442383, + "step": 1350, + "time_per_iteration": 2.520573377609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091244, + "balance_loss_mlp": 1.06270587, + "epoch": 0.25990765679107347, + "flos": 612223100928.0, + "grad_norm": 0.06363942150358032, + "language_loss": 0.86753285, + "learning_rate": 0.000867684218924801, + "loss": 0.87844533, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.28564453, + "step": 1351, + "time_per_iteration": 2.9015109539031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094999, + "balance_loss_mlp": 1.07382762, + "epoch": 0.26010003847633706, + "flos": 1537105766400.0, + "grad_norm": 0.03643594447100183, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80042088, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.21191406, + "step": 1352, + "time_per_iteration": 4.897913217544556 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089345, + "balance_loss_mlp": 1.05987692, + "epoch": 0.2602924201616006, + "flos": 715947393024.0, + "grad_norm": 0.05004222260192376, + "language_loss": 0.8488791, + "learning_rate": 0.0008672616893328834, + "loss": 0.85977256, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.29394531, + "step": 1353, + "time_per_iteration": 2.930330991744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089197, + "balance_loss_mlp": 1.05925155, + "epoch": 0.2604848018468642, + "flos": 643241917440.0, + "grad_norm": 0.06508424080641521, + "language_loss": 0.90170342, + "learning_rate": 0.0008670502106204512, + "loss": 0.91259539, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.29882812, + "step": 1354, + "time_per_iteration": 2.8581433296203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088042, + "balance_loss_mlp": 1.05821621, + "epoch": 0.26067718353212777, + "flos": 516783545856.0, + "grad_norm": 0.07357469643966064, + "language_loss": 0.81904948, + "learning_rate": 0.0008668385894064892, + "loss": 0.82992983, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.2980957, + "step": 1355, + "time_per_iteration": 2.6258199214935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086225, + "balance_loss_mlp": 1.05565977, + "epoch": 0.2608695652173913, + "flos": 822361890816.0, + "grad_norm": 0.05598612189883674, + "language_loss": 0.88435078, + "learning_rate": 0.0008666268257731562, + "loss": 0.89521307, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.30517578, + "step": 1356, + "time_per_iteration": 3.0935704708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096188, + "balance_loss_mlp": 1.06557548, + "epoch": 0.2610619469026549, + "flos": 1007451744768.0, + "grad_norm": 0.05877228431721195, + "language_loss": 0.85582316, + "learning_rate": 0.0008664149198026662, + "loss": 0.86678505, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.3059082, + "step": 1357, + "time_per_iteration": 3.3150172233581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093826, + "balance_loss_mlp": 1.06407189, + "epoch": 0.2612543285879184, + "flos": 536523291648.0, + "grad_norm": 0.08010917030088013, + "language_loss": 0.88609982, + "learning_rate": 0.0008662028715772883, + "loss": 0.8970381, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.29736328, + "step": 1358, + "time_per_iteration": 2.652510166168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117948, + "balance_loss_mlp": 1.08781219, + "epoch": 0.261446710273182, + "flos": 519166803456.0, + "grad_norm": 0.068011575409632, + "language_loss": 0.8599565, + "learning_rate": 0.0008659906811793467, + "loss": 0.87113595, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.30078125, + "step": 1359, + "time_per_iteration": 2.6895272731781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120144, + "balance_loss_mlp": 1.08917356, + "epoch": 0.26163909195844554, + "flos": 582975055872.0, + "grad_norm": 0.06541737550876531, + "language_loss": 0.89626461, + "learning_rate": 0.0008657783486912215, + "loss": 0.90746599, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.30932617, + "step": 1360, + "time_per_iteration": 2.762763738632202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112864, + "balance_loss_mlp": 1.09752679, + "epoch": 0.2618314736437091, + "flos": 958362057216.0, + "grad_norm": 0.08393806981558949, + "language_loss": 0.89884281, + "learning_rate": 0.0008655658741953472, + "loss": 0.91012919, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.31079102, + "step": 1361, + "time_per_iteration": 3.2099156379699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108189, + "balance_loss_mlp": 1.07740927, + "epoch": 0.26202385532897265, + "flos": 574530347520.0, + "grad_norm": 0.05266132623937494, + "language_loss": 0.88221049, + "learning_rate": 0.0008653532577742136, + "loss": 0.89329231, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.30761719, + "step": 1362, + "time_per_iteration": 2.6699323654174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097872, + "balance_loss_mlp": 1.06756878, + "epoch": 0.26221623701423624, + "flos": 445240585728.0, + "grad_norm": 0.06436829867728516, + "language_loss": 0.86740243, + "learning_rate": 0.0008651404995103659, + "loss": 0.87838113, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.30273438, + "step": 1363, + "time_per_iteration": 2.5310258865356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094148, + "balance_loss_mlp": 1.06286716, + "epoch": 0.26240861869949983, + "flos": 535459700736.0, + "grad_norm": 0.05795299669830668, + "language_loss": 0.8642996, + "learning_rate": 0.0008649275994864041, + "loss": 0.87524116, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.3125, + "step": 1364, + "time_per_iteration": 2.675330638885498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110185, + "balance_loss_mlp": 1.07066512, + "epoch": 0.26260100038476336, + "flos": 564940500480.0, + "grad_norm": 0.05147405231292679, + "language_loss": 0.83778602, + "learning_rate": 0.0008647145577849834, + "loss": 0.84880447, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.31152344, + "step": 1365, + "time_per_iteration": 2.817330837249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099582, + "balance_loss_mlp": 1.06913614, + "epoch": 0.26279338207002695, + "flos": 612745426944.0, + "grad_norm": 0.05119291352940178, + "language_loss": 0.82886052, + "learning_rate": 0.0008645013744888139, + "loss": 0.83985633, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.30395508, + "step": 1366, + "time_per_iteration": 2.9056894779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093325, + "balance_loss_mlp": 1.06318903, + "epoch": 0.2629857637552905, + "flos": 522555425280.0, + "grad_norm": 0.08887633390516779, + "language_loss": 0.8772788, + "learning_rate": 0.0008642880496806607, + "loss": 0.88821203, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.30102539, + "step": 1367, + "time_per_iteration": 2.8175759315490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094726, + "balance_loss_mlp": 1.0635649, + "epoch": 0.26317814544055407, + "flos": 534273864192.0, + "grad_norm": 0.0720053964715196, + "language_loss": 0.84128964, + "learning_rate": 0.0008640745834433437, + "loss": 0.85223687, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.3112793, + "step": 1368, + "time_per_iteration": 2.7703893184661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085473, + "balance_loss_mlp": 1.05559897, + "epoch": 0.2633705271258176, + "flos": 555235762176.0, + "grad_norm": 0.058958451803685384, + "language_loss": 0.86905044, + "learning_rate": 0.000863860975859738, + "loss": 0.87990516, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.29833984, + "step": 1369, + "time_per_iteration": 2.913543224334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093496, + "balance_loss_mlp": 1.06309724, + "epoch": 0.2635629088110812, + "flos": 552136711680.0, + "grad_norm": 0.07885033776141591, + "language_loss": 0.87845421, + "learning_rate": 0.0008636472270127733, + "loss": 0.8893891, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.3034668, + "step": 1370, + "time_per_iteration": 2.6615941524505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093443, + "balance_loss_mlp": 1.06368852, + "epoch": 0.2637552904963448, + "flos": 455752839168.0, + "grad_norm": 0.06686078076555955, + "language_loss": 0.90047085, + "learning_rate": 0.0008634333369854345, + "loss": 0.91140521, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.29736328, + "step": 1371, + "time_per_iteration": 2.611501932144165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109652, + "balance_loss_mlp": 1.06666958, + "epoch": 0.2639476721816083, + "flos": 612847323648.0, + "grad_norm": 0.05135890593758564, + "language_loss": 0.87519878, + "learning_rate": 0.0008632193058607608, + "loss": 0.88616395, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.29833984, + "step": 1372, + "time_per_iteration": 2.7420408725738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096239, + "balance_loss_mlp": 1.06681848, + "epoch": 0.2641400538668719, + "flos": 571645112832.0, + "grad_norm": 0.07070265457366111, + "language_loss": 0.80896008, + "learning_rate": 0.0008630051337218466, + "loss": 0.81992251, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.29394531, + "step": 1373, + "time_per_iteration": 2.694157123565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097092, + "balance_loss_mlp": 1.06762338, + "epoch": 0.2643324355521354, + "flos": 581979866112.0, + "grad_norm": 0.06318549857397857, + "language_loss": 0.8188293, + "learning_rate": 0.0008627908206518409, + "loss": 0.82980019, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.29418945, + "step": 1374, + "time_per_iteration": 2.703380823135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023426, + "balance_loss_mlp": 1.00330341, + "epoch": 0.264524817237399, + "flos": 1543845284352.0, + "grad_norm": 0.017765090827900253, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76174676, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.20117188, + "step": 1375, + "time_per_iteration": 4.995063781738281 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092603, + "balance_loss_mlp": 1.06237197, + "epoch": 0.26471719892266254, + "flos": 517783117824.0, + "grad_norm": 0.0561933760173491, + "language_loss": 0.9114545, + "learning_rate": 0.0008623617720514241, + "loss": 0.92238057, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.30224609, + "step": 1376, + "time_per_iteration": 2.666578769683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093572, + "balance_loss_mlp": 1.06276798, + "epoch": 0.26490958060792613, + "flos": 516936314880.0, + "grad_norm": 0.06268473823371516, + "language_loss": 0.84907627, + "learning_rate": 0.0008621470366875848, + "loss": 0.86001205, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.30761719, + "step": 1377, + "time_per_iteration": 2.576968193054199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087995, + "balance_loss_mlp": 1.05661869, + "epoch": 0.26510196229318966, + "flos": 596298350592.0, + "grad_norm": 0.05801174228437736, + "language_loss": 0.87514544, + "learning_rate": 0.0008619321607257966, + "loss": 0.88602537, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.31347656, + "step": 1378, + "time_per_iteration": 2.6873912811279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083836, + "balance_loss_mlp": 1.05396187, + "epoch": 0.26529434397845325, + "flos": 685488780288.0, + "grad_norm": 0.06612008054140536, + "language_loss": 0.81601393, + "learning_rate": 0.000861717144249482, + "loss": 0.82685226, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.2980957, + "step": 1379, + "time_per_iteration": 2.861531972885132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082319, + "balance_loss_mlp": 1.05220687, + "epoch": 0.26548672566371684, + "flos": 424127328768.0, + "grad_norm": 0.06041061044303736, + "language_loss": 0.89415485, + "learning_rate": 0.0008615019873421175, + "loss": 0.90497804, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.30053711, + "step": 1380, + "time_per_iteration": 2.4596564769744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080185, + "balance_loss_mlp": 1.04973865, + "epoch": 0.26567910734898037, + "flos": 489619012608.0, + "grad_norm": 0.12029414194163875, + "language_loss": 0.85435975, + "learning_rate": 0.0008612866900872349, + "loss": 0.86516166, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.30395508, + "step": 1381, + "time_per_iteration": 2.5492422580718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078246, + "balance_loss_mlp": 1.0483005, + "epoch": 0.26587148903424396, + "flos": 533947977216.0, + "grad_norm": 0.06111803920627532, + "language_loss": 0.87957448, + "learning_rate": 0.0008610712525684197, + "loss": 0.89035696, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.29882812, + "step": 1382, + "time_per_iteration": 2.632847309112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083959, + "balance_loss_mlp": 1.05356061, + "epoch": 0.2660638707195075, + "flos": 1017067732992.0, + "grad_norm": 0.07781171288722535, + "language_loss": 0.84130585, + "learning_rate": 0.0008608556748693121, + "loss": 0.85214543, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.3034668, + "step": 1383, + "time_per_iteration": 3.246919631958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108693, + "balance_loss_mlp": 1.05522013, + "epoch": 0.2662562524047711, + "flos": 523712148480.0, + "grad_norm": 0.052993237489823604, + "language_loss": 0.85963714, + "learning_rate": 0.000860639957073607, + "loss": 0.87050641, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.31689453, + "step": 1384, + "time_per_iteration": 2.7504889965057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086729, + "balance_loss_mlp": 1.05537665, + "epoch": 0.2664486340900346, + "flos": 552107598336.0, + "grad_norm": 0.06878538642870029, + "language_loss": 0.87610686, + "learning_rate": 0.0008604240992650534, + "loss": 0.88697416, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.31347656, + "step": 1385, + "time_per_iteration": 2.6546881198883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082661, + "balance_loss_mlp": 1.05135679, + "epoch": 0.2666410157752982, + "flos": 469895233536.0, + "grad_norm": 0.05853696199287041, + "language_loss": 0.89197159, + "learning_rate": 0.0008602081015274545, + "loss": 0.90279818, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.31274414, + "step": 1386, + "time_per_iteration": 2.7526328563690186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091919, + "balance_loss_mlp": 1.06061459, + "epoch": 0.2668333974605617, + "flos": 569645968896.0, + "grad_norm": 0.05264786586341277, + "language_loss": 0.83147365, + "learning_rate": 0.0008599919639446684, + "loss": 0.8423928, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.31274414, + "step": 1387, + "time_per_iteration": 2.6775026321411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093309, + "balance_loss_mlp": 1.06126583, + "epoch": 0.2670257791458253, + "flos": 398755325952.0, + "grad_norm": 0.06747698326814106, + "language_loss": 0.79790741, + "learning_rate": 0.000859775686600607, + "loss": 0.80884051, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.3203125, + "step": 1388, + "time_per_iteration": 2.535508871078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090634, + "balance_loss_mlp": 1.05921042, + "epoch": 0.2672181608310889, + "flos": 515587534848.0, + "grad_norm": 0.06336986871451572, + "language_loss": 0.84764999, + "learning_rate": 0.0008595592695792367, + "loss": 0.85855639, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.31396484, + "step": 1389, + "time_per_iteration": 2.6549055576324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097148, + "balance_loss_mlp": 1.06593931, + "epoch": 0.26741054251635243, + "flos": 507270864384.0, + "grad_norm": 0.055901377362424544, + "language_loss": 0.90619266, + "learning_rate": 0.0008593427129645778, + "loss": 0.91716409, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.31176758, + "step": 1390, + "time_per_iteration": 2.6070477962493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096463, + "balance_loss_mlp": 1.06542134, + "epoch": 0.267602924201616, + "flos": 576357783552.0, + "grad_norm": 0.06788313950064188, + "language_loss": 0.85213327, + "learning_rate": 0.0008591260168407052, + "loss": 0.86309791, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.31005859, + "step": 1391, + "time_per_iteration": 2.794921398162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092372, + "balance_loss_mlp": 1.05963671, + "epoch": 0.26779530588687955, + "flos": 523731087360.0, + "grad_norm": 0.052723370404498295, + "language_loss": 0.82993329, + "learning_rate": 0.0008589091812917479, + "loss": 0.84085703, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.32739258, + "step": 1392, + "time_per_iteration": 2.634734869003296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088674, + "balance_loss_mlp": 1.05727446, + "epoch": 0.26798768757214314, + "flos": 556508938752.0, + "grad_norm": 0.06846284491975779, + "language_loss": 0.85420829, + "learning_rate": 0.0008586922064018887, + "loss": 0.86509502, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.3137207, + "step": 1393, + "time_per_iteration": 2.662095308303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108591, + "balance_loss_mlp": 1.05408156, + "epoch": 0.2681800692574067, + "flos": 930246004224.0, + "grad_norm": 0.07721778370466406, + "language_loss": 0.89049023, + "learning_rate": 0.0008584750922553651, + "loss": 0.90134937, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.31811523, + "step": 1394, + "time_per_iteration": 3.15010666847229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082121, + "balance_loss_mlp": 1.05053067, + "epoch": 0.26837245094267026, + "flos": 700771931136.0, + "grad_norm": 0.054821616219537066, + "language_loss": 0.83275163, + "learning_rate": 0.0008582578389364677, + "loss": 0.8435728, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.31567383, + "step": 1395, + "time_per_iteration": 2.9199917316436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086932, + "balance_loss_mlp": 1.05469775, + "epoch": 0.26856483262793385, + "flos": 592892199936.0, + "grad_norm": 0.049938668546041676, + "language_loss": 0.91772366, + "learning_rate": 0.0008580404465295422, + "loss": 0.92859298, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.32226562, + "step": 1396, + "time_per_iteration": 2.8488125801086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079266, + "balance_loss_mlp": 1.04891562, + "epoch": 0.2687572143131974, + "flos": 713943866880.0, + "grad_norm": 0.06204428603549851, + "language_loss": 0.87966394, + "learning_rate": 0.0008578229151189876, + "loss": 0.89045662, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.30297852, + "step": 1397, + "time_per_iteration": 2.92258620262146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081241, + "balance_loss_mlp": 1.04867268, + "epoch": 0.26894959599846097, + "flos": 467481452544.0, + "grad_norm": 0.06429333021146523, + "language_loss": 0.81249309, + "learning_rate": 0.0008576052447892573, + "loss": 0.82330555, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.32568359, + "step": 1398, + "time_per_iteration": 2.551042318344116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083897, + "balance_loss_mlp": 1.05163908, + "epoch": 0.2691419776837245, + "flos": 468470850048.0, + "grad_norm": 0.0671833421183549, + "language_loss": 0.86040235, + "learning_rate": 0.000857387435624858, + "loss": 0.87124133, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.32250977, + "step": 1399, + "time_per_iteration": 2.5816056728363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086843, + "balance_loss_mlp": 1.05382252, + "epoch": 0.2693343593689881, + "flos": 937244418048.0, + "grad_norm": 0.05003222473195782, + "language_loss": 0.87953913, + "learning_rate": 0.0008571694877103513, + "loss": 0.89040762, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.33032227, + "step": 1400, + "time_per_iteration": 3.256469488143921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108756, + "balance_loss_mlp": 1.05542135, + "epoch": 0.2695267410542516, + "flos": 577303511040.0, + "grad_norm": 0.056643414184275494, + "language_loss": 0.87665725, + "learning_rate": 0.0008569514011303515, + "loss": 0.88753277, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.32128906, + "step": 1401, + "time_per_iteration": 2.782273054122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084611, + "balance_loss_mlp": 1.05275857, + "epoch": 0.2697191227395152, + "flos": 556539462144.0, + "grad_norm": 0.06127144796082157, + "language_loss": 0.8767277, + "learning_rate": 0.0008567331759695277, + "loss": 0.88757378, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.31835938, + "step": 1402, + "time_per_iteration": 2.696514368057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084545, + "balance_loss_mlp": 1.05178595, + "epoch": 0.26991150442477874, + "flos": 529024310784.0, + "grad_norm": 0.07491599518741582, + "language_loss": 0.86524475, + "learning_rate": 0.0008565148123126023, + "loss": 0.87609023, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.32763672, + "step": 1403, + "time_per_iteration": 2.6686785221099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088194, + "balance_loss_mlp": 1.05510116, + "epoch": 0.2701038861100423, + "flos": 531737837568.0, + "grad_norm": 0.050644669708274456, + "language_loss": 0.8574301, + "learning_rate": 0.0008562963102443516, + "loss": 0.86831206, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.33105469, + "step": 1404, + "time_per_iteration": 2.693836212158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085916, + "balance_loss_mlp": 1.05232334, + "epoch": 0.2702962677953059, + "flos": 734908737024.0, + "grad_norm": 0.06951419199959312, + "language_loss": 0.84958577, + "learning_rate": 0.0008560776698496056, + "loss": 0.8604449, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.33618164, + "step": 1405, + "time_per_iteration": 2.892805814743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084222, + "balance_loss_mlp": 1.05093896, + "epoch": 0.27048864948056944, + "flos": 574453181952.0, + "grad_norm": 0.07287556066439085, + "language_loss": 0.85794389, + "learning_rate": 0.0008558588912132481, + "loss": 0.8687861, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.33300781, + "step": 1406, + "time_per_iteration": 2.821922540664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098005, + "balance_loss_mlp": 1.07587957, + "epoch": 0.27068103116583303, + "flos": 1423091953152.0, + "grad_norm": 0.044578698770804955, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77556992, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.22167969, + "step": 1407, + "time_per_iteration": 4.952622652053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082949, + "balance_loss_mlp": 1.05016637, + "epoch": 0.27087341285109656, + "flos": 531742219776.0, + "grad_norm": 0.05991157104862915, + "language_loss": 0.82959783, + "learning_rate": 0.0008554209195555016, + "loss": 0.84042734, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.32788086, + "step": 1408, + "time_per_iteration": 2.6937575340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085788, + "balance_loss_mlp": 1.05403042, + "epoch": 0.27106579453636015, + "flos": 581108332032.0, + "grad_norm": 0.06960051295953752, + "language_loss": 0.88047969, + "learning_rate": 0.0008552017267041483, + "loss": 0.89133757, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.31738281, + "step": 1409, + "time_per_iteration": 2.7926084995269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093218, + "balance_loss_mlp": 1.06134176, + "epoch": 0.2712581762216237, + "flos": 506533160448.0, + "grad_norm": 0.07424010893339522, + "language_loss": 0.8324914, + "learning_rate": 0.0008549823959512549, + "loss": 0.8434236, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.31860352, + "step": 1410, + "time_per_iteration": 2.660325050354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098331, + "balance_loss_mlp": 1.06724083, + "epoch": 0.27145055790688727, + "flos": 997019476992.0, + "grad_norm": 0.062062202361739795, + "language_loss": 0.86755967, + "learning_rate": 0.0008547629273819728, + "loss": 0.87854296, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.31054688, + "step": 1411, + "time_per_iteration": 3.3994545936584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098737, + "balance_loss_mlp": 1.06736147, + "epoch": 0.2716429395921508, + "flos": 546420086784.0, + "grad_norm": 0.06335672358829844, + "language_loss": 0.83453959, + "learning_rate": 0.0008545433210815074, + "loss": 0.84552693, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.31347656, + "step": 1412, + "time_per_iteration": 2.644434690475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102874, + "balance_loss_mlp": 1.07123613, + "epoch": 0.2718353212774144, + "flos": 572954605056.0, + "grad_norm": 0.06340025797507488, + "language_loss": 0.87345338, + "learning_rate": 0.0008543235771351176, + "loss": 0.88448215, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.31616211, + "step": 1413, + "time_per_iteration": 2.7854721546173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098411, + "balance_loss_mlp": 1.0675596, + "epoch": 0.272027702962678, + "flos": 643986823680.0, + "grad_norm": 0.05399278560092938, + "language_loss": 0.84545946, + "learning_rate": 0.0008541036956281154, + "loss": 0.85644352, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.30834961, + "step": 1414, + "time_per_iteration": 2.8788704872131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091962, + "balance_loss_mlp": 1.06056201, + "epoch": 0.2722200846479415, + "flos": 653410755072.0, + "grad_norm": 0.07883268546047513, + "language_loss": 0.81883514, + "learning_rate": 0.0008538836766458665, + "loss": 0.82975471, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.3137207, + "step": 1415, + "time_per_iteration": 2.8526153564453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087599, + "balance_loss_mlp": 1.05732012, + "epoch": 0.2724124663332051, + "flos": 579346324992.0, + "grad_norm": 0.060849568603238105, + "language_loss": 0.84889638, + "learning_rate": 0.0008536635202737897, + "loss": 0.85977244, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.30224609, + "step": 1416, + "time_per_iteration": 2.837353467941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089772, + "balance_loss_mlp": 1.05903983, + "epoch": 0.2726048480184686, + "flos": 537178037760.0, + "grad_norm": 0.07898075745209039, + "language_loss": 0.82057679, + "learning_rate": 0.0008534432265973573, + "loss": 0.83147448, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.30688477, + "step": 1417, + "time_per_iteration": 2.5948355197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091815, + "balance_loss_mlp": 1.05891299, + "epoch": 0.2727972297037322, + "flos": 995360776704.0, + "grad_norm": 0.06605458024108496, + "language_loss": 0.87714171, + "learning_rate": 0.000853222795702095, + "loss": 0.88805991, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.32910156, + "step": 1418, + "time_per_iteration": 3.4183547496795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109188, + "balance_loss_mlp": 1.05842948, + "epoch": 0.27298961138899575, + "flos": 605924513280.0, + "grad_norm": 0.04642939327926388, + "language_loss": 0.83471483, + "learning_rate": 0.0008530022276735813, + "loss": 0.84563363, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.33447266, + "step": 1419, + "time_per_iteration": 2.711695432662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086506, + "balance_loss_mlp": 1.05293703, + "epoch": 0.27318199307425933, + "flos": 529059216384.0, + "grad_norm": 0.05938997521105461, + "language_loss": 0.85724676, + "learning_rate": 0.0008527815225974489, + "loss": 0.86811179, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.3359375, + "step": 1420, + "time_per_iteration": 2.648448944091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086523, + "balance_loss_mlp": 1.05407453, + "epoch": 0.2733743747595229, + "flos": 408809272320.0, + "grad_norm": 0.07492898694353861, + "language_loss": 0.87982917, + "learning_rate": 0.0008525606805593829, + "loss": 0.89069438, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.32446289, + "step": 1421, + "time_per_iteration": 2.4182560443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082421, + "balance_loss_mlp": 1.04997277, + "epoch": 0.27356675644478645, + "flos": 515976030720.0, + "grad_norm": 0.06962089633364145, + "language_loss": 0.82760686, + "learning_rate": 0.0008523397016451213, + "loss": 0.83843112, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.32446289, + "step": 1422, + "time_per_iteration": 2.587892532348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083477, + "balance_loss_mlp": 1.05021799, + "epoch": 0.27375913813005004, + "flos": 1051914539520.0, + "grad_norm": 0.053513553181154576, + "language_loss": 0.8711561, + "learning_rate": 0.0008521185859404564, + "loss": 0.88199091, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.33276367, + "step": 1423, + "time_per_iteration": 3.372192859649658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108477, + "balance_loss_mlp": 1.0513202, + "epoch": 0.27395151981531357, + "flos": 624507535872.0, + "grad_norm": 0.059986100163812936, + "language_loss": 0.89238524, + "learning_rate": 0.0008518973335312326, + "loss": 0.90323293, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.33447266, + "step": 1424, + "time_per_iteration": 2.791482448577881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082662, + "balance_loss_mlp": 1.04921198, + "epoch": 0.27414390150057716, + "flos": 550112836608.0, + "grad_norm": 0.06956472940992567, + "language_loss": 0.8333236, + "learning_rate": 0.0008516759445033477, + "loss": 0.84415025, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.3347168, + "step": 1425, + "time_per_iteration": 2.6889073848724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082757, + "balance_loss_mlp": 1.05088091, + "epoch": 0.2743362831858407, + "flos": 539596200960.0, + "grad_norm": 0.0615305422895171, + "language_loss": 0.84459686, + "learning_rate": 0.0008514544189427526, + "loss": 0.85542446, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.31860352, + "step": 1426, + "time_per_iteration": 2.797384738922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094312, + "balance_loss_mlp": 1.06143463, + "epoch": 0.2745286648711043, + "flos": 468352986624.0, + "grad_norm": 0.061840511174045036, + "language_loss": 0.86558306, + "learning_rate": 0.0008512327569354511, + "loss": 0.87652624, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.32885742, + "step": 1427, + "time_per_iteration": 2.533623695373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096344, + "balance_loss_mlp": 1.06418157, + "epoch": 0.2747210465563678, + "flos": 472617524736.0, + "grad_norm": 0.06551541099381472, + "language_loss": 0.83328068, + "learning_rate": 0.0008510109585675001, + "loss": 0.84424412, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.3215332, + "step": 1428, + "time_per_iteration": 2.623915672302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125702, + "balance_loss_mlp": 1.10653293, + "epoch": 0.2749134282416314, + "flos": 1314345070080.0, + "grad_norm": 0.06717437310459566, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82279044, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.19140625, + "step": 1429, + "time_per_iteration": 4.737167596817017 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096832, + "balance_loss_mlp": 1.06517005, + "epoch": 0.275105809926895, + "flos": 970445670912.0, + "grad_norm": 0.06718416370196487, + "language_loss": 0.80457842, + "learning_rate": 0.0008505669530941415, + "loss": 0.81554675, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.31640625, + "step": 1430, + "time_per_iteration": 3.380617141723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103498, + "balance_loss_mlp": 1.07169294, + "epoch": 0.2752981916121585, + "flos": 527089185792.0, + "grad_norm": 0.06498994038544256, + "language_loss": 0.83560073, + "learning_rate": 0.000850344746161112, + "loss": 0.8466357, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.31787109, + "step": 1431, + "time_per_iteration": 2.5917775630950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110093, + "balance_loss_mlp": 1.06883883, + "epoch": 0.2754905732974221, + "flos": 453487444992.0, + "grad_norm": 0.06649249705457211, + "language_loss": 0.87664711, + "learning_rate": 0.0008501224032121894, + "loss": 0.88765645, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.32080078, + "step": 1432, + "time_per_iteration": 2.493826150894165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101828, + "balance_loss_mlp": 1.06906962, + "epoch": 0.27568295498268564, + "flos": 497216918016.0, + "grad_norm": 0.06530156063230687, + "language_loss": 0.8172394, + "learning_rate": 0.0008498999243336946, + "loss": 0.82825768, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.32763672, + "step": 1433, + "time_per_iteration": 2.625955581665039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104254, + "balance_loss_mlp": 1.07275844, + "epoch": 0.2758753366679492, + "flos": 607890161664.0, + "grad_norm": 0.056445052388478564, + "language_loss": 0.87110436, + "learning_rate": 0.0008496773096120021, + "loss": 0.88214689, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.31469727, + "step": 1434, + "time_per_iteration": 2.8644402027130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093048, + "balance_loss_mlp": 1.06169593, + "epoch": 0.27606771835321275, + "flos": 739803290112.0, + "grad_norm": 0.07767765628739494, + "language_loss": 0.84306771, + "learning_rate": 0.0008494545591335381, + "loss": 0.85399818, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.31323242, + "step": 1435, + "time_per_iteration": 2.9069130420684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094657, + "balance_loss_mlp": 1.06366265, + "epoch": 0.27626010003847634, + "flos": 554279860224.0, + "grad_norm": 0.04344696113506711, + "language_loss": 0.86938953, + "learning_rate": 0.0008492316729847823, + "loss": 0.88033605, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.30957031, + "step": 1436, + "time_per_iteration": 2.844926595687866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091812, + "balance_loss_mlp": 1.06050754, + "epoch": 0.2764524817237399, + "flos": 542270439936.0, + "grad_norm": 0.055139322891005815, + "language_loss": 0.79749823, + "learning_rate": 0.0008490086512522664, + "loss": 0.80841637, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.31274414, + "step": 1437, + "time_per_iteration": 2.722158670425415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092682, + "balance_loss_mlp": 1.06121063, + "epoch": 0.27664486340900346, + "flos": 406027344384.0, + "grad_norm": 0.06334111858493886, + "language_loss": 0.90728873, + "learning_rate": 0.0008487854940225755, + "loss": 0.91821557, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.31445312, + "step": 1438, + "time_per_iteration": 2.43622088432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091549, + "balance_loss_mlp": 1.05991077, + "epoch": 0.27683724509426705, + "flos": 521884712448.0, + "grad_norm": 0.05907133214000555, + "language_loss": 0.89962572, + "learning_rate": 0.0008485622013823466, + "loss": 0.91054124, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.31616211, + "step": 1439, + "time_per_iteration": 2.61773943901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093806, + "balance_loss_mlp": 1.06154847, + "epoch": 0.2770296267795306, + "flos": 535085761536.0, + "grad_norm": 0.06492331678063241, + "language_loss": 0.82635379, + "learning_rate": 0.00084833877341827, + "loss": 0.83729184, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.32250977, + "step": 1440, + "time_per_iteration": 2.625870704650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092721, + "balance_loss_mlp": 1.06139278, + "epoch": 0.27722200846479417, + "flos": 487747906560.0, + "grad_norm": 0.06674971698169922, + "language_loss": 0.80478823, + "learning_rate": 0.000848115210217088, + "loss": 0.81571543, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.31298828, + "step": 1441, + "time_per_iteration": 2.577364444732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086558, + "balance_loss_mlp": 1.05410933, + "epoch": 0.2774143901500577, + "flos": 618012509184.0, + "grad_norm": 0.055312199129178424, + "language_loss": 0.81684244, + "learning_rate": 0.0008478915118655952, + "loss": 0.82770801, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.32446289, + "step": 1442, + "time_per_iteration": 2.714303493499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089692, + "balance_loss_mlp": 1.05710077, + "epoch": 0.2776067718353213, + "flos": 513563659776.0, + "grad_norm": 0.049794988647852687, + "language_loss": 0.86386287, + "learning_rate": 0.0008476676784506393, + "loss": 0.87475979, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.32592773, + "step": 1443, + "time_per_iteration": 2.68835711479187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087996, + "balance_loss_mlp": 1.05664372, + "epoch": 0.2777991535205848, + "flos": 1003985957376.0, + "grad_norm": 0.05900532389488003, + "language_loss": 0.82031631, + "learning_rate": 0.0008474437100591201, + "loss": 0.83119631, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.31323242, + "step": 1444, + "time_per_iteration": 3.3359997272491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084677, + "balance_loss_mlp": 1.05160809, + "epoch": 0.2779915352058484, + "flos": 550005147648.0, + "grad_norm": 0.054436577911169556, + "language_loss": 0.85231566, + "learning_rate": 0.0008472196067779898, + "loss": 0.86316246, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.33081055, + "step": 1445, + "time_per_iteration": 2.7946455478668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080884, + "balance_loss_mlp": 1.04850721, + "epoch": 0.278183916891112, + "flos": 873444930048.0, + "grad_norm": 0.08667298623079295, + "language_loss": 0.85239732, + "learning_rate": 0.0008469953686942531, + "loss": 0.86320615, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.32373047, + "step": 1446, + "time_per_iteration": 3.0761613845825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081651, + "balance_loss_mlp": 1.04927349, + "epoch": 0.2783762985763755, + "flos": 623782978560.0, + "grad_norm": 0.07591437330096602, + "language_loss": 0.8283245, + "learning_rate": 0.0008467709958949668, + "loss": 0.83914101, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.32373047, + "step": 1447, + "time_per_iteration": 2.7922093868255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082166, + "balance_loss_mlp": 1.0504328, + "epoch": 0.2785686802616391, + "flos": 581571021312.0, + "grad_norm": 0.0636917665663464, + "language_loss": 0.86192262, + "learning_rate": 0.0008465464884672403, + "loss": 0.8727442, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.31713867, + "step": 1448, + "time_per_iteration": 2.679574966430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108413, + "balance_loss_mlp": 1.05211091, + "epoch": 0.27876106194690264, + "flos": 587032980480.0, + "grad_norm": 0.06494062959974968, + "language_loss": 0.85664314, + "learning_rate": 0.0008463218464982348, + "loss": 0.86748445, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.32006836, + "step": 1449, + "time_per_iteration": 2.8746044635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086598, + "balance_loss_mlp": 1.05524611, + "epoch": 0.27895344363216623, + "flos": 875621574144.0, + "grad_norm": 0.05859002353759583, + "language_loss": 0.87554371, + "learning_rate": 0.0008460970700751645, + "loss": 0.88640976, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.31323242, + "step": 1450, + "time_per_iteration": 3.0630292892456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085303, + "balance_loss_mlp": 1.05447531, + "epoch": 0.27914582531742976, + "flos": 603630005760.0, + "grad_norm": 0.06644970008868617, + "language_loss": 0.8732717, + "learning_rate": 0.000845872159285295, + "loss": 0.8841247, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.30786133, + "step": 1451, + "time_per_iteration": 2.7334539890289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149095, + "balance_loss_mlp": 1.13173842, + "epoch": 0.27933820700269335, + "flos": 1496892953088.0, + "grad_norm": 0.04059568749878616, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78915942, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.17382812, + "step": 1452, + "time_per_iteration": 4.913143634796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087672, + "balance_loss_mlp": 1.05617714, + "epoch": 0.2795305886879569, + "flos": 1031445854208.0, + "grad_norm": 0.05755695164820471, + "language_loss": 0.86085773, + "learning_rate": 0.0008454219349544836, + "loss": 0.87173438, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.31469727, + "step": 1453, + "time_per_iteration": 3.3649299144744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086718, + "balance_loss_mlp": 1.05569983, + "epoch": 0.27972297037322047, + "flos": 606766934016.0, + "grad_norm": 0.059728326526783365, + "language_loss": 0.8137995, + "learning_rate": 0.000845196621588334, + "loss": 0.82466674, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.30981445, + "step": 1454, + "time_per_iteration": 2.7774734497070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082042, + "balance_loss_mlp": 1.05095196, + "epoch": 0.27991535205848406, + "flos": 630085948416.0, + "grad_norm": 0.0559695634724148, + "language_loss": 0.76184201, + "learning_rate": 0.0008449711742049706, + "loss": 0.77266252, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.31054688, + "step": 1455, + "time_per_iteration": 2.75393009185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107969, + "balance_loss_mlp": 1.04814696, + "epoch": 0.2801077337437476, + "flos": 549034689024.0, + "grad_norm": 0.06397369460964857, + "language_loss": 0.83309555, + "learning_rate": 0.0008447455928919196, + "loss": 0.84389246, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.31518555, + "step": 1456, + "time_per_iteration": 2.6542584896087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082481, + "balance_loss_mlp": 1.05177259, + "epoch": 0.2803001154290112, + "flos": 486516989952.0, + "grad_norm": 0.06274060179370718, + "language_loss": 0.86886203, + "learning_rate": 0.0008445198777367595, + "loss": 0.87968683, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.30664062, + "step": 1457, + "time_per_iteration": 2.6488282680511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089589, + "balance_loss_mlp": 1.05883336, + "epoch": 0.2804924971142747, + "flos": 521820693504.0, + "grad_norm": 0.06557026121847803, + "language_loss": 0.8106361, + "learning_rate": 0.0008442940288271208, + "loss": 0.82153201, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.30712891, + "step": 1458, + "time_per_iteration": 2.67258882522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096326, + "balance_loss_mlp": 1.06454456, + "epoch": 0.2806848787995383, + "flos": 527410690560.0, + "grad_norm": 0.07361561415976156, + "language_loss": 0.86939961, + "learning_rate": 0.0008440680462506856, + "loss": 0.88036287, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.31762695, + "step": 1459, + "time_per_iteration": 2.7335550785064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105041, + "balance_loss_mlp": 1.07354569, + "epoch": 0.2808772604848018, + "flos": 485246785536.0, + "grad_norm": 0.05419081251366802, + "language_loss": 0.86197531, + "learning_rate": 0.0008438419300951883, + "loss": 0.87302566, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.31469727, + "step": 1460, + "time_per_iteration": 2.6306796073913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106375, + "balance_loss_mlp": 1.07459426, + "epoch": 0.2810696421700654, + "flos": 617840801280.0, + "grad_norm": 0.08520166677325354, + "language_loss": 0.8634038, + "learning_rate": 0.0008436156804484148, + "loss": 0.87446761, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.31762695, + "step": 1461, + "time_per_iteration": 2.761599063873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110218, + "balance_loss_mlp": 1.0698266, + "epoch": 0.28126202385532895, + "flos": 454521922560.0, + "grad_norm": 0.06649626079325978, + "language_loss": 0.88025403, + "learning_rate": 0.0008433892973982031, + "loss": 0.89127588, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.32348633, + "step": 1462, + "time_per_iteration": 2.572810173034668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110576, + "balance_loss_mlp": 1.07333505, + "epoch": 0.28145440554059253, + "flos": 530447284224.0, + "grad_norm": 0.06397092621415032, + "language_loss": 0.85030043, + "learning_rate": 0.0008431627810324431, + "loss": 0.86135799, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.32421875, + "step": 1463, + "time_per_iteration": 2.6855740547180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109382, + "balance_loss_mlp": 1.0774579, + "epoch": 0.2816467872258561, + "flos": 451996070400.0, + "grad_norm": 0.06457367310459801, + "language_loss": 0.81006026, + "learning_rate": 0.000842936131439076, + "loss": 0.82115412, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.3190918, + "step": 1464, + "time_per_iteration": 2.5868756771087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102286, + "balance_loss_mlp": 1.07188725, + "epoch": 0.28183916891111965, + "flos": 472464755712.0, + "grad_norm": 0.06483114531916107, + "language_loss": 0.87564301, + "learning_rate": 0.0008427093487060951, + "loss": 0.88666582, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.3034668, + "step": 1465, + "time_per_iteration": 2.6775078773498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104224, + "balance_loss_mlp": 1.07294393, + "epoch": 0.28203155059638324, + "flos": 556770806784.0, + "grad_norm": 0.05163652452488039, + "language_loss": 0.84608126, + "learning_rate": 0.000842482432921545, + "loss": 0.85712349, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.3125, + "step": 1466, + "time_per_iteration": 2.844379186630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090816, + "balance_loss_mlp": 1.05955911, + "epoch": 0.28222393228164677, + "flos": 416756385792.0, + "grad_norm": 0.05726454257462379, + "language_loss": 0.86823475, + "learning_rate": 0.0008422553841735225, + "loss": 0.87914288, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.31225586, + "step": 1467, + "time_per_iteration": 2.4838902950286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087334, + "balance_loss_mlp": 1.05624461, + "epoch": 0.28241631396691036, + "flos": 604629577728.0, + "grad_norm": 0.07863392491108157, + "language_loss": 0.8442952, + "learning_rate": 0.0008420282025501757, + "loss": 0.85516858, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.31054688, + "step": 1468, + "time_per_iteration": 2.7528913021087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108248, + "balance_loss_mlp": 1.05169988, + "epoch": 0.2826086956521739, + "flos": 572698529280.0, + "grad_norm": 0.056003117579575636, + "language_loss": 0.852718, + "learning_rate": 0.0008418008881397043, + "loss": 0.86354285, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.30737305, + "step": 1469, + "time_per_iteration": 2.6801319122314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078886, + "balance_loss_mlp": 1.0479157, + "epoch": 0.2828010773374375, + "flos": 842367886848.0, + "grad_norm": 0.04937894089719141, + "language_loss": 0.82587177, + "learning_rate": 0.0008415734410303595, + "loss": 0.83666062, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.30932617, + "step": 1470, + "time_per_iteration": 3.1880481243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076388, + "balance_loss_mlp": 1.04551327, + "epoch": 0.28299345902270107, + "flos": 542402860032.0, + "grad_norm": 0.053571151454841835, + "language_loss": 0.90790403, + "learning_rate": 0.0008413458613104444, + "loss": 0.91866791, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.30834961, + "step": 1471, + "time_per_iteration": 2.6801347732543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107956, + "balance_loss_mlp": 1.04832768, + "epoch": 0.2831858407079646, + "flos": 571320635904.0, + "grad_norm": 0.054274543729309115, + "language_loss": 0.82964969, + "learning_rate": 0.0008411181490683129, + "loss": 0.84044528, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.31201172, + "step": 1472, + "time_per_iteration": 2.732304096221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107702, + "balance_loss_mlp": 1.04619205, + "epoch": 0.2833782223932282, + "flos": 763491861504.0, + "grad_norm": 0.05901735675502878, + "language_loss": 0.82318664, + "learning_rate": 0.0008408903043923707, + "loss": 0.83395684, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.30786133, + "step": 1473, + "time_per_iteration": 3.0503528118133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079988, + "balance_loss_mlp": 1.04906487, + "epoch": 0.2835706040784917, + "flos": 538793068032.0, + "grad_norm": 0.06313039437285956, + "language_loss": 0.81015414, + "learning_rate": 0.0008406623273710754, + "loss": 0.82095402, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.30883789, + "step": 1474, + "time_per_iteration": 2.606189727783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081508, + "balance_loss_mlp": 1.05008459, + "epoch": 0.2837629857637553, + "flos": 530329420800.0, + "grad_norm": 0.06295911479055617, + "language_loss": 0.82597101, + "learning_rate": 0.0008404342180929351, + "loss": 0.83678609, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.31396484, + "step": 1475, + "time_per_iteration": 2.620607614517212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073574, + "balance_loss_mlp": 1.04222226, + "epoch": 0.28395536744901884, + "flos": 539763526656.0, + "grad_norm": 0.06425181584365489, + "language_loss": 0.81938702, + "learning_rate": 0.00084020597664651, + "loss": 0.83012277, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.31323242, + "step": 1476, + "time_per_iteration": 2.7725043296813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083628, + "balance_loss_mlp": 1.05232406, + "epoch": 0.2841477491342824, + "flos": 573344510976.0, + "grad_norm": 0.06074887859321084, + "language_loss": 0.83907133, + "learning_rate": 0.0008399776031204111, + "loss": 0.84990764, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.31274414, + "step": 1477, + "time_per_iteration": 2.7300467491149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092258, + "balance_loss_mlp": 1.06081057, + "epoch": 0.28434013081954596, + "flos": 571802264064.0, + "grad_norm": 0.05838491012274946, + "language_loss": 0.80185568, + "learning_rate": 0.0008397490976033009, + "loss": 0.81277823, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.31420898, + "step": 1478, + "time_per_iteration": 2.650667905807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080543, + "balance_loss_mlp": 1.062042, + "epoch": 0.28453251250480954, + "flos": 1552554832896.0, + "grad_norm": 0.03640521186287318, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78960192, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.18457031, + "step": 1479, + "time_per_iteration": 4.764774322509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108592, + "balance_loss_mlp": 1.07654858, + "epoch": 0.28472489419007313, + "flos": 748720862208.0, + "grad_norm": 0.05702144306517339, + "language_loss": 0.85150903, + "learning_rate": 0.0008392916909509525, + "loss": 0.86259496, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.3203125, + "step": 1480, + "time_per_iteration": 3.0437960624694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104226, + "balance_loss_mlp": 1.07289815, + "epoch": 0.28491727587533666, + "flos": 489914376192.0, + "grad_norm": 0.06780557774925215, + "language_loss": 0.84802043, + "learning_rate": 0.0008390627899932954, + "loss": 0.85906273, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.31298828, + "step": 1481, + "time_per_iteration": 2.596781015396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100766, + "balance_loss_mlp": 1.0693903, + "epoch": 0.28510965756060025, + "flos": 728671196160.0, + "grad_norm": 0.07875184362779108, + "language_loss": 0.88996881, + "learning_rate": 0.000838833757399789, + "loss": 0.90097642, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.31347656, + "step": 1482, + "time_per_iteration": 2.94795560836792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084024, + "balance_loss_mlp": 1.05274367, + "epoch": 0.2853020392458638, + "flos": 551300083200.0, + "grad_norm": 0.07597770471398792, + "language_loss": 0.80484587, + "learning_rate": 0.0008386045932593515, + "loss": 0.81568611, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.3125, + "step": 1483, + "time_per_iteration": 2.6795289516448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079675, + "balance_loss_mlp": 1.0484184, + "epoch": 0.28549442093112737, + "flos": 754456425984.0, + "grad_norm": 0.05859914190414705, + "language_loss": 0.86136287, + "learning_rate": 0.0008383752976609525, + "loss": 0.8721596, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.31225586, + "step": 1484, + "time_per_iteration": 2.900468349456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080474, + "balance_loss_mlp": 1.04878783, + "epoch": 0.2856868026163909, + "flos": 538311439872.0, + "grad_norm": 0.0559282187978278, + "language_loss": 0.80215633, + "learning_rate": 0.0008381458706936123, + "loss": 0.81296104, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.31665039, + "step": 1485, + "time_per_iteration": 2.6815216541290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081909, + "balance_loss_mlp": 1.05031872, + "epoch": 0.2858791843016545, + "flos": 583487207424.0, + "grad_norm": 0.06658109550051822, + "language_loss": 0.87213105, + "learning_rate": 0.0008379163124464025, + "loss": 0.88295019, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.31567383, + "step": 1486, + "time_per_iteration": 2.7246947288513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098145, + "balance_loss_mlp": 1.06572032, + "epoch": 0.286071565986918, + "flos": 644503357440.0, + "grad_norm": 0.06266105362217729, + "language_loss": 0.76595891, + "learning_rate": 0.0008376866230084452, + "loss": 0.77694035, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.32421875, + "step": 1487, + "time_per_iteration": 2.8626444339752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102537, + "balance_loss_mlp": 1.07006407, + "epoch": 0.2862639476721816, + "flos": 491120561664.0, + "grad_norm": 0.07368717199594518, + "language_loss": 0.86109662, + "learning_rate": 0.000837456802468914, + "loss": 0.87212193, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.32470703, + "step": 1488, + "time_per_iteration": 2.5964457988739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109506, + "balance_loss_mlp": 1.07736683, + "epoch": 0.2864563293574452, + "flos": 521363796480.0, + "grad_norm": 0.0834333673185767, + "language_loss": 0.85148358, + "learning_rate": 0.0008372268509170331, + "loss": 0.86257863, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.32128906, + "step": 1489, + "time_per_iteration": 2.690129518508911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109667, + "balance_loss_mlp": 1.06500769, + "epoch": 0.2866487110427087, + "flos": 546834723840.0, + "grad_norm": 0.06354137393554884, + "language_loss": 0.84668255, + "learning_rate": 0.0008369967684420779, + "loss": 0.85764927, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.31640625, + "step": 1490, + "time_per_iteration": 2.71195912361145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084028, + "balance_loss_mlp": 1.0523901, + "epoch": 0.2868410927279723, + "flos": 481977437184.0, + "grad_norm": 0.054809792311278624, + "language_loss": 0.84395373, + "learning_rate": 0.0008367665551333736, + "loss": 0.85479403, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.31616211, + "step": 1491, + "time_per_iteration": 2.604795217514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083682, + "balance_loss_mlp": 1.05223465, + "epoch": 0.28703347441323585, + "flos": 724578365952.0, + "grad_norm": 0.06594588712207736, + "language_loss": 0.85254663, + "learning_rate": 0.0008365362110802977, + "loss": 0.86338341, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.31420898, + "step": 1492, + "time_per_iteration": 2.8853299617767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086881, + "balance_loss_mlp": 1.05619645, + "epoch": 0.28722585609849943, + "flos": 634670581248.0, + "grad_norm": 0.057648204576232445, + "language_loss": 0.82509673, + "learning_rate": 0.0008363057363722773, + "loss": 0.83596557, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.30664062, + "step": 1493, + "time_per_iteration": 2.8410117626190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088416, + "balance_loss_mlp": 1.05916238, + "epoch": 0.28741823778376296, + "flos": 509974216704.0, + "grad_norm": 0.06315135639172008, + "language_loss": 0.8381595, + "learning_rate": 0.0008360751310987906, + "loss": 0.84904373, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.29199219, + "step": 1494, + "time_per_iteration": 2.6032519340515137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088448, + "balance_loss_mlp": 1.05821633, + "epoch": 0.28761061946902655, + "flos": 603458297856.0, + "grad_norm": 0.0504042487563093, + "language_loss": 0.85491359, + "learning_rate": 0.0008358443953493666, + "loss": 0.865798, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.30175781, + "step": 1495, + "time_per_iteration": 2.859473943710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095118, + "balance_loss_mlp": 1.06586444, + "epoch": 0.28780300115429014, + "flos": 406977454080.0, + "grad_norm": 0.05765908021852543, + "language_loss": 0.87930727, + "learning_rate": 0.0008356135292135851, + "loss": 0.89025843, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.29223633, + "step": 1496, + "time_per_iteration": 2.5534088611602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092831, + "balance_loss_mlp": 1.06357718, + "epoch": 0.28799538283955367, + "flos": 374726310912.0, + "grad_norm": 0.06886872222290924, + "language_loss": 0.91869086, + "learning_rate": 0.0008353825327810758, + "loss": 0.92961913, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.29223633, + "step": 1497, + "time_per_iteration": 2.4516804218292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109953, + "balance_loss_mlp": 1.0700376, + "epoch": 0.28818776452481726, + "flos": 591645316608.0, + "grad_norm": 0.06787386534843613, + "language_loss": 0.81638563, + "learning_rate": 0.00083515140614152, + "loss": 0.8273809, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.29467773, + "step": 1498, + "time_per_iteration": 2.6799356937408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100433, + "balance_loss_mlp": 1.07136989, + "epoch": 0.2883801462100808, + "flos": 534819511296.0, + "grad_norm": 0.07094138317708479, + "language_loss": 0.861467, + "learning_rate": 0.0008349201493846485, + "loss": 0.87247133, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.2902832, + "step": 1499, + "time_per_iteration": 2.6408841609954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101106, + "balance_loss_mlp": 1.07190013, + "epoch": 0.2885725278953444, + "flos": 479850255360.0, + "grad_norm": 0.05864167405563355, + "language_loss": 0.88756049, + "learning_rate": 0.0008346887626002432, + "loss": 0.89857149, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.29174805, + "step": 1500, + "time_per_iteration": 2.527707099914551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102169, + "balance_loss_mlp": 1.07277215, + "epoch": 0.2887649095806079, + "flos": 463798877184.0, + "grad_norm": 0.05528939811548228, + "language_loss": 0.8596012, + "learning_rate": 0.000834457245878137, + "loss": 0.87062287, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.29345703, + "step": 1501, + "time_per_iteration": 2.6287105083465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097625, + "balance_loss_mlp": 1.0678941, + "epoch": 0.2889572912658715, + "flos": 930631527936.0, + "grad_norm": 0.05829487367290223, + "language_loss": 0.81370407, + "learning_rate": 0.000834225599308212, + "loss": 0.82468033, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.296875, + "step": 1502, + "time_per_iteration": 3.2405459880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097665, + "balance_loss_mlp": 1.06762409, + "epoch": 0.28914967295113503, + "flos": 569848200192.0, + "grad_norm": 0.0632270740356206, + "language_loss": 0.85299563, + "learning_rate": 0.0008339938229804016, + "loss": 0.86397231, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.30029297, + "step": 1503, + "time_per_iteration": 2.736917495727539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238462, + "balance_loss_mlp": 1.22091448, + "epoch": 0.2893420546363986, + "flos": 1485803119104.0, + "grad_norm": 0.0713987899259734, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76673281, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.17578125, + "step": 1504, + "time_per_iteration": 4.942230701446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085898, + "balance_loss_mlp": 1.0553329, + "epoch": 0.2895344363216622, + "flos": 469938903552.0, + "grad_norm": 0.06317842242163065, + "language_loss": 0.83872586, + "learning_rate": 0.0008335298814111094, + "loss": 0.84958482, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.30517578, + "step": 1505, + "time_per_iteration": 2.552032232284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082854, + "balance_loss_mlp": 1.05138254, + "epoch": 0.28972681800692573, + "flos": 647909508096.0, + "grad_norm": 0.05888591645587949, + "language_loss": 0.87955916, + "learning_rate": 0.0008332977163497455, + "loss": 0.89038765, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.31445312, + "step": 1506, + "time_per_iteration": 2.792531728744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080802, + "balance_loss_mlp": 1.0491637, + "epoch": 0.2899191996921893, + "flos": 571955033088.0, + "grad_norm": 0.058262801056698586, + "language_loss": 0.83412617, + "learning_rate": 0.0008330654218907325, + "loss": 0.84493423, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.31616211, + "step": 1507, + "time_per_iteration": 2.67161226272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082791, + "balance_loss_mlp": 1.05151033, + "epoch": 0.29011158137745285, + "flos": 661037773824.0, + "grad_norm": 0.053562219876337476, + "language_loss": 0.8135345, + "learning_rate": 0.0008328329981242548, + "loss": 0.8243624, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.3125, + "step": 1508, + "time_per_iteration": 2.8886146545410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082272, + "balance_loss_mlp": 1.05006218, + "epoch": 0.29030396306271644, + "flos": 535933974528.0, + "grad_norm": 0.059525688681207785, + "language_loss": 0.87796283, + "learning_rate": 0.0008326004451405475, + "loss": 0.88878554, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.32202148, + "step": 1509, + "time_per_iteration": 2.7613890171051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081166, + "balance_loss_mlp": 1.04919386, + "epoch": 0.29049634474798, + "flos": 511707110400.0, + "grad_norm": 0.06566805569484924, + "language_loss": 0.82636976, + "learning_rate": 0.0008323677630298957, + "loss": 0.83718145, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.31958008, + "step": 1510, + "time_per_iteration": 2.5723018646240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082627, + "balance_loss_mlp": 1.0500108, + "epoch": 0.29068872643324356, + "flos": 613454017536.0, + "grad_norm": 0.0587639353811087, + "language_loss": 0.84588593, + "learning_rate": 0.0008321349518826345, + "loss": 0.85671222, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.32617188, + "step": 1511, + "time_per_iteration": 2.7943453788757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085904, + "balance_loss_mlp": 1.05417013, + "epoch": 0.2908811081185071, + "flos": 546164011008.0, + "grad_norm": 0.07149106056529789, + "language_loss": 0.94572604, + "learning_rate": 0.0008319020117891491, + "loss": 0.95658505, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.31713867, + "step": 1512, + "time_per_iteration": 2.6216046810150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083214, + "balance_loss_mlp": 1.05095613, + "epoch": 0.2910734898037707, + "flos": 604516096512.0, + "grad_norm": 0.062137158428294176, + "language_loss": 0.87139338, + "learning_rate": 0.0008316689428398751, + "loss": 0.88222551, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.32250977, + "step": 1513, + "time_per_iteration": 2.7016332149505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082979, + "balance_loss_mlp": 1.05217493, + "epoch": 0.29126587148903427, + "flos": 574383370752.0, + "grad_norm": 0.048438835392173675, + "language_loss": 0.88380623, + "learning_rate": 0.0008314357451252979, + "loss": 0.89463598, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.30761719, + "step": 1514, + "time_per_iteration": 2.7707033157348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108486, + "balance_loss_mlp": 1.05329311, + "epoch": 0.2914582531742978, + "flos": 570802692096.0, + "grad_norm": 0.17247024929444854, + "language_loss": 0.87881547, + "learning_rate": 0.0008312024187359527, + "loss": 0.88966405, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.31542969, + "step": 1515, + "time_per_iteration": 2.6432881355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071467, + "balance_loss_mlp": 1.04083025, + "epoch": 0.2916506348595614, + "flos": 730523363328.0, + "grad_norm": 0.05532389066983382, + "language_loss": 0.86925149, + "learning_rate": 0.000830968963762425, + "loss": 0.8799662, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.3059082, + "step": 1516, + "time_per_iteration": 3.024911403656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070719, + "balance_loss_mlp": 1.03955793, + "epoch": 0.2918430165448249, + "flos": 510220118016.0, + "grad_norm": 0.06371457252332635, + "language_loss": 0.83926201, + "learning_rate": 0.0008307353802953497, + "loss": 0.84996927, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.3112793, + "step": 1517, + "time_per_iteration": 2.6853716373443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072896, + "balance_loss_mlp": 1.04202044, + "epoch": 0.2920353982300885, + "flos": 630096122880.0, + "grad_norm": 0.04882989118503786, + "language_loss": 0.86122108, + "learning_rate": 0.0008305016684254125, + "loss": 0.87195003, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.30859375, + "step": 1518, + "time_per_iteration": 2.799062728881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077466, + "balance_loss_mlp": 1.04589891, + "epoch": 0.29222777991535204, + "flos": 501411644928.0, + "grad_norm": 0.06769299348115199, + "language_loss": 0.86794329, + "learning_rate": 0.0008302678282433479, + "loss": 0.87871796, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.31542969, + "step": 1519, + "time_per_iteration": 2.607813835144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079374, + "balance_loss_mlp": 1.0473547, + "epoch": 0.2924201616006156, + "flos": 486522782208.0, + "grad_norm": 0.06836141022194388, + "language_loss": 0.84857148, + "learning_rate": 0.0008300338598399411, + "loss": 0.85936522, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.32006836, + "step": 1520, + "time_per_iteration": 2.6339783668518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079776, + "balance_loss_mlp": 1.04677844, + "epoch": 0.2926125432858792, + "flos": 476211350016.0, + "grad_norm": 0.07756319993269217, + "language_loss": 0.94405806, + "learning_rate": 0.0008297997633060263, + "loss": 0.9548558, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.33007812, + "step": 1521, + "time_per_iteration": 2.534118175506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072717, + "balance_loss_mlp": 1.03991103, + "epoch": 0.29280492497114274, + "flos": 676379151360.0, + "grad_norm": 0.05829817081366362, + "language_loss": 0.85078239, + "learning_rate": 0.0008295655387324883, + "loss": 0.86150956, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.328125, + "step": 1522, + "time_per_iteration": 2.8296775817871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072427, + "balance_loss_mlp": 1.04031241, + "epoch": 0.29299730665640633, + "flos": 458175384576.0, + "grad_norm": 0.07682732219120929, + "language_loss": 0.8501184, + "learning_rate": 0.0008293311862102609, + "loss": 0.8608427, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.32104492, + "step": 1523, + "time_per_iteration": 2.5440309047698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077196, + "balance_loss_mlp": 1.044366, + "epoch": 0.29318968834166986, + "flos": 446343464448.0, + "grad_norm": 0.0685602534850527, + "language_loss": 0.88674849, + "learning_rate": 0.0008290967058303275, + "loss": 0.89752042, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.32836914, + "step": 1524, + "time_per_iteration": 2.47611403465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107333, + "balance_loss_mlp": 1.04138136, + "epoch": 0.29338207002693345, + "flos": 450085676544.0, + "grad_norm": 0.06274350285183052, + "language_loss": 0.86149156, + "learning_rate": 0.0008288620976837219, + "loss": 0.87222481, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.31933594, + "step": 1525, + "time_per_iteration": 2.497141122817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076595, + "balance_loss_mlp": 1.04409802, + "epoch": 0.293574451712197, + "flos": 502027103232.0, + "grad_norm": 0.056882926132582716, + "language_loss": 0.82547259, + "learning_rate": 0.000828627361861527, + "loss": 0.8362385, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.32495117, + "step": 1526, + "time_per_iteration": 2.567631959915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074146, + "balance_loss_mlp": 1.04157782, + "epoch": 0.29376683339746057, + "flos": 696158184960.0, + "grad_norm": 0.06286177552115993, + "language_loss": 0.84273493, + "learning_rate": 0.0008283924984548752, + "loss": 0.85347635, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.32568359, + "step": 1527, + "time_per_iteration": 2.8300318717956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075655, + "balance_loss_mlp": 1.04270601, + "epoch": 0.2939592150827241, + "flos": 478353088512.0, + "grad_norm": 0.05246647038375997, + "language_loss": 0.84726572, + "learning_rate": 0.0008281575075549485, + "loss": 0.85802233, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.32958984, + "step": 1528, + "time_per_iteration": 2.574363946914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144512, + "balance_loss_mlp": 1.12400758, + "epoch": 0.2941515967679877, + "flos": 1484482042368.0, + "grad_norm": 0.05743835109314035, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78497207, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.20507812, + "step": 1529, + "time_per_iteration": 4.712693452835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085379, + "balance_loss_mlp": 1.05316901, + "epoch": 0.2943439784532513, + "flos": 673848916992.0, + "grad_norm": 0.06778682509264199, + "language_loss": 0.90275097, + "learning_rate": 0.0008276871436402469, + "loss": 0.9136048, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.32202148, + "step": 1530, + "time_per_iteration": 2.839327812194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098938, + "balance_loss_mlp": 1.06801534, + "epoch": 0.2945363601385148, + "flos": 576031896576.0, + "grad_norm": 0.05712547612295055, + "language_loss": 0.87684029, + "learning_rate": 0.000827451770808083, + "loss": 0.88782966, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.30908203, + "step": 1531, + "time_per_iteration": 2.6601221561431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101215, + "balance_loss_mlp": 1.06921971, + "epoch": 0.2947287418237784, + "flos": 480416251392.0, + "grad_norm": 0.06660356736231628, + "language_loss": 0.82939392, + "learning_rate": 0.0008272162708478674, + "loss": 0.84040606, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.31982422, + "step": 1532, + "time_per_iteration": 2.5689916610717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093792, + "balance_loss_mlp": 1.06234503, + "epoch": 0.2949211235090419, + "flos": 557917355520.0, + "grad_norm": 0.09954158315547566, + "language_loss": 0.86026615, + "learning_rate": 0.000826980643851029, + "loss": 0.87120402, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.31420898, + "step": 1533, + "time_per_iteration": 2.668490409851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096886, + "balance_loss_mlp": 1.06560588, + "epoch": 0.2951135051943055, + "flos": 483646311936.0, + "grad_norm": 0.06068587162994625, + "language_loss": 0.84473491, + "learning_rate": 0.0008267448899090464, + "loss": 0.85570371, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.3125, + "step": 1534, + "time_per_iteration": 2.5667166709899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111174, + "balance_loss_mlp": 1.08053756, + "epoch": 0.29530588687956905, + "flos": 550015322112.0, + "grad_norm": 0.07629507960375684, + "language_loss": 0.80660546, + "learning_rate": 0.0008265090091134473, + "loss": 0.81771713, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.3059082, + "step": 1535, + "time_per_iteration": 2.8708250522613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108767, + "balance_loss_mlp": 1.07793915, + "epoch": 0.29549826856483263, + "flos": 672731481600.0, + "grad_norm": 0.06117244877185189, + "language_loss": 0.80140841, + "learning_rate": 0.0008262730015558088, + "loss": 0.81249607, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.30786133, + "step": 1536, + "time_per_iteration": 2.872954845428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100004, + "balance_loss_mlp": 1.06960511, + "epoch": 0.29569065025009617, + "flos": 764300786688.0, + "grad_norm": 0.058742702923310866, + "language_loss": 0.82196116, + "learning_rate": 0.0008260368673277574, + "loss": 0.8329612, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.3034668, + "step": 1537, + "time_per_iteration": 3.1321218013763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099555, + "balance_loss_mlp": 1.06963336, + "epoch": 0.29588303193535975, + "flos": 543398049792.0, + "grad_norm": 0.0781542924594719, + "language_loss": 0.83699298, + "learning_rate": 0.0008258006065209682, + "loss": 0.84798855, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.29882812, + "step": 1538, + "time_per_iteration": 2.7713711261749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108634, + "balance_loss_mlp": 1.0791415, + "epoch": 0.29607541362062334, + "flos": 596648968704.0, + "grad_norm": 0.060396297474130736, + "language_loss": 0.80198979, + "learning_rate": 0.0008255642192271657, + "loss": 0.81307614, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.29443359, + "step": 1539, + "time_per_iteration": 2.770426034927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104575, + "balance_loss_mlp": 1.07525003, + "epoch": 0.29626779530588687, + "flos": 609588149760.0, + "grad_norm": 0.061957869610313854, + "language_loss": 0.8370012, + "learning_rate": 0.0008253277055381241, + "loss": 0.8480469, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.29296875, + "step": 1540, + "time_per_iteration": 2.818236827850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101049, + "balance_loss_mlp": 1.07196212, + "epoch": 0.29646017699115046, + "flos": 867050237952.0, + "grad_norm": 0.0808235318545815, + "language_loss": 0.85973728, + "learning_rate": 0.0008250910655456658, + "loss": 0.8707478, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.29052734, + "step": 1541, + "time_per_iteration": 3.122596502304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097236, + "balance_loss_mlp": 1.06888783, + "epoch": 0.296652558676414, + "flos": 495616444416.0, + "grad_norm": 0.06915250684599016, + "language_loss": 0.83763367, + "learning_rate": 0.0008248542993416625, + "loss": 0.84860599, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.28369141, + "step": 1542, + "time_per_iteration": 2.5910961627960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093651, + "balance_loss_mlp": 1.06408739, + "epoch": 0.2968449403616776, + "flos": 571275555840.0, + "grad_norm": 0.05605218699384054, + "language_loss": 0.8378318, + "learning_rate": 0.0008246174070180352, + "loss": 0.84876835, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.29516602, + "step": 1543, + "time_per_iteration": 2.6633899211883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010919, + "balance_loss_mlp": 1.06312323, + "epoch": 0.2970373220469411, + "flos": 793799115264.0, + "grad_norm": 0.07006000939384768, + "language_loss": 0.83787405, + "learning_rate": 0.0008243803886667537, + "loss": 0.84879309, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.28759766, + "step": 1544, + "time_per_iteration": 3.114450216293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092222, + "balance_loss_mlp": 1.0623486, + "epoch": 0.2972297037322047, + "flos": 660736617984.0, + "grad_norm": 0.06063612617340172, + "language_loss": 0.78866625, + "learning_rate": 0.0008241432443798364, + "loss": 0.79958844, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.2980957, + "step": 1545, + "time_per_iteration": 2.830487012863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095453, + "balance_loss_mlp": 1.06491208, + "epoch": 0.29742208541746823, + "flos": 596849789952.0, + "grad_norm": 0.05072672460675934, + "language_loss": 0.85210156, + "learning_rate": 0.0008239059742493512, + "loss": 0.86305606, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.30493164, + "step": 1546, + "time_per_iteration": 2.7311577796936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096869, + "balance_loss_mlp": 1.06654167, + "epoch": 0.2976144671027318, + "flos": 769519816704.0, + "grad_norm": 0.06216195389248957, + "language_loss": 0.87149853, + "learning_rate": 0.0008236685783674142, + "loss": 0.88246721, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.30273438, + "step": 1547, + "time_per_iteration": 3.122184991836548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195158, + "balance_loss_mlp": 1.17408168, + "epoch": 0.2978068487879954, + "flos": 1483980065280.0, + "grad_norm": 0.0711099730375168, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77416348, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.2109375, + "step": 1548, + "time_per_iteration": 4.884527683258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112807, + "balance_loss_mlp": 1.08190823, + "epoch": 0.29799923047325894, + "flos": 475079357952.0, + "grad_norm": 0.0721948840315393, + "language_loss": 0.82155961, + "learning_rate": 0.0008231934097178955, + "loss": 0.83268768, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.30859375, + "step": 1549, + "time_per_iteration": 2.6486034393310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099898, + "balance_loss_mlp": 1.06845081, + "epoch": 0.2981916121585225, + "flos": 759464460288.0, + "grad_norm": 0.06744191732210313, + "language_loss": 0.85654205, + "learning_rate": 0.0008229556371347903, + "loss": 0.86754102, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.31420898, + "step": 1550, + "time_per_iteration": 2.973072052001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096299, + "balance_loss_mlp": 1.06530416, + "epoch": 0.29838399384378606, + "flos": 874642351104.0, + "grad_norm": 0.063776129703287, + "language_loss": 0.79039407, + "learning_rate": 0.0008227177391691874, + "loss": 0.80135703, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.30957031, + "step": 1551, + "time_per_iteration": 3.121493339538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091, + "balance_loss_mlp": 1.05948138, + "epoch": 0.29857637552904964, + "flos": 579389995008.0, + "grad_norm": 0.06994546641795159, + "language_loss": 0.89363164, + "learning_rate": 0.0008224797159134463, + "loss": 0.90454161, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.31494141, + "step": 1552, + "time_per_iteration": 2.714345932006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085005, + "balance_loss_mlp": 1.05272293, + "epoch": 0.2987687572143132, + "flos": 836048950272.0, + "grad_norm": 0.0687696840960861, + "language_loss": 0.83498526, + "learning_rate": 0.0008222415674599765, + "loss": 0.84583527, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.32275391, + "step": 1553, + "time_per_iteration": 3.0709471702575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108737, + "balance_loss_mlp": 1.05482578, + "epoch": 0.29896113889957676, + "flos": 566800022016.0, + "grad_norm": 0.05942841135237563, + "language_loss": 0.83069479, + "learning_rate": 0.0008220032939012349, + "loss": 0.84156853, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.32543945, + "step": 1554, + "time_per_iteration": 2.6579041481018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084574, + "balance_loss_mlp": 1.05069458, + "epoch": 0.29915352058484035, + "flos": 498370669056.0, + "grad_norm": 0.05066559322117623, + "language_loss": 0.87862611, + "learning_rate": 0.0008217648953297277, + "loss": 0.88947189, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.33886719, + "step": 1555, + "time_per_iteration": 2.854501962661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080889, + "balance_loss_mlp": 1.04836845, + "epoch": 0.2993459022701039, + "flos": 591837373440.0, + "grad_norm": 0.06306800858294438, + "language_loss": 0.78177649, + "learning_rate": 0.0008215263718380095, + "loss": 0.79258537, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.32519531, + "step": 1556, + "time_per_iteration": 2.679813861846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072521, + "balance_loss_mlp": 1.03988135, + "epoch": 0.29953828395536747, + "flos": 572107802112.0, + "grad_norm": 0.05857921257987888, + "language_loss": 0.84453404, + "learning_rate": 0.0008212877235186833, + "loss": 0.8552593, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.32641602, + "step": 1557, + "time_per_iteration": 2.674333333969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074496, + "balance_loss_mlp": 1.0575211, + "epoch": 0.299730665640631, + "flos": 1503855051264.0, + "grad_norm": 0.03849586533955073, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78812063, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.16992188, + "step": 1558, + "time_per_iteration": 4.915595531463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073624, + "balance_loss_mlp": 1.04193807, + "epoch": 0.2999230473258946, + "flos": 513538928640.0, + "grad_norm": 0.06731849387550101, + "language_loss": 0.80882478, + "learning_rate": 0.0008208100527678611, + "loss": 0.81956106, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.31665039, + "step": 1559, + "time_per_iteration": 2.584726572036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107407, + "balance_loss_mlp": 1.04162097, + "epoch": 0.3001154290111581, + "flos": 834128381952.0, + "grad_norm": 0.07382200765663921, + "language_loss": 0.78279877, + "learning_rate": 0.0008205710305218135, + "loss": 0.79353946, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.32446289, + "step": 1560, + "time_per_iteration": 3.0383710861206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074163, + "balance_loss_mlp": 1.04302561, + "epoch": 0.3003078106964217, + "flos": 556485617664.0, + "grad_norm": 0.058207727477831525, + "language_loss": 0.89512408, + "learning_rate": 0.0008203318838190541, + "loss": 0.90586567, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.31103516, + "step": 1561, + "time_per_iteration": 2.76627516746521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077695, + "balance_loss_mlp": 1.04662895, + "epoch": 0.30050019238168524, + "flos": 525897556992.0, + "grad_norm": 0.06168132254821995, + "language_loss": 0.85111785, + "learning_rate": 0.0008200926127524281, + "loss": 0.86189479, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.31030273, + "step": 1562, + "time_per_iteration": 2.6629600524902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077501, + "balance_loss_mlp": 1.04641104, + "epoch": 0.3006925740669488, + "flos": 577582907904.0, + "grad_norm": 0.05613480590592382, + "language_loss": 0.82944739, + "learning_rate": 0.0008198532174148289, + "loss": 0.84022236, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.31054688, + "step": 1563, + "time_per_iteration": 2.7358763217926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059198, + "balance_loss_mlp": 1.042413, + "epoch": 0.3008849557522124, + "flos": 1489408528896.0, + "grad_norm": 0.031593282863211954, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81745368, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.16796875, + "step": 1564, + "time_per_iteration": 4.9148335456848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082495, + "balance_loss_mlp": 1.05264509, + "epoch": 0.30107733743747594, + "flos": 509565371904.0, + "grad_norm": 0.06408713771925002, + "language_loss": 0.88499033, + "learning_rate": 0.0008193740542985244, + "loss": 0.89581525, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.2980957, + "step": 1565, + "time_per_iteration": 2.6895992755889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010784, + "balance_loss_mlp": 1.04955089, + "epoch": 0.30126971912273953, + "flos": 587425858560.0, + "grad_norm": 0.05458149708053591, + "language_loss": 0.86310005, + "learning_rate": 0.0008191342867058467, + "loss": 0.87388408, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.28833008, + "step": 1566, + "time_per_iteration": 2.7972991466522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087074, + "balance_loss_mlp": 1.05708098, + "epoch": 0.30146210080800306, + "flos": 601822918656.0, + "grad_norm": 0.07332398387540356, + "language_loss": 0.8337127, + "learning_rate": 0.0008188943952142509, + "loss": 0.84458339, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.29931641, + "step": 1567, + "time_per_iteration": 2.7908260822296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090521, + "balance_loss_mlp": 1.06203008, + "epoch": 0.30165448249326665, + "flos": 917424686592.0, + "grad_norm": 0.06528974392408285, + "language_loss": 0.82496703, + "learning_rate": 0.0008186543799168711, + "loss": 0.83587217, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.28491211, + "step": 1568, + "time_per_iteration": 3.1478142738342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090151, + "balance_loss_mlp": 1.06170726, + "epoch": 0.3018468641785302, + "flos": 776953368576.0, + "grad_norm": 0.05489125757590388, + "language_loss": 0.87973905, + "learning_rate": 0.0008184142409068892, + "loss": 0.89064056, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.28466797, + "step": 1569, + "time_per_iteration": 3.0216779708862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085926, + "balance_loss_mlp": 1.05767381, + "epoch": 0.30203924586379377, + "flos": 522101500416.0, + "grad_norm": 0.055531787765466835, + "language_loss": 0.86334872, + "learning_rate": 0.000818173978277536, + "loss": 0.87420803, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.2824707, + "step": 1570, + "time_per_iteration": 2.679858922958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092107, + "balance_loss_mlp": 1.06378245, + "epoch": 0.3022316275490573, + "flos": 524288318976.0, + "grad_norm": 0.07890485552513911, + "language_loss": 0.83764422, + "learning_rate": 0.000817933592122089, + "loss": 0.84856522, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.28344727, + "step": 1571, + "time_per_iteration": 2.7156453132629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097909, + "balance_loss_mlp": 1.06936991, + "epoch": 0.3024240092343209, + "flos": 479672755200.0, + "grad_norm": 0.06172775968750255, + "language_loss": 0.83209121, + "learning_rate": 0.0008176930825338749, + "loss": 0.84307027, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.28564453, + "step": 1572, + "time_per_iteration": 2.6125760078430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092858, + "balance_loss_mlp": 1.06474876, + "epoch": 0.3026163909195845, + "flos": 686901579264.0, + "grad_norm": 0.07609523017386281, + "language_loss": 0.88406599, + "learning_rate": 0.0008174524496062679, + "loss": 0.8949945, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.28100586, + "step": 1573, + "time_per_iteration": 2.9266738891601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093192, + "balance_loss_mlp": 1.06472516, + "epoch": 0.302808772604848, + "flos": 542654553600.0, + "grad_norm": 0.061281594343297996, + "language_loss": 0.85176635, + "learning_rate": 0.0008172116934326894, + "loss": 0.86269826, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.28466797, + "step": 1574, + "time_per_iteration": 2.78182315826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093702, + "balance_loss_mlp": 1.06499696, + "epoch": 0.3030011542901116, + "flos": 474852395520.0, + "grad_norm": 0.061003462460527645, + "language_loss": 0.87581599, + "learning_rate": 0.0008169708141066097, + "loss": 0.88675308, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.28686523, + "step": 1575, + "time_per_iteration": 2.579521894454956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095615, + "balance_loss_mlp": 1.06631374, + "epoch": 0.30319353597537513, + "flos": 481233940992.0, + "grad_norm": 0.06494361929352876, + "language_loss": 0.90285015, + "learning_rate": 0.0008167298117215465, + "loss": 0.91380632, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.29272461, + "step": 1576, + "time_per_iteration": 2.576373815536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109664, + "balance_loss_mlp": 1.06729078, + "epoch": 0.3033859176606387, + "flos": 704455916544.0, + "grad_norm": 0.06029453435911351, + "language_loss": 0.87511861, + "learning_rate": 0.0008164886863710649, + "loss": 0.88608503, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.29296875, + "step": 1577, + "time_per_iteration": 2.913679599761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097606, + "balance_loss_mlp": 1.06847095, + "epoch": 0.30357829934590225, + "flos": 764344456704.0, + "grad_norm": 0.06219192746352704, + "language_loss": 0.86087388, + "learning_rate": 0.0008162474381487783, + "loss": 0.87184995, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.29101562, + "step": 1578, + "time_per_iteration": 3.0120038986206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089575, + "balance_loss_mlp": 1.05979693, + "epoch": 0.30377068103116583, + "flos": 532082663424.0, + "grad_norm": 0.07133259007734825, + "language_loss": 0.84352636, + "learning_rate": 0.0008160060671483475, + "loss": 0.85442215, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.29711914, + "step": 1579, + "time_per_iteration": 2.6448450088500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087505, + "balance_loss_mlp": 1.05729711, + "epoch": 0.3039630627164294, + "flos": 509934928896.0, + "grad_norm": 0.06969729270721756, + "language_loss": 0.83291966, + "learning_rate": 0.0008157645734634809, + "loss": 0.8437947, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.30200195, + "step": 1580, + "time_per_iteration": 2.623994827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219684, + "balance_loss_mlp": 1.20118308, + "epoch": 0.30415544440169295, + "flos": 1505206803456.0, + "grad_norm": 0.06785469110901753, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78116179, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.18457031, + "step": 1581, + "time_per_iteration": 4.945984840393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134498, + "balance_loss_mlp": 1.11723626, + "epoch": 0.30434782608695654, + "flos": 1457976637440.0, + "grad_norm": 0.04727039603147748, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74348998, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.17285156, + "step": 1582, + "time_per_iteration": 4.907581567764282 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094198, + "balance_loss_mlp": 1.06482506, + "epoch": 0.3045402077722201, + "flos": 482312088576.0, + "grad_norm": 0.06103997784231323, + "language_loss": 0.83613545, + "learning_rate": 0.000815039357240067, + "loss": 0.84707743, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.29345703, + "step": 1583, + "time_per_iteration": 2.6569504737854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098856, + "balance_loss_mlp": 1.07053173, + "epoch": 0.30473258945748366, + "flos": 543220549632.0, + "grad_norm": 0.05926881191118497, + "language_loss": 0.85445809, + "learning_rate": 0.0008147973737554952, + "loss": 0.86544669, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.28344727, + "step": 1584, + "time_per_iteration": 2.8048319816589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105359, + "balance_loss_mlp": 1.07682085, + "epoch": 0.3049249711427472, + "flos": 566789847552.0, + "grad_norm": 0.06192456547731419, + "language_loss": 0.85451925, + "learning_rate": 0.000814555268055744, + "loss": 0.86557281, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.28540039, + "step": 1585, + "time_per_iteration": 2.6496644020080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111676, + "balance_loss_mlp": 1.08265996, + "epoch": 0.3051173528280108, + "flos": 527970894336.0, + "grad_norm": 0.06812003210241727, + "language_loss": 0.87046736, + "learning_rate": 0.0008143130402348073, + "loss": 0.88158417, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.28979492, + "step": 1586, + "time_per_iteration": 2.6643214225769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105818, + "balance_loss_mlp": 1.07644498, + "epoch": 0.3053097345132743, + "flos": 586097427456.0, + "grad_norm": 0.055468457342214825, + "language_loss": 0.79345113, + "learning_rate": 0.0008140706903867265, + "loss": 0.80450928, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.29345703, + "step": 1587, + "time_per_iteration": 2.793938159942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095768, + "balance_loss_mlp": 1.06610858, + "epoch": 0.3055021161985379, + "flos": 606810604032.0, + "grad_norm": 0.06572122415162869, + "language_loss": 0.90151691, + "learning_rate": 0.0008138282186055897, + "loss": 0.91247463, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.29614258, + "step": 1588, + "time_per_iteration": 2.7083215713500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093617, + "balance_loss_mlp": 1.06414866, + "epoch": 0.3056944978838015, + "flos": 573594794496.0, + "grad_norm": 0.07456080522357873, + "language_loss": 0.82026887, + "learning_rate": 0.0008135856249855331, + "loss": 0.83120513, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.29467773, + "step": 1589, + "time_per_iteration": 2.6640753746032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108636, + "balance_loss_mlp": 1.05720115, + "epoch": 0.305886879569065, + "flos": 633640485888.0, + "grad_norm": 0.06169186885540492, + "language_loss": 0.89804673, + "learning_rate": 0.0008133429096207398, + "loss": 0.90891039, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.29125977, + "step": 1590, + "time_per_iteration": 2.7599587440490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180768, + "balance_loss_mlp": 1.16407835, + "epoch": 0.3060792612543286, + "flos": 1368227414016.0, + "grad_norm": 0.058161185258212886, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76493025, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.16699219, + "step": 1591, + "time_per_iteration": 4.928807973861694 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092058, + "balance_loss_mlp": 1.06149244, + "epoch": 0.30627164293959214, + "flos": 518290887168.0, + "grad_norm": 0.05378358074526122, + "language_loss": 0.86363673, + "learning_rate": 0.0008128571140339123, + "loss": 0.87455726, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.30517578, + "step": 1592, + "time_per_iteration": 2.6374073028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093229, + "balance_loss_mlp": 1.06182945, + "epoch": 0.3064640246248557, + "flos": 455354168832.0, + "grad_norm": 0.059608258439458016, + "language_loss": 0.87261879, + "learning_rate": 0.0008126140340004805, + "loss": 0.88355112, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.3137207, + "step": 1593, + "time_per_iteration": 2.5177900791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106947, + "balance_loss_mlp": 1.07528496, + "epoch": 0.30665640631011926, + "flos": 849718480896.0, + "grad_norm": 0.05384575425533411, + "language_loss": 0.82083076, + "learning_rate": 0.0008123708325995172, + "loss": 0.83190024, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.31640625, + "step": 1594, + "time_per_iteration": 3.230646848678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106567, + "balance_loss_mlp": 1.07466626, + "epoch": 0.30684878799538284, + "flos": 757996406784.0, + "grad_norm": 0.05828956025392548, + "language_loss": 0.79435146, + "learning_rate": 0.0008121275099254414, + "loss": 0.80541706, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.31884766, + "step": 1595, + "time_per_iteration": 2.902198553085327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100784, + "balance_loss_mlp": 1.07000458, + "epoch": 0.3070411696806464, + "flos": 517320428544.0, + "grad_norm": 0.0810481792888773, + "language_loss": 0.87996, + "learning_rate": 0.0008118840660727194, + "loss": 0.89096785, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.30761719, + "step": 1596, + "time_per_iteration": 2.6448442935943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086051, + "balance_loss_mlp": 1.05465174, + "epoch": 0.30723355136590996, + "flos": 843883992576.0, + "grad_norm": 0.06221817840069264, + "language_loss": 0.87278962, + "learning_rate": 0.0008116405011358644, + "loss": 0.88365012, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.3137207, + "step": 1597, + "time_per_iteration": 3.1513490676879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084783, + "balance_loss_mlp": 1.05455184, + "epoch": 0.30742593305117355, + "flos": 465905710080.0, + "grad_norm": 0.05780846158028219, + "language_loss": 0.79670262, + "learning_rate": 0.0008113968152094369, + "loss": 0.80755049, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.30175781, + "step": 1598, + "time_per_iteration": 2.5093207359313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081718, + "balance_loss_mlp": 1.05160582, + "epoch": 0.3076183147364371, + "flos": 686286120960.0, + "grad_norm": 0.05742950260468591, + "language_loss": 0.822034, + "learning_rate": 0.0008111530083880438, + "loss": 0.83285123, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.30078125, + "step": 1599, + "time_per_iteration": 2.9002020359039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083164, + "balance_loss_mlp": 1.05333805, + "epoch": 0.30781069642170067, + "flos": 613729032192.0, + "grad_norm": 0.066825138462863, + "language_loss": 0.86253393, + "learning_rate": 0.0008109090807663399, + "loss": 0.87336552, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.29760742, + "step": 1600, + "time_per_iteration": 2.8091297149658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078593, + "balance_loss_mlp": 1.04921985, + "epoch": 0.3080030781069642, + "flos": 590021521920.0, + "grad_norm": 0.05248494232095894, + "language_loss": 0.88362008, + "learning_rate": 0.0008106650324390257, + "loss": 0.89440602, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.29370117, + "step": 1601, + "time_per_iteration": 2.8476614952087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080904, + "balance_loss_mlp": 1.05072021, + "epoch": 0.3081954597922278, + "flos": 562353601536.0, + "grad_norm": 0.06836714374526962, + "language_loss": 0.81128752, + "learning_rate": 0.0008104208635008493, + "loss": 0.82209659, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.30151367, + "step": 1602, + "time_per_iteration": 2.6952836513519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108624, + "balance_loss_mlp": 1.05665243, + "epoch": 0.3083878414774913, + "flos": 447599112192.0, + "grad_norm": 0.06376665529861299, + "language_loss": 0.81538713, + "learning_rate": 0.0008101765740466058, + "loss": 0.82624954, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.29541016, + "step": 1603, + "time_per_iteration": 2.4948389530181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080977, + "balance_loss_mlp": 1.05098414, + "epoch": 0.3085802231627549, + "flos": 493297205760.0, + "grad_norm": 0.06931980864978393, + "language_loss": 0.84338289, + "learning_rate": 0.0008099321641711364, + "loss": 0.85419261, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.29931641, + "step": 1604, + "time_per_iteration": 2.707308769226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093892, + "balance_loss_mlp": 1.06249225, + "epoch": 0.3087726048480185, + "flos": 487437986304.0, + "grad_norm": 0.060864651717696075, + "language_loss": 0.83160985, + "learning_rate": 0.0008096876339693295, + "loss": 0.84254879, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.3137207, + "step": 1605, + "time_per_iteration": 2.731968402862549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094701, + "balance_loss_mlp": 1.06353974, + "epoch": 0.308964986533282, + "flos": 730265877504.0, + "grad_norm": 0.06509347225319946, + "language_loss": 0.8101337, + "learning_rate": 0.0008094429835361206, + "loss": 0.8210808, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.3112793, + "step": 1606, + "time_per_iteration": 2.9290759563446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089997, + "balance_loss_mlp": 1.05914617, + "epoch": 0.3091573682185456, + "flos": 605131554816.0, + "grad_norm": 0.057098253953708926, + "language_loss": 0.8565855, + "learning_rate": 0.0008091982129664908, + "loss": 0.86748546, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.30810547, + "step": 1607, + "time_per_iteration": 2.698822021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087412, + "balance_loss_mlp": 1.05558348, + "epoch": 0.30934974990380915, + "flos": 460081396224.0, + "grad_norm": 0.06809183454795278, + "language_loss": 0.82921505, + "learning_rate": 0.0008089533223554687, + "loss": 0.8400892, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.31811523, + "step": 1608, + "time_per_iteration": 2.7226502895355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108171, + "balance_loss_mlp": 1.05116844, + "epoch": 0.30954213158907273, + "flos": 553142075904.0, + "grad_norm": 0.05457453553086006, + "language_loss": 0.85192972, + "learning_rate": 0.0008087083117981294, + "loss": 0.86274683, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.30493164, + "step": 1609, + "time_per_iteration": 2.8990776538848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079, + "balance_loss_mlp": 1.04733825, + "epoch": 0.30973451327433627, + "flos": 552776901120.0, + "grad_norm": 0.05682891267097286, + "language_loss": 0.87723553, + "learning_rate": 0.0008084631813895943, + "loss": 0.88802552, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.31665039, + "step": 1610, + "time_per_iteration": 2.8217973709106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077424, + "balance_loss_mlp": 1.04538095, + "epoch": 0.30992689495959985, + "flos": 565430893056.0, + "grad_norm": 0.06653230383850259, + "language_loss": 0.83695799, + "learning_rate": 0.0008082179312250315, + "loss": 0.84773219, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.3203125, + "step": 1611, + "time_per_iteration": 2.6502630710601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157933, + "balance_loss_mlp": 1.13905036, + "epoch": 0.3101192766448634, + "flos": 1441621131264.0, + "grad_norm": 0.03907624866068961, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.81013775, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.18847656, + "step": 1612, + "time_per_iteration": 4.846347332000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142611, + "balance_loss_mlp": 1.12401426, + "epoch": 0.31031165833012697, + "flos": 1531086575616.0, + "grad_norm": 0.03590336133433786, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77771938, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.18554688, + "step": 1613, + "time_per_iteration": 5.076608896255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085959, + "balance_loss_mlp": 1.05432057, + "epoch": 0.31050404001539056, + "flos": 991534196736.0, + "grad_norm": 0.06574200684353006, + "language_loss": 0.81847739, + "learning_rate": 0.0008074814631475545, + "loss": 0.829337, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.31616211, + "step": 1614, + "time_per_iteration": 3.354888916015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086834, + "balance_loss_mlp": 1.05552983, + "epoch": 0.3106964217006541, + "flos": 445748355072.0, + "grad_norm": 0.058665683967318874, + "language_loss": 0.79078931, + "learning_rate": 0.0008072357349114907, + "loss": 0.80165768, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.31274414, + "step": 1615, + "time_per_iteration": 2.66959810256958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085653, + "balance_loss_mlp": 1.05427742, + "epoch": 0.3108888033859177, + "flos": 510259405824.0, + "grad_norm": 0.07028059658598983, + "language_loss": 0.88604105, + "learning_rate": 0.0008069898873959363, + "loss": 0.89689755, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.31347656, + "step": 1616, + "time_per_iteration": 2.652873992919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081821, + "balance_loss_mlp": 1.04932451, + "epoch": 0.3110811850711812, + "flos": 520471913472.0, + "grad_norm": 0.0549356144381418, + "language_loss": 0.85724425, + "learning_rate": 0.0008067439206963375, + "loss": 0.86806244, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.32495117, + "step": 1617, + "time_per_iteration": 2.651966094970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078025, + "balance_loss_mlp": 1.04707837, + "epoch": 0.3112735667564448, + "flos": 686085299712.0, + "grad_norm": 0.06196009796144799, + "language_loss": 0.86023569, + "learning_rate": 0.0008064978349081873, + "loss": 0.87101597, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.30908203, + "step": 1618, + "time_per_iteration": 2.9655556678771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076741, + "balance_loss_mlp": 1.04403007, + "epoch": 0.31146594844170833, + "flos": 532786871808.0, + "grad_norm": 0.05286958899784421, + "language_loss": 0.86531937, + "learning_rate": 0.0008062516301270245, + "loss": 0.87608671, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.32714844, + "step": 1619, + "time_per_iteration": 2.6688730716705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077635, + "balance_loss_mlp": 1.04668832, + "epoch": 0.3116583301269719, + "flos": 679187220480.0, + "grad_norm": 0.04767982292239376, + "language_loss": 0.88103712, + "learning_rate": 0.0008060053064484343, + "loss": 0.89181346, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.30908203, + "step": 1620, + "time_per_iteration": 2.9296655654907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078439, + "balance_loss_mlp": 1.04794526, + "epoch": 0.31185071181223545, + "flos": 585855908352.0, + "grad_norm": 0.062218975842766755, + "language_loss": 0.85253787, + "learning_rate": 0.0008057588639680482, + "loss": 0.86332226, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.3046875, + "step": 1621, + "time_per_iteration": 2.7567451000213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077048, + "balance_loss_mlp": 1.04686427, + "epoch": 0.31204309349749904, + "flos": 725090517504.0, + "grad_norm": 0.06694670244497776, + "language_loss": 0.82797694, + "learning_rate": 0.0008055123027815434, + "loss": 0.83874738, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.30151367, + "step": 1622, + "time_per_iteration": 2.9208602905273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077079, + "balance_loss_mlp": 1.04610825, + "epoch": 0.3122354751827626, + "flos": 576558604800.0, + "grad_norm": 0.1782498685509151, + "language_loss": 0.84590065, + "learning_rate": 0.0008052656229846436, + "loss": 0.85667145, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.30932617, + "step": 1623, + "time_per_iteration": 2.7155866622924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073968, + "balance_loss_mlp": 1.04328322, + "epoch": 0.31242785686802615, + "flos": 575672514048.0, + "grad_norm": 0.060959339396114136, + "language_loss": 0.90353578, + "learning_rate": 0.0008050188246731182, + "loss": 0.91427553, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.30664062, + "step": 1624, + "time_per_iteration": 2.6797330379486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076074, + "balance_loss_mlp": 1.04412627, + "epoch": 0.31262023855328974, + "flos": 736490271744.0, + "grad_norm": 0.055606567643031936, + "language_loss": 0.81689882, + "learning_rate": 0.0008047719079427834, + "loss": 0.82765961, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.31933594, + "step": 1625, + "time_per_iteration": 3.0065042972564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130441, + "balance_loss_mlp": 1.11031902, + "epoch": 0.3128126202385533, + "flos": 1558395113472.0, + "grad_norm": 0.04475298972307083, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75482148, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.20117188, + "step": 1626, + "time_per_iteration": 4.811471462249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079382, + "balance_loss_mlp": 1.04688525, + "epoch": 0.31300500192381686, + "flos": 514666538496.0, + "grad_norm": 0.07327685166102689, + "language_loss": 0.86126161, + "learning_rate": 0.0008042777196091757, + "loss": 0.87205535, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.32495117, + "step": 1627, + "time_per_iteration": 2.673499584197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108474, + "balance_loss_mlp": 1.05241048, + "epoch": 0.3131973836090804, + "flos": 526370420736.0, + "grad_norm": 0.055253724304277024, + "language_loss": 0.81718934, + "learning_rate": 0.0008040304481977643, + "loss": 0.82803679, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.32324219, + "step": 1628, + "time_per_iteration": 2.655608654022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088465, + "balance_loss_mlp": 1.0556109, + "epoch": 0.313389765294344, + "flos": 822473961984.0, + "grad_norm": 0.07469207399290811, + "language_loss": 0.86699098, + "learning_rate": 0.0008037830587512649, + "loss": 0.87787557, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.32861328, + "step": 1629, + "time_per_iteration": 3.092052459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108569, + "balance_loss_mlp": 1.0538609, + "epoch": 0.31358214697960757, + "flos": 393604697088.0, + "grad_norm": 0.05491200172004239, + "language_loss": 0.78946573, + "learning_rate": 0.0008035355513657224, + "loss": 0.80032265, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.31811523, + "step": 1630, + "time_per_iteration": 2.539320468902588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082653, + "balance_loss_mlp": 1.05111051, + "epoch": 0.3137745286648711, + "flos": 571611617280.0, + "grad_norm": 0.05139869194515267, + "language_loss": 0.92925692, + "learning_rate": 0.0008032879261372279, + "loss": 0.94008344, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.31518555, + "step": 1631, + "time_per_iteration": 2.779520034790039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107671, + "balance_loss_mlp": 1.05868566, + "epoch": 0.3139669103501347, + "flos": 1497614690304.0, + "grad_norm": 0.031013784922197977, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80712551, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.18066406, + "step": 1632, + "time_per_iteration": 5.371822357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078996, + "balance_loss_mlp": 1.04828787, + "epoch": 0.3141592920353982, + "flos": 525090041856.0, + "grad_norm": 0.055553714952817974, + "language_loss": 0.87074977, + "learning_rate": 0.0008027923225359748, + "loss": 0.8815397, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.30688477, + "step": 1633, + "time_per_iteration": 2.6381123065948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078973, + "balance_loss_mlp": 1.04797852, + "epoch": 0.3143516737206618, + "flos": 592989714432.0, + "grad_norm": 0.05859649155609266, + "language_loss": 0.88228178, + "learning_rate": 0.0008025443443556267, + "loss": 0.89307147, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.30957031, + "step": 1634, + "time_per_iteration": 2.7031404972076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078155, + "balance_loss_mlp": 1.04785156, + "epoch": 0.31454405540592534, + "flos": 648034573824.0, + "grad_norm": 0.052081770011180493, + "language_loss": 0.88152099, + "learning_rate": 0.000802296248717147, + "loss": 0.89230251, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.30273438, + "step": 1635, + "time_per_iteration": 2.9598543643951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080703, + "balance_loss_mlp": 1.05080533, + "epoch": 0.3147364370911889, + "flos": 642543501312.0, + "grad_norm": 0.066530556652877, + "language_loss": 0.78616363, + "learning_rate": 0.0008020480357168554, + "loss": 0.79697067, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.29833984, + "step": 1636, + "time_per_iteration": 2.797565221786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010823, + "balance_loss_mlp": 1.05261683, + "epoch": 0.31492881877645246, + "flos": 471607778304.0, + "grad_norm": 0.1046412191682548, + "language_loss": 0.87883365, + "learning_rate": 0.0008017997054511165, + "loss": 0.88965666, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.29638672, + "step": 1637, + "time_per_iteration": 2.559032440185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078208, + "balance_loss_mlp": 1.04733276, + "epoch": 0.31512120046171604, + "flos": 629135838720.0, + "grad_norm": 0.05513941849331592, + "language_loss": 0.85624552, + "learning_rate": 0.0008015512580163407, + "loss": 0.86702752, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.30834961, + "step": 1638, + "time_per_iteration": 2.779050827026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074987, + "balance_loss_mlp": 1.04363525, + "epoch": 0.31531358214697963, + "flos": 703460726784.0, + "grad_norm": 0.05557291013478606, + "language_loss": 0.81019449, + "learning_rate": 0.0008013026935089838, + "loss": 0.82094443, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.31323242, + "step": 1639, + "time_per_iteration": 2.89715838432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076962, + "balance_loss_mlp": 1.04701638, + "epoch": 0.31550596383224316, + "flos": 572275127808.0, + "grad_norm": 0.06613944709877946, + "language_loss": 0.8358075, + "learning_rate": 0.0008010540120255472, + "loss": 0.84657711, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.29882812, + "step": 1640, + "time_per_iteration": 2.651386260986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077047, + "balance_loss_mlp": 1.0463388, + "epoch": 0.31569834551750675, + "flos": 658047822336.0, + "grad_norm": 0.07317243700129339, + "language_loss": 0.86339968, + "learning_rate": 0.0008008052136625774, + "loss": 0.87417012, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.30688477, + "step": 1641, + "time_per_iteration": 2.7859702110290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077184, + "balance_loss_mlp": 1.04642797, + "epoch": 0.3158907272027703, + "flos": 566002681344.0, + "grad_norm": 0.05078324108170858, + "language_loss": 0.86915755, + "learning_rate": 0.0008005562985166666, + "loss": 0.87992936, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.30712891, + "step": 1642, + "time_per_iteration": 2.770359516143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078554, + "balance_loss_mlp": 1.04775047, + "epoch": 0.31608310888803387, + "flos": 536622216192.0, + "grad_norm": 0.048579646337906, + "language_loss": 0.85256124, + "learning_rate": 0.0008003072666844524, + "loss": 0.86334682, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.30761719, + "step": 1643, + "time_per_iteration": 2.6892380714416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081754, + "balance_loss_mlp": 1.05076003, + "epoch": 0.3162754905732974, + "flos": 486428239872.0, + "grad_norm": 0.06943709441331726, + "language_loss": 0.82542813, + "learning_rate": 0.0008000581182626173, + "loss": 0.83624566, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.30981445, + "step": 1644, + "time_per_iteration": 2.550408124923706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085079, + "balance_loss_mlp": 1.05496669, + "epoch": 0.316467872258561, + "flos": 529792538112.0, + "grad_norm": 0.05777646040930187, + "language_loss": 0.86256635, + "learning_rate": 0.0007998088533478894, + "loss": 0.87341708, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.30053711, + "step": 1645, + "time_per_iteration": 2.646522283554077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081027, + "balance_loss_mlp": 1.05019915, + "epoch": 0.3166602539438245, + "flos": 443197771776.0, + "grad_norm": 0.07748310873558778, + "language_loss": 0.84388101, + "learning_rate": 0.000799559472037042, + "loss": 0.85469127, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.30786133, + "step": 1646, + "time_per_iteration": 2.562495708465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081594, + "balance_loss_mlp": 1.05112433, + "epoch": 0.3168526356290881, + "flos": 645513103872.0, + "grad_norm": 0.0644603274178606, + "language_loss": 0.87469906, + "learning_rate": 0.0007993099744268932, + "loss": 0.88551497, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.30419922, + "step": 1647, + "time_per_iteration": 2.905468225479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074972, + "balance_loss_mlp": 1.04414475, + "epoch": 0.3170450173143517, + "flos": 585889403904.0, + "grad_norm": 0.06139744482341488, + "language_loss": 0.87846816, + "learning_rate": 0.000799060360614307, + "loss": 0.88921791, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.30786133, + "step": 1648, + "time_per_iteration": 2.6811182498931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083311, + "balance_loss_mlp": 1.05250716, + "epoch": 0.3172373989996152, + "flos": 826763231232.0, + "grad_norm": 0.05150264807756507, + "language_loss": 0.83281147, + "learning_rate": 0.0007988106306961917, + "loss": 0.84364462, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.30761719, + "step": 1649, + "time_per_iteration": 3.132918119430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078108, + "balance_loss_mlp": 1.04840076, + "epoch": 0.3174297806848788, + "flos": 527153204736.0, + "grad_norm": 0.0787550229152594, + "language_loss": 0.84213352, + "learning_rate": 0.0007985607847695014, + "loss": 0.85291457, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.29663086, + "step": 1650, + "time_per_iteration": 2.690056085586548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078554, + "balance_loss_mlp": 1.04784608, + "epoch": 0.31762216237014235, + "flos": 712855544832.0, + "grad_norm": 0.0566788479410698, + "language_loss": 0.82883936, + "learning_rate": 0.0007983108229312345, + "loss": 0.83962488, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.30664062, + "step": 1651, + "time_per_iteration": 2.918217182159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077737, + "balance_loss_mlp": 1.04679036, + "epoch": 0.31781454405540593, + "flos": 483567736320.0, + "grad_norm": 0.0674507609019882, + "language_loss": 0.86496019, + "learning_rate": 0.0007980607452784351, + "loss": 0.87573761, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.30908203, + "step": 1652, + "time_per_iteration": 2.5508391857147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081941, + "balance_loss_mlp": 1.052019, + "epoch": 0.31800692574066947, + "flos": 548483249664.0, + "grad_norm": 0.06063063486045483, + "language_loss": 0.90349394, + "learning_rate": 0.0007978105519081919, + "loss": 0.91431332, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.29858398, + "step": 1653, + "time_per_iteration": 2.6819515228271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079168, + "balance_loss_mlp": 1.04910302, + "epoch": 0.31819930742593305, + "flos": 516640951296.0, + "grad_norm": 0.0738675373878511, + "language_loss": 0.87538201, + "learning_rate": 0.0007975602429176385, + "loss": 0.88617373, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.30004883, + "step": 1654, + "time_per_iteration": 2.586261034011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084486, + "balance_loss_mlp": 1.05356312, + "epoch": 0.31839168911119664, + "flos": 455748456960.0, + "grad_norm": 0.051475836139836105, + "language_loss": 0.81585073, + "learning_rate": 0.0007973098184039536, + "loss": 0.82669556, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.30883789, + "step": 1655, + "time_per_iteration": 2.66395902633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083198, + "balance_loss_mlp": 1.05291927, + "epoch": 0.3185840707964602, + "flos": 625719513600.0, + "grad_norm": 0.059751712008043044, + "language_loss": 0.86801946, + "learning_rate": 0.0007970592784643602, + "loss": 0.87885141, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.30224609, + "step": 1656, + "time_per_iteration": 2.9186086654663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087926, + "balance_loss_mlp": 1.05855238, + "epoch": 0.31877645248172376, + "flos": 567213249024.0, + "grad_norm": 0.07875703275612048, + "language_loss": 0.85285407, + "learning_rate": 0.0007968086231961272, + "loss": 0.86373335, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.29321289, + "step": 1657, + "time_per_iteration": 2.6505343914031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089245, + "balance_loss_mlp": 1.05941832, + "epoch": 0.3189688341669873, + "flos": 489338205696.0, + "grad_norm": 0.08653253817480935, + "language_loss": 0.8381049, + "learning_rate": 0.0007965578526965671, + "loss": 0.84899735, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.29785156, + "step": 1658, + "time_per_iteration": 2.5884180068969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089397, + "balance_loss_mlp": 1.05995274, + "epoch": 0.3191612158522509, + "flos": 575948938752.0, + "grad_norm": 0.05523051502884026, + "language_loss": 0.86312473, + "learning_rate": 0.0007963069670630377, + "loss": 0.87401861, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.29394531, + "step": 1659, + "time_per_iteration": 2.750601291656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089678, + "balance_loss_mlp": 1.05997133, + "epoch": 0.3193535975375144, + "flos": 537867689472.0, + "grad_norm": 0.06732717892338919, + "language_loss": 0.8810066, + "learning_rate": 0.0007960559663929416, + "loss": 0.89190334, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.29663086, + "step": 1660, + "time_per_iteration": 2.6370737552642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096524, + "balance_loss_mlp": 1.06633985, + "epoch": 0.319545979222778, + "flos": 733954245120.0, + "grad_norm": 0.0532651376254825, + "language_loss": 0.87495023, + "learning_rate": 0.0007958048507837259, + "loss": 0.88591546, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.30151367, + "step": 1661, + "time_per_iteration": 2.942779779434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093392, + "balance_loss_mlp": 1.06316066, + "epoch": 0.31973836090804153, + "flos": 764136433152.0, + "grad_norm": 0.07710421129836972, + "language_loss": 0.87092876, + "learning_rate": 0.0007955536203328822, + "loss": 0.8818627, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.30175781, + "step": 1662, + "time_per_iteration": 2.8991520404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100595, + "balance_loss_mlp": 1.07072091, + "epoch": 0.3199307425933051, + "flos": 560252560896.0, + "grad_norm": 0.05380031942726595, + "language_loss": 0.8344577, + "learning_rate": 0.0007953022751379469, + "loss": 0.84546363, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.2980957, + "step": 1663, + "time_per_iteration": 2.795117139816284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102553, + "balance_loss_mlp": 1.07239294, + "epoch": 0.3201231242785687, + "flos": 751019751936.0, + "grad_norm": 0.0657811186180598, + "language_loss": 0.81884921, + "learning_rate": 0.000795050815296501, + "loss": 0.82987475, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.30151367, + "step": 1664, + "time_per_iteration": 2.969935894012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099283, + "balance_loss_mlp": 1.06890798, + "epoch": 0.32031550596383224, + "flos": 496157709312.0, + "grad_norm": 0.058736361347452894, + "language_loss": 0.93026185, + "learning_rate": 0.0007947992409061695, + "loss": 0.94125462, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.30322266, + "step": 1665, + "time_per_iteration": 2.585144281387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092624, + "balance_loss_mlp": 1.06182027, + "epoch": 0.3205078876490958, + "flos": 731294562816.0, + "grad_norm": 0.05523611327933496, + "language_loss": 0.8654207, + "learning_rate": 0.0007945475520646226, + "loss": 0.87634689, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.30761719, + "step": 1666, + "time_per_iteration": 2.9349849224090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092223, + "balance_loss_mlp": 1.06249237, + "epoch": 0.32070026933435936, + "flos": 549177283584.0, + "grad_norm": 0.05521997897435197, + "language_loss": 0.84546125, + "learning_rate": 0.0007942957488695743, + "loss": 0.85638344, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.296875, + "step": 1667, + "time_per_iteration": 2.6538572311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083745, + "balance_loss_mlp": 1.0539664, + "epoch": 0.32089265101962294, + "flos": 744949536768.0, + "grad_norm": 0.05331163349230756, + "language_loss": 0.81038171, + "learning_rate": 0.0007940438314187833, + "loss": 0.82121915, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.29760742, + "step": 1668, + "time_per_iteration": 3.009927988052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108075, + "balance_loss_mlp": 1.05016077, + "epoch": 0.3210850327048865, + "flos": 493937395200.0, + "grad_norm": 0.06087879277496283, + "language_loss": 0.80221838, + "learning_rate": 0.0007937917998100529, + "loss": 0.81302583, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.30541992, + "step": 1669, + "time_per_iteration": 2.5703017711639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072786, + "balance_loss_mlp": 1.0426501, + "epoch": 0.32127741439015006, + "flos": 530383265280.0, + "grad_norm": 0.07064769089672658, + "language_loss": 0.78527176, + "learning_rate": 0.0007935396541412302, + "loss": 0.79599965, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.30102539, + "step": 1670, + "time_per_iteration": 2.625499725341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081422, + "balance_loss_mlp": 1.05099988, + "epoch": 0.3214697960754136, + "flos": 500948955648.0, + "grad_norm": 0.0720065018777928, + "language_loss": 0.8546167, + "learning_rate": 0.0007932873945102068, + "loss": 0.86543095, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.30395508, + "step": 1671, + "time_per_iteration": 2.6188762187957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074685, + "balance_loss_mlp": 1.05713737, + "epoch": 0.3216621777606772, + "flos": 1382579394048.0, + "grad_norm": 0.027722134190714592, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76836461, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.17578125, + "step": 1672, + "time_per_iteration": 4.9278037548065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081072, + "balance_loss_mlp": 1.05057812, + "epoch": 0.32185455944594077, + "flos": 571260999168.0, + "grad_norm": 0.053011814820585035, + "language_loss": 0.86121267, + "learning_rate": 0.0007927825337533461, + "loss": 0.87202334, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.3046875, + "step": 1673, + "time_per_iteration": 2.6787123680114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075926, + "balance_loss_mlp": 1.0452652, + "epoch": 0.3220469411312043, + "flos": 543652715520.0, + "grad_norm": 0.06681709765508774, + "language_loss": 0.84770656, + "learning_rate": 0.0007925299328235131, + "loss": 0.85846579, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.30615234, + "step": 1674, + "time_per_iteration": 2.638434410095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080022, + "balance_loss_mlp": 1.04890847, + "epoch": 0.3222393228164679, + "flos": 490884834816.0, + "grad_norm": 0.06949369164102485, + "language_loss": 0.84795958, + "learning_rate": 0.000792277218323488, + "loss": 0.85875976, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.31103516, + "step": 1675, + "time_per_iteration": 2.5852880477905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077792, + "balance_loss_mlp": 1.04653537, + "epoch": 0.3224317045017314, + "flos": 490145720832.0, + "grad_norm": 0.06490362841252771, + "language_loss": 0.84737194, + "learning_rate": 0.0007920243903513833, + "loss": 0.85814989, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.31225586, + "step": 1676, + "time_per_iteration": 2.558058261871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083649, + "balance_loss_mlp": 1.0523684, + "epoch": 0.322624086186995, + "flos": 575505188352.0, + "grad_norm": 0.0667244817356676, + "language_loss": 0.83645618, + "learning_rate": 0.0007917714490053556, + "loss": 0.84729266, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.3125, + "step": 1677, + "time_per_iteration": 2.6619315147399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082413, + "balance_loss_mlp": 1.05046487, + "epoch": 0.32281646787225854, + "flos": 628974305280.0, + "grad_norm": 0.05833648566333407, + "language_loss": 0.85744321, + "learning_rate": 0.0007915183943836055, + "loss": 0.8682673, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.31933594, + "step": 1678, + "time_per_iteration": 2.8658525943756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079268, + "balance_loss_mlp": 1.04729617, + "epoch": 0.3230088495575221, + "flos": 781036024320.0, + "grad_norm": 0.06725353636254193, + "language_loss": 0.84315777, + "learning_rate": 0.0007912652265843773, + "loss": 0.8539505, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.31958008, + "step": 1679, + "time_per_iteration": 3.0343000888824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108303, + "balance_loss_mlp": 1.05019951, + "epoch": 0.3232012312427857, + "flos": 535839432192.0, + "grad_norm": 0.062193961969532426, + "language_loss": 0.81564045, + "learning_rate": 0.0007910119457059597, + "loss": 0.82647079, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.32836914, + "step": 1680, + "time_per_iteration": 2.6963257789611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085423, + "balance_loss_mlp": 1.05333161, + "epoch": 0.32339361292804925, + "flos": 704515553280.0, + "grad_norm": 0.0682304205879652, + "language_loss": 0.80304003, + "learning_rate": 0.0007907585518466849, + "loss": 0.81389421, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.32080078, + "step": 1681, + "time_per_iteration": 2.969540596008301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081665, + "balance_loss_mlp": 1.05026531, + "epoch": 0.32358599461331283, + "flos": 452099377152.0, + "grad_norm": 0.06175447283803796, + "language_loss": 0.89361274, + "learning_rate": 0.000790505045104929, + "loss": 0.90442938, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.3137207, + "step": 1682, + "time_per_iteration": 2.5148813724517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082889, + "balance_loss_mlp": 1.05108356, + "epoch": 0.32377837629857636, + "flos": 600597794304.0, + "grad_norm": 0.061424377243362256, + "language_loss": 0.87097234, + "learning_rate": 0.0007902514255791125, + "loss": 0.88180125, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.31787109, + "step": 1683, + "time_per_iteration": 2.7773754596710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078151, + "balance_loss_mlp": 1.04696608, + "epoch": 0.32397075798383995, + "flos": 807180636672.0, + "grad_norm": 0.06766194852988328, + "language_loss": 0.87911332, + "learning_rate": 0.0007899976933676986, + "loss": 0.88989484, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.31176758, + "step": 1684, + "time_per_iteration": 2.9700520038604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078008, + "balance_loss_mlp": 1.04589295, + "epoch": 0.3241631396691035, + "flos": 601414073856.0, + "grad_norm": 0.061649412189834635, + "language_loss": 0.87300712, + "learning_rate": 0.0007897438485691955, + "loss": 0.88378721, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.32104492, + "step": 1685, + "time_per_iteration": 2.6798696517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077233, + "balance_loss_mlp": 1.04483223, + "epoch": 0.32435552135436707, + "flos": 473980861440.0, + "grad_norm": 0.06379930216662907, + "language_loss": 0.823452, + "learning_rate": 0.0007894898912821542, + "loss": 0.83422434, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.32397461, + "step": 1686, + "time_per_iteration": 2.5478906631469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071757, + "balance_loss_mlp": 1.03978539, + "epoch": 0.3245479030396306, + "flos": 537824019456.0, + "grad_norm": 0.05321818652056826, + "language_loss": 0.86522776, + "learning_rate": 0.0007892358216051695, + "loss": 0.87594533, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.31958008, + "step": 1687, + "time_per_iteration": 2.735633134841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075777, + "balance_loss_mlp": 1.04251742, + "epoch": 0.3247402847248942, + "flos": 547394927616.0, + "grad_norm": 0.0608133700269358, + "language_loss": 0.91922832, + "learning_rate": 0.0007889816396368803, + "loss": 0.92998612, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.33276367, + "step": 1688, + "time_per_iteration": 2.6234939098358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077878, + "balance_loss_mlp": 1.04497576, + "epoch": 0.3249326664101578, + "flos": 377941814784.0, + "grad_norm": 0.0630363811740232, + "language_loss": 0.85370868, + "learning_rate": 0.0007887273454759687, + "loss": 0.86448747, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.32910156, + "step": 1689, + "time_per_iteration": 2.4698379039764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074583, + "balance_loss_mlp": 1.04184794, + "epoch": 0.3251250480954213, + "flos": 527818125312.0, + "grad_norm": 0.06604183912716106, + "language_loss": 0.82445431, + "learning_rate": 0.0007884729392211603, + "loss": 0.83520007, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.32739258, + "step": 1690, + "time_per_iteration": 2.6488864421844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081602, + "balance_loss_mlp": 1.04920113, + "epoch": 0.3253174297806849, + "flos": 449435312640.0, + "grad_norm": 0.06849578130600678, + "language_loss": 0.85280114, + "learning_rate": 0.0007882184209712245, + "loss": 0.86361718, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.32397461, + "step": 1691, + "time_per_iteration": 2.5209100246429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080531, + "balance_loss_mlp": 1.04874992, + "epoch": 0.32550981146594843, + "flos": 703855014912.0, + "grad_norm": 0.06225581397596747, + "language_loss": 0.8573736, + "learning_rate": 0.000787963790824974, + "loss": 0.8681789, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.31762695, + "step": 1692, + "time_per_iteration": 2.9696617126464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092322, + "balance_loss_mlp": 1.06054115, + "epoch": 0.325702193151212, + "flos": 392491643904.0, + "grad_norm": 0.0857009989212748, + "language_loss": 0.89660913, + "learning_rate": 0.0007877090488812651, + "loss": 0.90753233, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.31762695, + "step": 1693, + "time_per_iteration": 2.431861639022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086739, + "balance_loss_mlp": 1.05553031, + "epoch": 0.32589457483647555, + "flos": 577223525376.0, + "grad_norm": 0.07076453254267401, + "language_loss": 0.8368417, + "learning_rate": 0.0007874541952389973, + "loss": 0.84770912, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.31176758, + "step": 1694, + "time_per_iteration": 2.647468328475952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084594, + "balance_loss_mlp": 1.05293202, + "epoch": 0.32608695652173914, + "flos": 498092834304.0, + "grad_norm": 0.060562687008333366, + "language_loss": 0.86582285, + "learning_rate": 0.0007871992299971136, + "loss": 0.87666881, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.31640625, + "step": 1695, + "time_per_iteration": 2.553171396255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092239, + "balance_loss_mlp": 1.0608871, + "epoch": 0.32627933820700267, + "flos": 590858150400.0, + "grad_norm": 0.05969457295977618, + "language_loss": 0.84301764, + "learning_rate": 0.0007869441532546001, + "loss": 0.85394001, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.31323242, + "step": 1696, + "time_per_iteration": 2.752049446105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093733, + "balance_loss_mlp": 1.06247652, + "epoch": 0.32647171989226625, + "flos": 608790809088.0, + "grad_norm": 0.05927141137383595, + "language_loss": 0.79686946, + "learning_rate": 0.0007866889651104867, + "loss": 0.80780673, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.31225586, + "step": 1697, + "time_per_iteration": 2.7691686153411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109533, + "balance_loss_mlp": 1.06388259, + "epoch": 0.32666410157752984, + "flos": 476896619520.0, + "grad_norm": 0.0715366482234757, + "language_loss": 0.83218181, + "learning_rate": 0.000786433665663846, + "loss": 0.84313512, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.31420898, + "step": 1698, + "time_per_iteration": 2.717372179031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098821, + "balance_loss_mlp": 1.06816053, + "epoch": 0.3268564832627934, + "flos": 718060018176.0, + "grad_norm": 0.05645489658390659, + "language_loss": 0.86431837, + "learning_rate": 0.0007861782550137942, + "loss": 0.87530661, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.30615234, + "step": 1699, + "time_per_iteration": 2.9035465717315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103768, + "balance_loss_mlp": 1.07394195, + "epoch": 0.32704886494805696, + "flos": 768469372416.0, + "grad_norm": 0.11170286971508382, + "language_loss": 0.85853553, + "learning_rate": 0.0007859227332594901, + "loss": 0.86957312, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.29785156, + "step": 1700, + "time_per_iteration": 2.9302797317504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093978, + "balance_loss_mlp": 1.06508183, + "epoch": 0.3272412466333205, + "flos": 849540980736.0, + "grad_norm": 0.07200471053268022, + "language_loss": 0.84801477, + "learning_rate": 0.0007856671005001365, + "loss": 0.85895455, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.28881836, + "step": 1701, + "time_per_iteration": 3.1760013103485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090985, + "balance_loss_mlp": 1.06225514, + "epoch": 0.3274336283185841, + "flos": 831224208384.0, + "grad_norm": 0.07453437515979243, + "language_loss": 0.81870627, + "learning_rate": 0.0007854113568349787, + "loss": 0.82961613, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.28686523, + "step": 1702, + "time_per_iteration": 3.1038365364074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087868, + "balance_loss_mlp": 1.05770779, + "epoch": 0.3276260100038476, + "flos": 691721938944.0, + "grad_norm": 0.07528598974040544, + "language_loss": 0.80317354, + "learning_rate": 0.0007851555023633052, + "loss": 0.81405228, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.30102539, + "step": 1703, + "time_per_iteration": 2.847515106201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085228, + "balance_loss_mlp": 1.0558784, + "epoch": 0.3278183916891112, + "flos": 435831211008.0, + "grad_norm": 0.08040178147570827, + "language_loss": 0.82301831, + "learning_rate": 0.0007848995371844474, + "loss": 0.83387053, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.29296875, + "step": 1704, + "time_per_iteration": 2.5442426204681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098029, + "balance_loss_mlp": 1.06872725, + "epoch": 0.3280107733743748, + "flos": 460883119104.0, + "grad_norm": 0.06101842979524802, + "language_loss": 0.80441558, + "learning_rate": 0.0007846434613977801, + "loss": 0.81539583, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.29296875, + "step": 1705, + "time_per_iteration": 2.5023465156555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091561, + "balance_loss_mlp": 1.06242633, + "epoch": 0.3282031550596383, + "flos": 679018484736.0, + "grad_norm": 0.07007502801083235, + "language_loss": 0.78621399, + "learning_rate": 0.0007843872751027203, + "loss": 0.79712963, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.29125977, + "step": 1706, + "time_per_iteration": 2.790001392364502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094895, + "balance_loss_mlp": 1.06549811, + "epoch": 0.3283955367449019, + "flos": 544821023232.0, + "grad_norm": 0.05836443006497643, + "language_loss": 0.87259293, + "learning_rate": 0.0007841309783987287, + "loss": 0.88354194, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.29345703, + "step": 1707, + "time_per_iteration": 2.7478153705596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097713, + "balance_loss_mlp": 1.0684588, + "epoch": 0.32858791843016544, + "flos": 481017153024.0, + "grad_norm": 0.05888352709782848, + "language_loss": 0.89055538, + "learning_rate": 0.0007838745713853084, + "loss": 0.90153247, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.29199219, + "step": 1708, + "time_per_iteration": 2.588653802871704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088275, + "balance_loss_mlp": 1.05925906, + "epoch": 0.328780300115429, + "flos": 566529389568.0, + "grad_norm": 0.06397878577513526, + "language_loss": 0.8386358, + "learning_rate": 0.0007836180541620053, + "loss": 0.8495186, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.29003906, + "step": 1709, + "time_per_iteration": 2.7023067474365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091144, + "balance_loss_mlp": 1.06191421, + "epoch": 0.32897268180069256, + "flos": 475787948544.0, + "grad_norm": 0.05521592697878337, + "language_loss": 0.86435962, + "learning_rate": 0.0007833614268284082, + "loss": 0.87527102, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.29199219, + "step": 1710, + "time_per_iteration": 2.538080930709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090653, + "balance_loss_mlp": 1.0721513, + "epoch": 0.32916506348595614, + "flos": 1576517008896.0, + "grad_norm": 0.029520146980468998, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75200427, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.18457031, + "step": 1711, + "time_per_iteration": 4.909448862075806 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089192, + "balance_loss_mlp": 1.05965161, + "epoch": 0.3293574451712197, + "flos": 482646739968.0, + "grad_norm": 0.07803051984240059, + "language_loss": 0.78501904, + "learning_rate": 0.0007828478422289016, + "loss": 0.79591095, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.29492188, + "step": 1712, + "time_per_iteration": 2.5883195400238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092173, + "balance_loss_mlp": 1.06210816, + "epoch": 0.32954982685648326, + "flos": 622266872832.0, + "grad_norm": 0.05953292046858541, + "language_loss": 0.88987601, + "learning_rate": 0.0007825908851623833, + "loss": 0.90079772, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.30004883, + "step": 1713, + "time_per_iteration": 2.7441718578338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089127, + "balance_loss_mlp": 1.05973005, + "epoch": 0.32974220854174685, + "flos": 544697367552.0, + "grad_norm": 0.06609176393308323, + "language_loss": 0.8478905, + "learning_rate": 0.0007823338183843533, + "loss": 0.85878181, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.29394531, + "step": 1714, + "time_per_iteration": 2.6771602630615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092243, + "balance_loss_mlp": 1.06291747, + "epoch": 0.3299345902270104, + "flos": 981740708352.0, + "grad_norm": 0.10875146541446083, + "language_loss": 0.80569458, + "learning_rate": 0.0007820766419946141, + "loss": 0.81661701, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.29321289, + "step": 1715, + "time_per_iteration": 3.3068225383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108798, + "balance_loss_mlp": 1.07052732, + "epoch": 0.33012697191227397, + "flos": 1402857432576.0, + "grad_norm": 0.03503617860008252, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80760461, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.17480469, + "step": 1716, + "time_per_iteration": 5.048320293426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091106, + "balance_loss_mlp": 1.06201911, + "epoch": 0.3303193535975375, + "flos": 504897781248.0, + "grad_norm": 0.06576145610663801, + "language_loss": 0.76379126, + "learning_rate": 0.0007815619607794288, + "loss": 0.77470231, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.29052734, + "step": 1717, + "time_per_iteration": 2.6151187419891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094733, + "balance_loss_mlp": 1.06440604, + "epoch": 0.3305117352828011, + "flos": 937602390528.0, + "grad_norm": 0.08930544150493325, + "language_loss": 0.82491159, + "learning_rate": 0.0007813044561538001, + "loss": 0.835859, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.30273438, + "step": 1718, + "time_per_iteration": 3.1329195499420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089209, + "balance_loss_mlp": 1.05928707, + "epoch": 0.3307041169680646, + "flos": 721176597504.0, + "grad_norm": 0.06440748712139703, + "language_loss": 0.88832355, + "learning_rate": 0.0007810468423160958, + "loss": 0.8992157, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.29882812, + "step": 1719, + "time_per_iteration": 2.8785343170166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091515, + "balance_loss_mlp": 1.06195092, + "epoch": 0.3308964986533282, + "flos": 583315499520.0, + "grad_norm": 0.05842798757545397, + "language_loss": 0.81825691, + "learning_rate": 0.0007807891193663306, + "loss": 0.82917207, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.29492188, + "step": 1720, + "time_per_iteration": 2.775949478149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108911, + "balance_loss_mlp": 1.05956948, + "epoch": 0.33108888033859174, + "flos": 473340672000.0, + "grad_norm": 0.1056737351826848, + "language_loss": 0.82154363, + "learning_rate": 0.0007805312874045614, + "loss": 0.83243477, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.29516602, + "step": 1721, + "time_per_iteration": 2.528573513031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089843, + "balance_loss_mlp": 1.06054103, + "epoch": 0.3312812620238553, + "flos": 385913659392.0, + "grad_norm": 0.06879892565652022, + "language_loss": 0.86894739, + "learning_rate": 0.0007802733465308874, + "loss": 0.87984586, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.29272461, + "step": 1722, + "time_per_iteration": 2.4575133323669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087083, + "balance_loss_mlp": 1.05811512, + "epoch": 0.3314736437091189, + "flos": 494292395520.0, + "grad_norm": 0.06801648197756033, + "language_loss": 0.84311831, + "learning_rate": 0.0007800152968454501, + "loss": 0.85398912, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.28930664, + "step": 1723, + "time_per_iteration": 2.729114294052124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091782, + "balance_loss_mlp": 1.06300533, + "epoch": 0.33166602539438245, + "flos": 653346736128.0, + "grad_norm": 0.049597969001903774, + "language_loss": 0.90648681, + "learning_rate": 0.0007797571384484334, + "loss": 0.91740465, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.28759766, + "step": 1724, + "time_per_iteration": 2.8813512325286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084172, + "balance_loss_mlp": 1.05463219, + "epoch": 0.33185840707964603, + "flos": 520550489088.0, + "grad_norm": 0.060917196813517045, + "language_loss": 0.91917408, + "learning_rate": 0.0007794988714400633, + "loss": 0.9300158, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.29516602, + "step": 1725, + "time_per_iteration": 2.6094837188720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088265, + "balance_loss_mlp": 1.05896294, + "epoch": 0.33205078876490957, + "flos": 436712919552.0, + "grad_norm": 0.06883363868640566, + "language_loss": 0.85331756, + "learning_rate": 0.0007792404959206079, + "loss": 0.86420023, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.29272461, + "step": 1726, + "time_per_iteration": 2.4982993602752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083027, + "balance_loss_mlp": 1.05396366, + "epoch": 0.33224317045017315, + "flos": 768400971264.0, + "grad_norm": 0.0595205364190525, + "language_loss": 0.81498575, + "learning_rate": 0.0007789820119903774, + "loss": 0.82581604, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.29052734, + "step": 1727, + "time_per_iteration": 2.9797775745391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059791, + "balance_loss_mlp": 1.04043114, + "epoch": 0.3324355521354367, + "flos": 1465656090624.0, + "grad_norm": 0.028746370774938412, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79552454, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.19335938, + "step": 1728, + "time_per_iteration": 4.892562627792358 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090227, + "balance_loss_mlp": 1.05982828, + "epoch": 0.3326279338207003, + "flos": 496415195136.0, + "grad_norm": 0.10868743625457102, + "language_loss": 0.83712173, + "learning_rate": 0.0007784647192990428, + "loss": 0.84802401, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.3034668, + "step": 1729, + "time_per_iteration": 2.721163749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093021, + "balance_loss_mlp": 1.06283677, + "epoch": 0.33282031550596386, + "flos": 635600342016.0, + "grad_norm": 0.06834187729314575, + "language_loss": 0.80591226, + "learning_rate": 0.0007782059107387696, + "loss": 0.81684244, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.30151367, + "step": 1730, + "time_per_iteration": 2.8358583450317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097893, + "balance_loss_mlp": 1.06768548, + "epoch": 0.3330126971912274, + "flos": 689210643456.0, + "grad_norm": 0.06518025115488765, + "language_loss": 0.88646144, + "learning_rate": 0.0007779469941693826, + "loss": 0.89744031, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.30175781, + "step": 1731, + "time_per_iteration": 2.8069489002227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105874, + "balance_loss_mlp": 1.0744741, + "epoch": 0.333205078876491, + "flos": 566184563712.0, + "grad_norm": 0.0738487456517703, + "language_loss": 0.76712036, + "learning_rate": 0.0007776879696914029, + "loss": 0.77817911, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.3137207, + "step": 1732, + "time_per_iteration": 2.8068690299987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116621, + "balance_loss_mlp": 1.08479202, + "epoch": 0.3333974605617545, + "flos": 640618550784.0, + "grad_norm": 0.06155067702851775, + "language_loss": 0.88390094, + "learning_rate": 0.000777428837405392, + "loss": 0.89506716, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.31811523, + "step": 1733, + "time_per_iteration": 2.8412673473358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107208, + "balance_loss_mlp": 1.07530773, + "epoch": 0.3335898422470181, + "flos": 461597501952.0, + "grad_norm": 0.0682339524169846, + "language_loss": 0.86804128, + "learning_rate": 0.0007771695974119544, + "loss": 0.87911332, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.31884766, + "step": 1734, + "time_per_iteration": 2.512354612350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103901, + "balance_loss_mlp": 1.07159579, + "epoch": 0.33378222393228163, + "flos": 852504791040.0, + "grad_norm": 0.0845052703087739, + "language_loss": 0.75201118, + "learning_rate": 0.0007769102498117359, + "loss": 0.7630502, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.32299805, + "step": 1735, + "time_per_iteration": 3.107100248336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090989, + "balance_loss_mlp": 1.05777764, + "epoch": 0.3339746056175452, + "flos": 954256080384.0, + "grad_norm": 0.061332510780765306, + "language_loss": 0.79977, + "learning_rate": 0.000776650794705424, + "loss": 0.81067985, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.33227539, + "step": 1736, + "time_per_iteration": 3.259875535964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092848, + "balance_loss_mlp": 1.06116199, + "epoch": 0.33416698730280875, + "flos": 544559155200.0, + "grad_norm": 0.05236613872795896, + "language_loss": 0.82229674, + "learning_rate": 0.0007763912321937483, + "loss": 0.83322519, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.31665039, + "step": 1737, + "time_per_iteration": 2.704059600830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088373, + "balance_loss_mlp": 1.05506587, + "epoch": 0.33435936898807234, + "flos": 1013652817920.0, + "grad_norm": 0.07890071498287932, + "language_loss": 0.82297349, + "learning_rate": 0.0007761315623774799, + "loss": 0.83385718, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.33325195, + "step": 1738, + "time_per_iteration": 3.399148464202881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089424, + "balance_loss_mlp": 1.0574522, + "epoch": 0.3345517506733359, + "flos": 614935217664.0, + "grad_norm": 0.09967891290955513, + "language_loss": 0.87632757, + "learning_rate": 0.0007758717853574313, + "loss": 0.88722181, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.31958008, + "step": 1739, + "time_per_iteration": 2.772089958190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103829, + "balance_loss_mlp": 1.0729773, + "epoch": 0.33474413235859946, + "flos": 494350622208.0, + "grad_norm": 0.06672668023604937, + "language_loss": 0.90074134, + "learning_rate": 0.0007756119012344571, + "loss": 0.91177964, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.30810547, + "step": 1740, + "time_per_iteration": 2.5482232570648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108279, + "balance_loss_mlp": 1.07707, + "epoch": 0.33493651404386304, + "flos": 628105743360.0, + "grad_norm": 0.07840140242610649, + "language_loss": 0.84438574, + "learning_rate": 0.0007753519101094535, + "loss": 0.85546857, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.31176758, + "step": 1741, + "time_per_iteration": 2.749004602432251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102131, + "balance_loss_mlp": 1.07173228, + "epoch": 0.3351288957291266, + "flos": 513474909696.0, + "grad_norm": 0.07002932741488781, + "language_loss": 0.86241812, + "learning_rate": 0.0007750918120833575, + "loss": 0.87343943, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.3034668, + "step": 1742, + "time_per_iteration": 2.600731611251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110577, + "balance_loss_mlp": 1.0753479, + "epoch": 0.33532127741439016, + "flos": 647008860672.0, + "grad_norm": 0.07258867640739639, + "language_loss": 0.87368989, + "learning_rate": 0.0007748316072571485, + "loss": 0.88474762, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.30395508, + "step": 1743, + "time_per_iteration": 2.7698371410369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109732, + "balance_loss_mlp": 1.07902408, + "epoch": 0.3355136590996537, + "flos": 768134721024.0, + "grad_norm": 0.05763877458348602, + "language_loss": 0.79041934, + "learning_rate": 0.0007745712957318467, + "loss": 0.80151671, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.30664062, + "step": 1744, + "time_per_iteration": 2.967310667037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104284, + "balance_loss_mlp": 1.07412386, + "epoch": 0.3357060407849173, + "flos": 595259490816.0, + "grad_norm": 0.052786515694630796, + "language_loss": 0.86410165, + "learning_rate": 0.0007743108776085141, + "loss": 0.87514448, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.30102539, + "step": 1745, + "time_per_iteration": 2.771803855895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102511, + "balance_loss_mlp": 1.07049131, + "epoch": 0.3358984224701808, + "flos": 598288730112.0, + "grad_norm": 0.06089020802257528, + "language_loss": 0.82798052, + "learning_rate": 0.0007740503529882543, + "loss": 0.83900565, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.32006836, + "step": 1746, + "time_per_iteration": 2.805392026901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095402, + "balance_loss_mlp": 1.064551, + "epoch": 0.3360908041554444, + "flos": 578055771648.0, + "grad_norm": 0.0569869068698716, + "language_loss": 0.90718448, + "learning_rate": 0.0007737897219722114, + "loss": 0.9181385, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.30810547, + "step": 1747, + "time_per_iteration": 2.699065923690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090863, + "balance_loss_mlp": 1.05970204, + "epoch": 0.336283185840708, + "flos": 513332315136.0, + "grad_norm": 0.07943976371979472, + "language_loss": 0.80688596, + "learning_rate": 0.0007735289846615716, + "loss": 0.81779456, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.31152344, + "step": 1748, + "time_per_iteration": 2.6637260913848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094297, + "balance_loss_mlp": 1.06356478, + "epoch": 0.3364755675259715, + "flos": 524716102656.0, + "grad_norm": 0.06884386609789231, + "language_loss": 0.81979561, + "learning_rate": 0.0007732681411575621, + "loss": 0.83073854, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.30712891, + "step": 1749, + "time_per_iteration": 2.673060417175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087083, + "balance_loss_mlp": 1.0555166, + "epoch": 0.3366679492112351, + "flos": 554594162688.0, + "grad_norm": 0.052237930998467595, + "language_loss": 0.87234819, + "learning_rate": 0.0007730071915614514, + "loss": 0.88321906, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.31542969, + "step": 1750, + "time_per_iteration": 2.707857370376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089836, + "balance_loss_mlp": 1.05896115, + "epoch": 0.33686033089649864, + "flos": 427051851264.0, + "grad_norm": 0.08336153438972979, + "language_loss": 0.88963622, + "learning_rate": 0.0007727461359745489, + "loss": 0.90053463, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.30859375, + "step": 1751, + "time_per_iteration": 2.482837438583374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093668, + "balance_loss_mlp": 1.06307864, + "epoch": 0.3370527125817622, + "flos": 541452750336.0, + "grad_norm": 0.05330176149069141, + "language_loss": 0.86016554, + "learning_rate": 0.0007724849744982056, + "loss": 0.87110221, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.30541992, + "step": 1752, + "time_per_iteration": 2.690420389175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097033, + "balance_loss_mlp": 1.06668198, + "epoch": 0.33724509426702576, + "flos": 541836864000.0, + "grad_norm": 0.0643678921459399, + "language_loss": 0.81981385, + "learning_rate": 0.0007722237072338131, + "loss": 0.8307842, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.30322266, + "step": 1753, + "time_per_iteration": 2.7154347896575928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097395, + "balance_loss_mlp": 1.06694901, + "epoch": 0.33743747595228935, + "flos": 472557888000.0, + "grad_norm": 0.07107791288081117, + "language_loss": 0.85213387, + "learning_rate": 0.0007719623342828046, + "loss": 0.8631078, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.30419922, + "step": 1754, + "time_per_iteration": 2.5009355545043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109586, + "balance_loss_mlp": 1.06426978, + "epoch": 0.33762985763755293, + "flos": 469564964352.0, + "grad_norm": 0.06326183968549627, + "language_loss": 0.84134084, + "learning_rate": 0.000771700855746654, + "loss": 0.85229945, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.31567383, + "step": 1755, + "time_per_iteration": 2.63323712348938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082281, + "balance_loss_mlp": 1.05071473, + "epoch": 0.33782223932281646, + "flos": 492002270208.0, + "grad_norm": 0.06130822269954804, + "language_loss": 0.88395244, + "learning_rate": 0.0007714392717268763, + "loss": 0.89477527, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.31542969, + "step": 1756, + "time_per_iteration": 2.6147336959838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083496, + "balance_loss_mlp": 1.05219221, + "epoch": 0.33801462100808005, + "flos": 464827562496.0, + "grad_norm": 0.05731341996908033, + "language_loss": 0.86388242, + "learning_rate": 0.0007711775823250273, + "loss": 0.87471741, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.31298828, + "step": 1757, + "time_per_iteration": 2.5304934978485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085861, + "balance_loss_mlp": 1.05455685, + "epoch": 0.3382070026933436, + "flos": 795319603200.0, + "grad_norm": 0.061357664780502266, + "language_loss": 0.83481395, + "learning_rate": 0.0007709157876427039, + "loss": 0.84567261, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.31274414, + "step": 1758, + "time_per_iteration": 3.1116981506347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074204, + "balance_loss_mlp": 1.04189849, + "epoch": 0.33839938437860717, + "flos": 508181686272.0, + "grad_norm": 0.0592835704233285, + "language_loss": 0.85574573, + "learning_rate": 0.0007706538877815439, + "loss": 0.86648774, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.32299805, + "step": 1759, + "time_per_iteration": 2.635298728942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077747, + "balance_loss_mlp": 1.04730105, + "epoch": 0.3385917660638707, + "flos": 483986755584.0, + "grad_norm": 0.04672826561746397, + "language_loss": 0.83449262, + "learning_rate": 0.0007703918828432259, + "loss": 0.84527004, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.30419922, + "step": 1760, + "time_per_iteration": 2.664783477783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071091, + "balance_loss_mlp": 1.04023945, + "epoch": 0.3387841477491343, + "flos": 545071306752.0, + "grad_norm": 0.061026274734732225, + "language_loss": 0.88914752, + "learning_rate": 0.000770129772929469, + "loss": 0.89985847, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.30810547, + "step": 1761, + "time_per_iteration": 2.7082738876342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070304, + "balance_loss_mlp": 1.03914273, + "epoch": 0.3389765294343978, + "flos": 719487373824.0, + "grad_norm": 0.058866792995701266, + "language_loss": 0.88234216, + "learning_rate": 0.0007698675581420334, + "loss": 0.89304519, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.3112793, + "step": 1762, + "time_per_iteration": 2.9119746685028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107099, + "balance_loss_mlp": 1.03966177, + "epoch": 0.3391689111196614, + "flos": 699596269056.0, + "grad_norm": 0.06738514708484569, + "language_loss": 0.78819811, + "learning_rate": 0.0007696052385827199, + "loss": 0.79890805, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.31298828, + "step": 1763, + "time_per_iteration": 2.9451980590820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107403, + "balance_loss_mlp": 1.04172421, + "epoch": 0.339361292804925, + "flos": 626806425600.0, + "grad_norm": 0.0719800357998311, + "language_loss": 0.78192145, + "learning_rate": 0.00076934281435337, + "loss": 0.79266179, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.32299805, + "step": 1764, + "time_per_iteration": 2.8267600536346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071904, + "balance_loss_mlp": 1.03931201, + "epoch": 0.33955367449018853, + "flos": 609302960640.0, + "grad_norm": 0.06414673033674093, + "language_loss": 0.85701221, + "learning_rate": 0.0007690802855558658, + "loss": 0.86773127, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.32592773, + "step": 1765, + "time_per_iteration": 2.8825321197509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060485, + "balance_loss_mlp": 1.04322386, + "epoch": 0.3397460561754521, + "flos": 1452494177280.0, + "grad_norm": 0.027152559638010845, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.7743544, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.17285156, + "step": 1766, + "time_per_iteration": 4.890359401702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078911, + "balance_loss_mlp": 1.04684353, + "epoch": 0.33993843786071565, + "flos": 487068429312.0, + "grad_norm": 0.06170687350837257, + "language_loss": 0.89089799, + "learning_rate": 0.0007685549146641262, + "loss": 0.90168703, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.32055664, + "step": 1767, + "time_per_iteration": 2.539238691329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077091, + "balance_loss_mlp": 1.04557216, + "epoch": 0.34013081954597923, + "flos": 417115768320.0, + "grad_norm": 0.05571629344022593, + "language_loss": 0.8822673, + "learning_rate": 0.0007682920727738579, + "loss": 0.89303821, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.31494141, + "step": 1768, + "time_per_iteration": 2.512801170349121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080619, + "balance_loss_mlp": 1.04931498, + "epoch": 0.34032320123124277, + "flos": 437293472256.0, + "grad_norm": 0.06175400371418068, + "language_loss": 0.8474735, + "learning_rate": 0.000768029126723369, + "loss": 0.85827971, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.31274414, + "step": 1769, + "time_per_iteration": 2.5238869190216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075433, + "balance_loss_mlp": 1.04515338, + "epoch": 0.34051558291650635, + "flos": 457353312768.0, + "grad_norm": 0.06596681609056877, + "language_loss": 0.81544566, + "learning_rate": 0.0007677660766147447, + "loss": 0.82620001, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.30224609, + "step": 1770, + "time_per_iteration": 2.5212690830230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037514, + "balance_loss_mlp": 1.02063394, + "epoch": 0.3407079646017699, + "flos": 1558029938688.0, + "grad_norm": 0.014856007486746849, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73508459, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.16894531, + "step": 1771, + "time_per_iteration": 4.967731475830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081508, + "balance_loss_mlp": 1.05113387, + "epoch": 0.3409003462870335, + "flos": 492312190464.0, + "grad_norm": 0.075322249241395, + "language_loss": 0.79792535, + "learning_rate": 0.0007672396646316306, + "loss": 0.8087405, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.30322266, + "step": 1772, + "time_per_iteration": 2.524365186691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084918, + "balance_loss_mlp": 1.05451918, + "epoch": 0.34109272797229706, + "flos": 808145303040.0, + "grad_norm": 0.05910937608565349, + "language_loss": 0.80291271, + "learning_rate": 0.000766976302961512, + "loss": 0.81376183, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.30371094, + "step": 1773, + "time_per_iteration": 3.002929925918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086798, + "balance_loss_mlp": 1.0563519, + "epoch": 0.3412851096575606, + "flos": 469903997952.0, + "grad_norm": 0.0625889066862488, + "language_loss": 0.81081951, + "learning_rate": 0.0007667128376420003, + "loss": 0.82168746, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.30395508, + "step": 1774, + "time_per_iteration": 2.5821964740753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083599, + "balance_loss_mlp": 1.05336761, + "epoch": 0.3414774913428242, + "flos": 595402085376.0, + "grad_norm": 0.06267075227744807, + "language_loss": 0.84329379, + "learning_rate": 0.0007664492687753817, + "loss": 0.85412979, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.30175781, + "step": 1775, + "time_per_iteration": 2.7457377910614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077211, + "balance_loss_mlp": 1.04769528, + "epoch": 0.3416698730280877, + "flos": 527202667008.0, + "grad_norm": 0.054581176728495925, + "language_loss": 0.81518859, + "learning_rate": 0.000766185596463983, + "loss": 0.8259607, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.29516602, + "step": 1776, + "time_per_iteration": 2.655543804168701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078997, + "balance_loss_mlp": 1.04993343, + "epoch": 0.3418622547133513, + "flos": 874272794112.0, + "grad_norm": 0.06969464274274284, + "language_loss": 0.76725864, + "learning_rate": 0.0007659218208101706, + "loss": 0.77804863, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.29003906, + "step": 1777, + "time_per_iteration": 3.1378567218780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093158, + "balance_loss_mlp": 1.06411862, + "epoch": 0.34205463639861483, + "flos": 603462680064.0, + "grad_norm": 0.0529989301900612, + "language_loss": 0.84699291, + "learning_rate": 0.0007656579419163515, + "loss": 0.85792446, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.29052734, + "step": 1778, + "time_per_iteration": 2.8120994567871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091459, + "balance_loss_mlp": 1.06239629, + "epoch": 0.3422470180838784, + "flos": 463547183616.0, + "grad_norm": 0.06282493199141514, + "language_loss": 0.76994503, + "learning_rate": 0.0007653939598849724, + "loss": 0.78085959, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.2902832, + "step": 1779, + "time_per_iteration": 2.5995492935180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087203, + "balance_loss_mlp": 1.07051396, + "epoch": 0.34243939976914195, + "flos": 1585584377856.0, + "grad_norm": 0.04507156484415478, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83967406, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.16699219, + "step": 1780, + "time_per_iteration": 4.9175097942352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102047, + "balance_loss_mlp": 1.07186341, + "epoch": 0.34263178145440554, + "flos": 872662146048.0, + "grad_norm": 0.05745476314946865, + "language_loss": 0.79740059, + "learning_rate": 0.000764865686819522, + "loss": 0.80842102, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.30151367, + "step": 1781, + "time_per_iteration": 3.1022064685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097852, + "balance_loss_mlp": 1.06907511, + "epoch": 0.3428241631396691, + "flos": 506630674944.0, + "grad_norm": 0.061017866945560745, + "language_loss": 0.85627258, + "learning_rate": 0.0007646013959905449, + "loss": 0.8672511, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.28759766, + "step": 1782, + "time_per_iteration": 2.625312566757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090603, + "balance_loss_mlp": 1.06030035, + "epoch": 0.34301654482493266, + "flos": 879669324288.0, + "grad_norm": 0.05493462983431466, + "language_loss": 0.80768538, + "learning_rate": 0.0007643370024341949, + "loss": 0.81859136, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.30249023, + "step": 1783, + "time_per_iteration": 3.1206953525543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092054, + "balance_loss_mlp": 1.06284761, + "epoch": 0.34320892651019624, + "flos": 431537559552.0, + "grad_norm": 0.04934338548004703, + "language_loss": 0.8289808, + "learning_rate": 0.0007640725062531195, + "loss": 0.83990133, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.29174805, + "step": 1784, + "time_per_iteration": 2.518277645111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092006, + "balance_loss_mlp": 1.06165504, + "epoch": 0.3434013081954598, + "flos": 463404589056.0, + "grad_norm": 0.061838155255473454, + "language_loss": 0.8616311, + "learning_rate": 0.0007638079075500047, + "loss": 0.8725512, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.30297852, + "step": 1785, + "time_per_iteration": 2.566340684890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056366, + "balance_loss_mlp": 1.04101145, + "epoch": 0.34359368988072336, + "flos": 1556499276288.0, + "grad_norm": 0.03141321768780463, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76237035, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.15332031, + "step": 1786, + "time_per_iteration": 4.984891891479492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081906, + "balance_loss_mlp": 1.05088782, + "epoch": 0.3437860715659869, + "flos": 495267236352.0, + "grad_norm": 0.0502662811310507, + "language_loss": 0.83153242, + "learning_rate": 0.0007632784029886026, + "loss": 0.84235144, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.30981445, + "step": 1787, + "time_per_iteration": 2.6574935913085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078963, + "balance_loss_mlp": 1.04832625, + "epoch": 0.3439784532512505, + "flos": 717942154752.0, + "grad_norm": 0.058652751735253, + "language_loss": 0.85391539, + "learning_rate": 0.0007630134973358873, + "loss": 0.86470503, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.3059082, + "step": 1788, + "time_per_iteration": 2.920311450958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088088, + "balance_loss_mlp": 1.05702209, + "epoch": 0.34417083493651407, + "flos": 565598218752.0, + "grad_norm": 0.05633660644162356, + "language_loss": 0.86888337, + "learning_rate": 0.0007627484895722763, + "loss": 0.87976426, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.31030273, + "step": 1789, + "time_per_iteration": 2.648061513900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082316, + "balance_loss_mlp": 1.05268025, + "epoch": 0.3443632166217776, + "flos": 795988905984.0, + "grad_norm": 0.08125120447961011, + "language_loss": 0.79987907, + "learning_rate": 0.0007624833798006552, + "loss": 0.8107022, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.29614258, + "step": 1790, + "time_per_iteration": 3.083303689956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082489, + "balance_loss_mlp": 1.05249596, + "epoch": 0.3445555983070412, + "flos": 569045067264.0, + "grad_norm": 0.06337905919609309, + "language_loss": 0.83924425, + "learning_rate": 0.0007622181681239483, + "loss": 0.85006905, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.29931641, + "step": 1791, + "time_per_iteration": 2.6613235473632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078031, + "balance_loss_mlp": 1.04677427, + "epoch": 0.3447479799923047, + "flos": 568524151296.0, + "grad_norm": 0.05139164694864183, + "language_loss": 0.84563744, + "learning_rate": 0.0007619528546451202, + "loss": 0.85641772, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.31225586, + "step": 1792, + "time_per_iteration": 2.7847092151641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082474, + "balance_loss_mlp": 1.05183685, + "epoch": 0.3449403616775683, + "flos": 967323299328.0, + "grad_norm": 0.060391852587241154, + "language_loss": 0.8357141, + "learning_rate": 0.0007616874394671745, + "loss": 0.84653878, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.3059082, + "step": 1793, + "time_per_iteration": 3.3427343368530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087726, + "balance_loss_mlp": 1.05632687, + "epoch": 0.34513274336283184, + "flos": 568340858880.0, + "grad_norm": 0.07229882199780847, + "language_loss": 0.85033429, + "learning_rate": 0.0007614219226931547, + "loss": 0.86121154, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.3137207, + "step": 1794, + "time_per_iteration": 2.6797611713409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090025, + "balance_loss_mlp": 1.05931664, + "epoch": 0.3453251250480954, + "flos": 460715793408.0, + "grad_norm": 0.057715322830613675, + "language_loss": 0.84206641, + "learning_rate": 0.0007611563044261435, + "loss": 0.85296667, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.30664062, + "step": 1795, + "time_per_iteration": 2.5259828567504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086711, + "balance_loss_mlp": 1.05543017, + "epoch": 0.34551750673335896, + "flos": 415397431296.0, + "grad_norm": 0.06328741897936851, + "language_loss": 0.86560625, + "learning_rate": 0.0007608905847692631, + "loss": 0.87647337, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.3125, + "step": 1796, + "time_per_iteration": 2.472182035446167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081946, + "balance_loss_mlp": 1.05014098, + "epoch": 0.34570988841862255, + "flos": 587540749824.0, + "grad_norm": 0.053847624873276365, + "language_loss": 0.86582637, + "learning_rate": 0.0007606247638256749, + "loss": 0.8766458, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.31787109, + "step": 1797, + "time_per_iteration": 2.842547655105591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147955, + "balance_loss_mlp": 1.13145602, + "epoch": 0.34590227010388613, + "flos": 1566835439616.0, + "grad_norm": 0.06482996241123744, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79318249, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.16503906, + "step": 1798, + "time_per_iteration": 4.918993949890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075567, + "balance_loss_mlp": 1.06011796, + "epoch": 0.34609465178914967, + "flos": 1536950177280.0, + "grad_norm": 0.04230684388330953, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80402768, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.15429688, + "step": 1799, + "time_per_iteration": 4.791706323623657 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077617, + "balance_loss_mlp": 1.04724216, + "epoch": 0.34628703347441325, + "flos": 609075998208.0, + "grad_norm": 0.06124115711212235, + "language_loss": 0.85762143, + "learning_rate": 0.0007598266943068686, + "loss": 0.86839759, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.30322266, + "step": 1800, + "time_per_iteration": 2.743213415145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083873, + "balance_loss_mlp": 1.05266404, + "epoch": 0.3464794151596768, + "flos": 473084596224.0, + "grad_norm": 0.13184352245004016, + "language_loss": 0.83900499, + "learning_rate": 0.0007595604692488507, + "loss": 0.84984374, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.31176758, + "step": 1801, + "time_per_iteration": 2.542065382003784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082571, + "balance_loss_mlp": 1.05105186, + "epoch": 0.34667179684494037, + "flos": 605397805056.0, + "grad_norm": 0.0617697315453188, + "language_loss": 0.82875979, + "learning_rate": 0.0007592941434205215, + "loss": 0.83958554, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.31494141, + "step": 1802, + "time_per_iteration": 2.803941488265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077417, + "balance_loss_mlp": 1.06292093, + "epoch": 0.3468641785302039, + "flos": 1564053511680.0, + "grad_norm": 0.03209988868756776, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74648476, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.14453125, + "step": 1803, + "time_per_iteration": 5.115894794464111 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073735, + "balance_loss_mlp": 1.04176331, + "epoch": 0.3470565602154675, + "flos": 906902258688.0, + "grad_norm": 0.057797440709038125, + "language_loss": 0.7980904, + "learning_rate": 0.0007587611898665566, + "loss": 0.80882776, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.31958008, + "step": 1804, + "time_per_iteration": 3.0783464908599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080437, + "balance_loss_mlp": 1.04958522, + "epoch": 0.347248941900731, + "flos": 638613614592.0, + "grad_norm": 0.052922401600576395, + "language_loss": 0.8228178, + "learning_rate": 0.0007584945623478315, + "loss": 0.83362216, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.30810547, + "step": 1805, + "time_per_iteration": 2.8341996669769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107388, + "balance_loss_mlp": 1.04178858, + "epoch": 0.3474413235859946, + "flos": 847009336320.0, + "grad_norm": 0.05986711270473425, + "language_loss": 0.81165981, + "learning_rate": 0.000758227834472617, + "loss": 0.82239866, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.32080078, + "step": 1806, + "time_per_iteration": 3.0486085414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082567, + "balance_loss_mlp": 1.04971278, + "epoch": 0.3476337052712582, + "flos": 515395478016.0, + "grad_norm": 0.06433807190471491, + "language_loss": 0.77163357, + "learning_rate": 0.0007579610063444664, + "loss": 0.78245926, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.32861328, + "step": 1807, + "time_per_iteration": 2.7597365379333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073013, + "balance_loss_mlp": 1.04068375, + "epoch": 0.34782608695652173, + "flos": 913161558528.0, + "grad_norm": 0.06573509148212295, + "language_loss": 0.8740322, + "learning_rate": 0.0007576940780669712, + "loss": 0.88476229, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.32324219, + "step": 1808, + "time_per_iteration": 3.2193737030029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073839, + "balance_loss_mlp": 1.04060304, + "epoch": 0.3480184686417853, + "flos": 773374099968.0, + "grad_norm": 0.07068655640298144, + "language_loss": 0.84018815, + "learning_rate": 0.0007574270497437624, + "loss": 0.85092652, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.33251953, + "step": 1809, + "time_per_iteration": 2.958071708679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074122, + "balance_loss_mlp": 1.04255509, + "epoch": 0.34821085032704885, + "flos": 576549840384.0, + "grad_norm": 0.05267537563651592, + "language_loss": 0.88190216, + "learning_rate": 0.000757159921478509, + "loss": 0.89264333, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.31542969, + "step": 1810, + "time_per_iteration": 2.743820905685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011251, + "balance_loss_mlp": 1.10993648, + "epoch": 0.34840323201231244, + "flos": 1524176911872.0, + "grad_norm": 0.032772528197798495, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75575733, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.15136719, + "step": 1811, + "time_per_iteration": 4.734825372695923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077463, + "balance_loss_mlp": 1.04713607, + "epoch": 0.34859561369757597, + "flos": 508910625792.0, + "grad_norm": 0.06138203683055377, + "language_loss": 0.87334222, + "learning_rate": 0.0007566253655367423, + "loss": 0.88411689, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.30273438, + "step": 1812, + "time_per_iteration": 2.5963358879089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081319, + "balance_loss_mlp": 1.04946637, + "epoch": 0.34878799538283956, + "flos": 548390117376.0, + "grad_norm": 0.05073723218815133, + "language_loss": 0.89626348, + "learning_rate": 0.000756357938067762, + "loss": 0.90707672, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.31835938, + "step": 1813, + "time_per_iteration": 2.6791560649871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088512, + "balance_loss_mlp": 1.05615854, + "epoch": 0.34898037706810314, + "flos": 983251021824.0, + "grad_norm": 0.07107132576327291, + "language_loss": 0.82739902, + "learning_rate": 0.0007560904110718033, + "loss": 0.83828408, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.32324219, + "step": 1814, + "time_per_iteration": 3.251187801361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084533, + "balance_loss_mlp": 1.05244136, + "epoch": 0.3491727587533667, + "flos": 681298435584.0, + "grad_norm": 0.056660731031110724, + "language_loss": 0.83390886, + "learning_rate": 0.0007558227846527297, + "loss": 0.84475422, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.32080078, + "step": 1815, + "time_per_iteration": 2.852786064147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086181, + "balance_loss_mlp": 1.05358887, + "epoch": 0.34936514043863026, + "flos": 393811310592.0, + "grad_norm": 0.06752757018776132, + "language_loss": 0.83192128, + "learning_rate": 0.0007555550589144429, + "loss": 0.84278309, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.32592773, + "step": 1816, + "time_per_iteration": 2.4226694107055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108673, + "balance_loss_mlp": 1.05568814, + "epoch": 0.3495575221238938, + "flos": 461120256000.0, + "grad_norm": 0.05637535729014081, + "language_loss": 0.84440207, + "learning_rate": 0.000755287233960883, + "loss": 0.85526937, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.31005859, + "step": 1817, + "time_per_iteration": 2.556528329849243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081988, + "balance_loss_mlp": 1.04963493, + "epoch": 0.3497499038091574, + "flos": 723859600896.0, + "grad_norm": 0.06861190177202381, + "language_loss": 0.77555025, + "learning_rate": 0.0007550193098960292, + "loss": 0.7863701, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.32348633, + "step": 1818, + "time_per_iteration": 2.9168636798858643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081065, + "balance_loss_mlp": 1.04902124, + "epoch": 0.3499422854944209, + "flos": 827364132864.0, + "grad_norm": 0.04890635253674866, + "language_loss": 0.85897982, + "learning_rate": 0.0007547512868238988, + "loss": 0.86979043, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.3203125, + "step": 1819, + "time_per_iteration": 3.147949695587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086999, + "balance_loss_mlp": 1.05583739, + "epoch": 0.3501346671796845, + "flos": 493214247936.0, + "grad_norm": 0.07359678742691168, + "language_loss": 0.83527619, + "learning_rate": 0.0007544831648485473, + "loss": 0.84614623, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.3112793, + "step": 1820, + "time_per_iteration": 2.683906078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084002, + "balance_loss_mlp": 1.05272126, + "epoch": 0.35032704886494803, + "flos": 578479173120.0, + "grad_norm": 0.07119738396785501, + "language_loss": 0.81087327, + "learning_rate": 0.0007542149440740694, + "loss": 0.82171333, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.3125, + "step": 1821, + "time_per_iteration": 2.738029718399048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107983, + "balance_loss_mlp": 1.04850197, + "epoch": 0.3505194305502116, + "flos": 584383472640.0, + "grad_norm": 0.07229829340096756, + "language_loss": 0.8569001, + "learning_rate": 0.000753946624604597, + "loss": 0.86769843, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.31298828, + "step": 1822, + "time_per_iteration": 2.7263731956481934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079169, + "balance_loss_mlp": 1.04795969, + "epoch": 0.3507118122354752, + "flos": 526705072128.0, + "grad_norm": 0.05660966900473529, + "language_loss": 0.87968546, + "learning_rate": 0.0007536782065443015, + "loss": 0.89047718, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.31176758, + "step": 1823, + "time_per_iteration": 2.64158034324646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108426, + "balance_loss_mlp": 1.05386138, + "epoch": 0.35090419392073874, + "flos": 511269152256.0, + "grad_norm": 0.06227259781784348, + "language_loss": 0.74483079, + "learning_rate": 0.0007534096899973919, + "loss": 0.75567335, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.3034668, + "step": 1824, + "time_per_iteration": 2.609548807144165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079276, + "balance_loss_mlp": 1.04804349, + "epoch": 0.3510965756060023, + "flos": 563728522752.0, + "grad_norm": 0.05520550621954613, + "language_loss": 0.82636261, + "learning_rate": 0.0007531410750681154, + "loss": 0.83715534, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.31201172, + "step": 1825, + "time_per_iteration": 2.7306325435638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094474, + "balance_loss_mlp": 1.06352782, + "epoch": 0.35128895729126586, + "flos": 1020107146752.0, + "grad_norm": 0.04890512262044313, + "language_loss": 0.86351258, + "learning_rate": 0.0007528723618607575, + "loss": 0.8744573, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.30908203, + "step": 1826, + "time_per_iteration": 3.4343338012695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088582, + "balance_loss_mlp": 1.05782557, + "epoch": 0.35148133897652944, + "flos": 587972915712.0, + "grad_norm": 0.05382597898667073, + "language_loss": 0.82364488, + "learning_rate": 0.0007526035504796422, + "loss": 0.83453071, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.30737305, + "step": 1827, + "time_per_iteration": 2.7783889770507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088781, + "balance_loss_mlp": 1.05721426, + "epoch": 0.351673720661793, + "flos": 495054830592.0, + "grad_norm": 0.07196751046410012, + "language_loss": 0.86701363, + "learning_rate": 0.0007523346410291312, + "loss": 0.87790149, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.31542969, + "step": 1828, + "time_per_iteration": 2.7925262451171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096578, + "balance_loss_mlp": 1.06434393, + "epoch": 0.35186610234705656, + "flos": 762339520512.0, + "grad_norm": 0.05953464089235074, + "language_loss": 0.84491026, + "learning_rate": 0.0007520656336136245, + "loss": 0.85587609, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.32226562, + "step": 1829, + "time_per_iteration": 2.9498770236968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095972, + "balance_loss_mlp": 1.0648104, + "epoch": 0.3520584840323201, + "flos": 625822820352.0, + "grad_norm": 0.05500553487662277, + "language_loss": 0.87983966, + "learning_rate": 0.0007517965283375599, + "loss": 0.89079928, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.3112793, + "step": 1830, + "time_per_iteration": 2.838120698928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097926, + "balance_loss_mlp": 1.06566763, + "epoch": 0.3522508657175837, + "flos": 537124193280.0, + "grad_norm": 0.053691241766720514, + "language_loss": 0.89336729, + "learning_rate": 0.0007515273253054132, + "loss": 0.90434659, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.32250977, + "step": 1831, + "time_per_iteration": 2.6600866317749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092956, + "balance_loss_mlp": 1.06191444, + "epoch": 0.35244324740284727, + "flos": 567105560064.0, + "grad_norm": 0.05928754583625919, + "language_loss": 0.82674569, + "learning_rate": 0.0007512580246216988, + "loss": 0.83767527, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.31005859, + "step": 1832, + "time_per_iteration": 2.7806639671325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089641, + "balance_loss_mlp": 1.05752611, + "epoch": 0.3526356290881108, + "flos": 512809989120.0, + "grad_norm": 0.0631616677310412, + "language_loss": 0.84810489, + "learning_rate": 0.000750988626390968, + "loss": 0.85900134, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.32104492, + "step": 1833, + "time_per_iteration": 2.5985615253448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087885, + "balance_loss_mlp": 1.0560801, + "epoch": 0.3528280107733744, + "flos": 595496627712.0, + "grad_norm": 0.053730319302775706, + "language_loss": 0.84857321, + "learning_rate": 0.0007507191307178108, + "loss": 0.85945207, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.31787109, + "step": 1834, + "time_per_iteration": 2.822472095489502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088823, + "balance_loss_mlp": 1.05785227, + "epoch": 0.3530203924586379, + "flos": 550969814016.0, + "grad_norm": 0.07238185360826516, + "language_loss": 0.74172056, + "learning_rate": 0.0007504495377068543, + "loss": 0.75260878, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.30932617, + "step": 1835, + "time_per_iteration": 2.758622884750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094119, + "balance_loss_mlp": 1.06250441, + "epoch": 0.3532127741439015, + "flos": 652662876672.0, + "grad_norm": 0.06860617015764896, + "language_loss": 0.81217551, + "learning_rate": 0.0007501798474627642, + "loss": 0.82311678, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.31591797, + "step": 1836, + "time_per_iteration": 2.932610034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095464, + "balance_loss_mlp": 1.06568563, + "epoch": 0.35340515582916504, + "flos": 722452594176.0, + "grad_norm": 0.06442397939494823, + "language_loss": 0.83527768, + "learning_rate": 0.0007499100600902433, + "loss": 0.8462323, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.29736328, + "step": 1837, + "time_per_iteration": 3.0089991092681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089306, + "balance_loss_mlp": 1.05845428, + "epoch": 0.35359753751442863, + "flos": 594619301376.0, + "grad_norm": 0.06893251529793973, + "language_loss": 0.83798671, + "learning_rate": 0.0007496401756940324, + "loss": 0.84887969, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.30810547, + "step": 1838, + "time_per_iteration": 2.6746418476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090814, + "balance_loss_mlp": 1.06029606, + "epoch": 0.3537899191996922, + "flos": 632384838144.0, + "grad_norm": 0.06403380726847299, + "language_loss": 0.82561135, + "learning_rate": 0.0007493701943789098, + "loss": 0.83651948, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.3046875, + "step": 1839, + "time_per_iteration": 2.7678062915802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092399, + "balance_loss_mlp": 1.06307316, + "epoch": 0.35398230088495575, + "flos": 506118523392.0, + "grad_norm": 0.057234368489623245, + "language_loss": 0.82641804, + "learning_rate": 0.000749100116249692, + "loss": 0.83734202, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.29272461, + "step": 1840, + "time_per_iteration": 2.6124982833862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091953, + "balance_loss_mlp": 1.0616498, + "epoch": 0.35417468257021933, + "flos": 507783015936.0, + "grad_norm": 0.09225915028059628, + "language_loss": 0.86273944, + "learning_rate": 0.0007488299414112321, + "loss": 0.87365901, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.30249023, + "step": 1841, + "time_per_iteration": 2.615434169769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087223, + "balance_loss_mlp": 1.05737281, + "epoch": 0.35436706425548287, + "flos": 656133046272.0, + "grad_norm": 0.0557731038759208, + "language_loss": 0.77796137, + "learning_rate": 0.0007485596699684215, + "loss": 0.78883362, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.2980957, + "step": 1842, + "time_per_iteration": 2.83414626121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087281, + "balance_loss_mlp": 1.05561948, + "epoch": 0.35455944594074645, + "flos": 652322433024.0, + "grad_norm": 0.04938820360777142, + "language_loss": 0.85113978, + "learning_rate": 0.000748289302026189, + "loss": 0.86201257, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.31640625, + "step": 1843, + "time_per_iteration": 2.8805251121520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084484, + "balance_loss_mlp": 1.05403841, + "epoch": 0.35475182762601, + "flos": 848240252928.0, + "grad_norm": 0.06499404847276229, + "language_loss": 0.85830677, + "learning_rate": 0.0007480188376895004, + "loss": 0.86915159, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.30395508, + "step": 1844, + "time_per_iteration": 3.0965142250061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062747, + "balance_loss_mlp": 1.04624832, + "epoch": 0.3549442093112736, + "flos": 1520644133376.0, + "grad_norm": 0.026974392702602535, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74874085, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.16503906, + "step": 1845, + "time_per_iteration": 5.003226280212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088616, + "balance_loss_mlp": 1.05738342, + "epoch": 0.3551365909965371, + "flos": 651087134208.0, + "grad_norm": 0.11496133406812095, + "language_loss": 0.78570682, + "learning_rate": 0.0007474776202528074, + "loss": 0.79659295, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.31201172, + "step": 1846, + "time_per_iteration": 2.9579098224639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089072, + "balance_loss_mlp": 1.05736208, + "epoch": 0.3553289726818007, + "flos": 897094213632.0, + "grad_norm": 0.06294098896241457, + "language_loss": 0.81369591, + "learning_rate": 0.000747206867362922, + "loss": 0.82458663, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.31689453, + "step": 1847, + "time_per_iteration": 3.0886905193328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109789, + "balance_loss_mlp": 1.06656218, + "epoch": 0.3555213543670643, + "flos": 688181958144.0, + "grad_norm": 0.060378794046525276, + "language_loss": 0.83593512, + "learning_rate": 0.0007469360184988194, + "loss": 0.84691405, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.31298828, + "step": 1848, + "time_per_iteration": 2.861438512802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109845, + "balance_loss_mlp": 1.06724131, + "epoch": 0.3557137360523278, + "flos": 538305647616.0, + "grad_norm": 0.06250375704468988, + "language_loss": 0.86663848, + "learning_rate": 0.0007466650737656518, + "loss": 0.87762296, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.31176758, + "step": 1849, + "time_per_iteration": 2.620384454727173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098996, + "balance_loss_mlp": 1.06754851, + "epoch": 0.3559061177375914, + "flos": 402039230976.0, + "grad_norm": 0.05619364173691644, + "language_loss": 0.90150386, + "learning_rate": 0.0007463940332686098, + "loss": 0.91249382, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.31420898, + "step": 1850, + "time_per_iteration": 2.499337911605835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097613, + "balance_loss_mlp": 1.06711888, + "epoch": 0.35609849942285493, + "flos": 696238170624.0, + "grad_norm": 0.05220134930851383, + "language_loss": 0.8454684, + "learning_rate": 0.0007461228971129205, + "loss": 0.85644454, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.30444336, + "step": 1851, + "time_per_iteration": 2.91583251953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090798, + "balance_loss_mlp": 1.06049538, + "epoch": 0.3562908811081185, + "flos": 568660953600.0, + "grad_norm": 0.06507053577711389, + "language_loss": 0.85374135, + "learning_rate": 0.0007458516654038483, + "loss": 0.8646493, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.30297852, + "step": 1852, + "time_per_iteration": 2.710845947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093158, + "balance_loss_mlp": 1.06221175, + "epoch": 0.35648326279338205, + "flos": 682081219584.0, + "grad_norm": 0.055267605083424515, + "language_loss": 0.86826843, + "learning_rate": 0.0007455803382466946, + "loss": 0.87919998, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.30908203, + "step": 1853, + "time_per_iteration": 2.8157601356506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089896, + "balance_loss_mlp": 1.05894923, + "epoch": 0.35667564447864564, + "flos": 628840475136.0, + "grad_norm": 0.06143674576014299, + "language_loss": 0.87150055, + "learning_rate": 0.0007453089157467979, + "loss": 0.8823995, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.30908203, + "step": 1854, + "time_per_iteration": 2.7985024452209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101035, + "balance_loss_mlp": 1.06946826, + "epoch": 0.35686802616390917, + "flos": 813685837824.0, + "grad_norm": 0.06203911404438901, + "language_loss": 0.82222199, + "learning_rate": 0.0007450373980095341, + "loss": 0.83323234, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.31542969, + "step": 1855, + "time_per_iteration": 3.0960283279418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101415, + "balance_loss_mlp": 1.07108843, + "epoch": 0.35706040784917276, + "flos": 525922288128.0, + "grad_norm": 0.05169641299516589, + "language_loss": 0.86845142, + "learning_rate": 0.0007447657851403155, + "loss": 0.87946558, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.30322266, + "step": 1856, + "time_per_iteration": 2.6420810222625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106839, + "balance_loss_mlp": 1.07689333, + "epoch": 0.35725278953443634, + "flos": 511698345984.0, + "grad_norm": 0.07027910399075639, + "language_loss": 0.78771162, + "learning_rate": 0.0007444940772445915, + "loss": 0.79878008, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.29907227, + "step": 1857, + "time_per_iteration": 2.748770236968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109389, + "balance_loss_mlp": 1.06420684, + "epoch": 0.3574451712196999, + "flos": 487162971648.0, + "grad_norm": 0.057407361829253975, + "language_loss": 0.80228555, + "learning_rate": 0.0007442222744278484, + "loss": 0.81322443, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.29663086, + "step": 1858, + "time_per_iteration": 2.652111530303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094475, + "balance_loss_mlp": 1.06410074, + "epoch": 0.35763755290496346, + "flos": 550384879104.0, + "grad_norm": 0.045384089682170406, + "language_loss": 0.8399753, + "learning_rate": 0.0007439503767956099, + "loss": 0.85092002, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.30371094, + "step": 1859, + "time_per_iteration": 2.703261375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044367, + "balance_loss_mlp": 1.03111064, + "epoch": 0.357829934590227, + "flos": 1503300791808.0, + "grad_norm": 0.02493030642290896, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80715972, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.1328125, + "step": 1860, + "time_per_iteration": 4.983760833740234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092897, + "balance_loss_mlp": 1.06242704, + "epoch": 0.3580223162754906, + "flos": 568410670080.0, + "grad_norm": 0.05045998946960442, + "language_loss": 0.85959804, + "learning_rate": 0.000743406297506922, + "loss": 0.87052703, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.30419922, + "step": 1861, + "time_per_iteration": 2.740078926086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090008, + "balance_loss_mlp": 1.05956221, + "epoch": 0.3582146979607541, + "flos": 626153089536.0, + "grad_norm": 0.05968554082553822, + "language_loss": 0.8392486, + "learning_rate": 0.0007431341160617031, + "loss": 0.85014868, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.30395508, + "step": 1862, + "time_per_iteration": 2.8886373043060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076671, + "balance_loss_mlp": 1.04631984, + "epoch": 0.3584070796460177, + "flos": 507010406400.0, + "grad_norm": 0.053643840261235066, + "language_loss": 0.88015211, + "learning_rate": 0.0007428618402234491, + "loss": 0.89091879, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.30297852, + "step": 1863, + "time_per_iteration": 2.687030553817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074194, + "balance_loss_mlp": 1.04334283, + "epoch": 0.3585994613312813, + "flos": 606190763520.0, + "grad_norm": 0.062332671108041963, + "language_loss": 0.80358481, + "learning_rate": 0.0007425894700978668, + "loss": 0.81432676, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.30810547, + "step": 1864, + "time_per_iteration": 2.7334656715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072556, + "balance_loss_mlp": 1.04101336, + "epoch": 0.3587918430165448, + "flos": 1412338484736.0, + "grad_norm": 0.050645747658019255, + "language_loss": 0.79510379, + "learning_rate": 0.0007423170057906996, + "loss": 0.80582935, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.31542969, + "step": 1865, + "time_per_iteration": 3.8669073581695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076041, + "balance_loss_mlp": 1.04452205, + "epoch": 0.3589842247018084, + "flos": 478313800704.0, + "grad_norm": 0.06345597879427126, + "language_loss": 0.86289865, + "learning_rate": 0.0007420444474077275, + "loss": 0.87365907, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.31518555, + "step": 1866, + "time_per_iteration": 2.5648367404937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080689, + "balance_loss_mlp": 1.04878831, + "epoch": 0.35917660638707194, + "flos": 504464205312.0, + "grad_norm": 0.058480526362169126, + "language_loss": 0.89744091, + "learning_rate": 0.0007417717950547671, + "loss": 0.90824777, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.31884766, + "step": 1867, + "time_per_iteration": 2.5665245056152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074714, + "balance_loss_mlp": 1.0600276, + "epoch": 0.3593689880723355, + "flos": 1491294191616.0, + "grad_norm": 0.04131149216661822, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77071321, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.14648438, + "step": 1868, + "time_per_iteration": 4.900072813034058 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091662, + "balance_loss_mlp": 1.06035757, + "epoch": 0.35956136975759906, + "flos": 528369564672.0, + "grad_norm": 0.04948067344873762, + "language_loss": 0.84714514, + "learning_rate": 0.0007412262088623299, + "loss": 0.85806173, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.31274414, + "step": 1869, + "time_per_iteration": 2.72872257232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109305, + "balance_loss_mlp": 1.06255615, + "epoch": 0.35975375144286265, + "flos": 534647803392.0, + "grad_norm": 0.0631690153505957, + "language_loss": 0.79514921, + "learning_rate": 0.0007409532752346684, + "loss": 0.80607969, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.30444336, + "step": 1870, + "time_per_iteration": 2.646813154220581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084384, + "balance_loss_mlp": 1.05436683, + "epoch": 0.3599461331281262, + "flos": 504695549952.0, + "grad_norm": 0.05200384527654752, + "language_loss": 0.88430232, + "learning_rate": 0.0007406802480606491, + "loss": 0.89514613, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.29956055, + "step": 1871, + "time_per_iteration": 2.6335039138793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088571, + "balance_loss_mlp": 1.05819631, + "epoch": 0.36013851481338977, + "flos": 511283708928.0, + "grad_norm": 0.058340376963862656, + "language_loss": 0.90469301, + "learning_rate": 0.0007404071274462707, + "loss": 0.91557872, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.3034668, + "step": 1872, + "time_per_iteration": 2.579155206680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088392, + "balance_loss_mlp": 1.05911398, + "epoch": 0.36033089649865335, + "flos": 547330908672.0, + "grad_norm": 0.06288764850432389, + "language_loss": 0.83945811, + "learning_rate": 0.0007401339134975682, + "loss": 0.85034204, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.29272461, + "step": 1873, + "time_per_iteration": 2.6590254306793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089736, + "balance_loss_mlp": 1.06024313, + "epoch": 0.3605232781839169, + "flos": 458416903680.0, + "grad_norm": 0.07025897777145818, + "language_loss": 0.84501064, + "learning_rate": 0.0007398606063206122, + "loss": 0.85590804, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.29467773, + "step": 1874, + "time_per_iteration": 2.6330654621124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084755, + "balance_loss_mlp": 1.05545354, + "epoch": 0.36071565986918047, + "flos": 509309296128.0, + "grad_norm": 0.05525815693458704, + "language_loss": 0.78668261, + "learning_rate": 0.0007395872060215101, + "loss": 0.79753017, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.29296875, + "step": 1875, + "time_per_iteration": 2.665205955505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087119, + "balance_loss_mlp": 1.05853248, + "epoch": 0.360908041554444, + "flos": 558931484160.0, + "grad_norm": 0.05566722247490556, + "language_loss": 0.88191175, + "learning_rate": 0.0007393137127064056, + "loss": 0.89278299, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.28588867, + "step": 1876, + "time_per_iteration": 2.67520809173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083262, + "balance_loss_mlp": 1.05479455, + "epoch": 0.3611004232397076, + "flos": 523588492800.0, + "grad_norm": 0.05183280051917729, + "language_loss": 0.84175742, + "learning_rate": 0.0007390401264814779, + "loss": 0.85258996, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.28491211, + "step": 1877, + "time_per_iteration": 2.621708631515503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083828, + "balance_loss_mlp": 1.05559897, + "epoch": 0.3612928049249711, + "flos": 540728193024.0, + "grad_norm": 0.059598774698536174, + "language_loss": 0.84762645, + "learning_rate": 0.0007387664474529427, + "loss": 0.85846466, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.28222656, + "step": 1878, + "time_per_iteration": 2.64604115486145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085745, + "balance_loss_mlp": 1.0567776, + "epoch": 0.3614851866102347, + "flos": 552289480704.0, + "grad_norm": 0.05278661870548292, + "language_loss": 0.90893793, + "learning_rate": 0.0007384926757270518, + "loss": 0.91979533, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.28955078, + "step": 1879, + "time_per_iteration": 2.63849139213562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094605, + "balance_loss_mlp": 1.0652554, + "epoch": 0.36167756829549824, + "flos": 771734338560.0, + "grad_norm": 0.05095981973878578, + "language_loss": 0.79965544, + "learning_rate": 0.0007382188114100924, + "loss": 0.81060153, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.29296875, + "step": 1880, + "time_per_iteration": 2.967137098312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096998, + "balance_loss_mlp": 1.06731534, + "epoch": 0.36186994998076183, + "flos": 711560609280.0, + "grad_norm": 0.0523610100033388, + "language_loss": 0.81541228, + "learning_rate": 0.0007379448546083884, + "loss": 0.82638228, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.29663086, + "step": 1881, + "time_per_iteration": 2.935075283050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089574, + "balance_loss_mlp": 1.06036723, + "epoch": 0.3620623316660254, + "flos": 747209138688.0, + "grad_norm": 0.056326792126263736, + "language_loss": 0.88131809, + "learning_rate": 0.0007376708054282992, + "loss": 0.89221382, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.29174805, + "step": 1882, + "time_per_iteration": 2.9548256397247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080549, + "balance_loss_mlp": 1.05074644, + "epoch": 0.36225471335128895, + "flos": 482312088576.0, + "grad_norm": 0.053377968629185854, + "language_loss": 0.8395232, + "learning_rate": 0.0007373966639762201, + "loss": 0.85032874, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.29785156, + "step": 1883, + "time_per_iteration": 2.5978147983551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079871, + "balance_loss_mlp": 1.05085516, + "epoch": 0.36244709503655254, + "flos": 506655406080.0, + "grad_norm": 0.055969169447774005, + "language_loss": 0.88542271, + "learning_rate": 0.0007371224303585822, + "loss": 0.8962214, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.29003906, + "step": 1884, + "time_per_iteration": 2.573521614074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122192, + "balance_loss_mlp": 1.10817313, + "epoch": 0.36263947672181607, + "flos": 1393302643200.0, + "grad_norm": 0.05390094690370155, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81479263, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.140625, + "step": 1885, + "time_per_iteration": 4.762617826461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077599, + "balance_loss_mlp": 1.04722452, + "epoch": 0.36283185840707965, + "flos": 652991735808.0, + "grad_norm": 0.05279204841925659, + "language_loss": 0.8277564, + "learning_rate": 0.0007365736870525335, + "loss": 0.83853239, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.30322266, + "step": 1886, + "time_per_iteration": 2.8206799030303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071958, + "balance_loss_mlp": 1.04182231, + "epoch": 0.3630242400923432, + "flos": 488619440640.0, + "grad_norm": 0.0631822735743998, + "language_loss": 0.82252121, + "learning_rate": 0.000736299177577164, + "loss": 0.83324087, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.30102539, + "step": 1887, + "time_per_iteration": 2.5644423961639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075611, + "balance_loss_mlp": 1.04516482, + "epoch": 0.3632166217776068, + "flos": 516892644864.0, + "grad_norm": 0.06952119877485304, + "language_loss": 0.83928037, + "learning_rate": 0.0007360245763623174, + "loss": 0.8500365, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.30395508, + "step": 1888, + "time_per_iteration": 2.68868088722229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076643, + "balance_loss_mlp": 1.04614949, + "epoch": 0.36340900346287036, + "flos": 645881250816.0, + "grad_norm": 0.05500458280543127, + "language_loss": 0.89759338, + "learning_rate": 0.0007357498835146039, + "loss": 0.90835977, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.30444336, + "step": 1889, + "time_per_iteration": 2.841135263442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078037, + "balance_loss_mlp": 1.04716182, + "epoch": 0.3636013851481339, + "flos": 553057708032.0, + "grad_norm": 0.05518095134274227, + "language_loss": 0.86945391, + "learning_rate": 0.0007354750991406684, + "loss": 0.8802343, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.30834961, + "step": 1890, + "time_per_iteration": 2.6954762935638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079646, + "balance_loss_mlp": 1.04810333, + "epoch": 0.3637937668333975, + "flos": 546395355648.0, + "grad_norm": 0.060964398763012274, + "language_loss": 0.80524838, + "learning_rate": 0.0007352002233471919, + "loss": 0.81604487, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.31518555, + "step": 1891, + "time_per_iteration": 2.6167404651641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079464, + "balance_loss_mlp": 1.04973292, + "epoch": 0.363986148518661, + "flos": 537838576128.0, + "grad_norm": 0.06807309201777603, + "language_loss": 0.79092562, + "learning_rate": 0.0007349252562408906, + "loss": 0.80172026, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.296875, + "step": 1892, + "time_per_iteration": 2.6944479942321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091379, + "balance_loss_mlp": 1.06071806, + "epoch": 0.3641785302039246, + "flos": 659895607296.0, + "grad_norm": 0.05563142804906438, + "language_loss": 0.81399196, + "learning_rate": 0.0007346501979285158, + "loss": 0.82490575, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.30615234, + "step": 1893, + "time_per_iteration": 2.8852903842926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074867, + "balance_loss_mlp": 1.06208813, + "epoch": 0.36437091188918813, + "flos": 1467911158272.0, + "grad_norm": 0.02944776437417564, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.8161397, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.12792969, + "step": 1894, + "time_per_iteration": 4.784174680709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114227, + "balance_loss_mlp": 1.0819447, + "epoch": 0.3645632935744517, + "flos": 597012733440.0, + "grad_norm": 0.051755500006301046, + "language_loss": 0.8558799, + "learning_rate": 0.0007340998081127308, + "loss": 0.86702216, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.32275391, + "step": 1895, + "time_per_iteration": 2.807494878768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121943, + "balance_loss_mlp": 1.09023345, + "epoch": 0.36475567525971525, + "flos": 599214108672.0, + "grad_norm": 0.06567695066031824, + "language_loss": 0.90748346, + "learning_rate": 0.0007338244768230007, + "loss": 0.9187029, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.31689453, + "step": 1896, + "time_per_iteration": 2.7678794860839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118221, + "balance_loss_mlp": 1.08694077, + "epoch": 0.36494805694497884, + "flos": 798047686656.0, + "grad_norm": 0.07782470610585689, + "language_loss": 0.8913762, + "learning_rate": 0.0007335490547545578, + "loss": 0.90255845, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.3125, + "step": 1897, + "time_per_iteration": 3.0801138877868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112607, + "balance_loss_mlp": 1.0822562, + "epoch": 0.3651404386302424, + "flos": 637023315456.0, + "grad_norm": 0.05264242736204855, + "language_loss": 0.82653165, + "learning_rate": 0.0007332735420143308, + "loss": 0.83765769, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.30297852, + "step": 1898, + "time_per_iteration": 2.7581489086151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094572, + "balance_loss_mlp": 1.06338716, + "epoch": 0.36533282031550596, + "flos": 491337349632.0, + "grad_norm": 0.06387883695900265, + "language_loss": 0.8681283, + "learning_rate": 0.0007329979387092826, + "loss": 0.87907398, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.31152344, + "step": 1899, + "time_per_iteration": 2.586489677429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090283, + "balance_loss_mlp": 1.05964673, + "epoch": 0.36552520200076954, + "flos": 855587874816.0, + "grad_norm": 0.054083416077733606, + "language_loss": 0.83626556, + "learning_rate": 0.0007327222449464124, + "loss": 0.84716845, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.3059082, + "step": 1900, + "time_per_iteration": 3.2495076656341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010843, + "balance_loss_mlp": 1.0518986, + "epoch": 0.3657175836860331, + "flos": 483449872896.0, + "grad_norm": 0.05500564094416643, + "language_loss": 0.88598847, + "learning_rate": 0.0007324464608327538, + "loss": 0.89683151, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.32397461, + "step": 1901, + "time_per_iteration": 2.617971897125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079363, + "balance_loss_mlp": 1.04786777, + "epoch": 0.36590996537129666, + "flos": 434561006592.0, + "grad_norm": 0.0538418205513684, + "language_loss": 0.88291639, + "learning_rate": 0.0007321705864753758, + "loss": 0.89371002, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.31469727, + "step": 1902, + "time_per_iteration": 2.69343638420105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074418, + "balance_loss_mlp": 1.04294717, + "epoch": 0.3661023470565602, + "flos": 711880704000.0, + "grad_norm": 0.056477009868628435, + "language_loss": 0.84098166, + "learning_rate": 0.0007318946219813823, + "loss": 0.85172582, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.31469727, + "step": 1903, + "time_per_iteration": 3.010847568511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074916, + "balance_loss_mlp": 1.04232407, + "epoch": 0.3662947287418238, + "flos": 564495340032.0, + "grad_norm": 0.05768945263904951, + "language_loss": 0.89714533, + "learning_rate": 0.000731618567457912, + "loss": 0.90789449, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.32592773, + "step": 1904, + "time_per_iteration": 2.6410703659057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076588, + "balance_loss_mlp": 1.0440681, + "epoch": 0.3664871104270873, + "flos": 789391982592.0, + "grad_norm": 0.05570087619571841, + "language_loss": 0.86445332, + "learning_rate": 0.000731342423012139, + "loss": 0.87521917, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.32519531, + "step": 1905, + "time_per_iteration": 3.054703712463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074863, + "balance_loss_mlp": 1.04312992, + "epoch": 0.3666794921123509, + "flos": 752202616320.0, + "grad_norm": 0.05663901457074664, + "language_loss": 0.82393479, + "learning_rate": 0.0007310661887512722, + "loss": 0.83468342, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.31713867, + "step": 1906, + "time_per_iteration": 3.0096654891967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076944, + "balance_loss_mlp": 1.04532969, + "epoch": 0.3668718737976145, + "flos": 523264015872.0, + "grad_norm": 0.07427377535541638, + "language_loss": 0.8207258, + "learning_rate": 0.0007307898647825549, + "loss": 0.83149529, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.31591797, + "step": 1907, + "time_per_iteration": 2.67525315284729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075733, + "balance_loss_mlp": 1.04347432, + "epoch": 0.367064255482878, + "flos": 571698957312.0, + "grad_norm": 0.07021562329929035, + "language_loss": 0.89152002, + "learning_rate": 0.0007305134512132659, + "loss": 0.90227735, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.32250977, + "step": 1908, + "time_per_iteration": 2.6483352184295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079044, + "balance_loss_mlp": 1.0476923, + "epoch": 0.3672566371681416, + "flos": 446880347136.0, + "grad_norm": 0.07878350898766671, + "language_loss": 0.83255082, + "learning_rate": 0.0007302369481507183, + "loss": 0.84334129, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.31323242, + "step": 1909, + "time_per_iteration": 2.5106606483459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108859, + "balance_loss_mlp": 1.09207463, + "epoch": 0.36744901885340514, + "flos": 1539275208192.0, + "grad_norm": 0.039316944601114644, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.8107062, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.16796875, + "step": 1910, + "time_per_iteration": 4.845642566680908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073802, + "balance_loss_mlp": 1.04287899, + "epoch": 0.36764140053866873, + "flos": 563417192448.0, + "grad_norm": 0.05282525969479425, + "language_loss": 0.8551507, + "learning_rate": 0.000729683673975274, + "loss": 0.86588871, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.30883789, + "step": 1911, + "time_per_iteration": 2.643991470336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077837, + "balance_loss_mlp": 1.04648542, + "epoch": 0.36783378222393226, + "flos": 1216168971264.0, + "grad_norm": 0.06579029503933971, + "language_loss": 0.83071077, + "learning_rate": 0.0007294069030771774, + "loss": 0.84148908, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.31323242, + "step": 1912, + "time_per_iteration": 3.6745028495788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081127, + "balance_loss_mlp": 1.05053759, + "epoch": 0.36802616390919585, + "flos": 498476947968.0, + "grad_norm": 0.055639286508135585, + "language_loss": 0.90529931, + "learning_rate": 0.0007291300431154224, + "loss": 0.91611063, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.30541992, + "step": 1913, + "time_per_iteration": 2.6364145278930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020102, + "balance_loss_mlp": 1.00503433, + "epoch": 0.36821854559445943, + "flos": 1581281961984.0, + "grad_norm": 0.014819520409209537, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71409839, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.15039062, + "step": 1914, + "time_per_iteration": 4.986552000045776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089166, + "balance_loss_mlp": 1.05895889, + "epoch": 0.36841092727972297, + "flos": 835261784064.0, + "grad_norm": 0.07166131614104637, + "language_loss": 0.80129957, + "learning_rate": 0.0007285760564309179, + "loss": 0.81219125, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.30151367, + "step": 1915, + "time_per_iteration": 3.105180025100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082922, + "balance_loss_mlp": 1.05362058, + "epoch": 0.36860330896498655, + "flos": 689517591552.0, + "grad_norm": 0.07315246202889085, + "language_loss": 0.85023272, + "learning_rate": 0.0007282989299232448, + "loss": 0.86106199, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.29272461, + "step": 1916, + "time_per_iteration": 3.0501549243927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085882, + "balance_loss_mlp": 1.05710506, + "epoch": 0.3687956906502501, + "flos": 553919067648.0, + "grad_norm": 0.0682472178493412, + "language_loss": 0.83468378, + "learning_rate": 0.0007280217147820668, + "loss": 0.84554267, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.28735352, + "step": 1917, + "time_per_iteration": 2.61570143699646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096949, + "balance_loss_mlp": 1.06836295, + "epoch": 0.3689880723355137, + "flos": 576426184704.0, + "grad_norm": 0.06368361877082852, + "language_loss": 0.79183483, + "learning_rate": 0.0007277444111150079, + "loss": 0.80280429, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.28613281, + "step": 1918, + "time_per_iteration": 2.7004950046539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108995, + "balance_loss_mlp": 1.06124449, + "epoch": 0.3691804540207772, + "flos": 528615465984.0, + "grad_norm": 0.07280537378335762, + "language_loss": 0.84052753, + "learning_rate": 0.0007274670190297272, + "loss": 0.85142708, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.28710938, + "step": 1919, + "time_per_iteration": 2.598128080368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098824, + "balance_loss_mlp": 1.06902122, + "epoch": 0.3693728357060408, + "flos": 560729806848.0, + "grad_norm": 0.05243134255501039, + "language_loss": 0.82081646, + "learning_rate": 0.0007271895386339179, + "loss": 0.83180475, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.29736328, + "step": 1920, + "time_per_iteration": 2.7843308448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093148, + "balance_loss_mlp": 1.06360769, + "epoch": 0.3695652173913043, + "flos": 579488919552.0, + "grad_norm": 0.058714378397154585, + "language_loss": 0.83102447, + "learning_rate": 0.0007269119700353073, + "loss": 0.8419559, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.29492188, + "step": 1921, + "time_per_iteration": 2.7665817737579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089369, + "balance_loss_mlp": 1.06052053, + "epoch": 0.3697575990765679, + "flos": 512629516800.0, + "grad_norm": 0.04695414461356542, + "language_loss": 0.84780574, + "learning_rate": 0.0007266343133416571, + "loss": 0.85869944, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.28833008, + "step": 1922, + "time_per_iteration": 2.779585361480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065569, + "balance_loss_mlp": 1.05011928, + "epoch": 0.3699499807618315, + "flos": 1569826953216.0, + "grad_norm": 0.04139595668748732, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78182483, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.15429688, + "step": 1923, + "time_per_iteration": 4.841213703155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085527, + "balance_loss_mlp": 1.05591547, + "epoch": 0.37014236244709503, + "flos": 497093262336.0, + "grad_norm": 0.07673769099321799, + "language_loss": 0.84293365, + "learning_rate": 0.0007260787361004556, + "loss": 0.85378897, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.2956543, + "step": 1924, + "time_per_iteration": 2.5501017570495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023058, + "balance_loss_mlp": 1.00875258, + "epoch": 0.3703347441323586, + "flos": 1443562048512.0, + "grad_norm": 0.01226438472350035, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74784565, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.14257812, + "step": 1925, + "time_per_iteration": 4.9058191776275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079083, + "balance_loss_mlp": 1.05040073, + "epoch": 0.37052712581762215, + "flos": 563324060160.0, + "grad_norm": 0.0733591012555623, + "language_loss": 0.87266588, + "learning_rate": 0.0007255228077730903, + "loss": 0.88345671, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.28686523, + "step": 1926, + "time_per_iteration": 2.6776785850524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080805, + "balance_loss_mlp": 1.05281413, + "epoch": 0.37071950750288574, + "flos": 925706451456.0, + "grad_norm": 0.05143591599053885, + "language_loss": 0.81313562, + "learning_rate": 0.0007252447122218632, + "loss": 0.82394373, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.2800293, + "step": 1927, + "time_per_iteration": 3.1710472106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077781, + "balance_loss_mlp": 1.04907489, + "epoch": 0.37091188918814927, + "flos": 418090609152.0, + "grad_norm": 0.07597924069729044, + "language_loss": 0.88653511, + "learning_rate": 0.0007249665292228834, + "loss": 0.89731288, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.28686523, + "step": 1928, + "time_per_iteration": 2.580092191696167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108352, + "balance_loss_mlp": 1.0547905, + "epoch": 0.37110427087341286, + "flos": 462941899776.0, + "grad_norm": 0.05796370091963761, + "language_loss": 0.8379482, + "learning_rate": 0.000724688258884151, + "loss": 0.84878337, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.28710938, + "step": 1929, + "time_per_iteration": 2.6322267055511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086107, + "balance_loss_mlp": 1.05740142, + "epoch": 0.3712966525586764, + "flos": 849303843840.0, + "grad_norm": 0.049384577339976525, + "language_loss": 0.86327779, + "learning_rate": 0.0007244099013137002, + "loss": 0.87413883, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.28710938, + "step": 1930, + "time_per_iteration": 3.09224009513855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087908, + "balance_loss_mlp": 1.05951214, + "epoch": 0.37148903424394, + "flos": 925555092480.0, + "grad_norm": 0.06129670734370297, + "language_loss": 0.88767004, + "learning_rate": 0.0007241314566195993, + "loss": 0.89854914, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.28393555, + "step": 1931, + "time_per_iteration": 3.238381862640381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094186, + "balance_loss_mlp": 1.06531322, + "epoch": 0.37168141592920356, + "flos": 519565473792.0, + "grad_norm": 0.05545779345638414, + "language_loss": 0.85434037, + "learning_rate": 0.0007238529249099496, + "loss": 0.86528224, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.28833008, + "step": 1932, + "time_per_iteration": 2.632279872894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159138, + "balance_loss_mlp": 1.1475507, + "epoch": 0.3718737976144671, + "flos": 1445107267584.0, + "grad_norm": 0.054961579821259376, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.79016018, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.11572266, + "step": 1933, + "time_per_iteration": 4.920037746429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098131, + "balance_loss_mlp": 1.06902027, + "epoch": 0.3720661792997307, + "flos": 759218558976.0, + "grad_norm": 0.06411393233522368, + "language_loss": 0.80432916, + "learning_rate": 0.000723295600876581, + "loss": 0.81531054, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.29101562, + "step": 1934, + "time_per_iteration": 3.060438632965088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093478, + "balance_loss_mlp": 1.06510615, + "epoch": 0.3722585609849942, + "flos": 516686031360.0, + "grad_norm": 0.054125512250282885, + "language_loss": 0.87856102, + "learning_rate": 0.0007230168087692344, + "loss": 0.88949579, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.28393555, + "step": 1935, + "time_per_iteration": 2.655176877975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095042, + "balance_loss_mlp": 1.06607461, + "epoch": 0.3724509426702578, + "flos": 782114171904.0, + "grad_norm": 0.053712544631880174, + "language_loss": 0.82501912, + "learning_rate": 0.0007227379300790839, + "loss": 0.83596957, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.28955078, + "step": 1936, + "time_per_iteration": 3.05722713470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086223, + "balance_loss_mlp": 1.05668318, + "epoch": 0.37264332435552133, + "flos": 391502246400.0, + "grad_norm": 0.05452705072121448, + "language_loss": 0.85148442, + "learning_rate": 0.0007224589649143997, + "loss": 0.86234665, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.29492188, + "step": 1937, + "time_per_iteration": 2.593818187713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089067, + "balance_loss_mlp": 1.06021869, + "epoch": 0.3728357060407849, + "flos": 542599299072.0, + "grad_norm": 0.08689315573767935, + "language_loss": 0.80660325, + "learning_rate": 0.0007221799133834861, + "loss": 0.81749392, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.28833008, + "step": 1938, + "time_per_iteration": 2.6238772869110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087089, + "balance_loss_mlp": 1.05869377, + "epoch": 0.3730280877260485, + "flos": 433344646656.0, + "grad_norm": 0.06550449761554421, + "language_loss": 0.81904262, + "learning_rate": 0.00072190077559468, + "loss": 0.8299135, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.28417969, + "step": 1939, + "time_per_iteration": 2.5338878631591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086155, + "balance_loss_mlp": 1.05649543, + "epoch": 0.37322046941131204, + "flos": 531230068224.0, + "grad_norm": 0.05171807924061888, + "language_loss": 0.89000612, + "learning_rate": 0.0007216215516563527, + "loss": 0.90086764, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.29589844, + "step": 1940, + "time_per_iteration": 2.717912435531616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083323, + "balance_loss_mlp": 1.05449796, + "epoch": 0.3734128510965756, + "flos": 531294087168.0, + "grad_norm": 0.06398735943962416, + "language_loss": 0.83462608, + "learning_rate": 0.0007213422416769083, + "loss": 0.84545934, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.28808594, + "step": 1941, + "time_per_iteration": 2.6354072093963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107949, + "balance_loss_mlp": 1.0511179, + "epoch": 0.37360523278183916, + "flos": 500195284992.0, + "grad_norm": 0.05310409823342424, + "language_loss": 0.75118601, + "learning_rate": 0.0007210628457647849, + "loss": 0.76198089, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.28369141, + "step": 1942, + "time_per_iteration": 2.573251724243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080746, + "balance_loss_mlp": 1.05118251, + "epoch": 0.37379761446710275, + "flos": 547652413440.0, + "grad_norm": 0.05561530112530558, + "language_loss": 0.78689432, + "learning_rate": 0.000720783364028453, + "loss": 0.79770184, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.29516602, + "step": 1943, + "time_per_iteration": 2.782897472381592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078848, + "balance_loss_mlp": 1.04935515, + "epoch": 0.3739899961523663, + "flos": 475517316096.0, + "grad_norm": 0.05583674557333592, + "language_loss": 0.87426305, + "learning_rate": 0.0007205037965764177, + "loss": 0.88505149, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.29467773, + "step": 1944, + "time_per_iteration": 2.577195167541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076729, + "balance_loss_mlp": 1.04740369, + "epoch": 0.37418237783762986, + "flos": 611626581504.0, + "grad_norm": 0.05970518460248593, + "language_loss": 0.8568424, + "learning_rate": 0.0007202241435172161, + "loss": 0.86760962, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.29296875, + "step": 1945, + "time_per_iteration": 2.7356040477752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077347, + "balance_loss_mlp": 1.04849827, + "epoch": 0.3743747595228934, + "flos": 765953694720.0, + "grad_norm": 0.057784843601785166, + "language_loss": 0.88219595, + "learning_rate": 0.0007199444049594198, + "loss": 0.89296943, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.28833008, + "step": 1946, + "time_per_iteration": 2.997744560241699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075997, + "balance_loss_mlp": 1.04681468, + "epoch": 0.374567141208157, + "flos": 524120993280.0, + "grad_norm": 0.05996621635377081, + "language_loss": 0.83343232, + "learning_rate": 0.0007196645810116322, + "loss": 0.84419227, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.29150391, + "step": 1947, + "time_per_iteration": 2.6596434116363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071198, + "balance_loss_mlp": 1.04308891, + "epoch": 0.37475952289342057, + "flos": 681067090944.0, + "grad_norm": 0.07792528533349045, + "language_loss": 0.8387686, + "learning_rate": 0.0007193846717824912, + "loss": 0.84948057, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.28149414, + "step": 1948, + "time_per_iteration": 2.87357759475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067752, + "balance_loss_mlp": 1.04031014, + "epoch": 0.3749519045786841, + "flos": 460061047296.0, + "grad_norm": 0.06284621907245236, + "language_loss": 0.88014293, + "learning_rate": 0.0007191046773806669, + "loss": 0.89082038, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.27514648, + "step": 1949, + "time_per_iteration": 2.616118907928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073776, + "balance_loss_mlp": 1.04473686, + "epoch": 0.3751442862639477, + "flos": 954471458304.0, + "grad_norm": 0.06080214721481266, + "language_loss": 0.83072305, + "learning_rate": 0.0007188245979148631, + "loss": 0.84146082, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.29003906, + "step": 1950, + "time_per_iteration": 3.212918281555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080402, + "balance_loss_mlp": 1.05164886, + "epoch": 0.3753366679492112, + "flos": 527483473920.0, + "grad_norm": 0.06034460157863772, + "language_loss": 0.87560785, + "learning_rate": 0.0007185444334938157, + "loss": 0.88641185, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.28735352, + "step": 1951, + "time_per_iteration": 2.6847927570343018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074706, + "balance_loss_mlp": 1.04635811, + "epoch": 0.3755290496344748, + "flos": 521535504384.0, + "grad_norm": 0.07362347851216991, + "language_loss": 0.85023165, + "learning_rate": 0.0007182641842262947, + "loss": 0.86097872, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.28320312, + "step": 1952, + "time_per_iteration": 2.6011481285095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080682, + "balance_loss_mlp": 1.05252457, + "epoch": 0.37572143131973834, + "flos": 620810403840.0, + "grad_norm": 0.05143100601063952, + "language_loss": 0.77525514, + "learning_rate": 0.0007179838502211022, + "loss": 0.78606194, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.28198242, + "step": 1953, + "time_per_iteration": 2.8322203159332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082839, + "balance_loss_mlp": 1.05487227, + "epoch": 0.37591381300500193, + "flos": 770635842048.0, + "grad_norm": 0.06528688845841664, + "language_loss": 0.86487108, + "learning_rate": 0.0007177034315870738, + "loss": 0.87569952, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.27978516, + "step": 1954, + "time_per_iteration": 2.9551377296447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077022, + "balance_loss_mlp": 1.04896057, + "epoch": 0.37610619469026546, + "flos": 520191106560.0, + "grad_norm": 0.059767476828271, + "language_loss": 0.90968794, + "learning_rate": 0.0007174229284330773, + "loss": 0.9204582, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.28076172, + "step": 1955, + "time_per_iteration": 2.5916919708251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076033, + "balance_loss_mlp": 1.0481143, + "epoch": 0.37629857637552905, + "flos": 598524456960.0, + "grad_norm": 0.06317358450106399, + "language_loss": 0.87043428, + "learning_rate": 0.0007171423408680141, + "loss": 0.88119459, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.27954102, + "step": 1956, + "time_per_iteration": 2.8243377208709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072634, + "balance_loss_mlp": 1.04352272, + "epoch": 0.37649095806079264, + "flos": 564687396864.0, + "grad_norm": 0.057758823731725896, + "language_loss": 0.89565909, + "learning_rate": 0.0007168616690008176, + "loss": 0.90638542, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.29125977, + "step": 1957, + "time_per_iteration": 2.6314306259155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074053, + "balance_loss_mlp": 1.04572916, + "epoch": 0.37668333974605617, + "flos": 592196755968.0, + "grad_norm": 0.055146864479517985, + "language_loss": 0.86279052, + "learning_rate": 0.0007165809129404545, + "loss": 0.87353098, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.28320312, + "step": 1958, + "time_per_iteration": 2.7625439167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074993, + "balance_loss_mlp": 1.044595, + "epoch": 0.37687572143131975, + "flos": 419257506816.0, + "grad_norm": 0.06141204693847206, + "language_loss": 0.85977095, + "learning_rate": 0.0007163000727959239, + "loss": 0.87052089, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.30371094, + "step": 1959, + "time_per_iteration": 2.473407506942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061387, + "balance_loss_mlp": 1.04622388, + "epoch": 0.3770681031165833, + "flos": 1356484243968.0, + "grad_norm": 0.02935416999593297, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79020452, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.15136719, + "step": 1960, + "time_per_iteration": 4.8784215450286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079277, + "balance_loss_mlp": 1.04973722, + "epoch": 0.3772604848018469, + "flos": 644592107520.0, + "grad_norm": 0.05722982355969982, + "language_loss": 0.84446192, + "learning_rate": 0.00071573814069052, + "loss": 0.85525477, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.29541016, + "step": 1961, + "time_per_iteration": 2.929955244064331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078902, + "balance_loss_mlp": 1.05031538, + "epoch": 0.3774528664871104, + "flos": 901265619456.0, + "grad_norm": 0.053564242831421076, + "language_loss": 0.88053226, + "learning_rate": 0.0007154570489478081, + "loss": 0.8913213, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.28540039, + "step": 1962, + "time_per_iteration": 3.1691505908966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079242, + "balance_loss_mlp": 1.05001187, + "epoch": 0.377645248172374, + "flos": 787717315584.0, + "grad_norm": 0.05213464978332433, + "language_loss": 0.86570239, + "learning_rate": 0.0007151758735572514, + "loss": 0.87649477, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.29174805, + "step": 1963, + "time_per_iteration": 2.9893381595611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080371, + "balance_loss_mlp": 1.05190408, + "epoch": 0.3778376298576376, + "flos": 586417522176.0, + "grad_norm": 0.06256473208381459, + "language_loss": 0.80730724, + "learning_rate": 0.0007148946146280119, + "loss": 0.81811094, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.28442383, + "step": 1964, + "time_per_iteration": 2.8270015716552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015118, + "balance_loss_mlp": 1.00214851, + "epoch": 0.3780300115429011, + "flos": 1396014759936.0, + "grad_norm": 0.01808471901321765, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73207271, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.12988281, + "step": 1965, + "time_per_iteration": 4.895836353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018206, + "balance_loss_mlp": 1.00561714, + "epoch": 0.3782223932281647, + "flos": 1356935348736.0, + "grad_norm": 0.021930840707602553, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76360154, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.12597656, + "step": 1966, + "time_per_iteration": 5.0023956298828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091314, + "balance_loss_mlp": 1.06358576, + "epoch": 0.37841477491342823, + "flos": 703811344896.0, + "grad_norm": 0.04479252262380658, + "language_loss": 0.83477217, + "learning_rate": 0.0007140503377003022, + "loss": 0.84568524, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.27734375, + "step": 1967, + "time_per_iteration": 3.0142691135406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097939, + "balance_loss_mlp": 1.07011509, + "epoch": 0.3786071565986918, + "flos": 528856985088.0, + "grad_norm": 0.049620821678558774, + "language_loss": 0.8500334, + "learning_rate": 0.000713768745708599, + "loss": 0.86101276, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.27856445, + "step": 1968, + "time_per_iteration": 2.6556408405303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109518, + "balance_loss_mlp": 1.06807137, + "epoch": 0.37879953828395535, + "flos": 992872802304.0, + "grad_norm": 0.05249502952466034, + "language_loss": 0.7739228, + "learning_rate": 0.0007134870707245085, + "loss": 0.78487462, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.27148438, + "step": 1969, + "time_per_iteration": 3.2944319248199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097317, + "balance_loss_mlp": 1.0706377, + "epoch": 0.37899191996921894, + "flos": 626358292992.0, + "grad_norm": 0.06611086672726225, + "language_loss": 0.84358507, + "learning_rate": 0.0007132053128573864, + "loss": 0.85455823, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.26733398, + "step": 1970, + "time_per_iteration": 2.745910167694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101147, + "balance_loss_mlp": 1.07422984, + "epoch": 0.37918430165448247, + "flos": 686005314048.0, + "grad_norm": 0.07389156257299019, + "language_loss": 0.83986598, + "learning_rate": 0.0007129234722166211, + "loss": 0.8508774, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.26977539, + "step": 1971, + "time_per_iteration": 2.8552701473236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095612, + "balance_loss_mlp": 1.06881404, + "epoch": 0.37937668333974606, + "flos": 475374721536.0, + "grad_norm": 0.0464186232668544, + "language_loss": 0.90731955, + "learning_rate": 0.0007126415489116328, + "loss": 0.91827571, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.26818848, + "step": 1972, + "time_per_iteration": 2.6738507747650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089531, + "balance_loss_mlp": 1.06185079, + "epoch": 0.37956906502500964, + "flos": 707271340032.0, + "grad_norm": 0.05397666452651625, + "language_loss": 0.81034803, + "learning_rate": 0.0007123595430518736, + "loss": 0.82124341, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.27685547, + "step": 1973, + "time_per_iteration": 2.8551318645477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089653, + "balance_loss_mlp": 1.06225908, + "epoch": 0.3797614467102732, + "flos": 426421836288.0, + "grad_norm": 0.07183677804285386, + "language_loss": 0.86159599, + "learning_rate": 0.0007120774547468282, + "loss": 0.87249249, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.27416992, + "step": 1974, + "time_per_iteration": 2.5466248989105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091836, + "balance_loss_mlp": 1.06477594, + "epoch": 0.37995382839553676, + "flos": 481588941312.0, + "grad_norm": 0.057862181788604236, + "language_loss": 0.81643212, + "learning_rate": 0.0007117952841060128, + "loss": 0.82735044, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.27099609, + "step": 1975, + "time_per_iteration": 2.6863863468170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010857, + "balance_loss_mlp": 1.05813885, + "epoch": 0.3801462100808003, + "flos": 560286056448.0, + "grad_norm": 0.06251241790432795, + "language_loss": 0.83861643, + "learning_rate": 0.0007115130312389756, + "loss": 0.84947342, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.27587891, + "step": 1976, + "time_per_iteration": 2.6821115016937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088536, + "balance_loss_mlp": 1.0602119, + "epoch": 0.3803385917660639, + "flos": 464699524608.0, + "grad_norm": 0.063889045898505, + "language_loss": 0.79037011, + "learning_rate": 0.0007112306962552973, + "loss": 0.80125546, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.28320312, + "step": 1977, + "time_per_iteration": 2.5958874225616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086598, + "balance_loss_mlp": 1.05877423, + "epoch": 0.3805309734513274, + "flos": 521614080000.0, + "grad_norm": 0.055122671956433805, + "language_loss": 0.85178941, + "learning_rate": 0.0007109482792645896, + "loss": 0.8626554, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.27832031, + "step": 1978, + "time_per_iteration": 2.706073760986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081892, + "balance_loss_mlp": 1.05363917, + "epoch": 0.380723355136591, + "flos": 591128782848.0, + "grad_norm": 0.06407360303991923, + "language_loss": 0.83617824, + "learning_rate": 0.0007106657803764969, + "loss": 0.84699714, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.2824707, + "step": 1979, + "time_per_iteration": 2.7429239749908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078619, + "balance_loss_mlp": 1.05022287, + "epoch": 0.38091573682185453, + "flos": 622394910720.0, + "grad_norm": 0.07177583644367627, + "language_loss": 0.8165133, + "learning_rate": 0.0007103831997006948, + "loss": 0.82729954, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.28393555, + "step": 1980, + "time_per_iteration": 2.7360527515411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072489, + "balance_loss_mlp": 1.04361689, + "epoch": 0.3811081185071181, + "flos": 568716208128.0, + "grad_norm": 0.06360208542685557, + "language_loss": 0.85186386, + "learning_rate": 0.0007101005373468908, + "loss": 0.86258882, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.28833008, + "step": 1981, + "time_per_iteration": 2.925529718399048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066146, + "balance_loss_mlp": 1.03775024, + "epoch": 0.3813005001923817, + "flos": 584550798336.0, + "grad_norm": 0.051682910059599525, + "language_loss": 0.86574209, + "learning_rate": 0.0007098177934248242, + "loss": 0.87640351, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.28369141, + "step": 1982, + "time_per_iteration": 2.7813186645507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066502, + "balance_loss_mlp": 1.03770101, + "epoch": 0.38149288187764524, + "flos": 621287649792.0, + "grad_norm": 0.06153978169673806, + "language_loss": 0.85434651, + "learning_rate": 0.0007095349680442661, + "loss": 0.86501151, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.2878418, + "step": 1983, + "time_per_iteration": 2.878678321838379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069354, + "balance_loss_mlp": 1.04062414, + "epoch": 0.3816852635629088, + "flos": 570414196224.0, + "grad_norm": 0.05550499316869274, + "language_loss": 0.78828371, + "learning_rate": 0.0007092520613150188, + "loss": 0.79897726, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.28710938, + "step": 1984, + "time_per_iteration": 2.667602300643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069998, + "balance_loss_mlp": 1.04057729, + "epoch": 0.38187764524817236, + "flos": 565313029632.0, + "grad_norm": 0.04940974411679134, + "language_loss": 0.81105816, + "learning_rate": 0.0007089690733469165, + "loss": 0.82175809, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.29394531, + "step": 1985, + "time_per_iteration": 2.7445921897888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077693, + "balance_loss_mlp": 1.04924965, + "epoch": 0.38207002693343595, + "flos": 630932751360.0, + "grad_norm": 0.0710841944315155, + "language_loss": 0.82154202, + "learning_rate": 0.000708686004249825, + "loss": 0.8323189, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.28442383, + "step": 1986, + "time_per_iteration": 2.803262948989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075438, + "balance_loss_mlp": 1.0459218, + "epoch": 0.3822624086186995, + "flos": 548507980800.0, + "grad_norm": 0.053095768122865476, + "language_loss": 0.91283715, + "learning_rate": 0.0007084028541336413, + "loss": 0.92359161, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.29467773, + "step": 1987, + "time_per_iteration": 2.693894147872925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075901, + "balance_loss_mlp": 1.04807711, + "epoch": 0.38245479030396307, + "flos": 613571880960.0, + "grad_norm": 0.04978295407195845, + "language_loss": 0.86100876, + "learning_rate": 0.0007081196231082942, + "loss": 0.87176782, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.27807617, + "step": 1988, + "time_per_iteration": 2.8127198219299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079083, + "balance_loss_mlp": 1.05097318, + "epoch": 0.38264717198922665, + "flos": 667787466240.0, + "grad_norm": 0.05417702481979702, + "language_loss": 0.80060172, + "learning_rate": 0.0007078363112837436, + "loss": 0.81139255, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.28125, + "step": 1989, + "time_per_iteration": 2.8839027881622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077016, + "balance_loss_mlp": 1.04866838, + "epoch": 0.3828395536744902, + "flos": 454521922560.0, + "grad_norm": 0.05590772319077314, + "language_loss": 0.84895635, + "learning_rate": 0.000707552918769981, + "loss": 0.85972643, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.28344727, + "step": 1990, + "time_per_iteration": 2.4921815395355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075886, + "balance_loss_mlp": 1.04837239, + "epoch": 0.3830319353597538, + "flos": 499191330816.0, + "grad_norm": 0.05219115858491499, + "language_loss": 0.8389315, + "learning_rate": 0.000707269445677029, + "loss": 0.84969032, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.27563477, + "step": 1991, + "time_per_iteration": 2.716742753982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080405, + "balance_loss_mlp": 1.05205727, + "epoch": 0.3832243170450173, + "flos": 743787021312.0, + "grad_norm": 0.061454112768806295, + "language_loss": 0.85369635, + "learning_rate": 0.0007069858921149416, + "loss": 0.8645004, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.28344727, + "step": 1992, + "time_per_iteration": 2.953749418258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077015, + "balance_loss_mlp": 1.04919195, + "epoch": 0.3834166987302809, + "flos": 577937908224.0, + "grad_norm": 0.04324001999537677, + "language_loss": 0.86024761, + "learning_rate": 0.0007067022581938043, + "loss": 0.87101781, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.27880859, + "step": 1993, + "time_per_iteration": 2.818094491958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072064, + "balance_loss_mlp": 1.04502726, + "epoch": 0.3836090804155444, + "flos": 536194432512.0, + "grad_norm": 0.06003802076808944, + "language_loss": 0.83055973, + "learning_rate": 0.0007064185440237334, + "loss": 0.84128034, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.27075195, + "step": 1994, + "time_per_iteration": 2.7304775714874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078237, + "balance_loss_mlp": 1.05043745, + "epoch": 0.383801462100808, + "flos": 601587191808.0, + "grad_norm": 0.054248337050939024, + "language_loss": 0.84367561, + "learning_rate": 0.0007061347497148764, + "loss": 0.85445797, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.27807617, + "step": 1995, + "time_per_iteration": 2.747483015060425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074409, + "balance_loss_mlp": 1.04706264, + "epoch": 0.38399384378607154, + "flos": 572427896832.0, + "grad_norm": 0.06054830939074019, + "language_loss": 0.86660719, + "learning_rate": 0.0007058508753774122, + "loss": 0.87735128, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.27392578, + "step": 1996, + "time_per_iteration": 2.6960108280181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078362, + "balance_loss_mlp": 1.05165958, + "epoch": 0.38418622547133513, + "flos": 536513117184.0, + "grad_norm": 0.05196412840141252, + "language_loss": 0.86974967, + "learning_rate": 0.0007055669211215505, + "loss": 0.88053334, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.26733398, + "step": 1997, + "time_per_iteration": 2.6327381134033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076337, + "balance_loss_mlp": 1.04775071, + "epoch": 0.3843786071565987, + "flos": 572673798144.0, + "grad_norm": 0.06669720231739994, + "language_loss": 0.77213579, + "learning_rate": 0.0007052828870575322, + "loss": 0.78289914, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.28588867, + "step": 1998, + "time_per_iteration": 2.6813313961029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085238, + "balance_loss_mlp": 1.05808222, + "epoch": 0.38457098884186225, + "flos": 728361275904.0, + "grad_norm": 0.053007093293579055, + "language_loss": 0.8636111, + "learning_rate": 0.0007049987732956291, + "loss": 0.87446344, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.27197266, + "step": 1999, + "time_per_iteration": 2.9743165969848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071346, + "balance_loss_mlp": 1.04323626, + "epoch": 0.38476337052712584, + "flos": 583123442688.0, + "grad_norm": 0.046114011394728885, + "language_loss": 0.82846403, + "learning_rate": 0.0007047145799461439, + "loss": 0.83917749, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.28149414, + "step": 2000, + "time_per_iteration": 2.85295033454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077125, + "balance_loss_mlp": 1.0488013, + "epoch": 0.38495575221238937, + "flos": 552787075584.0, + "grad_norm": 0.06118237782788499, + "language_loss": 0.8185212, + "learning_rate": 0.00070443030711941, + "loss": 0.82929248, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.28295898, + "step": 2001, + "time_per_iteration": 2.7602195739746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078319, + "balance_loss_mlp": 1.04918385, + "epoch": 0.38514813389765296, + "flos": 654173190144.0, + "grad_norm": 0.06801983854699947, + "language_loss": 0.82348108, + "learning_rate": 0.0007041459549257924, + "loss": 0.83426422, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.29101562, + "step": 2002, + "time_per_iteration": 2.8562166690826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074316, + "balance_loss_mlp": 1.04565787, + "epoch": 0.3853405155829165, + "flos": 867715158528.0, + "grad_norm": 0.07124544558687326, + "language_loss": 0.7826004, + "learning_rate": 0.0007038615234756859, + "loss": 0.79334354, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.28662109, + "step": 2003, + "time_per_iteration": 3.1888484954833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071637, + "balance_loss_mlp": 1.0429796, + "epoch": 0.3855328972681801, + "flos": 546164011008.0, + "grad_norm": 0.060193135665447615, + "language_loss": 0.83578098, + "learning_rate": 0.000703577012879517, + "loss": 0.8464973, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.28662109, + "step": 2004, + "time_per_iteration": 2.6438684463500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069967, + "balance_loss_mlp": 1.04185688, + "epoch": 0.3857252789534436, + "flos": 533819939328.0, + "grad_norm": 0.05830751128665357, + "language_loss": 0.8852784, + "learning_rate": 0.0007032924232477423, + "loss": 0.89597809, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.28149414, + "step": 2005, + "time_per_iteration": 2.6632285118103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071337, + "balance_loss_mlp": 1.04253602, + "epoch": 0.3859176606387072, + "flos": 491514849792.0, + "grad_norm": 0.05522600702951118, + "language_loss": 0.8025552, + "learning_rate": 0.0007030077546908493, + "loss": 0.81326854, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.28808594, + "step": 2006, + "time_per_iteration": 2.6748647689819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107831, + "balance_loss_mlp": 1.06600749, + "epoch": 0.3861100423239708, + "flos": 1486278955008.0, + "grad_norm": 0.04192005891791234, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84142971, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.12255859, + "step": 2007, + "time_per_iteration": 4.758062124252319 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084632, + "balance_loss_mlp": 1.05614078, + "epoch": 0.3863024240092343, + "flos": 473493441024.0, + "grad_norm": 0.06495221526254255, + "language_loss": 0.79320729, + "learning_rate": 0.0007024381812438117, + "loss": 0.80405354, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.28515625, + "step": 2008, + "time_per_iteration": 2.557239532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095356, + "balance_loss_mlp": 1.06607771, + "epoch": 0.3864948056944979, + "flos": 716258723328.0, + "grad_norm": 0.09570560546772983, + "language_loss": 0.83017313, + "learning_rate": 0.0007021532765747951, + "loss": 0.84112668, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.29248047, + "step": 2009, + "time_per_iteration": 2.984100580215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088616, + "balance_loss_mlp": 1.06031561, + "epoch": 0.38668718737976143, + "flos": 727302067200.0, + "grad_norm": 0.05400711762269546, + "language_loss": 0.78963518, + "learning_rate": 0.0007018682934229162, + "loss": 0.80052131, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.28295898, + "step": 2010, + "time_per_iteration": 2.9302892684936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080883, + "balance_loss_mlp": 1.05220175, + "epoch": 0.386879569065025, + "flos": 525218079744.0, + "grad_norm": 0.05212566321061033, + "language_loss": 0.82523775, + "learning_rate": 0.0007015832318988152, + "loss": 0.83604658, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.28662109, + "step": 2011, + "time_per_iteration": 2.65934157371521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102736, + "balance_loss_mlp": 1.0158205, + "epoch": 0.38707195075028855, + "flos": 1527036005376.0, + "grad_norm": 0.016832038405886617, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74917436, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.11523438, + "step": 2012, + "time_per_iteration": 4.964378595352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076687, + "balance_loss_mlp": 1.04776716, + "epoch": 0.38726433243555214, + "flos": 557045821440.0, + "grad_norm": 0.05730560331399072, + "language_loss": 0.83868068, + "learning_rate": 0.0007010128741766604, + "loss": 0.84944755, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.28857422, + "step": 2013, + "time_per_iteration": 2.7196977138519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069593, + "balance_loss_mlp": 1.04005277, + "epoch": 0.38745671412081567, + "flos": 553431647232.0, + "grad_norm": 0.0608937159393576, + "language_loss": 0.843593, + "learning_rate": 0.0007007275782000391, + "loss": 0.85428894, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.29492188, + "step": 2014, + "time_per_iteration": 2.635704517364502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072342, + "balance_loss_mlp": 1.04351759, + "epoch": 0.38764909580607926, + "flos": 458175384576.0, + "grad_norm": 0.061731808628827385, + "language_loss": 0.84906852, + "learning_rate": 0.0007004422042940605, + "loss": 0.85979199, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.2878418, + "step": 2015, + "time_per_iteration": 2.500502824783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072405, + "balance_loss_mlp": 1.04246008, + "epoch": 0.38784147749134285, + "flos": 521973462528.0, + "grad_norm": 0.06410146749924231, + "language_loss": 0.89413089, + "learning_rate": 0.0007001567525695169, + "loss": 0.90485489, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.29931641, + "step": 2016, + "time_per_iteration": 2.6305129528045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072622, + "balance_loss_mlp": 1.04410672, + "epoch": 0.3880338591766064, + "flos": 665696600064.0, + "grad_norm": 0.057933083917186774, + "language_loss": 0.83612067, + "learning_rate": 0.0006998712231372303, + "loss": 0.84684694, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.28491211, + "step": 2017, + "time_per_iteration": 3.0175724029541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070577, + "balance_loss_mlp": 1.04141831, + "epoch": 0.38822624086186996, + "flos": 593660427264.0, + "grad_norm": 0.04866320553491467, + "language_loss": 0.86211008, + "learning_rate": 0.0006995856161080532, + "loss": 0.87281585, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.29101562, + "step": 2018, + "time_per_iteration": 2.879014015197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071313, + "balance_loss_mlp": 1.04193974, + "epoch": 0.3884186225471335, + "flos": 612256596480.0, + "grad_norm": 0.05910223086818918, + "language_loss": 0.81994784, + "learning_rate": 0.0006992999315928679, + "loss": 0.83066106, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.29345703, + "step": 2019, + "time_per_iteration": 2.794605255126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078638, + "balance_loss_mlp": 1.04826391, + "epoch": 0.3886110042323971, + "flos": 606737820672.0, + "grad_norm": 0.0551019421553566, + "language_loss": 0.86098075, + "learning_rate": 0.0006990141697025871, + "loss": 0.8717671, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.3034668, + "step": 2020, + "time_per_iteration": 2.808492422103882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056045, + "balance_loss_mlp": 1.04388523, + "epoch": 0.3888033859176606, + "flos": 1527289108992.0, + "grad_norm": 0.03291843471702338, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77415681, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.12158203, + "step": 2021, + "time_per_iteration": 4.747381687164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070034, + "balance_loss_mlp": 1.04109025, + "epoch": 0.3889957676029242, + "flos": 692145340416.0, + "grad_norm": 0.0700535467402408, + "language_loss": 0.82436341, + "learning_rate": 0.0006984424142405392, + "loss": 0.83506376, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.28930664, + "step": 2022, + "time_per_iteration": 2.8081154823303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070367, + "balance_loss_mlp": 1.04144704, + "epoch": 0.3891881492881878, + "flos": 514937170944.0, + "grad_norm": 0.06604387927811756, + "language_loss": 0.81889653, + "learning_rate": 0.0006981564208907474, + "loss": 0.82960021, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.2890625, + "step": 2023, + "time_per_iteration": 2.615868091583252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067731, + "balance_loss_mlp": 1.03947854, + "epoch": 0.3893805309734513, + "flos": 628770663936.0, + "grad_norm": 0.05337785231387105, + "language_loss": 0.90169919, + "learning_rate": 0.0006978703506098102, + "loss": 0.91237652, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.2824707, + "step": 2024, + "time_per_iteration": 2.7487242221832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070747, + "balance_loss_mlp": 1.04292357, + "epoch": 0.3895729126587149, + "flos": 543894234624.0, + "grad_norm": 0.05102180718564601, + "language_loss": 0.87631416, + "learning_rate": 0.00069758420350879, + "loss": 0.88702166, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.27832031, + "step": 2025, + "time_per_iteration": 2.6278607845306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066065, + "balance_loss_mlp": 1.03802657, + "epoch": 0.38976529434397844, + "flos": 617987778048.0, + "grad_norm": 0.05496821729843788, + "language_loss": 0.85941356, + "learning_rate": 0.000697297979698779, + "loss": 0.87007421, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.28051758, + "step": 2026, + "time_per_iteration": 2.773711919784546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072256, + "balance_loss_mlp": 1.0449574, + "epoch": 0.38995767602924203, + "flos": 834518287872.0, + "grad_norm": 0.054849440695872026, + "language_loss": 0.83735013, + "learning_rate": 0.0006970116792908992, + "loss": 0.84807271, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.27368164, + "step": 2027, + "time_per_iteration": 3.1274263858795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071715, + "balance_loss_mlp": 1.04348612, + "epoch": 0.39015005771450556, + "flos": 541343651328.0, + "grad_norm": 0.0501662810644282, + "language_loss": 0.80959415, + "learning_rate": 0.000696725302396302, + "loss": 0.82031131, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.28222656, + "step": 2028, + "time_per_iteration": 2.653289318084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078388, + "balance_loss_mlp": 1.050946, + "epoch": 0.39034243939976915, + "flos": 1007102536704.0, + "grad_norm": 0.053195529027894116, + "language_loss": 0.85790342, + "learning_rate": 0.0006964388491261692, + "loss": 0.86868727, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.2746582, + "step": 2029, + "time_per_iteration": 3.2972047328948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082882, + "balance_loss_mlp": 1.0550828, + "epoch": 0.3905348210850327, + "flos": 678723121152.0, + "grad_norm": 0.06114884672927749, + "language_loss": 0.87352717, + "learning_rate": 0.0006961523195917114, + "loss": 0.88435602, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.27832031, + "step": 2030, + "time_per_iteration": 2.8415944576263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083514, + "balance_loss_mlp": 1.0548079, + "epoch": 0.39072720277029627, + "flos": 548606905344.0, + "grad_norm": 0.056999957489140544, + "language_loss": 0.78065526, + "learning_rate": 0.0006958657139041696, + "loss": 0.79149044, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.28686523, + "step": 2031, + "time_per_iteration": 2.750596761703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027529, + "balance_loss_mlp": 1.01660919, + "epoch": 0.39091958445555985, + "flos": 1546912401408.0, + "grad_norm": 0.015090316928766313, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77740502, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.109375, + "step": 2032, + "time_per_iteration": 4.916932106018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080774, + "balance_loss_mlp": 1.05371356, + "epoch": 0.3911119661408234, + "flos": 503741058048.0, + "grad_norm": 0.058882626995900515, + "language_loss": 0.77978921, + "learning_rate": 0.0006952922745149434, + "loss": 0.7905969, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.27099609, + "step": 2033, + "time_per_iteration": 2.6288254261016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076329, + "balance_loss_mlp": 1.04802871, + "epoch": 0.391304347826087, + "flos": 556967245824.0, + "grad_norm": 0.059683993490508125, + "language_loss": 0.8774389, + "learning_rate": 0.000695005441035888, + "loss": 0.88820225, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.28295898, + "step": 2034, + "time_per_iteration": 2.6451032161712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021075, + "balance_loss_mlp": 1.01001287, + "epoch": 0.3914967295113505, + "flos": 1499309858304.0, + "grad_norm": 0.012767183735830537, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74744511, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.11083984, + "step": 2035, + "time_per_iteration": 4.875540018081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081101, + "balance_loss_mlp": 1.05346835, + "epoch": 0.3916891111966141, + "flos": 706715518464.0, + "grad_norm": 0.05871453648610719, + "language_loss": 0.8120997, + "learning_rate": 0.0006944315470656863, + "loss": 0.82291067, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.27685547, + "step": 2036, + "time_per_iteration": 2.9991486072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079422, + "balance_loss_mlp": 1.05193281, + "epoch": 0.3918814928818776, + "flos": 556085537280.0, + "grad_norm": 0.05954449002694624, + "language_loss": 0.90806162, + "learning_rate": 0.000694144486797345, + "loss": 0.91885585, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.27539062, + "step": 2037, + "time_per_iteration": 2.652540445327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016452, + "balance_loss_mlp": 1.00543678, + "epoch": 0.3920738745671412, + "flos": 1537845032448.0, + "grad_norm": 0.010331538207496795, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80536884, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.11035156, + "step": 2038, + "time_per_iteration": 4.696615695953369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077334, + "balance_loss_mlp": 1.04920101, + "epoch": 0.39226625625240474, + "flos": 498594811392.0, + "grad_norm": 0.05886678367995608, + "language_loss": 0.89078939, + "learning_rate": 0.0006935701402514156, + "loss": 0.90156269, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.28149414, + "step": 2039, + "time_per_iteration": 2.555340051651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013561, + "balance_loss_mlp": 1.00254571, + "epoch": 0.39245863793766833, + "flos": 1346465203200.0, + "grad_norm": 0.009976601144167605, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74048454, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.11035156, + "step": 2040, + "time_per_iteration": 4.91499400138855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.04941869, + "epoch": 0.3926510196229319, + "flos": 1345614474240.0, + "grad_norm": 0.0656092448350418, + "language_loss": 0.84421289, + "learning_rate": 0.0006929954931031422, + "loss": 0.8549906, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.28344727, + "step": 2041, + "time_per_iteration": 3.729060649871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079221, + "balance_loss_mlp": 1.0521127, + "epoch": 0.39284340130819545, + "flos": 499333925376.0, + "grad_norm": 0.05672023255092622, + "language_loss": 0.88579351, + "learning_rate": 0.0006927080570819805, + "loss": 0.8965857, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.27148438, + "step": 2042, + "time_per_iteration": 2.5964105129241943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082321, + "balance_loss_mlp": 1.05557048, + "epoch": 0.39303578299345904, + "flos": 520077625344.0, + "grad_norm": 0.07129276434353096, + "language_loss": 0.81115568, + "learning_rate": 0.0006924205462449161, + "loss": 0.82197881, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.26806641, + "step": 2043, + "time_per_iteration": 2.585873603820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080679, + "balance_loss_mlp": 1.0537734, + "epoch": 0.39322816467872257, + "flos": 907529301504.0, + "grad_norm": 0.07610386660927036, + "language_loss": 0.8177464, + "learning_rate": 0.0006921329607035702, + "loss": 0.8285532, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.26940918, + "step": 2044, + "time_per_iteration": 3.238981246948242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087504, + "balance_loss_mlp": 1.0611347, + "epoch": 0.39342054636398616, + "flos": 517330603008.0, + "grad_norm": 0.0570655681013956, + "language_loss": 0.87757248, + "learning_rate": 0.0006918453005695938, + "loss": 0.88844752, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.26416016, + "step": 2045, + "time_per_iteration": 2.6602108478546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091027, + "balance_loss_mlp": 1.06491971, + "epoch": 0.3936129280492497, + "flos": 547646621184.0, + "grad_norm": 0.055879562404771856, + "language_loss": 0.84307766, + "learning_rate": 0.0006915575659546662, + "loss": 0.85398793, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.26147461, + "step": 2046, + "time_per_iteration": 2.6592600345611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091389, + "balance_loss_mlp": 1.06476951, + "epoch": 0.3938053097345133, + "flos": 525858269184.0, + "grad_norm": 0.06494345942268129, + "language_loss": 0.80426449, + "learning_rate": 0.0006912697569704959, + "loss": 0.81517833, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.26623535, + "step": 2047, + "time_per_iteration": 2.613070011138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080678, + "balance_loss_mlp": 1.0539515, + "epoch": 0.39399769141977686, + "flos": 471390990336.0, + "grad_norm": 0.06871552578761372, + "language_loss": 0.86815077, + "learning_rate": 0.0006909818737288205, + "loss": 0.87895757, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.26745605, + "step": 2048, + "time_per_iteration": 2.5862643718719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086213, + "balance_loss_mlp": 1.05919969, + "epoch": 0.3941900731050404, + "flos": 501490220544.0, + "grad_norm": 0.055462609864315775, + "language_loss": 0.80754077, + "learning_rate": 0.000690693916341406, + "loss": 0.81840289, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.27075195, + "step": 2049, + "time_per_iteration": 2.668114185333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010802, + "balance_loss_mlp": 1.0532347, + "epoch": 0.394382454790304, + "flos": 580577241600.0, + "grad_norm": 0.05123788091691057, + "language_loss": 0.8241666, + "learning_rate": 0.0006904058849200475, + "loss": 0.83496863, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.27001953, + "step": 2050, + "time_per_iteration": 2.7161009311676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084281, + "balance_loss_mlp": 1.05679107, + "epoch": 0.3945748364755675, + "flos": 513563659776.0, + "grad_norm": 0.06391064418382593, + "language_loss": 0.84741384, + "learning_rate": 0.0006901177795765683, + "loss": 0.8582567, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.27514648, + "step": 2051, + "time_per_iteration": 2.6012356281280518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082278, + "balance_loss_mlp": 1.05540872, + "epoch": 0.3947672181608311, + "flos": 593683748352.0, + "grad_norm": 0.059538956745971455, + "language_loss": 0.8114661, + "learning_rate": 0.0006898296004228213, + "loss": 0.82228893, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.26879883, + "step": 2052, + "time_per_iteration": 2.739016056060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091682, + "balance_loss_mlp": 1.07909358, + "epoch": 0.39495959984609463, + "flos": 1546829443584.0, + "grad_norm": 0.0435951911950544, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79218423, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.12597656, + "step": 2053, + "time_per_iteration": 4.853093385696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107698, + "balance_loss_mlp": 1.0498004, + "epoch": 0.3951519815313582, + "flos": 496271190528.0, + "grad_norm": 0.061585922129253, + "language_loss": 0.79790258, + "learning_rate": 0.0006892530211320763, + "loss": 0.80867237, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.2722168, + "step": 2054, + "time_per_iteration": 2.695810317993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077935, + "balance_loss_mlp": 1.05135143, + "epoch": 0.39534436321662175, + "flos": 530934704640.0, + "grad_norm": 0.06739666157176663, + "language_loss": 0.83483803, + "learning_rate": 0.000688964621218926, + "loss": 0.84561741, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.26611328, + "step": 2055, + "time_per_iteration": 2.5957767963409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070974, + "balance_loss_mlp": 1.04496288, + "epoch": 0.39553674490188534, + "flos": 702224017920.0, + "grad_norm": 0.05900978816729325, + "language_loss": 0.79760778, + "learning_rate": 0.0006886761479432037, + "loss": 0.80831754, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.26037598, + "step": 2056, + "time_per_iteration": 2.823195457458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075075, + "balance_loss_mlp": 1.0479672, + "epoch": 0.3957291265871489, + "flos": 409552768512.0, + "grad_norm": 0.06325658180551426, + "language_loss": 0.84495139, + "learning_rate": 0.0006883876014169045, + "loss": 0.85570216, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.27148438, + "step": 2057, + "time_per_iteration": 2.504899263381958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_mlp": 1.05080771, + "epoch": 0.39592150827241246, + "flos": 618204566016.0, + "grad_norm": 0.05952155235087993, + "language_loss": 0.90666497, + "learning_rate": 0.000688098981752052, + "loss": 0.91744673, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.27441406, + "step": 2058, + "time_per_iteration": 2.705845832824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079753, + "balance_loss_mlp": 1.05207229, + "epoch": 0.39611388995767605, + "flos": 820986969600.0, + "grad_norm": 0.057037005783434964, + "language_loss": 0.80068249, + "learning_rate": 0.0006878102890606982, + "loss": 0.81147999, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.27709961, + "step": 2059, + "time_per_iteration": 3.086745500564575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108134, + "balance_loss_mlp": 1.0542556, + "epoch": 0.3963062716429396, + "flos": 491977539072.0, + "grad_norm": 0.07822530462482143, + "language_loss": 0.80866635, + "learning_rate": 0.0006875215234549239, + "loss": 0.8194797, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.27124023, + "step": 2060, + "time_per_iteration": 2.5814599990844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080083, + "balance_loss_mlp": 1.05221188, + "epoch": 0.39649865332820317, + "flos": 584466430464.0, + "grad_norm": 0.06673254145899743, + "language_loss": 0.85142004, + "learning_rate": 0.0006872326850468376, + "loss": 0.86222088, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.27880859, + "step": 2061, + "time_per_iteration": 2.6693742275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081472, + "balance_loss_mlp": 1.05343366, + "epoch": 0.3966910350134667, + "flos": 458328153600.0, + "grad_norm": 0.06184749895138045, + "language_loss": 0.78875667, + "learning_rate": 0.0006869437739485762, + "loss": 0.79957139, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.28051758, + "step": 2062, + "time_per_iteration": 2.612020969390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108316, + "balance_loss_mlp": 1.05493176, + "epoch": 0.3968834166987303, + "flos": 508388299776.0, + "grad_norm": 0.07174128592683177, + "language_loss": 0.92295337, + "learning_rate": 0.0006866547902723053, + "loss": 0.93378496, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.2824707, + "step": 2063, + "time_per_iteration": 2.676013469696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108135, + "balance_loss_mlp": 1.05300224, + "epoch": 0.3970757983839938, + "flos": 572349321216.0, + "grad_norm": 0.05898261192449876, + "language_loss": 0.80094039, + "learning_rate": 0.000686365734130218, + "loss": 0.81175387, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.28369141, + "step": 2064, + "time_per_iteration": 2.7021024227142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071448, + "balance_loss_mlp": 1.0426228, + "epoch": 0.3972681800692574, + "flos": 481391092224.0, + "grad_norm": 0.09101918864834832, + "language_loss": 0.83948302, + "learning_rate": 0.000686076605634536, + "loss": 0.85019755, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.28808594, + "step": 2065, + "time_per_iteration": 2.6558356285095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068247, + "balance_loss_mlp": 1.03963661, + "epoch": 0.397460561754521, + "flos": 487683887616.0, + "grad_norm": 0.05840936356543045, + "language_loss": 0.83999312, + "learning_rate": 0.0006857874048975088, + "loss": 0.85067558, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.28613281, + "step": 2066, + "time_per_iteration": 2.556900978088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068316, + "balance_loss_mlp": 1.04027796, + "epoch": 0.3976529434397845, + "flos": 421768802304.0, + "grad_norm": 0.07585091480167282, + "language_loss": 0.87176585, + "learning_rate": 0.0006854981320314142, + "loss": 0.88244903, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.28027344, + "step": 2067, + "time_per_iteration": 2.445798635482788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107237, + "balance_loss_mlp": 1.04426003, + "epoch": 0.3978453251250481, + "flos": 545331764736.0, + "grad_norm": 0.08763476788371415, + "language_loss": 0.86982906, + "learning_rate": 0.0006852087871485579, + "loss": 0.88055265, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.28125, + "step": 2068, + "time_per_iteration": 2.6390161514282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076434, + "balance_loss_mlp": 1.04861069, + "epoch": 0.39803770681031164, + "flos": 650548841472.0, + "grad_norm": 0.065510260101048, + "language_loss": 0.82088625, + "learning_rate": 0.0006849193703612735, + "loss": 0.83165061, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.27856445, + "step": 2069, + "time_per_iteration": 2.763023614883423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071, + "balance_loss_mlp": 1.04346275, + "epoch": 0.39823008849557523, + "flos": 739734888960.0, + "grad_norm": 0.058439166966186944, + "language_loss": 0.77565378, + "learning_rate": 0.0006846298817819225, + "loss": 0.78636372, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.27563477, + "step": 2070, + "time_per_iteration": 2.948054790496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070331, + "balance_loss_mlp": 1.04296088, + "epoch": 0.39842247018083876, + "flos": 384825337344.0, + "grad_norm": 0.06370866866163034, + "language_loss": 0.80921137, + "learning_rate": 0.0006843403215228945, + "loss": 0.8199147, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.27392578, + "step": 2071, + "time_per_iteration": 2.440274953842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075017, + "balance_loss_mlp": 1.04771829, + "epoch": 0.39861485186610235, + "flos": 533431443456.0, + "grad_norm": 0.05754797735781241, + "language_loss": 0.80491692, + "learning_rate": 0.0006840506896966065, + "loss": 0.81566709, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.2734375, + "step": 2072, + "time_per_iteration": 2.7141849994659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076402, + "balance_loss_mlp": 1.04874492, + "epoch": 0.39880723355136594, + "flos": 642834482688.0, + "grad_norm": 0.06436648215160112, + "language_loss": 0.82351565, + "learning_rate": 0.0006837609864155038, + "loss": 0.83427966, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.27685547, + "step": 2073, + "time_per_iteration": 2.8728160858154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107952, + "balance_loss_mlp": 1.05267441, + "epoch": 0.39899961523662947, + "flos": 515587534848.0, + "grad_norm": 0.06075069456973031, + "language_loss": 0.83255166, + "learning_rate": 0.0006834712117920592, + "loss": 0.84334683, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.26855469, + "step": 2074, + "time_per_iteration": 2.6078460216522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081959, + "balance_loss_mlp": 1.05458879, + "epoch": 0.39919199692189306, + "flos": 464148085248.0, + "grad_norm": 0.08105254072349301, + "language_loss": 0.85028476, + "learning_rate": 0.0006831813659387729, + "loss": 0.86110437, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.27416992, + "step": 2075, + "time_per_iteration": 2.5435502529144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080066, + "balance_loss_mlp": 1.05236197, + "epoch": 0.3993843786071566, + "flos": 531382837248.0, + "grad_norm": 0.05543733258884828, + "language_loss": 0.84105802, + "learning_rate": 0.0006828914489681733, + "loss": 0.85185862, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.27758789, + "step": 2076, + "time_per_iteration": 2.716728687286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079901, + "balance_loss_mlp": 1.05186319, + "epoch": 0.3995767602924202, + "flos": 503701770240.0, + "grad_norm": 0.05894989539880716, + "language_loss": 0.8515023, + "learning_rate": 0.0006826014609928162, + "loss": 0.86230129, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.28027344, + "step": 2077, + "time_per_iteration": 2.740797996520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036252, + "balance_loss_mlp": 1.02490366, + "epoch": 0.3997691419776837, + "flos": 1453780500480.0, + "grad_norm": 0.025465037646940157, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84235638, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.11328125, + "step": 2078, + "time_per_iteration": 4.832703590393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080719, + "balance_loss_mlp": 1.05287147, + "epoch": 0.3999615236629473, + "flos": 530418170880.0, + "grad_norm": 0.11662193334808049, + "language_loss": 0.8017869, + "learning_rate": 0.0006820212724781896, + "loss": 0.81259406, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.27880859, + "step": 2079, + "time_per_iteration": 2.6742663383483887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076717, + "balance_loss_mlp": 1.0488224, + "epoch": 0.4001539053482108, + "flos": 694823961600.0, + "grad_norm": 0.08177152300224107, + "language_loss": 0.83806193, + "learning_rate": 0.0006817310721641694, + "loss": 0.84882903, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.27905273, + "step": 2080, + "time_per_iteration": 2.8349008560180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076923, + "balance_loss_mlp": 1.04929078, + "epoch": 0.4003462870334744, + "flos": 520102356480.0, + "grad_norm": 0.06565277329590896, + "language_loss": 0.84214735, + "learning_rate": 0.00068144080129589, + "loss": 0.8529166, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.27685547, + "step": 2081, + "time_per_iteration": 2.6278159618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084354, + "balance_loss_mlp": 1.05710232, + "epoch": 0.400538668718738, + "flos": 492272902656.0, + "grad_norm": 0.05776018351639151, + "language_loss": 0.82856774, + "learning_rate": 0.0006811504599860441, + "loss": 0.83941126, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.27294922, + "step": 2082, + "time_per_iteration": 2.569265365600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088899, + "balance_loss_mlp": 1.06140924, + "epoch": 0.40073105040400153, + "flos": 490083111936.0, + "grad_norm": 0.07401045054208001, + "language_loss": 0.85797036, + "learning_rate": 0.0006808600483473526, + "loss": 0.86885935, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.27490234, + "step": 2083, + "time_per_iteration": 2.8923354148864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079743, + "balance_loss_mlp": 1.05170512, + "epoch": 0.4009234320892651, + "flos": 562088761344.0, + "grad_norm": 0.06499053200862517, + "language_loss": 0.86023808, + "learning_rate": 0.0006805695664925629, + "loss": 0.87103558, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.28027344, + "step": 2084, + "time_per_iteration": 2.8025314807891846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082967, + "balance_loss_mlp": 1.05461943, + "epoch": 0.40111581377452865, + "flos": 425786029056.0, + "grad_norm": 0.06817943175075042, + "language_loss": 0.8386181, + "learning_rate": 0.0006802790145344506, + "loss": 0.84944773, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.28344727, + "step": 2085, + "time_per_iteration": 2.5035839080810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075393, + "balance_loss_mlp": 1.04725957, + "epoch": 0.40130819545979224, + "flos": 612148907520.0, + "grad_norm": 0.06401081868364573, + "language_loss": 0.87169802, + "learning_rate": 0.0006799883925858176, + "loss": 0.88245201, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.28125, + "step": 2086, + "time_per_iteration": 2.8827152252197266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088527, + "balance_loss_mlp": 1.05989313, + "epoch": 0.40150057714505577, + "flos": 523179648000.0, + "grad_norm": 0.06559731004413262, + "language_loss": 0.85316324, + "learning_rate": 0.0006796977007594933, + "loss": 0.86404848, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.28637695, + "step": 2087, + "time_per_iteration": 2.5959601402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094266, + "balance_loss_mlp": 1.06553721, + "epoch": 0.40169295883031936, + "flos": 561143033856.0, + "grad_norm": 0.12268552055269868, + "language_loss": 0.86342102, + "learning_rate": 0.0006794069391683345, + "loss": 0.87436372, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.28710938, + "step": 2088, + "time_per_iteration": 2.7393155097961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089464, + "balance_loss_mlp": 1.06087732, + "epoch": 0.4018853405155829, + "flos": 518743401984.0, + "grad_norm": 0.0717880154934153, + "language_loss": 0.80560589, + "learning_rate": 0.0006791161079252248, + "loss": 0.81650054, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.28588867, + "step": 2089, + "time_per_iteration": 2.608919858932495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098549, + "balance_loss_mlp": 1.06879497, + "epoch": 0.4020777222008465, + "flos": 525957193728.0, + "grad_norm": 0.06954460778471602, + "language_loss": 0.8248291, + "learning_rate": 0.0006788252071430747, + "loss": 0.83581454, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.29711914, + "step": 2090, + "time_per_iteration": 2.682352304458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103196, + "balance_loss_mlp": 1.07429934, + "epoch": 0.40227010388611006, + "flos": 525494504448.0, + "grad_norm": 0.07587120880411238, + "language_loss": 0.8680824, + "learning_rate": 0.0006785342369348222, + "loss": 0.87911433, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.28857422, + "step": 2091, + "time_per_iteration": 2.7333736419677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104599, + "balance_loss_mlp": 1.07579792, + "epoch": 0.4024624855713736, + "flos": 432074442240.0, + "grad_norm": 0.07069251800195664, + "language_loss": 0.7977879, + "learning_rate": 0.0006782431974134316, + "loss": 0.8088339, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.2878418, + "step": 2092, + "time_per_iteration": 2.541607141494751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105121, + "balance_loss_mlp": 1.0768441, + "epoch": 0.4026548672566372, + "flos": 766304312832.0, + "grad_norm": 0.05426777537327344, + "language_loss": 0.89421535, + "learning_rate": 0.0006779520886918949, + "loss": 0.90526658, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.2824707, + "step": 2093, + "time_per_iteration": 3.035090684890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102418, + "balance_loss_mlp": 1.07378376, + "epoch": 0.4028472489419007, + "flos": 642636633600.0, + "grad_norm": 0.07593649947233896, + "language_loss": 0.81461406, + "learning_rate": 0.0006776609108832301, + "loss": 0.82563823, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.28637695, + "step": 2094, + "time_per_iteration": 2.8035519123077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102, + "balance_loss_mlp": 1.07398582, + "epoch": 0.4030396306271643, + "flos": 491593425408.0, + "grad_norm": 0.07164022458424311, + "language_loss": 0.85034972, + "learning_rate": 0.0006773696641004828, + "loss": 0.86136973, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.28027344, + "step": 2095, + "time_per_iteration": 2.6036603450775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100016, + "balance_loss_mlp": 1.07147717, + "epoch": 0.40323201231242783, + "flos": 901363133952.0, + "grad_norm": 0.07309254376996902, + "language_loss": 0.77576917, + "learning_rate": 0.0006770783484567247, + "loss": 0.78676933, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.28515625, + "step": 2096, + "time_per_iteration": 3.1005897521972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093493, + "balance_loss_mlp": 1.06557441, + "epoch": 0.4034243939976914, + "flos": 570267219456.0, + "grad_norm": 0.04872529153034484, + "language_loss": 0.86118937, + "learning_rate": 0.000676786964065055, + "loss": 0.87212431, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.27978516, + "step": 2097, + "time_per_iteration": 2.78965163230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093986, + "balance_loss_mlp": 1.06680584, + "epoch": 0.403616775682955, + "flos": 507206845440.0, + "grad_norm": 0.06867709967223685, + "language_loss": 0.78839391, + "learning_rate": 0.0006764955110385986, + "loss": 0.79933375, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.2722168, + "step": 2098, + "time_per_iteration": 2.7579219341278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090863, + "balance_loss_mlp": 1.06361151, + "epoch": 0.40380915736821854, + "flos": 519127515648.0, + "grad_norm": 0.0577520756279271, + "language_loss": 0.80600876, + "learning_rate": 0.0006762039894905083, + "loss": 0.81691736, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.27294922, + "step": 2099, + "time_per_iteration": 2.632434129714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083488, + "balance_loss_mlp": 1.05595064, + "epoch": 0.40400153905348213, + "flos": 441686048256.0, + "grad_norm": 0.06925599284799831, + "language_loss": 0.80233157, + "learning_rate": 0.000675912399533962, + "loss": 0.8131665, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.27563477, + "step": 2100, + "time_per_iteration": 2.521758556365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086411, + "balance_loss_mlp": 1.05947018, + "epoch": 0.40419392073874566, + "flos": 771961300992.0, + "grad_norm": 0.05734073179456058, + "language_loss": 0.84850854, + "learning_rate": 0.0006756207412821656, + "loss": 0.85937262, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.26977539, + "step": 2101, + "time_per_iteration": 3.043041944503784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079398, + "balance_loss_mlp": 1.05245721, + "epoch": 0.40438630242400925, + "flos": 766215562752.0, + "grad_norm": 0.07220576126006613, + "language_loss": 0.80240154, + "learning_rate": 0.0006753290148483505, + "loss": 0.81319559, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.27001953, + "step": 2102, + "time_per_iteration": 3.0245606899261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085045, + "balance_loss_mlp": 1.05726886, + "epoch": 0.4045786841092728, + "flos": 415013317632.0, + "grad_norm": 0.06170005058098184, + "language_loss": 0.78875476, + "learning_rate": 0.0006750372203457752, + "loss": 0.79960519, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.27832031, + "step": 2103, + "time_per_iteration": 2.484698534011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078758, + "balance_loss_mlp": 1.05131626, + "epoch": 0.40477106579453637, + "flos": 538941454848.0, + "grad_norm": 0.05090920908511917, + "language_loss": 0.86534655, + "learning_rate": 0.0006747453578877242, + "loss": 0.87613416, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.27490234, + "step": 2104, + "time_per_iteration": 2.69670033454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081019, + "balance_loss_mlp": 1.05281401, + "epoch": 0.4049634474797999, + "flos": 826358768640.0, + "grad_norm": 0.06546748387286302, + "language_loss": 0.8289392, + "learning_rate": 0.0006744534275875085, + "loss": 0.83974934, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.28222656, + "step": 2105, + "time_per_iteration": 2.9919168949127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083118, + "balance_loss_mlp": 1.05620074, + "epoch": 0.4051558291650635, + "flos": 572417722368.0, + "grad_norm": 0.0635527467859112, + "language_loss": 0.8582921, + "learning_rate": 0.0006741614295584657, + "loss": 0.86912322, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.26977539, + "step": 2106, + "time_per_iteration": 2.6488401889801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107849, + "balance_loss_mlp": 1.05073833, + "epoch": 0.4053482108503271, + "flos": 731541874176.0, + "grad_norm": 0.057690605181557136, + "language_loss": 0.78413224, + "learning_rate": 0.0006738693639139595, + "loss": 0.79491717, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.27807617, + "step": 2107, + "time_per_iteration": 2.9652647972106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078123, + "balance_loss_mlp": 1.05015635, + "epoch": 0.4055405925355906, + "flos": 1212588292608.0, + "grad_norm": 0.05945372540383898, + "language_loss": 0.77655667, + "learning_rate": 0.0006735772307673796, + "loss": 0.78733784, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.27978516, + "step": 2108, + "time_per_iteration": 3.5789337158203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079955, + "balance_loss_mlp": 1.05222702, + "epoch": 0.4057329742208542, + "flos": 715553104896.0, + "grad_norm": 0.05752735064114104, + "language_loss": 0.83347392, + "learning_rate": 0.0006732850302321421, + "loss": 0.84427351, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.27783203, + "step": 2109, + "time_per_iteration": 2.869591236114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078846, + "balance_loss_mlp": 1.051476, + "epoch": 0.4059253559061177, + "flos": 564623377920.0, + "grad_norm": 0.06455621073123653, + "language_loss": 0.84327263, + "learning_rate": 0.00067299276242169, + "loss": 0.85406113, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.27441406, + "step": 2110, + "time_per_iteration": 2.673659563064575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082258, + "balance_loss_mlp": 1.07071877, + "epoch": 0.4061177375913813, + "flos": 1592886919680.0, + "grad_norm": 0.036236061846660186, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75464427, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.11523438, + "step": 2111, + "time_per_iteration": 4.886230230331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082274, + "balance_loss_mlp": 1.05490351, + "epoch": 0.40631011927664484, + "flos": 615122892288.0, + "grad_norm": 0.05646906793429633, + "language_loss": 0.77664089, + "learning_rate": 0.0006724080254290395, + "loss": 0.78746361, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.27416992, + "step": 2112, + "time_per_iteration": 2.8506221771240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076539, + "balance_loss_mlp": 1.04847741, + "epoch": 0.40650250096190843, + "flos": 557390647296.0, + "grad_norm": 0.06356712121797842, + "language_loss": 0.89422435, + "learning_rate": 0.0006721155564738566, + "loss": 0.90498972, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.28100586, + "step": 2113, + "time_per_iteration": 2.673015832901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037996, + "balance_loss_mlp": 1.02626586, + "epoch": 0.40669488264717196, + "flos": 1579301756928.0, + "grad_norm": 0.019828324636468348, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79660642, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.1171875, + "step": 2114, + "time_per_iteration": 5.003857851028442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080097, + "balance_loss_mlp": 1.0521065, + "epoch": 0.40688726433243555, + "flos": 507398902272.0, + "grad_norm": 0.07124796283110259, + "language_loss": 0.85397822, + "learning_rate": 0.0006715304182135078, + "loss": 0.86477917, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.2800293, + "step": 2115, + "time_per_iteration": 2.641721248626709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108294, + "balance_loss_mlp": 1.05418694, + "epoch": 0.40707964601769914, + "flos": 588757109760.0, + "grad_norm": 0.08996962933736626, + "language_loss": 0.88862896, + "learning_rate": 0.0006712377491355127, + "loss": 0.89945835, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.28735352, + "step": 2116, + "time_per_iteration": 2.880159616470337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077208, + "balance_loss_mlp": 1.04857373, + "epoch": 0.40727202770296267, + "flos": 580134901248.0, + "grad_norm": 0.046629180459365246, + "language_loss": 0.81631374, + "learning_rate": 0.0006709450135771274, + "loss": 0.82708585, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.28637695, + "step": 2117, + "time_per_iteration": 2.9391822814941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107698, + "balance_loss_mlp": 1.04953849, + "epoch": 0.40746440938822626, + "flos": 503819633664.0, + "grad_norm": 0.05926883506924263, + "language_loss": 0.86382973, + "learning_rate": 0.0006706522116520023, + "loss": 0.87459958, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.27490234, + "step": 2118, + "time_per_iteration": 2.6404170989990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078957, + "balance_loss_mlp": 1.05072808, + "epoch": 0.4076567910734898, + "flos": 605323611648.0, + "grad_norm": 0.06371775766221305, + "language_loss": 0.82902479, + "learning_rate": 0.0006703593434738127, + "loss": 0.83981442, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.28222656, + "step": 2119, + "time_per_iteration": 2.6982903480529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080441, + "balance_loss_mlp": 1.05216455, + "epoch": 0.4078491727587534, + "flos": 479313372672.0, + "grad_norm": 0.05030428863920766, + "language_loss": 0.78137958, + "learning_rate": 0.0006700664091562604, + "loss": 0.792184, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.28271484, + "step": 2120, + "time_per_iteration": 2.5976343154907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081224, + "balance_loss_mlp": 1.05259037, + "epoch": 0.4080415544440169, + "flos": 510126985728.0, + "grad_norm": 0.05481620044617693, + "language_loss": 0.85151196, + "learning_rate": 0.0006697734088130725, + "loss": 0.86232412, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.28637695, + "step": 2121, + "time_per_iteration": 2.613192558288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085667, + "balance_loss_mlp": 1.05665159, + "epoch": 0.4082339361292805, + "flos": 734318009856.0, + "grad_norm": 0.0674188074849357, + "language_loss": 0.85445356, + "learning_rate": 0.0006694803425580018, + "loss": 0.86531019, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.28955078, + "step": 2122, + "time_per_iteration": 2.9808695316314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085803, + "balance_loss_mlp": 1.05585766, + "epoch": 0.4084263178145441, + "flos": 457239831552.0, + "grad_norm": 0.06189748292204317, + "language_loss": 0.8466748, + "learning_rate": 0.0006691872105048268, + "loss": 0.85753286, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.29907227, + "step": 2123, + "time_per_iteration": 2.5712099075317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089254, + "balance_loss_mlp": 1.05992901, + "epoch": 0.4086186994998076, + "flos": 562659139584.0, + "grad_norm": 0.06907127419859461, + "language_loss": 0.84616292, + "learning_rate": 0.0006688940127673513, + "loss": 0.85705543, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.29296875, + "step": 2124, + "time_per_iteration": 2.6865010261535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091737, + "balance_loss_mlp": 1.06181526, + "epoch": 0.4088110811850712, + "flos": 573364859904.0, + "grad_norm": 0.048409192362904495, + "language_loss": 0.85410631, + "learning_rate": 0.0006686007494594049, + "loss": 0.86502367, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.29882812, + "step": 2125, + "time_per_iteration": 2.8982856273651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090612, + "balance_loss_mlp": 1.06085694, + "epoch": 0.40900346287033473, + "flos": 456702948864.0, + "grad_norm": 0.07961338986962259, + "language_loss": 0.80014485, + "learning_rate": 0.0006683074206948425, + "loss": 0.81105095, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.29736328, + "step": 2126, + "time_per_iteration": 2.489884853363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086751, + "balance_loss_mlp": 1.05649602, + "epoch": 0.4091958445555983, + "flos": 617097305088.0, + "grad_norm": 0.06572114620312723, + "language_loss": 0.81335235, + "learning_rate": 0.0006680140265875443, + "loss": 0.82421982, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.30200195, + "step": 2127, + "time_per_iteration": 2.8000454902648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084283, + "balance_loss_mlp": 1.05512488, + "epoch": 0.40938822624086185, + "flos": 472159217664.0, + "grad_norm": 0.054748250322007024, + "language_loss": 0.95437354, + "learning_rate": 0.0006677205672514162, + "loss": 0.9652164, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.29125977, + "step": 2128, + "time_per_iteration": 2.6153228282928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086337, + "balance_loss_mlp": 1.05600977, + "epoch": 0.40958060792612544, + "flos": 569734718976.0, + "grad_norm": 0.05206451104952603, + "language_loss": 0.88892365, + "learning_rate": 0.000667427042800389, + "loss": 0.89978707, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.30273438, + "step": 2129, + "time_per_iteration": 2.772545337677002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080649, + "balance_loss_mlp": 1.0521338, + "epoch": 0.40977298961138897, + "flos": 609065823744.0, + "grad_norm": 0.06928662998118869, + "language_loss": 0.82843542, + "learning_rate": 0.0006671334533484192, + "loss": 0.83924192, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.28515625, + "step": 2130, + "time_per_iteration": 2.7501790523529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077969, + "balance_loss_mlp": 1.04938281, + "epoch": 0.40996537129665256, + "flos": 581463332352.0, + "grad_norm": 0.051614263088568736, + "language_loss": 0.83230782, + "learning_rate": 0.0006668397990094881, + "loss": 0.84308755, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.28613281, + "step": 2131, + "time_per_iteration": 2.7121975421905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083028, + "balance_loss_mlp": 1.05370235, + "epoch": 0.41015775298191615, + "flos": 516296125440.0, + "grad_norm": 0.05828514658280376, + "language_loss": 0.84553468, + "learning_rate": 0.0006665460798976027, + "loss": 0.85636497, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.29296875, + "step": 2132, + "time_per_iteration": 2.7074639797210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082859, + "balance_loss_mlp": 1.05532122, + "epoch": 0.4103501346671797, + "flos": 510083315712.0, + "grad_norm": 0.06450815869750301, + "language_loss": 0.81324267, + "learning_rate": 0.0006662522961267947, + "loss": 0.82407123, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.27563477, + "step": 2133, + "time_per_iteration": 2.676886796951294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084376, + "balance_loss_mlp": 1.05555081, + "epoch": 0.41054251635244327, + "flos": 549459500544.0, + "grad_norm": 0.04843791936563358, + "language_loss": 0.87077558, + "learning_rate": 0.0006659584478111211, + "loss": 0.88161933, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.28833008, + "step": 2134, + "time_per_iteration": 2.8004117012023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096427, + "balance_loss_mlp": 1.06910408, + "epoch": 0.4107348980377068, + "flos": 839549643264.0, + "grad_norm": 0.07835760686868988, + "language_loss": 0.82880664, + "learning_rate": 0.000665664535064664, + "loss": 0.83977091, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.2734375, + "step": 2135, + "time_per_iteration": 3.034134864807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100622, + "balance_loss_mlp": 1.07278681, + "epoch": 0.4109272797229704, + "flos": 503445694464.0, + "grad_norm": 0.05799734322971953, + "language_loss": 0.82382762, + "learning_rate": 0.0006653705580015303, + "loss": 0.8348338, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.27819824, + "step": 2136, + "time_per_iteration": 2.719423770904541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105373, + "balance_loss_mlp": 1.07747769, + "epoch": 0.4111196614082339, + "flos": 610533877248.0, + "grad_norm": 0.05212184008762054, + "language_loss": 0.863967, + "learning_rate": 0.0006650765167358523, + "loss": 0.87502074, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.27905273, + "step": 2137, + "time_per_iteration": 2.7973241806030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110879, + "balance_loss_mlp": 1.08089471, + "epoch": 0.4113120430934975, + "flos": 452931623424.0, + "grad_norm": 0.07588683613844963, + "language_loss": 0.89871359, + "learning_rate": 0.0006647824113817864, + "loss": 0.90980148, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.27929688, + "step": 2138, + "time_per_iteration": 2.520531177520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114294, + "balance_loss_mlp": 1.08768606, + "epoch": 0.41150442477876104, + "flos": 541324712448.0, + "grad_norm": 0.055552110514209885, + "language_loss": 0.81525648, + "learning_rate": 0.000664488242053515, + "loss": 0.82639945, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.26660156, + "step": 2139, + "time_per_iteration": 2.7204349040985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099437, + "balance_loss_mlp": 1.0722574, + "epoch": 0.4116968064640246, + "flos": 576017339904.0, + "grad_norm": 0.05646005524415558, + "language_loss": 0.83858913, + "learning_rate": 0.0006641940088652445, + "loss": 0.84958351, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.27246094, + "step": 2140, + "time_per_iteration": 2.748011827468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101916, + "balance_loss_mlp": 1.07521284, + "epoch": 0.4118891881492882, + "flos": 495857963520.0, + "grad_norm": 0.05970845599818087, + "language_loss": 0.81979877, + "learning_rate": 0.0006638997119312065, + "loss": 0.83081794, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.26757812, + "step": 2141, + "time_per_iteration": 2.723269462585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091619, + "balance_loss_mlp": 1.07826746, + "epoch": 0.41208156983455174, + "flos": 1537604923392.0, + "grad_norm": 0.04300629071925061, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76154923, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.13378906, + "step": 2142, + "time_per_iteration": 4.922248363494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089912, + "balance_loss_mlp": 1.06239891, + "epoch": 0.41227395151981533, + "flos": 584697775104.0, + "grad_norm": 0.06629114096949819, + "language_loss": 0.8462221, + "learning_rate": 0.000663310927282877, + "loss": 0.85712123, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.27563477, + "step": 2143, + "time_per_iteration": 2.8463313579559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109146, + "balance_loss_mlp": 1.06413746, + "epoch": 0.41246633320507886, + "flos": 442685620224.0, + "grad_norm": 0.05519054049820913, + "language_loss": 0.86099815, + "learning_rate": 0.000663016439797172, + "loss": 0.87191272, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.2734375, + "step": 2144, + "time_per_iteration": 2.611057996749878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086947, + "balance_loss_mlp": 1.05917096, + "epoch": 0.41265871489034245, + "flos": 579680976384.0, + "grad_norm": 0.07082455066013048, + "language_loss": 0.80582112, + "learning_rate": 0.0006627218890228724, + "loss": 0.81669062, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.27783203, + "step": 2145, + "time_per_iteration": 2.8047831058502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087273, + "balance_loss_mlp": 1.05859172, + "epoch": 0.412851096575606, + "flos": 760906372608.0, + "grad_norm": 0.08398112437337095, + "language_loss": 0.83330071, + "learning_rate": 0.0006624272750743326, + "loss": 0.84417343, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.28637695, + "step": 2146, + "time_per_iteration": 2.9890313148498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081748, + "balance_loss_mlp": 1.05299461, + "epoch": 0.41304347826086957, + "flos": 555062644224.0, + "grad_norm": 0.12117217429962603, + "language_loss": 0.82466137, + "learning_rate": 0.0006621325980659322, + "loss": 0.83547878, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.2878418, + "step": 2147, + "time_per_iteration": 2.7945189476013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108308, + "balance_loss_mlp": 1.05475557, + "epoch": 0.41323585994613315, + "flos": 665418765312.0, + "grad_norm": 0.05729870278054163, + "language_loss": 0.81810451, + "learning_rate": 0.000661837858112075, + "loss": 0.82893538, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.28320312, + "step": 2148, + "time_per_iteration": 2.833590030670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079327, + "balance_loss_mlp": 1.05102634, + "epoch": 0.4134282416313967, + "flos": 548429405184.0, + "grad_norm": 0.05837233957282785, + "language_loss": 0.88857764, + "learning_rate": 0.0006615430553271888, + "loss": 0.89937091, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.28344727, + "step": 2149, + "time_per_iteration": 2.75384521484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074308, + "balance_loss_mlp": 1.04603195, + "epoch": 0.4136206233166603, + "flos": 645951062016.0, + "grad_norm": 0.06498878822354702, + "language_loss": 0.85069597, + "learning_rate": 0.0006612481898257264, + "loss": 0.86143911, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.28295898, + "step": 2150, + "time_per_iteration": 2.8471391201019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077454, + "balance_loss_mlp": 1.04901028, + "epoch": 0.4138130050019238, + "flos": 517103640576.0, + "grad_norm": 0.06146250241107021, + "language_loss": 0.85024071, + "learning_rate": 0.000660953261722165, + "loss": 0.8610152, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.28442383, + "step": 2151, + "time_per_iteration": 2.650024175643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107559, + "balance_loss_mlp": 1.04643118, + "epoch": 0.4140053866871874, + "flos": 608977073664.0, + "grad_norm": 0.07635609550069686, + "language_loss": 0.82408941, + "learning_rate": 0.0006606582711310055, + "loss": 0.8348453, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.29150391, + "step": 2152, + "time_per_iteration": 2.707353353500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079486, + "balance_loss_mlp": 1.05068457, + "epoch": 0.4141977683724509, + "flos": 579493301760.0, + "grad_norm": 0.05643811624839042, + "language_loss": 0.83234471, + "learning_rate": 0.0006603632181667736, + "loss": 0.84313959, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.2878418, + "step": 2153, + "time_per_iteration": 2.6824803352355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034971, + "balance_loss_mlp": 1.02085698, + "epoch": 0.4143901500577145, + "flos": 1306598777856.0, + "grad_norm": 0.02554992861291058, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.79978293, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.14160156, + "step": 2154, + "time_per_iteration": 4.893488645553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075294, + "balance_loss_mlp": 1.04625416, + "epoch": 0.41458253174297804, + "flos": 459957740544.0, + "grad_norm": 0.06235301652291857, + "language_loss": 0.81530857, + "learning_rate": 0.0006597729255773153, + "loss": 0.82606155, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.2902832, + "step": 2155, + "time_per_iteration": 2.526531934738159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084546, + "balance_loss_mlp": 1.05519629, + "epoch": 0.41477491342824163, + "flos": 553096995840.0, + "grad_norm": 0.06680223734216864, + "language_loss": 0.82554018, + "learning_rate": 0.0006594776861812608, + "loss": 0.83638561, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.29321289, + "step": 2156, + "time_per_iteration": 2.669290065765381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083913, + "balance_loss_mlp": 1.05525446, + "epoch": 0.4149672951135052, + "flos": 697444356096.0, + "grad_norm": 0.05896575190253656, + "language_loss": 0.8669672, + "learning_rate": 0.0006591823848704776, + "loss": 0.87780631, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.28613281, + "step": 2157, + "time_per_iteration": 2.9277596473693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081796, + "balance_loss_mlp": 1.05273294, + "epoch": 0.41515967679876875, + "flos": 565480355328.0, + "grad_norm": 0.07853922010281017, + "language_loss": 0.81488264, + "learning_rate": 0.0006588870217596117, + "loss": 0.82570058, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.29003906, + "step": 2158, + "time_per_iteration": 2.72590970993042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107553, + "balance_loss_mlp": 1.04572749, + "epoch": 0.41535205848403234, + "flos": 500938781184.0, + "grad_norm": 0.06749140584983894, + "language_loss": 0.86219651, + "learning_rate": 0.0006585915969633334, + "loss": 0.87295187, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.29760742, + "step": 2159, + "time_per_iteration": 2.609668731689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068571, + "balance_loss_mlp": 1.03838706, + "epoch": 0.41554444016929587, + "flos": 607268911104.0, + "grad_norm": 0.0643598430263329, + "language_loss": 0.89336061, + "learning_rate": 0.0006582961105963366, + "loss": 0.90404636, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.30151367, + "step": 2160, + "time_per_iteration": 2.814122200012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074283, + "balance_loss_mlp": 1.04409909, + "epoch": 0.41573682185455946, + "flos": 528856985088.0, + "grad_norm": 0.0615363131016327, + "language_loss": 0.77864838, + "learning_rate": 0.0006580005627733395, + "loss": 0.78939116, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.30126953, + "step": 2161, + "time_per_iteration": 2.693002700805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067064, + "balance_loss_mlp": 1.03790569, + "epoch": 0.415929203539823, + "flos": 504686785536.0, + "grad_norm": 0.07091162327263066, + "language_loss": 0.81523043, + "learning_rate": 0.0006577049536090838, + "loss": 0.82590109, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.29125977, + "step": 2162, + "time_per_iteration": 2.6924569606781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010702, + "balance_loss_mlp": 1.04039741, + "epoch": 0.4161215852250866, + "flos": 582467286528.0, + "grad_norm": 0.07952336976051765, + "language_loss": 0.85617888, + "learning_rate": 0.000657409283218335, + "loss": 0.86688089, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.29760742, + "step": 2163, + "time_per_iteration": 2.663069486618042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070228, + "balance_loss_mlp": 1.04075933, + "epoch": 0.4163139669103501, + "flos": 490432320000.0, + "grad_norm": 0.06199265882265987, + "language_loss": 0.81197548, + "learning_rate": 0.0006571135517158829, + "loss": 0.82267773, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.29394531, + "step": 2164, + "time_per_iteration": 2.6750965118408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043486, + "balance_loss_mlp": 1.03042102, + "epoch": 0.4165063485956137, + "flos": 1287445377024.0, + "grad_norm": 0.030179808177232596, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77807546, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.13085938, + "step": 2165, + "time_per_iteration": 4.7519471645355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071906, + "balance_loss_mlp": 1.0417223, + "epoch": 0.4166987302808773, + "flos": 495015542784.0, + "grad_norm": 0.06526247046532782, + "language_loss": 0.83270538, + "learning_rate": 0.0006565219058351444, + "loss": 0.84342444, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.30151367, + "step": 2166, + "time_per_iteration": 2.5784192085266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070965, + "balance_loss_mlp": 1.04080534, + "epoch": 0.4168911119661408, + "flos": 463823608320.0, + "grad_norm": 0.06219532105294632, + "language_loss": 0.82938039, + "learning_rate": 0.0006562259916865553, + "loss": 0.84009004, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.30102539, + "step": 2167, + "time_per_iteration": 2.59431791305542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073926, + "balance_loss_mlp": 1.04369497, + "epoch": 0.4170834936514044, + "flos": 536499970560.0, + "grad_norm": 0.06573475594481314, + "language_loss": 0.7943427, + "learning_rate": 0.0006559300168856573, + "loss": 0.80508196, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.30175781, + "step": 2168, + "time_per_iteration": 2.727644443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070483, + "balance_loss_mlp": 1.04046655, + "epoch": 0.41727587533666793, + "flos": 550418374656.0, + "grad_norm": 0.17889612534981147, + "language_loss": 0.85705924, + "learning_rate": 0.0006556339815473577, + "loss": 0.86776412, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.29980469, + "step": 2169, + "time_per_iteration": 2.6300487518310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072561, + "balance_loss_mlp": 1.04366493, + "epoch": 0.4174682570219315, + "flos": 630795949056.0, + "grad_norm": 0.053042429294564375, + "language_loss": 0.86056256, + "learning_rate": 0.000655337885786588, + "loss": 0.87128818, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.2890625, + "step": 2170, + "time_per_iteration": 2.8887124061584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081102, + "balance_loss_mlp": 1.05139482, + "epoch": 0.41766063870719505, + "flos": 519501454848.0, + "grad_norm": 0.08227745310603136, + "language_loss": 0.84896123, + "learning_rate": 0.0006550417297183025, + "loss": 0.85977226, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.29663086, + "step": 2171, + "time_per_iteration": 2.6285011768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088317, + "balance_loss_mlp": 1.05894339, + "epoch": 0.41785302039245864, + "flos": 557656897536.0, + "grad_norm": 0.05761128029173598, + "language_loss": 0.81863701, + "learning_rate": 0.0006547455134574793, + "loss": 0.82952011, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.29321289, + "step": 2172, + "time_per_iteration": 2.7729623317718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089062, + "balance_loss_mlp": 1.06040442, + "epoch": 0.41804540207772223, + "flos": 788156683776.0, + "grad_norm": 0.06792239619892874, + "language_loss": 0.83893955, + "learning_rate": 0.0006544492371191198, + "loss": 0.84983015, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.28613281, + "step": 2173, + "time_per_iteration": 3.1256158351898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094435, + "balance_loss_mlp": 1.06477547, + "epoch": 0.41823778376298576, + "flos": 903944240640.0, + "grad_norm": 0.05504184984792058, + "language_loss": 0.83198339, + "learning_rate": 0.0006541529008182485, + "loss": 0.84292769, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.29638672, + "step": 2174, + "time_per_iteration": 3.207711696624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093664, + "balance_loss_mlp": 1.0648396, + "epoch": 0.41843016544824935, + "flos": 511308440064.0, + "grad_norm": 0.07199426026259947, + "language_loss": 0.87529659, + "learning_rate": 0.0006538565046699136, + "loss": 0.88623327, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.28808594, + "step": 2175, + "time_per_iteration": 2.5804800987243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090181, + "balance_loss_mlp": 1.06207108, + "epoch": 0.4186225471335129, + "flos": 652774947840.0, + "grad_norm": 0.06367136059390696, + "language_loss": 0.80982441, + "learning_rate": 0.0006535600487891862, + "loss": 0.82072628, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.28149414, + "step": 2176, + "time_per_iteration": 2.7804555892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087535, + "balance_loss_mlp": 1.05870986, + "epoch": 0.41881492881877647, + "flos": 568892298240.0, + "grad_norm": 0.05631892460787088, + "language_loss": 0.89099276, + "learning_rate": 0.0006532635332911603, + "loss": 0.9018681, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.28808594, + "step": 2177, + "time_per_iteration": 2.641392707824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083587, + "balance_loss_mlp": 1.05428553, + "epoch": 0.41900731050404, + "flos": 911478127104.0, + "grad_norm": 0.06086903625614387, + "language_loss": 0.80636132, + "learning_rate": 0.0006529669582909541, + "loss": 0.8171972, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.29296875, + "step": 2178, + "time_per_iteration": 3.2258243560791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079831, + "balance_loss_mlp": 1.0508393, + "epoch": 0.4191996921893036, + "flos": 535498988544.0, + "grad_norm": 0.06798611784395944, + "language_loss": 0.85681045, + "learning_rate": 0.0006526703239037077, + "loss": 0.86760873, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.28955078, + "step": 2179, + "time_per_iteration": 2.66808819770813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077242, + "balance_loss_mlp": 1.0480361, + "epoch": 0.4193920738745671, + "flos": 582363979776.0, + "grad_norm": 0.06231650691948033, + "language_loss": 0.86236274, + "learning_rate": 0.0006523736302445851, + "loss": 0.87313515, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.29174805, + "step": 2180, + "time_per_iteration": 2.768888473510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074827, + "balance_loss_mlp": 1.04490554, + "epoch": 0.4195844555598307, + "flos": 1335279720960.0, + "grad_norm": 0.05646655403971755, + "language_loss": 0.77122605, + "learning_rate": 0.0006520768774287728, + "loss": 0.78197432, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.29882812, + "step": 2181, + "time_per_iteration": 3.7851996421813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077657, + "balance_loss_mlp": 1.04899919, + "epoch": 0.4197768372450943, + "flos": 598480786944.0, + "grad_norm": 0.05195874321999793, + "language_loss": 0.85622293, + "learning_rate": 0.0006517800655714806, + "loss": 0.86699945, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.28686523, + "step": 2182, + "time_per_iteration": 2.8000948429107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083179, + "balance_loss_mlp": 1.05359161, + "epoch": 0.4199692189303578, + "flos": 734929085952.0, + "grad_norm": 0.06393427474455515, + "language_loss": 0.85246432, + "learning_rate": 0.0006514831947879407, + "loss": 0.86329615, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.2956543, + "step": 2183, + "time_per_iteration": 2.946345329284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090824, + "balance_loss_mlp": 1.06164193, + "epoch": 0.4201616006156214, + "flos": 749854264320.0, + "grad_norm": 0.05990675678964555, + "language_loss": 0.78013611, + "learning_rate": 0.0006511862651934091, + "loss": 0.79104435, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.29174805, + "step": 2184, + "time_per_iteration": 3.043314218521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087348, + "balance_loss_mlp": 1.05797458, + "epoch": 0.42035398230088494, + "flos": 546764912640.0, + "grad_norm": 0.05608517861748944, + "language_loss": 0.82263517, + "learning_rate": 0.0006508892769031638, + "loss": 0.83350861, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.29345703, + "step": 2185, + "time_per_iteration": 2.662071704864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090134, + "balance_loss_mlp": 1.06052232, + "epoch": 0.42054636398614853, + "flos": 616628823552.0, + "grad_norm": 0.07931700187887496, + "language_loss": 0.86476076, + "learning_rate": 0.000650592230032506, + "loss": 0.87566209, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.2956543, + "step": 2186, + "time_per_iteration": 2.758989095687866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094562, + "balance_loss_mlp": 1.06464052, + "epoch": 0.42073874567141206, + "flos": 640077285888.0, + "grad_norm": 0.06900651751722174, + "language_loss": 0.84912258, + "learning_rate": 0.0006502951246967595, + "loss": 0.8600682, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.29882812, + "step": 2187, + "time_per_iteration": 2.9305953979492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091948, + "balance_loss_mlp": 1.06274199, + "epoch": 0.42093112735667565, + "flos": 493524168192.0, + "grad_norm": 0.061550495040686125, + "language_loss": 0.86992055, + "learning_rate": 0.0006499979610112706, + "loss": 0.88084006, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.29150391, + "step": 2188, + "time_per_iteration": 2.6826889514923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091259, + "balance_loss_mlp": 1.06205249, + "epoch": 0.4211235090419392, + "flos": 542097321984.0, + "grad_norm": 0.05090003048385584, + "language_loss": 0.84021527, + "learning_rate": 0.000649700739091409, + "loss": 0.85112786, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.29125977, + "step": 2189, + "time_per_iteration": 2.7169277667999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058015, + "balance_loss_mlp": 1.04628468, + "epoch": 0.42131589072720277, + "flos": 1531342651392.0, + "grad_norm": 0.03212522571547254, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.74894285, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.1171875, + "step": 2190, + "time_per_iteration": 4.8044211864471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094227, + "balance_loss_mlp": 1.06645083, + "epoch": 0.42150827241246636, + "flos": 566583234048.0, + "grad_norm": 0.05853660814181512, + "language_loss": 0.85258055, + "learning_rate": 0.0006491061210101557, + "loss": 0.86352277, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.27832031, + "step": 2191, + "time_per_iteration": 2.6850759983062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093463, + "balance_loss_mlp": 1.06554449, + "epoch": 0.4217006540977299, + "flos": 707242226688.0, + "grad_norm": 0.05791259848064641, + "language_loss": 0.84111977, + "learning_rate": 0.0006488087250796157, + "loss": 0.85205436, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.27905273, + "step": 2192, + "time_per_iteration": 2.8877995014190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099215, + "balance_loss_mlp": 1.07148743, + "epoch": 0.4218930357829935, + "flos": 626975161344.0, + "grad_norm": 0.0649444731235166, + "language_loss": 0.81518376, + "learning_rate": 0.0006485112713764049, + "loss": 0.82617593, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.27734375, + "step": 2193, + "time_per_iteration": 2.910949468612671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102268, + "balance_loss_mlp": 1.07523096, + "epoch": 0.422085417468257, + "flos": 460110509568.0, + "grad_norm": 0.07813881123096035, + "language_loss": 0.83433115, + "learning_rate": 0.0006482137600160051, + "loss": 0.84535384, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.27075195, + "step": 2194, + "time_per_iteration": 2.5086262226104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096994, + "balance_loss_mlp": 1.06900394, + "epoch": 0.4222777991535206, + "flos": 473788804608.0, + "grad_norm": 0.07794223585413998, + "language_loss": 0.84987926, + "learning_rate": 0.0006479161911139206, + "loss": 0.86084926, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.2800293, + "step": 2195, + "time_per_iteration": 2.5875346660614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109264, + "balance_loss_mlp": 1.06493604, + "epoch": 0.4224701808387841, + "flos": 470647494144.0, + "grad_norm": 0.07304716613473786, + "language_loss": 0.85472345, + "learning_rate": 0.0006476185647856778, + "loss": 0.86564982, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.27734375, + "step": 2196, + "time_per_iteration": 2.5596694946289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083263, + "balance_loss_mlp": 1.05589223, + "epoch": 0.4226625625240477, + "flos": 677202633216.0, + "grad_norm": 0.0787732151202365, + "language_loss": 0.81599677, + "learning_rate": 0.0006473208811468255, + "loss": 0.82682943, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.27416992, + "step": 2197, + "time_per_iteration": 2.8756632804870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082675, + "balance_loss_mlp": 1.05518579, + "epoch": 0.4228549442093113, + "flos": 503268194304.0, + "grad_norm": 0.05582038208417147, + "language_loss": 0.84304923, + "learning_rate": 0.0006470231403129347, + "loss": 0.85387599, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.27490234, + "step": 2198, + "time_per_iteration": 2.6008548736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082097, + "balance_loss_mlp": 1.05444098, + "epoch": 0.42304732589457483, + "flos": 611543623680.0, + "grad_norm": 0.05486589756973033, + "language_loss": 0.81627637, + "learning_rate": 0.0006467253423995988, + "loss": 0.8270973, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.27685547, + "step": 2199, + "time_per_iteration": 2.8359298706054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085734, + "balance_loss_mlp": 1.05788624, + "epoch": 0.4232397075798384, + "flos": 515302345728.0, + "grad_norm": 0.06443704109820439, + "language_loss": 0.79415488, + "learning_rate": 0.000646427487522433, + "loss": 0.80501223, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.27880859, + "step": 2200, + "time_per_iteration": 2.6884772777557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089933, + "balance_loss_mlp": 1.06251502, + "epoch": 0.42343208926510195, + "flos": 589513752576.0, + "grad_norm": 0.06462007516901433, + "language_loss": 0.83460814, + "learning_rate": 0.0006461295757970749, + "loss": 0.8455075, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.27441406, + "step": 2201, + "time_per_iteration": 2.7960758209228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110052, + "balance_loss_mlp": 1.07140875, + "epoch": 0.42362447095036554, + "flos": 640342126080.0, + "grad_norm": 0.08363319364773283, + "language_loss": 0.81312859, + "learning_rate": 0.0006458316073391839, + "loss": 0.82413375, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.29101562, + "step": 2202, + "time_per_iteration": 2.853297472000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096557, + "balance_loss_mlp": 1.06830478, + "epoch": 0.42381685263562907, + "flos": 512421493248.0, + "grad_norm": 0.0711769658628502, + "language_loss": 0.87750852, + "learning_rate": 0.0006455335822644422, + "loss": 0.88847411, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.28271484, + "step": 2203, + "time_per_iteration": 2.6077048778533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110502, + "balance_loss_mlp": 1.07607579, + "epoch": 0.42400923432089266, + "flos": 546523393536.0, + "grad_norm": 0.061615225293076246, + "language_loss": 0.77729923, + "learning_rate": 0.0006452355006885527, + "loss": 0.78834939, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.28930664, + "step": 2204, + "time_per_iteration": 2.6517252922058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103628, + "balance_loss_mlp": 1.07442212, + "epoch": 0.4242016160061562, + "flos": 621872584704.0, + "grad_norm": 0.1220032897030914, + "language_loss": 0.86957574, + "learning_rate": 0.0006449373627272412, + "loss": 0.88061202, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.29199219, + "step": 2205, + "time_per_iteration": 2.7004148960113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093739, + "balance_loss_mlp": 1.06515288, + "epoch": 0.4243939976914198, + "flos": 571649495040.0, + "grad_norm": 0.07705045910796138, + "language_loss": 0.82556224, + "learning_rate": 0.0006446391684962553, + "loss": 0.83649963, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.28588867, + "step": 2206, + "time_per_iteration": 2.6505441665649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083745, + "balance_loss_mlp": 1.05558801, + "epoch": 0.42458637937668336, + "flos": 448509934080.0, + "grad_norm": 0.0589868983385633, + "language_loss": 0.82958955, + "learning_rate": 0.000644340918111364, + "loss": 0.84042698, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.28149414, + "step": 2207, + "time_per_iteration": 2.6410183906555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079008, + "balance_loss_mlp": 1.05011129, + "epoch": 0.4247787610619469, + "flos": 435176464896.0, + "grad_norm": 0.05680611388250626, + "language_loss": 0.84805965, + "learning_rate": 0.0006440426116883585, + "loss": 0.8588497, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.28857422, + "step": 2208, + "time_per_iteration": 2.5708625316619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074083, + "balance_loss_mlp": 1.04478097, + "epoch": 0.4249711427472105, + "flos": 495818675712.0, + "grad_norm": 0.06224422813064936, + "language_loss": 0.86093891, + "learning_rate": 0.0006437442493430519, + "loss": 0.87167978, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.29248047, + "step": 2209, + "time_per_iteration": 2.70894718170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074378, + "balance_loss_mlp": 1.04481411, + "epoch": 0.425163524432474, + "flos": 655498649088.0, + "grad_norm": 0.07482969618411565, + "language_loss": 0.86115217, + "learning_rate": 0.000643445831191278, + "loss": 0.87189603, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.29492188, + "step": 2210, + "time_per_iteration": 2.924381971359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076507, + "balance_loss_mlp": 1.0465858, + "epoch": 0.4253559061177376, + "flos": 650317496832.0, + "grad_norm": 0.07331466132736943, + "language_loss": 0.81421846, + "learning_rate": 0.0006431473573488937, + "loss": 0.82498354, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.29882812, + "step": 2211, + "time_per_iteration": 2.7787976264953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073299, + "balance_loss_mlp": 1.04380631, + "epoch": 0.42554828780300114, + "flos": 553894336512.0, + "grad_norm": 0.07883329281510759, + "language_loss": 0.84917492, + "learning_rate": 0.0006428488279317765, + "loss": 0.85990787, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.29443359, + "step": 2212, + "time_per_iteration": 2.6664369106292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070733, + "balance_loss_mlp": 1.04052496, + "epoch": 0.4257406694882647, + "flos": 514154386944.0, + "grad_norm": 0.06306745469338368, + "language_loss": 0.87706983, + "learning_rate": 0.0006425502430558259, + "loss": 0.88777709, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.30151367, + "step": 2213, + "time_per_iteration": 2.6229989528656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071107, + "balance_loss_mlp": 1.04106641, + "epoch": 0.42593305117352825, + "flos": 515380921344.0, + "grad_norm": 0.0655798606724697, + "language_loss": 0.84705913, + "learning_rate": 0.0006422516028369628, + "loss": 0.8577702, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.30004883, + "step": 2214, + "time_per_iteration": 2.69012451171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072564, + "balance_loss_mlp": 1.04197454, + "epoch": 0.42612543285879184, + "flos": 587766302208.0, + "grad_norm": 0.08051577462794157, + "language_loss": 0.83543354, + "learning_rate": 0.0006419529073911296, + "loss": 0.84615922, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.30541992, + "step": 2215, + "time_per_iteration": 2.873396873474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070818, + "balance_loss_mlp": 1.03987157, + "epoch": 0.42631781454405543, + "flos": 635153619456.0, + "grad_norm": 0.05918367623789858, + "language_loss": 0.85362011, + "learning_rate": 0.0006416541568342901, + "loss": 0.86432827, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.30908203, + "step": 2216, + "time_per_iteration": 2.870213508605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071511, + "balance_loss_mlp": 1.04161358, + "epoch": 0.42651019622931896, + "flos": 540891136512.0, + "grad_norm": 0.06028802274016953, + "language_loss": 0.8413707, + "learning_rate": 0.0006413553512824297, + "loss": 0.85208583, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.29858398, + "step": 2217, + "time_per_iteration": 2.7570102214813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066011, + "balance_loss_mlp": 1.03599358, + "epoch": 0.42670257791458255, + "flos": 557892624384.0, + "grad_norm": 0.06136950817587928, + "language_loss": 0.8441695, + "learning_rate": 0.0006410564908515549, + "loss": 0.85482961, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.29980469, + "step": 2218, + "time_per_iteration": 2.634636878967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072201, + "balance_loss_mlp": 1.04122996, + "epoch": 0.4268949595998461, + "flos": 621025781760.0, + "grad_norm": 0.05945328981992575, + "language_loss": 0.85267186, + "learning_rate": 0.0006407575756576935, + "loss": 0.8633939, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.30957031, + "step": 2219, + "time_per_iteration": 2.7264437675476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076309, + "balance_loss_mlp": 1.04512346, + "epoch": 0.42708734128510967, + "flos": 537646519296.0, + "grad_norm": 0.08352776642532155, + "language_loss": 0.87413085, + "learning_rate": 0.0006404586058168951, + "loss": 0.88489389, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.31152344, + "step": 2220, + "time_per_iteration": 2.740231513977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070252, + "balance_loss_mlp": 1.03906727, + "epoch": 0.4272797229703732, + "flos": 502617830400.0, + "grad_norm": 0.06337599132559579, + "language_loss": 0.86675316, + "learning_rate": 0.0006401595814452296, + "loss": 0.87745565, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.31152344, + "step": 2221, + "time_per_iteration": 2.595133066177368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010734, + "balance_loss_mlp": 1.04316878, + "epoch": 0.4274721046556368, + "flos": 492208883712.0, + "grad_norm": 0.05998559409639075, + "language_loss": 0.80837309, + "learning_rate": 0.000639860502658789, + "loss": 0.81910712, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.30224609, + "step": 2222, + "time_per_iteration": 2.6363143920898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078431, + "balance_loss_mlp": 1.04805684, + "epoch": 0.4276644863409004, + "flos": 568094957568.0, + "grad_norm": 0.051235249414951084, + "language_loss": 0.85047621, + "learning_rate": 0.0006395613695736853, + "loss": 0.86126053, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.3034668, + "step": 2223, + "time_per_iteration": 2.719651222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088807, + "balance_loss_mlp": 1.0574553, + "epoch": 0.4278568680261639, + "flos": 607155429888.0, + "grad_norm": 0.14370485886555942, + "language_loss": 0.82013905, + "learning_rate": 0.0006392621823060529, + "loss": 0.83102709, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.31347656, + "step": 2224, + "time_per_iteration": 2.707019805908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108025, + "balance_loss_mlp": 1.04968464, + "epoch": 0.4280492497114275, + "flos": 560265707520.0, + "grad_norm": 0.06727581417341866, + "language_loss": 0.84405053, + "learning_rate": 0.0006389629409720465, + "loss": 0.85485303, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.30541992, + "step": 2225, + "time_per_iteration": 2.6877145767211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074744, + "balance_loss_mlp": 1.04415512, + "epoch": 0.428241631396691, + "flos": 720334176768.0, + "grad_norm": 0.06967859590672425, + "language_loss": 0.88595277, + "learning_rate": 0.0006386636456878417, + "loss": 0.89670026, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.30566406, + "step": 2226, + "time_per_iteration": 2.87302827835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073672, + "balance_loss_mlp": 1.04344106, + "epoch": 0.4284340130819546, + "flos": 429243052032.0, + "grad_norm": 0.07126154474787791, + "language_loss": 0.92022073, + "learning_rate": 0.0006383642965696353, + "loss": 0.93095744, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.30175781, + "step": 2227, + "time_per_iteration": 2.4469897747039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075351, + "balance_loss_mlp": 1.04492915, + "epoch": 0.42862639476721814, + "flos": 524732069376.0, + "grad_norm": 0.06843530557124561, + "language_loss": 0.82703793, + "learning_rate": 0.000638064893733645, + "loss": 0.83779144, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.30371094, + "step": 2228, + "time_per_iteration": 2.7728607654571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071747, + "balance_loss_mlp": 1.04256451, + "epoch": 0.42881877645248173, + "flos": 465089430528.0, + "grad_norm": 0.058089035035371744, + "language_loss": 0.89580554, + "learning_rate": 0.000637765437296109, + "loss": 0.90652299, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.29199219, + "step": 2229, + "time_per_iteration": 2.634521484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072661, + "balance_loss_mlp": 1.04252505, + "epoch": 0.42901115813774526, + "flos": 560034362880.0, + "grad_norm": 0.07373798457938027, + "language_loss": 0.85480672, + "learning_rate": 0.000637465927373287, + "loss": 0.86553335, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.30126953, + "step": 2230, + "time_per_iteration": 2.6294057369232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082832, + "balance_loss_mlp": 1.05276728, + "epoch": 0.42920353982300885, + "flos": 561186703872.0, + "grad_norm": 0.08134114280474665, + "language_loss": 0.79152465, + "learning_rate": 0.000637166364081459, + "loss": 0.80235291, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.30004883, + "step": 2231, + "time_per_iteration": 2.651043176651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107691, + "balance_loss_mlp": 1.04837155, + "epoch": 0.42939592150827244, + "flos": 555982230528.0, + "grad_norm": 0.0656552791827552, + "language_loss": 0.83965945, + "learning_rate": 0.0006368667475369256, + "loss": 0.85042852, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.28515625, + "step": 2232, + "time_per_iteration": 2.749769687652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072336, + "balance_loss_mlp": 1.05898428, + "epoch": 0.42958830319353597, + "flos": 1520796902400.0, + "grad_norm": 0.038311067760931045, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79600114, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.13378906, + "step": 2233, + "time_per_iteration": 4.919846773147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010537, + "balance_loss_mlp": 1.04044378, + "epoch": 0.42978068487879956, + "flos": 1495052522496.0, + "grad_norm": 0.026216416348918452, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.79949123, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.1328125, + "step": 2234, + "time_per_iteration": 4.814115285873413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109183, + "balance_loss_mlp": 1.06281483, + "epoch": 0.4299730665640631, + "flos": 546725624832.0, + "grad_norm": 0.052673535005773216, + "language_loss": 0.85474288, + "learning_rate": 0.0006359675795504112, + "loss": 0.86566114, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.29003906, + "step": 2235, + "time_per_iteration": 2.7002832889556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097467, + "balance_loss_mlp": 1.07021558, + "epoch": 0.4301654482493267, + "flos": 1128839473152.0, + "grad_norm": 0.08125384058814748, + "language_loss": 0.74334383, + "learning_rate": 0.0006356677511584775, + "loss": 0.75431848, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.27294922, + "step": 2236, + "time_per_iteration": 3.472095012664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096497, + "balance_loss_mlp": 1.06938839, + "epoch": 0.4303578299345902, + "flos": 495502963200.0, + "grad_norm": 0.06719636161557083, + "language_loss": 0.85933757, + "learning_rate": 0.0006353678700956511, + "loss": 0.8703025, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.27148438, + "step": 2237, + "time_per_iteration": 2.6188535690307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089994, + "balance_loss_mlp": 1.06288612, + "epoch": 0.4305502116198538, + "flos": 615472100352.0, + "grad_norm": 0.09054713742221257, + "language_loss": 0.83597302, + "learning_rate": 0.0006350679364783569, + "loss": 0.84687304, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.27172852, + "step": 2238, + "time_per_iteration": 2.7403035163879395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093799, + "balance_loss_mlp": 1.0661664, + "epoch": 0.4307425933051173, + "flos": 558995503104.0, + "grad_norm": 0.06694912929746479, + "language_loss": 0.85728157, + "learning_rate": 0.0006347679504230393, + "loss": 0.86821961, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.27661133, + "step": 2239, + "time_per_iteration": 2.652348041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087161, + "balance_loss_mlp": 1.05974269, + "epoch": 0.4309349749903809, + "flos": 971755163136.0, + "grad_norm": 0.056527008755361936, + "language_loss": 0.75895661, + "learning_rate": 0.0006344679120461632, + "loss": 0.7698282, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.27416992, + "step": 2240, + "time_per_iteration": 3.334127187728882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091078, + "balance_loss_mlp": 1.06435084, + "epoch": 0.4311273566756445, + "flos": 541663746048.0, + "grad_norm": 0.1917370324350853, + "language_loss": 0.80061769, + "learning_rate": 0.0006341678214642134, + "loss": 0.81152856, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.26782227, + "step": 2241, + "time_per_iteration": 2.6100823879241943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087616, + "balance_loss_mlp": 1.06103277, + "epoch": 0.43131973836090803, + "flos": 761316627456.0, + "grad_norm": 0.06088249389193946, + "language_loss": 0.82893783, + "learning_rate": 0.0006338676787936963, + "loss": 0.83981395, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.26635742, + "step": 2242, + "time_per_iteration": 3.077916383743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109796, + "balance_loss_mlp": 1.07142353, + "epoch": 0.4315121200461716, + "flos": 554263893504.0, + "grad_norm": 0.060062439107852666, + "language_loss": 0.8377043, + "learning_rate": 0.0006335674841511367, + "loss": 0.84868383, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.26586914, + "step": 2243, + "time_per_iteration": 2.665205955505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066457, + "balance_loss_mlp": 1.05415499, + "epoch": 0.43170450173143515, + "flos": 1484499419136.0, + "grad_norm": 0.03077915513708162, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80247629, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.12255859, + "step": 2244, + "time_per_iteration": 5.000265121459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060995, + "balance_loss_mlp": 1.04878819, + "epoch": 0.43189688341669874, + "flos": 1472897433600.0, + "grad_norm": 0.03064763148494063, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.7842654, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.12207031, + "step": 2245, + "time_per_iteration": 4.9160850048065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093506, + "balance_loss_mlp": 1.06594431, + "epoch": 0.43208926510196227, + "flos": 492677365248.0, + "grad_norm": 0.06803490831657065, + "language_loss": 0.82597309, + "learning_rate": 0.0006326665895567652, + "loss": 0.83690816, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.2755127, + "step": 2246, + "time_per_iteration": 2.6449503898620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084672, + "balance_loss_mlp": 1.05649078, + "epoch": 0.43228164678722586, + "flos": 519969936384.0, + "grad_norm": 0.07553831830843152, + "language_loss": 0.87537026, + "learning_rate": 0.0006323661881916976, + "loss": 0.88621694, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.28173828, + "step": 2247, + "time_per_iteration": 2.699899911880493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088894, + "balance_loss_mlp": 1.05983043, + "epoch": 0.4324740284724894, + "flos": 795722655744.0, + "grad_norm": 0.05605692822142187, + "language_loss": 0.80999863, + "learning_rate": 0.0006320657354375179, + "loss": 0.82088757, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.2902832, + "step": 2248, + "time_per_iteration": 2.9737963676452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082946, + "balance_loss_mlp": 1.05374026, + "epoch": 0.432666410157753, + "flos": 481917800448.0, + "grad_norm": 0.1777496827938913, + "language_loss": 0.87151104, + "learning_rate": 0.0006317652314108726, + "loss": 0.88234049, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.29150391, + "step": 2249, + "time_per_iteration": 2.5640759468078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076296, + "balance_loss_mlp": 1.04782867, + "epoch": 0.43285879184301657, + "flos": 499963940352.0, + "grad_norm": 0.059764616303547735, + "language_loss": 0.91275859, + "learning_rate": 0.0006314646762284277, + "loss": 0.92352152, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.28442383, + "step": 2250, + "time_per_iteration": 2.6878976821899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056511, + "balance_loss_mlp": 1.04401791, + "epoch": 0.4330511735282801, + "flos": 1509615346176.0, + "grad_norm": 0.026928771485436313, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76482344, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.125, + "step": 2251, + "time_per_iteration": 4.839360475540161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079121, + "balance_loss_mlp": 1.04931927, + "epoch": 0.4332435552135437, + "flos": 699270382080.0, + "grad_norm": 0.05685438588579276, + "language_loss": 0.77368456, + "learning_rate": 0.0006308634128629022, + "loss": 0.78447574, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.29785156, + "step": 2252, + "time_per_iteration": 2.895348072052002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083422, + "balance_loss_mlp": 1.05426395, + "epoch": 0.4334359368988072, + "flos": 591995934720.0, + "grad_norm": 0.07214959985253801, + "language_loss": 0.87411779, + "learning_rate": 0.0006305627049132531, + "loss": 0.88495201, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.29125977, + "step": 2253, + "time_per_iteration": 2.8069100379943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083541, + "balance_loss_mlp": 1.05440617, + "epoch": 0.4336283185840708, + "flos": 842440670208.0, + "grad_norm": 0.059293193490882155, + "language_loss": 0.85926008, + "learning_rate": 0.0006302619462746662, + "loss": 0.87009549, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.29101562, + "step": 2254, + "time_per_iteration": 3.1606533527374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080071, + "balance_loss_mlp": 1.05193734, + "epoch": 0.43382070026933434, + "flos": 625974179328.0, + "grad_norm": 0.05505451724174187, + "language_loss": 0.89697909, + "learning_rate": 0.0006299611370639069, + "loss": 0.90777981, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.28149414, + "step": 2255, + "time_per_iteration": 2.734578847885132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082321, + "balance_loss_mlp": 1.05368638, + "epoch": 0.4340130819545979, + "flos": 590837801472.0, + "grad_norm": 0.06498253441528982, + "language_loss": 0.79077351, + "learning_rate": 0.0006296602773977593, + "loss": 0.80159676, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.28637695, + "step": 2256, + "time_per_iteration": 2.7210190296173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086863, + "balance_loss_mlp": 1.0577755, + "epoch": 0.4342054636398615, + "flos": 490624376832.0, + "grad_norm": 0.06552918038966793, + "language_loss": 0.87430996, + "learning_rate": 0.0006293593673930277, + "loss": 0.88517857, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.2902832, + "step": 2257, + "time_per_iteration": 2.6526098251342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087005, + "balance_loss_mlp": 1.05851448, + "epoch": 0.43439784532512504, + "flos": 698679654912.0, + "grad_norm": 0.06677812911461618, + "language_loss": 0.78416431, + "learning_rate": 0.0006290584071665358, + "loss": 0.79503441, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.28491211, + "step": 2258, + "time_per_iteration": 2.915259838104248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086079, + "balance_loss_mlp": 1.0575645, + "epoch": 0.43459022701038863, + "flos": 485581436928.0, + "grad_norm": 0.06990053073214272, + "language_loss": 0.81982124, + "learning_rate": 0.0006287573968351266, + "loss": 0.83068204, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.28515625, + "step": 2259, + "time_per_iteration": 2.5836570262908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082362, + "balance_loss_mlp": 1.05432403, + "epoch": 0.43478260869565216, + "flos": 642818515968.0, + "grad_norm": 0.06494033905479386, + "language_loss": 0.82220829, + "learning_rate": 0.0006284563365156626, + "loss": 0.83303189, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.28076172, + "step": 2260, + "time_per_iteration": 2.815223217010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084558, + "balance_loss_mlp": 1.05620956, + "epoch": 0.43497499038091575, + "flos": 425870396928.0, + "grad_norm": 0.07047722124208498, + "language_loss": 0.87564874, + "learning_rate": 0.0006281552263250261, + "loss": 0.88649434, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.28344727, + "step": 2261, + "time_per_iteration": 2.4715116024017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106204, + "balance_loss_mlp": 1.04964256, + "epoch": 0.4351673720661793, + "flos": 1537594748928.0, + "grad_norm": 0.023387556142435376, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81753576, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.12402344, + "step": 2262, + "time_per_iteration": 4.811767101287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084425, + "balance_loss_mlp": 1.05641103, + "epoch": 0.43535975375144287, + "flos": 748828551168.0, + "grad_norm": 0.062970719214795, + "language_loss": 0.81474411, + "learning_rate": 0.0006275528567978593, + "loss": 0.82558835, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.28051758, + "step": 2263, + "time_per_iteration": 2.9182233810424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096573, + "balance_loss_mlp": 1.06877375, + "epoch": 0.4355521354367064, + "flos": 860914593792.0, + "grad_norm": 0.06472545743832298, + "language_loss": 0.82352197, + "learning_rate": 0.0006272515976951898, + "loss": 0.83448768, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.27832031, + "step": 2264, + "time_per_iteration": 3.137770175933838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097325, + "balance_loss_mlp": 1.06852436, + "epoch": 0.43574451712197, + "flos": 734200146432.0, + "grad_norm": 0.055887733519337984, + "language_loss": 0.79332447, + "learning_rate": 0.0006269502891890687, + "loss": 0.8042978, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.28759766, + "step": 2265, + "time_per_iteration": 2.9932398796081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093111, + "balance_loss_mlp": 1.06526363, + "epoch": 0.4359368988072336, + "flos": 570296332800.0, + "grad_norm": 0.06217907852457908, + "language_loss": 0.87852293, + "learning_rate": 0.0006266489313964743, + "loss": 0.88945401, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.27880859, + "step": 2266, + "time_per_iteration": 2.720874547958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090878, + "balance_loss_mlp": 1.06338787, + "epoch": 0.4361292804924971, + "flos": 555244526592.0, + "grad_norm": 0.05517220152754215, + "language_loss": 0.85363281, + "learning_rate": 0.0006263475244344041, + "loss": 0.86454159, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.27514648, + "step": 2267, + "time_per_iteration": 2.8508987426757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089804, + "balance_loss_mlp": 1.06178975, + "epoch": 0.4363216621777607, + "flos": 557021090304.0, + "grad_norm": 0.061658084399303315, + "language_loss": 0.84817886, + "learning_rate": 0.0006260460684198746, + "loss": 0.85907692, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.28027344, + "step": 2268, + "time_per_iteration": 2.6972851753234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091639, + "balance_loss_mlp": 1.06395864, + "epoch": 0.4365140438630242, + "flos": 477979149312.0, + "grad_norm": 0.07163404822705746, + "language_loss": 0.84593827, + "learning_rate": 0.0006257445634699213, + "loss": 0.85685468, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.27734375, + "step": 2269, + "time_per_iteration": 2.562509298324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083836, + "balance_loss_mlp": 1.05565524, + "epoch": 0.4367064255482878, + "flos": 578646498816.0, + "grad_norm": 0.07106993063326117, + "language_loss": 0.82829607, + "learning_rate": 0.0006254430097015993, + "loss": 0.8391344, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.28222656, + "step": 2270, + "time_per_iteration": 2.6713523864746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054528, + "balance_loss_mlp": 1.04203498, + "epoch": 0.43689880723355135, + "flos": 1458117669888.0, + "grad_norm": 0.029151500829202304, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77533615, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.125, + "step": 2271, + "time_per_iteration": 4.761755466461182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086484, + "balance_loss_mlp": 1.05801725, + "epoch": 0.43709118891881493, + "flos": 667295663616.0, + "grad_norm": 0.05590316940209524, + "language_loss": 0.85155964, + "learning_rate": 0.0006248397561781609, + "loss": 0.86242455, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.28491211, + "step": 2272, + "time_per_iteration": 2.8541359901428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091334, + "balance_loss_mlp": 1.06246173, + "epoch": 0.43728357060407846, + "flos": 544612999680.0, + "grad_norm": 0.07335127222093174, + "language_loss": 0.8601104, + "learning_rate": 0.0006245380566572482, + "loss": 0.87102377, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.28857422, + "step": 2273, + "time_per_iteration": 2.6526312828063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090326, + "balance_loss_mlp": 1.06200182, + "epoch": 0.43747595228934205, + "flos": 746504930304.0, + "grad_norm": 0.06592567136619501, + "language_loss": 0.76039565, + "learning_rate": 0.0006242363087863744, + "loss": 0.77129889, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.28344727, + "step": 2274, + "time_per_iteration": 2.9512767791748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089474, + "balance_loss_mlp": 1.06129336, + "epoch": 0.43766833397460564, + "flos": 631060789248.0, + "grad_norm": 0.07045204489750885, + "language_loss": 0.86392975, + "learning_rate": 0.0006239345126826878, + "loss": 0.87482452, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.28198242, + "step": 2275, + "time_per_iteration": 2.818574905395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081719, + "balance_loss_mlp": 1.05236995, + "epoch": 0.43786071565986917, + "flos": 530709152256.0, + "grad_norm": 0.06271142699552738, + "language_loss": 0.8405596, + "learning_rate": 0.0006236326684633561, + "loss": 0.85137677, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.29296875, + "step": 2276, + "time_per_iteration": 2.8501060009002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088499, + "balance_loss_mlp": 1.05972195, + "epoch": 0.43805309734513276, + "flos": 538295473152.0, + "grad_norm": 0.08224081940065299, + "language_loss": 0.75057948, + "learning_rate": 0.0006233307762455658, + "loss": 0.76146448, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.28735352, + "step": 2277, + "time_per_iteration": 2.6692187786102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079787, + "balance_loss_mlp": 1.05098617, + "epoch": 0.4382454790303963, + "flos": 864188324352.0, + "grad_norm": 0.1351794781054828, + "language_loss": 0.83103114, + "learning_rate": 0.0006230288361465216, + "loss": 0.84182906, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.2878418, + "step": 2278, + "time_per_iteration": 3.0566518306732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081672, + "balance_loss_mlp": 1.05389631, + "epoch": 0.4384378607156599, + "flos": 765175292928.0, + "grad_norm": 0.0635725084076576, + "language_loss": 0.85047072, + "learning_rate": 0.0006227268482834473, + "loss": 0.86128747, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.27783203, + "step": 2279, + "time_per_iteration": 2.890195608139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086149, + "balance_loss_mlp": 1.05811095, + "epoch": 0.4386302424009234, + "flos": 668260329984.0, + "grad_norm": 0.06574285370830908, + "language_loss": 0.87371957, + "learning_rate": 0.000622424812773585, + "loss": 0.88458109, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.28076172, + "step": 2280, + "time_per_iteration": 2.820857524871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108464, + "balance_loss_mlp": 1.05698299, + "epoch": 0.438822624086187, + "flos": 484941247488.0, + "grad_norm": 0.08150674529849485, + "language_loss": 0.80050623, + "learning_rate": 0.000622122729734195, + "loss": 0.81135261, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.27685547, + "step": 2281, + "time_per_iteration": 2.5578882694244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090722, + "balance_loss_mlp": 1.06320858, + "epoch": 0.4390150057714506, + "flos": 498959986176.0, + "grad_norm": 0.05652917217777931, + "language_loss": 0.87423271, + "learning_rate": 0.0006218205992825566, + "loss": 0.88513994, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.27539062, + "step": 2282, + "time_per_iteration": 2.6367194652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087621, + "balance_loss_mlp": 1.05989254, + "epoch": 0.4392073874567141, + "flos": 557937704448.0, + "grad_norm": 0.06387466426791162, + "language_loss": 0.81580615, + "learning_rate": 0.0006215184215359671, + "loss": 0.82668239, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.27758789, + "step": 2283, + "time_per_iteration": 2.7550642490386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109022, + "balance_loss_mlp": 1.06254005, + "epoch": 0.4393997691419777, + "flos": 605028248064.0, + "grad_norm": 0.06853375358246538, + "language_loss": 0.86762869, + "learning_rate": 0.0006212161966117425, + "loss": 0.87853086, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.27709961, + "step": 2284, + "time_per_iteration": 2.7315139770507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093132, + "balance_loss_mlp": 1.06492722, + "epoch": 0.43959215082724123, + "flos": 803812363776.0, + "grad_norm": 0.06833018750237568, + "language_loss": 0.81347001, + "learning_rate": 0.0006209139246272164, + "loss": 0.82440132, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.28222656, + "step": 2285, + "time_per_iteration": 2.997727394104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085597, + "balance_loss_mlp": 1.0573678, + "epoch": 0.4397845325125048, + "flos": 487403080704.0, + "grad_norm": 0.0627571888999813, + "language_loss": 0.81454128, + "learning_rate": 0.0006206116056997421, + "loss": 0.82539719, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.28271484, + "step": 2286, + "time_per_iteration": 2.5523786544799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092851, + "balance_loss_mlp": 1.06512272, + "epoch": 0.43997691419776835, + "flos": 480569020416.0, + "grad_norm": 0.0569936252584843, + "language_loss": 0.82580131, + "learning_rate": 0.0006203092399466892, + "loss": 0.83672982, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.27783203, + "step": 2287, + "time_per_iteration": 2.5256903171539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080971, + "balance_loss_mlp": 1.05317175, + "epoch": 0.44016929588303194, + "flos": 482873702400.0, + "grad_norm": 0.052620788715243595, + "language_loss": 0.85130596, + "learning_rate": 0.0006200068274854473, + "loss": 0.86211562, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.27832031, + "step": 2288, + "time_per_iteration": 2.6666431427001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089786, + "balance_loss_mlp": 1.06108057, + "epoch": 0.4403616775682955, + "flos": 571562155008.0, + "grad_norm": 0.05493211856459023, + "language_loss": 0.85969126, + "learning_rate": 0.0006197043684334229, + "loss": 0.87058908, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.28686523, + "step": 2289, + "time_per_iteration": 2.7558815479278564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093604, + "balance_loss_mlp": 1.0652802, + "epoch": 0.44055405925355906, + "flos": 630563194368.0, + "grad_norm": 0.06713172204070075, + "language_loss": 0.7966578, + "learning_rate": 0.0006194018629080411, + "loss": 0.80759388, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.28344727, + "step": 2290, + "time_per_iteration": 2.7641310691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095567, + "balance_loss_mlp": 1.06721866, + "epoch": 0.44074644093882265, + "flos": 536523291648.0, + "grad_norm": 0.06308142018549157, + "language_loss": 0.81759441, + "learning_rate": 0.0006190993110267451, + "loss": 0.8285501, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.28393555, + "step": 2291, + "time_per_iteration": 2.759451389312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087327, + "balance_loss_mlp": 1.05959892, + "epoch": 0.4409388226240862, + "flos": 462995744256.0, + "grad_norm": 0.0663089643389441, + "language_loss": 0.84395695, + "learning_rate": 0.0006187967129069958, + "loss": 0.85483021, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.27758789, + "step": 2292, + "time_per_iteration": 2.5458216667175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108768, + "balance_loss_mlp": 1.06011844, + "epoch": 0.44113120430934977, + "flos": 565717492224.0, + "grad_norm": 0.05260179709926624, + "language_loss": 0.8707509, + "learning_rate": 0.0006184940686662722, + "loss": 0.88162768, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.27612305, + "step": 2293, + "time_per_iteration": 2.7694880962371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082669, + "balance_loss_mlp": 1.05494058, + "epoch": 0.4413235859946133, + "flos": 543313681920.0, + "grad_norm": 0.055518519655343164, + "language_loss": 0.90020764, + "learning_rate": 0.0006181913784220714, + "loss": 0.91103435, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.27758789, + "step": 2294, + "time_per_iteration": 2.6642205715179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047446, + "balance_loss_mlp": 1.03542924, + "epoch": 0.4415159676798769, + "flos": 1569016465920.0, + "grad_norm": 0.024577707308588242, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81601226, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.12011719, + "step": 2295, + "time_per_iteration": 4.874637842178345 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084239, + "balance_loss_mlp": 1.05665421, + "epoch": 0.4417083493651404, + "flos": 658423171584.0, + "grad_norm": 0.06513424306559527, + "language_loss": 0.79833972, + "learning_rate": 0.0006175858603933146, + "loss": 0.80918217, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.27612305, + "step": 2296, + "time_per_iteration": 2.9130241870880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084408, + "balance_loss_mlp": 1.05665636, + "epoch": 0.441900731050404, + "flos": 740119002624.0, + "grad_norm": 0.06251545633736988, + "language_loss": 0.80774343, + "learning_rate": 0.0006172830328438416, + "loss": 0.81858754, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.27783203, + "step": 2297, + "time_per_iteration": 2.953983783721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082645, + "balance_loss_mlp": 1.05460715, + "epoch": 0.44209311273566754, + "flos": 539153860608.0, + "grad_norm": 0.057534365085963636, + "language_loss": 0.86889625, + "learning_rate": 0.0006169801597610572, + "loss": 0.87972271, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.28051758, + "step": 2298, + "time_per_iteration": 2.7841529846191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087234, + "balance_loss_mlp": 1.05986333, + "epoch": 0.4422854944209311, + "flos": 621335702016.0, + "grad_norm": 0.0717755554401909, + "language_loss": 0.89631718, + "learning_rate": 0.0006166772412625469, + "loss": 0.90718955, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.27416992, + "step": 2299, + "time_per_iteration": 2.7750232219696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087463, + "balance_loss_mlp": 1.05983019, + "epoch": 0.4424778761061947, + "flos": 658516303872.0, + "grad_norm": 0.06473860012868299, + "language_loss": 0.81551421, + "learning_rate": 0.0006163742774659141, + "loss": 0.82638884, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.27661133, + "step": 2300, + "time_per_iteration": 2.8384482860565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092146, + "balance_loss_mlp": 1.06446528, + "epoch": 0.44267025779145824, + "flos": 568297188864.0, + "grad_norm": 0.0850959758091444, + "language_loss": 0.85627389, + "learning_rate": 0.0006160712684887801, + "loss": 0.86719531, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.27709961, + "step": 2301, + "time_per_iteration": 2.7603278160095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084767, + "balance_loss_mlp": 1.05813527, + "epoch": 0.44286263947672183, + "flos": 496469039616.0, + "grad_norm": 0.053898588417471735, + "language_loss": 0.81867981, + "learning_rate": 0.0006157682144487832, + "loss": 0.82952744, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.2668457, + "step": 2302, + "time_per_iteration": 2.7585275173187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090771, + "balance_loss_mlp": 1.06347191, + "epoch": 0.44305502116198536, + "flos": 609096347136.0, + "grad_norm": 0.05970343490953875, + "language_loss": 0.82821, + "learning_rate": 0.0006154651154635793, + "loss": 0.83911771, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.2734375, + "step": 2303, + "time_per_iteration": 4.252831697463989 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097367, + "balance_loss_mlp": 1.07040215, + "epoch": 0.44324740284724895, + "flos": 470558744064.0, + "grad_norm": 0.05697892496442649, + "language_loss": 0.8468399, + "learning_rate": 0.0006151619716508421, + "loss": 0.85781354, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.27026367, + "step": 2304, + "time_per_iteration": 2.5882937908172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102202, + "balance_loss_mlp": 1.07442617, + "epoch": 0.4434397845325125, + "flos": 578454441984.0, + "grad_norm": 0.06572201075979017, + "language_loss": 0.86751652, + "learning_rate": 0.0006148587831282625, + "loss": 0.87853855, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.27807617, + "step": 2305, + "time_per_iteration": 2.6605563163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052734, + "balance_loss_mlp": 1.04066956, + "epoch": 0.44363216621777607, + "flos": 1495765343232.0, + "grad_norm": 0.01894914693526954, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.802288, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.12060547, + "step": 2306, + "time_per_iteration": 4.910472631454468 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102108, + "balance_loss_mlp": 1.07342601, + "epoch": 0.44382454790303966, + "flos": 477082884096.0, + "grad_norm": 0.06457533715620843, + "language_loss": 0.87372738, + "learning_rate": 0.0006142522724244255, + "loss": 0.88474846, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.28686523, + "step": 2307, + "time_per_iteration": 2.5184578895568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047197, + "balance_loss_mlp": 1.03508484, + "epoch": 0.4440169295883032, + "flos": 1543321548288.0, + "grad_norm": 0.015440750347127817, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.7753191, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.12109375, + "step": 2308, + "time_per_iteration": 4.880531549453735 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104605, + "balance_loss_mlp": 1.07668638, + "epoch": 0.4442093112735668, + "flos": 590789749248.0, + "grad_norm": 0.0625118895390298, + "language_loss": 0.77304882, + "learning_rate": 0.000613645584293942, + "loss": 0.78409487, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.27954102, + "step": 2309, + "time_per_iteration": 2.888929605484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102667, + "balance_loss_mlp": 1.07522511, + "epoch": 0.4444016929588303, + "flos": 530009326080.0, + "grad_norm": 0.05626484670913178, + "language_loss": 0.82863319, + "learning_rate": 0.0006133421739881185, + "loss": 0.83965981, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.27441406, + "step": 2310, + "time_per_iteration": 2.6770823001861572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098373, + "balance_loss_mlp": 1.06966734, + "epoch": 0.4445940746440939, + "flos": 619947634176.0, + "grad_norm": 0.09114290921538859, + "language_loss": 0.82713985, + "learning_rate": 0.0006130387196789605, + "loss": 0.83812356, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.28686523, + "step": 2311, + "time_per_iteration": 2.7363758087158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110237, + "balance_loss_mlp": 1.07309198, + "epoch": 0.4447864563293574, + "flos": 628782248448.0, + "grad_norm": 0.05056880651601303, + "language_loss": 0.84359384, + "learning_rate": 0.0006127352214842795, + "loss": 0.85461748, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.29272461, + "step": 2312, + "time_per_iteration": 3.0277068614959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095618, + "balance_loss_mlp": 1.06688845, + "epoch": 0.444978838014621, + "flos": 650548841472.0, + "grad_norm": 0.06767648502511064, + "language_loss": 0.85424733, + "learning_rate": 0.0006124316795219041, + "loss": 0.8652035, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.28710938, + "step": 2313, + "time_per_iteration": 2.7824032306671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087884, + "balance_loss_mlp": 1.05996561, + "epoch": 0.44517121969988455, + "flos": 612153289728.0, + "grad_norm": 0.06031488841862457, + "language_loss": 0.8232829, + "learning_rate": 0.0006121280939096794, + "loss": 0.83416176, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.27905273, + "step": 2314, + "time_per_iteration": 2.7414164543151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087621, + "balance_loss_mlp": 1.05901051, + "epoch": 0.44536360138514813, + "flos": 488491402752.0, + "grad_norm": 0.056993316738708576, + "language_loss": 0.8765316, + "learning_rate": 0.000611824464765468, + "loss": 0.88740778, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.28613281, + "step": 2315, + "time_per_iteration": 2.5894503593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020326, + "balance_loss_mlp": 1.00830936, + "epoch": 0.4455559830704117, + "flos": 1515425255424.0, + "grad_norm": 0.018109298143921163, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79615265, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.12011719, + "step": 2316, + "time_per_iteration": 4.654959201812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081165, + "balance_loss_mlp": 1.05322254, + "epoch": 0.44574836475567525, + "flos": 615314949120.0, + "grad_norm": 0.05658516719934989, + "language_loss": 0.85440743, + "learning_rate": 0.000611217076352619, + "loss": 0.86521906, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.27978516, + "step": 2317, + "time_per_iteration": 2.8710198402404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086137, + "balance_loss_mlp": 1.05862343, + "epoch": 0.44594074644093884, + "flos": 506070471168.0, + "grad_norm": 0.062250172980488426, + "language_loss": 0.82876933, + "learning_rate": 0.0006109133173197905, + "loss": 0.8396306, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.27539062, + "step": 2318, + "time_per_iteration": 2.7298824787139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088304, + "balance_loss_mlp": 1.05986071, + "epoch": 0.44613312812620237, + "flos": 726647321088.0, + "grad_norm": 0.0706297628000491, + "language_loss": 0.85633492, + "learning_rate": 0.0006106095152265935, + "loss": 0.8672179, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.28466797, + "step": 2319, + "time_per_iteration": 2.8895695209503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108895, + "balance_loss_mlp": 1.06086433, + "epoch": 0.44632550981146596, + "flos": 635419869696.0, + "grad_norm": 0.04876785494191262, + "language_loss": 0.84747481, + "learning_rate": 0.0006103056701909739, + "loss": 0.85836434, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.28125, + "step": 2320, + "time_per_iteration": 2.9117228984832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108858, + "balance_loss_mlp": 1.05935025, + "epoch": 0.4465178914967295, + "flos": 826690447872.0, + "grad_norm": 0.06765559983355682, + "language_loss": 0.82841372, + "learning_rate": 0.0006100017823308956, + "loss": 0.8392995, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.29199219, + "step": 2321, + "time_per_iteration": 3.19189453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095794, + "balance_loss_mlp": 1.06618226, + "epoch": 0.4467102731819931, + "flos": 665532246528.0, + "grad_norm": 0.07493928757304909, + "language_loss": 0.796121, + "learning_rate": 0.0006096978517643377, + "loss": 0.80707896, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.29589844, + "step": 2322, + "time_per_iteration": 2.7803642749786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088319, + "balance_loss_mlp": 1.05825448, + "epoch": 0.4469026548672566, + "flos": 512692125696.0, + "grad_norm": 0.05979787162997368, + "language_loss": 0.83128643, + "learning_rate": 0.0006093938786092968, + "loss": 0.84216964, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.30029297, + "step": 2323, + "time_per_iteration": 2.6324985027313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084456, + "balance_loss_mlp": 1.05403399, + "epoch": 0.4470950365525202, + "flos": 683774825472.0, + "grad_norm": 0.0696967897289199, + "language_loss": 0.89752465, + "learning_rate": 0.0006090898629837857, + "loss": 0.90836924, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.30395508, + "step": 2324, + "time_per_iteration": 2.833986282348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081367, + "balance_loss_mlp": 1.05073011, + "epoch": 0.4472874182377838, + "flos": 627018831360.0, + "grad_norm": 0.05715713314103227, + "language_loss": 0.87296605, + "learning_rate": 0.0006087858050058337, + "loss": 0.88377976, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.3059082, + "step": 2325, + "time_per_iteration": 2.8220982551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082075, + "balance_loss_mlp": 1.05084252, + "epoch": 0.4474797999230473, + "flos": 546946795008.0, + "grad_norm": 0.06405768205874736, + "language_loss": 0.82704103, + "learning_rate": 0.0006084817047934866, + "loss": 0.83786178, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.31225586, + "step": 2326, + "time_per_iteration": 2.6844918727874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077775, + "balance_loss_mlp": 1.04635119, + "epoch": 0.4476721816083109, + "flos": 455585513472.0, + "grad_norm": 0.06718825176833507, + "language_loss": 0.89515507, + "learning_rate": 0.0006081775624648066, + "loss": 0.90593284, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.31396484, + "step": 2327, + "time_per_iteration": 2.5115904808044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080341, + "balance_loss_mlp": 1.04937041, + "epoch": 0.44786456329357444, + "flos": 481273228800.0, + "grad_norm": 0.06388622036462539, + "language_loss": 0.82659936, + "learning_rate": 0.0006078733781378721, + "loss": 0.83740276, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.30957031, + "step": 2328, + "time_per_iteration": 2.5578174591064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107055, + "balance_loss_mlp": 1.04003251, + "epoch": 0.448056944978838, + "flos": 551822409216.0, + "grad_norm": 0.05909371510774122, + "language_loss": 0.82426572, + "learning_rate": 0.0006075691519307781, + "loss": 0.83497119, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.3046875, + "step": 2329, + "time_per_iteration": 2.9271137714385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071371, + "balance_loss_mlp": 1.04025745, + "epoch": 0.44824932666410156, + "flos": 550571143680.0, + "grad_norm": 0.0899878860138525, + "language_loss": 0.81604564, + "learning_rate": 0.0006072648839616356, + "loss": 0.8267594, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.31103516, + "step": 2330, + "time_per_iteration": 2.642164945602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069213, + "balance_loss_mlp": 1.03805184, + "epoch": 0.44844170834936514, + "flos": 988161541632.0, + "grad_norm": 0.05660389796161562, + "language_loss": 0.82544589, + "learning_rate": 0.0006069605743485718, + "loss": 0.83613807, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.3112793, + "step": 2331, + "time_per_iteration": 3.3559155464172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107864, + "balance_loss_mlp": 1.04945791, + "epoch": 0.44863409003462873, + "flos": 591040032768.0, + "grad_norm": 0.06166347857347268, + "language_loss": 0.83528912, + "learning_rate": 0.0006066562232097303, + "loss": 0.84607553, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.29125977, + "step": 2332, + "time_per_iteration": 2.7531135082244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107678, + "balance_loss_mlp": 1.0468111, + "epoch": 0.44882647171989226, + "flos": 724313525760.0, + "grad_norm": 0.0526351904833897, + "language_loss": 0.86127633, + "learning_rate": 0.0006063518306632708, + "loss": 0.87204421, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.29907227, + "step": 2333, + "time_per_iteration": 2.957057476043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080344, + "balance_loss_mlp": 1.05044627, + "epoch": 0.44901885340515585, + "flos": 534662360064.0, + "grad_norm": 0.07121293699241546, + "language_loss": 0.82098341, + "learning_rate": 0.0006060473968273688, + "loss": 0.83178687, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.29882812, + "step": 2334, + "time_per_iteration": 2.687427043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050724, + "balance_loss_mlp": 1.03756309, + "epoch": 0.4492112350904194, + "flos": 1554456462336.0, + "grad_norm": 0.03308553204338399, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.78930265, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.13183594, + "step": 2335, + "time_per_iteration": 4.873494625091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027356, + "balance_loss_mlp": 1.01476717, + "epoch": 0.44940361677568297, + "flos": 1522525413888.0, + "grad_norm": 0.020404135430742085, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82032573, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.12597656, + "step": 2336, + "time_per_iteration": 4.8493242263793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091959, + "balance_loss_mlp": 1.06327689, + "epoch": 0.4495959984609465, + "flos": 382289310720.0, + "grad_norm": 0.08823378464345366, + "language_loss": 0.8815735, + "learning_rate": 0.0006051338487650047, + "loss": 0.89249313, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.28686523, + "step": 2337, + "time_per_iteration": 2.4994585514068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094323, + "balance_loss_mlp": 1.06595135, + "epoch": 0.4497883801462101, + "flos": 497630145024.0, + "grad_norm": 0.058014135330130424, + "language_loss": 0.82146972, + "learning_rate": 0.0006048292509534095, + "loss": 0.83241296, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.28344727, + "step": 2338, + "time_per_iteration": 2.6184592247009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099625, + "balance_loss_mlp": 1.07211113, + "epoch": 0.4499807618314736, + "flos": 614166990336.0, + "grad_norm": 0.056454767026620875, + "language_loss": 0.77617335, + "learning_rate": 0.0006045246124434895, + "loss": 0.78716958, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.27539062, + "step": 2339, + "time_per_iteration": 2.7225115299224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100795, + "balance_loss_mlp": 1.07309031, + "epoch": 0.4501731435167372, + "flos": 1005122331648.0, + "grad_norm": 0.09896135571333878, + "language_loss": 0.86173731, + "learning_rate": 0.0006042199333535162, + "loss": 0.87274528, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.27709961, + "step": 2340, + "time_per_iteration": 3.274585008621216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104864, + "balance_loss_mlp": 1.07768369, + "epoch": 0.4503655252020008, + "flos": 820519898112.0, + "grad_norm": 0.05749680267159243, + "language_loss": 0.84251344, + "learning_rate": 0.0006039152138017763, + "loss": 0.85356206, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.27246094, + "step": 2341, + "time_per_iteration": 3.060763359069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105096, + "balance_loss_mlp": 1.07796395, + "epoch": 0.4505579068872643, + "flos": 486113937408.0, + "grad_norm": 0.056134576893582644, + "language_loss": 0.83558077, + "learning_rate": 0.0006036104539065726, + "loss": 0.84663171, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.27172852, + "step": 2342, + "time_per_iteration": 2.7406816482543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108201, + "balance_loss_mlp": 1.08054459, + "epoch": 0.4507502885725279, + "flos": 884421282816.0, + "grad_norm": 0.061859527889038764, + "language_loss": 0.84472108, + "learning_rate": 0.000603305653786223, + "loss": 0.85580313, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.27685547, + "step": 2343, + "time_per_iteration": 3.197312355041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100913, + "balance_loss_mlp": 1.07354283, + "epoch": 0.45094267025779144, + "flos": 578070328320.0, + "grad_norm": 0.054371913691722666, + "language_loss": 0.83979696, + "learning_rate": 0.0006030008135590622, + "loss": 0.85080612, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.27416992, + "step": 2344, + "time_per_iteration": 2.7234296798706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097762, + "balance_loss_mlp": 1.07062995, + "epoch": 0.45113505194305503, + "flos": 525124947456.0, + "grad_norm": 0.05301123134364682, + "language_loss": 0.8020395, + "learning_rate": 0.0006026959333434387, + "loss": 0.81301707, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.27172852, + "step": 2345, + "time_per_iteration": 2.7582781314849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109947, + "balance_loss_mlp": 1.0720278, + "epoch": 0.45132743362831856, + "flos": 501791376384.0, + "grad_norm": 0.056237590740745906, + "language_loss": 0.77273649, + "learning_rate": 0.0006023910132577181, + "loss": 0.78373116, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.2746582, + "step": 2346, + "time_per_iteration": 2.658339262008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086046, + "balance_loss_mlp": 1.05915189, + "epoch": 0.45151981531358215, + "flos": 431690328576.0, + "grad_norm": 0.061957652789735564, + "language_loss": 0.84835315, + "learning_rate": 0.0006020860534202806, + "loss": 0.85921359, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.26953125, + "step": 2347, + "time_per_iteration": 2.5046098232269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010926, + "balance_loss_mlp": 1.06475294, + "epoch": 0.4517121969988457, + "flos": 711826859520.0, + "grad_norm": 0.05205934628014934, + "language_loss": 0.80817962, + "learning_rate": 0.0006017810539495224, + "loss": 0.81910563, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.27905273, + "step": 2348, + "time_per_iteration": 2.9269816875457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094292, + "balance_loss_mlp": 1.06642056, + "epoch": 0.45190457868410927, + "flos": 579197938176.0, + "grad_norm": 0.0701488599790333, + "language_loss": 0.82789373, + "learning_rate": 0.0006014760149638547, + "loss": 0.83883661, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.27880859, + "step": 2349, + "time_per_iteration": 2.725395441055298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085621, + "balance_loss_mlp": 1.05837011, + "epoch": 0.45209696036937286, + "flos": 482415395328.0, + "grad_norm": 0.05676126010630497, + "language_loss": 0.88258755, + "learning_rate": 0.000601170936581704, + "loss": 0.89344376, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.27270508, + "step": 2350, + "time_per_iteration": 2.5604915618896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088839, + "balance_loss_mlp": 1.06101537, + "epoch": 0.4522893420546364, + "flos": 539945409024.0, + "grad_norm": 0.07551987134141444, + "language_loss": 0.84626472, + "learning_rate": 0.0006008658189215121, + "loss": 0.85715318, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.27832031, + "step": 2351, + "time_per_iteration": 2.6299045085906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100125, + "balance_loss_mlp": 1.07158601, + "epoch": 0.4524817237399, + "flos": 496423959552.0, + "grad_norm": 0.07553479525673996, + "language_loss": 0.79898262, + "learning_rate": 0.0006005606621017366, + "loss": 0.80998385, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.28540039, + "step": 2352, + "time_per_iteration": 2.58725905418396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095957, + "balance_loss_mlp": 1.06732249, + "epoch": 0.4526741054251635, + "flos": 652229300736.0, + "grad_norm": 0.05769795994016392, + "language_loss": 0.8022939, + "learning_rate": 0.0006002554662408496, + "loss": 0.81325346, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.28637695, + "step": 2353, + "time_per_iteration": 2.9054527282714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089231, + "balance_loss_mlp": 1.06078792, + "epoch": 0.4528664871104271, + "flos": 570674654208.0, + "grad_norm": 0.07238968138349489, + "language_loss": 0.91292691, + "learning_rate": 0.0005999502314573388, + "loss": 0.92381918, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.28393555, + "step": 2354, + "time_per_iteration": 2.6389734745025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085728, + "balance_loss_mlp": 1.05656958, + "epoch": 0.45305886879569063, + "flos": 458480922624.0, + "grad_norm": 0.0719451372015111, + "language_loss": 0.86045247, + "learning_rate": 0.0005996449578697066, + "loss": 0.87130976, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.29174805, + "step": 2355, + "time_per_iteration": 2.6851072311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094315, + "balance_loss_mlp": 1.06634867, + "epoch": 0.4532512504809542, + "flos": 504922512384.0, + "grad_norm": 0.05612545408526447, + "language_loss": 0.81111002, + "learning_rate": 0.0005993396455964709, + "loss": 0.82205319, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.2800293, + "step": 2356, + "time_per_iteration": 2.6760780811309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095343, + "balance_loss_mlp": 1.06754375, + "epoch": 0.4534436321662178, + "flos": 581940578304.0, + "grad_norm": 0.05702970789361519, + "language_loss": 0.81782162, + "learning_rate": 0.0005990342947561647, + "loss": 0.82877505, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.27856445, + "step": 2357, + "time_per_iteration": 2.6935572624206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108513, + "balance_loss_mlp": 1.07949746, + "epoch": 0.45363601385148133, + "flos": 549458090496.0, + "grad_norm": 0.06168719534303639, + "language_loss": 0.77822679, + "learning_rate": 0.0005987289054673351, + "loss": 0.78931195, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.28979492, + "step": 2358, + "time_per_iteration": 2.6254196166992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191784, + "balance_loss_mlp": 1.18038785, + "epoch": 0.4538283955367449, + "flos": 1473754411008.0, + "grad_norm": 0.06020491976481073, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77767521, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.11376953, + "step": 2359, + "time_per_iteration": 4.803730010986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112502, + "balance_loss_mlp": 1.08300948, + "epoch": 0.45402077722200845, + "flos": 584441699328.0, + "grad_norm": 0.06904936924963041, + "language_loss": 0.90802431, + "learning_rate": 0.0005981180120183722, + "loss": 0.91914928, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.29443359, + "step": 2360, + "time_per_iteration": 2.672501564025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115124, + "balance_loss_mlp": 1.08560812, + "epoch": 0.45421315890727204, + "flos": 531462822912.0, + "grad_norm": 0.18994365983189826, + "language_loss": 0.85107553, + "learning_rate": 0.0005978125080954089, + "loss": 0.86222672, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.29492188, + "step": 2361, + "time_per_iteration": 2.7426631450653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111841, + "balance_loss_mlp": 1.0814904, + "epoch": 0.4544055405925356, + "flos": 784890307584.0, + "grad_norm": 0.07946717837388541, + "language_loss": 0.76933616, + "learning_rate": 0.000597506966198262, + "loss": 0.78045452, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.30297852, + "step": 2362, + "time_per_iteration": 2.9498252868652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113617, + "balance_loss_mlp": 1.08438706, + "epoch": 0.45459792227779916, + "flos": 517950443520.0, + "grad_norm": 0.08220053414262748, + "language_loss": 0.83964276, + "learning_rate": 0.0005972013864455536, + "loss": 0.85077894, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.29199219, + "step": 2363, + "time_per_iteration": 2.623084545135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112763, + "balance_loss_mlp": 1.0844152, + "epoch": 0.4547903039630627, + "flos": 537306075648.0, + "grad_norm": 0.07689777421943021, + "language_loss": 0.84891784, + "learning_rate": 0.0005968957689559203, + "loss": 0.86004549, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.28369141, + "step": 2364, + "time_per_iteration": 4.15172266960144 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103083, + "balance_loss_mlp": 1.07492638, + "epoch": 0.4549826856483263, + "flos": 528423409152.0, + "grad_norm": 0.0791653109712497, + "language_loss": 0.88481373, + "learning_rate": 0.0005965901138480131, + "loss": 0.89584458, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.28173828, + "step": 2365, + "time_per_iteration": 2.5800631046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097109, + "balance_loss_mlp": 1.06840384, + "epoch": 0.45517506733358987, + "flos": 520649413632.0, + "grad_norm": 0.06578783357270249, + "language_loss": 0.87197572, + "learning_rate": 0.0005962844212404982, + "loss": 0.88294685, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.28686523, + "step": 2366, + "time_per_iteration": 2.6940040588378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091654, + "balance_loss_mlp": 1.06344962, + "epoch": 0.4553674490188534, + "flos": 450814616064.0, + "grad_norm": 0.05998271622094208, + "language_loss": 0.86890531, + "learning_rate": 0.0005959786912520558, + "loss": 0.87982178, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.2824707, + "step": 2367, + "time_per_iteration": 2.65913987159729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096727, + "balance_loss_mlp": 1.06854558, + "epoch": 0.455559830704117, + "flos": 546308015616.0, + "grad_norm": 0.04792571197867491, + "language_loss": 0.83765805, + "learning_rate": 0.0005956729240013806, + "loss": 0.8486253, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.28173828, + "step": 2368, + "time_per_iteration": 2.8546009063720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108894, + "balance_loss_mlp": 1.08035553, + "epoch": 0.4557522123893805, + "flos": 583491589632.0, + "grad_norm": 0.054790339147135006, + "language_loss": 0.91898453, + "learning_rate": 0.0005953671196071824, + "loss": 0.93007344, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.28540039, + "step": 2369, + "time_per_iteration": 2.7034096717834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115288, + "balance_loss_mlp": 1.08767939, + "epoch": 0.4559445940746441, + "flos": 526149250560.0, + "grad_norm": 0.05736115779957956, + "language_loss": 0.79610699, + "learning_rate": 0.0005950612781881846, + "loss": 0.8072598, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.27636719, + "step": 2370, + "time_per_iteration": 2.707674264907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124856, + "balance_loss_mlp": 1.09662771, + "epoch": 0.45613697575990764, + "flos": 651810281472.0, + "grad_norm": 0.08139155344435882, + "language_loss": 0.75630575, + "learning_rate": 0.0005947553998631259, + "loss": 0.76755428, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.2824707, + "step": 2371, + "time_per_iteration": 2.8811731338500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125619, + "balance_loss_mlp": 1.09770048, + "epoch": 0.4563293574451712, + "flos": 866744699904.0, + "grad_norm": 0.07117752980456016, + "language_loss": 0.79090154, + "learning_rate": 0.000594449484750758, + "loss": 0.80215776, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.27905273, + "step": 2372, + "time_per_iteration": 3.1549901962280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116997, + "balance_loss_mlp": 1.08807683, + "epoch": 0.45652173913043476, + "flos": 497817819648.0, + "grad_norm": 0.061849801440599636, + "language_loss": 0.82697588, + "learning_rate": 0.0005941435329698484, + "loss": 0.83814585, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.2890625, + "step": 2373, + "time_per_iteration": 2.6593072414398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118584, + "balance_loss_mlp": 1.09054554, + "epoch": 0.45671412081569834, + "flos": 560581420032.0, + "grad_norm": 0.06278217801879041, + "language_loss": 0.83130741, + "learning_rate": 0.0005938375446391778, + "loss": 0.8424933, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.28051758, + "step": 2374, + "time_per_iteration": 2.7434608936309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124198, + "balance_loss_mlp": 1.09563541, + "epoch": 0.45690650250096193, + "flos": 502873906176.0, + "grad_norm": 0.06820583935841042, + "language_loss": 0.89043015, + "learning_rate": 0.0005935315198775415, + "loss": 0.90167212, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.28540039, + "step": 2375, + "time_per_iteration": 2.6057205200195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113332, + "balance_loss_mlp": 1.08558059, + "epoch": 0.45709888418622546, + "flos": 430473968640.0, + "grad_norm": 0.07601718344596131, + "language_loss": 0.87262166, + "learning_rate": 0.0005932254588037486, + "loss": 0.88375497, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.27783203, + "step": 2376, + "time_per_iteration": 2.4881751537323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103499, + "balance_loss_mlp": 1.07462692, + "epoch": 0.45729126587148905, + "flos": 525395579904.0, + "grad_norm": 0.07182864232109534, + "language_loss": 0.86405516, + "learning_rate": 0.000592919361536623, + "loss": 0.87509012, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.28857422, + "step": 2377, + "time_per_iteration": 2.6453545093536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101831, + "balance_loss_mlp": 1.07376885, + "epoch": 0.4574836475567526, + "flos": 637717349376.0, + "grad_norm": 0.06032083182665244, + "language_loss": 0.88920552, + "learning_rate": 0.0005926132281950017, + "loss": 0.90022385, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.28076172, + "step": 2378, + "time_per_iteration": 2.7356886863708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096599, + "balance_loss_mlp": 1.0672735, + "epoch": 0.45767602924201617, + "flos": 649288811520.0, + "grad_norm": 0.07556174313152972, + "language_loss": 0.8485238, + "learning_rate": 0.0005923070588977367, + "loss": 0.8594898, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.29248047, + "step": 2379, + "time_per_iteration": 2.812110185623169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095202, + "balance_loss_mlp": 1.0665921, + "epoch": 0.4578684109272797, + "flos": 746356543488.0, + "grad_norm": 0.0597594421207511, + "language_loss": 0.86065739, + "learning_rate": 0.0005920008537636931, + "loss": 0.87160945, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.28613281, + "step": 2380, + "time_per_iteration": 2.8955793380737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094751, + "balance_loss_mlp": 1.06518722, + "epoch": 0.4580607926125433, + "flos": 641155433472.0, + "grad_norm": 0.08202954174104495, + "language_loss": 0.86535549, + "learning_rate": 0.0005916946129117504, + "loss": 0.87630302, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.29516602, + "step": 2381, + "time_per_iteration": 2.8850152492523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089485, + "balance_loss_mlp": 1.05958724, + "epoch": 0.4582531742978069, + "flos": 801513474048.0, + "grad_norm": 0.06022733145419036, + "language_loss": 0.80483937, + "learning_rate": 0.0005913883364608017, + "loss": 0.81573421, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.29833984, + "step": 2382, + "time_per_iteration": 3.0977792739868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092347, + "balance_loss_mlp": 1.06225872, + "epoch": 0.4584455559830704, + "flos": 683991613440.0, + "grad_norm": 0.07912283694355432, + "language_loss": 0.88849449, + "learning_rate": 0.0005910820245297542, + "loss": 0.899418, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.30053711, + "step": 2383, + "time_per_iteration": 2.905977964401245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081098, + "balance_loss_mlp": 1.05055714, + "epoch": 0.458637937668334, + "flos": 517902391296.0, + "grad_norm": 0.06971122212551431, + "language_loss": 0.810808, + "learning_rate": 0.000590775677237529, + "loss": 0.82161897, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.30517578, + "step": 2384, + "time_per_iteration": 2.7233986854553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078055, + "balance_loss_mlp": 1.04810929, + "epoch": 0.4588303193535975, + "flos": 505242607104.0, + "grad_norm": 0.10145803635005178, + "language_loss": 0.79860461, + "learning_rate": 0.0005904692947030601, + "loss": 0.80938518, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.29882812, + "step": 2385, + "time_per_iteration": 2.602890729904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076489, + "balance_loss_mlp": 1.04647207, + "epoch": 0.4590227010388611, + "flos": 495655732224.0, + "grad_norm": 0.08299143875661358, + "language_loss": 0.89372921, + "learning_rate": 0.0005901628770452963, + "loss": 0.90449417, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.29956055, + "step": 2386, + "time_per_iteration": 2.56011700630188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075782, + "balance_loss_mlp": 1.04586029, + "epoch": 0.45921508272412465, + "flos": 493375781376.0, + "grad_norm": 0.05953614440228025, + "language_loss": 0.87499726, + "learning_rate": 0.000589856424383199, + "loss": 0.88575506, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.29882812, + "step": 2387, + "time_per_iteration": 2.622857093811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078237, + "balance_loss_mlp": 1.04762435, + "epoch": 0.45940746440938823, + "flos": 691096306176.0, + "grad_norm": 0.06461384040637212, + "language_loss": 0.8283028, + "learning_rate": 0.000589549936835744, + "loss": 0.83908516, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.30566406, + "step": 2388, + "time_per_iteration": 2.9280176162719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082083, + "balance_loss_mlp": 1.0514698, + "epoch": 0.45959984609465176, + "flos": 503489364480.0, + "grad_norm": 0.07025219360641571, + "language_loss": 0.79160953, + "learning_rate": 0.0005892434145219202, + "loss": 0.80243033, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.30566406, + "step": 2389, + "time_per_iteration": 2.632772207260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081464, + "balance_loss_mlp": 1.050946, + "epoch": 0.45979222777991535, + "flos": 676339863552.0, + "grad_norm": 0.060348492919292666, + "language_loss": 0.82535923, + "learning_rate": 0.0005889368575607303, + "loss": 0.83617389, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.3046875, + "step": 2390, + "time_per_iteration": 2.815487861633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094579, + "balance_loss_mlp": 1.06358492, + "epoch": 0.45998460946517894, + "flos": 777308368896.0, + "grad_norm": 0.05491617941274289, + "language_loss": 0.78348118, + "learning_rate": 0.00058863026607119, + "loss": 0.79442704, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.30957031, + "step": 2391, + "time_per_iteration": 3.0853166580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092193, + "balance_loss_mlp": 1.0620811, + "epoch": 0.46017699115044247, + "flos": 851073053184.0, + "grad_norm": 0.05825671270919626, + "language_loss": 0.79661655, + "learning_rate": 0.0005883236401723287, + "loss": 0.80753851, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.30078125, + "step": 2392, + "time_per_iteration": 3.1643104553222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096169, + "balance_loss_mlp": 1.06536531, + "epoch": 0.46036937283570606, + "flos": 575608495104.0, + "grad_norm": 0.06457998167472197, + "language_loss": 0.84046978, + "learning_rate": 0.0005880169799831893, + "loss": 0.85143149, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.30761719, + "step": 2393, + "time_per_iteration": 2.6935391426086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096173, + "balance_loss_mlp": 1.0654645, + "epoch": 0.4605617545209696, + "flos": 611553798144.0, + "grad_norm": 0.06354744392782355, + "language_loss": 0.81838334, + "learning_rate": 0.0005877102856228278, + "loss": 0.82934511, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.30664062, + "step": 2394, + "time_per_iteration": 2.8314805030822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097821, + "balance_loss_mlp": 1.06713629, + "epoch": 0.4607541362062332, + "flos": 532884386304.0, + "grad_norm": 0.0665210460005036, + "language_loss": 0.84696203, + "learning_rate": 0.0005874035572103133, + "loss": 0.8579402, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.30664062, + "step": 2395, + "time_per_iteration": 2.6893725395202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098408, + "balance_loss_mlp": 1.0673902, + "epoch": 0.4609465178914967, + "flos": 647023417344.0, + "grad_norm": 0.1082823786036068, + "language_loss": 0.82554322, + "learning_rate": 0.0005870967948647288, + "loss": 0.83652729, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.30981445, + "step": 2396, + "time_per_iteration": 2.7625200748443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191183, + "balance_loss_mlp": 1.1745894, + "epoch": 0.4611388995767603, + "flos": 1465487202816.0, + "grad_norm": 0.05861502253959749, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75499487, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.16601562, + "step": 2397, + "time_per_iteration": 5.363407850265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090965, + "balance_loss_mlp": 1.06028056, + "epoch": 0.46133128126202383, + "flos": 722772688896.0, + "grad_norm": 0.08876233940236913, + "language_loss": 0.85477209, + "learning_rate": 0.0005864831688507443, + "loss": 0.86568171, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.30639648, + "step": 2398, + "time_per_iteration": 2.9619805812835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081398, + "balance_loss_mlp": 1.05119061, + "epoch": 0.4615236629472874, + "flos": 547735371264.0, + "grad_norm": 0.06931834879873142, + "language_loss": 0.75342947, + "learning_rate": 0.0005861763054205754, + "loss": 0.76424348, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.30151367, + "step": 2399, + "time_per_iteration": 2.7531988620758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091818, + "balance_loss_mlp": 1.06213522, + "epoch": 0.461716044632551, + "flos": 601942192128.0, + "grad_norm": 0.05751461156756605, + "language_loss": 0.80467141, + "learning_rate": 0.0005858694085337976, + "loss": 0.81558955, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.29614258, + "step": 2400, + "time_per_iteration": 2.814182758331299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083104, + "balance_loss_mlp": 1.05246735, + "epoch": 0.46190842631781454, + "flos": 474236937216.0, + "grad_norm": 0.07664119673877032, + "language_loss": 0.8354007, + "learning_rate": 0.0005855624783095589, + "loss": 0.8462317, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.30615234, + "step": 2401, + "time_per_iteration": 2.57083797454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083538, + "balance_loss_mlp": 1.05414128, + "epoch": 0.4621008080030781, + "flos": 437254184448.0, + "grad_norm": 0.06712435829168825, + "language_loss": 0.85380065, + "learning_rate": 0.00058525551486702, + "loss": 0.864636, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.29370117, + "step": 2402, + "time_per_iteration": 2.554870843887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084683, + "balance_loss_mlp": 1.05476141, + "epoch": 0.46229318968834165, + "flos": 525203523072.0, + "grad_norm": 0.06447976336023753, + "language_loss": 0.80940902, + "learning_rate": 0.0005849485183253548, + "loss": 0.82025588, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.29882812, + "step": 2403, + "time_per_iteration": 2.6398868560791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108489, + "balance_loss_mlp": 1.05546916, + "epoch": 0.46248557137360524, + "flos": 439395922944.0, + "grad_norm": 0.07099246909711197, + "language_loss": 0.87546206, + "learning_rate": 0.0005846414888037501, + "loss": 0.88631094, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.29345703, + "step": 2404, + "time_per_iteration": 2.5056095123291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086728, + "balance_loss_mlp": 1.05725932, + "epoch": 0.4626779530588688, + "flos": 617318475264.0, + "grad_norm": 0.052798237228442416, + "language_loss": 0.82345319, + "learning_rate": 0.0005843344264214049, + "loss": 0.83432049, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.29443359, + "step": 2405, + "time_per_iteration": 2.7549078464508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091326, + "balance_loss_mlp": 1.06176221, + "epoch": 0.46287033474413236, + "flos": 669796784640.0, + "grad_norm": 0.05337180485738099, + "language_loss": 0.84920704, + "learning_rate": 0.0005840273312975317, + "loss": 0.8601203, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.29516602, + "step": 2406, + "time_per_iteration": 2.9058027267456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085122, + "balance_loss_mlp": 1.05577278, + "epoch": 0.46306271642939595, + "flos": 479992849920.0, + "grad_norm": 0.05333458165520064, + "language_loss": 0.89626014, + "learning_rate": 0.0005837202035513555, + "loss": 0.90711135, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.29345703, + "step": 2407, + "time_per_iteration": 2.5721802711486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094311, + "balance_loss_mlp": 1.06531978, + "epoch": 0.4632550981146595, + "flos": 580395359232.0, + "grad_norm": 0.0552743160267319, + "language_loss": 0.81124538, + "learning_rate": 0.0005834130433021136, + "loss": 0.8221885, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.28930664, + "step": 2408, + "time_per_iteration": 2.7402079105377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109161, + "balance_loss_mlp": 1.06166446, + "epoch": 0.46344747979992307, + "flos": 523701974016.0, + "grad_norm": 0.09526074365649402, + "language_loss": 0.73246038, + "learning_rate": 0.0005831058506690563, + "loss": 0.74337649, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.29931641, + "step": 2409, + "time_per_iteration": 2.6229617595672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088655, + "balance_loss_mlp": 1.05875707, + "epoch": 0.4636398614851866, + "flos": 746174661120.0, + "grad_norm": 0.061078353708003665, + "language_loss": 0.85864687, + "learning_rate": 0.0005827986257714464, + "loss": 0.86953342, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.29858398, + "step": 2410, + "time_per_iteration": 2.9352338314056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094131, + "balance_loss_mlp": 1.06404257, + "epoch": 0.4638322431704502, + "flos": 596273619456.0, + "grad_norm": 0.05695764594036898, + "language_loss": 0.88375425, + "learning_rate": 0.0005824913687285591, + "loss": 0.89469558, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.30078125, + "step": 2411, + "time_per_iteration": 2.6807737350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097526, + "balance_loss_mlp": 1.06698477, + "epoch": 0.4640246248557137, + "flos": 539172799488.0, + "grad_norm": 0.0643729084989199, + "language_loss": 0.81849819, + "learning_rate": 0.0005821840796596821, + "loss": 0.82947344, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.30493164, + "step": 2412, + "time_per_iteration": 2.663177967071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096211, + "balance_loss_mlp": 1.0657649, + "epoch": 0.4642170065409773, + "flos": 562330280448.0, + "grad_norm": 0.07601159389817994, + "language_loss": 0.80307502, + "learning_rate": 0.0005818767586841158, + "loss": 0.81403708, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.30419922, + "step": 2413, + "time_per_iteration": 2.7600111961364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092616, + "balance_loss_mlp": 1.06233692, + "epoch": 0.46440938822624084, + "flos": 530684421120.0, + "grad_norm": 0.059484167412089096, + "language_loss": 0.86110759, + "learning_rate": 0.0005815694059211726, + "loss": 0.87203372, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.30249023, + "step": 2414, + "time_per_iteration": 2.65578031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148176, + "balance_loss_mlp": 1.13263142, + "epoch": 0.4646017699115044, + "flos": 1525503780864.0, + "grad_norm": 0.0462911781552321, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.82021809, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.15527344, + "step": 2415, + "time_per_iteration": 4.8046934604644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115513, + "balance_loss_mlp": 1.10092187, + "epoch": 0.464794151596768, + "flos": 1539999765504.0, + "grad_norm": 0.038481348382240925, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78060573, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.14550781, + "step": 2416, + "time_per_iteration": 4.977246999740601 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085994, + "balance_loss_mlp": 1.05554748, + "epoch": 0.46498653328203154, + "flos": 501200649216.0, + "grad_norm": 0.07046148078843767, + "language_loss": 0.85802382, + "learning_rate": 0.0005806471581013931, + "loss": 0.86888373, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.30395508, + "step": 2417, + "time_per_iteration": 2.7680604457855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084029, + "balance_loss_mlp": 1.05363095, + "epoch": 0.46517891496729513, + "flos": 675856825344.0, + "grad_norm": 0.061868019756872866, + "language_loss": 0.78540701, + "learning_rate": 0.0005803396793823146, + "loss": 0.7962473, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.30371094, + "step": 2418, + "time_per_iteration": 2.818821430206299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081583, + "balance_loss_mlp": 1.05213845, + "epoch": 0.46537129665255866, + "flos": 585062949888.0, + "grad_norm": 0.08069009721002836, + "language_loss": 0.8594386, + "learning_rate": 0.0005800321694726065, + "loss": 0.8702544, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.29418945, + "step": 2419, + "time_per_iteration": 2.812563896179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085011, + "balance_loss_mlp": 1.05454159, + "epoch": 0.46556367833782225, + "flos": 587425858560.0, + "grad_norm": 0.061646313113324705, + "language_loss": 0.86883628, + "learning_rate": 0.0005797246284916545, + "loss": 0.87968636, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.30444336, + "step": 2420, + "time_per_iteration": 2.6945559978485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_mlp": 1.02332675, + "epoch": 0.4657560600230858, + "flos": 1484674099200.0, + "grad_norm": 0.024509703594541715, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78539675, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.11181641, + "step": 2421, + "time_per_iteration": 5.001375436782837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089527, + "balance_loss_mlp": 1.06036878, + "epoch": 0.46594844170834937, + "flos": 579961783296.0, + "grad_norm": 0.07023208249232396, + "language_loss": 0.8781141, + "learning_rate": 0.0005791094537936233, + "loss": 0.88900936, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.29150391, + "step": 2422, + "time_per_iteration": 2.703678846359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010888, + "balance_loss_mlp": 1.06028509, + "epoch": 0.4661408233936129, + "flos": 512322568704.0, + "grad_norm": 0.06283657209164231, + "language_loss": 0.817285, + "learning_rate": 0.0005788018203153762, + "loss": 0.82817304, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.28515625, + "step": 2423, + "time_per_iteration": 2.6398653984069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081237, + "balance_loss_mlp": 1.05255485, + "epoch": 0.4663332050788765, + "flos": 490839754752.0, + "grad_norm": 0.0646507393923986, + "language_loss": 0.85720015, + "learning_rate": 0.000578494156243549, + "loss": 0.86801249, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.28686523, + "step": 2424, + "time_per_iteration": 2.6061441898345947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086736, + "balance_loss_mlp": 1.05695724, + "epoch": 0.4665255867641401, + "flos": 512353092096.0, + "grad_norm": 0.05149395612804314, + "language_loss": 0.89174867, + "learning_rate": 0.0005781864616975878, + "loss": 0.90261602, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.29736328, + "step": 2425, + "time_per_iteration": 2.7073817253112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087687, + "balance_loss_mlp": 1.05917215, + "epoch": 0.4667179684494036, + "flos": 424590018048.0, + "grad_norm": 0.0742004751674347, + "language_loss": 0.84101117, + "learning_rate": 0.0005778787367969502, + "loss": 0.85188806, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.28515625, + "step": 2426, + "time_per_iteration": 2.643342971801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082589, + "balance_loss_mlp": 1.05374038, + "epoch": 0.4669103501346672, + "flos": 707640897024.0, + "grad_norm": 0.05195761556147334, + "language_loss": 0.80815637, + "learning_rate": 0.0005775709816611053, + "loss": 0.81898224, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.28857422, + "step": 2427, + "time_per_iteration": 3.0103423595428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085111, + "balance_loss_mlp": 1.05604792, + "epoch": 0.4671027318199307, + "flos": 554554874880.0, + "grad_norm": 0.05192902090033842, + "language_loss": 0.83742678, + "learning_rate": 0.0005772631964095346, + "loss": 0.84827781, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.29003906, + "step": 2428, + "time_per_iteration": 4.2191994190216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010894, + "balance_loss_mlp": 1.06107569, + "epoch": 0.4672951135051943, + "flos": 566839309824.0, + "grad_norm": 0.05894584384100732, + "language_loss": 0.85613596, + "learning_rate": 0.000576955381161731, + "loss": 0.86702996, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.28320312, + "step": 2429, + "time_per_iteration": 2.7035927772521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082394, + "balance_loss_mlp": 1.05297327, + "epoch": 0.46748749519045785, + "flos": 424294654464.0, + "grad_norm": 0.07711305585297333, + "language_loss": 0.8606714, + "learning_rate": 0.0005766475360371985, + "loss": 0.87149525, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.29394531, + "step": 2430, + "time_per_iteration": 2.5702948570251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092231, + "balance_loss_mlp": 1.06292963, + "epoch": 0.46767987687572143, + "flos": 538088859648.0, + "grad_norm": 0.08342834969675962, + "language_loss": 0.84959614, + "learning_rate": 0.0005763396611554536, + "loss": 0.86051846, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.29248047, + "step": 2431, + "time_per_iteration": 2.6236841678619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092277, + "balance_loss_mlp": 1.06383383, + "epoch": 0.467872258560985, + "flos": 823360052736.0, + "grad_norm": 0.06223220956170435, + "language_loss": 0.80269897, + "learning_rate": 0.0005760317566360237, + "loss": 0.81362176, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.28466797, + "step": 2432, + "time_per_iteration": 3.0205023288726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084277, + "balance_loss_mlp": 1.0559535, + "epoch": 0.46806464024624855, + "flos": 661366632960.0, + "grad_norm": 0.058294757950733474, + "language_loss": 0.85130137, + "learning_rate": 0.000575723822598448, + "loss": 0.86214417, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.28295898, + "step": 2433, + "time_per_iteration": 2.79516339302063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086726, + "balance_loss_mlp": 1.05866385, + "epoch": 0.46825702193151214, + "flos": 755362865664.0, + "grad_norm": 0.06256497191901454, + "language_loss": 0.81601393, + "learning_rate": 0.0005754158591622773, + "loss": 0.82688123, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.28076172, + "step": 2434, + "time_per_iteration": 2.963247537612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092504, + "balance_loss_mlp": 1.06365538, + "epoch": 0.4684494036167757, + "flos": 439164578304.0, + "grad_norm": 0.08333045297400817, + "language_loss": 0.8228929, + "learning_rate": 0.0005751078664470732, + "loss": 0.83381796, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.28833008, + "step": 2435, + "time_per_iteration": 2.537179470062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085014, + "balance_loss_mlp": 1.05688024, + "epoch": 0.46864178530203926, + "flos": 532446428160.0, + "grad_norm": 0.08080859282065189, + "language_loss": 0.85670036, + "learning_rate": 0.0005747998445724094, + "loss": 0.86755049, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.28125, + "step": 2436, + "time_per_iteration": 2.6276183128356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083485, + "balance_loss_mlp": 1.05466008, + "epoch": 0.4688341669873028, + "flos": 576328670208.0, + "grad_norm": 0.08810611044699188, + "language_loss": 0.89099967, + "learning_rate": 0.0005744917936578707, + "loss": 0.90183449, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.28808594, + "step": 2437, + "time_per_iteration": 2.784236431121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085755, + "balance_loss_mlp": 1.05690634, + "epoch": 0.4690265486725664, + "flos": 539296455168.0, + "grad_norm": 0.08777270325229546, + "language_loss": 0.83928555, + "learning_rate": 0.0005741837138230526, + "loss": 0.85014307, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.28808594, + "step": 2438, + "time_per_iteration": 2.7139840126037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078469, + "balance_loss_mlp": 1.05014467, + "epoch": 0.4692189303578299, + "flos": 770168770560.0, + "grad_norm": 0.053438427497709357, + "language_loss": 0.86270201, + "learning_rate": 0.0005738756051875627, + "loss": 0.87348676, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.28295898, + "step": 2439, + "time_per_iteration": 3.092337131500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074485, + "balance_loss_mlp": 1.04551697, + "epoch": 0.4694113120430935, + "flos": 571118404608.0, + "grad_norm": 0.056335724754341315, + "language_loss": 0.83459938, + "learning_rate": 0.0005735674678710192, + "loss": 0.84534419, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.28930664, + "step": 2440, + "time_per_iteration": 2.6729819774627686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107755, + "balance_loss_mlp": 1.0473665, + "epoch": 0.4696036937283571, + "flos": 748498281984.0, + "grad_norm": 0.06862136292067082, + "language_loss": 0.80992246, + "learning_rate": 0.0005732593019930517, + "loss": 0.82069802, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.30126953, + "step": 2441, + "time_per_iteration": 2.917332649230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078244, + "balance_loss_mlp": 1.04779828, + "epoch": 0.4697960754136206, + "flos": 493208455680.0, + "grad_norm": 0.06788307957029095, + "language_loss": 0.8767302, + "learning_rate": 0.0005729511076733008, + "loss": 0.88751262, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.30395508, + "step": 2442, + "time_per_iteration": 2.6602578163146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108041, + "balance_loss_mlp": 1.05003536, + "epoch": 0.4699884570988842, + "flos": 724809710592.0, + "grad_norm": 0.08414136163770505, + "language_loss": 0.84802854, + "learning_rate": 0.000572642885031418, + "loss": 0.85883266, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.30322266, + "step": 2443, + "time_per_iteration": 2.924572706222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075591, + "balance_loss_mlp": 1.04516852, + "epoch": 0.47018083878414774, + "flos": 555141219840.0, + "grad_norm": 0.055800438037163856, + "language_loss": 0.80518812, + "learning_rate": 0.0005723346341870662, + "loss": 0.81594402, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.30371094, + "step": 2444, + "time_per_iteration": 2.7203280925750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082669, + "balance_loss_mlp": 1.05217505, + "epoch": 0.4703732204694113, + "flos": 423846521856.0, + "grad_norm": 0.06929087535104682, + "language_loss": 0.86297798, + "learning_rate": 0.0005720263552599188, + "loss": 0.87380457, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.30444336, + "step": 2445, + "time_per_iteration": 2.469621419906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075882, + "balance_loss_mlp": 1.0456984, + "epoch": 0.47056560215467486, + "flos": 703179919872.0, + "grad_norm": 0.06843850090218344, + "language_loss": 0.79142129, + "learning_rate": 0.0005717180483696604, + "loss": 0.80218005, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.30151367, + "step": 2446, + "time_per_iteration": 2.9089763164520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072219, + "balance_loss_mlp": 1.04034209, + "epoch": 0.47075798383993844, + "flos": 554701851648.0, + "grad_norm": 0.07381367232784701, + "language_loss": 0.83118802, + "learning_rate": 0.0005714097136359862, + "loss": 0.84191024, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.31860352, + "step": 2447, + "time_per_iteration": 2.6346585750579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079268, + "balance_loss_mlp": 1.04817808, + "epoch": 0.470950365525202, + "flos": 564009329664.0, + "grad_norm": 0.06979677359463858, + "language_loss": 0.86324209, + "learning_rate": 0.0005711013511786027, + "loss": 0.87403476, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.31054688, + "step": 2448, + "time_per_iteration": 2.765740156173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073046, + "balance_loss_mlp": 1.0426712, + "epoch": 0.47114274721046556, + "flos": 534189496320.0, + "grad_norm": 0.048536468835106476, + "language_loss": 0.84014428, + "learning_rate": 0.0005707929611172263, + "loss": 0.85087478, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.3034668, + "step": 2449, + "time_per_iteration": 2.6891775131225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074493, + "balance_loss_mlp": 1.04349887, + "epoch": 0.47133512889572915, + "flos": 472877982720.0, + "grad_norm": 0.05569215031080998, + "language_loss": 0.83788037, + "learning_rate": 0.000570484543571585, + "loss": 0.84862536, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.30957031, + "step": 2450, + "time_per_iteration": 2.545646905899048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076975, + "balance_loss_mlp": 1.04743469, + "epoch": 0.4715275105809927, + "flos": 458776286208.0, + "grad_norm": 0.06210999897734131, + "language_loss": 0.82771122, + "learning_rate": 0.0005701760986614171, + "loss": 0.83848095, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.29492188, + "step": 2451, + "time_per_iteration": 2.5739784240722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080958, + "balance_loss_mlp": 1.05256283, + "epoch": 0.47171989226625627, + "flos": 421783358976.0, + "grad_norm": 0.06034093462601522, + "language_loss": 0.87343812, + "learning_rate": 0.0005698676265064714, + "loss": 0.88424772, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.28393555, + "step": 2452, + "time_per_iteration": 2.5456669330596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085544, + "balance_loss_mlp": 1.05612302, + "epoch": 0.4719122739515198, + "flos": 457200543744.0, + "grad_norm": 0.12010658803535784, + "language_loss": 0.88854802, + "learning_rate": 0.0005695591272265074, + "loss": 0.89940351, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.29370117, + "step": 2453, + "time_per_iteration": 2.53247332572937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086025, + "balance_loss_mlp": 1.05610394, + "epoch": 0.4721046556367834, + "flos": 514716000768.0, + "grad_norm": 0.06319040539886057, + "language_loss": 0.81670743, + "learning_rate": 0.0005692506009412954, + "loss": 0.8275677, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.29907227, + "step": 2454, + "time_per_iteration": 2.663959503173828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157874, + "balance_loss_mlp": 1.14423668, + "epoch": 0.4722970373220469, + "flos": 1571399723520.0, + "grad_norm": 0.046124065416459865, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78709137, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.13671875, + "step": 2455, + "time_per_iteration": 4.937524795532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085858, + "balance_loss_mlp": 1.05603182, + "epoch": 0.4724894190073105, + "flos": 585919927296.0, + "grad_norm": 0.07174058927835297, + "language_loss": 0.89622641, + "learning_rate": 0.0005686334678342593, + "loss": 0.907085, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.2980957, + "step": 2456, + "time_per_iteration": 2.9060487747192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077496, + "balance_loss_mlp": 1.04824257, + "epoch": 0.4726818006925741, + "flos": 867290347008.0, + "grad_norm": 0.07069871267474889, + "language_loss": 0.81667411, + "learning_rate": 0.0005683248612520274, + "loss": 0.82744908, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.29223633, + "step": 2457, + "time_per_iteration": 3.071544885635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084366, + "balance_loss_mlp": 1.05465865, + "epoch": 0.4728741823778376, + "flos": 752653721088.0, + "grad_norm": 0.07071545002601118, + "language_loss": 0.83683658, + "learning_rate": 0.0005680162281437321, + "loss": 0.84768021, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.296875, + "step": 2458, + "time_per_iteration": 2.931579113006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077685, + "balance_loss_mlp": 1.0476439, + "epoch": 0.4730665640631012, + "flos": 538301265408.0, + "grad_norm": 0.06018673388195985, + "language_loss": 0.84837544, + "learning_rate": 0.000567707568629195, + "loss": 0.85915226, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.30004883, + "step": 2459, + "time_per_iteration": 2.6860852241516113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079226, + "balance_loss_mlp": 1.04968619, + "epoch": 0.47325894574836475, + "flos": 491396986368.0, + "grad_norm": 0.053752412093893094, + "language_loss": 0.82513988, + "learning_rate": 0.0005673988828282486, + "loss": 0.83593214, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.29467773, + "step": 2460, + "time_per_iteration": 2.6679980754852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073438, + "balance_loss_mlp": 1.04320669, + "epoch": 0.47345132743362833, + "flos": 764117494272.0, + "grad_norm": 0.05735836881189746, + "language_loss": 0.80829632, + "learning_rate": 0.0005670901708607352, + "loss": 0.81903076, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.30175781, + "step": 2461, + "time_per_iteration": 2.962364673614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076898, + "balance_loss_mlp": 1.04635668, + "epoch": 0.47364370911889186, + "flos": 539925060096.0, + "grad_norm": 0.06660215000338995, + "language_loss": 0.84026098, + "learning_rate": 0.0005667814328465076, + "loss": 0.85102999, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.30493164, + "step": 2462, + "time_per_iteration": 2.6148030757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077856, + "balance_loss_mlp": 1.04824424, + "epoch": 0.47383609080415545, + "flos": 406002613248.0, + "grad_norm": 0.0820641824195461, + "language_loss": 0.81702316, + "learning_rate": 0.0005664726689054285, + "loss": 0.8278017, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.29541016, + "step": 2463, + "time_per_iteration": 2.46337628364563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079984, + "balance_loss_mlp": 1.04910851, + "epoch": 0.474028472489419, + "flos": 453237161472.0, + "grad_norm": 0.07270387927239072, + "language_loss": 0.81341946, + "learning_rate": 0.0005661638791573704, + "loss": 0.82421935, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.30859375, + "step": 2464, + "time_per_iteration": 2.712188720703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084787, + "balance_loss_mlp": 1.05453193, + "epoch": 0.47422085417468257, + "flos": 491923694592.0, + "grad_norm": 0.05714322793938323, + "language_loss": 0.87222457, + "learning_rate": 0.0005658550637222164, + "loss": 0.88307238, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.30224609, + "step": 2465, + "time_per_iteration": 2.63380765914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082927, + "balance_loss_mlp": 1.05298185, + "epoch": 0.47441323585994616, + "flos": 738537467904.0, + "grad_norm": 0.06339144108901118, + "language_loss": 0.82493532, + "learning_rate": 0.0005655462227198592, + "loss": 0.83576465, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.29907227, + "step": 2466, + "time_per_iteration": 2.910783290863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084848, + "balance_loss_mlp": 1.0547595, + "epoch": 0.4746056175452097, + "flos": 484439270400.0, + "grad_norm": 0.05460968765214119, + "language_loss": 0.83975738, + "learning_rate": 0.0005652373562702016, + "loss": 0.85060585, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.30053711, + "step": 2467, + "time_per_iteration": 2.6101505756378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081, + "balance_loss_mlp": 1.05072081, + "epoch": 0.4747979992304733, + "flos": 460814717952.0, + "grad_norm": 0.06618054462006194, + "language_loss": 0.88145614, + "learning_rate": 0.000564928464493156, + "loss": 0.89226621, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.30249023, + "step": 2468, + "time_per_iteration": 2.55812668800354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081635, + "balance_loss_mlp": 1.05247641, + "epoch": 0.4749903809157368, + "flos": 864070460928.0, + "grad_norm": 0.06741069565287812, + "language_loss": 0.81633413, + "learning_rate": 0.000564619547508645, + "loss": 0.82715052, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.29150391, + "step": 2469, + "time_per_iteration": 3.1341404914855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082878, + "balance_loss_mlp": 1.05252695, + "epoch": 0.4751827626010004, + "flos": 505296451584.0, + "grad_norm": 0.0651779420020333, + "language_loss": 0.83088791, + "learning_rate": 0.0005643106054366008, + "loss": 0.84171665, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.30297852, + "step": 2470, + "time_per_iteration": 2.610891342163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076033, + "balance_loss_mlp": 1.04666018, + "epoch": 0.47537514428626393, + "flos": 559123540992.0, + "grad_norm": 0.0714119485898344, + "language_loss": 0.79053152, + "learning_rate": 0.000564001638396965, + "loss": 0.80129188, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.29321289, + "step": 2471, + "time_per_iteration": 2.7754971981048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083604, + "balance_loss_mlp": 1.05430186, + "epoch": 0.4755675259715275, + "flos": 833907211776.0, + "grad_norm": 0.05565021284268994, + "language_loss": 0.8203246, + "learning_rate": 0.0005636926465096897, + "loss": 0.83116066, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.29248047, + "step": 2472, + "time_per_iteration": 3.028235912322998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079414, + "balance_loss_mlp": 1.05116105, + "epoch": 0.47575990765679105, + "flos": 507989629440.0, + "grad_norm": 0.06838176056824781, + "language_loss": 0.87627274, + "learning_rate": 0.0005633836298947363, + "loss": 0.8870669, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.28271484, + "step": 2473, + "time_per_iteration": 2.609142303466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.04901338, + "epoch": 0.47595228934205464, + "flos": 591566740992.0, + "grad_norm": 0.06111056533479294, + "language_loss": 0.70809621, + "learning_rate": 0.000563074588672075, + "loss": 0.71887386, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.28759766, + "step": 2474, + "time_per_iteration": 2.722593069076538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079389, + "balance_loss_mlp": 1.05080247, + "epoch": 0.4761446710273182, + "flos": 580340104704.0, + "grad_norm": 0.06296236889432077, + "language_loss": 0.85321903, + "learning_rate": 0.0005627655229616868, + "loss": 0.8640129, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.28540039, + "step": 2475, + "time_per_iteration": 2.711296558380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081174, + "balance_loss_mlp": 1.05141973, + "epoch": 0.47633705271258175, + "flos": 672597651456.0, + "grad_norm": 0.06122384611792148, + "language_loss": 0.89890903, + "learning_rate": 0.0005624564328835616, + "loss": 0.90972078, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.29736328, + "step": 2476, + "time_per_iteration": 2.796614408493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079016, + "balance_loss_mlp": 1.05069184, + "epoch": 0.47652943439784534, + "flos": 541580788224.0, + "grad_norm": 0.05962569805242902, + "language_loss": 0.84079456, + "learning_rate": 0.0005621473185576986, + "loss": 0.85158479, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.28344727, + "step": 2477, + "time_per_iteration": 2.7140815258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086205, + "balance_loss_mlp": 1.05709434, + "epoch": 0.4767218160831089, + "flos": 524563333632.0, + "grad_norm": 0.07093607725441804, + "language_loss": 0.87060082, + "learning_rate": 0.0005618381801041068, + "loss": 0.88146281, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.29077148, + "step": 2478, + "time_per_iteration": 2.596932888031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085469, + "balance_loss_mlp": 1.05638218, + "epoch": 0.47691419776837246, + "flos": 567789419520.0, + "grad_norm": 0.07057707739429774, + "language_loss": 0.83022285, + "learning_rate": 0.0005615290176428044, + "loss": 0.84107757, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.29052734, + "step": 2479, + "time_per_iteration": 2.6407430171966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108759, + "balance_loss_mlp": 1.05828834, + "epoch": 0.477106579453636, + "flos": 530659689984.0, + "grad_norm": 0.06449831218896054, + "language_loss": 0.85197705, + "learning_rate": 0.0005612198312938187, + "loss": 0.86285299, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.29296875, + "step": 2480, + "time_per_iteration": 2.7345011234283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108973, + "balance_loss_mlp": 1.06121504, + "epoch": 0.4772989611388996, + "flos": 593980521984.0, + "grad_norm": 0.060218704260060575, + "language_loss": 0.79185855, + "learning_rate": 0.0005609106211771868, + "loss": 0.80275583, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.28540039, + "step": 2481, + "time_per_iteration": 2.8754329681396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088266, + "balance_loss_mlp": 1.05908394, + "epoch": 0.4774913428241631, + "flos": 544352541696.0, + "grad_norm": 0.07327776648741448, + "language_loss": 0.89180911, + "learning_rate": 0.0005606013874129543, + "loss": 0.90269172, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.29199219, + "step": 2482, + "time_per_iteration": 2.7726404666900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090058, + "balance_loss_mlp": 1.06049454, + "epoch": 0.4776837245094267, + "flos": 539817371136.0, + "grad_norm": 0.06456332848164101, + "language_loss": 0.79976207, + "learning_rate": 0.0005602921301211768, + "loss": 0.81066263, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.29516602, + "step": 2483, + "time_per_iteration": 2.715306043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089436, + "balance_loss_mlp": 1.06132603, + "epoch": 0.4778761061946903, + "flos": 471543759360.0, + "grad_norm": 0.07998801300028703, + "language_loss": 0.82180744, + "learning_rate": 0.0005599828494219185, + "loss": 0.83270174, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.28100586, + "step": 2484, + "time_per_iteration": 2.5683019161224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086424, + "balance_loss_mlp": 1.05836201, + "epoch": 0.4780684878799538, + "flos": 725769994752.0, + "grad_norm": 0.06543459725570545, + "language_loss": 0.88914174, + "learning_rate": 0.0005596735454352527, + "loss": 0.90000606, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.28076172, + "step": 2485, + "time_per_iteration": 2.8615424633026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083119, + "balance_loss_mlp": 1.05531943, + "epoch": 0.4782608695652174, + "flos": 548665132032.0, + "grad_norm": 0.07228586186756063, + "language_loss": 0.85170126, + "learning_rate": 0.0005593642182812619, + "loss": 0.8625325, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.27856445, + "step": 2486, + "time_per_iteration": 2.6507115364074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084435, + "balance_loss_mlp": 1.0574224, + "epoch": 0.47845325125048094, + "flos": 829555333632.0, + "grad_norm": 0.06671866930909515, + "language_loss": 0.83972216, + "learning_rate": 0.0005590548680800378, + "loss": 0.85056645, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.27050781, + "step": 2487, + "time_per_iteration": 3.0963587760925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085422, + "balance_loss_mlp": 1.05755091, + "epoch": 0.4786456329357445, + "flos": 513889546752.0, + "grad_norm": 0.0627787894989405, + "language_loss": 0.7639966, + "learning_rate": 0.0005587454949516804, + "loss": 0.77485085, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.27880859, + "step": 2488, + "time_per_iteration": 2.704761266708374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085753, + "balance_loss_mlp": 1.05719018, + "epoch": 0.47883801462100806, + "flos": 564392033280.0, + "grad_norm": 0.07191070894190046, + "language_loss": 0.87996674, + "learning_rate": 0.0005584360990162993, + "loss": 0.89082426, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.28540039, + "step": 2489, + "time_per_iteration": 2.68680477142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108742, + "balance_loss_mlp": 1.05921531, + "epoch": 0.47903039630627164, + "flos": 579296862720.0, + "grad_norm": 0.052754850289178916, + "language_loss": 0.85114515, + "learning_rate": 0.0005581266803940124, + "loss": 0.86201936, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.28222656, + "step": 2490, + "time_per_iteration": 2.7187392711639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091263, + "balance_loss_mlp": 1.06322539, + "epoch": 0.47922277799153523, + "flos": 618667255296.0, + "grad_norm": 0.061347112520969346, + "language_loss": 0.87164974, + "learning_rate": 0.0005578172392049471, + "loss": 0.8825624, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.28051758, + "step": 2491, + "time_per_iteration": 2.7291457653045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089047, + "balance_loss_mlp": 1.06048441, + "epoch": 0.47941515967679876, + "flos": 639352728576.0, + "grad_norm": 0.07263845202824909, + "language_loss": 0.84244549, + "learning_rate": 0.0005575077755692386, + "loss": 0.85333598, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.28564453, + "step": 2492, + "time_per_iteration": 2.8026599884033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080078, + "balance_loss_mlp": 1.05246925, + "epoch": 0.47960754136206235, + "flos": 519561091584.0, + "grad_norm": 0.0504022340685432, + "language_loss": 0.85800493, + "learning_rate": 0.0005571982896070316, + "loss": 0.86880577, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.27612305, + "step": 2493, + "time_per_iteration": 2.655550003051758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080752, + "balance_loss_mlp": 1.05266619, + "epoch": 0.4797999230473259, + "flos": 474798551040.0, + "grad_norm": 0.11668407926682704, + "language_loss": 0.89753431, + "learning_rate": 0.0005568887814384792, + "loss": 0.90834183, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.28100586, + "step": 2494, + "time_per_iteration": 2.5966434478759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080843, + "balance_loss_mlp": 1.05337763, + "epoch": 0.47999230473258947, + "flos": 531766950912.0, + "grad_norm": 0.058142169565221447, + "language_loss": 0.87224984, + "learning_rate": 0.000556579251183743, + "loss": 0.88305831, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.27490234, + "step": 2495, + "time_per_iteration": 2.6536028385162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080101, + "balance_loss_mlp": 1.05089474, + "epoch": 0.480184686417853, + "flos": 601207460352.0, + "grad_norm": 0.06356237967295801, + "language_loss": 0.7994827, + "learning_rate": 0.0005562696989629936, + "loss": 0.81028366, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.29174805, + "step": 2496, + "time_per_iteration": 2.691530466079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082561, + "balance_loss_mlp": 1.05328333, + "epoch": 0.4803770681031166, + "flos": 527931606528.0, + "grad_norm": 0.07544069195311896, + "language_loss": 0.82662058, + "learning_rate": 0.0005559601248964095, + "loss": 0.83744615, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.29223633, + "step": 2497, + "time_per_iteration": 2.687108278274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078067, + "balance_loss_mlp": 1.04931426, + "epoch": 0.4805694497883801, + "flos": 510934500864.0, + "grad_norm": 0.07160134617119021, + "language_loss": 0.85915172, + "learning_rate": 0.0005556505291041783, + "loss": 0.86993241, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.28735352, + "step": 2498, + "time_per_iteration": 2.7002923488616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081443, + "balance_loss_mlp": 1.05264211, + "epoch": 0.4807618314736437, + "flos": 600027416064.0, + "grad_norm": 0.21407023754506424, + "language_loss": 0.84214193, + "learning_rate": 0.0005553409117064954, + "loss": 0.85295641, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.2878418, + "step": 2499, + "time_per_iteration": 2.877713203430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096264, + "balance_loss_mlp": 1.06824946, + "epoch": 0.4809542131589073, + "flos": 568700241408.0, + "grad_norm": 0.06103635462331165, + "language_loss": 0.84855151, + "learning_rate": 0.0005550312728235654, + "loss": 0.85951412, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.28051758, + "step": 2500, + "time_per_iteration": 2.716524362564087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094238, + "balance_loss_mlp": 1.06610465, + "epoch": 0.4811465948441708, + "flos": 575703037440.0, + "grad_norm": 0.07633647670380422, + "language_loss": 0.83599609, + "learning_rate": 0.0005547216125756003, + "loss": 0.84693843, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.28125, + "step": 2501, + "time_per_iteration": 2.8102688789367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097276, + "balance_loss_mlp": 1.06899917, + "epoch": 0.4813389765294344, + "flos": 823508439552.0, + "grad_norm": 0.05816521463755192, + "language_loss": 0.81801546, + "learning_rate": 0.0005544119310828211, + "loss": 0.82898819, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.28295898, + "step": 2502, + "time_per_iteration": 3.09083890914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110256, + "balance_loss_mlp": 1.08162141, + "epoch": 0.48153135821469795, + "flos": 635240959488.0, + "grad_norm": 0.07468975257849066, + "language_loss": 0.84463918, + "learning_rate": 0.0005541022284654568, + "loss": 0.85574174, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.28613281, + "step": 2503, + "time_per_iteration": 2.959812641143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105243, + "balance_loss_mlp": 1.07613182, + "epoch": 0.48172373989996153, + "flos": 503450076672.0, + "grad_norm": 0.06287004960739773, + "language_loss": 0.83878344, + "learning_rate": 0.0005537925048437446, + "loss": 0.84983587, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.29077148, + "step": 2504, + "time_per_iteration": 2.5965919494628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113897, + "balance_loss_mlp": 1.12542796, + "epoch": 0.48191612158522507, + "flos": 1531563821568.0, + "grad_norm": 0.039351692623908835, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76890433, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.13574219, + "step": 2505, + "time_per_iteration": 4.965132713317871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112409, + "balance_loss_mlp": 1.08420432, + "epoch": 0.48210850327048865, + "flos": 702078451200.0, + "grad_norm": 0.06703534425937603, + "language_loss": 0.88412756, + "learning_rate": 0.0005531729950682664, + "loss": 0.89525163, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.28198242, + "step": 2506, + "time_per_iteration": 3.032463550567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107907, + "balance_loss_mlp": 1.07936859, + "epoch": 0.4823008849557522, + "flos": 439548691968.0, + "grad_norm": 0.08139997578259908, + "language_loss": 0.84598732, + "learning_rate": 0.000552863209155015, + "loss": 0.85706639, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.28564453, + "step": 2507, + "time_per_iteration": 2.501650333404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101488, + "balance_loss_mlp": 1.07285357, + "epoch": 0.48249326664101577, + "flos": 471622334976.0, + "grad_norm": 0.06119014713123412, + "language_loss": 0.81909472, + "learning_rate": 0.0005525534027184461, + "loss": 0.83010966, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.28637695, + "step": 2508, + "time_per_iteration": 2.5787370204925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098365, + "balance_loss_mlp": 1.06942117, + "epoch": 0.48268564832627936, + "flos": 562954503168.0, + "grad_norm": 0.05313984540081721, + "language_loss": 0.82654703, + "learning_rate": 0.0005522435758788365, + "loss": 0.83753073, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.28930664, + "step": 2509, + "time_per_iteration": 2.7109761238098145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010953, + "balance_loss_mlp": 1.06730938, + "epoch": 0.4828780300115429, + "flos": 629298782208.0, + "grad_norm": 0.05877851050813853, + "language_loss": 0.80259538, + "learning_rate": 0.0005519337287564721, + "loss": 0.81354833, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.2800293, + "step": 2510, + "time_per_iteration": 2.8329310417175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109601, + "balance_loss_mlp": 1.06759048, + "epoch": 0.4830704116968065, + "flos": 631562766336.0, + "grad_norm": 0.060327319620096846, + "language_loss": 0.83688086, + "learning_rate": 0.000551623861471646, + "loss": 0.84784102, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.28417969, + "step": 2511, + "time_per_iteration": 2.7470946311950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100715, + "balance_loss_mlp": 1.08784056, + "epoch": 0.48326279338207, + "flos": 1568434503168.0, + "grad_norm": 0.03397215547055983, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79919541, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.12890625, + "step": 2512, + "time_per_iteration": 4.837340593338013 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095094, + "balance_loss_mlp": 1.06619751, + "epoch": 0.4834551750673336, + "flos": 508989201408.0, + "grad_norm": 0.059215268588021376, + "language_loss": 0.86540532, + "learning_rate": 0.0005510040668958211, + "loss": 0.87635624, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.2890625, + "step": 2513, + "time_per_iteration": 2.5706045627593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107614, + "balance_loss_mlp": 1.06364644, + "epoch": 0.48364755675259713, + "flos": 1527875453952.0, + "grad_norm": 0.0265804362292035, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78836721, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.12451172, + "step": 2514, + "time_per_iteration": 4.899883508682251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108816, + "balance_loss_mlp": 1.0589062, + "epoch": 0.4838399384378607, + "flos": 564726684672.0, + "grad_norm": 0.05909251781800444, + "language_loss": 0.83435559, + "learning_rate": 0.0005503841931138645, + "loss": 0.84523714, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.29272461, + "step": 2515, + "time_per_iteration": 2.665804386138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089973, + "balance_loss_mlp": 1.06112456, + "epoch": 0.4840323201231243, + "flos": 387479227392.0, + "grad_norm": 0.06787127022085944, + "language_loss": 0.81963372, + "learning_rate": 0.0005500742268214025, + "loss": 0.8305335, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.28833008, + "step": 2516, + "time_per_iteration": 2.5123801231384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084372, + "balance_loss_mlp": 1.05487967, + "epoch": 0.48422470180838784, + "flos": 630701406720.0, + "grad_norm": 0.05799188255481874, + "language_loss": 0.85305762, + "learning_rate": 0.0005497642410884014, + "loss": 0.86390138, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.29492188, + "step": 2517, + "time_per_iteration": 2.818969249725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107799, + "balance_loss_mlp": 1.04907012, + "epoch": 0.4844170834936514, + "flos": 498955603968.0, + "grad_norm": 0.0575391439282783, + "language_loss": 0.85093868, + "learning_rate": 0.0005494542360352085, + "loss": 0.8617186, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.28881836, + "step": 2518, + "time_per_iteration": 2.654691457748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081359, + "balance_loss_mlp": 1.05220056, + "epoch": 0.48460946517891496, + "flos": 550798106112.0, + "grad_norm": 0.06803778984218942, + "language_loss": 0.85824656, + "learning_rate": 0.0005491442117821783, + "loss": 0.86906004, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.29125977, + "step": 2519, + "time_per_iteration": 2.703547954559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081545, + "balance_loss_mlp": 1.0510273, + "epoch": 0.48480184686417854, + "flos": 529123235328.0, + "grad_norm": 0.12066852374350216, + "language_loss": 0.87487119, + "learning_rate": 0.0005488341684496732, + "loss": 0.88568664, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.3046875, + "step": 2520, + "time_per_iteration": 2.6539435386657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107692, + "balance_loss_mlp": 1.04757047, + "epoch": 0.4849942285494421, + "flos": 531630148608.0, + "grad_norm": 0.05745701253476237, + "language_loss": 0.91846752, + "learning_rate": 0.0005485241061580624, + "loss": 0.92923677, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.29296875, + "step": 2521, + "time_per_iteration": 2.775069236755371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080446, + "balance_loss_mlp": 1.04995275, + "epoch": 0.48518661023470566, + "flos": 722231424000.0, + "grad_norm": 0.05822253141450555, + "language_loss": 0.84573066, + "learning_rate": 0.0005482140250277228, + "loss": 0.8565352, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.3046875, + "step": 2522, + "time_per_iteration": 2.9740066528320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082604, + "balance_loss_mlp": 1.05306387, + "epoch": 0.4853789919199692, + "flos": 505843508736.0, + "grad_norm": 0.06368999588379491, + "language_loss": 0.87678063, + "learning_rate": 0.0005479039251790387, + "loss": 0.88760674, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.29492188, + "step": 2523, + "time_per_iteration": 2.6360013484954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086656, + "balance_loss_mlp": 1.05666256, + "epoch": 0.4855713736052328, + "flos": 660185178624.0, + "grad_norm": 0.060153636482772124, + "language_loss": 0.84925246, + "learning_rate": 0.0005475938067324014, + "loss": 0.8601191, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.29956055, + "step": 2524, + "time_per_iteration": 2.8053042888641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084682, + "balance_loss_mlp": 1.05542803, + "epoch": 0.48576375529049637, + "flos": 436727476224.0, + "grad_norm": 0.059684937302366806, + "language_loss": 0.83693206, + "learning_rate": 0.0005472836698082098, + "loss": 0.84777892, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.29199219, + "step": 2525, + "time_per_iteration": 2.513991355895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085056, + "balance_loss_mlp": 1.05587339, + "epoch": 0.4859561369757599, + "flos": 581424044544.0, + "grad_norm": 0.059033754749834536, + "language_loss": 0.84245414, + "learning_rate": 0.0005469735145268694, + "loss": 0.85330468, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.29174805, + "step": 2526, + "time_per_iteration": 2.758964776992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085929, + "balance_loss_mlp": 1.05712819, + "epoch": 0.4861485186610235, + "flos": 487723175424.0, + "grad_norm": 0.05692033512559974, + "language_loss": 0.80668163, + "learning_rate": 0.0005466633410087933, + "loss": 0.81754094, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.28808594, + "step": 2527, + "time_per_iteration": 2.7483773231506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_mlp": 1.01712215, + "epoch": 0.486340900346287, + "flos": 1556893564416.0, + "grad_norm": 0.02025241925229164, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78289819, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.11865234, + "step": 2528, + "time_per_iteration": 4.8671183586120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084286, + "balance_loss_mlp": 1.05558062, + "epoch": 0.4865332820315506, + "flos": 482760221184.0, + "grad_norm": 0.060917910127877034, + "language_loss": 0.88050807, + "learning_rate": 0.0005460429397441214, + "loss": 0.89135092, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.28662109, + "step": 2529, + "time_per_iteration": 2.5488078594207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083512, + "balance_loss_mlp": 1.05416238, + "epoch": 0.48672566371681414, + "flos": 535548450816.0, + "grad_norm": 0.06933582049293556, + "language_loss": 0.86551011, + "learning_rate": 0.0005457327122383866, + "loss": 0.87634516, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.29321289, + "step": 2530, + "time_per_iteration": 2.6199238300323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018983, + "balance_loss_mlp": 1.00711012, + "epoch": 0.4869180454020777, + "flos": 1411876901376.0, + "grad_norm": 0.01657901033031013, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75655472, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.11865234, + "step": 2531, + "time_per_iteration": 4.810813665390015 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086562, + "balance_loss_mlp": 1.05754662, + "epoch": 0.48711042708734126, + "flos": 572836741632.0, + "grad_norm": 0.0731565805542322, + "language_loss": 0.75476754, + "learning_rate": 0.0005451122040823244, + "loss": 0.76563311, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.28979492, + "step": 2532, + "time_per_iteration": 2.7834720611572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083988, + "balance_loss_mlp": 1.0543766, + "epoch": 0.48730280877260485, + "flos": 626231665152.0, + "grad_norm": 0.05844807259880667, + "language_loss": 0.7683785, + "learning_rate": 0.0005448019236728997, + "loss": 0.77921844, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.29589844, + "step": 2533, + "time_per_iteration": 2.9007680416107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108612, + "balance_loss_mlp": 1.05789077, + "epoch": 0.48749519045786843, + "flos": 512233818624.0, + "grad_norm": 0.06352012335970622, + "language_loss": 0.84519851, + "learning_rate": 0.0005444916258698255, + "loss": 0.85605973, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.2824707, + "step": 2534, + "time_per_iteration": 2.6479434967041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083901, + "balance_loss_mlp": 1.05450428, + "epoch": 0.48768757214313196, + "flos": 525149678592.0, + "grad_norm": 0.06527387606118956, + "language_loss": 0.85987055, + "learning_rate": 0.0005441813107935704, + "loss": 0.8707096, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.29370117, + "step": 2535, + "time_per_iteration": 2.657701253890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082228, + "balance_loss_mlp": 1.05359387, + "epoch": 0.48787995382839555, + "flos": 504784300032.0, + "grad_norm": 0.05960574003717953, + "language_loss": 0.85425317, + "learning_rate": 0.0005438709785646091, + "loss": 0.86507541, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.28637695, + "step": 2536, + "time_per_iteration": 2.5686872005462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081582, + "balance_loss_mlp": 1.05197084, + "epoch": 0.4880723355136591, + "flos": 574904286720.0, + "grad_norm": 0.0674154398441342, + "language_loss": 0.86857444, + "learning_rate": 0.0005435606293034234, + "loss": 0.87939024, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.29589844, + "step": 2537, + "time_per_iteration": 2.6792654991149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108176, + "balance_loss_mlp": 1.05334091, + "epoch": 0.48826471719892267, + "flos": 561172147200.0, + "grad_norm": 0.1079718501079392, + "language_loss": 0.85096419, + "learning_rate": 0.0005432502631305016, + "loss": 0.86178184, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.28417969, + "step": 2538, + "time_per_iteration": 2.6790173053741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082462, + "balance_loss_mlp": 1.05366075, + "epoch": 0.4884570988841862, + "flos": 725849980416.0, + "grad_norm": 0.270667674808598, + "language_loss": 0.83102262, + "learning_rate": 0.0005429398801663386, + "loss": 0.84184724, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.28808594, + "step": 2539, + "time_per_iteration": 2.9468812942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074127, + "balance_loss_mlp": 1.04453969, + "epoch": 0.4886494805694498, + "flos": 430794063360.0, + "grad_norm": 0.06499376102514318, + "language_loss": 0.82999051, + "learning_rate": 0.0005426294805314355, + "loss": 0.8407318, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.29541016, + "step": 2540, + "time_per_iteration": 4.142840623855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076947, + "balance_loss_mlp": 1.04685867, + "epoch": 0.4888418622547134, + "flos": 672673254912.0, + "grad_norm": 0.055782244803189183, + "language_loss": 0.80130786, + "learning_rate": 0.0005423190643463003, + "loss": 0.81207728, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.30053711, + "step": 2541, + "time_per_iteration": 2.972822427749634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107237, + "balance_loss_mlp": 1.04237723, + "epoch": 0.4890342439399769, + "flos": 541639014912.0, + "grad_norm": 0.07101662394817357, + "language_loss": 0.83088171, + "learning_rate": 0.0005420086317314473, + "loss": 0.84160542, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.29956055, + "step": 2542, + "time_per_iteration": 2.651425838470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072796, + "balance_loss_mlp": 1.04180098, + "epoch": 0.4892266256252405, + "flos": 590380904448.0, + "grad_norm": 0.06479627692425034, + "language_loss": 0.81022084, + "learning_rate": 0.0005416981828073971, + "loss": 0.82094878, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.30957031, + "step": 2543, + "time_per_iteration": 2.775273323059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111363, + "balance_loss_mlp": 1.09922981, + "epoch": 0.48941900731050403, + "flos": 1515460008960.0, + "grad_norm": 0.045109342737372694, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78228641, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.14355469, + "step": 2544, + "time_per_iteration": 4.819438219070435 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069232, + "balance_loss_mlp": 1.0383091, + "epoch": 0.4896113889957676, + "flos": 470327399424.0, + "grad_norm": 0.07868028775989613, + "language_loss": 0.85065794, + "learning_rate": 0.000541077236513819, + "loss": 0.86135024, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.30883789, + "step": 2545, + "time_per_iteration": 2.5191094875335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071718, + "balance_loss_mlp": 1.03981793, + "epoch": 0.48980377068103115, + "flos": 496310478336.0, + "grad_norm": 0.07130550478628667, + "language_loss": 0.82089663, + "learning_rate": 0.0005407667393853638, + "loss": 0.83161378, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.31884766, + "step": 2546, + "time_per_iteration": 2.617934465408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107245, + "balance_loss_mlp": 1.04043055, + "epoch": 0.48999615236629473, + "flos": 692539628544.0, + "grad_norm": 0.07826700951116618, + "language_loss": 0.8301416, + "learning_rate": 0.0005404562264298569, + "loss": 0.84086609, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.32006836, + "step": 2547, + "time_per_iteration": 2.8667449951171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072033, + "balance_loss_mlp": 1.03946531, + "epoch": 0.49018853405155827, + "flos": 541432401408.0, + "grad_norm": 0.06922547112322346, + "language_loss": 0.83528513, + "learning_rate": 0.0005401456977678498, + "loss": 0.8460055, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.32568359, + "step": 2548, + "time_per_iteration": 2.6317896842956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073611, + "balance_loss_mlp": 1.04216361, + "epoch": 0.49038091573682185, + "flos": 695304027648.0, + "grad_norm": 0.06685231557649787, + "language_loss": 0.77518535, + "learning_rate": 0.0005398351535199008, + "loss": 0.78592145, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.31420898, + "step": 2549, + "time_per_iteration": 3.0532455444335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077464, + "balance_loss_mlp": 1.046422, + "epoch": 0.49057329742208544, + "flos": 596614063104.0, + "grad_norm": 0.058433753989977806, + "language_loss": 0.83942944, + "learning_rate": 0.0005395245938065735, + "loss": 0.85020411, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.31030273, + "step": 2550, + "time_per_iteration": 2.788081169128418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082711, + "balance_loss_mlp": 1.0515734, + "epoch": 0.490765679107349, + "flos": 513154814976.0, + "grad_norm": 0.08029752654472934, + "language_loss": 0.83026552, + "learning_rate": 0.0005392140187484379, + "loss": 0.84109271, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.3112793, + "step": 2551, + "time_per_iteration": 2.619982957839966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076344, + "balance_loss_mlp": 1.04577839, + "epoch": 0.49095806079261256, + "flos": 629298782208.0, + "grad_norm": 0.05951944251734202, + "language_loss": 0.89720619, + "learning_rate": 0.0005389034284660701, + "loss": 0.90796959, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.30541992, + "step": 2552, + "time_per_iteration": 2.811321258544922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084609, + "balance_loss_mlp": 1.05349529, + "epoch": 0.4911504424778761, + "flos": 914938122240.0, + "grad_norm": 0.06813620439924545, + "language_loss": 0.82330388, + "learning_rate": 0.000538592823080052, + "loss": 0.83414996, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.31079102, + "step": 2553, + "time_per_iteration": 3.121729612350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084101, + "balance_loss_mlp": 1.05181932, + "epoch": 0.4913428241631397, + "flos": 438716445696.0, + "grad_norm": 0.10151417402847059, + "language_loss": 0.84795117, + "learning_rate": 0.000538282202710971, + "loss": 0.85879219, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.32275391, + "step": 2554, + "time_per_iteration": 2.5441434383392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089823, + "balance_loss_mlp": 1.05782735, + "epoch": 0.4915352058484032, + "flos": 635806955520.0, + "grad_norm": 0.08391436989004458, + "language_loss": 0.81955588, + "learning_rate": 0.000537971567479421, + "loss": 0.83045411, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.31982422, + "step": 2555, + "time_per_iteration": 2.742913246154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088539, + "balance_loss_mlp": 1.05578029, + "epoch": 0.4917275875336668, + "flos": 504272148480.0, + "grad_norm": 0.0678126955236607, + "language_loss": 0.87735516, + "learning_rate": 0.0005376609175060011, + "loss": 0.88824058, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.32763672, + "step": 2556, + "time_per_iteration": 2.5964388847351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088641, + "balance_loss_mlp": 1.05774164, + "epoch": 0.49191996921893033, + "flos": 654251765760.0, + "grad_norm": 0.06456480219532172, + "language_loss": 0.80659723, + "learning_rate": 0.0005373502529113162, + "loss": 0.81748366, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.30883789, + "step": 2557, + "time_per_iteration": 2.8043599128723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092017, + "balance_loss_mlp": 1.06009305, + "epoch": 0.4921123509041939, + "flos": 492101194752.0, + "grad_norm": 0.08818279105065703, + "language_loss": 0.81143486, + "learning_rate": 0.0005370395738159773, + "loss": 0.82235509, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.3190918, + "step": 2558, + "time_per_iteration": 2.6536951065063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086446, + "balance_loss_mlp": 1.05516589, + "epoch": 0.4923047325894575, + "flos": 545907935232.0, + "grad_norm": 0.0699028851556838, + "language_loss": 0.83194804, + "learning_rate": 0.0005367288803406003, + "loss": 0.84281248, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.3125, + "step": 2559, + "time_per_iteration": 2.6608238220214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092322, + "balance_loss_mlp": 1.06075501, + "epoch": 0.49249711427472104, + "flos": 596195043840.0, + "grad_norm": 0.05624800088650225, + "language_loss": 0.81485915, + "learning_rate": 0.0005364181726058073, + "loss": 0.82578236, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.31542969, + "step": 2560, + "time_per_iteration": 2.7245399951934814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108461, + "balance_loss_mlp": 1.05354452, + "epoch": 0.4926894959599846, + "flos": 497580682752.0, + "grad_norm": 0.0657433103973406, + "language_loss": 0.82255721, + "learning_rate": 0.0005361074507322261, + "loss": 0.83340329, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.31030273, + "step": 2561, + "time_per_iteration": 2.632309913635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085089, + "balance_loss_mlp": 1.05359399, + "epoch": 0.49288187764524816, + "flos": 535868545536.0, + "grad_norm": 0.06588348626271129, + "language_loss": 0.81683809, + "learning_rate": 0.000535796714840489, + "loss": 0.82768893, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.31494141, + "step": 2562, + "time_per_iteration": 2.6455063819885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107827, + "balance_loss_mlp": 1.04686987, + "epoch": 0.49307425933051174, + "flos": 641267504640.0, + "grad_norm": 0.07506734855649709, + "language_loss": 0.84067267, + "learning_rate": 0.0005354859650512348, + "loss": 0.85145533, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.3137207, + "step": 2563, + "time_per_iteration": 2.8065779209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075102, + "balance_loss_mlp": 1.04396451, + "epoch": 0.4932666410157753, + "flos": 516000761856.0, + "grad_norm": 0.06295276436461052, + "language_loss": 0.87103295, + "learning_rate": 0.0005351752014851074, + "loss": 0.88178396, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.31103516, + "step": 2564, + "time_per_iteration": 2.573575019836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078018, + "balance_loss_mlp": 1.04654717, + "epoch": 0.49345902270103886, + "flos": 601217634816.0, + "grad_norm": 0.06464744293940616, + "language_loss": 0.83104938, + "learning_rate": 0.0005348644242627553, + "loss": 0.84182954, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.31445312, + "step": 2565, + "time_per_iteration": 2.730455160140991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059118, + "balance_loss_mlp": 1.0458622, + "epoch": 0.49365140438630245, + "flos": 1492849585152.0, + "grad_norm": 0.030733727476311833, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76345742, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.1328125, + "step": 2566, + "time_per_iteration": 4.939255237579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083828, + "balance_loss_mlp": 1.05290508, + "epoch": 0.493843786071566, + "flos": 629303164416.0, + "grad_norm": 0.06048394989907295, + "language_loss": 0.81127739, + "learning_rate": 0.0005342428293320013, + "loss": 0.82211566, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.30908203, + "step": 2567, + "time_per_iteration": 2.7613086700439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079847, + "balance_loss_mlp": 1.04899621, + "epoch": 0.49403616775682957, + "flos": 617283569664.0, + "grad_norm": 0.0745931351859795, + "language_loss": 0.83762527, + "learning_rate": 0.0005339320118649238, + "loss": 0.84842372, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.30810547, + "step": 2568, + "time_per_iteration": 2.6934940814971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077697, + "balance_loss_mlp": 1.04763281, + "epoch": 0.4942285494420931, + "flos": 577357355520.0, + "grad_norm": 0.16404827309636982, + "language_loss": 0.86383307, + "learning_rate": 0.000533621181224271, + "loss": 0.87461007, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.30053711, + "step": 2569, + "time_per_iteration": 2.7757997512817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078612, + "balance_loss_mlp": 1.04737914, + "epoch": 0.4944209311273567, + "flos": 629899683840.0, + "grad_norm": 0.06859593656518678, + "language_loss": 0.81795698, + "learning_rate": 0.0005333103375307182, + "loss": 0.8287431, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.31201172, + "step": 2570, + "time_per_iteration": 2.8319950103759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074501, + "balance_loss_mlp": 1.043221, + "epoch": 0.4946133128126202, + "flos": 587337108480.0, + "grad_norm": 0.05293986738306163, + "language_loss": 0.86142224, + "learning_rate": 0.0005329994809049451, + "loss": 0.87216723, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.3125, + "step": 2571, + "time_per_iteration": 2.7592415809631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075993, + "balance_loss_mlp": 1.04540396, + "epoch": 0.4948056944978838, + "flos": 583437745152.0, + "grad_norm": 0.05076322771290774, + "language_loss": 0.87883997, + "learning_rate": 0.0005326886114676375, + "loss": 0.88959992, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.30541992, + "step": 2572, + "time_per_iteration": 2.9501779079437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077876, + "balance_loss_mlp": 1.0463568, + "epoch": 0.49499807618314734, + "flos": 481583149056.0, + "grad_norm": 0.06323365720535751, + "language_loss": 0.87792003, + "learning_rate": 0.0005323777293394854, + "loss": 0.8886987, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.31494141, + "step": 2573, + "time_per_iteration": 2.55361008644104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107249, + "balance_loss_mlp": 1.03975475, + "epoch": 0.4951904578684109, + "flos": 518714288640.0, + "grad_norm": 0.05535210432037286, + "language_loss": 0.81776071, + "learning_rate": 0.000532066834641184, + "loss": 0.82848555, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.32739258, + "step": 2574, + "time_per_iteration": 2.6631722450256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070737, + "balance_loss_mlp": 1.03900313, + "epoch": 0.4953828395536745, + "flos": 535238530560.0, + "grad_norm": 0.06817735062049093, + "language_loss": 0.8516283, + "learning_rate": 0.0005317559274934334, + "loss": 0.86233568, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.31713867, + "step": 2575, + "time_per_iteration": 2.7345352172851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072086, + "balance_loss_mlp": 1.03894639, + "epoch": 0.49557522123893805, + "flos": 528305545728.0, + "grad_norm": 0.05802348124776455, + "language_loss": 0.80394173, + "learning_rate": 0.0005314450080169382, + "loss": 0.81466264, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.33154297, + "step": 2576, + "time_per_iteration": 2.6343159675598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076196, + "balance_loss_mlp": 1.04391456, + "epoch": 0.49576760292420163, + "flos": 427780790784.0, + "grad_norm": 0.07974947058861337, + "language_loss": 0.80607754, + "learning_rate": 0.0005311340763324083, + "loss": 0.81683946, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.32275391, + "step": 2577, + "time_per_iteration": 2.557796001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078498, + "balance_loss_mlp": 1.04557252, + "epoch": 0.49595998460946517, + "flos": 564968203776.0, + "grad_norm": 0.05295897633494548, + "language_loss": 0.82240456, + "learning_rate": 0.0005308231325605578, + "loss": 0.83318955, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.32910156, + "step": 2578, + "time_per_iteration": 2.6799750328063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072444, + "balance_loss_mlp": 1.03992367, + "epoch": 0.49615236629472875, + "flos": 702161409024.0, + "grad_norm": 0.05054804003557779, + "language_loss": 0.7645728, + "learning_rate": 0.0005305121768221061, + "loss": 0.77529716, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.32519531, + "step": 2579, + "time_per_iteration": 3.074568748474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025075, + "balance_loss_mlp": 1.01057923, + "epoch": 0.4963447479799923, + "flos": 1440896573952.0, + "grad_norm": 0.02258142627415349, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76063395, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.14453125, + "step": 2580, + "time_per_iteration": 4.807044267654419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079853, + "balance_loss_mlp": 1.04749966, + "epoch": 0.49653712966525587, + "flos": 537370094592.0, + "grad_norm": 0.06889886772880317, + "language_loss": 0.9145242, + "learning_rate": 0.0005298902299282984, + "loss": 0.92532271, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.32348633, + "step": 2581, + "time_per_iteration": 2.6145668029785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077544, + "balance_loss_mlp": 1.04561996, + "epoch": 0.4967295113505194, + "flos": 607002660864.0, + "grad_norm": 0.06407878407439609, + "language_loss": 0.84137404, + "learning_rate": 0.0005295792390144033, + "loss": 0.85214949, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.3190918, + "step": 2582, + "time_per_iteration": 2.71272873878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083171, + "balance_loss_mlp": 1.05103219, + "epoch": 0.496921893035783, + "flos": 474340243968.0, + "grad_norm": 0.07436197165654145, + "language_loss": 0.83241105, + "learning_rate": 0.0005292682366168294, + "loss": 0.84324276, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.32128906, + "step": 2583, + "time_per_iteration": 2.5284125804901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082483, + "balance_loss_mlp": 1.05079746, + "epoch": 0.4971142747210466, + "flos": 597180059136.0, + "grad_norm": 0.07965760723765093, + "language_loss": 0.79750967, + "learning_rate": 0.0005289572228563181, + "loss": 0.80833459, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.31665039, + "step": 2584, + "time_per_iteration": 2.802370548248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080479, + "balance_loss_mlp": 1.04862666, + "epoch": 0.4973066564063101, + "flos": 599321797632.0, + "grad_norm": 0.06536047089469768, + "language_loss": 0.83144403, + "learning_rate": 0.000528646197853616, + "loss": 0.84224886, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.31835938, + "step": 2585, + "time_per_iteration": 2.7075467109680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076886, + "balance_loss_mlp": 1.04748917, + "epoch": 0.4974990380915737, + "flos": 649152009216.0, + "grad_norm": 0.11136041462628715, + "language_loss": 0.85364115, + "learning_rate": 0.0005283351617294735, + "loss": 0.86440998, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.29370117, + "step": 2586, + "time_per_iteration": 2.940826892852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027614, + "balance_loss_mlp": 1.0143584, + "epoch": 0.49769141977683723, + "flos": 1528490912256.0, + "grad_norm": 0.01813039431029953, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.7766428, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.1328125, + "step": 2587, + "time_per_iteration": 4.996971368789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082207, + "balance_loss_mlp": 1.05278599, + "epoch": 0.4978838014621008, + "flos": 536114446848.0, + "grad_norm": 0.05663819997496981, + "language_loss": 0.86729956, + "learning_rate": 0.0005277130565998916, + "loss": 0.87812161, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.29394531, + "step": 2588, + "time_per_iteration": 2.7356040477752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083607, + "balance_loss_mlp": 1.05401921, + "epoch": 0.49807618314736435, + "flos": 539335742976.0, + "grad_norm": 0.07264241635107661, + "language_loss": 0.82111955, + "learning_rate": 0.0005274019878359748, + "loss": 0.83195567, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.29541016, + "step": 2589, + "time_per_iteration": 2.7199792861938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081352, + "balance_loss_mlp": 1.05102515, + "epoch": 0.49826856483262794, + "flos": 542215185408.0, + "grad_norm": 0.07554474334702437, + "language_loss": 0.86675328, + "learning_rate": 0.0005270909084336628, + "loss": 0.87756681, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.30297852, + "step": 2590, + "time_per_iteration": 2.6305181980133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080877, + "balance_loss_mlp": 1.05045462, + "epoch": 0.4984609465178915, + "flos": 522062212608.0, + "grad_norm": 0.06751539177219479, + "language_loss": 0.89032745, + "learning_rate": 0.0005267798185137276, + "loss": 0.90113628, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.30371094, + "step": 2591, + "time_per_iteration": 2.608088254928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088235, + "balance_loss_mlp": 1.05743146, + "epoch": 0.49865332820315506, + "flos": 574255332864.0, + "grad_norm": 0.0633807963563003, + "language_loss": 0.8924402, + "learning_rate": 0.0005264687181969444, + "loss": 0.90332258, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.30786133, + "step": 2592, + "time_per_iteration": 2.729546308517456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088496, + "balance_loss_mlp": 1.05931377, + "epoch": 0.49884570988841864, + "flos": 1013207657472.0, + "grad_norm": 0.06112732681279078, + "language_loss": 0.75084651, + "learning_rate": 0.0005261576076040937, + "loss": 0.76173151, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.29199219, + "step": 2593, + "time_per_iteration": 3.265289783477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082947, + "balance_loss_mlp": 1.05281067, + "epoch": 0.4990380915736822, + "flos": 559315597824.0, + "grad_norm": 0.0783599565062882, + "language_loss": 0.84088343, + "learning_rate": 0.0005258464868559591, + "loss": 0.85171294, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.30078125, + "step": 2594, + "time_per_iteration": 2.657191514968872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080029, + "balance_loss_mlp": 1.04991674, + "epoch": 0.49923047325894576, + "flos": 498708292608.0, + "grad_norm": 0.0699675322535813, + "language_loss": 0.88836402, + "learning_rate": 0.0005255353560733284, + "loss": 0.89916426, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.30102539, + "step": 2595, + "time_per_iteration": 2.570439100265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010588, + "balance_loss_mlp": 1.04640186, + "epoch": 0.4994228549442093, + "flos": 1495851273216.0, + "grad_norm": 0.029272008197333242, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76637447, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.12353516, + "step": 2596, + "time_per_iteration": 4.808587074279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084167, + "balance_loss_mlp": 1.05476975, + "epoch": 0.4996152366294729, + "flos": 557090901504.0, + "grad_norm": 0.052965599041123274, + "language_loss": 0.83342099, + "learning_rate": 0.0005249130648877492, + "loss": 0.84426272, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.29370117, + "step": 2597, + "time_per_iteration": 2.7453384399414062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010849, + "balance_loss_mlp": 1.05524063, + "epoch": 0.4998076183147364, + "flos": 415372700160.0, + "grad_norm": 0.05960347084431116, + "language_loss": 0.84714389, + "learning_rate": 0.0005246019047263953, + "loss": 0.85799289, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.29614258, + "step": 2598, + "time_per_iteration": 2.488004684448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091385, + "balance_loss_mlp": 1.06220269, + "epoch": 0.5, + "flos": 467107513344.0, + "grad_norm": 0.06961248878544336, + "language_loss": 0.8223601, + "learning_rate": 0.0005242907350137353, + "loss": 0.83327389, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.29174805, + "step": 2599, + "time_per_iteration": 2.550495147705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092431, + "balance_loss_mlp": 1.06422567, + "epoch": 0.5001923816852636, + "flos": 482460475392.0, + "grad_norm": 0.06813860338073652, + "language_loss": 0.78928339, + "learning_rate": 0.0005239795558705754, + "loss": 0.80020773, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.28198242, + "step": 2600, + "time_per_iteration": 2.656519889831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094846, + "balance_loss_mlp": 1.06492448, + "epoch": 0.5003847633705272, + "flos": 533534750208.0, + "grad_norm": 0.05508549334218052, + "language_loss": 0.89073658, + "learning_rate": 0.0005236683674177264, + "loss": 0.90168506, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.29907227, + "step": 2601, + "time_per_iteration": 2.63960337638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098261, + "balance_loss_mlp": 1.06886423, + "epoch": 0.5005771450557907, + "flos": 737473876992.0, + "grad_norm": 0.06683201790232274, + "language_loss": 0.82384604, + "learning_rate": 0.0005233571697760021, + "loss": 0.83482862, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.29345703, + "step": 2602, + "time_per_iteration": 2.859165668487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097705, + "balance_loss_mlp": 1.06814075, + "epoch": 0.5007695267410542, + "flos": 778646974464.0, + "grad_norm": 0.06216601268510387, + "language_loss": 0.83124363, + "learning_rate": 0.0005230459630662203, + "loss": 0.84222066, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.29541016, + "step": 2603, + "time_per_iteration": 2.9592032432556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093592, + "balance_loss_mlp": 1.06479144, + "epoch": 0.5009619084263178, + "flos": 623192251392.0, + "grad_norm": 0.0707725537041266, + "language_loss": 0.81070089, + "learning_rate": 0.0005227347474092022, + "loss": 0.8216368, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.2878418, + "step": 2604, + "time_per_iteration": 2.7389962673187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094636, + "balance_loss_mlp": 1.06545365, + "epoch": 0.5011542901115814, + "flos": 530812459008.0, + "grad_norm": 0.05232832672790962, + "language_loss": 0.83514917, + "learning_rate": 0.0005224235229257724, + "loss": 0.84609556, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.29174805, + "step": 2605, + "time_per_iteration": 2.687992811203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088826, + "balance_loss_mlp": 1.05914283, + "epoch": 0.5013466717968449, + "flos": 527262303744.0, + "grad_norm": 0.056206575952308185, + "language_loss": 0.8630116, + "learning_rate": 0.0005221122897367589, + "loss": 0.87389988, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.29614258, + "step": 2606, + "time_per_iteration": 2.787410259246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088135, + "balance_loss_mlp": 1.05861855, + "epoch": 0.5015390534821085, + "flos": 565750987776.0, + "grad_norm": 0.07695466326694751, + "language_loss": 0.81035262, + "learning_rate": 0.0005218010479629932, + "loss": 0.82123399, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.29467773, + "step": 2607, + "time_per_iteration": 2.6562912464141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091646, + "balance_loss_mlp": 1.06177175, + "epoch": 0.5017314351673721, + "flos": 566430465024.0, + "grad_norm": 0.05799380231795743, + "language_loss": 0.81869501, + "learning_rate": 0.0005214897977253102, + "loss": 0.82961148, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.29833984, + "step": 2608, + "time_per_iteration": 2.6560218334198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084679, + "balance_loss_mlp": 1.05454254, + "epoch": 0.5019238168526357, + "flos": 522018542592.0, + "grad_norm": 0.06343008203006618, + "language_loss": 0.84223098, + "learning_rate": 0.0005211785391445473, + "loss": 0.85307777, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.30102539, + "step": 2609, + "time_per_iteration": 2.726686954498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081377, + "balance_loss_mlp": 1.05202734, + "epoch": 0.5021161985378992, + "flos": 641135084544.0, + "grad_norm": 0.06012661278609564, + "language_loss": 0.79186547, + "learning_rate": 0.0005208672723415467, + "loss": 0.80267924, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.29345703, + "step": 2610, + "time_per_iteration": 2.7944774627685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108238, + "balance_loss_mlp": 1.05212474, + "epoch": 0.5023085802231627, + "flos": 591000744960.0, + "grad_norm": 0.06559501481836318, + "language_loss": 0.79065204, + "learning_rate": 0.0005205559974371525, + "loss": 0.80147582, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.30224609, + "step": 2611, + "time_per_iteration": 2.7519257068634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081519, + "balance_loss_mlp": 1.05150175, + "epoch": 0.5025009619084263, + "flos": 472134486528.0, + "grad_norm": 0.05612255210767107, + "language_loss": 0.82192892, + "learning_rate": 0.0005202447145522123, + "loss": 0.83274412, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.29980469, + "step": 2612, + "time_per_iteration": 2.6770236492156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079077, + "balance_loss_mlp": 1.04965591, + "epoch": 0.5026933435936899, + "flos": 454906036224.0, + "grad_norm": 0.05250196134528315, + "language_loss": 0.79193181, + "learning_rate": 0.0005199334238075769, + "loss": 0.80272257, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.29370117, + "step": 2613, + "time_per_iteration": 2.5337562561035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107987, + "balance_loss_mlp": 1.04942441, + "epoch": 0.5028857252789535, + "flos": 491504675328.0, + "grad_norm": 0.0529792440436354, + "language_loss": 0.9204368, + "learning_rate": 0.0005196221253241, + "loss": 0.93123555, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.30419922, + "step": 2614, + "time_per_iteration": 2.564143657684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073952, + "balance_loss_mlp": 1.04276693, + "epoch": 0.503078106964217, + "flos": 625280145408.0, + "grad_norm": 0.06195019445138367, + "language_loss": 0.82918042, + "learning_rate": 0.0005193108192226383, + "loss": 0.83991992, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.31152344, + "step": 2615, + "time_per_iteration": 2.757087230682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080642, + "balance_loss_mlp": 1.04990983, + "epoch": 0.5032704886494805, + "flos": 578774536704.0, + "grad_norm": 0.05317989185447873, + "language_loss": 0.8697142, + "learning_rate": 0.000518999505624052, + "loss": 0.88052064, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.30712891, + "step": 2616, + "time_per_iteration": 2.7251224517822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078998, + "balance_loss_mlp": 1.04759884, + "epoch": 0.5034628703347441, + "flos": 471481150464.0, + "grad_norm": 0.059314577611761586, + "language_loss": 0.83379316, + "learning_rate": 0.000518688184649203, + "loss": 0.84458327, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.3137207, + "step": 2617, + "time_per_iteration": 2.809063673019409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107933, + "balance_loss_mlp": 1.04890776, + "epoch": 0.5036552520200077, + "flos": 489594281472.0, + "grad_norm": 0.08232681701976922, + "language_loss": 0.83759677, + "learning_rate": 0.0005183768564189577, + "loss": 0.8483901, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.30395508, + "step": 2618, + "time_per_iteration": 2.5442681312561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108616, + "balance_loss_mlp": 1.05502236, + "epoch": 0.5038476337052713, + "flos": 493991239680.0, + "grad_norm": 0.10233936422342303, + "language_loss": 0.81248713, + "learning_rate": 0.0005180655210541838, + "loss": 0.8233487, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.31103516, + "step": 2619, + "time_per_iteration": 2.5986533164978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107923, + "balance_loss_mlp": 1.04976153, + "epoch": 0.5040400153905348, + "flos": 600321369600.0, + "grad_norm": 0.10286286455085811, + "language_loss": 0.83096433, + "learning_rate": 0.0005177541786757527, + "loss": 0.84175664, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.29443359, + "step": 2620, + "time_per_iteration": 2.7542781829833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080594, + "balance_loss_mlp": 1.04971933, + "epoch": 0.5042323970757984, + "flos": 811178924544.0, + "grad_norm": 0.062363268760676084, + "language_loss": 0.82867718, + "learning_rate": 0.000517442829404538, + "loss": 0.83948314, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.30834961, + "step": 2621, + "time_per_iteration": 2.9758973121643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080161, + "balance_loss_mlp": 1.05000091, + "epoch": 0.504424778761062, + "flos": 626985335808.0, + "grad_norm": 0.06818258917584033, + "language_loss": 0.8721652, + "learning_rate": 0.0005171314733614166, + "loss": 0.88296676, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.30102539, + "step": 2622, + "time_per_iteration": 2.8933780193328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082583, + "balance_loss_mlp": 1.05235183, + "epoch": 0.5046171604463255, + "flos": 515651553792.0, + "grad_norm": 0.06917321427090362, + "language_loss": 0.78315443, + "learning_rate": 0.0005168201106672671, + "loss": 0.79398024, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.30200195, + "step": 2623, + "time_per_iteration": 2.763855457305908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081188, + "balance_loss_mlp": 1.05093241, + "epoch": 0.504809542131589, + "flos": 527576606208.0, + "grad_norm": 0.06294733427077812, + "language_loss": 0.84776348, + "learning_rate": 0.0005165087414429717, + "loss": 0.85857534, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.30200195, + "step": 2624, + "time_per_iteration": 2.6454148292541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080505, + "balance_loss_mlp": 1.04967785, + "epoch": 0.5050019238168526, + "flos": 553855048704.0, + "grad_norm": 0.07820570667172376, + "language_loss": 0.83597136, + "learning_rate": 0.0005161973658094144, + "loss": 0.84677643, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.30810547, + "step": 2625, + "time_per_iteration": 2.630192756652832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075312, + "balance_loss_mlp": 1.04562938, + "epoch": 0.5051943055021162, + "flos": 574486677504.0, + "grad_norm": 0.10754310805258371, + "language_loss": 0.8215518, + "learning_rate": 0.000515885983887482, + "loss": 0.83230495, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.29614258, + "step": 2626, + "time_per_iteration": 2.762484312057495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082022, + "balance_loss_mlp": 1.05179107, + "epoch": 0.5053866871873798, + "flos": 496438516224.0, + "grad_norm": 0.060931372363222436, + "language_loss": 0.84606075, + "learning_rate": 0.0005155745957980636, + "loss": 0.85688096, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.30175781, + "step": 2627, + "time_per_iteration": 2.597625494003296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075866, + "balance_loss_mlp": 1.04513431, + "epoch": 0.5055790688726434, + "flos": 501963084288.0, + "grad_norm": 0.060140239439456865, + "language_loss": 0.8829447, + "learning_rate": 0.000515263201662051, + "loss": 0.89370334, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.30688477, + "step": 2628, + "time_per_iteration": 2.676429510116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081664, + "balance_loss_mlp": 1.05162382, + "epoch": 0.5057714505579068, + "flos": 844844276736.0, + "grad_norm": 0.05201747216110034, + "language_loss": 0.82525623, + "learning_rate": 0.0005149518016003378, + "loss": 0.83607286, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.30004883, + "step": 2629, + "time_per_iteration": 3.1674108505249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078833, + "balance_loss_mlp": 1.04874492, + "epoch": 0.5059638322431704, + "flos": 497580682752.0, + "grad_norm": 0.12452297981638945, + "language_loss": 0.82290918, + "learning_rate": 0.0005146403957338206, + "loss": 0.83369756, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.30029297, + "step": 2630, + "time_per_iteration": 2.574908494949341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075266, + "balance_loss_mlp": 1.04415226, + "epoch": 0.506156213928434, + "flos": 617526498816.0, + "grad_norm": 0.054026792513587725, + "language_loss": 0.81795335, + "learning_rate": 0.0005143289841833975, + "loss": 0.82870597, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.31079102, + "step": 2631, + "time_per_iteration": 2.8753445148468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071822, + "balance_loss_mlp": 1.04044628, + "epoch": 0.5063485956136976, + "flos": 424624923648.0, + "grad_norm": 0.07665080268010696, + "language_loss": 0.82169271, + "learning_rate": 0.0005140175670699696, + "loss": 0.83241099, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.31347656, + "step": 2632, + "time_per_iteration": 2.606656551361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070677, + "balance_loss_mlp": 1.03989697, + "epoch": 0.5065409772989612, + "flos": 569641586688.0, + "grad_norm": 0.05365826465054309, + "language_loss": 0.82773447, + "learning_rate": 0.0005137061445144395, + "loss": 0.83844125, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.30737305, + "step": 2633, + "time_per_iteration": 2.908146619796753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107465, + "balance_loss_mlp": 1.0429641, + "epoch": 0.5067333589842247, + "flos": 628510205952.0, + "grad_norm": 0.06908817272508659, + "language_loss": 0.87031686, + "learning_rate": 0.000513394716637712, + "loss": 0.88106334, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.31665039, + "step": 2634, + "time_per_iteration": 2.804591417312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049781, + "balance_loss_mlp": 1.03547585, + "epoch": 0.5069257406694883, + "flos": 1447062741504.0, + "grad_norm": 0.027149993512400487, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80241489, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.14257812, + "step": 2635, + "time_per_iteration": 4.903238296508789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071488, + "balance_loss_mlp": 1.03977799, + "epoch": 0.5071181223547518, + "flos": 638530656768.0, + "grad_norm": 0.05829667092367474, + "language_loss": 0.80886006, + "learning_rate": 0.0005127718454042958, + "loss": 0.81957495, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.31689453, + "step": 2636, + "time_per_iteration": 2.81962513923645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076357, + "balance_loss_mlp": 1.04467094, + "epoch": 0.5073105040400154, + "flos": 713239658496.0, + "grad_norm": 0.06782185148260642, + "language_loss": 0.84239292, + "learning_rate": 0.0005124604022894269, + "loss": 0.85315657, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.31665039, + "step": 2637, + "time_per_iteration": 2.933143377304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023059, + "balance_loss_mlp": 1.00932586, + "epoch": 0.5075028857252789, + "flos": 1435658605056.0, + "grad_norm": 0.016037159370544805, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78211284, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.13769531, + "step": 2638, + "time_per_iteration": 4.81339168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080028, + "balance_loss_mlp": 1.04786575, + "epoch": 0.5076952674105425, + "flos": 570857946624.0, + "grad_norm": 0.058900205072543066, + "language_loss": 0.83262694, + "learning_rate": 0.0005118375016679325, + "loss": 0.84342724, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.3215332, + "step": 2639, + "time_per_iteration": 2.7476773262023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076278, + "balance_loss_mlp": 1.04490256, + "epoch": 0.5078876490958061, + "flos": 516463451136.0, + "grad_norm": 0.08436499818571505, + "language_loss": 0.80410182, + "learning_rate": 0.0005115260444031382, + "loss": 0.81486464, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.31347656, + "step": 2640, + "time_per_iteration": 2.579087734222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016776, + "balance_loss_mlp": 1.00361574, + "epoch": 0.5080800307810697, + "flos": 1583378620416.0, + "grad_norm": 0.010326775178219767, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79748595, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.13183594, + "step": 2641, + "time_per_iteration": 4.939114809036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077717, + "balance_loss_mlp": 1.04665077, + "epoch": 0.5082724124663333, + "flos": 484965978624.0, + "grad_norm": 0.06392423646026814, + "language_loss": 0.86441147, + "learning_rate": 0.0005109031165700483, + "loss": 0.87518859, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.31030273, + "step": 2642, + "time_per_iteration": 2.572248935699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078927, + "balance_loss_mlp": 1.04809904, + "epoch": 0.5084647941515967, + "flos": 681928450560.0, + "grad_norm": 0.08514760687851525, + "language_loss": 0.83290648, + "learning_rate": 0.0005105916462435945, + "loss": 0.84369576, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.30786133, + "step": 2643, + "time_per_iteration": 2.832653284072876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081246, + "balance_loss_mlp": 1.05089569, + "epoch": 0.5086571758368603, + "flos": 548468692992.0, + "grad_norm": 0.05584396132467612, + "language_loss": 0.85012162, + "learning_rate": 0.0005102801718050989, + "loss": 0.86093414, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.30322266, + "step": 2644, + "time_per_iteration": 2.6693568229675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077361, + "balance_loss_mlp": 1.04755831, + "epoch": 0.5088495575221239, + "flos": 563751843840.0, + "grad_norm": 0.07396400679887168, + "language_loss": 0.89154196, + "learning_rate": 0.0005099686933754867, + "loss": 0.9023155, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.29785156, + "step": 2645, + "time_per_iteration": 2.688992977142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080157, + "balance_loss_mlp": 1.05016422, + "epoch": 0.5090419392073875, + "flos": 551132757504.0, + "grad_norm": 0.06521042739972126, + "language_loss": 0.84349567, + "learning_rate": 0.0005096572110756845, + "loss": 0.85429722, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.29956055, + "step": 2646, + "time_per_iteration": 2.694018840789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080367, + "balance_loss_mlp": 1.05065989, + "epoch": 0.509234320892651, + "flos": 567504230400.0, + "grad_norm": 0.049776737751643374, + "language_loss": 0.85623205, + "learning_rate": 0.0005093457250266205, + "loss": 0.86703575, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.296875, + "step": 2647, + "time_per_iteration": 2.69240665435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085077, + "balance_loss_mlp": 1.05527472, + "epoch": 0.5094267025779146, + "flos": 582339248640.0, + "grad_norm": 0.0639130152108818, + "language_loss": 0.83146644, + "learning_rate": 0.000509034235349224, + "loss": 0.84231722, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.29760742, + "step": 2648, + "time_per_iteration": 2.69409441947937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084417, + "balance_loss_mlp": 1.05499578, + "epoch": 0.5096190842631781, + "flos": 591704953344.0, + "grad_norm": 0.07990516858852505, + "language_loss": 0.81340408, + "learning_rate": 0.0005087227421644266, + "loss": 0.82424831, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.29345703, + "step": 2649, + "time_per_iteration": 2.7338664531707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087876, + "balance_loss_mlp": 1.05795491, + "epoch": 0.5098114659484417, + "flos": 513307584000.0, + "grad_norm": 0.06481094949829869, + "language_loss": 0.86482179, + "learning_rate": 0.0005084112455931602, + "loss": 0.87570059, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.29907227, + "step": 2650, + "time_per_iteration": 2.5772013664245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085843, + "balance_loss_mlp": 1.05561161, + "epoch": 0.5100038476337053, + "flos": 484389808128.0, + "grad_norm": 0.060404574220966636, + "language_loss": 0.84966755, + "learning_rate": 0.0005080997457563586, + "loss": 0.86052603, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.30200195, + "step": 2651, + "time_per_iteration": 2.5539023876190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089212, + "balance_loss_mlp": 1.05895662, + "epoch": 0.5101962293189688, + "flos": 461366157312.0, + "grad_norm": 0.06895787175374923, + "language_loss": 0.79026747, + "learning_rate": 0.0005077882427749569, + "loss": 0.80115962, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.30224609, + "step": 2652, + "time_per_iteration": 2.5036137104034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093546, + "balance_loss_mlp": 1.06367242, + "epoch": 0.5103886110042324, + "flos": 586760937984.0, + "grad_norm": 0.06232251007114316, + "language_loss": 0.84676695, + "learning_rate": 0.0005074767367698913, + "loss": 0.85770237, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.29833984, + "step": 2653, + "time_per_iteration": 2.6879539489746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088747, + "balance_loss_mlp": 1.05875421, + "epoch": 0.510580992689496, + "flos": 844906885632.0, + "grad_norm": 0.07002300864013745, + "language_loss": 0.83262461, + "learning_rate": 0.0005071652278620988, + "loss": 0.84351206, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.29956055, + "step": 2654, + "time_per_iteration": 3.048330307006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093234, + "balance_loss_mlp": 1.06369376, + "epoch": 0.5107733743747596, + "flos": 658328629248.0, + "grad_norm": 0.077240918193036, + "language_loss": 0.83515394, + "learning_rate": 0.0005068537161725186, + "loss": 0.84608626, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.29492188, + "step": 2655, + "time_per_iteration": 2.7864887714385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088669, + "balance_loss_mlp": 1.05941546, + "epoch": 0.510965756060023, + "flos": 701426677248.0, + "grad_norm": 0.06396168128091786, + "language_loss": 0.84455109, + "learning_rate": 0.0005065422018220893, + "loss": 0.85543782, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.29223633, + "step": 2656, + "time_per_iteration": 2.8869853019714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095041, + "balance_loss_mlp": 1.0650475, + "epoch": 0.5111581377452866, + "flos": 559430489088.0, + "grad_norm": 0.0709037558233959, + "language_loss": 0.7998327, + "learning_rate": 0.0005062306849317521, + "loss": 0.81078309, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.29956055, + "step": 2657, + "time_per_iteration": 2.7980425357818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010852, + "balance_loss_mlp": 1.05484891, + "epoch": 0.5113505194305502, + "flos": 608745729024.0, + "grad_norm": 0.0652959904845647, + "language_loss": 0.83424717, + "learning_rate": 0.0005059191656224487, + "loss": 0.84509915, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.30297852, + "step": 2658, + "time_per_iteration": 2.735557794570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085406, + "balance_loss_mlp": 1.05488813, + "epoch": 0.5115429011158138, + "flos": 534214227456.0, + "grad_norm": 0.05645977889013881, + "language_loss": 0.89198554, + "learning_rate": 0.0005056076440151212, + "loss": 0.90283966, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.3046875, + "step": 2659, + "time_per_iteration": 2.651273012161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136875, + "balance_loss_mlp": 1.12314212, + "epoch": 0.5117352828010774, + "flos": 1361451580416.0, + "grad_norm": 0.05420368374393455, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77424991, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.13769531, + "step": 2660, + "time_per_iteration": 4.8447229862213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085456, + "balance_loss_mlp": 1.05689311, + "epoch": 0.5119276644863409, + "flos": 633444046848.0, + "grad_norm": 0.04523661755748661, + "language_loss": 0.87268543, + "learning_rate": 0.0005049845943901691, + "loss": 0.88354003, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.28515625, + "step": 2661, + "time_per_iteration": 2.855107307434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079963, + "balance_loss_mlp": 1.05092359, + "epoch": 0.5121200461716044, + "flos": 585304468992.0, + "grad_norm": 0.05522645200412479, + "language_loss": 0.86379933, + "learning_rate": 0.0005046730666144338, + "loss": 0.87459898, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.2902832, + "step": 2662, + "time_per_iteration": 2.841339349746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082682, + "balance_loss_mlp": 1.05390453, + "epoch": 0.512312427856868, + "flos": 1032081661440.0, + "grad_norm": 0.05374936854204756, + "language_loss": 0.87915027, + "learning_rate": 0.0005043615370244532, + "loss": 0.8899771, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.2878418, + "step": 2663, + "time_per_iteration": 3.364856004714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049781, + "balance_loss_mlp": 1.03728747, + "epoch": 0.5125048095421316, + "flos": 1537257277440.0, + "grad_norm": 0.022479341124125186, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79294169, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.125, + "step": 2664, + "time_per_iteration": 4.635313510894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080439, + "balance_loss_mlp": 1.05163848, + "epoch": 0.5126971912273951, + "flos": 590814480384.0, + "grad_norm": 0.04479435391735135, + "language_loss": 0.85200715, + "learning_rate": 0.0005037384728855425, + "loss": 0.86281157, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.28808594, + "step": 2665, + "time_per_iteration": 2.7995188236236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083297, + "balance_loss_mlp": 1.05356586, + "epoch": 0.5128895729126587, + "flos": 551393215488.0, + "grad_norm": 0.0801864670549744, + "language_loss": 0.84280151, + "learning_rate": 0.0005034269385785075, + "loss": 0.85363448, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.29711914, + "step": 2666, + "time_per_iteration": 2.673332929611206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090699, + "balance_loss_mlp": 1.0623982, + "epoch": 0.5130819545979223, + "flos": 481031709696.0, + "grad_norm": 0.06501156427369086, + "language_loss": 0.84454274, + "learning_rate": 0.0005031154029410168, + "loss": 0.85544968, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.28344727, + "step": 2667, + "time_per_iteration": 2.5442566871643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086564, + "balance_loss_mlp": 1.0577395, + "epoch": 0.5132743362831859, + "flos": 475556603904.0, + "grad_norm": 0.06480382372099369, + "language_loss": 0.86841118, + "learning_rate": 0.0005028038660940197, + "loss": 0.87927675, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.28808594, + "step": 2668, + "time_per_iteration": 2.62888765335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077032, + "balance_loss_mlp": 1.04832673, + "epoch": 0.5134667179684494, + "flos": 503559175680.0, + "grad_norm": 0.05084400085528349, + "language_loss": 0.84573722, + "learning_rate": 0.0005024923281584648, + "loss": 0.85650754, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.28662109, + "step": 2669, + "time_per_iteration": 2.6316568851470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092041, + "balance_loss_mlp": 1.06312072, + "epoch": 0.5136590996537129, + "flos": 503647925760.0, + "grad_norm": 0.05870793453685439, + "language_loss": 0.82656723, + "learning_rate": 0.0005021807892553026, + "loss": 0.83748764, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.28881836, + "step": 2670, + "time_per_iteration": 2.707345724105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093085, + "balance_loss_mlp": 1.06457078, + "epoch": 0.5138514813389765, + "flos": 624330035712.0, + "grad_norm": 0.08829821247143162, + "language_loss": 0.84517181, + "learning_rate": 0.0005018692495054828, + "loss": 0.85610259, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.28540039, + "step": 2671, + "time_per_iteration": 2.758309841156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092768, + "balance_loss_mlp": 1.06399131, + "epoch": 0.5140438630242401, + "flos": 583274801664.0, + "grad_norm": 0.05555500929459815, + "language_loss": 0.80821186, + "learning_rate": 0.0005015577090299561, + "loss": 0.8191396, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.28735352, + "step": 2672, + "time_per_iteration": 2.6883137226104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090029, + "balance_loss_mlp": 1.06125236, + "epoch": 0.5142362447095037, + "flos": 487683887616.0, + "grad_norm": 0.06705414985084517, + "language_loss": 0.86672199, + "learning_rate": 0.0005012461679496729, + "loss": 0.87762225, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.28759766, + "step": 2673, + "time_per_iteration": 2.5949177742004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092599, + "balance_loss_mlp": 1.0630827, + "epoch": 0.5144286263947672, + "flos": 526601765376.0, + "grad_norm": 0.06054107713253035, + "language_loss": 0.87204134, + "learning_rate": 0.0005009346263855848, + "loss": 0.88296735, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.29467773, + "step": 2674, + "time_per_iteration": 2.6084070205688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093368, + "balance_loss_mlp": 1.06401849, + "epoch": 0.5146210080800308, + "flos": 486252149760.0, + "grad_norm": 0.08912792131396882, + "language_loss": 0.83928424, + "learning_rate": 0.0005006230844586422, + "loss": 0.85021788, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.29345703, + "step": 2675, + "time_per_iteration": 2.8340024948120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094006, + "balance_loss_mlp": 1.06496692, + "epoch": 0.5148133897652943, + "flos": 515622440448.0, + "grad_norm": 0.06185145068902706, + "language_loss": 0.79025733, + "learning_rate": 0.0005003115422897968, + "loss": 0.80119741, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.29052734, + "step": 2676, + "time_per_iteration": 2.7350447177886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088176, + "balance_loss_mlp": 1.05780196, + "epoch": 0.5150057714505579, + "flos": 510963614208.0, + "grad_norm": 0.06610854708750855, + "language_loss": 0.86982405, + "learning_rate": 0.0005, + "loss": 0.88070583, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.30322266, + "step": 2677, + "time_per_iteration": 2.62941837310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082976, + "balance_loss_mlp": 1.0535078, + "epoch": 0.5151981531358215, + "flos": 910541164032.0, + "grad_norm": 0.05650592481949535, + "language_loss": 0.7918483, + "learning_rate": 0.0004996884577102033, + "loss": 0.80267811, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.29418945, + "step": 2678, + "time_per_iteration": 3.1128311157226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085723, + "balance_loss_mlp": 1.05577731, + "epoch": 0.515390534821085, + "flos": 471599013888.0, + "grad_norm": 0.05289591163695072, + "language_loss": 0.84550285, + "learning_rate": 0.000499376915541358, + "loss": 0.85636008, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.29907227, + "step": 2679, + "time_per_iteration": 2.709259271621704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082197, + "balance_loss_mlp": 1.0510838, + "epoch": 0.5155829165063486, + "flos": 649811137536.0, + "grad_norm": 0.05812477607611756, + "language_loss": 0.81116259, + "learning_rate": 0.0004990653736144155, + "loss": 0.82198453, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.31079102, + "step": 2680, + "time_per_iteration": 2.8433125019073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083796, + "balance_loss_mlp": 1.05318332, + "epoch": 0.5157752981916122, + "flos": 414038476800.0, + "grad_norm": 0.06443376303588658, + "language_loss": 0.8582924, + "learning_rate": 0.0004987538320503271, + "loss": 0.86913037, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.30566406, + "step": 2681, + "time_per_iteration": 2.492128372192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079646, + "balance_loss_mlp": 1.04860437, + "epoch": 0.5159676798768758, + "flos": 553569859584.0, + "grad_norm": 0.06119575969443392, + "language_loss": 0.83057904, + "learning_rate": 0.0004984422909700442, + "loss": 0.84137553, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.31005859, + "step": 2682, + "time_per_iteration": 2.6817965507507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107717, + "balance_loss_mlp": 1.04560328, + "epoch": 0.5161600615621393, + "flos": 586234229760.0, + "grad_norm": 0.06357079240733023, + "language_loss": 0.83849651, + "learning_rate": 0.0004981307504945173, + "loss": 0.84926826, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.31542969, + "step": 2683, + "time_per_iteration": 2.6884219646453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107835, + "balance_loss_mlp": 1.04764211, + "epoch": 0.5163524432474028, + "flos": 588568025088.0, + "grad_norm": 0.058627663819765745, + "language_loss": 0.89028186, + "learning_rate": 0.0004978192107446976, + "loss": 0.90106535, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.30664062, + "step": 2684, + "time_per_iteration": 2.7606394290924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074512, + "balance_loss_mlp": 1.04397011, + "epoch": 0.5165448249326664, + "flos": 503642133504.0, + "grad_norm": 0.05338243685455816, + "language_loss": 0.870161, + "learning_rate": 0.0004975076718415353, + "loss": 0.88090611, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.30493164, + "step": 2685, + "time_per_iteration": 2.594937562942505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081075, + "balance_loss_mlp": 1.04991364, + "epoch": 0.51673720661793, + "flos": 416539597824.0, + "grad_norm": 0.06078629774986462, + "language_loss": 0.90568233, + "learning_rate": 0.0004971961339059806, + "loss": 0.91649306, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.3112793, + "step": 2686, + "time_per_iteration": 2.4705729484558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075772, + "balance_loss_mlp": 1.04406273, + "epoch": 0.5169295883031936, + "flos": 598696164864.0, + "grad_norm": 0.067622669815522, + "language_loss": 0.83813852, + "learning_rate": 0.0004968845970589832, + "loss": 0.84889627, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.31689453, + "step": 2687, + "time_per_iteration": 2.6784517765045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108779, + "balance_loss_mlp": 1.05760634, + "epoch": 0.517121969988457, + "flos": 556543844352.0, + "grad_norm": 0.06982295057413529, + "language_loss": 0.84568465, + "learning_rate": 0.0004965730614214926, + "loss": 0.85656255, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.30151367, + "step": 2688, + "time_per_iteration": 2.628742218017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078435, + "balance_loss_mlp": 1.0470829, + "epoch": 0.5173143516737206, + "flos": 469214346240.0, + "grad_norm": 0.06558972316908819, + "language_loss": 0.85422957, + "learning_rate": 0.0004962615271144576, + "loss": 0.86501396, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.31323242, + "step": 2689, + "time_per_iteration": 2.5566818714141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079558, + "balance_loss_mlp": 1.04923093, + "epoch": 0.5175067333589842, + "flos": 719739067392.0, + "grad_norm": 0.32559574880762837, + "language_loss": 0.82639515, + "learning_rate": 0.0004959499942588264, + "loss": 0.83719069, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.30273438, + "step": 2690, + "time_per_iteration": 2.8994317054748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057487, + "balance_loss_mlp": 1.04442203, + "epoch": 0.5176991150442478, + "flos": 1465402834944.0, + "grad_norm": 0.028996752449645728, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79257512, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.13085938, + "step": 2691, + "time_per_iteration": 4.746784687042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109471, + "balance_loss_mlp": 1.07830977, + "epoch": 0.5178914967295114, + "flos": 612345346560.0, + "grad_norm": 0.12339515707636219, + "language_loss": 0.85558736, + "learning_rate": 0.0004953269333855661, + "loss": 0.86668211, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.3112793, + "step": 2692, + "time_per_iteration": 2.8191914558410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109212, + "balance_loss_mlp": 1.07991028, + "epoch": 0.5180838784147749, + "flos": 500663766528.0, + "grad_norm": 0.07785846219337349, + "language_loss": 0.84034789, + "learning_rate": 0.0004950154056098309, + "loss": 0.85143995, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.29272461, + "step": 2693, + "time_per_iteration": 2.686821222305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129818, + "balance_loss_mlp": 1.09963465, + "epoch": 0.5182762601000385, + "flos": 688531166208.0, + "grad_norm": 0.07144537100010277, + "language_loss": 0.83820134, + "learning_rate": 0.0004947038797692867, + "loss": 0.84949952, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.30126953, + "step": 2694, + "time_per_iteration": 2.8041090965270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128051, + "balance_loss_mlp": 1.09741426, + "epoch": 0.518468641785302, + "flos": 665315458560.0, + "grad_norm": 0.06183052783496024, + "language_loss": 0.77540803, + "learning_rate": 0.0004943923559848789, + "loss": 0.78668851, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.3059082, + "step": 2695, + "time_per_iteration": 2.797661781311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127895, + "balance_loss_mlp": 1.09756875, + "epoch": 0.5186610234705656, + "flos": 566440639488.0, + "grad_norm": 0.054443821670517534, + "language_loss": 0.90626478, + "learning_rate": 0.0004940808343775515, + "loss": 0.91754371, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.30297852, + "step": 2696, + "time_per_iteration": 2.708075523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126092, + "balance_loss_mlp": 1.09593177, + "epoch": 0.5188534051558291, + "flos": 428652324864.0, + "grad_norm": 0.08653085411735448, + "language_loss": 0.82187402, + "learning_rate": 0.0004937693150682479, + "loss": 0.83313495, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.30126953, + "step": 2697, + "time_per_iteration": 2.5607407093048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116261, + "balance_loss_mlp": 1.0861007, + "epoch": 0.5190457868410927, + "flos": 546085435392.0, + "grad_norm": 0.07683001308624603, + "language_loss": 0.76774538, + "learning_rate": 0.0004934577981779107, + "loss": 0.77890801, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.30175781, + "step": 2698, + "time_per_iteration": 2.730090618133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112238, + "balance_loss_mlp": 1.0813148, + "epoch": 0.5192381685263563, + "flos": 548321716224.0, + "grad_norm": 0.05605263998280499, + "language_loss": 0.81117129, + "learning_rate": 0.0004931462838274817, + "loss": 0.82229376, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.30883789, + "step": 2699, + "time_per_iteration": 2.847720146179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109944, + "balance_loss_mlp": 1.07957006, + "epoch": 0.5194305502116199, + "flos": 574993036800.0, + "grad_norm": 0.0574424557407856, + "language_loss": 0.84004086, + "learning_rate": 0.0004928347721379011, + "loss": 0.85114038, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.30322266, + "step": 2700, + "time_per_iteration": 2.6999762058258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102806, + "balance_loss_mlp": 1.07185948, + "epoch": 0.5196229318968835, + "flos": 434019741696.0, + "grad_norm": 0.05483286228362013, + "language_loss": 0.82044077, + "learning_rate": 0.0004925232632301089, + "loss": 0.83146882, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.30908203, + "step": 2701, + "time_per_iteration": 2.560593366622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098243, + "balance_loss_mlp": 1.06791615, + "epoch": 0.5198153135821469, + "flos": 558607007232.0, + "grad_norm": 0.06379159996009351, + "language_loss": 0.79575932, + "learning_rate": 0.0004922117572250431, + "loss": 0.80674177, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.30273438, + "step": 2702, + "time_per_iteration": 2.6621010303497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094553, + "balance_loss_mlp": 1.0648458, + "epoch": 0.5200076952674105, + "flos": 565397397504.0, + "grad_norm": 0.06234734694325623, + "language_loss": 0.80990833, + "learning_rate": 0.0004919002542436414, + "loss": 0.82085389, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.296875, + "step": 2703, + "time_per_iteration": 2.8583548069000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098109, + "balance_loss_mlp": 1.06806874, + "epoch": 0.5202000769526741, + "flos": 570916173312.0, + "grad_norm": 0.11086337696641164, + "language_loss": 0.81129456, + "learning_rate": 0.0004915887544068399, + "loss": 0.82227564, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.29980469, + "step": 2704, + "time_per_iteration": 2.6579208374023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097204, + "balance_loss_mlp": 1.06787837, + "epoch": 0.5203924586379377, + "flos": 693898583040.0, + "grad_norm": 0.06500287710368027, + "language_loss": 0.78155613, + "learning_rate": 0.0004912772578355736, + "loss": 0.79252815, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.29296875, + "step": 2705, + "time_per_iteration": 2.93152117729187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094522, + "balance_loss_mlp": 1.06395674, + "epoch": 0.5205848403232012, + "flos": 566215087104.0, + "grad_norm": 0.05937288472032104, + "language_loss": 0.82798421, + "learning_rate": 0.000490965764650776, + "loss": 0.83892947, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.30541992, + "step": 2706, + "time_per_iteration": 2.914069414138794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090504, + "balance_loss_mlp": 1.06048679, + "epoch": 0.5207772220084648, + "flos": 1213775539200.0, + "grad_norm": 0.08994605713309432, + "language_loss": 0.82582623, + "learning_rate": 0.0004906542749733798, + "loss": 0.83673131, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.29980469, + "step": 2707, + "time_per_iteration": 3.632612943649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086422, + "balance_loss_mlp": 1.05647707, + "epoch": 0.5209696036937284, + "flos": 592547374080.0, + "grad_norm": 0.05099864574791971, + "language_loss": 0.85112798, + "learning_rate": 0.0004903427889243156, + "loss": 0.86199224, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.29907227, + "step": 2708, + "time_per_iteration": 2.860605001449585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089192, + "balance_loss_mlp": 1.05898452, + "epoch": 0.5211619853789919, + "flos": 522623826432.0, + "grad_norm": 0.058285600596581014, + "language_loss": 0.85712206, + "learning_rate": 0.0004900313066245134, + "loss": 0.86801398, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.30151367, + "step": 2709, + "time_per_iteration": 2.6910862922668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078824, + "balance_loss_mlp": 1.04873538, + "epoch": 0.5213543670642555, + "flos": 502534872576.0, + "grad_norm": 0.06298998318770882, + "language_loss": 0.81023324, + "learning_rate": 0.0004897198281949012, + "loss": 0.8210215, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.30029297, + "step": 2710, + "time_per_iteration": 2.660783290863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085709, + "balance_loss_mlp": 1.0563364, + "epoch": 0.521546748749519, + "flos": 585682790400.0, + "grad_norm": 0.06559869836216795, + "language_loss": 0.77832824, + "learning_rate": 0.0004894083537564057, + "loss": 0.78918535, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.29345703, + "step": 2711, + "time_per_iteration": 2.7276909351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079715, + "balance_loss_mlp": 1.04965043, + "epoch": 0.5217391304347826, + "flos": 569833643520.0, + "grad_norm": 0.0684248274147048, + "language_loss": 0.80827081, + "learning_rate": 0.0004890968834299519, + "loss": 0.81906796, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.30029297, + "step": 2712, + "time_per_iteration": 2.738229751586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079667, + "balance_loss_mlp": 1.04974508, + "epoch": 0.5219315121200462, + "flos": 542501784576.0, + "grad_norm": 0.061787257592987296, + "language_loss": 0.78808606, + "learning_rate": 0.0004887854173364633, + "loss": 0.79888272, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.29882812, + "step": 2713, + "time_per_iteration": 2.734443426132202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074151, + "balance_loss_mlp": 1.04480171, + "epoch": 0.5221238938053098, + "flos": 550006557696.0, + "grad_norm": 0.05102910961180143, + "language_loss": 0.81491256, + "learning_rate": 0.0004884739555968617, + "loss": 0.82565403, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.29272461, + "step": 2714, + "time_per_iteration": 2.867036819458008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069714, + "balance_loss_mlp": 1.05559933, + "epoch": 0.5223162754905732, + "flos": 1354373028864.0, + "grad_norm": 0.021468860083039186, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80046767, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.14160156, + "step": 2715, + "time_per_iteration": 4.962530851364136 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107621, + "balance_loss_mlp": 1.04559731, + "epoch": 0.5225086571758368, + "flos": 567441621504.0, + "grad_norm": 0.06298546380073215, + "language_loss": 0.86646473, + "learning_rate": 0.0004878510456629992, + "loss": 0.87722689, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.30566406, + "step": 2716, + "time_per_iteration": 2.9603123664855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081784, + "balance_loss_mlp": 1.05110002, + "epoch": 0.5227010388611004, + "flos": 499914478080.0, + "grad_norm": 0.07025764068668285, + "language_loss": 0.85336471, + "learning_rate": 0.00048753959771057314, + "loss": 0.86418259, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.30639648, + "step": 2717, + "time_per_iteration": 2.632622480392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085269, + "balance_loss_mlp": 1.05389357, + "epoch": 0.522893420546364, + "flos": 597372115968.0, + "grad_norm": 0.05729998182106491, + "language_loss": 0.82715809, + "learning_rate": 0.0004872281545957044, + "loss": 0.83801079, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.31347656, + "step": 2718, + "time_per_iteration": 2.7305338382720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078735, + "balance_loss_mlp": 1.04726386, + "epoch": 0.5230858022316276, + "flos": 664278008832.0, + "grad_norm": 0.058019575066879846, + "language_loss": 0.86264348, + "learning_rate": 0.0004869167164393055, + "loss": 0.87343085, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.31445312, + "step": 2719, + "time_per_iteration": 2.9418067932128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075601, + "balance_loss_mlp": 1.04472566, + "epoch": 0.5232781839168911, + "flos": 603547047936.0, + "grad_norm": 0.0640312473735956, + "language_loss": 0.89536262, + "learning_rate": 0.00048660528336228793, + "loss": 0.90611863, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.30834961, + "step": 2720, + "time_per_iteration": 2.8314764499664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075602, + "balance_loss_mlp": 1.04506063, + "epoch": 0.5234705656021547, + "flos": 550438723584.0, + "grad_norm": 0.05104764752581424, + "language_loss": 0.89906192, + "learning_rate": 0.0004862938554855606, + "loss": 0.90981793, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.30517578, + "step": 2721, + "time_per_iteration": 2.7912685871124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077284, + "balance_loss_mlp": 1.04705238, + "epoch": 0.5236629472874182, + "flos": 504026247168.0, + "grad_norm": 0.09225462001304952, + "language_loss": 0.86140561, + "learning_rate": 0.0004859824329300304, + "loss": 0.87217844, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.30200195, + "step": 2722, + "time_per_iteration": 2.5850255489349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081058, + "balance_loss_mlp": 1.0504688, + "epoch": 0.5238553289726818, + "flos": 547394927616.0, + "grad_norm": 0.05217438950511115, + "language_loss": 0.83504456, + "learning_rate": 0.00048567101581660244, + "loss": 0.84585512, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.30541992, + "step": 2723, + "time_per_iteration": 2.6090264320373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077999, + "balance_loss_mlp": 1.04712343, + "epoch": 0.5240477106579453, + "flos": 531702931968.0, + "grad_norm": 0.07777816613104971, + "language_loss": 0.8713702, + "learning_rate": 0.00048535960426617956, + "loss": 0.88215029, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.30834961, + "step": 2724, + "time_per_iteration": 2.6143879890441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079989, + "balance_loss_mlp": 1.04966187, + "epoch": 0.5242400923432089, + "flos": 617653126656.0, + "grad_norm": 0.061907794652793086, + "language_loss": 0.81729943, + "learning_rate": 0.0004850481983996621, + "loss": 0.82809931, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.30273438, + "step": 2725, + "time_per_iteration": 2.7439112663269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081834, + "balance_loss_mlp": 1.05174541, + "epoch": 0.5244324740284725, + "flos": 416461022208.0, + "grad_norm": 0.06296520541747418, + "language_loss": 0.87762207, + "learning_rate": 0.0004847367983379492, + "loss": 0.88844043, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.30053711, + "step": 2726, + "time_per_iteration": 2.497286796569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080055, + "balance_loss_mlp": 1.05056226, + "epoch": 0.5246248557137361, + "flos": 626113801728.0, + "grad_norm": 0.09099502950257793, + "language_loss": 0.78826892, + "learning_rate": 0.00048442540420193643, + "loss": 0.79906946, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.29418945, + "step": 2727, + "time_per_iteration": 2.9191126823425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077698, + "balance_loss_mlp": 1.04751396, + "epoch": 0.5248172373989997, + "flos": 1247980746240.0, + "grad_norm": 0.061166777448516674, + "language_loss": 0.79150236, + "learning_rate": 0.0004841140161125182, + "loss": 0.80227935, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.30126953, + "step": 2728, + "time_per_iteration": 3.5845582485198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082892, + "balance_loss_mlp": 1.05306578, + "epoch": 0.5250096190842631, + "flos": 506616118272.0, + "grad_norm": 0.06421237850995067, + "language_loss": 0.84691751, + "learning_rate": 0.0004838026341905857, + "loss": 0.85774648, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.29785156, + "step": 2729, + "time_per_iteration": 2.75872540473938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080479, + "balance_loss_mlp": 1.05010509, + "epoch": 0.5252020007695267, + "flos": 611021297664.0, + "grad_norm": 0.051610102750965434, + "language_loss": 0.85352898, + "learning_rate": 0.00048349125855702844, + "loss": 0.86433375, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.30322266, + "step": 2730, + "time_per_iteration": 2.7679519653320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108307, + "balance_loss_mlp": 1.05322015, + "epoch": 0.5253943824547903, + "flos": 538970568192.0, + "grad_norm": 0.05904184367240025, + "language_loss": 0.81296933, + "learning_rate": 0.00048317988933273287, + "loss": 0.82380003, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.29785156, + "step": 2731, + "time_per_iteration": 2.7559163570404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079843, + "balance_loss_mlp": 1.0495404, + "epoch": 0.5255867641400539, + "flos": 697714988544.0, + "grad_norm": 0.06321650060381495, + "language_loss": 0.8227402, + "learning_rate": 0.00048286852663858367, + "loss": 0.83353865, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.30273438, + "step": 2732, + "time_per_iteration": 2.9430267810821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077146, + "balance_loss_mlp": 1.04710531, + "epoch": 0.5257791458253175, + "flos": 666975568896.0, + "grad_norm": 0.05929618739033729, + "language_loss": 0.84009433, + "learning_rate": 0.000482557170595462, + "loss": 0.85086572, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.30004883, + "step": 2733, + "time_per_iteration": 2.914397954940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083368, + "balance_loss_mlp": 1.05194473, + "epoch": 0.525971527510581, + "flos": 483375679488.0, + "grad_norm": 0.05379595829627383, + "language_loss": 0.87649244, + "learning_rate": 0.0004822458213242475, + "loss": 0.88732612, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.31396484, + "step": 2734, + "time_per_iteration": 2.533350944519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082101, + "balance_loss_mlp": 1.05215609, + "epoch": 0.5261639091958445, + "flos": 829559715840.0, + "grad_norm": 0.15308762813128413, + "language_loss": 0.85928154, + "learning_rate": 0.00048193447894581627, + "loss": 0.87010252, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.29882812, + "step": 2735, + "time_per_iteration": 3.0971109867095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081636, + "balance_loss_mlp": 1.05190539, + "epoch": 0.5263562908811081, + "flos": 520461739008.0, + "grad_norm": 0.059512944610192846, + "language_loss": 0.88020355, + "learning_rate": 0.00048162314358104243, + "loss": 0.89101994, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.296875, + "step": 2736, + "time_per_iteration": 2.619262456893921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081772, + "balance_loss_mlp": 1.05268502, + "epoch": 0.5265486725663717, + "flos": 574722404352.0, + "grad_norm": 0.05996263826740056, + "language_loss": 0.83247852, + "learning_rate": 0.0004813118153507969, + "loss": 0.84329623, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.29052734, + "step": 2737, + "time_per_iteration": 2.724499464035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079963, + "balance_loss_mlp": 1.06603909, + "epoch": 0.5267410542516352, + "flos": 1546439537664.0, + "grad_norm": 0.02099488410784391, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83527088, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.13964844, + "step": 2738, + "time_per_iteration": 4.7655651569366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109097, + "balance_loss_mlp": 1.06135821, + "epoch": 0.5269334359368988, + "flos": 929576701440.0, + "grad_norm": 0.054521404688675106, + "language_loss": 0.83406657, + "learning_rate": 0.00048068918077736163, + "loss": 0.84497625, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.29541016, + "step": 2739, + "time_per_iteration": 3.2117719650268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087742, + "balance_loss_mlp": 1.05820239, + "epoch": 0.5271258176221624, + "flos": 655079629824.0, + "grad_norm": 0.06027403163408104, + "language_loss": 0.81200749, + "learning_rate": 0.0004803778746759001, + "loss": 0.82288492, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.29492188, + "step": 2740, + "time_per_iteration": 2.883953809738159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085865, + "balance_loss_mlp": 1.05627775, + "epoch": 0.527318199307426, + "flos": 542781181440.0, + "grad_norm": 0.07072803117785999, + "language_loss": 0.81773007, + "learning_rate": 0.00048006657619242317, + "loss": 0.82858872, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.29541016, + "step": 2741, + "time_per_iteration": 2.6289987564086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108813, + "balance_loss_mlp": 1.05959105, + "epoch": 0.5275105809926895, + "flos": 447629635584.0, + "grad_norm": 0.07275993710061575, + "language_loss": 0.78293514, + "learning_rate": 0.00047975528544778775, + "loss": 0.79381645, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.28491211, + "step": 2742, + "time_per_iteration": 2.6370468139648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085684, + "balance_loss_mlp": 1.05685973, + "epoch": 0.527702962677953, + "flos": 578656673280.0, + "grad_norm": 0.08133754904485412, + "language_loss": 0.88532221, + "learning_rate": 0.00047944400256284754, + "loss": 0.89617908, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.28808594, + "step": 2743, + "time_per_iteration": 2.6988437175750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084011, + "balance_loss_mlp": 1.05504286, + "epoch": 0.5278953443632166, + "flos": 652465027584.0, + "grad_norm": 0.061354637447893066, + "language_loss": 0.8008759, + "learning_rate": 0.0004791327276584532, + "loss": 0.81171608, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.28930664, + "step": 2744, + "time_per_iteration": 2.843850612640381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092207, + "balance_loss_mlp": 1.0627383, + "epoch": 0.5280877260484802, + "flos": 513741159936.0, + "grad_norm": 0.06451817982099761, + "language_loss": 0.80512536, + "learning_rate": 0.00047882146085545264, + "loss": 0.81604743, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.29418945, + "step": 2745, + "time_per_iteration": 2.6313765048980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059727, + "balance_loss_mlp": 1.04713857, + "epoch": 0.5282801077337438, + "flos": 1444650370560.0, + "grad_norm": 0.01846816151842821, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76462114, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.12597656, + "step": 2746, + "time_per_iteration": 4.961829662322998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080481, + "balance_loss_mlp": 1.05105972, + "epoch": 0.5284724894190073, + "flos": 604580115456.0, + "grad_norm": 0.06475941859576588, + "language_loss": 0.79224515, + "learning_rate": 0.00047819895203700684, + "loss": 0.80304992, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.29394531, + "step": 2747, + "time_per_iteration": 2.727640151977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048677, + "balance_loss_mlp": 1.03618371, + "epoch": 0.5286648711042709, + "flos": 1494172224000.0, + "grad_norm": 0.01378573653182101, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76561111, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.125, + "step": 2748, + "time_per_iteration": 4.70350980758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074595, + "balance_loss_mlp": 1.04469705, + "epoch": 0.5288572527895344, + "flos": 597313889280.0, + "grad_norm": 0.06074589131451646, + "language_loss": 0.88260013, + "learning_rate": 0.0004775764770742277, + "loss": 0.89334607, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.29907227, + "step": 2749, + "time_per_iteration": 2.8722305297851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080903, + "balance_loss_mlp": 1.05064785, + "epoch": 0.529049634474798, + "flos": 557041439232.0, + "grad_norm": 0.1215004440050613, + "language_loss": 0.86453164, + "learning_rate": 0.00047726525259079777, + "loss": 0.8753407, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.30224609, + "step": 2750, + "time_per_iteration": 2.782618522644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082004, + "balance_loss_mlp": 1.05203521, + "epoch": 0.5292420161600616, + "flos": 580986086400.0, + "grad_norm": 0.07030365944612293, + "language_loss": 0.88707, + "learning_rate": 0.0004769540369337798, + "loss": 0.89789003, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.29931641, + "step": 2751, + "time_per_iteration": 2.7570507526397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078279, + "balance_loss_mlp": 1.04792809, + "epoch": 0.5294343978453251, + "flos": 607989086208.0, + "grad_norm": 0.06134745452443849, + "language_loss": 0.86018121, + "learning_rate": 0.00047664283022399794, + "loss": 0.87096399, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.3034668, + "step": 2752, + "time_per_iteration": 2.8683836460113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070772, + "balance_loss_mlp": 1.04101765, + "epoch": 0.5296267795305887, + "flos": 646226076672.0, + "grad_norm": 0.061305381303338104, + "language_loss": 0.80927074, + "learning_rate": 0.00047633163258227376, + "loss": 0.81997848, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.29711914, + "step": 2753, + "time_per_iteration": 2.889761447906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080468, + "balance_loss_mlp": 1.05040383, + "epoch": 0.5298191612158523, + "flos": 559482923520.0, + "grad_norm": 0.06040690928097006, + "language_loss": 0.85472161, + "learning_rate": 0.0004760204441294247, + "loss": 0.86552632, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.30004883, + "step": 2754, + "time_per_iteration": 2.7022712230682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078457, + "balance_loss_mlp": 1.04736757, + "epoch": 0.5300115429011159, + "flos": 513776065536.0, + "grad_norm": 0.08887078297019954, + "language_loss": 0.85966748, + "learning_rate": 0.00047570926498626486, + "loss": 0.87045205, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.31054688, + "step": 2755, + "time_per_iteration": 2.694779396057129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083154, + "balance_loss_mlp": 1.05130148, + "epoch": 0.5302039245863793, + "flos": 672475405824.0, + "grad_norm": 0.0527518505260492, + "language_loss": 0.8147307, + "learning_rate": 0.00047539809527360474, + "loss": 0.82556224, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.31835938, + "step": 2756, + "time_per_iteration": 2.8726418018341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086344, + "balance_loss_mlp": 1.05418181, + "epoch": 0.5303963062716429, + "flos": 730507396608.0, + "grad_norm": 0.05719732969355854, + "language_loss": 0.82233423, + "learning_rate": 0.0004750869351122511, + "loss": 0.83319771, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.32128906, + "step": 2757, + "time_per_iteration": 2.989522933959961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086301, + "balance_loss_mlp": 1.05397129, + "epoch": 0.5305886879569065, + "flos": 573156836352.0, + "grad_norm": 0.0731965335963944, + "language_loss": 0.81977046, + "learning_rate": 0.00047477578462300685, + "loss": 0.83063352, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.32324219, + "step": 2758, + "time_per_iteration": 2.7154197692871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108253, + "balance_loss_mlp": 1.05153537, + "epoch": 0.5307810696421701, + "flos": 694988315136.0, + "grad_norm": 0.05716072116198451, + "language_loss": 0.79401624, + "learning_rate": 0.0004744646439266718, + "loss": 0.80484152, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.30957031, + "step": 2759, + "time_per_iteration": 3.010188102722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087952, + "balance_loss_mlp": 1.05719638, + "epoch": 0.5309734513274337, + "flos": 648629683200.0, + "grad_norm": 0.06513852008932475, + "language_loss": 0.92120409, + "learning_rate": 0.000474153513144041, + "loss": 0.93208361, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.30712891, + "step": 2760, + "time_per_iteration": 2.9100866317749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090471, + "balance_loss_mlp": 1.05878544, + "epoch": 0.5311658330126972, + "flos": 604517506560.0, + "grad_norm": 0.05916855301127547, + "language_loss": 0.8678081, + "learning_rate": 0.00047384239239590633, + "loss": 0.87871277, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.31665039, + "step": 2761, + "time_per_iteration": 2.8746495246887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108692, + "balance_loss_mlp": 1.05516267, + "epoch": 0.5313582146979607, + "flos": 557995931136.0, + "grad_norm": 0.06020342742423831, + "language_loss": 0.88611233, + "learning_rate": 0.0004735312818030556, + "loss": 0.8969816, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.31738281, + "step": 2762, + "time_per_iteration": 2.670517921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092394, + "balance_loss_mlp": 1.06101847, + "epoch": 0.5315505963832243, + "flos": 508152572928.0, + "grad_norm": 0.05825845223399112, + "language_loss": 0.82783639, + "learning_rate": 0.0004732201814862727, + "loss": 0.83876032, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.31347656, + "step": 2763, + "time_per_iteration": 2.7706046104431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108792, + "balance_loss_mlp": 1.05740237, + "epoch": 0.5317429780684879, + "flos": 626132740608.0, + "grad_norm": 0.056446972258987926, + "language_loss": 0.81703943, + "learning_rate": 0.0004729090915663373, + "loss": 0.82791865, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.3046875, + "step": 2764, + "time_per_iteration": 2.8320751190185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088458, + "balance_loss_mlp": 1.0584892, + "epoch": 0.5319353597537514, + "flos": 476506713600.0, + "grad_norm": 0.06421691072563727, + "language_loss": 0.85022444, + "learning_rate": 0.00047259801216402534, + "loss": 0.86110902, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.29931641, + "step": 2765, + "time_per_iteration": 2.5070557594299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087661, + "balance_loss_mlp": 1.05735779, + "epoch": 0.532127741439015, + "flos": 501386913792.0, + "grad_norm": 0.06743519703895742, + "language_loss": 0.86185229, + "learning_rate": 0.00047228694340010845, + "loss": 0.87272882, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.30249023, + "step": 2766, + "time_per_iteration": 2.5665066242218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089224, + "balance_loss_mlp": 1.05918312, + "epoch": 0.5323201231242786, + "flos": 1164114063360.0, + "grad_norm": 0.057283919540088275, + "language_loss": 0.85907435, + "learning_rate": 0.0004719758853953544, + "loss": 0.86996663, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.29980469, + "step": 2767, + "time_per_iteration": 3.598590850830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093331, + "balance_loss_mlp": 1.06419635, + "epoch": 0.5325125048095422, + "flos": 378493254144.0, + "grad_norm": 0.07956086058885692, + "language_loss": 0.83881301, + "learning_rate": 0.00047166483827052645, + "loss": 0.84974635, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.29125977, + "step": 2768, + "time_per_iteration": 2.4224319458007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105739, + "balance_loss_mlp": 1.04441977, + "epoch": 0.5327048864948057, + "flos": 1540507534848.0, + "grad_norm": 0.033276153146473426, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78135878, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.12988281, + "step": 2769, + "time_per_iteration": 4.992494583129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089679, + "balance_loss_mlp": 1.05961394, + "epoch": 0.5328972681800692, + "flos": 910877225472.0, + "grad_norm": 0.06372002073291465, + "language_loss": 0.8365072, + "learning_rate": 0.000471042777143682, + "loss": 0.84740394, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.30029297, + "step": 2770, + "time_per_iteration": 3.214010715484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091808, + "balance_loss_mlp": 1.06255412, + "epoch": 0.5330896498653328, + "flos": 473660766720.0, + "grad_norm": 0.05770492360265134, + "language_loss": 0.79306901, + "learning_rate": 0.0004707317633831707, + "loss": 0.80398703, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.29223633, + "step": 2771, + "time_per_iteration": 2.5814082622528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090013, + "balance_loss_mlp": 1.06035328, + "epoch": 0.5332820315505964, + "flos": 501386913792.0, + "grad_norm": 0.06429055642690477, + "language_loss": 0.78255731, + "learning_rate": 0.00047042076098559673, + "loss": 0.79345745, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.29614258, + "step": 2772, + "time_per_iteration": 2.626574754714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096839, + "balance_loss_mlp": 1.06763303, + "epoch": 0.53347441323586, + "flos": 924043368960.0, + "grad_norm": 0.06567346515998468, + "language_loss": 0.73814428, + "learning_rate": 0.00047010977007170174, + "loss": 0.74911261, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.29150391, + "step": 2773, + "time_per_iteration": 3.2639098167419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089963, + "balance_loss_mlp": 1.06039929, + "epoch": 0.5336667949211235, + "flos": 574185521664.0, + "grad_norm": 0.06353427502994992, + "language_loss": 0.82705283, + "learning_rate": 0.00046979879076222334, + "loss": 0.83795249, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.29516602, + "step": 2774, + "time_per_iteration": 2.702021360397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094292, + "balance_loss_mlp": 1.0655148, + "epoch": 0.533859176606387, + "flos": 1064233880064.0, + "grad_norm": 0.051161955256212054, + "language_loss": 0.84535086, + "learning_rate": 0.0004694878231778939, + "loss": 0.8562938, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.28759766, + "step": 2775, + "time_per_iteration": 3.37555193901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094093, + "balance_loss_mlp": 1.06471944, + "epoch": 0.5340515582916506, + "flos": 746277967872.0, + "grad_norm": 0.05222814179658164, + "language_loss": 0.8401432, + "learning_rate": 0.0004691768674394423, + "loss": 0.85108411, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.29321289, + "step": 2776, + "time_per_iteration": 2.992685317993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024339, + "balance_loss_mlp": 1.01251328, + "epoch": 0.5342439399769142, + "flos": 1444905036288.0, + "grad_norm": 0.010305238226800423, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85508353, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.11816406, + "step": 2777, + "time_per_iteration": 4.753941059112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021329, + "balance_loss_mlp": 1.00950325, + "epoch": 0.5344363216621778, + "flos": 1426790495232.0, + "grad_norm": 0.008050007723784799, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77674866, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.11816406, + "step": 2778, + "time_per_iteration": 4.980912923812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091368, + "balance_loss_mlp": 1.0625428, + "epoch": 0.5346287033474413, + "flos": 527355436032.0, + "grad_norm": 0.05741424367086941, + "language_loss": 0.79571807, + "learning_rate": 0.00046824407250656676, + "loss": 0.80663168, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.28808594, + "step": 2779, + "time_per_iteration": 2.641680955886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109255, + "balance_loss_mlp": 1.06303382, + "epoch": 0.5348210850327049, + "flos": 510515481600.0, + "grad_norm": 0.05780417685778494, + "language_loss": 0.83320916, + "learning_rate": 0.0004679331653588161, + "loss": 0.84413469, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.29467773, + "step": 2780, + "time_per_iteration": 2.6292784214019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086907, + "balance_loss_mlp": 1.05741477, + "epoch": 0.5350134667179685, + "flos": 462429748224.0, + "grad_norm": 0.07200473336731207, + "language_loss": 0.8539027, + "learning_rate": 0.0004676222706605147, + "loss": 0.86477172, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.29467773, + "step": 2781, + "time_per_iteration": 2.633302927017212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082924, + "balance_loss_mlp": 1.05355036, + "epoch": 0.535205848403232, + "flos": 708566275584.0, + "grad_norm": 0.06052388593462891, + "language_loss": 0.85071301, + "learning_rate": 0.0004673113885323626, + "loss": 0.86154234, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.29321289, + "step": 2782, + "time_per_iteration": 2.8385848999023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108118, + "balance_loss_mlp": 1.05152082, + "epoch": 0.5353982300884956, + "flos": 893855388672.0, + "grad_norm": 0.04759682065371887, + "language_loss": 0.78464407, + "learning_rate": 0.00046700051909505494, + "loss": 0.79545587, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.29638672, + "step": 2783, + "time_per_iteration": 3.17055344581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087683, + "balance_loss_mlp": 1.05730867, + "epoch": 0.5355906117737591, + "flos": 535701219840.0, + "grad_norm": 0.06917760310735488, + "language_loss": 0.83446693, + "learning_rate": 0.000466689662469282, + "loss": 0.84534377, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.3034668, + "step": 2784, + "time_per_iteration": 2.6696882247924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080736, + "balance_loss_mlp": 1.05048084, + "epoch": 0.5357829934590227, + "flos": 868477593600.0, + "grad_norm": 0.0647182284961505, + "language_loss": 0.84010589, + "learning_rate": 0.00046637881877572917, + "loss": 0.85091329, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.30200195, + "step": 2785, + "time_per_iteration": 3.0897059440612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107764, + "balance_loss_mlp": 1.04783738, + "epoch": 0.5359753751442863, + "flos": 552999481344.0, + "grad_norm": 0.2060352755327757, + "language_loss": 0.84354532, + "learning_rate": 0.0004660679881350764, + "loss": 0.85432178, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.29736328, + "step": 2786, + "time_per_iteration": 2.763195753097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036965, + "balance_loss_mlp": 1.0236131, + "epoch": 0.5361677568295499, + "flos": 1479687823872.0, + "grad_norm": 0.018061436986608354, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76645112, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.13378906, + "step": 2787, + "time_per_iteration": 5.074235677719116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082567, + "balance_loss_mlp": 1.05223989, + "epoch": 0.5363601385148133, + "flos": 805910432256.0, + "grad_norm": 0.0731464482403051, + "language_loss": 0.77922016, + "learning_rate": 0.0004654463664951667, + "loss": 0.79004586, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.30273438, + "step": 2788, + "time_per_iteration": 2.9973762035369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086105, + "balance_loss_mlp": 1.05647016, + "epoch": 0.5365525202000769, + "flos": 507630246912.0, + "grad_norm": 0.06405642217776768, + "language_loss": 0.83215284, + "learning_rate": 0.0004651355757372447, + "loss": 0.84301388, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.2956543, + "step": 2789, + "time_per_iteration": 2.677021026611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089856, + "balance_loss_mlp": 1.05955315, + "epoch": 0.5367449018853405, + "flos": 528660546048.0, + "grad_norm": 0.05726084062519834, + "language_loss": 0.85958302, + "learning_rate": 0.00046482479851489274, + "loss": 0.87048161, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.30273438, + "step": 2790, + "time_per_iteration": 2.6652121543884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089898, + "balance_loss_mlp": 1.05933237, + "epoch": 0.5369372835706041, + "flos": 649614698496.0, + "grad_norm": 0.07271669587233448, + "language_loss": 0.77731752, + "learning_rate": 0.00046451403494876525, + "loss": 0.78821647, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.30541992, + "step": 2791, + "time_per_iteration": 2.897798776626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090037, + "balance_loss_mlp": 1.05882847, + "epoch": 0.5371296652558677, + "flos": 584205972480.0, + "grad_norm": 0.06591879115648011, + "language_loss": 0.84175646, + "learning_rate": 0.0004642032851595111, + "loss": 0.8526569, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.31176758, + "step": 2792, + "time_per_iteration": 2.758230209350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086262, + "balance_loss_mlp": 1.05543458, + "epoch": 0.5373220469411312, + "flos": 595570821120.0, + "grad_norm": 0.05973481987913333, + "language_loss": 0.84753001, + "learning_rate": 0.00046389254926777404, + "loss": 0.8583926, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.30810547, + "step": 2793, + "time_per_iteration": 2.7933902740478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086495, + "balance_loss_mlp": 1.05562031, + "epoch": 0.5375144286263948, + "flos": 1113965167104.0, + "grad_norm": 0.05136203618868989, + "language_loss": 0.7824527, + "learning_rate": 0.0004635818273941926, + "loss": 0.79331762, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.30859375, + "step": 2794, + "time_per_iteration": 3.564011335372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088501, + "balance_loss_mlp": 1.05786383, + "epoch": 0.5377068103116583, + "flos": 595319127552.0, + "grad_norm": 0.06685314707582615, + "language_loss": 0.81738025, + "learning_rate": 0.0004632711196593997, + "loss": 0.82826525, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.30639648, + "step": 2795, + "time_per_iteration": 2.7609026432037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089037, + "balance_loss_mlp": 1.05882931, + "epoch": 0.5378991919969219, + "flos": 883839320064.0, + "grad_norm": 0.06695327911218095, + "language_loss": 0.85338485, + "learning_rate": 0.00046296042618402297, + "loss": 0.86427522, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.30175781, + "step": 2796, + "time_per_iteration": 3.079580783843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083991, + "balance_loss_mlp": 1.05344939, + "epoch": 0.5380915736821854, + "flos": 710344249344.0, + "grad_norm": 0.05461778050704968, + "language_loss": 0.79521048, + "learning_rate": 0.0004626497470886839, + "loss": 0.80605042, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.30517578, + "step": 2797, + "time_per_iteration": 2.956915855407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086126, + "balance_loss_mlp": 1.0549171, + "epoch": 0.538283955367449, + "flos": 556721344512.0, + "grad_norm": 0.05348634251654363, + "language_loss": 0.81572765, + "learning_rate": 0.00046233908249399897, + "loss": 0.82658887, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.31176758, + "step": 2798, + "time_per_iteration": 2.7978243827819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087806, + "balance_loss_mlp": 1.05781281, + "epoch": 0.5384763370527126, + "flos": 513218833920.0, + "grad_norm": 0.07296004689367808, + "language_loss": 0.78106725, + "learning_rate": 0.00046202843252057905, + "loss": 0.79194534, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.29956055, + "step": 2799, + "time_per_iteration": 2.615086317062378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085619, + "balance_loss_mlp": 1.05522037, + "epoch": 0.5386687187379762, + "flos": 489490974720.0, + "grad_norm": 0.056459019467486986, + "language_loss": 0.83738667, + "learning_rate": 0.00046171779728902896, + "loss": 0.84824288, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.3034668, + "step": 2800, + "time_per_iteration": 2.613084077835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081602, + "balance_loss_mlp": 1.05025029, + "epoch": 0.5388611004232398, + "flos": 482415395328.0, + "grad_norm": 0.07411133953793157, + "language_loss": 0.86239338, + "learning_rate": 0.000461407176919948, + "loss": 0.87320936, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.31323242, + "step": 2801, + "time_per_iteration": 2.5331709384918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078309, + "balance_loss_mlp": 1.04838777, + "epoch": 0.5390534821085032, + "flos": 560709457920.0, + "grad_norm": 0.07244428600451569, + "language_loss": 0.85469061, + "learning_rate": 0.00046109657153392997, + "loss": 0.86547375, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.29858398, + "step": 2802, + "time_per_iteration": 2.7376809120178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081766, + "balance_loss_mlp": 1.05007982, + "epoch": 0.5392458637937668, + "flos": 488132020224.0, + "grad_norm": 0.06487466420670769, + "language_loss": 0.82949483, + "learning_rate": 0.0004607859812515622, + "loss": 0.84031248, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.31665039, + "step": 2803, + "time_per_iteration": 2.601752996444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078317, + "balance_loss_mlp": 1.0476799, + "epoch": 0.5394382454790304, + "flos": 511810417152.0, + "grad_norm": 0.06325281802882306, + "language_loss": 0.87643886, + "learning_rate": 0.00046047540619342667, + "loss": 0.88722193, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.3059082, + "step": 2804, + "time_per_iteration": 2.6036136150360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080625, + "balance_loss_mlp": 1.05056071, + "epoch": 0.539630627164294, + "flos": 567312173568.0, + "grad_norm": 0.0581751577303043, + "language_loss": 0.80008459, + "learning_rate": 0.00046016484648009933, + "loss": 0.81089091, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.30004883, + "step": 2805, + "time_per_iteration": 2.713219165802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080402, + "balance_loss_mlp": 1.05105305, + "epoch": 0.5398230088495575, + "flos": 526203095040.0, + "grad_norm": 0.057792621829283776, + "language_loss": 0.80917501, + "learning_rate": 0.0004598543022321501, + "loss": 0.81997907, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.29296875, + "step": 2806, + "time_per_iteration": 2.631939172744751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082616, + "balance_loss_mlp": 1.05281353, + "epoch": 0.5400153905348211, + "flos": 538493322240.0, + "grad_norm": 0.07612886672081497, + "language_loss": 0.79604518, + "learning_rate": 0.0004595437735701433, + "loss": 0.80687129, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.29736328, + "step": 2807, + "time_per_iteration": 2.701808214187622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081434, + "balance_loss_mlp": 1.0507021, + "epoch": 0.5402077722200846, + "flos": 513259531776.0, + "grad_norm": 0.07694205416949251, + "language_loss": 0.83500147, + "learning_rate": 0.00045923326061463623, + "loss": 0.84581584, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.30688477, + "step": 2808, + "time_per_iteration": 2.7844398021698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078771, + "balance_loss_mlp": 1.04725254, + "epoch": 0.5404001539053482, + "flos": 675932428800.0, + "grad_norm": 0.07660553916433042, + "language_loss": 0.81710881, + "learning_rate": 0.00045892276348618113, + "loss": 0.82789654, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.31494141, + "step": 2809, + "time_per_iteration": 2.982339859008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053757, + "balance_loss_mlp": 1.04088223, + "epoch": 0.5405925355906118, + "flos": 1553998155264.0, + "grad_norm": 0.023591100709610114, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.7931459, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.12890625, + "step": 2810, + "time_per_iteration": 5.077887296676636 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086772, + "balance_loss_mlp": 1.05580163, + "epoch": 0.5407849172758753, + "flos": 647004478464.0, + "grad_norm": 0.07053414384060859, + "language_loss": 0.80792511, + "learning_rate": 0.000458301817192603, + "loss": 0.81879282, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.30957031, + "step": 2811, + "time_per_iteration": 2.8369667530059814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038738, + "balance_loss_mlp": 1.02586305, + "epoch": 0.5409772989611389, + "flos": 1406641904640.0, + "grad_norm": 0.019629272648215536, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81880522, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.12890625, + "step": 2812, + "time_per_iteration": 4.8166663646698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079133, + "balance_loss_mlp": 1.04790044, + "epoch": 0.5411696806464025, + "flos": 554102360064.0, + "grad_norm": 0.05474211885389724, + "language_loss": 0.86781704, + "learning_rate": 0.00045768093565369983, + "loss": 0.87860835, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.31201172, + "step": 2813, + "time_per_iteration": 2.7311370372772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081245, + "balance_loss_mlp": 1.05077481, + "epoch": 0.5413620623316661, + "flos": 527853030912.0, + "grad_norm": 0.05950457911446913, + "language_loss": 0.8158434, + "learning_rate": 0.0004573705194685646, + "loss": 0.82665586, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.30444336, + "step": 2814, + "time_per_iteration": 2.733198404312134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081332, + "balance_loss_mlp": 1.0498848, + "epoch": 0.5415544440169295, + "flos": 598464820224.0, + "grad_norm": 0.06917969261153488, + "language_loss": 0.84880143, + "learning_rate": 0.00045706011983366157, + "loss": 0.85961473, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.31420898, + "step": 2815, + "time_per_iteration": 2.6939895153045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077713, + "balance_loss_mlp": 1.04683733, + "epoch": 0.5417468257021931, + "flos": 470519456256.0, + "grad_norm": 0.08149095023345422, + "language_loss": 0.82716835, + "learning_rate": 0.00045674973686949847, + "loss": 0.83794552, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.30834961, + "step": 2816, + "time_per_iteration": 2.532838821411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076826, + "balance_loss_mlp": 1.045784, + "epoch": 0.5419392073874567, + "flos": 680477773824.0, + "grad_norm": 0.06493873134640445, + "language_loss": 0.85336345, + "learning_rate": 0.0004564393706965766, + "loss": 0.86413169, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.31005859, + "step": 2817, + "time_per_iteration": 3.013608455657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077252, + "balance_loss_mlp": 1.04578137, + "epoch": 0.5421315890727203, + "flos": 462134384640.0, + "grad_norm": 0.06666383117391396, + "language_loss": 0.81068963, + "learning_rate": 0.00045612902143539116, + "loss": 0.82146215, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.31469727, + "step": 2818, + "time_per_iteration": 2.605372428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070647, + "balance_loss_mlp": 1.03998637, + "epoch": 0.5423239707579839, + "flos": 436727476224.0, + "grad_norm": 0.07813750406706815, + "language_loss": 0.81324685, + "learning_rate": 0.00045581868920642986, + "loss": 0.82395327, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.30615234, + "step": 2819, + "time_per_iteration": 2.4960100650787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077852, + "balance_loss_mlp": 1.04709649, + "epoch": 0.5425163524432474, + "flos": 458067695616.0, + "grad_norm": 0.07920473504276467, + "language_loss": 0.79243749, + "learning_rate": 0.00045550837413017457, + "loss": 0.80321598, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.30712891, + "step": 2820, + "time_per_iteration": 2.684987783432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072493, + "balance_loss_mlp": 1.04188037, + "epoch": 0.542708734128511, + "flos": 419267681280.0, + "grad_norm": 0.056801171387635116, + "language_loss": 0.85060829, + "learning_rate": 0.0004551980763271005, + "loss": 0.86133325, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.30566406, + "step": 2821, + "time_per_iteration": 2.6912834644317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075835, + "balance_loss_mlp": 1.04529333, + "epoch": 0.5429011158137745, + "flos": 678142568448.0, + "grad_norm": 0.05882616642734503, + "language_loss": 0.83789319, + "learning_rate": 0.0004548877959176756, + "loss": 0.84865159, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.30493164, + "step": 2822, + "time_per_iteration": 2.8441174030303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080776, + "balance_loss_mlp": 1.04985332, + "epoch": 0.5430934974990381, + "flos": 540664174080.0, + "grad_norm": 0.06945933761570218, + "language_loss": 0.86118329, + "learning_rate": 0.00045457753302236166, + "loss": 0.8719911, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.30908203, + "step": 2823, + "time_per_iteration": 2.6186442375183105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107393, + "balance_loss_mlp": 1.04312599, + "epoch": 0.5432858791843016, + "flos": 658175860224.0, + "grad_norm": 0.07165023342281863, + "language_loss": 0.87164384, + "learning_rate": 0.00045426728776161353, + "loss": 0.88238311, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.30761719, + "step": 2824, + "time_per_iteration": 2.7953178882598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081111, + "balance_loss_mlp": 1.05092704, + "epoch": 0.5434782608695652, + "flos": 531678200832.0, + "grad_norm": 0.05974352124313591, + "language_loss": 0.81803101, + "learning_rate": 0.00045395706025587863, + "loss": 0.8288421, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.30151367, + "step": 2825, + "time_per_iteration": 2.612980604171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076561, + "balance_loss_mlp": 1.04599547, + "epoch": 0.5436706425548288, + "flos": 608219020800.0, + "grad_norm": 0.07443979134593931, + "language_loss": 0.8264693, + "learning_rate": 0.00045364685062559843, + "loss": 0.83723497, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.30541992, + "step": 2826, + "time_per_iteration": 2.828479051589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076945, + "balance_loss_mlp": 1.04630804, + "epoch": 0.5438630242400924, + "flos": 705081549312.0, + "grad_norm": 0.061142502150282975, + "language_loss": 0.91168308, + "learning_rate": 0.0004533366589912067, + "loss": 0.92245257, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.30615234, + "step": 2827, + "time_per_iteration": 2.970296621322632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075368, + "balance_loss_mlp": 1.04599524, + "epoch": 0.544055405925356, + "flos": 856073885184.0, + "grad_norm": 0.07414497131093437, + "language_loss": 0.77502602, + "learning_rate": 0.0004530264854731306, + "loss": 0.78577971, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.29370117, + "step": 2828, + "time_per_iteration": 3.022944450378418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085089, + "balance_loss_mlp": 1.05521488, + "epoch": 0.5442477876106194, + "flos": 571483579392.0, + "grad_norm": 0.048879345895653556, + "language_loss": 0.84054667, + "learning_rate": 0.00045271633019179034, + "loss": 0.85139751, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.29833984, + "step": 2829, + "time_per_iteration": 2.7760679721832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086373, + "balance_loss_mlp": 1.05707121, + "epoch": 0.544440169295883, + "flos": 625246649856.0, + "grad_norm": 0.06402410848819869, + "language_loss": 0.87688053, + "learning_rate": 0.0004524061932675986, + "loss": 0.88774425, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.29248047, + "step": 2830, + "time_per_iteration": 2.830350637435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086958, + "balance_loss_mlp": 1.05691731, + "epoch": 0.5446325509811466, + "flos": 835896181248.0, + "grad_norm": 0.06453180665575306, + "language_loss": 0.86766136, + "learning_rate": 0.00045209607482096125, + "loss": 0.87853098, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.30029297, + "step": 2831, + "time_per_iteration": 3.0085608959198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082113, + "balance_loss_mlp": 1.05192947, + "epoch": 0.5448249326664102, + "flos": 483129778176.0, + "grad_norm": 0.06460698711812493, + "language_loss": 0.84066617, + "learning_rate": 0.0004517859749722772, + "loss": 0.85148734, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.30126953, + "step": 2832, + "time_per_iteration": 2.6471612453460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078289, + "balance_loss_mlp": 1.04803348, + "epoch": 0.5450173143516738, + "flos": 560799618048.0, + "grad_norm": 0.09569427913676506, + "language_loss": 0.78785688, + "learning_rate": 0.0004514758938419376, + "loss": 0.79863977, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.30200195, + "step": 2833, + "time_per_iteration": 2.8068594932556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038153, + "balance_loss_mlp": 1.02627981, + "epoch": 0.5452096960369373, + "flos": 1469632467456.0, + "grad_norm": 0.016706116470577157, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77958739, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.11865234, + "step": 2834, + "time_per_iteration": 4.907236814498901 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079515, + "balance_loss_mlp": 1.04871142, + "epoch": 0.5454020777222008, + "flos": 464827562496.0, + "grad_norm": 0.06561437539450005, + "language_loss": 0.83799005, + "learning_rate": 0.00045085578821782175, + "loss": 0.84878516, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.30761719, + "step": 2835, + "time_per_iteration": 2.538837194442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032792, + "balance_loss_mlp": 1.02082336, + "epoch": 0.5455944594074644, + "flos": 1468921056768.0, + "grad_norm": 0.016611239115941395, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77167535, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.11962891, + "step": 2836, + "time_per_iteration": 4.947264671325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107855, + "balance_loss_mlp": 1.04765117, + "epoch": 0.545786841092728, + "flos": 532900353024.0, + "grad_norm": 0.05618000101860937, + "language_loss": 0.8099249, + "learning_rate": 0.00045023575891159866, + "loss": 0.82071036, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.30859375, + "step": 2837, + "time_per_iteration": 2.7390823364257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025548, + "balance_loss_mlp": 1.01348448, + "epoch": 0.5459792227779915, + "flos": 1351633360896.0, + "grad_norm": 0.010465474292049673, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75789356, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.12060547, + "step": 2838, + "time_per_iteration": 4.913767576217651 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080178, + "balance_loss_mlp": 1.05025697, + "epoch": 0.5461716044632551, + "flos": 637584929280.0, + "grad_norm": 0.053509390521789255, + "language_loss": 0.78084177, + "learning_rate": 0.0004496158068861354, + "loss": 0.7916435, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.29882812, + "step": 2839, + "time_per_iteration": 2.816080331802368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085641, + "balance_loss_mlp": 1.05548143, + "epoch": 0.5463639861485187, + "flos": 602458725888.0, + "grad_norm": 0.05135655646470402, + "language_loss": 0.80302298, + "learning_rate": 0.00044930586015455207, + "loss": 0.81387937, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.30102539, + "step": 2840, + "time_per_iteration": 2.79626727104187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087336, + "balance_loss_mlp": 1.05717611, + "epoch": 0.5465563678337823, + "flos": 642208849920.0, + "grad_norm": 0.05566707414242676, + "language_loss": 0.89057064, + "learning_rate": 0.000448995933104179, + "loss": 0.90144402, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.30102539, + "step": 2841, + "time_per_iteration": 2.8602969646453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080566, + "balance_loss_mlp": 1.0502634, + "epoch": 0.5467487495190458, + "flos": 613852687872.0, + "grad_norm": 0.07080900039808569, + "language_loss": 0.80240697, + "learning_rate": 0.00044868602585534077, + "loss": 0.81321263, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.30297852, + "step": 2842, + "time_per_iteration": 2.9035747051239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078755, + "balance_loss_mlp": 1.04778409, + "epoch": 0.5469411312043093, + "flos": 460957312512.0, + "grad_norm": 0.061738359719804514, + "language_loss": 0.88582397, + "learning_rate": 0.0004483761385283541, + "loss": 0.89661151, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.30932617, + "step": 2843, + "time_per_iteration": 2.5193030834198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074267, + "balance_loss_mlp": 1.04448807, + "epoch": 0.5471335128895729, + "flos": 560930628096.0, + "grad_norm": 0.05447472334615201, + "language_loss": 0.81464523, + "learning_rate": 0.0004480662712435281, + "loss": 0.8253879, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.29736328, + "step": 2844, + "time_per_iteration": 2.731069326400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107206, + "balance_loss_mlp": 1.04254341, + "epoch": 0.5473258945748365, + "flos": 518437863936.0, + "grad_norm": 0.060615817798691185, + "language_loss": 0.8824929, + "learning_rate": 0.0004477564241211635, + "loss": 0.89321351, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.29467773, + "step": 2845, + "time_per_iteration": 2.5875682830810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079224, + "balance_loss_mlp": 1.04880142, + "epoch": 0.5475182762601001, + "flos": 433600722432.0, + "grad_norm": 0.0822753996114188, + "language_loss": 0.86914051, + "learning_rate": 0.0004474465972815541, + "loss": 0.87993276, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.30371094, + "step": 2846, + "time_per_iteration": 2.4777207374572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074275, + "balance_loss_mlp": 1.04406786, + "epoch": 0.5477106579453636, + "flos": 511308440064.0, + "grad_norm": 0.05432348028770475, + "language_loss": 0.87747157, + "learning_rate": 0.000447136790844985, + "loss": 0.88821435, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.30151367, + "step": 2847, + "time_per_iteration": 2.6856186389923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076945, + "balance_loss_mlp": 1.04623675, + "epoch": 0.5479030396306271, + "flos": 675606541824.0, + "grad_norm": 0.055626256163384374, + "language_loss": 0.81023288, + "learning_rate": 0.00044682700493173385, + "loss": 0.8210023, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.30664062, + "step": 2848, + "time_per_iteration": 2.8167617321014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082333, + "balance_loss_mlp": 1.05229259, + "epoch": 0.5480954213158907, + "flos": 875720498688.0, + "grad_norm": 0.06111415202222153, + "language_loss": 0.80075896, + "learning_rate": 0.00044651723966207004, + "loss": 0.81158233, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.29980469, + "step": 2849, + "time_per_iteration": 3.0959999561309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084207, + "balance_loss_mlp": 1.05435705, + "epoch": 0.5482878030011543, + "flos": 621715433472.0, + "grad_norm": 0.05903862339795778, + "language_loss": 0.78441715, + "learning_rate": 0.00044620749515625536, + "loss": 0.79525924, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.2980957, + "step": 2850, + "time_per_iteration": 2.7892706394195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108025, + "balance_loss_mlp": 1.05001831, + "epoch": 0.5484801846864179, + "flos": 496946285568.0, + "grad_norm": 0.0673362889441577, + "language_loss": 0.84918725, + "learning_rate": 0.00044589777153454334, + "loss": 0.85998976, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.30175781, + "step": 2851, + "time_per_iteration": 2.771003007888794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083219, + "balance_loss_mlp": 1.05241561, + "epoch": 0.5486725663716814, + "flos": 442202582016.0, + "grad_norm": 0.05413608872240749, + "language_loss": 0.83428276, + "learning_rate": 0.00044558806891717895, + "loss": 0.84511489, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.30761719, + "step": 2852, + "time_per_iteration": 2.499460220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088115, + "balance_loss_mlp": 1.0584085, + "epoch": 0.548864948056945, + "flos": 654867224064.0, + "grad_norm": 0.06786065051926819, + "language_loss": 0.79808474, + "learning_rate": 0.0004452783874243998, + "loss": 0.80896592, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.29663086, + "step": 2853, + "time_per_iteration": 2.8307228088378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084659, + "balance_loss_mlp": 1.05497599, + "epoch": 0.5490573297422086, + "flos": 545760958464.0, + "grad_norm": 0.06292410009946192, + "language_loss": 0.84795368, + "learning_rate": 0.00044496872717643475, + "loss": 0.85880023, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.29638672, + "step": 2854, + "time_per_iteration": 2.6626110076904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051353, + "balance_loss_mlp": 1.03819215, + "epoch": 0.5492497114274721, + "flos": 1589450245632.0, + "grad_norm": 0.03322747605543158, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.78140646, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.13183594, + "step": 2855, + "time_per_iteration": 4.957303285598755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083951, + "balance_loss_mlp": 1.05448246, + "epoch": 0.5494420931127356, + "flos": 750567237120.0, + "grad_norm": 0.04982994122271322, + "language_loss": 0.81768692, + "learning_rate": 0.0004443494708958217, + "loss": 0.82852638, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.29443359, + "step": 2856, + "time_per_iteration": 3.005343437194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088352, + "balance_loss_mlp": 1.0585736, + "epoch": 0.5496344747979992, + "flos": 625704956928.0, + "grad_norm": 0.04689474861444355, + "language_loss": 0.80522525, + "learning_rate": 0.0004440398751035906, + "loss": 0.8161087, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.29736328, + "step": 2857, + "time_per_iteration": 2.868595838546753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095367, + "balance_loss_mlp": 1.06659007, + "epoch": 0.5498268564832628, + "flos": 522859553280.0, + "grad_norm": 0.07030492887566664, + "language_loss": 0.83409548, + "learning_rate": 0.00044373030103700645, + "loss": 0.8450492, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.28759766, + "step": 2858, + "time_per_iteration": 2.5910122394561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094102, + "balance_loss_mlp": 1.06508696, + "epoch": 0.5500192381685264, + "flos": 604290544128.0, + "grad_norm": 0.06946154028242445, + "language_loss": 0.79413795, + "learning_rate": 0.000443420748816257, + "loss": 0.80507904, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.28979492, + "step": 2859, + "time_per_iteration": 2.825594663619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096344, + "balance_loss_mlp": 1.06706619, + "epoch": 0.55021161985379, + "flos": 520246361088.0, + "grad_norm": 0.06600867884275338, + "language_loss": 0.78576386, + "learning_rate": 0.0004431112185615208, + "loss": 0.79672724, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.29248047, + "step": 2860, + "time_per_iteration": 2.786670446395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090723, + "balance_loss_mlp": 1.06154037, + "epoch": 0.5504040015390534, + "flos": 489426955776.0, + "grad_norm": 0.06889565209263777, + "language_loss": 0.79788846, + "learning_rate": 0.00044280171039296845, + "loss": 0.80879569, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.29174805, + "step": 2861, + "time_per_iteration": 2.634674072265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090878, + "balance_loss_mlp": 1.0620054, + "epoch": 0.550596383224317, + "flos": 575519745024.0, + "grad_norm": 0.05438680375258401, + "language_loss": 0.88480103, + "learning_rate": 0.0004424922244307616, + "loss": 0.89570987, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.28857422, + "step": 2862, + "time_per_iteration": 2.6849331855773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093044, + "balance_loss_mlp": 1.06328964, + "epoch": 0.5507887649095806, + "flos": 642149213184.0, + "grad_norm": 0.06984640427248112, + "language_loss": 0.81865609, + "learning_rate": 0.00044218276079505315, + "loss": 0.82958651, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.29711914, + "step": 2863, + "time_per_iteration": 2.9186837673187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092581, + "balance_loss_mlp": 1.06289792, + "epoch": 0.5509811465948442, + "flos": 531589450752.0, + "grad_norm": 0.06524866768544495, + "language_loss": 0.74926496, + "learning_rate": 0.0004418733196059876, + "loss": 0.76019078, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.29663086, + "step": 2864, + "time_per_iteration": 2.74560546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084987, + "balance_loss_mlp": 1.05635333, + "epoch": 0.5511735282801077, + "flos": 654439440384.0, + "grad_norm": 0.056184402553186, + "language_loss": 0.79785758, + "learning_rate": 0.0004415639009837008, + "loss": 0.80870748, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.28637695, + "step": 2865, + "time_per_iteration": 2.81969952583313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087597, + "balance_loss_mlp": 1.05908251, + "epoch": 0.5513659099653713, + "flos": 529222159872.0, + "grad_norm": 0.061494004909324176, + "language_loss": 0.81620675, + "learning_rate": 0.00044125450504831955, + "loss": 0.82708275, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.28540039, + "step": 2866, + "time_per_iteration": 2.739954948425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085385, + "balance_loss_mlp": 1.05586863, + "epoch": 0.5515582916506349, + "flos": 554594162688.0, + "grad_norm": 0.07127737838687996, + "language_loss": 0.81880403, + "learning_rate": 0.0004409451319199622, + "loss": 0.82965791, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.29467773, + "step": 2867, + "time_per_iteration": 2.6776282787323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077009, + "balance_loss_mlp": 1.0484705, + "epoch": 0.5517506733358984, + "flos": 735067298304.0, + "grad_norm": 0.06535442843844029, + "language_loss": 0.84516299, + "learning_rate": 0.0004406357817187381, + "loss": 0.85593313, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.28540039, + "step": 2868, + "time_per_iteration": 3.002542495727539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081101, + "balance_loss_mlp": 1.05170417, + "epoch": 0.551943055021162, + "flos": 1114861432320.0, + "grad_norm": 0.05667738365358171, + "language_loss": 0.81411439, + "learning_rate": 0.0004403264545647474, + "loss": 0.82492542, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.29370117, + "step": 2869, + "time_per_iteration": 3.523195505142212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080839, + "balance_loss_mlp": 1.05196702, + "epoch": 0.5521354367064255, + "flos": 544092083712.0, + "grad_norm": 0.062383704003679354, + "language_loss": 0.8429901, + "learning_rate": 0.00044001715057808154, + "loss": 0.85379851, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.28808594, + "step": 2870, + "time_per_iteration": 2.759244680404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085176, + "balance_loss_mlp": 1.05496836, + "epoch": 0.5523278183916891, + "flos": 935889845760.0, + "grad_norm": 0.05408626919612749, + "language_loss": 0.81631571, + "learning_rate": 0.0004397078698788232, + "loss": 0.82716751, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.30175781, + "step": 2871, + "time_per_iteration": 3.2238638401031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_mlp": 1.0167197, + "epoch": 0.5525202000769527, + "flos": 1465117645824.0, + "grad_norm": 0.017765030651381717, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81471765, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.12695312, + "step": 2872, + "time_per_iteration": 4.941680431365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084518, + "balance_loss_mlp": 1.05442953, + "epoch": 0.5527125817622163, + "flos": 489554993664.0, + "grad_norm": 0.06021715836391359, + "language_loss": 0.77858603, + "learning_rate": 0.00043908937882281343, + "loss": 0.78943121, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.30029297, + "step": 2873, + "time_per_iteration": 2.6475777626037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079069, + "balance_loss_mlp": 1.04845667, + "epoch": 0.5529049634474797, + "flos": 634606562304.0, + "grad_norm": 0.05779342240658392, + "language_loss": 0.82503784, + "learning_rate": 0.0004387801687061814, + "loss": 0.83582854, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.30566406, + "step": 2874, + "time_per_iteration": 2.8554017543792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078914, + "balance_loss_mlp": 1.04963589, + "epoch": 0.5530973451327433, + "flos": 580986086400.0, + "grad_norm": 0.0636526113513214, + "language_loss": 0.80157411, + "learning_rate": 0.0004384709823571958, + "loss": 0.81236321, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.29223633, + "step": 2875, + "time_per_iteration": 2.749535322189331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076752, + "balance_loss_mlp": 1.04764128, + "epoch": 0.5532897268180069, + "flos": 1122030144000.0, + "grad_norm": 0.06015536663517987, + "language_loss": 0.82898968, + "learning_rate": 0.0004381618198958932, + "loss": 0.8397572, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.29052734, + "step": 2876, + "time_per_iteration": 3.518888235092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078853, + "balance_loss_mlp": 1.0494318, + "epoch": 0.5534821085032705, + "flos": 636965088768.0, + "grad_norm": 0.05611364502947972, + "language_loss": 0.83295852, + "learning_rate": 0.00043785268144230137, + "loss": 0.84374702, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.29418945, + "step": 2877, + "time_per_iteration": 2.8977479934692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078991, + "balance_loss_mlp": 1.04916453, + "epoch": 0.5536744901885341, + "flos": 570837597696.0, + "grad_norm": 0.07334940017367843, + "language_loss": 0.82020825, + "learning_rate": 0.00043754356711643837, + "loss": 0.83099812, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.29785156, + "step": 2878, + "time_per_iteration": 2.6804401874542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080304, + "balance_loss_mlp": 1.04964316, + "epoch": 0.5538668718737976, + "flos": 595418052096.0, + "grad_norm": 0.0625181232423103, + "language_loss": 0.84172422, + "learning_rate": 0.0004372344770383132, + "loss": 0.85252726, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.30615234, + "step": 2879, + "time_per_iteration": 2.80837345123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077442, + "balance_loss_mlp": 1.04766345, + "epoch": 0.5540592535590612, + "flos": 532324182528.0, + "grad_norm": 0.05711228581787917, + "language_loss": 0.82837629, + "learning_rate": 0.00043692541132792507, + "loss": 0.83915067, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.29736328, + "step": 2880, + "time_per_iteration": 2.7545833587646484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076783, + "balance_loss_mlp": 1.04738569, + "epoch": 0.5542516352443247, + "flos": 412398715392.0, + "grad_norm": 0.06446598855551679, + "language_loss": 0.83125883, + "learning_rate": 0.00043661637010526384, + "loss": 0.84202665, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.29370117, + "step": 2881, + "time_per_iteration": 2.4907724857330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072171, + "balance_loss_mlp": 1.04139102, + "epoch": 0.5544440169295883, + "flos": 547341083136.0, + "grad_norm": 0.05841414515956175, + "language_loss": 0.82957321, + "learning_rate": 0.00043630735349031025, + "loss": 0.8402949, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.30737305, + "step": 2882, + "time_per_iteration": 2.6922152042388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071624, + "balance_loss_mlp": 1.04101133, + "epoch": 0.5546363986148518, + "flos": 621518994432.0, + "grad_norm": 0.05422763519754927, + "language_loss": 0.81816816, + "learning_rate": 0.00043599836160303495, + "loss": 0.82888442, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.30566406, + "step": 2883, + "time_per_iteration": 2.861325979232788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069587, + "balance_loss_mlp": 1.03914118, + "epoch": 0.5548287803001154, + "flos": 704972450304.0, + "grad_norm": 0.05987077775612136, + "language_loss": 0.77311337, + "learning_rate": 0.0004356893945633995, + "loss": 0.78380919, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.30395508, + "step": 2884, + "time_per_iteration": 2.964421510696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070587, + "balance_loss_mlp": 1.03930664, + "epoch": 0.555021161985379, + "flos": 503952053760.0, + "grad_norm": 0.16390384373312603, + "language_loss": 0.81600153, + "learning_rate": 0.0004353804524913551, + "loss": 0.82670736, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.3125, + "step": 2885, + "time_per_iteration": 2.6043736934661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068449, + "balance_loss_mlp": 1.03721642, + "epoch": 0.5552135436706426, + "flos": 615782020608.0, + "grad_norm": 0.06199045057720987, + "language_loss": 0.81625175, + "learning_rate": 0.0004350715355068441, + "loss": 0.82693619, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.31225586, + "step": 2886, + "time_per_iteration": 2.7229857444763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072103, + "balance_loss_mlp": 1.04051256, + "epoch": 0.5554059253559062, + "flos": 463635933696.0, + "grad_norm": 0.06868325666686464, + "language_loss": 0.79814357, + "learning_rate": 0.00043476264372979847, + "loss": 0.80886459, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.31567383, + "step": 2887, + "time_per_iteration": 2.5191705226898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071885, + "balance_loss_mlp": 1.0417012, + "epoch": 0.5555983070411696, + "flos": 1561923813888.0, + "grad_norm": 0.07224884026335429, + "language_loss": 0.78504527, + "learning_rate": 0.0004344537772801408, + "loss": 0.79576409, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.30151367, + "step": 2888, + "time_per_iteration": 3.803917646408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032514, + "balance_loss_mlp": 1.02040219, + "epoch": 0.5557906887264332, + "flos": 1467093468672.0, + "grad_norm": 0.021049912274883148, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74454963, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.12109375, + "step": 2889, + "time_per_iteration": 4.967891216278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077464, + "balance_loss_mlp": 1.04613566, + "epoch": 0.5559830704116968, + "flos": 529575750144.0, + "grad_norm": 0.06601593716549485, + "language_loss": 0.83441556, + "learning_rate": 0.0004338361208426298, + "loss": 0.84519023, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.31298828, + "step": 2890, + "time_per_iteration": 2.6076786518096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077476, + "balance_loss_mlp": 1.0466727, + "epoch": 0.5561754520969604, + "flos": 650895077376.0, + "grad_norm": 0.05044338716051736, + "language_loss": 0.81248903, + "learning_rate": 0.00043352733109457164, + "loss": 0.82326382, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.30761719, + "step": 2891, + "time_per_iteration": 2.893113136291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081411, + "balance_loss_mlp": 1.05148911, + "epoch": 0.556367833782224, + "flos": 733968801792.0, + "grad_norm": 0.05185548617134015, + "language_loss": 0.84650671, + "learning_rate": 0.00043321856715349244, + "loss": 0.8573209, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.29907227, + "step": 2892, + "time_per_iteration": 2.9470455646514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080498, + "balance_loss_mlp": 1.05024242, + "epoch": 0.5565602154674875, + "flos": 672120405504.0, + "grad_norm": 0.060968656189677554, + "language_loss": 0.80153251, + "learning_rate": 0.00043290982913926466, + "loss": 0.81233752, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.30249023, + "step": 2893, + "time_per_iteration": 2.801114559173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082896, + "balance_loss_mlp": 1.05283189, + "epoch": 0.556752597152751, + "flos": 585911162880.0, + "grad_norm": 0.06077441603872835, + "language_loss": 0.83792776, + "learning_rate": 0.0004326011171717514, + "loss": 0.84875673, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.30004883, + "step": 2894, + "time_per_iteration": 2.889112710952759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077209, + "balance_loss_mlp": 1.04762125, + "epoch": 0.5569449788380146, + "flos": 437549548032.0, + "grad_norm": 0.06532751979042353, + "language_loss": 0.81112337, + "learning_rate": 0.0004322924313708051, + "loss": 0.82189548, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.29614258, + "step": 2895, + "time_per_iteration": 2.5237138271331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077612, + "balance_loss_mlp": 1.04895401, + "epoch": 0.5571373605232782, + "flos": 502002372096.0, + "grad_norm": 0.06395509577189365, + "language_loss": 0.84357458, + "learning_rate": 0.0004319837718562681, + "loss": 0.85435069, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.28686523, + "step": 2896, + "time_per_iteration": 2.6235451698303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081945, + "balance_loss_mlp": 1.05123627, + "epoch": 0.5573297422085417, + "flos": 577126010880.0, + "grad_norm": 0.07087835610959153, + "language_loss": 0.82998407, + "learning_rate": 0.0004316751387479726, + "loss": 0.8408035, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.30664062, + "step": 2897, + "time_per_iteration": 2.7460193634033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081079, + "balance_loss_mlp": 1.05115747, + "epoch": 0.5575221238938053, + "flos": 1343536754688.0, + "grad_norm": 0.06734561564060734, + "language_loss": 0.82601708, + "learning_rate": 0.0004313665321657409, + "loss": 0.83682787, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.29882812, + "step": 2898, + "time_per_iteration": 3.700585126876831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083979, + "balance_loss_mlp": 1.05393827, + "epoch": 0.5577145055790689, + "flos": 601680324096.0, + "grad_norm": 0.06408348461050545, + "language_loss": 0.79922706, + "learning_rate": 0.00043105795222938436, + "loss": 0.81006682, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.30004883, + "step": 2899, + "time_per_iteration": 2.785468816757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077879, + "balance_loss_mlp": 1.04776657, + "epoch": 0.5579068872643325, + "flos": 562353601536.0, + "grad_norm": 0.056878366734987945, + "language_loss": 0.78559703, + "learning_rate": 0.00043074939905870467, + "loss": 0.79637581, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.30078125, + "step": 2900, + "time_per_iteration": 2.6782429218292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081281, + "balance_loss_mlp": 1.05157411, + "epoch": 0.558099268949596, + "flos": 544292904960.0, + "grad_norm": 0.061480860141572814, + "language_loss": 0.806315, + "learning_rate": 0.0004304408727734927, + "loss": 0.81712782, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.296875, + "step": 2901, + "time_per_iteration": 2.6361851692199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089927, + "balance_loss_mlp": 1.05955291, + "epoch": 0.5582916506348595, + "flos": 552520825344.0, + "grad_norm": 0.045249909626423154, + "language_loss": 0.88812852, + "learning_rate": 0.0004301323734935288, + "loss": 0.89902782, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.3034668, + "step": 2902, + "time_per_iteration": 2.650801181793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085072, + "balance_loss_mlp": 1.05541265, + "epoch": 0.5584840323201231, + "flos": 543126007296.0, + "grad_norm": 0.061039385793722846, + "language_loss": 0.87144208, + "learning_rate": 0.000429823901338583, + "loss": 0.88229275, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.29638672, + "step": 2903, + "time_per_iteration": 2.603729486465454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108106, + "balance_loss_mlp": 1.05128181, + "epoch": 0.5586764140053867, + "flos": 815212118016.0, + "grad_norm": 0.060582508535745275, + "language_loss": 0.86712891, + "learning_rate": 0.00042951545642841513, + "loss": 0.87793946, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.29711914, + "step": 2904, + "time_per_iteration": 3.0844316482543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084583, + "balance_loss_mlp": 1.05437517, + "epoch": 0.5588687956906503, + "flos": 486196895232.0, + "grad_norm": 0.055991570648287706, + "language_loss": 0.86597067, + "learning_rate": 0.0004292070388827737, + "loss": 0.87681645, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.30175781, + "step": 2905, + "time_per_iteration": 2.561948537826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082655, + "balance_loss_mlp": 1.0526619, + "epoch": 0.5590611773759138, + "flos": 451809805824.0, + "grad_norm": 0.06056202554709599, + "language_loss": 0.80913132, + "learning_rate": 0.00042889864882139753, + "loss": 0.81995785, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.29956055, + "step": 2906, + "time_per_iteration": 2.584385871887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088672, + "balance_loss_mlp": 1.05913234, + "epoch": 0.5592535590611774, + "flos": 520693083648.0, + "grad_norm": 0.05654682862292604, + "language_loss": 0.81697655, + "learning_rate": 0.0004285902863640139, + "loss": 0.82786322, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.29516602, + "step": 2907, + "time_per_iteration": 2.598034620285034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084011, + "balance_loss_mlp": 1.05342221, + "epoch": 0.5594459407464409, + "flos": 552250192896.0, + "grad_norm": 0.05788374674587666, + "language_loss": 0.85753977, + "learning_rate": 0.00042828195163033966, + "loss": 0.86837995, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.30566406, + "step": 2908, + "time_per_iteration": 2.654411792755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081373, + "balance_loss_mlp": 1.05099869, + "epoch": 0.5596383224317045, + "flos": 484596421632.0, + "grad_norm": 0.05647224332708591, + "language_loss": 0.79214805, + "learning_rate": 0.0004279736447400812, + "loss": 0.80296183, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.30322266, + "step": 2909, + "time_per_iteration": 2.6054940223693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108459, + "balance_loss_mlp": 1.05421579, + "epoch": 0.5598307041169681, + "flos": 610976217600.0, + "grad_norm": 0.05245180641385236, + "language_loss": 0.78436708, + "learning_rate": 0.00042766536581293385, + "loss": 0.79521292, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.3034668, + "step": 2910, + "time_per_iteration": 2.735391139984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086034, + "balance_loss_mlp": 1.0553261, + "epoch": 0.5600230858022316, + "flos": 488585945088.0, + "grad_norm": 0.07209314448313818, + "language_loss": 0.79203892, + "learning_rate": 0.0004273571149685819, + "loss": 0.80289924, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.30664062, + "step": 2911, + "time_per_iteration": 2.7689387798309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081503, + "balance_loss_mlp": 1.05234432, + "epoch": 0.5602154674874952, + "flos": 598592858112.0, + "grad_norm": 0.05523073387542819, + "language_loss": 0.8391124, + "learning_rate": 0.00042704889232669937, + "loss": 0.84992743, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.29125977, + "step": 2912, + "time_per_iteration": 2.7328362464904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082045, + "balance_loss_mlp": 1.05288625, + "epoch": 0.5604078491727588, + "flos": 585697347072.0, + "grad_norm": 0.0608748772154565, + "language_loss": 0.85180819, + "learning_rate": 0.0004267406980069484, + "loss": 0.8626287, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.29150391, + "step": 2913, + "time_per_iteration": 2.6889522075653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083223, + "balance_loss_mlp": 1.05416012, + "epoch": 0.5606002308580224, + "flos": 540926042112.0, + "grad_norm": 0.0517518520900543, + "language_loss": 0.79621083, + "learning_rate": 0.0004264325321289808, + "loss": 0.80704308, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.2902832, + "step": 2914, + "time_per_iteration": 2.7854018211364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080994, + "balance_loss_mlp": 1.05145359, + "epoch": 0.5607926125432858, + "flos": 583654533120.0, + "grad_norm": 0.05874282962966631, + "language_loss": 0.86178029, + "learning_rate": 0.00042612439481243736, + "loss": 0.87259024, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.29516602, + "step": 2915, + "time_per_iteration": 2.7484261989593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082353, + "balance_loss_mlp": 1.05264628, + "epoch": 0.5609849942285494, + "flos": 627205095936.0, + "grad_norm": 0.06045457404054478, + "language_loss": 0.89827836, + "learning_rate": 0.00042581628617694735, + "loss": 0.90910184, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.29663086, + "step": 2916, + "time_per_iteration": 2.7450428009033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108385, + "balance_loss_mlp": 1.05376196, + "epoch": 0.561177375913813, + "flos": 588095161344.0, + "grad_norm": 0.06174360046329572, + "language_loss": 0.81716877, + "learning_rate": 0.0004255082063421296, + "loss": 0.82800722, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.30078125, + "step": 2917, + "time_per_iteration": 2.681556463241577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080705, + "balance_loss_mlp": 1.0505209, + "epoch": 0.5613697575990766, + "flos": 526774883328.0, + "grad_norm": 0.07215647610626674, + "language_loss": 0.85068524, + "learning_rate": 0.00042520015542759065, + "loss": 0.86149234, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.30151367, + "step": 2918, + "time_per_iteration": 2.838871717453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083881, + "balance_loss_mlp": 1.05379248, + "epoch": 0.5615621392843402, + "flos": 642351444480.0, + "grad_norm": 0.06380613116798055, + "language_loss": 0.88105166, + "learning_rate": 0.00042489213355292687, + "loss": 0.89189053, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.30053711, + "step": 2919, + "time_per_iteration": 2.882988214492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081698, + "balance_loss_mlp": 1.0521102, + "epoch": 0.5617545209696037, + "flos": 427524715008.0, + "grad_norm": 0.05903342570268675, + "language_loss": 0.80986512, + "learning_rate": 0.00042458414083772276, + "loss": 0.82068217, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.29541016, + "step": 2920, + "time_per_iteration": 2.520209550857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107915, + "balance_loss_mlp": 1.04829907, + "epoch": 0.5619469026548672, + "flos": 568140037632.0, + "grad_norm": 0.05182413981421792, + "language_loss": 0.85047603, + "learning_rate": 0.000424276177401552, + "loss": 0.86126757, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.30810547, + "step": 2921, + "time_per_iteration": 2.777956008911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075106, + "balance_loss_mlp": 1.04435039, + "epoch": 0.5621392843401308, + "flos": 504947243520.0, + "grad_norm": 0.05854064719302618, + "language_loss": 0.85700345, + "learning_rate": 0.0004239682433639763, + "loss": 0.86775458, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.30712891, + "step": 2922, + "time_per_iteration": 2.658231019973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074103, + "balance_loss_mlp": 1.04344249, + "epoch": 0.5623316660253944, + "flos": 516744258048.0, + "grad_norm": 0.07532891292065343, + "language_loss": 0.85277867, + "learning_rate": 0.0004236603388445467, + "loss": 0.86351973, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.30639648, + "step": 2923, + "time_per_iteration": 2.5820417404174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073675, + "balance_loss_mlp": 1.04346776, + "epoch": 0.5625240477106579, + "flos": 605732456448.0, + "grad_norm": 0.05777778027932593, + "language_loss": 0.82139969, + "learning_rate": 0.00042335246396280166, + "loss": 0.83213639, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.30151367, + "step": 2924, + "time_per_iteration": 2.7298922538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072793, + "balance_loss_mlp": 1.04198885, + "epoch": 0.5627164293959215, + "flos": 450203539968.0, + "grad_norm": 0.06950178029529624, + "language_loss": 0.90437222, + "learning_rate": 0.0004230446188382693, + "loss": 0.9151001, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.30761719, + "step": 2925, + "time_per_iteration": 2.533452033996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072327, + "balance_loss_mlp": 1.04133308, + "epoch": 0.5629088110811851, + "flos": 741734032896.0, + "grad_norm": 0.061159313769390204, + "language_loss": 0.80411077, + "learning_rate": 0.0004227368035904654, + "loss": 0.81483406, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.30957031, + "step": 2926, + "time_per_iteration": 2.953749895095825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070388, + "balance_loss_mlp": 1.04001379, + "epoch": 0.5631011927664487, + "flos": 496719323136.0, + "grad_norm": 0.05619049718209651, + "language_loss": 0.82702053, + "learning_rate": 0.00042242901833889474, + "loss": 0.83772445, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.30322266, + "step": 2927, + "time_per_iteration": 2.6141388416290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079835, + "balance_loss_mlp": 1.04977047, + "epoch": 0.5632935744517122, + "flos": 885774445056.0, + "grad_norm": 0.06403217415420936, + "language_loss": 0.86264247, + "learning_rate": 0.0004221212632030501, + "loss": 0.8734408, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.30004883, + "step": 2928, + "time_per_iteration": 3.0815889835357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079959, + "balance_loss_mlp": 1.04953694, + "epoch": 0.5634859561369757, + "flos": 604516096512.0, + "grad_norm": 0.0586888061552407, + "language_loss": 0.7995134, + "learning_rate": 0.0004218135383024124, + "loss": 0.81031299, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.30395508, + "step": 2929, + "time_per_iteration": 2.7041475772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074718, + "balance_loss_mlp": 1.04417634, + "epoch": 0.5636783378222393, + "flos": 453670737408.0, + "grad_norm": 0.06027811401713532, + "language_loss": 0.84979665, + "learning_rate": 0.0004215058437564511, + "loss": 0.86054391, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.30493164, + "step": 2930, + "time_per_iteration": 2.5627479553222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074654, + "balance_loss_mlp": 1.04427934, + "epoch": 0.5638707195075029, + "flos": 518206519296.0, + "grad_norm": 0.054381619158741505, + "language_loss": 0.8244099, + "learning_rate": 0.00042119817968462397, + "loss": 0.83515644, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.30322266, + "step": 2931, + "time_per_iteration": 2.5824992656707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076007, + "balance_loss_mlp": 1.04517913, + "epoch": 0.5640631011927665, + "flos": 564632142336.0, + "grad_norm": 0.06458971753482587, + "language_loss": 0.86743045, + "learning_rate": 0.0004208905462063766, + "loss": 0.87819058, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.30786133, + "step": 2932, + "time_per_iteration": 2.6889755725860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075474, + "balance_loss_mlp": 1.04447937, + "epoch": 0.56425548287803, + "flos": 516783545856.0, + "grad_norm": 0.05636003677155103, + "language_loss": 0.84317416, + "learning_rate": 0.00042058294344114315, + "loss": 0.85392892, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.30957031, + "step": 2933, + "time_per_iteration": 2.626492500305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073066, + "balance_loss_mlp": 1.0428108, + "epoch": 0.5644478645632935, + "flos": 853907415552.0, + "grad_norm": 0.05419859074132438, + "language_loss": 0.77552223, + "learning_rate": 0.0004202753715083456, + "loss": 0.78625292, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.30224609, + "step": 2934, + "time_per_iteration": 3.0855889320373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077905, + "balance_loss_mlp": 1.04767334, + "epoch": 0.5646402462485571, + "flos": 553175571456.0, + "grad_norm": 0.0600578906837947, + "language_loss": 0.81160748, + "learning_rate": 0.0004199678305273936, + "loss": 0.8223865, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.30200195, + "step": 2935, + "time_per_iteration": 2.680676221847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072428, + "balance_loss_mlp": 1.04176772, + "epoch": 0.5648326279338207, + "flos": 685661898240.0, + "grad_norm": 0.07403764487671594, + "language_loss": 0.81138289, + "learning_rate": 0.0004196603206176854, + "loss": 0.8221072, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.30615234, + "step": 2936, + "time_per_iteration": 2.930933713912964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084589, + "balance_loss_mlp": 1.05526328, + "epoch": 0.5650250096190843, + "flos": 802990291968.0, + "grad_norm": 0.06763515513860026, + "language_loss": 0.8344292, + "learning_rate": 0.000419352841898607, + "loss": 0.8452751, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.29272461, + "step": 2937, + "time_per_iteration": 2.983389377593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076783, + "balance_loss_mlp": 1.04714775, + "epoch": 0.5652173913043478, + "flos": 581787809280.0, + "grad_norm": 0.06159153322850295, + "language_loss": 0.77355075, + "learning_rate": 0.000419045394489532, + "loss": 0.78431857, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.29589844, + "step": 2938, + "time_per_iteration": 2.7125768661499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082739, + "balance_loss_mlp": 1.05229306, + "epoch": 0.5654097729896114, + "flos": 820269614592.0, + "grad_norm": 0.051986884313783496, + "language_loss": 0.76774859, + "learning_rate": 0.0004187379785098224, + "loss": 0.77857602, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.30395508, + "step": 2939, + "time_per_iteration": 3.127896547317505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078707, + "balance_loss_mlp": 1.04854691, + "epoch": 0.565602154674875, + "flos": 783826716672.0, + "grad_norm": 0.05965997721506439, + "language_loss": 0.83921504, + "learning_rate": 0.00041843059407882744, + "loss": 0.85000205, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.30126953, + "step": 2940, + "time_per_iteration": 2.97220778465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010812, + "balance_loss_mlp": 1.05113554, + "epoch": 0.5657945363601385, + "flos": 549418802688.0, + "grad_norm": 0.05367108270531433, + "language_loss": 0.82534146, + "learning_rate": 0.0004181232413158842, + "loss": 0.83615345, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.30004883, + "step": 2941, + "time_per_iteration": 2.642336368560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108333, + "balance_loss_mlp": 1.05405188, + "epoch": 0.5659869180454021, + "flos": 667826754048.0, + "grad_norm": 0.06412651995290534, + "language_loss": 0.82513189, + "learning_rate": 0.0004178159203403179, + "loss": 0.83596516, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.29272461, + "step": 2942, + "time_per_iteration": 2.856449842453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082217, + "balance_loss_mlp": 1.05260575, + "epoch": 0.5661792997306656, + "flos": 499707864576.0, + "grad_norm": 0.056771241115104176, + "language_loss": 0.81273901, + "learning_rate": 0.0004175086312714409, + "loss": 0.82356119, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.2956543, + "step": 2943, + "time_per_iteration": 2.62709903717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088098, + "balance_loss_mlp": 1.05898714, + "epoch": 0.5663716814159292, + "flos": 600922271232.0, + "grad_norm": 0.050224853353863855, + "language_loss": 0.83679438, + "learning_rate": 0.00041720137422855366, + "loss": 0.84767538, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.29052734, + "step": 2944, + "time_per_iteration": 2.730576515197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086164, + "balance_loss_mlp": 1.05710077, + "epoch": 0.5665640631011928, + "flos": 540728193024.0, + "grad_norm": 0.0578384318096137, + "language_loss": 0.78684467, + "learning_rate": 0.00041689414933094383, + "loss": 0.79770631, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.2902832, + "step": 2945, + "time_per_iteration": 2.6317861080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084966, + "balance_loss_mlp": 1.05483007, + "epoch": 0.5667564447864564, + "flos": 601655592960.0, + "grad_norm": 0.061631419209263724, + "language_loss": 0.80986917, + "learning_rate": 0.00041658695669788653, + "loss": 0.82071877, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.30102539, + "step": 2946, + "time_per_iteration": 2.766889810562134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083037, + "balance_loss_mlp": 1.05352092, + "epoch": 0.5669488264717198, + "flos": 659224894464.0, + "grad_norm": 0.08686938236765575, + "language_loss": 0.81373537, + "learning_rate": 0.00041627979644864453, + "loss": 0.82456571, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.29467773, + "step": 2947, + "time_per_iteration": 2.7937870025634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085685, + "balance_loss_mlp": 1.0563122, + "epoch": 0.5671412081569834, + "flos": 485158035456.0, + "grad_norm": 0.05686002455066826, + "language_loss": 0.81299067, + "learning_rate": 0.0004159726687024683, + "loss": 0.82384753, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.29345703, + "step": 2948, + "time_per_iteration": 2.636784791946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083527, + "balance_loss_mlp": 1.05417752, + "epoch": 0.567333589842247, + "flos": 729487475712.0, + "grad_norm": 0.057207156589959604, + "language_loss": 0.7857877, + "learning_rate": 0.00041566557357859506, + "loss": 0.79662293, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.29321289, + "step": 2949, + "time_per_iteration": 2.8607821464538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080781, + "balance_loss_mlp": 1.05131269, + "epoch": 0.5675259715275106, + "flos": 968471258112.0, + "grad_norm": 0.050618871180039625, + "language_loss": 0.79166919, + "learning_rate": 0.0004153585111962502, + "loss": 0.802477, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.29443359, + "step": 2950, + "time_per_iteration": 3.306715250015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108342, + "balance_loss_mlp": 1.05387974, + "epoch": 0.5677183532127742, + "flos": 564879453696.0, + "grad_norm": 0.08196542197504524, + "language_loss": 0.84189069, + "learning_rate": 0.0004150514816746453, + "loss": 0.85272491, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.29492188, + "step": 2951, + "time_per_iteration": 2.6732659339904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080966, + "balance_loss_mlp": 1.05190265, + "epoch": 0.5679107348980377, + "flos": 551432503296.0, + "grad_norm": 0.06474663434913709, + "language_loss": 0.85581088, + "learning_rate": 0.0004147444851329802, + "loss": 0.86662048, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.29003906, + "step": 2952, + "time_per_iteration": 2.647568941116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079758, + "balance_loss_mlp": 1.05081391, + "epoch": 0.5681031165833013, + "flos": 819115863552.0, + "grad_norm": 0.0574748240063073, + "language_loss": 0.85410154, + "learning_rate": 0.00041443752169044126, + "loss": 0.8648991, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.28955078, + "step": 2953, + "time_per_iteration": 3.018815040588379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081341, + "balance_loss_mlp": 1.05227828, + "epoch": 0.5682954982685648, + "flos": 617731702272.0, + "grad_norm": 0.05380576703697579, + "language_loss": 0.846789, + "learning_rate": 0.0004141305914662025, + "loss": 0.85760248, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.29052734, + "step": 2954, + "time_per_iteration": 2.7356324195861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088016, + "balance_loss_mlp": 1.05807066, + "epoch": 0.5684878799538284, + "flos": 647625729024.0, + "grad_norm": 0.05392421630137883, + "language_loss": 0.80538452, + "learning_rate": 0.0004138236945794246, + "loss": 0.81626463, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.29907227, + "step": 2955, + "time_per_iteration": 2.8904106616973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082907, + "balance_loss_mlp": 1.05439222, + "epoch": 0.5686802616390919, + "flos": 805615068672.0, + "grad_norm": 0.07320613099583566, + "language_loss": 0.83898306, + "learning_rate": 0.00041351683114925576, + "loss": 0.84981215, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.28491211, + "step": 2956, + "time_per_iteration": 3.0756330490112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085945, + "balance_loss_mlp": 1.05683398, + "epoch": 0.5688726433243555, + "flos": 546882776064.0, + "grad_norm": 0.05933823821942172, + "language_loss": 0.86556458, + "learning_rate": 0.0004132100012948308, + "loss": 0.87642407, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.29077148, + "step": 2957, + "time_per_iteration": 2.6803860664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085401, + "balance_loss_mlp": 1.05614674, + "epoch": 0.5690650250096191, + "flos": 486324933120.0, + "grad_norm": 0.06187903851247569, + "language_loss": 0.84050244, + "learning_rate": 0.00041290320513527145, + "loss": 0.85135645, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.29248047, + "step": 2958, + "time_per_iteration": 2.54225754737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084682, + "balance_loss_mlp": 1.05545211, + "epoch": 0.5692574066948827, + "flos": 577184237568.0, + "grad_norm": 0.04955077863713089, + "language_loss": 0.85089266, + "learning_rate": 0.0004125964427896867, + "loss": 0.86173952, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.29199219, + "step": 2959, + "time_per_iteration": 2.716848611831665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083431, + "balance_loss_mlp": 1.0530802, + "epoch": 0.5694497883801463, + "flos": 454005388800.0, + "grad_norm": 0.0635030186812047, + "language_loss": 0.79277623, + "learning_rate": 0.0004122897143771723, + "loss": 0.80361056, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.30297852, + "step": 2960, + "time_per_iteration": 2.53230357170105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086179, + "balance_loss_mlp": 1.05628169, + "epoch": 0.5696421700654097, + "flos": 559251578880.0, + "grad_norm": 0.052407613892641675, + "language_loss": 0.81192493, + "learning_rate": 0.0004119830200168109, + "loss": 0.82278675, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.29858398, + "step": 2961, + "time_per_iteration": 2.684126377105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083551, + "balance_loss_mlp": 1.05355775, + "epoch": 0.5698345517506733, + "flos": 465314982912.0, + "grad_norm": 0.06121192976286501, + "language_loss": 0.88053119, + "learning_rate": 0.0004116763598276714, + "loss": 0.89136672, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.29956055, + "step": 2962, + "time_per_iteration": 2.531313180923462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108181, + "balance_loss_mlp": 1.05138803, + "epoch": 0.5700269334359369, + "flos": 605645116416.0, + "grad_norm": 0.069996546899228, + "language_loss": 0.8081792, + "learning_rate": 0.00041136973392881017, + "loss": 0.81899732, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.30395508, + "step": 2963, + "time_per_iteration": 2.8093085289001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083477, + "balance_loss_mlp": 1.05357933, + "epoch": 0.5702193151212005, + "flos": 562423412736.0, + "grad_norm": 0.06390032386968057, + "language_loss": 0.8227576, + "learning_rate": 0.00041106314243926983, + "loss": 0.8335923, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.29858398, + "step": 2964, + "time_per_iteration": 2.740004062652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080188, + "balance_loss_mlp": 1.05062366, + "epoch": 0.570411696806464, + "flos": 522983208960.0, + "grad_norm": 0.060533570265575896, + "language_loss": 0.87250763, + "learning_rate": 0.0004107565854780798, + "loss": 0.88330954, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.29516602, + "step": 2965, + "time_per_iteration": 2.6749136447906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080245, + "balance_loss_mlp": 1.05111039, + "epoch": 0.5706040784917276, + "flos": 717911631360.0, + "grad_norm": 0.06664541213513904, + "language_loss": 0.80888879, + "learning_rate": 0.000410450063164256, + "loss": 0.81969118, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.29077148, + "step": 2966, + "time_per_iteration": 2.8448963165283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081067, + "balance_loss_mlp": 1.05081153, + "epoch": 0.5707964601769911, + "flos": 476467425792.0, + "grad_norm": 0.06804112412049489, + "language_loss": 0.82108605, + "learning_rate": 0.00041014357561680115, + "loss": 0.83189678, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.30200195, + "step": 2967, + "time_per_iteration": 2.5226550102233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084124, + "balance_loss_mlp": 1.0544889, + "epoch": 0.5709888418622547, + "flos": 579823570944.0, + "grad_norm": 0.059986306134107735, + "language_loss": 0.86107051, + "learning_rate": 0.0004098371229547039, + "loss": 0.87191176, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.29589844, + "step": 2968, + "time_per_iteration": 2.7232651710510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046193, + "balance_loss_mlp": 1.03398585, + "epoch": 0.5711812235475183, + "flos": 1579108290048.0, + "grad_norm": 0.025451731838023718, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81057, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.12207031, + "step": 2969, + "time_per_iteration": 4.785320997238159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082869, + "balance_loss_mlp": 1.05330527, + "epoch": 0.5713736052327818, + "flos": 468259854336.0, + "grad_norm": 0.07178133530641487, + "language_loss": 0.80500889, + "learning_rate": 0.00040922432276247107, + "loss": 0.81583756, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.29516602, + "step": 2970, + "time_per_iteration": 2.5877230167388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086085, + "balance_loss_mlp": 1.05635428, + "epoch": 0.5715659869180454, + "flos": 537390443520.0, + "grad_norm": 0.05561639186548029, + "language_loss": 0.84452176, + "learning_rate": 0.0004089179754702457, + "loss": 0.85538256, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.29663086, + "step": 2971, + "time_per_iteration": 2.759932041168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084469, + "balance_loss_mlp": 1.05469072, + "epoch": 0.571758368603309, + "flos": 655778045952.0, + "grad_norm": 0.05716809371830958, + "language_loss": 0.79499936, + "learning_rate": 0.00040861166353919843, + "loss": 0.80584407, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.29711914, + "step": 2972, + "time_per_iteration": 2.856147050857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080407, + "balance_loss_mlp": 1.05213094, + "epoch": 0.5719507502885726, + "flos": 667609966080.0, + "grad_norm": 0.054720530113361164, + "language_loss": 0.81279707, + "learning_rate": 0.00040830538708824983, + "loss": 0.82360113, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.28295898, + "step": 2973, + "time_per_iteration": 2.9099643230438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083682, + "balance_loss_mlp": 1.05414152, + "epoch": 0.572143131973836, + "flos": 476083312128.0, + "grad_norm": 0.059341772904328634, + "language_loss": 0.81557322, + "learning_rate": 0.000407999146236307, + "loss": 0.82641, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.29492188, + "step": 2974, + "time_per_iteration": 2.5506579875946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087372, + "balance_loss_mlp": 1.05807054, + "epoch": 0.5723355136590996, + "flos": 539255757312.0, + "grad_norm": 0.05823834072467256, + "language_loss": 0.8320694, + "learning_rate": 0.0004076929411022634, + "loss": 0.84294319, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.29248047, + "step": 2975, + "time_per_iteration": 2.6159043312072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081032, + "balance_loss_mlp": 1.05125356, + "epoch": 0.5725278953443632, + "flos": 823784864256.0, + "grad_norm": 0.059359253337435705, + "language_loss": 0.79102635, + "learning_rate": 0.0004073867718049982, + "loss": 0.80183673, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.29736328, + "step": 2976, + "time_per_iteration": 3.104320526123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087781, + "balance_loss_mlp": 1.05745435, + "epoch": 0.5727202770296268, + "flos": 587155226112.0, + "grad_norm": 0.06002278348442279, + "language_loss": 0.82387239, + "learning_rate": 0.00040708063846337704, + "loss": 0.83475018, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.30273438, + "step": 2977, + "time_per_iteration": 2.7141377925872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088437, + "balance_loss_mlp": 1.05906403, + "epoch": 0.5729126587148904, + "flos": 446723195904.0, + "grad_norm": 0.05629415234265891, + "language_loss": 0.81140733, + "learning_rate": 0.00040677454119625143, + "loss": 0.82229173, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.29321289, + "step": 2978, + "time_per_iteration": 2.5579118728637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079501, + "balance_loss_mlp": 1.04967451, + "epoch": 0.5731050404001539, + "flos": 519206091264.0, + "grad_norm": 0.06287623577372331, + "language_loss": 0.82978582, + "learning_rate": 0.0004064684801224587, + "loss": 0.84058082, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.2980957, + "step": 2979, + "time_per_iteration": 2.6184630393981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080607, + "balance_loss_mlp": 1.05047131, + "epoch": 0.5732974220854175, + "flos": 504528224256.0, + "grad_norm": 0.049858532305801305, + "language_loss": 0.80364764, + "learning_rate": 0.00040616245536082224, + "loss": 0.81445372, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.30078125, + "step": 2980, + "time_per_iteration": 2.605652093887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075472, + "balance_loss_mlp": 1.04602742, + "epoch": 0.573489803770681, + "flos": 592187991552.0, + "grad_norm": 0.05649585275193457, + "language_loss": 0.81399214, + "learning_rate": 0.00040585646703015165, + "loss": 0.82474685, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.29418945, + "step": 2981, + "time_per_iteration": 2.8440651893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081482, + "balance_loss_mlp": 1.05103636, + "epoch": 0.5736821854559446, + "flos": 489672857088.0, + "grad_norm": 0.0633133856450646, + "language_loss": 0.78068441, + "learning_rate": 0.0004055505152492419, + "loss": 0.79149926, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.30419922, + "step": 2982, + "time_per_iteration": 2.7125117778778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076312, + "balance_loss_mlp": 1.0467, + "epoch": 0.5738745671412081, + "flos": 457895987712.0, + "grad_norm": 0.057765721767923175, + "language_loss": 0.74208528, + "learning_rate": 0.00040524460013687425, + "loss": 0.75284839, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.29589844, + "step": 2983, + "time_per_iteration": 2.7232775688171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080701, + "balance_loss_mlp": 1.05151832, + "epoch": 0.5740669488264717, + "flos": 580012655616.0, + "grad_norm": 0.049591997410844156, + "language_loss": 0.81157619, + "learning_rate": 0.0004049387218118155, + "loss": 0.82238322, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.29199219, + "step": 2984, + "time_per_iteration": 2.956636428833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080147, + "balance_loss_mlp": 1.04934323, + "epoch": 0.5742593305117353, + "flos": 524155898880.0, + "grad_norm": 0.06847869877575175, + "language_loss": 0.84987867, + "learning_rate": 0.00040463288039281777, + "loss": 0.8606801, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.30761719, + "step": 2985, + "time_per_iteration": 2.7503554821014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013561, + "balance_loss_mlp": 1.00078201, + "epoch": 0.5744517121969989, + "flos": 1553033488896.0, + "grad_norm": 0.012095267017415088, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78889978, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.12792969, + "step": 2986, + "time_per_iteration": 5.030332565307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079255, + "balance_loss_mlp": 1.04981041, + "epoch": 0.5746440938822625, + "flos": 751600304640.0, + "grad_norm": 0.055809040190366505, + "language_loss": 0.82136881, + "learning_rate": 0.0004040213087479444, + "loss": 0.83216131, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.29443359, + "step": 2987, + "time_per_iteration": 2.926941156387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087088, + "balance_loss_mlp": 1.05816782, + "epoch": 0.5748364755675259, + "flos": 501618258432.0, + "grad_norm": 0.06868722002267488, + "language_loss": 0.85331053, + "learning_rate": 0.0004037155787595018, + "loss": 0.8641814, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.28857422, + "step": 2988, + "time_per_iteration": 2.561497211456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085606, + "balance_loss_mlp": 1.05599451, + "epoch": 0.5750288572527895, + "flos": 503757024768.0, + "grad_norm": 0.05119655910511677, + "language_loss": 0.80321741, + "learning_rate": 0.000403409886151987, + "loss": 0.81407344, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.29589844, + "step": 2989, + "time_per_iteration": 2.9114019870758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013296, + "balance_loss_mlp": 1.00061202, + "epoch": 0.5752212389380531, + "flos": 1540541030400.0, + "grad_norm": 0.008836939301122537, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83012402, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.12695312, + "step": 2990, + "time_per_iteration": 4.770756483078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013357, + "balance_loss_mlp": 1.00086439, + "epoch": 0.5754136206233167, + "flos": 1566499378176.0, + "grad_norm": 0.007697309180098509, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79211962, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.125, + "step": 2991, + "time_per_iteration": 4.786288499832153 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083775, + "balance_loss_mlp": 1.05537939, + "epoch": 0.5756060023085803, + "flos": 797806167552.0, + "grad_norm": 0.05348004588160335, + "language_loss": 0.76926208, + "learning_rate": 0.00040249303380173807, + "loss": 0.78009981, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.28369141, + "step": 2992, + "time_per_iteration": 3.0660438537597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010851, + "balance_loss_mlp": 1.05629849, + "epoch": 0.5757983839938438, + "flos": 587588802048.0, + "grad_norm": 0.06048493616630367, + "language_loss": 0.79311389, + "learning_rate": 0.00040218749190459126, + "loss": 0.80396485, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.28808594, + "step": 2993, + "time_per_iteration": 2.7251527309417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084541, + "balance_loss_mlp": 1.05514371, + "epoch": 0.5759907656791073, + "flos": 516576932352.0, + "grad_norm": 0.0697186971943442, + "language_loss": 0.82477212, + "learning_rate": 0.00040188198798162775, + "loss": 0.83561754, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.29370117, + "step": 2994, + "time_per_iteration": 2.6159136295318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081131, + "balance_loss_mlp": 1.05147123, + "epoch": 0.5761831473643709, + "flos": 586845305856.0, + "grad_norm": 0.057556686362034246, + "language_loss": 0.85848254, + "learning_rate": 0.000401576522151455, + "loss": 0.86929381, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.29614258, + "step": 2995, + "time_per_iteration": 2.811438798904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108636, + "balance_loss_mlp": 1.05775023, + "epoch": 0.5763755290496345, + "flos": 543619219968.0, + "grad_norm": 0.04540215088386673, + "language_loss": 0.82446247, + "learning_rate": 0.0004012710945326651, + "loss": 0.83532608, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.28613281, + "step": 2996, + "time_per_iteration": 2.778818368911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108663, + "balance_loss_mlp": 1.05790055, + "epoch": 0.576567910734898, + "flos": 625930509312.0, + "grad_norm": 0.049519109180824444, + "language_loss": 0.81129038, + "learning_rate": 0.0004009657052438355, + "loss": 0.82215673, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.28686523, + "step": 2997, + "time_per_iteration": 2.8787920475006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094954, + "balance_loss_mlp": 1.06612968, + "epoch": 0.5767602924201616, + "flos": 537985552896.0, + "grad_norm": 0.05906428447956742, + "language_loss": 0.85482752, + "learning_rate": 0.00040066035440352904, + "loss": 0.86577708, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.2878418, + "step": 2998, + "time_per_iteration": 2.634565591812134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045759, + "balance_loss_mlp": 1.03379035, + "epoch": 0.5769526741054252, + "flos": 1558969873920.0, + "grad_norm": 0.021537766013807906, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80338895, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.11962891, + "step": 2999, + "time_per_iteration": 4.964475393295288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090784, + "balance_loss_mlp": 1.06248331, + "epoch": 0.5771450557906888, + "flos": 467939759616.0, + "grad_norm": 0.06837432109358414, + "language_loss": 0.75964624, + "learning_rate": 0.00040004976854266145, + "loss": 0.77055407, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.28295898, + "step": 3000, + "time_per_iteration": 2.5489282608032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089485, + "balance_loss_mlp": 1.06006408, + "epoch": 0.5773374374759523, + "flos": 574288828416.0, + "grad_norm": 0.0545980885089623, + "language_loss": 0.81222647, + "learning_rate": 0.0003997445337591505, + "loss": 0.82312131, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.29370117, + "step": 3001, + "time_per_iteration": 2.6890947818756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108546, + "balance_loss_mlp": 1.05680251, + "epoch": 0.5775298191612158, + "flos": 528216795648.0, + "grad_norm": 0.06583721131765849, + "language_loss": 0.74093473, + "learning_rate": 0.0003994393378982635, + "loss": 0.75178933, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.28662109, + "step": 3002, + "time_per_iteration": 2.596644401550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043122, + "balance_loss_mlp": 1.03153443, + "epoch": 0.5777222008464794, + "flos": 1303178070528.0, + "grad_norm": 0.017943105040569007, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80581129, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.11572266, + "step": 3003, + "time_per_iteration": 4.826138257980347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085564, + "balance_loss_mlp": 1.05666792, + "epoch": 0.577914582531743, + "flos": 603344816640.0, + "grad_norm": 0.058273014851323426, + "language_loss": 0.87901747, + "learning_rate": 0.0003988290634182961, + "loss": 0.88987309, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.28881836, + "step": 3004, + "time_per_iteration": 2.7604172229766846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088001, + "balance_loss_mlp": 1.06015372, + "epoch": 0.5781069642170066, + "flos": 486537338880.0, + "grad_norm": 0.06327449394997672, + "language_loss": 0.80677181, + "learning_rate": 0.0003985239850361453, + "loss": 0.81765187, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.27856445, + "step": 3005, + "time_per_iteration": 2.5994105339050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091534, + "balance_loss_mlp": 1.06256592, + "epoch": 0.5782993459022701, + "flos": 506016626688.0, + "grad_norm": 0.057065414052448256, + "language_loss": 0.84621793, + "learning_rate": 0.0003982189460504777, + "loss": 0.85713327, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.28930664, + "step": 3006, + "time_per_iteration": 2.722778797149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091061, + "balance_loss_mlp": 1.06261778, + "epoch": 0.5784917275875336, + "flos": 601872380928.0, + "grad_norm": 0.0654169545720973, + "language_loss": 0.79183024, + "learning_rate": 0.00039791394657971935, + "loss": 0.80274087, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.28442383, + "step": 3007, + "time_per_iteration": 2.7318689823150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089506, + "balance_loss_mlp": 1.06056237, + "epoch": 0.5786841092727972, + "flos": 521279428608.0, + "grad_norm": 0.06429658550493057, + "language_loss": 0.84402883, + "learning_rate": 0.00039760898674228205, + "loss": 0.85492396, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.28930664, + "step": 3008, + "time_per_iteration": 2.6548941135406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087482, + "balance_loss_mlp": 1.05884826, + "epoch": 0.5788764909580608, + "flos": 767047809024.0, + "grad_norm": 0.0525681924040606, + "language_loss": 0.80782068, + "learning_rate": 0.0003973040666565613, + "loss": 0.81869543, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.28588867, + "step": 3009, + "time_per_iteration": 3.065049171447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087663, + "balance_loss_mlp": 1.05972004, + "epoch": 0.5790688726433244, + "flos": 598786324992.0, + "grad_norm": 0.058928126410829465, + "language_loss": 0.81879556, + "learning_rate": 0.000396999186440938, + "loss": 0.82967222, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.27954102, + "step": 3010, + "time_per_iteration": 2.860755205154419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086781, + "balance_loss_mlp": 1.05871928, + "epoch": 0.5792612543285879, + "flos": 522805708800.0, + "grad_norm": 0.06775550082118927, + "language_loss": 0.84739363, + "learning_rate": 0.000396694346213777, + "loss": 0.85826147, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.28076172, + "step": 3011, + "time_per_iteration": 2.591801643371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077556, + "balance_loss_mlp": 1.04815888, + "epoch": 0.5794536360138515, + "flos": 876178805760.0, + "grad_norm": 0.09075774540794283, + "language_loss": 0.83682388, + "learning_rate": 0.0003963895460934276, + "loss": 0.84759945, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.29370117, + "step": 3012, + "time_per_iteration": 3.1549274921417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080769, + "balance_loss_mlp": 1.05242133, + "epoch": 0.5796460176991151, + "flos": 401221541376.0, + "grad_norm": 0.07824771870324425, + "language_loss": 0.85031927, + "learning_rate": 0.00039608478619822376, + "loss": 0.86112702, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.28344727, + "step": 3013, + "time_per_iteration": 2.436859369277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108003, + "balance_loss_mlp": 1.05091906, + "epoch": 0.5798383993843786, + "flos": 618229297152.0, + "grad_norm": 0.07454312954276684, + "language_loss": 0.82720006, + "learning_rate": 0.00039578006664648394, + "loss": 0.83800036, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.29125977, + "step": 3014, + "time_per_iteration": 2.813934326171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082643, + "balance_loss_mlp": 1.05350864, + "epoch": 0.5800307810696421, + "flos": 843966950400.0, + "grad_norm": 0.07429538018047967, + "language_loss": 0.81169355, + "learning_rate": 0.0003954753875565105, + "loss": 0.82251996, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.29101562, + "step": 3015, + "time_per_iteration": 3.089141607284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075928, + "balance_loss_mlp": 1.04674578, + "epoch": 0.5802231627549057, + "flos": 569005779456.0, + "grad_norm": 0.053240000714227444, + "language_loss": 0.8237859, + "learning_rate": 0.00039517074904659057, + "loss": 0.8345452, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.29125977, + "step": 3016, + "time_per_iteration": 2.7315711975097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081141, + "balance_loss_mlp": 1.05217314, + "epoch": 0.5804155444401693, + "flos": 660160447488.0, + "grad_norm": 0.0618256833307492, + "language_loss": 0.84621388, + "learning_rate": 0.00039486615123499535, + "loss": 0.85702527, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.28955078, + "step": 3017, + "time_per_iteration": 2.870152235031128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082579, + "balance_loss_mlp": 1.05342066, + "epoch": 0.5806079261254329, + "flos": 513726603264.0, + "grad_norm": 0.06092979313789558, + "language_loss": 0.85065556, + "learning_rate": 0.00039456159423997996, + "loss": 0.86148143, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.29125977, + "step": 3018, + "time_per_iteration": 2.6494932174682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078264, + "balance_loss_mlp": 1.04867649, + "epoch": 0.5808003078106965, + "flos": 528379739136.0, + "grad_norm": 0.05170574080230249, + "language_loss": 0.89520943, + "learning_rate": 0.00039425707817978406, + "loss": 0.90599209, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.29541016, + "step": 3019, + "time_per_iteration": 2.690485715866089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078627, + "balance_loss_mlp": 1.04894376, + "epoch": 0.58099268949596, + "flos": 476787520512.0, + "grad_norm": 0.06031161665678942, + "language_loss": 0.83372945, + "learning_rate": 0.00039395260317263124, + "loss": 0.84451568, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.29663086, + "step": 3020, + "time_per_iteration": 2.677818775177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076598, + "balance_loss_mlp": 1.0466764, + "epoch": 0.5811850711812235, + "flos": 517340777472.0, + "grad_norm": 0.056782275650517425, + "language_loss": 0.84907949, + "learning_rate": 0.0003936481693367291, + "loss": 0.8598454, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.29882812, + "step": 3021, + "time_per_iteration": 2.647017002105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084791, + "balance_loss_mlp": 1.05491698, + "epoch": 0.5813774528664871, + "flos": 616122464256.0, + "grad_norm": 0.06733027879749674, + "language_loss": 0.87502337, + "learning_rate": 0.0003933437767902697, + "loss": 0.88587123, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.29833984, + "step": 3022, + "time_per_iteration": 2.825965166091919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085273, + "balance_loss_mlp": 1.05706787, + "epoch": 0.5815698345517507, + "flos": 567194310144.0, + "grad_norm": 0.07318564796931465, + "language_loss": 0.78165317, + "learning_rate": 0.00039303942565142825, + "loss": 0.79250592, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.28222656, + "step": 3023, + "time_per_iteration": 2.7315845489501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087422, + "balance_loss_mlp": 1.0569042, + "epoch": 0.5817622162370142, + "flos": 562886102016.0, + "grad_norm": 0.052544940996134284, + "language_loss": 0.76741624, + "learning_rate": 0.0003927351160383644, + "loss": 0.77829051, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.3046875, + "step": 3024, + "time_per_iteration": 2.789477825164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085705, + "balance_loss_mlp": 1.05609322, + "epoch": 0.5819545979222778, + "flos": 458982899712.0, + "grad_norm": 0.07634686348045291, + "language_loss": 0.77796662, + "learning_rate": 0.000392430848069222, + "loss": 0.78882366, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.29589844, + "step": 3025, + "time_per_iteration": 2.5446279048919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085632, + "balance_loss_mlp": 1.05549598, + "epoch": 0.5821469796075414, + "flos": 541215613440.0, + "grad_norm": 0.05528071963535831, + "language_loss": 0.82223105, + "learning_rate": 0.00039212662186212795, + "loss": 0.83308738, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.30078125, + "step": 3026, + "time_per_iteration": 2.60878849029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079586, + "balance_loss_mlp": 1.04883003, + "epoch": 0.582339361292805, + "flos": 551994117120.0, + "grad_norm": 0.05052748911564131, + "language_loss": 0.76906562, + "learning_rate": 0.0003918224375351934, + "loss": 0.77986145, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.30737305, + "step": 3027, + "time_per_iteration": 2.709887742996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083313, + "balance_loss_mlp": 1.05384469, + "epoch": 0.5825317429780685, + "flos": 496138770432.0, + "grad_norm": 0.05874903473435042, + "language_loss": 0.78473544, + "learning_rate": 0.0003915182952065135, + "loss": 0.79556859, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.29418945, + "step": 3028, + "time_per_iteration": 2.6885859966278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082208, + "balance_loss_mlp": 1.05250072, + "epoch": 0.582724124663332, + "flos": 563890056192.0, + "grad_norm": 0.06824855227929012, + "language_loss": 0.8751812, + "learning_rate": 0.0003912141949941664, + "loss": 0.88600326, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.296875, + "step": 3029, + "time_per_iteration": 2.7145774364471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087726, + "balance_loss_mlp": 1.05799532, + "epoch": 0.5829165063485956, + "flos": 491888788992.0, + "grad_norm": 0.07682913079591057, + "language_loss": 0.82808822, + "learning_rate": 0.0003909101370162143, + "loss": 0.83896548, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.29711914, + "step": 3030, + "time_per_iteration": 2.6085238456726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063086, + "balance_loss_mlp": 1.05116475, + "epoch": 0.5831088880338592, + "flos": 1528134501888.0, + "grad_norm": 0.03433679117263603, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73496974, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.11914062, + "step": 3031, + "time_per_iteration": 4.894438028335571 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080641, + "balance_loss_mlp": 1.05076766, + "epoch": 0.5833012697191228, + "flos": 617712763392.0, + "grad_norm": 0.0542485247275347, + "language_loss": 0.8270607, + "learning_rate": 0.0003903021482356622, + "loss": 0.83786714, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.29833984, + "step": 3032, + "time_per_iteration": 2.8060503005981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079071, + "balance_loss_mlp": 1.04924476, + "epoch": 0.5834936514043862, + "flos": 767578899456.0, + "grad_norm": 0.06913224268253564, + "language_loss": 0.8243112, + "learning_rate": 0.00038999821766910465, + "loss": 0.8351019, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.2980957, + "step": 3033, + "time_per_iteration": 3.013117551803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079849, + "balance_loss_mlp": 1.04992783, + "epoch": 0.5836860330896498, + "flos": 458136096768.0, + "grad_norm": 0.06539568057172108, + "language_loss": 0.85596031, + "learning_rate": 0.00038969432980902606, + "loss": 0.86675882, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.29907227, + "step": 3034, + "time_per_iteration": 2.602159261703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047537, + "balance_loss_mlp": 1.03642654, + "epoch": 0.5838784147749134, + "flos": 1360485504000.0, + "grad_norm": 0.02505289654727371, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.8083204, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.11132812, + "step": 3035, + "time_per_iteration": 4.8551225662231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086213, + "balance_loss_mlp": 1.05664897, + "epoch": 0.584070796460177, + "flos": 566942616576.0, + "grad_norm": 0.05971096981290547, + "language_loss": 0.82545829, + "learning_rate": 0.00038908668268020953, + "loss": 0.8363204, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.29516602, + "step": 3036, + "time_per_iteration": 2.6712634563446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084003, + "balance_loss_mlp": 1.05455875, + "epoch": 0.5842631781454406, + "flos": 611188623360.0, + "grad_norm": 0.06020630991976339, + "language_loss": 0.84750116, + "learning_rate": 0.00038878292364738097, + "loss": 0.85834116, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.29418945, + "step": 3037, + "time_per_iteration": 2.774688959121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087202, + "balance_loss_mlp": 1.05785298, + "epoch": 0.5844555598307041, + "flos": 463148513280.0, + "grad_norm": 0.06330434972052289, + "language_loss": 0.87235534, + "learning_rate": 0.0003884792077928508, + "loss": 0.88322735, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.29345703, + "step": 3038, + "time_per_iteration": 2.511212110519409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088952, + "balance_loss_mlp": 1.05957842, + "epoch": 0.5846479415159677, + "flos": 410005283328.0, + "grad_norm": 0.089824175631678, + "language_loss": 0.76556516, + "learning_rate": 0.0003881755352345322, + "loss": 0.77645469, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.29345703, + "step": 3039, + "time_per_iteration": 2.5297422409057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108977, + "balance_loss_mlp": 1.06039691, + "epoch": 0.5848403232012312, + "flos": 491056542720.0, + "grad_norm": 0.05409760120739159, + "language_loss": 0.8652333, + "learning_rate": 0.0003878719060903207, + "loss": 0.87613106, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.29345703, + "step": 3040, + "time_per_iteration": 2.5606369972229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108437, + "balance_loss_mlp": 1.05447245, + "epoch": 0.5850327048864948, + "flos": 584146335744.0, + "grad_norm": 0.07864155094531469, + "language_loss": 0.83092105, + "learning_rate": 0.0003875683204780961, + "loss": 0.84176469, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.29833984, + "step": 3041, + "time_per_iteration": 2.7069876194000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091497, + "balance_loss_mlp": 1.06128943, + "epoch": 0.5852250865717584, + "flos": 651253049856.0, + "grad_norm": 0.07084084705837652, + "language_loss": 0.85393965, + "learning_rate": 0.00038726477851572043, + "loss": 0.86485463, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.30175781, + "step": 3042, + "time_per_iteration": 2.785623788833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086169, + "balance_loss_mlp": 1.0566287, + "epoch": 0.5854174682570219, + "flos": 534332090880.0, + "grad_norm": 0.06883779110535396, + "language_loss": 0.80354905, + "learning_rate": 0.0003869612803210395, + "loss": 0.81441069, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.29541016, + "step": 3043, + "time_per_iteration": 2.635880708694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075998, + "balance_loss_mlp": 1.04643369, + "epoch": 0.5856098499422855, + "flos": 509501352960.0, + "grad_norm": 0.0705585022393511, + "language_loss": 0.83492166, + "learning_rate": 0.0003866578260118817, + "loss": 0.84568161, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.29541016, + "step": 3044, + "time_per_iteration": 2.58337664604187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074571, + "balance_loss_mlp": 1.04491138, + "epoch": 0.5858022316275491, + "flos": 593619729408.0, + "grad_norm": 0.06598081480709424, + "language_loss": 0.83220106, + "learning_rate": 0.0003863544157060581, + "loss": 0.84294677, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.29614258, + "step": 3045, + "time_per_iteration": 2.66916561126709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079474, + "balance_loss_mlp": 1.04998136, + "epoch": 0.5859946133128127, + "flos": 558829587456.0, + "grad_norm": 0.05207738102195899, + "language_loss": 0.82137144, + "learning_rate": 0.0003860510495213634, + "loss": 0.83216619, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.29492188, + "step": 3046, + "time_per_iteration": 2.8170437812805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072174, + "balance_loss_mlp": 1.04256272, + "epoch": 0.5861869949980761, + "flos": 553431647232.0, + "grad_norm": 0.07713217072038757, + "language_loss": 0.78373164, + "learning_rate": 0.0003857477275755746, + "loss": 0.79445338, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.29589844, + "step": 3047, + "time_per_iteration": 2.639801502227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077446, + "balance_loss_mlp": 1.04678559, + "epoch": 0.5863793766833397, + "flos": 718321886208.0, + "grad_norm": 0.05564403415338841, + "language_loss": 0.84011877, + "learning_rate": 0.00038544444998645167, + "loss": 0.8508932, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.30639648, + "step": 3048, + "time_per_iteration": 3.007289409637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076921, + "balance_loss_mlp": 1.04754782, + "epoch": 0.5865717583686033, + "flos": 472041354240.0, + "grad_norm": 0.06801965614795764, + "language_loss": 0.81586641, + "learning_rate": 0.00038514121687173767, + "loss": 0.8266356, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.29345703, + "step": 3049, + "time_per_iteration": 2.637277603149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072965, + "balance_loss_mlp": 1.04397368, + "epoch": 0.5867641400538669, + "flos": 813143162880.0, + "grad_norm": 0.0576990751755922, + "language_loss": 0.81892288, + "learning_rate": 0.00038483802834915807, + "loss": 0.82965243, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.28979492, + "step": 3050, + "time_per_iteration": 2.975592613220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075399, + "balance_loss_mlp": 1.04607356, + "epoch": 0.5869565217391305, + "flos": 486285645312.0, + "grad_norm": 0.09338183491699942, + "language_loss": 0.78599441, + "learning_rate": 0.00038453488453642074, + "loss": 0.79674846, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.29296875, + "step": 3051, + "time_per_iteration": 2.668680429458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107507, + "balance_loss_mlp": 1.04581618, + "epoch": 0.587148903424394, + "flos": 569104704000.0, + "grad_norm": 0.18186948375192843, + "language_loss": 0.86825669, + "learning_rate": 0.00038423178555121697, + "loss": 0.87900746, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.29223633, + "step": 3052, + "time_per_iteration": 2.7119386196136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080518, + "balance_loss_mlp": 1.05202711, + "epoch": 0.5873412851096576, + "flos": 746948680704.0, + "grad_norm": 0.05190046933032045, + "language_loss": 0.85228276, + "learning_rate": 0.00038392873151121994, + "loss": 0.86308795, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.28466797, + "step": 3053, + "time_per_iteration": 3.0532052516937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075316, + "balance_loss_mlp": 1.04615784, + "epoch": 0.5875336667949211, + "flos": 527882144256.0, + "grad_norm": 0.06073215036153007, + "language_loss": 0.830441, + "learning_rate": 0.0003836257225340859, + "loss": 0.84119415, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.29125977, + "step": 3054, + "time_per_iteration": 2.6791739463806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077784, + "balance_loss_mlp": 1.04922152, + "epoch": 0.5877260484801847, + "flos": 823799420928.0, + "grad_norm": 0.053654559033963406, + "language_loss": 0.82283098, + "learning_rate": 0.00038332275873745336, + "loss": 0.83360887, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.28564453, + "step": 3055, + "time_per_iteration": 3.0826737880706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085261, + "balance_loss_mlp": 1.05646038, + "epoch": 0.5879184301654482, + "flos": 591325221888.0, + "grad_norm": 0.07874067829632751, + "language_loss": 0.82649648, + "learning_rate": 0.0003830198402389431, + "loss": 0.83734912, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.28759766, + "step": 3056, + "time_per_iteration": 2.71244215965271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080719, + "balance_loss_mlp": 1.06755841, + "epoch": 0.5881108118507118, + "flos": 1544953955328.0, + "grad_norm": 0.03508304466376378, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78429663, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.13183594, + "step": 3057, + "time_per_iteration": 4.991718053817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087181, + "balance_loss_mlp": 1.05900002, + "epoch": 0.5883031935359754, + "flos": 489348380160.0, + "grad_norm": 0.0604575145753954, + "language_loss": 0.83162987, + "learning_rate": 0.0003824141396066855, + "loss": 0.84250164, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.28198242, + "step": 3058, + "time_per_iteration": 2.62410044670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095213, + "balance_loss_mlp": 1.06605411, + "epoch": 0.588495575221239, + "flos": 582551654400.0, + "grad_norm": 0.05748148757470156, + "language_loss": 0.83195531, + "learning_rate": 0.000382111357708092, + "loss": 0.84290743, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.29125977, + "step": 3059, + "time_per_iteration": 2.741142511367798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099933, + "balance_loss_mlp": 1.07113242, + "epoch": 0.5886879569065026, + "flos": 660751174656.0, + "grad_norm": 0.07210182052791281, + "language_loss": 0.83736324, + "learning_rate": 0.00038180862157792864, + "loss": 0.84836257, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.28808594, + "step": 3060, + "time_per_iteration": 2.8028531074523926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095663, + "balance_loss_mlp": 1.06733847, + "epoch": 0.588880338591766, + "flos": 562392889344.0, + "grad_norm": 0.06185538750618477, + "language_loss": 0.82032192, + "learning_rate": 0.0003815059313337279, + "loss": 0.83127856, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.28295898, + "step": 3061, + "time_per_iteration": 2.661663055419922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092581, + "balance_loss_mlp": 1.0641377, + "epoch": 0.5890727202770296, + "flos": 554451568128.0, + "grad_norm": 0.054152956568787894, + "language_loss": 0.78217703, + "learning_rate": 0.00038120328709300436, + "loss": 0.7931028, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.28466797, + "step": 3062, + "time_per_iteration": 2.8524019718170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110051, + "balance_loss_mlp": 1.0717572, + "epoch": 0.5892651019622932, + "flos": 655226606592.0, + "grad_norm": 0.07045144115382113, + "language_loss": 0.83619386, + "learning_rate": 0.0003809006889732549, + "loss": 0.84719896, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.28759766, + "step": 3063, + "time_per_iteration": 2.818297863006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093698, + "balance_loss_mlp": 1.06554079, + "epoch": 0.5894574836475568, + "flos": 452970911232.0, + "grad_norm": 0.07166208719676233, + "language_loss": 0.87752122, + "learning_rate": 0.0003805981370919589, + "loss": 0.88845825, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.28173828, + "step": 3064, + "time_per_iteration": 2.523397445678711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091821, + "balance_loss_mlp": 1.06352103, + "epoch": 0.5896498653328203, + "flos": 518763750912.0, + "grad_norm": 0.052273370645306905, + "language_loss": 0.83554685, + "learning_rate": 0.0003802956315665771, + "loss": 0.84646511, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.28320312, + "step": 3065, + "time_per_iteration": 2.7017621994018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091683, + "balance_loss_mlp": 1.06428885, + "epoch": 0.5898422470180839, + "flos": 548793169920.0, + "grad_norm": 0.09115739101573021, + "language_loss": 0.81856883, + "learning_rate": 0.0003799931725145529, + "loss": 0.82948571, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.27416992, + "step": 3066, + "time_per_iteration": 2.6396725177764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091771, + "balance_loss_mlp": 1.0635426, + "epoch": 0.5900346287033474, + "flos": 524046799872.0, + "grad_norm": 0.061744960378181175, + "language_loss": 0.85826695, + "learning_rate": 0.00037969076005331083, + "loss": 0.86918467, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.28271484, + "step": 3067, + "time_per_iteration": 2.7665817737579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088445, + "balance_loss_mlp": 1.05947697, + "epoch": 0.590227010388611, + "flos": 566893154304.0, + "grad_norm": 0.062191843713449865, + "language_loss": 0.87458771, + "learning_rate": 0.00037938839430025817, + "loss": 0.88547218, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.28930664, + "step": 3068, + "time_per_iteration": 2.645289897918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080639, + "balance_loss_mlp": 1.0527916, + "epoch": 0.5904193920738746, + "flos": 583053631488.0, + "grad_norm": 0.07692636502028646, + "language_loss": 0.85409123, + "learning_rate": 0.0003790860753727835, + "loss": 0.86489761, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.27856445, + "step": 3069, + "time_per_iteration": 2.831932544708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087722, + "balance_loss_mlp": 1.05966043, + "epoch": 0.5906117737591381, + "flos": 529428773376.0, + "grad_norm": 0.05698566021180351, + "language_loss": 0.82950222, + "learning_rate": 0.00037878380338825766, + "loss": 0.84037948, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.28076172, + "step": 3070, + "time_per_iteration": 2.6856610774993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094092, + "balance_loss_mlp": 1.06655455, + "epoch": 0.5908041554444017, + "flos": 683908655616.0, + "grad_norm": 0.05699607440456078, + "language_loss": 0.81377411, + "learning_rate": 0.00037848157846403287, + "loss": 0.82471496, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.27539062, + "step": 3071, + "time_per_iteration": 2.9222235679626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090999, + "balance_loss_mlp": 1.06291366, + "epoch": 0.5909965371296653, + "flos": 549719958528.0, + "grad_norm": 0.04993960868235579, + "language_loss": 0.8303259, + "learning_rate": 0.0003781794007174435, + "loss": 0.84123588, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.28076172, + "step": 3072, + "time_per_iteration": 2.8049426078796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047655, + "balance_loss_mlp": 1.03702164, + "epoch": 0.5911889188149289, + "flos": 1491544475136.0, + "grad_norm": 0.02139881306535856, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.7512219, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.10644531, + "step": 3073, + "time_per_iteration": 4.860798597335815 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086537, + "balance_loss_mlp": 1.05854619, + "epoch": 0.5913813005001923, + "flos": 487630043136.0, + "grad_norm": 0.0539637393269004, + "language_loss": 0.81219113, + "learning_rate": 0.0003775751872264152, + "loss": 0.8230564, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.28027344, + "step": 3074, + "time_per_iteration": 2.7820684909820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081353, + "balance_loss_mlp": 1.05267119, + "epoch": 0.5915736821854559, + "flos": 573034590720.0, + "grad_norm": 0.057314841017187666, + "language_loss": 0.87226552, + "learning_rate": 0.0003772731517165527, + "loss": 0.88307905, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.28686523, + "step": 3075, + "time_per_iteration": 2.8264849185943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082687, + "balance_loss_mlp": 1.05383801, + "epoch": 0.5917660638707195, + "flos": 789183959040.0, + "grad_norm": 0.06214529816255618, + "language_loss": 0.83813703, + "learning_rate": 0.0003769711638534784, + "loss": 0.84896386, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.28857422, + "step": 3076, + "time_per_iteration": 2.9739084243774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107611, + "balance_loss_mlp": 1.04769087, + "epoch": 0.5919584455559831, + "flos": 528487428096.0, + "grad_norm": 0.06330128127303343, + "language_loss": 0.78904676, + "learning_rate": 0.00037666922375443446, + "loss": 0.79980791, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.28417969, + "step": 3077, + "time_per_iteration": 2.611528158187866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076959, + "balance_loss_mlp": 1.04815805, + "epoch": 0.5921508272412467, + "flos": 560320962048.0, + "grad_norm": 0.0824489675783013, + "language_loss": 0.81633419, + "learning_rate": 0.00037636733153664396, + "loss": 0.82710373, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.2878418, + "step": 3078, + "time_per_iteration": 2.830021619796753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074589, + "balance_loss_mlp": 1.04547811, + "epoch": 0.5923432089265102, + "flos": 563008347648.0, + "grad_norm": 0.07220859459639119, + "language_loss": 0.79744393, + "learning_rate": 0.0003760654873173124, + "loss": 0.80818975, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.29077148, + "step": 3079, + "time_per_iteration": 2.7072205543518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069705, + "balance_loss_mlp": 1.04047441, + "epoch": 0.5925355906117737, + "flos": 495488406528.0, + "grad_norm": 0.0611483797885387, + "language_loss": 0.81661952, + "learning_rate": 0.00037576369121362566, + "loss": 0.82731652, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.29174805, + "step": 3080, + "time_per_iteration": 2.6135458946228027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073309, + "balance_loss_mlp": 1.0437448, + "epoch": 0.5927279722970373, + "flos": 565940072448.0, + "grad_norm": 0.05261928263256693, + "language_loss": 0.81494981, + "learning_rate": 0.0003754619433427516, + "loss": 0.82568288, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.29516602, + "step": 3081, + "time_per_iteration": 2.935394763946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074585, + "balance_loss_mlp": 1.04502153, + "epoch": 0.5929203539823009, + "flos": 666674413056.0, + "grad_norm": 0.07109600442573788, + "language_loss": 0.77291781, + "learning_rate": 0.0003751602438218392, + "loss": 0.78366369, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.29516602, + "step": 3082, + "time_per_iteration": 2.762129306793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107369, + "balance_loss_mlp": 1.04410219, + "epoch": 0.5931127356675644, + "flos": 555484635648.0, + "grad_norm": 0.07081310094320947, + "language_loss": 0.83719951, + "learning_rate": 0.0003748585927680186, + "loss": 0.84793639, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.29589844, + "step": 3083, + "time_per_iteration": 2.6607072353363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072302, + "balance_loss_mlp": 1.04126024, + "epoch": 0.593305117352828, + "flos": 534932992512.0, + "grad_norm": 0.09668658910416093, + "language_loss": 0.82859874, + "learning_rate": 0.00037455699029840086, + "loss": 0.83932179, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.31005859, + "step": 3084, + "time_per_iteration": 2.641989231109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069753, + "balance_loss_mlp": 1.04014122, + "epoch": 0.5934974990380916, + "flos": 593683748352.0, + "grad_norm": 0.04958887884439868, + "language_loss": 0.84485245, + "learning_rate": 0.0003742554365300787, + "loss": 0.85554999, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.2956543, + "step": 3085, + "time_per_iteration": 2.8070170879364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074514, + "balance_loss_mlp": 1.0440923, + "epoch": 0.5936898807233552, + "flos": 712339011072.0, + "grad_norm": 0.06324229056117828, + "language_loss": 0.78341657, + "learning_rate": 0.0003739539315801255, + "loss": 0.79416168, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.30371094, + "step": 3086, + "time_per_iteration": 2.937530755996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076236, + "balance_loss_mlp": 1.04571867, + "epoch": 0.5938822624086187, + "flos": 391684128768.0, + "grad_norm": 0.06251001537840323, + "language_loss": 0.91790974, + "learning_rate": 0.000373652475565596, + "loss": 0.92867219, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.3046875, + "step": 3087, + "time_per_iteration": 2.484830856323242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072731, + "balance_loss_mlp": 1.0422616, + "epoch": 0.5940746440938822, + "flos": 480023373312.0, + "grad_norm": 0.06825336960690286, + "language_loss": 0.81144977, + "learning_rate": 0.00037335106860352587, + "loss": 0.82217705, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.3046875, + "step": 3088, + "time_per_iteration": 2.705796003341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079924, + "balance_loss_mlp": 1.04938293, + "epoch": 0.5942670257791458, + "flos": 483094872576.0, + "grad_norm": 0.05943406802659928, + "language_loss": 0.83409536, + "learning_rate": 0.00037304971081093146, + "loss": 0.84489465, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.30517578, + "step": 3089, + "time_per_iteration": 2.5424582958221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080838, + "balance_loss_mlp": 1.05015349, + "epoch": 0.5944594074644094, + "flos": 547656795648.0, + "grad_norm": 0.06149863143832335, + "language_loss": 0.80616403, + "learning_rate": 0.00037274840230481024, + "loss": 0.81697237, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.30664062, + "step": 3090, + "time_per_iteration": 2.7081451416015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073853, + "balance_loss_mlp": 1.04407477, + "epoch": 0.594651789149673, + "flos": 448943510016.0, + "grad_norm": 0.06332669517454644, + "language_loss": 0.79229522, + "learning_rate": 0.00037244714320214077, + "loss": 0.80303377, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.29736328, + "step": 3091, + "time_per_iteration": 2.5389420986175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080775, + "balance_loss_mlp": 1.05082965, + "epoch": 0.5948441708349365, + "flos": 595969491456.0, + "grad_norm": 0.061471299239273844, + "language_loss": 0.83137572, + "learning_rate": 0.000372145933619882, + "loss": 0.84218347, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.29931641, + "step": 3092, + "time_per_iteration": 2.8748533725738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076811, + "balance_loss_mlp": 1.04657912, + "epoch": 0.5950365525202, + "flos": 548251905024.0, + "grad_norm": 0.05871713315937548, + "language_loss": 0.82114685, + "learning_rate": 0.000371844773674974, + "loss": 0.8319149, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.30224609, + "step": 3093, + "time_per_iteration": 2.6465840339660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082816, + "balance_loss_mlp": 1.05346692, + "epoch": 0.5952289342054636, + "flos": 654385595904.0, + "grad_norm": 0.0642067113719601, + "language_loss": 0.81621695, + "learning_rate": 0.0003715436634843375, + "loss": 0.82704508, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.29345703, + "step": 3094, + "time_per_iteration": 2.9084014892578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079615, + "balance_loss_mlp": 1.05007505, + "epoch": 0.5954213158907272, + "flos": 603055245312.0, + "grad_norm": 0.04814703484993394, + "language_loss": 0.80545932, + "learning_rate": 0.00037124260316487355, + "loss": 0.81625545, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.29516602, + "step": 3095, + "time_per_iteration": 2.8632538318634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075577, + "balance_loss_mlp": 1.04727709, + "epoch": 0.5956136975759908, + "flos": 486097970688.0, + "grad_norm": 0.060441576418101065, + "language_loss": 0.89618301, + "learning_rate": 0.0003709415928334643, + "loss": 0.90693879, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.28344727, + "step": 3096, + "time_per_iteration": 2.6276299953460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077506, + "balance_loss_mlp": 1.04813242, + "epoch": 0.5958060792612543, + "flos": 658462459392.0, + "grad_norm": 0.06311167084488892, + "language_loss": 0.80587751, + "learning_rate": 0.00037064063260697233, + "loss": 0.81665254, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.29345703, + "step": 3097, + "time_per_iteration": 2.893503427505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081151, + "balance_loss_mlp": 1.05151534, + "epoch": 0.5959984609465179, + "flos": 723201882624.0, + "grad_norm": 0.06048648768573219, + "language_loss": 0.78276408, + "learning_rate": 0.0003703397226022407, + "loss": 0.79357558, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.2956543, + "step": 3098, + "time_per_iteration": 3.0289156436920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034311, + "balance_loss_mlp": 1.02305758, + "epoch": 0.5961908426317815, + "flos": 1519010164224.0, + "grad_norm": 0.01734603550218104, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76534188, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.11230469, + "step": 3099, + "time_per_iteration": 4.946389436721802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078376, + "balance_loss_mlp": 1.04978967, + "epoch": 0.596383224317045, + "flos": 532357678080.0, + "grad_norm": 0.05865367248717621, + "language_loss": 0.83124352, + "learning_rate": 0.0003697380537253339, + "loss": 0.84202731, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.28564453, + "step": 3100, + "time_per_iteration": 2.674445152282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083272, + "balance_loss_mlp": 1.05492401, + "epoch": 0.5965756060023086, + "flos": 590922169344.0, + "grad_norm": 0.050984632699602635, + "language_loss": 0.81265384, + "learning_rate": 0.0003694372950867471, + "loss": 0.82348651, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.28369141, + "step": 3101, + "time_per_iteration": 2.787538766860962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075715, + "balance_loss_mlp": 1.04772449, + "epoch": 0.5967679876875721, + "flos": 861701760000.0, + "grad_norm": 0.05184746467501943, + "language_loss": 0.77182555, + "learning_rate": 0.0003691365871370976, + "loss": 0.78258264, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.2800293, + "step": 3102, + "time_per_iteration": 3.016934871673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080662, + "balance_loss_mlp": 1.05271935, + "epoch": 0.5969603693728357, + "flos": 553574241792.0, + "grad_norm": 0.06482068820490762, + "language_loss": 0.85340202, + "learning_rate": 0.00036883592999313093, + "loss": 0.8642087, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.27978516, + "step": 3103, + "time_per_iteration": 2.689819812774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079629, + "balance_loss_mlp": 1.05218673, + "epoch": 0.5971527510580993, + "flos": 718345207296.0, + "grad_norm": 0.06496745505902583, + "language_loss": 0.79311585, + "learning_rate": 0.0003685353237715722, + "loss": 0.8039121, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.27490234, + "step": 3104, + "time_per_iteration": 2.87333083152771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083254, + "balance_loss_mlp": 1.05504966, + "epoch": 0.5973451327433629, + "flos": 647324573184.0, + "grad_norm": 0.051730016495621756, + "language_loss": 0.8144263, + "learning_rate": 0.0003682347685891274, + "loss": 0.82525891, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.28222656, + "step": 3105, + "time_per_iteration": 2.888319730758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080866, + "balance_loss_mlp": 1.05228007, + "epoch": 0.5975375144286263, + "flos": 721374446592.0, + "grad_norm": 0.060164631065922125, + "language_loss": 0.80393469, + "learning_rate": 0.0003679342645624822, + "loss": 0.8147434, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.28564453, + "step": 3106, + "time_per_iteration": 3.0317325592041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079501, + "balance_loss_mlp": 1.0513438, + "epoch": 0.5977298961138899, + "flos": 750616699392.0, + "grad_norm": 0.057913897832382336, + "language_loss": 0.81649029, + "learning_rate": 0.0003676338118083025, + "loss": 0.82728529, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.28198242, + "step": 3107, + "time_per_iteration": 2.9762744903564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083565, + "balance_loss_mlp": 1.05519295, + "epoch": 0.5979222777991535, + "flos": 530703360000.0, + "grad_norm": 0.05706871104479872, + "language_loss": 0.79560876, + "learning_rate": 0.0003673334104432347, + "loss": 0.80644441, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.28393555, + "step": 3108, + "time_per_iteration": 2.5976645946502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081192, + "balance_loss_mlp": 1.0530827, + "epoch": 0.5981146594844171, + "flos": 621459357696.0, + "grad_norm": 0.06092677674045173, + "language_loss": 0.83641863, + "learning_rate": 0.0003670330605839048, + "loss": 0.84723055, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.28125, + "step": 3109, + "time_per_iteration": 2.819420337677002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082632, + "balance_loss_mlp": 1.05480886, + "epoch": 0.5983070411696807, + "flos": 603309911040.0, + "grad_norm": 0.0537112811211955, + "language_loss": 0.76695013, + "learning_rate": 0.0003667327623469191, + "loss": 0.77777648, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.27832031, + "step": 3110, + "time_per_iteration": 2.766671657562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085165, + "balance_loss_mlp": 1.05753255, + "epoch": 0.5984994228549442, + "flos": 633187971072.0, + "grad_norm": 0.058546063064310164, + "language_loss": 0.77618361, + "learning_rate": 0.00036643251584886333, + "loss": 0.78703523, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.27661133, + "step": 3111, + "time_per_iteration": 2.789184808731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077786, + "balance_loss_mlp": 1.05105901, + "epoch": 0.5986918045402078, + "flos": 525026022912.0, + "grad_norm": 0.054896589550954444, + "language_loss": 0.81872785, + "learning_rate": 0.00036613232120630393, + "loss": 0.82950568, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.26782227, + "step": 3112, + "time_per_iteration": 2.5881965160369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081611, + "balance_loss_mlp": 1.05362022, + "epoch": 0.5988841862254713, + "flos": 482942103552.0, + "grad_norm": 0.07437964171487202, + "language_loss": 0.80355418, + "learning_rate": 0.00036583217853578643, + "loss": 0.81437027, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.27978516, + "step": 3113, + "time_per_iteration": 2.5409529209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083019, + "balance_loss_mlp": 1.05457568, + "epoch": 0.5990765679107349, + "flos": 1139658674688.0, + "grad_norm": 0.06261379626444472, + "language_loss": 0.77366924, + "learning_rate": 0.000365532087953837, + "loss": 0.78449941, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.28442383, + "step": 3114, + "time_per_iteration": 3.6426267623901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076465, + "balance_loss_mlp": 1.04842734, + "epoch": 0.5992689495959984, + "flos": 516729701376.0, + "grad_norm": 0.08299057980597005, + "language_loss": 0.88937151, + "learning_rate": 0.00036523204957696065, + "loss": 0.90013611, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.28051758, + "step": 3115, + "time_per_iteration": 2.594581365585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083776, + "balance_loss_mlp": 1.05623841, + "epoch": 0.599461331281262, + "flos": 744288998400.0, + "grad_norm": 0.06140193987839019, + "language_loss": 0.80620509, + "learning_rate": 0.00036493206352164324, + "loss": 0.81704283, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.27612305, + "step": 3116, + "time_per_iteration": 2.922367572784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076912, + "balance_loss_mlp": 1.04942214, + "epoch": 0.5996537129665256, + "flos": 592078892544.0, + "grad_norm": 0.05345315057842072, + "language_loss": 0.85505688, + "learning_rate": 0.000364632129904349, + "loss": 0.86582601, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.27514648, + "step": 3117, + "time_per_iteration": 2.765380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077238, + "balance_loss_mlp": 1.04884195, + "epoch": 0.5998460946517892, + "flos": 558735045120.0, + "grad_norm": 0.05997451129778301, + "language_loss": 0.77705157, + "learning_rate": 0.00036433224884152283, + "loss": 0.78782398, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.28393555, + "step": 3118, + "time_per_iteration": 2.714597225189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078485, + "balance_loss_mlp": 1.05032814, + "epoch": 0.6000384763370528, + "flos": 484325789184.0, + "grad_norm": 0.06439508839737945, + "language_loss": 0.77913392, + "learning_rate": 0.00036403242044958875, + "loss": 0.78991878, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.28173828, + "step": 3119, + "time_per_iteration": 2.5515971183776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107346, + "balance_loss_mlp": 1.04563642, + "epoch": 0.6002308580223162, + "flos": 596490407424.0, + "grad_norm": 0.05980235429893482, + "language_loss": 0.91155994, + "learning_rate": 0.0003637326448449507, + "loss": 0.9222945, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.27832031, + "step": 3120, + "time_per_iteration": 2.7075581550598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075264, + "balance_loss_mlp": 1.04651034, + "epoch": 0.6004232397075798, + "flos": 544879249920.0, + "grad_norm": 0.046913105653204425, + "language_loss": 0.86206967, + "learning_rate": 0.00036343292214399177, + "loss": 0.87282228, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.28735352, + "step": 3121, + "time_per_iteration": 2.8623263835906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076118, + "balance_loss_mlp": 1.04786551, + "epoch": 0.6006156213928434, + "flos": 629647990272.0, + "grad_norm": 0.08364408748252802, + "language_loss": 0.77170986, + "learning_rate": 0.00036313325246307456, + "loss": 0.782471, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.28271484, + "step": 3122, + "time_per_iteration": 2.8064393997192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077233, + "balance_loss_mlp": 1.04845548, + "epoch": 0.600808003078107, + "flos": 582043885056.0, + "grad_norm": 0.05351137159491715, + "language_loss": 0.86973262, + "learning_rate": 0.0003628336359185411, + "loss": 0.88050497, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.28759766, + "step": 3123, + "time_per_iteration": 2.701089859008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074232, + "balance_loss_mlp": 1.04545498, + "epoch": 0.6010003847633705, + "flos": 634984883712.0, + "grad_norm": 0.061635029106804545, + "language_loss": 0.75553113, + "learning_rate": 0.000362534072626713, + "loss": 0.76627344, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.28759766, + "step": 3124, + "time_per_iteration": 2.7586216926574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076514, + "balance_loss_mlp": 1.04830909, + "epoch": 0.6011927664486341, + "flos": 718448514048.0, + "grad_norm": 0.05599212147105787, + "language_loss": 0.81046546, + "learning_rate": 0.00036223456270389093, + "loss": 0.82123059, + "num_input_tokens_seen": 260499616, + "router_z_loss_mlp": 0.2824707, + "step": 3125, + "time_per_iteration": 2.948882818222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074418, + "balance_loss_mlp": 1.04442525, + "epoch": 0.6013851481338977, + "flos": 498782486016.0, + "grad_norm": 0.05186484782469995, + "language_loss": 0.81019723, + "learning_rate": 0.00036193510626635517, + "loss": 0.82094145, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.29980469, + "step": 3126, + "time_per_iteration": 2.671576499938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073529, + "balance_loss_mlp": 1.04410863, + "epoch": 0.6015775298191612, + "flos": 749266509312.0, + "grad_norm": 0.05950376235873218, + "language_loss": 0.81565017, + "learning_rate": 0.0003616357034303649, + "loss": 0.82638544, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.29370117, + "step": 3127, + "time_per_iteration": 2.9371449947357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074144, + "balance_loss_mlp": 1.04541481, + "epoch": 0.6017699115044248, + "flos": 592764162048.0, + "grad_norm": 0.048316094410884414, + "language_loss": 0.78690076, + "learning_rate": 0.0003613363543121584, + "loss": 0.79764223, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.28735352, + "step": 3128, + "time_per_iteration": 2.873584508895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076962, + "balance_loss_mlp": 1.04766035, + "epoch": 0.6019622931896883, + "flos": 514839656448.0, + "grad_norm": 0.05627549899999149, + "language_loss": 0.8521632, + "learning_rate": 0.00036103705902795357, + "loss": 0.8629328, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.29248047, + "step": 3129, + "time_per_iteration": 2.721329689025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074169, + "balance_loss_mlp": 1.04434288, + "epoch": 0.6021546748749519, + "flos": 490219914240.0, + "grad_norm": 0.06933558951012796, + "language_loss": 0.7955035, + "learning_rate": 0.0003607378176939471, + "loss": 0.80624521, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.29785156, + "step": 3130, + "time_per_iteration": 2.672825574874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070174, + "balance_loss_mlp": 1.04118252, + "epoch": 0.6023470565602155, + "flos": 540763098624.0, + "grad_norm": 0.07276264365929157, + "language_loss": 0.82265472, + "learning_rate": 0.00036043863042631465, + "loss": 0.8333565, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.29003906, + "step": 3131, + "time_per_iteration": 2.724228858947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068676, + "balance_loss_mlp": 1.03918386, + "epoch": 0.6025394382454791, + "flos": 844660984320.0, + "grad_norm": 0.06054022798216566, + "language_loss": 0.76351178, + "learning_rate": 0.00036013949734121133, + "loss": 0.77419853, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.29467773, + "step": 3132, + "time_per_iteration": 3.1145389080047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068619, + "balance_loss_mlp": 1.03831553, + "epoch": 0.6027318199307425, + "flos": 576903430656.0, + "grad_norm": 0.061447218218141524, + "language_loss": 0.82303023, + "learning_rate": 0.00035984041855477043, + "loss": 0.83371639, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.30249023, + "step": 3133, + "time_per_iteration": 2.779906749725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024187, + "balance_loss_mlp": 1.01274288, + "epoch": 0.6029242016160061, + "flos": 1470160585728.0, + "grad_norm": 0.015590695702157922, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79734081, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.11425781, + "step": 3134, + "time_per_iteration": 4.933319091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064388, + "balance_loss_mlp": 1.03503895, + "epoch": 0.6031165833012697, + "flos": 480486062592.0, + "grad_norm": 0.05335614021413427, + "language_loss": 0.79509521, + "learning_rate": 0.00035924242434230637, + "loss": 0.80573905, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.29321289, + "step": 3135, + "time_per_iteration": 2.6558902263641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065788, + "balance_loss_mlp": 1.03691578, + "epoch": 0.6033089649865333, + "flos": 499220444160.0, + "grad_norm": 0.07899589356076418, + "language_loss": 0.78020877, + "learning_rate": 0.00035894350914844516, + "loss": 0.79086667, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.28881836, + "step": 3136, + "time_per_iteration": 2.631028175354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068791, + "balance_loss_mlp": 1.03927457, + "epoch": 0.6035013466717969, + "flos": 556337230848.0, + "grad_norm": 0.06724246097152477, + "language_loss": 0.8242653, + "learning_rate": 0.0003586446487175703, + "loss": 0.83495319, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.29516602, + "step": 3137, + "time_per_iteration": 2.6988327503204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068468, + "balance_loss_mlp": 1.03866601, + "epoch": 0.6036937283570604, + "flos": 594536343552.0, + "grad_norm": 0.053597642089091506, + "language_loss": 0.85091925, + "learning_rate": 0.0003583458431657099, + "loss": 0.86160386, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.29760742, + "step": 3138, + "time_per_iteration": 2.761122941970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067135, + "balance_loss_mlp": 1.03735673, + "epoch": 0.603886110042324, + "flos": 540684523008.0, + "grad_norm": 0.06925518043051447, + "language_loss": 0.83323741, + "learning_rate": 0.00035804709260887056, + "loss": 0.84390879, + "num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.29711914, + "step": 3139, + "time_per_iteration": 2.664776563644409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069913, + "balance_loss_mlp": 1.04013443, + "epoch": 0.6040784917275875, + "flos": 518315618304.0, + "grad_norm": 0.05868516129691736, + "language_loss": 0.894665, + "learning_rate": 0.0003577483971630373, + "loss": 0.90536416, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.29760742, + "step": 3140, + "time_per_iteration": 2.659006118774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069941, + "balance_loss_mlp": 1.03982854, + "epoch": 0.6042708734128511, + "flos": 660436872192.0, + "grad_norm": 0.0462994946970423, + "language_loss": 0.85074717, + "learning_rate": 0.00035744975694417414, + "loss": 0.86144656, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.30078125, + "step": 3141, + "time_per_iteration": 2.9323952198028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073401, + "balance_loss_mlp": 1.04438555, + "epoch": 0.6044632550981146, + "flos": 572035018752.0, + "grad_norm": 0.06410322202016926, + "language_loss": 0.82079303, + "learning_rate": 0.00035715117206822344, + "loss": 0.83152711, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.28979492, + "step": 3142, + "time_per_iteration": 2.8329904079437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070447, + "balance_loss_mlp": 1.04145527, + "epoch": 0.6046556367833782, + "flos": 546420086784.0, + "grad_norm": 0.060439068049678774, + "language_loss": 0.80993617, + "learning_rate": 0.0003568526426511065, + "loss": 0.82064068, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.28979492, + "step": 3143, + "time_per_iteration": 2.695185899734497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072248, + "balance_loss_mlp": 1.0432328, + "epoch": 0.6048480184686418, + "flos": 776505235968.0, + "grad_norm": 0.06755719072358204, + "language_loss": 0.82702982, + "learning_rate": 0.000356554168808722, + "loss": 0.83775228, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.29003906, + "step": 3144, + "time_per_iteration": 2.9742469787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073343, + "balance_loss_mlp": 1.04537654, + "epoch": 0.6050404001539054, + "flos": 656837254656.0, + "grad_norm": 0.05422673748867178, + "language_loss": 0.84676063, + "learning_rate": 0.00035625575065694837, + "loss": 0.85749412, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.2800293, + "step": 3145, + "time_per_iteration": 2.8367791175842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077792, + "balance_loss_mlp": 1.04934883, + "epoch": 0.605232781839169, + "flos": 548710212096.0, + "grad_norm": 0.05280732268922785, + "language_loss": 0.77452278, + "learning_rate": 0.0003559573883116415, + "loss": 0.78530073, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.28466797, + "step": 3146, + "time_per_iteration": 2.701388120651245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075301, + "balance_loss_mlp": 1.04702449, + "epoch": 0.6054251635244324, + "flos": 605093677056.0, + "grad_norm": 0.04869973207051341, + "language_loss": 0.85634321, + "learning_rate": 0.00035565908188863604, + "loss": 0.86709619, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.28271484, + "step": 3147, + "time_per_iteration": 2.898590087890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076445, + "balance_loss_mlp": 1.04831183, + "epoch": 0.605617545209696, + "flos": 613398763008.0, + "grad_norm": 0.06327080100476104, + "language_loss": 0.79599166, + "learning_rate": 0.00035536083150374464, + "loss": 0.80675614, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.28149414, + "step": 3148, + "time_per_iteration": 2.771320343017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102059, + "balance_loss_mlp": 1.00905097, + "epoch": 0.6058099268949596, + "flos": 1497477888000.0, + "grad_norm": 0.011512942764516735, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75768542, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.11523438, + "step": 3149, + "time_per_iteration": 4.814287185668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077389, + "balance_loss_mlp": 1.04918396, + "epoch": 0.6060023085802232, + "flos": 670170723840.0, + "grad_norm": 0.05840631409964381, + "language_loss": 0.85528827, + "learning_rate": 0.0003547644993114475, + "loss": 0.86606216, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.28198242, + "step": 3150, + "time_per_iteration": 2.8378889560699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107623, + "balance_loss_mlp": 1.04795372, + "epoch": 0.6061946902654868, + "flos": 605885225472.0, + "grad_norm": 0.06870733473036895, + "language_loss": 0.7981267, + "learning_rate": 0.00035446641773555806, + "loss": 0.80888903, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.28295898, + "step": 3151, + "time_per_iteration": 2.7372798919677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077461, + "balance_loss_mlp": 1.04916036, + "epoch": 0.6063870719507503, + "flos": 557568147456.0, + "grad_norm": 0.05718786699526154, + "language_loss": 0.86853182, + "learning_rate": 0.000354168392660816, + "loss": 0.87930644, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.28344727, + "step": 3152, + "time_per_iteration": 2.7871758937835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073815, + "balance_loss_mlp": 1.04558635, + "epoch": 0.6065794536360138, + "flos": 556874113536.0, + "grad_norm": 0.05898712641381182, + "language_loss": 0.82702786, + "learning_rate": 0.0003538704242029252, + "loss": 0.83776605, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.28222656, + "step": 3153, + "time_per_iteration": 2.700695753097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075106, + "balance_loss_mlp": 1.0467577, + "epoch": 0.6067718353212774, + "flos": 689836276224.0, + "grad_norm": 0.06128602508798912, + "language_loss": 0.7773366, + "learning_rate": 0.0003535725124775672, + "loss": 0.78808761, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.28344727, + "step": 3154, + "time_per_iteration": 2.8570618629455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076573, + "balance_loss_mlp": 1.0478195, + "epoch": 0.606964217006541, + "flos": 521531122176.0, + "grad_norm": 0.055885875690184536, + "language_loss": 0.86403567, + "learning_rate": 0.00035327465760040126, + "loss": 0.8748014, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.28710938, + "step": 3155, + "time_per_iteration": 2.6846063137054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072066, + "balance_loss_mlp": 1.04281223, + "epoch": 0.6071565986918045, + "flos": 641267504640.0, + "grad_norm": 0.06048889768089712, + "language_loss": 0.84499794, + "learning_rate": 0.00035297685968706526, + "loss": 0.85571855, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.29223633, + "step": 3156, + "time_per_iteration": 2.7771387100219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072214, + "balance_loss_mlp": 1.04453337, + "epoch": 0.6073489803770681, + "flos": 560315169792.0, + "grad_norm": 0.06250295268242392, + "language_loss": 0.83014715, + "learning_rate": 0.00035267911885317454, + "loss": 0.84086931, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.27709961, + "step": 3157, + "time_per_iteration": 2.656357765197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074037, + "balance_loss_mlp": 1.0442822, + "epoch": 0.6075413620623317, + "flos": 585810828288.0, + "grad_norm": 0.057378940891661595, + "language_loss": 0.81611866, + "learning_rate": 0.0003523814352143222, + "loss": 0.826859, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.29711914, + "step": 3158, + "time_per_iteration": 2.830617904663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077727, + "balance_loss_mlp": 1.04883063, + "epoch": 0.6077337437475953, + "flos": 630523906560.0, + "grad_norm": 0.0599841254590138, + "language_loss": 0.90816242, + "learning_rate": 0.00035208380888607937, + "loss": 0.91893965, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.28881836, + "step": 3159, + "time_per_iteration": 2.8117706775665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009022, + "balance_loss_mlp": 0.99786437, + "epoch": 0.6079261254328588, + "flos": 1467726455808.0, + "grad_norm": 0.007967889265398313, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80471009, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.11181641, + "step": 3160, + "time_per_iteration": 4.8633644580841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009246, + "balance_loss_mlp": 0.998088, + "epoch": 0.6081185071181223, + "flos": 1522233022464.0, + "grad_norm": 0.00797101191785885, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76701474, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.11181641, + "step": 3161, + "time_per_iteration": 5.046196460723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075203, + "balance_loss_mlp": 1.04611611, + "epoch": 0.6083108888033859, + "flos": 556041867264.0, + "grad_norm": 0.04533613724441275, + "language_loss": 0.81858671, + "learning_rate": 0.00035119127492038446, + "loss": 0.82933867, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.29077148, + "step": 3162, + "time_per_iteration": 2.815852403640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075143, + "balance_loss_mlp": 1.0469625, + "epoch": 0.6085032704886495, + "flos": 840819847680.0, + "grad_norm": 0.053216451363019494, + "language_loss": 0.82787645, + "learning_rate": 0.00035089387898984436, + "loss": 0.83862782, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.28198242, + "step": 3163, + "time_per_iteration": 3.059666156768799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075751, + "balance_loss_mlp": 1.04683065, + "epoch": 0.6086956521739131, + "flos": 684493590528.0, + "grad_norm": 0.06412835192713194, + "language_loss": 0.81799018, + "learning_rate": 0.0003505965409474343, + "loss": 0.82874769, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.28881836, + "step": 3164, + "time_per_iteration": 2.8909780979156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072573, + "balance_loss_mlp": 1.04374802, + "epoch": 0.6088880338591766, + "flos": 535533894144.0, + "grad_norm": 0.050432732030132946, + "language_loss": 0.86329949, + "learning_rate": 0.0003502992609085913, + "loss": 0.87402523, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.28808594, + "step": 3165, + "time_per_iteration": 2.66687273979187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074674, + "balance_loss_mlp": 1.04513407, + "epoch": 0.6090804155444401, + "flos": 731197048320.0, + "grad_norm": 0.053888239650619583, + "language_loss": 0.82507217, + "learning_rate": 0.00035000203898872954, + "loss": 0.83581889, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.29516602, + "step": 3166, + "time_per_iteration": 3.05118989944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072174, + "balance_loss_mlp": 1.04303908, + "epoch": 0.6092727972297037, + "flos": 698708768256.0, + "grad_norm": 0.06623841355558525, + "language_loss": 0.84253997, + "learning_rate": 0.0003497048753032406, + "loss": 0.85326171, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.29125977, + "step": 3167, + "time_per_iteration": 2.87467885017395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074156, + "balance_loss_mlp": 1.04473543, + "epoch": 0.6094651789149673, + "flos": 1051515869184.0, + "grad_norm": 0.05347521996771115, + "language_loss": 0.80754191, + "learning_rate": 0.000349407769967494, + "loss": 0.81828344, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.29394531, + "step": 3168, + "time_per_iteration": 3.3934104442596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074195, + "balance_loss_mlp": 1.04546547, + "epoch": 0.6096575606002309, + "flos": 502834618368.0, + "grad_norm": 0.10902305889023324, + "language_loss": 0.84663367, + "learning_rate": 0.0003491107230968361, + "loss": 0.85737562, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.28710938, + "step": 3169, + "time_per_iteration": 2.6888718605041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072271, + "balance_loss_mlp": 1.04351735, + "epoch": 0.6098499422854944, + "flos": 585339374592.0, + "grad_norm": 0.05661622017927931, + "language_loss": 0.81418574, + "learning_rate": 0.00034881373480659085, + "loss": 0.82490849, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.28735352, + "step": 3170, + "time_per_iteration": 2.820013999938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073009, + "balance_loss_mlp": 1.043993, + "epoch": 0.610042323970758, + "flos": 468968444928.0, + "grad_norm": 0.0573564735722831, + "language_loss": 0.78202963, + "learning_rate": 0.0003485168052120594, + "loss": 0.79275972, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.2902832, + "step": 3171, + "time_per_iteration": 2.5298008918762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108136, + "balance_loss_mlp": 1.05255914, + "epoch": 0.6102347056560216, + "flos": 513923042304.0, + "grad_norm": 0.06128596263952344, + "language_loss": 0.79907572, + "learning_rate": 0.00034821993442851973, + "loss": 0.80988932, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.28808594, + "step": 3172, + "time_per_iteration": 2.5819344520568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075018, + "balance_loss_mlp": 1.0474807, + "epoch": 0.6104270873412851, + "flos": 468776388096.0, + "grad_norm": 0.06156265055034652, + "language_loss": 0.82331789, + "learning_rate": 0.00034792312257122735, + "loss": 0.83406806, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.27612305, + "step": 3173, + "time_per_iteration": 2.621645212173462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070505, + "balance_loss_mlp": 1.04187059, + "epoch": 0.6106194690265486, + "flos": 549610859520.0, + "grad_norm": 0.059872220515584544, + "language_loss": 0.80486125, + "learning_rate": 0.00034762636975541506, + "loss": 0.8155663, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.28613281, + "step": 3174, + "time_per_iteration": 2.6323647499084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074186, + "balance_loss_mlp": 1.0451467, + "epoch": 0.6108118507118122, + "flos": 472602968064.0, + "grad_norm": 0.05798479282712576, + "language_loss": 0.81059682, + "learning_rate": 0.0003473296760962923, + "loss": 0.82133865, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.2902832, + "step": 3175, + "time_per_iteration": 2.679593324661255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018524, + "balance_loss_mlp": 1.007128, + "epoch": 0.6110042323970758, + "flos": 1444416205824.0, + "grad_norm": 0.01318817873369303, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79552263, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.11376953, + "step": 3176, + "time_per_iteration": 4.708170652389526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075937, + "balance_loss_mlp": 1.04811323, + "epoch": 0.6111966140823394, + "flos": 793807879680.0, + "grad_norm": 0.06988374073618883, + "language_loss": 0.81172955, + "learning_rate": 0.00034673646670883976, + "loss": 0.82248896, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.27832031, + "step": 3177, + "time_per_iteration": 3.0760982036590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018443, + "balance_loss_mlp": 1.00714159, + "epoch": 0.611388995767603, + "flos": 1556800432128.0, + "grad_norm": 0.012123406085696703, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76733464, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.11279297, + "step": 3178, + "time_per_iteration": 5.047900199890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077367, + "balance_loss_mlp": 1.04909086, + "epoch": 0.6115813774528664, + "flos": 711841416192.0, + "grad_norm": 0.06496983177026339, + "language_loss": 0.81433582, + "learning_rate": 0.0003461434953300865, + "loss": 0.82510948, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.28271484, + "step": 3179, + "time_per_iteration": 2.934129476547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075057, + "balance_loss_mlp": 1.0462321, + "epoch": 0.61177375913813, + "flos": 683963910144.0, + "grad_norm": 0.054564857541299305, + "language_loss": 0.81309831, + "learning_rate": 0.0003458470991817515, + "loss": 0.82384884, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.28808594, + "step": 3180, + "time_per_iteration": 2.9692420959472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080989, + "balance_loss_mlp": 1.05249786, + "epoch": 0.6119661408233936, + "flos": 511411746816.0, + "grad_norm": 0.056066758208496104, + "language_loss": 0.84904051, + "learning_rate": 0.0003455507628808802, + "loss": 0.85985035, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.28491211, + "step": 3181, + "time_per_iteration": 2.613642692565918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107824, + "balance_loss_mlp": 1.04986787, + "epoch": 0.6121585225086572, + "flos": 556548226560.0, + "grad_norm": 0.07624020954576015, + "language_loss": 0.84440458, + "learning_rate": 0.00034525448654252076, + "loss": 0.855187, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.28369141, + "step": 3182, + "time_per_iteration": 2.6653053760528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074575, + "balance_loss_mlp": 1.04701424, + "epoch": 0.6123509041939207, + "flos": 561585374208.0, + "grad_norm": 0.06355946830094689, + "language_loss": 0.82891977, + "learning_rate": 0.0003449582702816976, + "loss": 0.83966547, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.27587891, + "step": 3183, + "time_per_iteration": 2.6951351165771484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010823, + "balance_loss_mlp": 1.05404711, + "epoch": 0.6125432858791843, + "flos": 557789317632.0, + "grad_norm": 0.056298205322627685, + "language_loss": 0.82360494, + "learning_rate": 0.0003446621142134122, + "loss": 0.83442801, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.28271484, + "step": 3184, + "time_per_iteration": 2.6690409183502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077624, + "balance_loss_mlp": 1.04958582, + "epoch": 0.6127356675644479, + "flos": 414796529664.0, + "grad_norm": 0.06604074574998081, + "language_loss": 0.84192419, + "learning_rate": 0.0003443660184526424, + "loss": 0.85270047, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.28051758, + "step": 3185, + "time_per_iteration": 2.4451961517333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078036, + "balance_loss_mlp": 1.04949737, + "epoch": 0.6129280492497114, + "flos": 603547047936.0, + "grad_norm": 0.0548279179658957, + "language_loss": 0.86286807, + "learning_rate": 0.0003440699831143429, + "loss": 0.87364841, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.28515625, + "step": 3186, + "time_per_iteration": 2.7583630084991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078194, + "balance_loss_mlp": 1.04989386, + "epoch": 0.613120430934975, + "flos": 519492690432.0, + "grad_norm": 0.05592702907616355, + "language_loss": 0.81846583, + "learning_rate": 0.0003437740083134449, + "loss": 0.82924777, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.28344727, + "step": 3187, + "time_per_iteration": 2.6769111156463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107819, + "balance_loss_mlp": 1.05053306, + "epoch": 0.6133128126202385, + "flos": 510835576320.0, + "grad_norm": 0.07534478934925966, + "language_loss": 0.82936466, + "learning_rate": 0.00034347809416485574, + "loss": 0.84014654, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.27709961, + "step": 3188, + "time_per_iteration": 2.579110622406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079083, + "balance_loss_mlp": 1.05052042, + "epoch": 0.6135051943055021, + "flos": 607264528896.0, + "grad_norm": 0.05208625136089098, + "language_loss": 0.8201586, + "learning_rate": 0.0003431822407834597, + "loss": 0.83094943, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.28588867, + "step": 3189, + "time_per_iteration": 2.800846815109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079618, + "balance_loss_mlp": 1.05084062, + "epoch": 0.6136975759907657, + "flos": 1159750600704.0, + "grad_norm": 0.06054576051189374, + "language_loss": 0.84436607, + "learning_rate": 0.00034288644828411706, + "loss": 0.85516232, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 0.28735352, + "step": 3190, + "time_per_iteration": 3.459338426589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084345, + "balance_loss_mlp": 1.05513883, + "epoch": 0.6138899576760293, + "flos": 706631150592.0, + "grad_norm": 0.0818478077901872, + "language_loss": 0.75477004, + "learning_rate": 0.0003425907167816649, + "loss": 0.7656135, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.29150391, + "step": 3191, + "time_per_iteration": 2.874662399291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079884, + "balance_loss_mlp": 1.05148816, + "epoch": 0.6140823393612928, + "flos": 586151271936.0, + "grad_norm": 0.06137447834473829, + "language_loss": 0.84648186, + "learning_rate": 0.00034229504639091623, + "loss": 0.85728073, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.28393555, + "step": 3192, + "time_per_iteration": 2.768174171447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078463, + "balance_loss_mlp": 1.04906654, + "epoch": 0.6142747210465563, + "flos": 803759929344.0, + "grad_norm": 0.05748161960079173, + "language_loss": 0.80287862, + "learning_rate": 0.0003419994372266606, + "loss": 0.81366324, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.29345703, + "step": 3193, + "time_per_iteration": 3.1592228412628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079327, + "balance_loss_mlp": 1.05054975, + "epoch": 0.6144671027318199, + "flos": 529158140928.0, + "grad_norm": 0.04575030988697244, + "language_loss": 0.81596744, + "learning_rate": 0.00034170388940366335, + "loss": 0.82676071, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.2878418, + "step": 3194, + "time_per_iteration": 2.707101345062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078794, + "balance_loss_mlp": 1.05011201, + "epoch": 0.6146594844170835, + "flos": 805054864896.0, + "grad_norm": 0.05557650302359453, + "language_loss": 0.79986775, + "learning_rate": 0.0003414084030366667, + "loss": 0.81065571, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.28686523, + "step": 3195, + "time_per_iteration": 3.086768388748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070986, + "balance_loss_mlp": 1.04118395, + "epoch": 0.6148518661023471, + "flos": 501431993856.0, + "grad_norm": 0.05715110105949097, + "language_loss": 0.82949638, + "learning_rate": 0.0003411129782403883, + "loss": 0.84020627, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.29760742, + "step": 3196, + "time_per_iteration": 2.65775203704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078782, + "balance_loss_mlp": 1.04926562, + "epoch": 0.6150442477876106, + "flos": 510436905984.0, + "grad_norm": 0.06094401033818373, + "language_loss": 0.8473599, + "learning_rate": 0.0003408176151295225, + "loss": 0.8581478, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.29516602, + "step": 3197, + "time_per_iteration": 2.6118876934051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076412, + "balance_loss_mlp": 1.04806376, + "epoch": 0.6152366294728742, + "flos": 526758916608.0, + "grad_norm": 0.056153389528983695, + "language_loss": 0.7719816, + "learning_rate": 0.00034052231381873944, + "loss": 0.78274572, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.28320312, + "step": 3198, + "time_per_iteration": 2.6228411197662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079605, + "balance_loss_mlp": 1.05066109, + "epoch": 0.6154290111581378, + "flos": 473055482880.0, + "grad_norm": 0.07032084774443613, + "language_loss": 0.84981108, + "learning_rate": 0.00034022707442268494, + "loss": 0.86060715, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.28955078, + "step": 3199, + "time_per_iteration": 2.6281561851501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081064, + "balance_loss_mlp": 1.05204892, + "epoch": 0.6156213928434013, + "flos": 550542030336.0, + "grad_norm": 0.04792292414356855, + "language_loss": 0.81849301, + "learning_rate": 0.0003399318970559813, + "loss": 0.82930362, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.28979492, + "step": 3200, + "time_per_iteration": 2.848755121231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083137, + "balance_loss_mlp": 1.05426502, + "epoch": 0.6158137745286649, + "flos": 750587586048.0, + "grad_norm": 0.06290240151644533, + "language_loss": 0.8428275, + "learning_rate": 0.00033963678183322656, + "loss": 0.85365885, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.28833008, + "step": 3201, + "time_per_iteration": 3.027029275894165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083363, + "balance_loss_mlp": 1.05396593, + "epoch": 0.6160061562139284, + "flos": 555544272384.0, + "grad_norm": 0.050860435501305326, + "language_loss": 0.8262167, + "learning_rate": 0.0003393417288689945, + "loss": 0.83705032, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.29370117, + "step": 3202, + "time_per_iteration": 2.6697185039520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108474, + "balance_loss_mlp": 1.05422282, + "epoch": 0.616198537899192, + "flos": 741856278528.0, + "grad_norm": 0.07354923140459588, + "language_loss": 0.75762349, + "learning_rate": 0.00033904673827783504, + "loss": 0.76847088, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.3046875, + "step": 3203, + "time_per_iteration": 2.9294135570526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083321, + "balance_loss_mlp": 1.05423403, + "epoch": 0.6163909195844556, + "flos": 478569876480.0, + "grad_norm": 0.060707114262551334, + "language_loss": 0.8162061, + "learning_rate": 0.00033875181017427357, + "loss": 0.82703936, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 0.2902832, + "step": 3204, + "time_per_iteration": 2.595367193222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078594, + "balance_loss_mlp": 1.04924512, + "epoch": 0.6165833012697192, + "flos": 531231478272.0, + "grad_norm": 0.054344968838841615, + "language_loss": 0.80957687, + "learning_rate": 0.00033845694467281133, + "loss": 0.82036287, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.29321289, + "step": 3205, + "time_per_iteration": 2.846841812133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081783, + "balance_loss_mlp": 1.0531013, + "epoch": 0.6167756829549826, + "flos": 807384278016.0, + "grad_norm": 0.06726799818780427, + "language_loss": 0.83033085, + "learning_rate": 0.00033816214188792516, + "loss": 0.84114861, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.28686523, + "step": 3206, + "time_per_iteration": 3.1646995544433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078791, + "balance_loss_mlp": 1.05008507, + "epoch": 0.6169680646402462, + "flos": 488683459584.0, + "grad_norm": 0.05376278097292006, + "language_loss": 0.8520205, + "learning_rate": 0.00033786740193406784, + "loss": 0.86280841, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.28686523, + "step": 3207, + "time_per_iteration": 2.577228307723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075976, + "balance_loss_mlp": 1.04767549, + "epoch": 0.6171604463255098, + "flos": 618643934208.0, + "grad_norm": 0.056191099229546404, + "language_loss": 0.81319952, + "learning_rate": 0.00033757272492566736, + "loss": 0.82395929, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.28320312, + "step": 3208, + "time_per_iteration": 2.8721108436584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078583, + "balance_loss_mlp": 1.05013978, + "epoch": 0.6173528280107734, + "flos": 528600909312.0, + "grad_norm": 0.04893199519437597, + "language_loss": 0.87034678, + "learning_rate": 0.0003372781109771278, + "loss": 0.8811326, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.28442383, + "step": 3209, + "time_per_iteration": 2.7287070751190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077966, + "balance_loss_mlp": 1.04907, + "epoch": 0.617545209696037, + "flos": 596293968384.0, + "grad_norm": 0.04879640412841063, + "language_loss": 0.76108795, + "learning_rate": 0.0003369835602028281, + "loss": 0.77186757, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.28881836, + "step": 3210, + "time_per_iteration": 2.8439886569976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078621, + "balance_loss_mlp": 1.04924726, + "epoch": 0.6177375913813005, + "flos": 474848013312.0, + "grad_norm": 0.055192186653408186, + "language_loss": 0.79211128, + "learning_rate": 0.0003366890727171232, + "loss": 0.80289745, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.29345703, + "step": 3211, + "time_per_iteration": 2.6932919025421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082394, + "balance_loss_mlp": 1.0535692, + "epoch": 0.617929973066564, + "flos": 529546636800.0, + "grad_norm": 0.07153817197124837, + "language_loss": 0.78408551, + "learning_rate": 0.00033639464863434313, + "loss": 0.79490948, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.2878418, + "step": 3212, + "time_per_iteration": 2.6900713443756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040119, + "balance_loss_mlp": 1.02929533, + "epoch": 0.6181223547518276, + "flos": 1419361477632.0, + "grad_norm": 0.01617816391785494, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79482591, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.10839844, + "step": 3213, + "time_per_iteration": 4.7103211879730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077859, + "balance_loss_mlp": 1.04979765, + "epoch": 0.6183147364370912, + "flos": 739976408064.0, + "grad_norm": 0.0586976807946241, + "language_loss": 0.79730934, + "learning_rate": 0.00033580599113475543, + "loss": 0.80808794, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.28076172, + "step": 3214, + "time_per_iteration": 2.972890853881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076015, + "balance_loss_mlp": 1.04759574, + "epoch": 0.6185071181223547, + "flos": 381442507776.0, + "grad_norm": 0.06601952737269029, + "language_loss": 0.85816491, + "learning_rate": 0.00033551175794648507, + "loss": 0.86892509, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.28417969, + "step": 3215, + "time_per_iteration": 2.456907033920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072721, + "balance_loss_mlp": 1.04439735, + "epoch": 0.6186994998076183, + "flos": 463109225472.0, + "grad_norm": 0.062254504168561625, + "language_loss": 0.8188296, + "learning_rate": 0.00033521758861821365, + "loss": 0.82955682, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.28344727, + "step": 3216, + "time_per_iteration": 2.580777406692505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106911, + "balance_loss_mlp": 1.04071391, + "epoch": 0.6188918814928819, + "flos": 485029997568.0, + "grad_norm": 0.04883960048827372, + "language_loss": 0.88878882, + "learning_rate": 0.0003349234832641479, + "loss": 0.89947987, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.28417969, + "step": 3217, + "time_per_iteration": 2.5541629791259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074942, + "balance_loss_mlp": 1.04635608, + "epoch": 0.6190842631781455, + "flos": 656985641472.0, + "grad_norm": 0.06561076665766134, + "language_loss": 0.80879915, + "learning_rate": 0.00033462944199846975, + "loss": 0.81954861, + "num_input_tokens_seen": 268478512, + "router_z_loss_mlp": 0.28540039, + "step": 3218, + "time_per_iteration": 3.062703847885132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077215, + "balance_loss_mlp": 1.04848528, + "epoch": 0.619276644863409, + "flos": 403388011008.0, + "grad_norm": 0.06502548187197098, + "language_loss": 0.8618629, + "learning_rate": 0.00033433546493533606, + "loss": 0.87263501, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.28710938, + "step": 3219, + "time_per_iteration": 2.4797823429107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072308, + "balance_loss_mlp": 1.04443645, + "epoch": 0.6194690265486725, + "flos": 582807730176.0, + "grad_norm": 0.06173556799123847, + "language_loss": 0.840487, + "learning_rate": 0.00033404155218887897, + "loss": 0.85121012, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.27880859, + "step": 3220, + "time_per_iteration": 2.7182207107543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075139, + "balance_loss_mlp": 1.04733968, + "epoch": 0.6196614082339361, + "flos": 503963638272.0, + "grad_norm": 0.08803961295836986, + "language_loss": 0.87216806, + "learning_rate": 0.00033374770387320534, + "loss": 0.88291949, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.27856445, + "step": 3221, + "time_per_iteration": 2.7941346168518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078924, + "balance_loss_mlp": 1.05095768, + "epoch": 0.6198537899191997, + "flos": 575131249152.0, + "grad_norm": 0.055815039151530264, + "language_loss": 0.84867358, + "learning_rate": 0.00033345392010239737, + "loss": 0.8594628, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.27978516, + "step": 3222, + "time_per_iteration": 2.710803747177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082482, + "balance_loss_mlp": 1.05432487, + "epoch": 0.6200461716044633, + "flos": 592871851008.0, + "grad_norm": 0.05804972472550271, + "language_loss": 0.82259816, + "learning_rate": 0.0003331602009905118, + "loss": 0.83342302, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.28198242, + "step": 3223, + "time_per_iteration": 2.8335556983947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081003, + "balance_loss_mlp": 1.052917, + "epoch": 0.6202385532897268, + "flos": 665765001216.0, + "grad_norm": 0.05452675895151675, + "language_loss": 0.83620667, + "learning_rate": 0.00033286654665158085, + "loss": 0.84701669, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.28100586, + "step": 3224, + "time_per_iteration": 2.929290533065796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077713, + "balance_loss_mlp": 1.05038977, + "epoch": 0.6204309349749904, + "flos": 484709902848.0, + "grad_norm": 0.05879630449885449, + "language_loss": 0.87538344, + "learning_rate": 0.0003325729571996109, + "loss": 0.88616055, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.27368164, + "step": 3225, + "time_per_iteration": 2.6219499111175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078793, + "balance_loss_mlp": 1.04980159, + "epoch": 0.6206233166602539, + "flos": 583768014336.0, + "grad_norm": 0.06449737595715416, + "language_loss": 0.83818585, + "learning_rate": 0.000332279432748584, + "loss": 0.84897381, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.28955078, + "step": 3226, + "time_per_iteration": 2.7298083305358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082841, + "balance_loss_mlp": 1.054636, + "epoch": 0.6208156983455175, + "flos": 476669657088.0, + "grad_norm": 0.05904408165059124, + "language_loss": 0.87270737, + "learning_rate": 0.00033198597341245576, + "loss": 0.88353574, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.28222656, + "step": 3227, + "time_per_iteration": 2.5691256523132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108032, + "balance_loss_mlp": 1.05151939, + "epoch": 0.6210080800307811, + "flos": 788716887552.0, + "grad_norm": 0.053113519370634896, + "language_loss": 0.81682974, + "learning_rate": 0.00033169257930515763, + "loss": 0.82763296, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.2878418, + "step": 3228, + "time_per_iteration": 3.0353121757507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084583, + "balance_loss_mlp": 1.05587709, + "epoch": 0.6212004617160446, + "flos": 607514812416.0, + "grad_norm": 0.059839903219207714, + "language_loss": 0.82242584, + "learning_rate": 0.0003313992505405951, + "loss": 0.83327174, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.28686523, + "step": 3229, + "time_per_iteration": 2.720705270767212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075139, + "balance_loss_mlp": 1.04743469, + "epoch": 0.6213928434013082, + "flos": 586248786432.0, + "grad_norm": 0.0642388463301134, + "language_loss": 0.80858111, + "learning_rate": 0.0003311059872326487, + "loss": 0.81933248, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.27709961, + "step": 3230, + "time_per_iteration": 2.6720995903015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081662, + "balance_loss_mlp": 1.05352879, + "epoch": 0.6215852250865718, + "flos": 535819083264.0, + "grad_norm": 0.049445896607163295, + "language_loss": 0.78987181, + "learning_rate": 0.0003308127894951734, + "loss": 0.80068845, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.28149414, + "step": 3231, + "time_per_iteration": 2.63030743598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107994, + "balance_loss_mlp": 1.05214071, + "epoch": 0.6217776067718354, + "flos": 617884471296.0, + "grad_norm": 0.07248200651444572, + "language_loss": 0.86507577, + "learning_rate": 0.00033051965744199834, + "loss": 0.87587512, + "num_input_tokens_seen": 269498784, + "router_z_loss_mlp": 0.27832031, + "step": 3232, + "time_per_iteration": 2.7564406394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081036, + "balance_loss_mlp": 1.05302238, + "epoch": 0.6219699884570988, + "flos": 545570311680.0, + "grad_norm": 0.05351658478199456, + "language_loss": 0.90184295, + "learning_rate": 0.0003302265911869276, + "loss": 0.91265333, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.28051758, + "step": 3233, + "time_per_iteration": 2.9271633625030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081705, + "balance_loss_mlp": 1.05373812, + "epoch": 0.6221623701423624, + "flos": 480899289600.0, + "grad_norm": 0.056002159029406404, + "language_loss": 0.84084082, + "learning_rate": 0.0003299335908437397, + "loss": 0.85165787, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.2800293, + "step": 3234, + "time_per_iteration": 2.5909643173217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080844, + "balance_loss_mlp": 1.05228114, + "epoch": 0.622354751827626, + "flos": 379812920832.0, + "grad_norm": 0.06942928938800572, + "language_loss": 0.79645211, + "learning_rate": 0.0003296406565261873, + "loss": 0.80726051, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.28564453, + "step": 3235, + "time_per_iteration": 2.5319809913635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107822, + "balance_loss_mlp": 1.04927599, + "epoch": 0.6225471335128896, + "flos": 667570678272.0, + "grad_norm": 0.04882824212942084, + "language_loss": 0.8475616, + "learning_rate": 0.0003293477883479978, + "loss": 0.85834384, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.28955078, + "step": 3236, + "time_per_iteration": 2.8348751068115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079457, + "balance_loss_mlp": 1.05110943, + "epoch": 0.6227395151981532, + "flos": 770995224576.0, + "grad_norm": 0.06517457110491971, + "language_loss": 0.79784298, + "learning_rate": 0.0003290549864228727, + "loss": 0.80863756, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.28369141, + "step": 3237, + "time_per_iteration": 2.9205360412597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078288, + "balance_loss_mlp": 1.04934406, + "epoch": 0.6229318968834167, + "flos": 484104619008.0, + "grad_norm": 0.05190818630751583, + "language_loss": 0.86413801, + "learning_rate": 0.0003287622508644875, + "loss": 0.8749209, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.28930664, + "step": 3238, + "time_per_iteration": 2.7504210472106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107621, + "balance_loss_mlp": 1.04736114, + "epoch": 0.6231242785686802, + "flos": 462700380672.0, + "grad_norm": 0.06410601543922713, + "language_loss": 0.8596704, + "learning_rate": 0.0003284695817864923, + "loss": 0.8704325, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.28808594, + "step": 3239, + "time_per_iteration": 2.487185001373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082288, + "balance_loss_mlp": 1.0541544, + "epoch": 0.6233166602539438, + "flos": 608809747968.0, + "grad_norm": 0.07028564715864687, + "language_loss": 0.83921337, + "learning_rate": 0.0003281769793025116, + "loss": 0.85003626, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.28149414, + "step": 3240, + "time_per_iteration": 2.7399847507476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107903, + "balance_loss_mlp": 1.05106378, + "epoch": 0.6235090419392074, + "flos": 438972521472.0, + "grad_norm": 0.06749958965512537, + "language_loss": 0.89295518, + "learning_rate": 0.00032788444352614346, + "loss": 0.90374541, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.27978516, + "step": 3241, + "time_per_iteration": 2.550497531890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079451, + "balance_loss_mlp": 1.05055451, + "epoch": 0.6237014236244709, + "flos": 504656262144.0, + "grad_norm": 0.05896628136636162, + "language_loss": 0.80561244, + "learning_rate": 0.0003275919745709606, + "loss": 0.81640697, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.28881836, + "step": 3242, + "time_per_iteration": 2.5805697441101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107483, + "balance_loss_mlp": 1.0460763, + "epoch": 0.6238938053097345, + "flos": 512648455680.0, + "grad_norm": 0.058276556279693525, + "language_loss": 0.8216207, + "learning_rate": 0.00032729957255050936, + "loss": 0.83236909, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.28759766, + "step": 3243, + "time_per_iteration": 2.6520867347717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075457, + "balance_loss_mlp": 1.0462271, + "epoch": 0.6240861869949981, + "flos": 736435017216.0, + "grad_norm": 0.0677841364318074, + "language_loss": 0.81232285, + "learning_rate": 0.0003270072375783102, + "loss": 0.82307744, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.29174805, + "step": 3244, + "time_per_iteration": 2.8922722339630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079597, + "balance_loss_mlp": 1.05098701, + "epoch": 0.6242785686802617, + "flos": 494464103424.0, + "grad_norm": 0.055818323982708785, + "language_loss": 0.7931875, + "learning_rate": 0.00032671496976785774, + "loss": 0.80398345, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.28613281, + "step": 3245, + "time_per_iteration": 2.6470372676849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071869, + "balance_loss_mlp": 1.04359281, + "epoch": 0.6244709503655252, + "flos": 745500976128.0, + "grad_norm": 0.04960718098470409, + "language_loss": 0.75533414, + "learning_rate": 0.0003264227692326205, + "loss": 0.76605284, + "num_input_tokens_seen": 270501680, + "router_z_loss_mlp": 0.28295898, + "step": 3246, + "time_per_iteration": 3.0302975177764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079718, + "balance_loss_mlp": 1.05010653, + "epoch": 0.6246633320507887, + "flos": 492366034944.0, + "grad_norm": 0.054579168692914876, + "language_loss": 0.85738158, + "learning_rate": 0.00032613063608604055, + "loss": 0.86817873, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.29589844, + "step": 3247, + "time_per_iteration": 2.529571771621704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080009, + "balance_loss_mlp": 1.05147064, + "epoch": 0.6248557137360523, + "flos": 517142928384.0, + "grad_norm": 0.054889772992989326, + "language_loss": 0.8363654, + "learning_rate": 0.0003258385704415343, + "loss": 0.84716547, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.28540039, + "step": 3248, + "time_per_iteration": 2.590259313583374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076974, + "balance_loss_mlp": 1.04745758, + "epoch": 0.6250480954213159, + "flos": 519098402304.0, + "grad_norm": 0.0554200225727057, + "language_loss": 0.82566541, + "learning_rate": 0.0003255465724124915, + "loss": 0.8364352, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.29492188, + "step": 3249, + "time_per_iteration": 2.6928865909576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079488, + "balance_loss_mlp": 1.05044842, + "epoch": 0.6252404771065795, + "flos": 515808705024.0, + "grad_norm": 0.051820175568143126, + "language_loss": 0.82984078, + "learning_rate": 0.00032525464211227587, + "loss": 0.84063572, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.2902832, + "step": 3250, + "time_per_iteration": 2.5911831855773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080369, + "balance_loss_mlp": 1.0519259, + "epoch": 0.6254328587918431, + "flos": 576647354880.0, + "grad_norm": 0.05767056492483943, + "language_loss": 0.85669184, + "learning_rate": 0.0003249627796542249, + "loss": 0.86749554, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.28442383, + "step": 3251, + "time_per_iteration": 2.6558287143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073928, + "balance_loss_mlp": 1.04481697, + "epoch": 0.6256252404771065, + "flos": 597638366208.0, + "grad_norm": 0.0553994194583659, + "language_loss": 0.84238529, + "learning_rate": 0.00032467098515164943, + "loss": 0.85312456, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.29077148, + "step": 3252, + "time_per_iteration": 2.8710081577301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010798, + "balance_loss_mlp": 1.04992628, + "epoch": 0.6258176221623701, + "flos": 508034709504.0, + "grad_norm": 0.0724295756751151, + "language_loss": 0.83990276, + "learning_rate": 0.00032437925871783456, + "loss": 0.85070074, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.2980957, + "step": 3253, + "time_per_iteration": 2.680757761001587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077647, + "balance_loss_mlp": 1.04824996, + "epoch": 0.6260100038476337, + "flos": 639357110784.0, + "grad_norm": 0.06297548912406484, + "language_loss": 0.84215987, + "learning_rate": 0.00032408760046603803, + "loss": 0.85293639, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.29370117, + "step": 3254, + "time_per_iteration": 2.8605175018310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071718, + "balance_loss_mlp": 1.04308379, + "epoch": 0.6262023855328973, + "flos": 840648139776.0, + "grad_norm": 0.06707664571923276, + "language_loss": 0.77650177, + "learning_rate": 0.00032379601050949193, + "loss": 0.78721887, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.28613281, + "step": 3255, + "time_per_iteration": 3.0878231525421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107032, + "balance_loss_mlp": 1.04125643, + "epoch": 0.6263947672181608, + "flos": 521884712448.0, + "grad_norm": 0.055802614278498724, + "language_loss": 0.8790136, + "learning_rate": 0.0003235044889614013, + "loss": 0.8897168, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.29052734, + "step": 3256, + "time_per_iteration": 2.5939788818359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072592, + "balance_loss_mlp": 1.04302788, + "epoch": 0.6265871489034244, + "flos": 606747995136.0, + "grad_norm": 0.05515134857427489, + "language_loss": 0.83577603, + "learning_rate": 0.0003232130359349451, + "loss": 0.84650195, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.29541016, + "step": 3257, + "time_per_iteration": 2.8894662857055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068118, + "balance_loss_mlp": 1.03752887, + "epoch": 0.626779530588688, + "flos": 588208642560.0, + "grad_norm": 0.05130373708668117, + "language_loss": 0.81576669, + "learning_rate": 0.0003229216515432751, + "loss": 0.82644784, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.30566406, + "step": 3258, + "time_per_iteration": 2.756706476211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076382, + "balance_loss_mlp": 1.04586434, + "epoch": 0.6269719122739515, + "flos": 438381794304.0, + "grad_norm": 0.06660247735864482, + "language_loss": 0.79725903, + "learning_rate": 0.0003226303358995174, + "loss": 0.80802286, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.3046875, + "step": 3259, + "time_per_iteration": 2.67144775390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077975, + "balance_loss_mlp": 1.04760051, + "epoch": 0.6271642939592151, + "flos": 562590738432.0, + "grad_norm": 0.05404958184745656, + "language_loss": 0.88993442, + "learning_rate": 0.00032233908911677, + "loss": 0.90071416, + "num_input_tokens_seen": 271526768, + "router_z_loss_mlp": 0.30322266, + "step": 3260, + "time_per_iteration": 2.863938808441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073635, + "balance_loss_mlp": 1.0435462, + "epoch": 0.6273566756444786, + "flos": 514288217088.0, + "grad_norm": 0.053449532753106085, + "language_loss": 0.80614489, + "learning_rate": 0.0003220479113081053, + "loss": 0.81688124, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.30053711, + "step": 3261, + "time_per_iteration": 2.7604382038116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106913, + "balance_loss_mlp": 1.03846908, + "epoch": 0.6275490573297422, + "flos": 585195369984.0, + "grad_norm": 0.08212493062436176, + "language_loss": 0.78586102, + "learning_rate": 0.00032175680258656836, + "loss": 0.7965523, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.30615234, + "step": 3262, + "time_per_iteration": 2.6967196464538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071974, + "balance_loss_mlp": 1.04190898, + "epoch": 0.6277414390150058, + "flos": 559143889920.0, + "grad_norm": 0.05356215085141381, + "language_loss": 0.79812634, + "learning_rate": 0.00032146576306517794, + "loss": 0.80884606, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.30029297, + "step": 3263, + "time_per_iteration": 2.8093175888061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078395, + "balance_loss_mlp": 1.04866421, + "epoch": 0.6279338207002694, + "flos": 612423922176.0, + "grad_norm": 0.0554541143403023, + "language_loss": 0.80460787, + "learning_rate": 0.0003211747928569255, + "loss": 0.81539178, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.296875, + "step": 3264, + "time_per_iteration": 2.760589122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076242, + "balance_loss_mlp": 1.04741764, + "epoch": 0.6281262023855329, + "flos": 625374687744.0, + "grad_norm": 0.05014640017162604, + "language_loss": 0.81306803, + "learning_rate": 0.0003208838920747754, + "loss": 0.82383049, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.28833008, + "step": 3265, + "time_per_iteration": 2.8798112869262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072039, + "balance_loss_mlp": 1.04342878, + "epoch": 0.6283185840707964, + "flos": 1123147579392.0, + "grad_norm": 0.0653184175681376, + "language_loss": 0.7620573, + "learning_rate": 0.0003205930608316656, + "loss": 0.77277768, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.28588867, + "step": 3266, + "time_per_iteration": 3.571838140487671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106721, + "balance_loss_mlp": 1.03900564, + "epoch": 0.62851096575606, + "flos": 514967694336.0, + "grad_norm": 0.0645756575705021, + "language_loss": 0.84763867, + "learning_rate": 0.00032030229924050673, + "loss": 0.85831082, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.2824707, + "step": 3267, + "time_per_iteration": 2.6483044624328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076916, + "balance_loss_mlp": 1.04732847, + "epoch": 0.6287033474413236, + "flos": 403949624832.0, + "grad_norm": 0.056929311189361634, + "language_loss": 0.79781055, + "learning_rate": 0.00032001160741418247, + "loss": 0.8085798, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.2956543, + "step": 3268, + "time_per_iteration": 2.6264944076538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_mlp": 1.04559875, + "epoch": 0.6288957291265872, + "flos": 525459598848.0, + "grad_norm": 0.06099991776651708, + "language_loss": 0.82100242, + "learning_rate": 0.0003197209854655494, + "loss": 0.83175737, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.29833984, + "step": 3269, + "time_per_iteration": 2.704279661178589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073063, + "balance_loss_mlp": 1.04454803, + "epoch": 0.6290881108118507, + "flos": 603414627840.0, + "grad_norm": 0.06377784920568129, + "language_loss": 0.74516416, + "learning_rate": 0.0003194304335074371, + "loss": 0.75589478, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.28515625, + "step": 3270, + "time_per_iteration": 2.82635235786438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072113, + "balance_loss_mlp": 1.04281116, + "epoch": 0.6292804924971143, + "flos": 437446241280.0, + "grad_norm": 0.054968431789037576, + "language_loss": 0.88535178, + "learning_rate": 0.0003191399516526475, + "loss": 0.89607286, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.29272461, + "step": 3271, + "time_per_iteration": 2.4927825927734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074321, + "balance_loss_mlp": 1.04575849, + "epoch": 0.6294728741823779, + "flos": 606368263680.0, + "grad_norm": 0.05221826851343204, + "language_loss": 0.79470003, + "learning_rate": 0.0003188495400139559, + "loss": 0.80544329, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.28540039, + "step": 3272, + "time_per_iteration": 2.764953851699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071949, + "balance_loss_mlp": 1.04312468, + "epoch": 0.6296652558676414, + "flos": 701220063744.0, + "grad_norm": 0.060799032420417454, + "language_loss": 0.84558678, + "learning_rate": 0.00031855919870411013, + "loss": 0.85630625, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.28808594, + "step": 3273, + "time_per_iteration": 2.823537588119507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071632, + "balance_loss_mlp": 1.04213953, + "epoch": 0.6298576375529049, + "flos": 523652511744.0, + "grad_norm": 0.05430009118151755, + "language_loss": 0.84791374, + "learning_rate": 0.0003182689278358305, + "loss": 0.85863006, + "num_input_tokens_seen": 272562992, + "router_z_loss_mlp": 0.29443359, + "step": 3274, + "time_per_iteration": 2.6649551391601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073347, + "balance_loss_mlp": 1.04416466, + "epoch": 0.6300500192381685, + "flos": 475723929600.0, + "grad_norm": 0.085227141064307, + "language_loss": 0.79910004, + "learning_rate": 0.0003179787275218105, + "loss": 0.80983347, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.29174805, + "step": 3275, + "time_per_iteration": 2.563103437423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074447, + "balance_loss_mlp": 1.0460037, + "epoch": 0.6302424009234321, + "flos": 520629064704.0, + "grad_norm": 0.07197275527111574, + "language_loss": 0.84121722, + "learning_rate": 0.0003176885978747155, + "loss": 0.85196167, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.28466797, + "step": 3276, + "time_per_iteration": 2.634556293487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076833, + "balance_loss_mlp": 1.04807937, + "epoch": 0.6304347826086957, + "flos": 694282696704.0, + "grad_norm": 0.05534578709936448, + "language_loss": 0.82750475, + "learning_rate": 0.0003173985390071839, + "loss": 0.83827305, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.28735352, + "step": 3277, + "time_per_iteration": 2.8998594284057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018796, + "balance_loss_mlp": 1.0069232, + "epoch": 0.6306271642939593, + "flos": 1466067755520.0, + "grad_norm": 0.01138839518784329, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.78919256, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.11865234, + "step": 3278, + "time_per_iteration": 4.791780233383179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076998, + "balance_loss_mlp": 1.04678988, + "epoch": 0.6308195459792227, + "flos": 601444597248.0, + "grad_norm": 0.07347882473000023, + "language_loss": 0.81146979, + "learning_rate": 0.00031681863406122704, + "loss": 0.82223976, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.30151367, + "step": 3279, + "time_per_iteration": 2.7681593894958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077607, + "balance_loss_mlp": 1.0484724, + "epoch": 0.6310119276644863, + "flos": 726514900992.0, + "grad_norm": 0.0604928742924753, + "language_loss": 0.85127562, + "learning_rate": 0.00031652878820794087, + "loss": 0.86205173, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.29101562, + "step": 3280, + "time_per_iteration": 2.9940550327301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078985, + "balance_loss_mlp": 1.04970694, + "epoch": 0.6312043093497499, + "flos": 519482515968.0, + "grad_norm": 0.06373938844251871, + "language_loss": 0.85768282, + "learning_rate": 0.00031623901358449627, + "loss": 0.86847264, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.29223633, + "step": 3281, + "time_per_iteration": 2.637016773223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080918, + "balance_loss_mlp": 1.05206895, + "epoch": 0.6313966910350135, + "flos": 530934704640.0, + "grad_norm": 0.0651224667912018, + "language_loss": 0.88407606, + "learning_rate": 0.0003159493103033936, + "loss": 0.89488524, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.28857422, + "step": 3282, + "time_per_iteration": 2.6074159145355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025661, + "balance_loss_mlp": 1.0136919, + "epoch": 0.631589072720277, + "flos": 1379113606656.0, + "grad_norm": 0.014583316572648261, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.80944717, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.11962891, + "step": 3283, + "time_per_iteration": 4.897862195968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081353, + "balance_loss_mlp": 1.05183721, + "epoch": 0.6317814544055406, + "flos": 624379497984.0, + "grad_norm": 0.07926250214207341, + "language_loss": 0.82117367, + "learning_rate": 0.0003153701182180776, + "loss": 0.83198726, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.29443359, + "step": 3284, + "time_per_iteration": 2.773768186569214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085168, + "balance_loss_mlp": 1.05653346, + "epoch": 0.6319738360908042, + "flos": 497876046336.0, + "grad_norm": 0.06299610541065176, + "language_loss": 0.81832671, + "learning_rate": 0.00031508062963872655, + "loss": 0.82917833, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.28613281, + "step": 3285, + "time_per_iteration": 2.5745344161987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083768, + "balance_loss_mlp": 1.05484831, + "epoch": 0.6321662177760677, + "flos": 579474362880.0, + "grad_norm": 0.0675003916655452, + "language_loss": 0.7940349, + "learning_rate": 0.0003147912128514423, + "loss": 0.80487257, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.2890625, + "step": 3286, + "time_per_iteration": 2.736119508743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088711, + "balance_loss_mlp": 1.05976713, + "epoch": 0.6323585994613313, + "flos": 601207460352.0, + "grad_norm": 0.055334521213686955, + "language_loss": 0.87346876, + "learning_rate": 0.0003145018679685859, + "loss": 0.88435584, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.28881836, + "step": 3287, + "time_per_iteration": 2.747880697250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083794, + "balance_loss_mlp": 1.05515993, + "epoch": 0.6325509811465948, + "flos": 528261875712.0, + "grad_norm": 0.049981399044418943, + "language_loss": 0.8773675, + "learning_rate": 0.00031421259510249134, + "loss": 0.88820541, + "num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.28637695, + "step": 3288, + "time_per_iteration": 2.828601121902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087286, + "balance_loss_mlp": 1.05898595, + "epoch": 0.6327433628318584, + "flos": 573993464832.0, + "grad_norm": 0.05983667283250032, + "language_loss": 0.81054246, + "learning_rate": 0.00031392339436546414, + "loss": 0.82141531, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.28295898, + "step": 3289, + "time_per_iteration": 2.8950355052948 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083488, + "balance_loss_mlp": 1.05599856, + "epoch": 0.632935744517122, + "flos": 516833008128.0, + "grad_norm": 0.08046321176630551, + "language_loss": 0.83522916, + "learning_rate": 0.00031363426586978205, + "loss": 0.84606409, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.27539062, + "step": 3290, + "time_per_iteration": 2.842975378036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079426, + "balance_loss_mlp": 1.05234218, + "epoch": 0.6331281262023856, + "flos": 617180262912.0, + "grad_norm": 0.06320614545402135, + "language_loss": 0.84556788, + "learning_rate": 0.0003133452097276947, + "loss": 0.85636216, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.27148438, + "step": 3291, + "time_per_iteration": 2.7399022579193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079638, + "balance_loss_mlp": 1.05174291, + "epoch": 0.633320507887649, + "flos": 592665237504.0, + "grad_norm": 0.05133484594344534, + "language_loss": 0.83828831, + "learning_rate": 0.0003130562260514238, + "loss": 0.84908473, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.27929688, + "step": 3292, + "time_per_iteration": 2.782712936401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074308, + "balance_loss_mlp": 1.04538822, + "epoch": 0.6335128895729126, + "flos": 582064233984.0, + "grad_norm": 0.05681875015952551, + "language_loss": 0.81639814, + "learning_rate": 0.0003127673149531626, + "loss": 0.82714117, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.28881836, + "step": 3293, + "time_per_iteration": 2.8035476207733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072918, + "balance_loss_mlp": 1.04454613, + "epoch": 0.6337052712581762, + "flos": 452803585536.0, + "grad_norm": 0.24840448660881664, + "language_loss": 0.82970059, + "learning_rate": 0.0003124784765450762, + "loss": 0.84042978, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.28393555, + "step": 3294, + "time_per_iteration": 2.608938694000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077527, + "balance_loss_mlp": 1.04877377, + "epoch": 0.6338976529434398, + "flos": 573132105216.0, + "grad_norm": 0.05797118879251517, + "language_loss": 0.80332613, + "learning_rate": 0.0003121897109393017, + "loss": 0.81410146, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.28759766, + "step": 3295, + "time_per_iteration": 2.806485414505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075453, + "balance_loss_mlp": 1.04710555, + "epoch": 0.6340900346287034, + "flos": 508497398784.0, + "grad_norm": 0.05731717325491985, + "language_loss": 0.89463425, + "learning_rate": 0.0003119010182479481, + "loss": 0.90538877, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.28344727, + "step": 3296, + "time_per_iteration": 2.6082053184509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072414, + "balance_loss_mlp": 1.04430485, + "epoch": 0.6342824163139669, + "flos": 479505429504.0, + "grad_norm": 0.05711828874106615, + "language_loss": 0.82742012, + "learning_rate": 0.00031161239858309563, + "loss": 0.8381443, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.28149414, + "step": 3297, + "time_per_iteration": 2.563567638397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076965, + "balance_loss_mlp": 1.04818797, + "epoch": 0.6344747979992305, + "flos": 571762976256.0, + "grad_norm": 0.06150807271743663, + "language_loss": 0.8330332, + "learning_rate": 0.0003113238520567964, + "loss": 0.84380281, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.28759766, + "step": 3298, + "time_per_iteration": 2.6396591663360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075264, + "balance_loss_mlp": 1.04760718, + "epoch": 0.634667179684494, + "flos": 605629149696.0, + "grad_norm": 0.06211731206435071, + "language_loss": 0.81525218, + "learning_rate": 0.00031103537878107403, + "loss": 0.8260048, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.27709961, + "step": 3299, + "time_per_iteration": 2.7182040214538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076081, + "balance_loss_mlp": 1.04813862, + "epoch": 0.6348595613697576, + "flos": 646649478144.0, + "grad_norm": 0.09008856802474977, + "language_loss": 0.80391061, + "learning_rate": 0.0003107469788679238, + "loss": 0.81467146, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.27978516, + "step": 3300, + "time_per_iteration": 2.7851805686950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075354, + "balance_loss_mlp": 1.04688656, + "epoch": 0.6350519430550212, + "flos": 638776558080.0, + "grad_norm": 0.05422740840370266, + "language_loss": 0.86501485, + "learning_rate": 0.00031045865242931267, + "loss": 0.87576842, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.28466797, + "step": 3301, + "time_per_iteration": 2.810676097869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075733, + "balance_loss_mlp": 1.04755139, + "epoch": 0.6352443247402847, + "flos": 686091091968.0, + "grad_norm": 0.05423287831049679, + "language_loss": 0.82804501, + "learning_rate": 0.00031017039957717877, + "loss": 0.83880234, + "num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.28149414, + "step": 3302, + "time_per_iteration": 3.0281054973602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074651, + "balance_loss_mlp": 1.0450151, + "epoch": 0.6354367064255483, + "flos": 559173003264.0, + "grad_norm": 0.05349883160058106, + "language_loss": 0.88460255, + "learning_rate": 0.0003098822204234318, + "loss": 0.89534903, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.29589844, + "step": 3303, + "time_per_iteration": 2.666997194290161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075345, + "balance_loss_mlp": 1.04713964, + "epoch": 0.6356290881108119, + "flos": 979095582720.0, + "grad_norm": 0.06555082687836872, + "language_loss": 0.87261242, + "learning_rate": 0.00030959411507995273, + "loss": 0.88336587, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.2824707, + "step": 3304, + "time_per_iteration": 3.197598457336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075163, + "balance_loss_mlp": 1.04662395, + "epoch": 0.6358214697960755, + "flos": 528005799936.0, + "grad_norm": 0.0641703169727953, + "language_loss": 0.81063581, + "learning_rate": 0.00030930608365859407, + "loss": 0.82138741, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.28540039, + "step": 3305, + "time_per_iteration": 2.6621649265289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074885, + "balance_loss_mlp": 1.04713345, + "epoch": 0.6360138514813389, + "flos": 516547819008.0, + "grad_norm": 0.049948399084256474, + "language_loss": 0.87610269, + "learning_rate": 0.00030901812627117943, + "loss": 0.88685155, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.27783203, + "step": 3306, + "time_per_iteration": 2.612919807434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077235, + "balance_loss_mlp": 1.04826725, + "epoch": 0.6362062331666025, + "flos": 466289823744.0, + "grad_norm": 0.06317558416619916, + "language_loss": 0.84607321, + "learning_rate": 0.000308730243029504, + "loss": 0.85684562, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.28955078, + "step": 3307, + "time_per_iteration": 2.5705294609069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072567, + "balance_loss_mlp": 1.04307485, + "epoch": 0.6363986148518661, + "flos": 549458090496.0, + "grad_norm": 0.05685632301598214, + "language_loss": 0.79783237, + "learning_rate": 0.0003084424340453339, + "loss": 0.80855805, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.29443359, + "step": 3308, + "time_per_iteration": 2.807271957397461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010765, + "balance_loss_mlp": 1.04784167, + "epoch": 0.6365909965371297, + "flos": 582772824576.0, + "grad_norm": 0.05758668896734757, + "language_loss": 0.81629676, + "learning_rate": 0.0003081546994304064, + "loss": 0.82706171, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.28637695, + "step": 3309, + "time_per_iteration": 2.7554562091827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076484, + "balance_loss_mlp": 1.04794574, + "epoch": 0.6367833782223933, + "flos": 530998723584.0, + "grad_norm": 0.06449450681570038, + "language_loss": 0.81813806, + "learning_rate": 0.0003078670392964298, + "loss": 0.82890296, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.28540039, + "step": 3310, + "time_per_iteration": 2.5969130992889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075995, + "balance_loss_mlp": 1.04721737, + "epoch": 0.6369757599076568, + "flos": 569237124096.0, + "grad_norm": 0.05473972875900602, + "language_loss": 0.82840186, + "learning_rate": 0.00030757945375508406, + "loss": 0.83916187, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.28759766, + "step": 3311, + "time_per_iteration": 2.663797616958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074659, + "balance_loss_mlp": 1.04507077, + "epoch": 0.6371681415929203, + "flos": 539684951040.0, + "grad_norm": 0.0598003061946429, + "language_loss": 0.8103205, + "learning_rate": 0.00030729194291801944, + "loss": 0.82106709, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.2956543, + "step": 3312, + "time_per_iteration": 2.6541266441345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070179, + "balance_loss_mlp": 1.04099667, + "epoch": 0.6373605232781839, + "flos": 483326217216.0, + "grad_norm": 0.06742420261969287, + "language_loss": 0.77177984, + "learning_rate": 0.00030700450689685787, + "loss": 0.78248155, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.29174805, + "step": 3313, + "time_per_iteration": 2.5699706077575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071745, + "balance_loss_mlp": 1.0428009, + "epoch": 0.6375529049634475, + "flos": 578273969664.0, + "grad_norm": 0.04829069116986981, + "language_loss": 0.85252231, + "learning_rate": 0.00030671714580319186, + "loss": 0.86323977, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.28930664, + "step": 3314, + "time_per_iteration": 2.840120553970337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075513, + "balance_loss_mlp": 1.04618776, + "epoch": 0.637745286648711, + "flos": 681953181696.0, + "grad_norm": 0.06110269335032462, + "language_loss": 0.83013022, + "learning_rate": 0.0003064298597485846, + "loss": 0.84088534, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.29296875, + "step": 3315, + "time_per_iteration": 2.852611541748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073342, + "balance_loss_mlp": 1.04463601, + "epoch": 0.6379376683339746, + "flos": 504385629696.0, + "grad_norm": 0.058531862616109036, + "language_loss": 0.83941239, + "learning_rate": 0.00030614264884457054, + "loss": 0.85014582, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.28710938, + "step": 3316, + "time_per_iteration": 2.636786699295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107333, + "balance_loss_mlp": 1.04429102, + "epoch": 0.6381300500192382, + "flos": 501771027456.0, + "grad_norm": 0.06311790142040714, + "language_loss": 0.7747215, + "learning_rate": 0.000305855513202655, + "loss": 0.78545475, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.2902832, + "step": 3317, + "time_per_iteration": 2.572878837585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073954, + "balance_loss_mlp": 1.04491496, + "epoch": 0.6383224317045018, + "flos": 400271431680.0, + "grad_norm": 0.06648512772878035, + "language_loss": 0.77336514, + "learning_rate": 0.0003055684529343138, + "loss": 0.7841047, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.29052734, + "step": 3318, + "time_per_iteration": 2.4436564445495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072959, + "balance_loss_mlp": 1.04427767, + "epoch": 0.6385148133897653, + "flos": 499131694080.0, + "grad_norm": 0.17585576995025723, + "language_loss": 0.78666025, + "learning_rate": 0.00030528146815099374, + "loss": 0.79738986, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.28686523, + "step": 3319, + "time_per_iteration": 2.633169174194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073512, + "balance_loss_mlp": 1.04463935, + "epoch": 0.6387071950750288, + "flos": 527409280512.0, + "grad_norm": 0.05914219973016666, + "language_loss": 0.72023094, + "learning_rate": 0.00030499455896411203, + "loss": 0.73096609, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.28881836, + "step": 3320, + "time_per_iteration": 2.6515796184539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064633, + "balance_loss_mlp": 1.05213952, + "epoch": 0.6388995767602924, + "flos": 1455200501760.0, + "grad_norm": 0.030989551650608328, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77365446, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.125, + "step": 3321, + "time_per_iteration": 4.949177980422974 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077103, + "balance_loss_mlp": 1.04768264, + "epoch": 0.639091958445556, + "flos": 603577571328.0, + "grad_norm": 0.05124764901012802, + "language_loss": 0.76538706, + "learning_rate": 0.0003044209678251865, + "loss": 0.77615809, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.29370117, + "step": 3322, + "time_per_iteration": 2.8691534996032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082016, + "balance_loss_mlp": 1.05257154, + "epoch": 0.6392843401308196, + "flos": 584230703616.0, + "grad_norm": 0.052110264896392484, + "language_loss": 0.84702694, + "learning_rate": 0.0003041342860958306, + "loss": 0.85784709, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.29443359, + "step": 3323, + "time_per_iteration": 2.764293670654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080288, + "balance_loss_mlp": 1.0508672, + "epoch": 0.6394767218160831, + "flos": 514420637184.0, + "grad_norm": 0.06415760622420662, + "language_loss": 0.91791111, + "learning_rate": 0.00030384768040828857, + "loss": 0.92871398, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.29418945, + "step": 3324, + "time_per_iteration": 2.676239252090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083947, + "balance_loss_mlp": 1.05457401, + "epoch": 0.6396691035013466, + "flos": 541471689216.0, + "grad_norm": 0.06537046066409105, + "language_loss": 0.85248572, + "learning_rate": 0.00030356115087383094, + "loss": 0.86332518, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.29321289, + "step": 3325, + "time_per_iteration": 2.6422836780548096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108456, + "balance_loss_mlp": 1.05523491, + "epoch": 0.6398614851866102, + "flos": 525282098688.0, + "grad_norm": 0.07261726527326764, + "language_loss": 0.85094643, + "learning_rate": 0.00030327469760369803, + "loss": 0.86179203, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.29345703, + "step": 3326, + "time_per_iteration": 2.618764877319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078424, + "balance_loss_mlp": 1.04943204, + "epoch": 0.6400538668718738, + "flos": 622704830976.0, + "grad_norm": 0.06406701351791282, + "language_loss": 0.85019833, + "learning_rate": 0.0003029883207091009, + "loss": 0.86098254, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.28979492, + "step": 3327, + "time_per_iteration": 2.699650764465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078628, + "balance_loss_mlp": 1.04961252, + "epoch": 0.6402462485571374, + "flos": 503096486400.0, + "grad_norm": 0.0560194788269582, + "language_loss": 0.77876812, + "learning_rate": 0.00030270202030122095, + "loss": 0.78955448, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.29003906, + "step": 3328, + "time_per_iteration": 2.6756327152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079179, + "balance_loss_mlp": 1.04994857, + "epoch": 0.6404386302424009, + "flos": 818894693376.0, + "grad_norm": 0.07533630521216038, + "language_loss": 0.86165637, + "learning_rate": 0.00030241579649121, + "loss": 0.87244821, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.29199219, + "step": 3329, + "time_per_iteration": 2.988523244857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081549, + "balance_loss_mlp": 1.05286741, + "epoch": 0.6406310119276645, + "flos": 471568490496.0, + "grad_norm": 0.06215732096136448, + "language_loss": 0.79335475, + "learning_rate": 0.00030212964939018994, + "loss": 0.80417025, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.28662109, + "step": 3330, + "time_per_iteration": 2.536287307739258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079251, + "balance_loss_mlp": 1.05035472, + "epoch": 0.6408233936129281, + "flos": 425358245376.0, + "grad_norm": 0.05674161193515711, + "language_loss": 0.85566485, + "learning_rate": 0.0003018435791092527, + "loss": 0.86645734, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.28857422, + "step": 3331, + "time_per_iteration": 2.4944264888763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079855, + "balance_loss_mlp": 1.05191207, + "epoch": 0.6410157752981916, + "flos": 549522109440.0, + "grad_norm": 0.05931339185061419, + "language_loss": 0.80892223, + "learning_rate": 0.00030155758575946083, + "loss": 0.81972075, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.27954102, + "step": 3332, + "time_per_iteration": 2.6625006198883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077272, + "balance_loss_mlp": 1.04797006, + "epoch": 0.6412081569834551, + "flos": 475659910656.0, + "grad_norm": 0.054973078138002, + "language_loss": 0.83676195, + "learning_rate": 0.0003012716694518467, + "loss": 0.84753466, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.29272461, + "step": 3333, + "time_per_iteration": 2.5685575008392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077896, + "balance_loss_mlp": 1.04976213, + "epoch": 0.6414005386687187, + "flos": 540645235200.0, + "grad_norm": 0.06333005970855973, + "language_loss": 0.84833503, + "learning_rate": 0.000300985830297413, + "loss": 0.85911405, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.28149414, + "step": 3334, + "time_per_iteration": 2.7106077671051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077366, + "balance_loss_mlp": 1.04875624, + "epoch": 0.6415929203539823, + "flos": 1040909073408.0, + "grad_norm": 0.05617575604142134, + "language_loss": 0.87391257, + "learning_rate": 0.00030070006840713205, + "loss": 0.88468629, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.28613281, + "step": 3335, + "time_per_iteration": 3.390854835510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076603, + "balance_loss_mlp": 1.04868436, + "epoch": 0.6417853020392459, + "flos": 648028781568.0, + "grad_norm": 0.055765507063515254, + "language_loss": 0.73336351, + "learning_rate": 0.000300414383891947, + "loss": 0.74412954, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.27954102, + "step": 3336, + "time_per_iteration": 2.8184750080108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074814, + "balance_loss_mlp": 1.04713416, + "epoch": 0.6419776837245095, + "flos": 500639035392.0, + "grad_norm": 0.04865343351033758, + "language_loss": 0.88524318, + "learning_rate": 0.00030012877686276973, + "loss": 0.89599127, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.27709961, + "step": 3337, + "time_per_iteration": 2.693716049194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077745, + "balance_loss_mlp": 1.04925418, + "epoch": 0.642170065409773, + "flos": 620331747840.0, + "grad_norm": 0.05071900601819844, + "language_loss": 0.8653757, + "learning_rate": 0.0002998432474304832, + "loss": 0.87615323, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.28540039, + "step": 3338, + "time_per_iteration": 2.785625696182251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014062, + "balance_loss_mlp": 1.00228393, + "epoch": 0.6423624470950365, + "flos": 1422767476224.0, + "grad_norm": 0.008511369807607439, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.80251408, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.11767578, + "step": 3339, + "time_per_iteration": 4.914938688278198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072832, + "balance_loss_mlp": 1.04531896, + "epoch": 0.6425548287803001, + "flos": 562082969088.0, + "grad_norm": 0.04920072731588192, + "language_loss": 0.88676053, + "learning_rate": 0.00029927242179996107, + "loss": 0.89748889, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.27539062, + "step": 3340, + "time_per_iteration": 2.6910037994384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075049, + "balance_loss_mlp": 1.04691517, + "epoch": 0.6427472104655637, + "flos": 585151699968.0, + "grad_norm": 0.050397080981132346, + "language_loss": 0.83332348, + "learning_rate": 0.0002989871258233398, + "loss": 0.84407395, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.28149414, + "step": 3341, + "time_per_iteration": 2.7581868171691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082012, + "balance_loss_mlp": 1.05337822, + "epoch": 0.6429395921508272, + "flos": 404067488256.0, + "grad_norm": 0.07038127558443963, + "language_loss": 0.82547259, + "learning_rate": 0.0002987019078868373, + "loss": 0.83629274, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.28613281, + "step": 3342, + "time_per_iteration": 2.4203991889953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075866, + "balance_loss_mlp": 1.04792297, + "epoch": 0.6431319738360908, + "flos": 548522537472.0, + "grad_norm": 0.05404588481803156, + "language_loss": 0.81465191, + "learning_rate": 0.00029841676810118484, + "loss": 0.8254106, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.27978516, + "step": 3343, + "time_per_iteration": 2.665461778640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073266, + "balance_loss_mlp": 1.04489374, + "epoch": 0.6433243555213544, + "flos": 793044034560.0, + "grad_norm": 0.05709994868865375, + "language_loss": 0.8727839, + "learning_rate": 0.0002981317065770839, + "loss": 0.88351655, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.28344727, + "step": 3344, + "time_per_iteration": 3.0409646034240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074581, + "balance_loss_mlp": 1.04592359, + "epoch": 0.643516737206618, + "flos": 582762650112.0, + "grad_norm": 0.0669931178788996, + "language_loss": 0.80771047, + "learning_rate": 0.00029784672342520493, + "loss": 0.81845629, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.28662109, + "step": 3345, + "time_per_iteration": 2.69077730178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073767, + "balance_loss_mlp": 1.04541922, + "epoch": 0.6437091188918815, + "flos": 518501882880.0, + "grad_norm": 0.058634487951654345, + "language_loss": 0.83929563, + "learning_rate": 0.00029756181875618834, + "loss": 0.85003328, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.28369141, + "step": 3346, + "time_per_iteration": 2.5735673904418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107364, + "balance_loss_mlp": 1.04541159, + "epoch": 0.643901500577145, + "flos": 384736587264.0, + "grad_norm": 0.06920918115326812, + "language_loss": 0.83749354, + "learning_rate": 0.0002972769926806439, + "loss": 0.84823, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.28222656, + "step": 3347, + "time_per_iteration": 2.480320692062378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071427, + "balance_loss_mlp": 1.04248285, + "epoch": 0.6440938822624086, + "flos": 483478986240.0, + "grad_norm": 0.05946244063191617, + "language_loss": 0.88425148, + "learning_rate": 0.0002969922453091508, + "loss": 0.89496571, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.28930664, + "step": 3348, + "time_per_iteration": 2.5937469005584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107372, + "balance_loss_mlp": 1.04441822, + "epoch": 0.6442862639476722, + "flos": 540178163712.0, + "grad_norm": 0.04841561291850138, + "language_loss": 0.84831715, + "learning_rate": 0.00029670757675225777, + "loss": 0.85905439, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.29248047, + "step": 3349, + "time_per_iteration": 2.7379231452941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076606, + "balance_loss_mlp": 1.04754305, + "epoch": 0.6444786456329358, + "flos": 526651227648.0, + "grad_norm": 0.058104314548796505, + "language_loss": 0.79157209, + "learning_rate": 0.0002964229871204831, + "loss": 0.80233824, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.2902832, + "step": 3350, + "time_per_iteration": 2.6757731437683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076273, + "balance_loss_mlp": 1.04663801, + "epoch": 0.6446710273181993, + "flos": 697576776192.0, + "grad_norm": 0.06774074305303925, + "language_loss": 0.83398223, + "learning_rate": 0.00029613847652431403, + "loss": 0.84474498, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.29614258, + "step": 3351, + "time_per_iteration": 2.905512571334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072846, + "balance_loss_mlp": 1.04409289, + "epoch": 0.6448634090034628, + "flos": 624705384960.0, + "grad_norm": 0.05155589011440517, + "language_loss": 0.79040021, + "learning_rate": 0.0002958540450742078, + "loss": 0.80112863, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.28735352, + "step": 3352, + "time_per_iteration": 2.929170846939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070119, + "balance_loss_mlp": 1.04026914, + "epoch": 0.6450557906887264, + "flos": 600647256576.0, + "grad_norm": 0.05063101037277444, + "language_loss": 0.77325773, + "learning_rate": 0.0002955696928805901, + "loss": 0.78395891, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.2980957, + "step": 3353, + "time_per_iteration": 2.881626605987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107236, + "balance_loss_mlp": 1.04229498, + "epoch": 0.64524817237399, + "flos": 645905981952.0, + "grad_norm": 0.059706275301968766, + "language_loss": 0.86282456, + "learning_rate": 0.0002952854200538563, + "loss": 0.87354815, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.30004883, + "step": 3354, + "time_per_iteration": 2.8391265869140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070707, + "balance_loss_mlp": 1.04047608, + "epoch": 0.6454405540592536, + "flos": 473173346304.0, + "grad_norm": 0.08701934847838336, + "language_loss": 0.81666923, + "learning_rate": 0.000295001226704371, + "loss": 0.82737631, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.30175781, + "step": 3355, + "time_per_iteration": 2.598177194595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074307, + "balance_loss_mlp": 1.0440042, + "epoch": 0.6456329357445171, + "flos": 611548005888.0, + "grad_norm": 0.06424201750770815, + "language_loss": 0.82413089, + "learning_rate": 0.00029471711294246783, + "loss": 0.83487391, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.30273438, + "step": 3356, + "time_per_iteration": 2.813361644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069796, + "balance_loss_mlp": 1.03880155, + "epoch": 0.6458253174297807, + "flos": 731373138432.0, + "grad_norm": 0.06119276712520419, + "language_loss": 0.82436061, + "learning_rate": 0.0002944330788784494, + "loss": 0.83505857, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.30957031, + "step": 3357, + "time_per_iteration": 2.8810949325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073631, + "balance_loss_mlp": 1.04399514, + "epoch": 0.6460176991150443, + "flos": 570129007104.0, + "grad_norm": 0.06225888545708514, + "language_loss": 0.84205008, + "learning_rate": 0.00029414912462258786, + "loss": 0.8527863, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.29614258, + "step": 3358, + "time_per_iteration": 2.827125310897827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074894, + "balance_loss_mlp": 1.0442096, + "epoch": 0.6462100808003078, + "flos": 582890688000.0, + "grad_norm": 0.06476670861286221, + "language_loss": 0.81335187, + "learning_rate": 0.00029386525028512366, + "loss": 0.82410085, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.30664062, + "step": 3359, + "time_per_iteration": 2.750802993774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072661, + "balance_loss_mlp": 1.04195285, + "epoch": 0.6464024624855714, + "flos": 483647721984.0, + "grad_norm": 0.05574217129277394, + "language_loss": 0.86898518, + "learning_rate": 0.0002935814559762666, + "loss": 0.87971175, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.30664062, + "step": 3360, + "time_per_iteration": 2.778729200363159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071986, + "balance_loss_mlp": 1.04125416, + "epoch": 0.6465948441708349, + "flos": 527508205056.0, + "grad_norm": 0.05463243527184519, + "language_loss": 0.79309767, + "learning_rate": 0.0002932977418061957, + "loss": 0.80381751, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.30712891, + "step": 3361, + "time_per_iteration": 2.636300563812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072531, + "balance_loss_mlp": 1.04284823, + "epoch": 0.6467872258560985, + "flos": 669121689600.0, + "grad_norm": 0.06447019250914547, + "language_loss": 0.80627209, + "learning_rate": 0.00029301410788505833, + "loss": 0.81699741, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.29638672, + "step": 3362, + "time_per_iteration": 2.7907180786132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071004, + "balance_loss_mlp": 1.04127288, + "epoch": 0.6469796075413621, + "flos": 431867828736.0, + "grad_norm": 0.06442175719622328, + "language_loss": 0.81014264, + "learning_rate": 0.00029273055432297126, + "loss": 0.8208527, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.29711914, + "step": 3363, + "time_per_iteration": 2.5577244758605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068782, + "balance_loss_mlp": 1.03835917, + "epoch": 0.6471719892266257, + "flos": 803413693440.0, + "grad_norm": 0.055871885274250355, + "language_loss": 0.80490357, + "learning_rate": 0.00029244708123001917, + "loss": 0.81559139, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.30395508, + "step": 3364, + "time_per_iteration": 2.938917636871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065549, + "balance_loss_mlp": 1.0347929, + "epoch": 0.6473643709118891, + "flos": 576923779584.0, + "grad_norm": 0.060913516619686706, + "language_loss": 0.84265661, + "learning_rate": 0.0002921636887162565, + "loss": 0.85331213, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.30737305, + "step": 3365, + "time_per_iteration": 2.7420175075531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067245, + "balance_loss_mlp": 1.03718054, + "epoch": 0.6475567525971527, + "flos": 761079490560.0, + "grad_norm": 0.07220364495800281, + "language_loss": 0.84047341, + "learning_rate": 0.00029188037689170595, + "loss": 0.85114586, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.30029297, + "step": 3366, + "time_per_iteration": 2.941958427429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070259, + "balance_loss_mlp": 1.04026556, + "epoch": 0.6477491342824163, + "flos": 842754972672.0, + "grad_norm": 0.0698232037755488, + "language_loss": 0.84047693, + "learning_rate": 0.0002915971458663586, + "loss": 0.85117948, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.29931641, + "step": 3367, + "time_per_iteration": 3.0588743686676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064684, + "balance_loss_mlp": 1.03507257, + "epoch": 0.6479415159676799, + "flos": 884431457280.0, + "grad_norm": 0.048093531739852514, + "language_loss": 0.81804395, + "learning_rate": 0.00029131399575017494, + "loss": 0.82869077, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.2956543, + "step": 3368, + "time_per_iteration": 3.194119691848755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065444, + "balance_loss_mlp": 1.03576088, + "epoch": 0.6481338976529435, + "flos": 615211642368.0, + "grad_norm": 0.05082024761534885, + "language_loss": 0.85855007, + "learning_rate": 0.0002910309266530836, + "loss": 0.86920446, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.29638672, + "step": 3369, + "time_per_iteration": 2.7995903491973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069305, + "balance_loss_mlp": 1.03943157, + "epoch": 0.648326279338207, + "flos": 509757428736.0, + "grad_norm": 0.06123820960940181, + "language_loss": 0.85307527, + "learning_rate": 0.0002907479386849814, + "loss": 0.86376828, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.2980957, + "step": 3370, + "time_per_iteration": 2.6561813354492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070145, + "balance_loss_mlp": 1.03969884, + "epoch": 0.6485186610234706, + "flos": 702157026816.0, + "grad_norm": 0.06023552594522319, + "language_loss": 0.8010959, + "learning_rate": 0.0002904650319557339, + "loss": 0.81179738, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.30395508, + "step": 3371, + "time_per_iteration": 3.0036118030548096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069259, + "balance_loss_mlp": 1.03967094, + "epoch": 0.6487110427087341, + "flos": 560418476544.0, + "grad_norm": 0.06478850515629742, + "language_loss": 0.81106675, + "learning_rate": 0.0002901822065751758, + "loss": 0.82175934, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.29541016, + "step": 3372, + "time_per_iteration": 2.6287784576416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072808, + "balance_loss_mlp": 1.0429343, + "epoch": 0.6489034243939977, + "flos": 679801268736.0, + "grad_norm": 0.0516174175681091, + "language_loss": 0.854002, + "learning_rate": 0.0002898994626531093, + "loss": 0.86473012, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.29833984, + "step": 3373, + "time_per_iteration": 2.84863543510437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071305, + "balance_loss_mlp": 1.04181266, + "epoch": 0.6490958060792612, + "flos": 474172918272.0, + "grad_norm": 0.07661916167941812, + "language_loss": 0.88111019, + "learning_rate": 0.00028961680029930526, + "loss": 0.89182317, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.29443359, + "step": 3374, + "time_per_iteration": 2.5185511112213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068928, + "balance_loss_mlp": 1.03965008, + "epoch": 0.6492881877645248, + "flos": 588563642880.0, + "grad_norm": 0.05286852382904046, + "language_loss": 0.76929349, + "learning_rate": 0.00028933421962350317, + "loss": 0.77998275, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.29248047, + "step": 3375, + "time_per_iteration": 2.7406935691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071172, + "balance_loss_mlp": 1.04020166, + "epoch": 0.6494805694497884, + "flos": 642139038720.0, + "grad_norm": 0.05602089532541189, + "language_loss": 0.84000719, + "learning_rate": 0.0002890517207354104, + "loss": 0.85071886, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.30932617, + "step": 3376, + "time_per_iteration": 2.8145668506622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072679, + "balance_loss_mlp": 1.04263854, + "epoch": 0.649672951135052, + "flos": 531550162944.0, + "grad_norm": 0.05675413090178792, + "language_loss": 0.81828344, + "learning_rate": 0.0002887693037447029, + "loss": 0.82901019, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.30004883, + "step": 3377, + "time_per_iteration": 2.6432199478149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070436, + "balance_loss_mlp": 1.04082441, + "epoch": 0.6498653328203156, + "flos": 547124295168.0, + "grad_norm": 0.05935135112647285, + "language_loss": 0.82021838, + "learning_rate": 0.00028848696876102443, + "loss": 0.83092278, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.29541016, + "step": 3378, + "time_per_iteration": 2.6862215995788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065633, + "balance_loss_mlp": 1.03473437, + "epoch": 0.650057714505579, + "flos": 461996172288.0, + "grad_norm": 0.06179409995476596, + "language_loss": 0.83523512, + "learning_rate": 0.00028820471589398723, + "loss": 0.84589148, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.30859375, + "step": 3379, + "time_per_iteration": 2.5718047618865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070203, + "balance_loss_mlp": 1.03970945, + "epoch": 0.6502500961908426, + "flos": 509905815552.0, + "grad_norm": 0.06289552232740542, + "language_loss": 0.77402478, + "learning_rate": 0.00028792254525317196, + "loss": 0.78472686, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.30493164, + "step": 3380, + "time_per_iteration": 2.779308795928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071743, + "balance_loss_mlp": 1.0420599, + "epoch": 0.6504424778761062, + "flos": 579557320704.0, + "grad_norm": 0.05486106257478186, + "language_loss": 0.81240368, + "learning_rate": 0.00028764045694812645, + "loss": 0.82312119, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.29638672, + "step": 3381, + "time_per_iteration": 2.7430598735809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010701, + "balance_loss_mlp": 1.03936744, + "epoch": 0.6506348595613698, + "flos": 519206091264.0, + "grad_norm": 0.061364553922665516, + "language_loss": 0.76195431, + "learning_rate": 0.0002873584510883671, + "loss": 0.77265531, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.30688477, + "step": 3382, + "time_per_iteration": 2.575998306274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071659, + "balance_loss_mlp": 1.04085565, + "epoch": 0.6508272412466333, + "flos": 510048410112.0, + "grad_norm": 0.0719487575879366, + "language_loss": 0.85928071, + "learning_rate": 0.0002870765277833788, + "loss": 0.86999726, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.30761719, + "step": 3383, + "time_per_iteration": 2.7900807857513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069354, + "balance_loss_mlp": 1.03790629, + "epoch": 0.6510196229318969, + "flos": 625329607680.0, + "grad_norm": 0.06613356509687102, + "language_loss": 0.80323064, + "learning_rate": 0.00028679468714261347, + "loss": 0.81392419, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.31445312, + "step": 3384, + "time_per_iteration": 2.7730093002319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071035, + "balance_loss_mlp": 1.04132867, + "epoch": 0.6512120046171604, + "flos": 474453725184.0, + "grad_norm": 0.06288254960309916, + "language_loss": 0.76734459, + "learning_rate": 0.0002865129292754918, + "loss": 0.77805495, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.29663086, + "step": 3385, + "time_per_iteration": 2.6205520629882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075067, + "balance_loss_mlp": 1.04500234, + "epoch": 0.651404386302424, + "flos": 551561951232.0, + "grad_norm": 0.05411679726730615, + "language_loss": 0.81513727, + "learning_rate": 0.00028623125429140105, + "loss": 0.82588792, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.30004883, + "step": 3386, + "time_per_iteration": 2.88822340965271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067519, + "balance_loss_mlp": 1.03826463, + "epoch": 0.6515967679876876, + "flos": 523047227904.0, + "grad_norm": 0.05765553092239875, + "language_loss": 0.87005818, + "learning_rate": 0.00028594966229969785, + "loss": 0.88073337, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.29223633, + "step": 3387, + "time_per_iteration": 2.6889727115631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074347, + "balance_loss_mlp": 1.04413986, + "epoch": 0.6517891496729511, + "flos": 573590412288.0, + "grad_norm": 0.05935709634506938, + "language_loss": 0.81214345, + "learning_rate": 0.00028566815340970577, + "loss": 0.82288694, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.30151367, + "step": 3388, + "time_per_iteration": 2.7500782012939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107152, + "balance_loss_mlp": 1.04195595, + "epoch": 0.6519815313582147, + "flos": 555662135808.0, + "grad_norm": 0.058132495029724875, + "language_loss": 0.8099978, + "learning_rate": 0.0002853867277307162, + "loss": 0.82071304, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.29516602, + "step": 3389, + "time_per_iteration": 2.628153085708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072178, + "balance_loss_mlp": 1.04399705, + "epoch": 0.6521739130434783, + "flos": 480229986816.0, + "grad_norm": 0.062440592290717876, + "language_loss": 0.82432795, + "learning_rate": 0.00028510538537198824, + "loss": 0.83504969, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.28198242, + "step": 3390, + "time_per_iteration": 2.6273562908172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076807, + "balance_loss_mlp": 1.04805326, + "epoch": 0.6523662947287419, + "flos": 665380887552.0, + "grad_norm": 0.0630008208317628, + "language_loss": 0.86511409, + "learning_rate": 0.00028482412644274867, + "loss": 0.87588215, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.28759766, + "step": 3391, + "time_per_iteration": 2.986837148666382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073216, + "balance_loss_mlp": 1.04479647, + "epoch": 0.6525586764140053, + "flos": 548394499584.0, + "grad_norm": 0.07544653210913753, + "language_loss": 0.74115705, + "learning_rate": 0.00028454295105219207, + "loss": 0.75188923, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.28417969, + "step": 3392, + "time_per_iteration": 2.6882169246673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077343, + "balance_loss_mlp": 1.04837489, + "epoch": 0.6527510580992689, + "flos": 802529012736.0, + "grad_norm": 0.044597775660838994, + "language_loss": 0.79517299, + "learning_rate": 0.0002842618593094802, + "loss": 0.80594641, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.28979492, + "step": 3393, + "time_per_iteration": 3.160513401031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076464, + "balance_loss_mlp": 1.04785347, + "epoch": 0.6529434397845325, + "flos": 670864757760.0, + "grad_norm": 0.0655151623947296, + "language_loss": 0.80225992, + "learning_rate": 0.00028398085132374243, + "loss": 0.81302458, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.28588867, + "step": 3394, + "time_per_iteration": 2.799607753753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076411, + "balance_loss_mlp": 1.04861116, + "epoch": 0.6531358214697961, + "flos": 828043610112.0, + "grad_norm": 0.057447645264245936, + "language_loss": 0.83968282, + "learning_rate": 0.0002836999272040761, + "loss": 0.85044694, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.27832031, + "step": 3395, + "time_per_iteration": 3.1404569149017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076476, + "balance_loss_mlp": 1.04753208, + "epoch": 0.6533282031550597, + "flos": 487157179392.0, + "grad_norm": 0.07221192979592671, + "language_loss": 0.83835298, + "learning_rate": 0.00028341908705954575, + "loss": 0.84911776, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.28955078, + "step": 3396, + "time_per_iteration": 2.586735248565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024797, + "balance_loss_mlp": 1.01340032, + "epoch": 0.6535205848403232, + "flos": 1556908121088.0, + "grad_norm": 0.010103591992015052, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82786608, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.11376953, + "step": 3397, + "time_per_iteration": 4.801388740539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076371, + "balance_loss_mlp": 1.04754591, + "epoch": 0.6537129665255867, + "flos": 493464531456.0, + "grad_norm": 0.06325367812107179, + "language_loss": 0.78003663, + "learning_rate": 0.00028285765913198604, + "loss": 0.79080033, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.2878418, + "step": 3398, + "time_per_iteration": 2.583195209503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073367, + "balance_loss_mlp": 1.04530561, + "epoch": 0.6539053482108503, + "flos": 604718327808.0, + "grad_norm": 0.055960254103937936, + "language_loss": 0.81894422, + "learning_rate": 0.0002825770715669227, + "loss": 0.82967794, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.28076172, + "step": 3399, + "time_per_iteration": 2.706880569458008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075602, + "balance_loss_mlp": 1.04842257, + "epoch": 0.6540977298961139, + "flos": 577504332288.0, + "grad_norm": 0.06150139712068683, + "language_loss": 0.80872452, + "learning_rate": 0.00028229656841292634, + "loss": 0.81948054, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.2722168, + "step": 3400, + "time_per_iteration": 2.6799252033233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075202, + "balance_loss_mlp": 1.04687786, + "epoch": 0.6542901115813774, + "flos": 511500496896.0, + "grad_norm": 0.0638413236687058, + "language_loss": 0.76758403, + "learning_rate": 0.0002820161497788979, + "loss": 0.77833605, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.28320312, + "step": 3401, + "time_per_iteration": 2.6050028800964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074659, + "balance_loss_mlp": 1.04712176, + "epoch": 0.654482493266641, + "flos": 625201569792.0, + "grad_norm": 0.051478933847507014, + "language_loss": 0.87136239, + "learning_rate": 0.00028173581577370545, + "loss": 0.88210893, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.27563477, + "step": 3402, + "time_per_iteration": 2.7428696155548096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107348, + "balance_loss_mlp": 1.04618084, + "epoch": 0.6546748749519046, + "flos": 523712148480.0, + "grad_norm": 0.05196967996925013, + "language_loss": 0.79016143, + "learning_rate": 0.0002814555665061844, + "loss": 0.80089623, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.2734375, + "step": 3403, + "time_per_iteration": 2.68853759765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076544, + "balance_loss_mlp": 1.04914951, + "epoch": 0.6548672566371682, + "flos": 478945225728.0, + "grad_norm": 0.06812490536784549, + "language_loss": 0.77581179, + "learning_rate": 0.00028117540208513715, + "loss": 0.78657722, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.27416992, + "step": 3404, + "time_per_iteration": 2.668957233428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074239, + "balance_loss_mlp": 1.0468924, + "epoch": 0.6550596383224317, + "flos": 615732558336.0, + "grad_norm": 0.06109241421727743, + "language_loss": 0.85329819, + "learning_rate": 0.00028089532261933313, + "loss": 0.86404049, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.27368164, + "step": 3405, + "time_per_iteration": 2.764646053314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077427, + "balance_loss_mlp": 1.04910326, + "epoch": 0.6552520200076952, + "flos": 488594709504.0, + "grad_norm": 0.07801432785219843, + "language_loss": 0.85569102, + "learning_rate": 0.0002806153282175087, + "loss": 0.86646521, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.28369141, + "step": 3406, + "time_per_iteration": 2.612542152404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073707, + "balance_loss_mlp": 1.04547811, + "epoch": 0.6554444016929588, + "flos": 687310424064.0, + "grad_norm": 0.06580250942385472, + "language_loss": 0.82821441, + "learning_rate": 0.0002803354189883679, + "loss": 0.83895147, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.28222656, + "step": 3407, + "time_per_iteration": 2.8573250770568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079984, + "balance_loss_mlp": 1.0526377, + "epoch": 0.6556367833782224, + "flos": 542772417024.0, + "grad_norm": 0.04760286447801195, + "language_loss": 0.8549965, + "learning_rate": 0.00028005559504058053, + "loss": 0.86579633, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.27392578, + "step": 3408, + "time_per_iteration": 2.723130941390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075013, + "balance_loss_mlp": 1.04623616, + "epoch": 0.655829165063486, + "flos": 673237840896.0, + "grad_norm": 0.05982952663886069, + "language_loss": 0.76448226, + "learning_rate": 0.0002797758564827838, + "loss": 0.77523243, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.28759766, + "step": 3409, + "time_per_iteration": 2.8227314949035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077669, + "balance_loss_mlp": 1.04989326, + "epoch": 0.6560215467487496, + "flos": 531550162944.0, + "grad_norm": 0.0665853509575856, + "language_loss": 0.83799911, + "learning_rate": 0.0002794962034235824, + "loss": 0.8487758, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.27783203, + "step": 3410, + "time_per_iteration": 2.6031951904296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071217, + "balance_loss_mlp": 1.04303622, + "epoch": 0.656213928434013, + "flos": 591025476096.0, + "grad_norm": 0.05829437169655771, + "language_loss": 0.74215448, + "learning_rate": 0.00027921663597154695, + "loss": 0.75286669, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.28198242, + "step": 3411, + "time_per_iteration": 2.735642910003662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077612, + "balance_loss_mlp": 1.04981232, + "epoch": 0.6564063101192766, + "flos": 415564756992.0, + "grad_norm": 0.0845273006742278, + "language_loss": 0.8108443, + "learning_rate": 0.00027893715423521525, + "loss": 0.8216204, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.27832031, + "step": 3412, + "time_per_iteration": 2.4407780170440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079935, + "balance_loss_mlp": 1.05134881, + "epoch": 0.6565986918045402, + "flos": 453084392448.0, + "grad_norm": 0.06735556448920854, + "language_loss": 0.83940005, + "learning_rate": 0.00027865775832309163, + "loss": 0.85019946, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.28564453, + "step": 3413, + "time_per_iteration": 2.6473381519317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076667, + "balance_loss_mlp": 1.04870033, + "epoch": 0.6567910734898038, + "flos": 547483677696.0, + "grad_norm": 0.0593593517708546, + "language_loss": 0.85890168, + "learning_rate": 0.00027837844834364733, + "loss": 0.86966836, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.27978516, + "step": 3414, + "time_per_iteration": 2.632337808609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074793, + "balance_loss_mlp": 1.04663622, + "epoch": 0.6569834551750673, + "flos": 655207667712.0, + "grad_norm": 0.056143783747438114, + "language_loss": 0.86344767, + "learning_rate": 0.00027809922440532, + "loss": 0.87419558, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.28173828, + "step": 3415, + "time_per_iteration": 2.8158276081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070852, + "balance_loss_mlp": 1.04152656, + "epoch": 0.6571758368603309, + "flos": 539399761920.0, + "grad_norm": 0.052293686608573205, + "language_loss": 0.80653661, + "learning_rate": 0.00027782008661651406, + "loss": 0.81724513, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.29272461, + "step": 3416, + "time_per_iteration": 2.769740104675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075321, + "balance_loss_mlp": 1.04706836, + "epoch": 0.6573682185455945, + "flos": 497088880128.0, + "grad_norm": 0.047338775202516, + "language_loss": 0.87086004, + "learning_rate": 0.00027754103508560013, + "loss": 0.88161325, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.2824707, + "step": 3417, + "time_per_iteration": 2.5982823371887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070746, + "balance_loss_mlp": 1.04204035, + "epoch": 0.657560600230858, + "flos": 447244111872.0, + "grad_norm": 0.07606703809766882, + "language_loss": 0.82847452, + "learning_rate": 0.0002772620699209163, + "loss": 0.83918196, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.28686523, + "step": 3418, + "time_per_iteration": 2.5715713500976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072273, + "balance_loss_mlp": 1.04387712, + "epoch": 0.6577529819161216, + "flos": 481696630272.0, + "grad_norm": 0.06477726519797523, + "language_loss": 0.79822147, + "learning_rate": 0.0002769831912307658, + "loss": 0.80894423, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.28393555, + "step": 3419, + "time_per_iteration": 2.554229974746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081387, + "balance_loss_mlp": 1.05339622, + "epoch": 0.6579453636013851, + "flos": 530589878784.0, + "grad_norm": 0.06482840979987209, + "language_loss": 0.80168855, + "learning_rate": 0.00027670439912341917, + "loss": 0.81250238, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.2800293, + "step": 3420, + "time_per_iteration": 2.6077942848205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083063, + "balance_loss_mlp": 1.05385685, + "epoch": 0.6581377452866487, + "flos": 627737596416.0, + "grad_norm": 0.062198061395391364, + "language_loss": 0.83608246, + "learning_rate": 0.0002764256937071129, + "loss": 0.8469131, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.29199219, + "step": 3421, + "time_per_iteration": 2.7814555168151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079993, + "balance_loss_mlp": 1.0516932, + "epoch": 0.6583301269719123, + "flos": 548355211776.0, + "grad_norm": 0.06741584728715999, + "language_loss": 0.87078255, + "learning_rate": 0.00027614707509005036, + "loss": 0.8815825, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.28320312, + "step": 3422, + "time_per_iteration": 2.6582610607147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080132, + "balance_loss_mlp": 1.05216599, + "epoch": 0.6585225086571759, + "flos": 427268639232.0, + "grad_norm": 0.05422221992549149, + "language_loss": 0.79046404, + "learning_rate": 0.0002758685433804008, + "loss": 0.80126542, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.2800293, + "step": 3423, + "time_per_iteration": 2.518541097640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080526, + "balance_loss_mlp": 1.05196333, + "epoch": 0.6587148903424394, + "flos": 859264657920.0, + "grad_norm": 0.07879518089190286, + "language_loss": 0.79578894, + "learning_rate": 0.00027559009868630005, + "loss": 0.80659419, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.28564453, + "step": 3424, + "time_per_iteration": 3.0996036529541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079504, + "balance_loss_mlp": 1.0518713, + "epoch": 0.6589072720277029, + "flos": 805280417280.0, + "grad_norm": 0.05918528826128724, + "language_loss": 0.79852736, + "learning_rate": 0.0002753117411158491, + "loss": 0.80932236, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.27661133, + "step": 3425, + "time_per_iteration": 3.0297467708587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082154, + "balance_loss_mlp": 1.05392551, + "epoch": 0.6590996537129665, + "flos": 548355211776.0, + "grad_norm": 0.05414938091888711, + "language_loss": 0.89781225, + "learning_rate": 0.0002750334707771168, + "loss": 0.90863383, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.2824707, + "step": 3426, + "time_per_iteration": 2.639045476913452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082665, + "balance_loss_mlp": 1.05364943, + "epoch": 0.6592920353982301, + "flos": 453931195392.0, + "grad_norm": 0.06850883476210408, + "language_loss": 0.8080318, + "learning_rate": 0.0002747552877781369, + "loss": 0.81885844, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.28979492, + "step": 3427, + "time_per_iteration": 2.49623966217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077382, + "balance_loss_mlp": 1.04967833, + "epoch": 0.6594844170834937, + "flos": 566903328768.0, + "grad_norm": 0.05956339540339285, + "language_loss": 0.81955504, + "learning_rate": 0.0002744771922269097, + "loss": 0.83032882, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.27709961, + "step": 3428, + "time_per_iteration": 2.730713129043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071709, + "balance_loss_mlp": 1.04276502, + "epoch": 0.6596767987687572, + "flos": 1187452016640.0, + "grad_norm": 0.06328482299945191, + "language_loss": 0.82119536, + "learning_rate": 0.0002741991842314015, + "loss": 0.83191252, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.28930664, + "step": 3429, + "time_per_iteration": 3.479928970336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072277, + "balance_loss_mlp": 1.04433429, + "epoch": 0.6598691804540208, + "flos": 503247845376.0, + "grad_norm": 0.05605661810668252, + "language_loss": 0.85796869, + "learning_rate": 0.0002739212638995445, + "loss": 0.86869144, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.27954102, + "step": 3430, + "time_per_iteration": 2.606570243835449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074192, + "balance_loss_mlp": 1.04579639, + "epoch": 0.6600615621392844, + "flos": 531072916992.0, + "grad_norm": 0.06049343964764478, + "language_loss": 0.82845342, + "learning_rate": 0.00027364343133923696, + "loss": 0.83919537, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.28393555, + "step": 3431, + "time_per_iteration": 2.670698642730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010757, + "balance_loss_mlp": 1.04632664, + "epoch": 0.6602539438245479, + "flos": 565170435072.0, + "grad_norm": 0.060306061289427934, + "language_loss": 0.8290168, + "learning_rate": 0.0002733656866583431, + "loss": 0.83977377, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.29321289, + "step": 3432, + "time_per_iteration": 2.6917898654937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107317, + "balance_loss_mlp": 1.04413056, + "epoch": 0.6604463255098114, + "flos": 856802824704.0, + "grad_norm": 0.07899452936934231, + "language_loss": 0.83071327, + "learning_rate": 0.0002730880299646927, + "loss": 0.84144497, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.2902832, + "step": 3433, + "time_per_iteration": 3.028512954711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068262, + "balance_loss_mlp": 1.03898394, + "epoch": 0.660638707195075, + "flos": 674158837248.0, + "grad_norm": 0.05867349384550741, + "language_loss": 0.85263318, + "learning_rate": 0.0002728104613660821, + "loss": 0.86331582, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.29272461, + "step": 3434, + "time_per_iteration": 2.8600428104400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107468, + "balance_loss_mlp": 1.04666591, + "epoch": 0.6608310888803386, + "flos": 888572339712.0, + "grad_norm": 0.08754685065456504, + "language_loss": 0.82922065, + "learning_rate": 0.0002725329809702729, + "loss": 0.83996743, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.28051758, + "step": 3435, + "time_per_iteration": 3.2159268856048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068471, + "balance_loss_mlp": 1.04002786, + "epoch": 0.6610234705656022, + "flos": 1135909260288.0, + "grad_norm": 0.06770839009461412, + "language_loss": 0.76381433, + "learning_rate": 0.0002722555888849921, + "loss": 0.77449906, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.28417969, + "step": 3436, + "time_per_iteration": 3.435774564743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071105, + "balance_loss_mlp": 1.04297185, + "epoch": 0.6612158522508658, + "flos": 467776816128.0, + "grad_norm": 0.05996981510942144, + "language_loss": 0.8029291, + "learning_rate": 0.00027197828521793334, + "loss": 0.81364018, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.28125, + "step": 3437, + "time_per_iteration": 2.5626087188720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010765, + "balance_loss_mlp": 1.04681671, + "epoch": 0.6614082339361292, + "flos": 571374480384.0, + "grad_norm": 0.059440388308285685, + "language_loss": 0.84535551, + "learning_rate": 0.0002717010700767552, + "loss": 0.85612053, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.29614258, + "step": 3438, + "time_per_iteration": 2.74114990234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071656, + "balance_loss_mlp": 1.04254496, + "epoch": 0.6616006156213928, + "flos": 498220872192.0, + "grad_norm": 0.07105561276386183, + "language_loss": 0.7574169, + "learning_rate": 0.00027142394356908226, + "loss": 0.76813346, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.29077148, + "step": 3439, + "time_per_iteration": 2.5588772296905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107167, + "balance_loss_mlp": 1.04289341, + "epoch": 0.6617929973066564, + "flos": 602124074496.0, + "grad_norm": 0.061991918055260026, + "language_loss": 0.84383535, + "learning_rate": 0.00027114690580250456, + "loss": 0.85455203, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.2878418, + "step": 3440, + "time_per_iteration": 2.770521879196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068436, + "balance_loss_mlp": 1.03996921, + "epoch": 0.66198537899192, + "flos": 522731515392.0, + "grad_norm": 0.055271996541099454, + "language_loss": 0.8711971, + "learning_rate": 0.0002708699568845776, + "loss": 0.88188148, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.28466797, + "step": 3441, + "time_per_iteration": 2.634669303894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020343, + "balance_loss_mlp": 1.00923228, + "epoch": 0.6621777606771835, + "flos": 1565421230592.0, + "grad_norm": 0.011806654304651203, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 0.80308127, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.11132812, + "step": 3442, + "time_per_iteration": 4.947353363037109 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074491, + "balance_loss_mlp": 1.04609489, + "epoch": 0.6623701423624471, + "flos": 526409708544.0, + "grad_norm": 0.055374659837301436, + "language_loss": 0.82784879, + "learning_rate": 0.0002703163260247261, + "loss": 0.83859372, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.28369141, + "step": 3443, + "time_per_iteration": 2.664637804031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069476, + "balance_loss_mlp": 1.04041255, + "epoch": 0.6625625240477107, + "flos": 527921432064.0, + "grad_norm": 0.06501168506799739, + "language_loss": 0.81707942, + "learning_rate": 0.0002700396442977399, + "loss": 0.82777417, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.2902832, + "step": 3444, + "time_per_iteration": 2.616928815841675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069297, + "balance_loss_mlp": 1.04049635, + "epoch": 0.6627549057329742, + "flos": 472854661632.0, + "grad_norm": 0.054380463480794276, + "language_loss": 0.84038997, + "learning_rate": 0.0002697630518492817, + "loss": 0.85108292, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.28833008, + "step": 3445, + "time_per_iteration": 2.649937868118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071356, + "balance_loss_mlp": 1.04207826, + "epoch": 0.6629472874182378, + "flos": 527743931904.0, + "grad_norm": 0.06943834744074738, + "language_loss": 0.85656464, + "learning_rate": 0.0002694865487867343, + "loss": 0.86727822, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.29223633, + "step": 3446, + "time_per_iteration": 2.624187707901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072189, + "balance_loss_mlp": 1.04241085, + "epoch": 0.6631396691035013, + "flos": 612906960384.0, + "grad_norm": 0.05377374950460666, + "language_loss": 0.84776872, + "learning_rate": 0.0002692101352174453, + "loss": 0.85849059, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.29736328, + "step": 3447, + "time_per_iteration": 2.786705255508423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066769, + "balance_loss_mlp": 1.03823054, + "epoch": 0.6633320507887649, + "flos": 609041092608.0, + "grad_norm": 0.06088849613608419, + "language_loss": 0.84652716, + "learning_rate": 0.00026893381124872787, + "loss": 0.8571949, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.28515625, + "step": 3448, + "time_per_iteration": 2.8100626468658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072364, + "balance_loss_mlp": 1.04272866, + "epoch": 0.6635244324740285, + "flos": 749342112768.0, + "grad_norm": 0.06845751497679059, + "language_loss": 0.80441087, + "learning_rate": 0.00026865757698786097, + "loss": 0.81513453, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.29589844, + "step": 3449, + "time_per_iteration": 3.046318531036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069481, + "balance_loss_mlp": 1.04065669, + "epoch": 0.6637168141592921, + "flos": 664222754304.0, + "grad_norm": 0.05206136562356657, + "language_loss": 0.81613761, + "learning_rate": 0.000268381432542088, + "loss": 0.82683241, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.28833008, + "step": 3450, + "time_per_iteration": 2.865903854370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107193, + "balance_loss_mlp": 1.04203212, + "epoch": 0.6639091958445555, + "flos": 606500683776.0, + "grad_norm": 0.0645327848257149, + "language_loss": 0.79875302, + "learning_rate": 0.00026810537801861807, + "loss": 0.80947232, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.29882812, + "step": 3451, + "time_per_iteration": 2.8374693393707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071564, + "balance_loss_mlp": 1.04173839, + "epoch": 0.6641015775298191, + "flos": 476452869120.0, + "grad_norm": 0.05151691249818879, + "language_loss": 0.8142612, + "learning_rate": 0.0002678294135246243, + "loss": 0.82497692, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.2980957, + "step": 3452, + "time_per_iteration": 2.839822769165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072033, + "balance_loss_mlp": 1.04313636, + "epoch": 0.6642939592150827, + "flos": 903746391552.0, + "grad_norm": 0.05848171422306997, + "language_loss": 0.86315292, + "learning_rate": 0.0002675535391672463, + "loss": 0.87387323, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.2890625, + "step": 3453, + "time_per_iteration": 3.184783458709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074712, + "balance_loss_mlp": 1.04574442, + "epoch": 0.6644863409003463, + "flos": 581527351296.0, + "grad_norm": 0.06167080451779571, + "language_loss": 0.86087596, + "learning_rate": 0.0002672777550535877, + "loss": 0.8716231, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.28979492, + "step": 3454, + "time_per_iteration": 2.8803153038024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071993, + "balance_loss_mlp": 1.0427866, + "epoch": 0.6646787225856099, + "flos": 478761933312.0, + "grad_norm": 0.05419695506055875, + "language_loss": 0.84890383, + "learning_rate": 0.00026700206129071747, + "loss": 0.85962379, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.29174805, + "step": 3455, + "time_per_iteration": 2.835059881210327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076439, + "balance_loss_mlp": 1.04749477, + "epoch": 0.6648711042708734, + "flos": 449676831744.0, + "grad_norm": 0.06321625044537839, + "language_loss": 0.88953322, + "learning_rate": 0.00026672645798566925, + "loss": 0.90029758, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.28930664, + "step": 3456, + "time_per_iteration": 3.0997443199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071835, + "balance_loss_mlp": 1.04277229, + "epoch": 0.665063485956137, + "flos": 858553095168.0, + "grad_norm": 0.055285478182730885, + "language_loss": 0.79457712, + "learning_rate": 0.00026645094524544225, + "loss": 0.80529541, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.2902832, + "step": 3457, + "time_per_iteration": 3.513991117477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107703, + "balance_loss_mlp": 1.0481813, + "epoch": 0.6652558676414005, + "flos": 604024293888.0, + "grad_norm": 0.045511024743111715, + "language_loss": 0.75222224, + "learning_rate": 0.00026617552317699945, + "loss": 0.7629925, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.28833008, + "step": 3458, + "time_per_iteration": 3.5000369548797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069192, + "balance_loss_mlp": 1.04062915, + "epoch": 0.6654482493266641, + "flos": 510141542400.0, + "grad_norm": 0.0575678465485099, + "language_loss": 0.8684063, + "learning_rate": 0.0002659001918872693, + "loss": 0.87909818, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.28564453, + "step": 3459, + "time_per_iteration": 3.1579606533050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076447, + "balance_loss_mlp": 1.04797983, + "epoch": 0.6656406310119277, + "flos": 565342142976.0, + "grad_norm": 0.057947477452726895, + "language_loss": 0.80655402, + "learning_rate": 0.0002656249514831449, + "loss": 0.8173185, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.28466797, + "step": 3460, + "time_per_iteration": 3.0136172771453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075105, + "balance_loss_mlp": 1.04527879, + "epoch": 0.6658330126971912, + "flos": 1023859533312.0, + "grad_norm": 0.05880599704270715, + "language_loss": 0.86742055, + "learning_rate": 0.00026534980207148416, + "loss": 0.87817168, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.2980957, + "step": 3461, + "time_per_iteration": 3.808920383453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070751, + "balance_loss_mlp": 1.04256988, + "epoch": 0.6660253943824548, + "flos": 816472147968.0, + "grad_norm": 0.06394653558237288, + "language_loss": 0.73634577, + "learning_rate": 0.0002650747437591097, + "loss": 0.74705327, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.28149414, + "step": 3462, + "time_per_iteration": 3.4438018798828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023937, + "balance_loss_mlp": 1.01258874, + "epoch": 0.6662177760677184, + "flos": 1495331767296.0, + "grad_norm": 0.01627441049927099, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.82903516, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.11328125, + "step": 3463, + "time_per_iteration": 5.9989097118377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069258, + "balance_loss_mlp": 1.04091001, + "epoch": 0.666410157752982, + "flos": 499875190272.0, + "grad_norm": 0.05970416842123876, + "language_loss": 0.86439729, + "learning_rate": 0.00026452490085933155, + "loss": 0.87508994, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.28393555, + "step": 3464, + "time_per_iteration": 3.074321985244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069725, + "balance_loss_mlp": 1.04099607, + "epoch": 0.6666025394382454, + "flos": 480928402944.0, + "grad_norm": 0.06389669613772958, + "language_loss": 0.89814323, + "learning_rate": 0.00026425011648539614, + "loss": 0.90884054, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.28735352, + "step": 3465, + "time_per_iteration": 3.163724422454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067748, + "balance_loss_mlp": 1.0391376, + "epoch": 0.666794921123509, + "flos": 546395355648.0, + "grad_norm": 0.05866867334399115, + "language_loss": 0.82531869, + "learning_rate": 0.00026397542363768267, + "loss": 0.83599609, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.28588867, + "step": 3466, + "time_per_iteration": 3.15535044670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107073, + "balance_loss_mlp": 1.04202461, + "epoch": 0.6669873028087726, + "flos": 471750372864.0, + "grad_norm": 0.09718909208334105, + "language_loss": 0.81696969, + "learning_rate": 0.0002637008224228362, + "loss": 0.82767701, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.28710938, + "step": 3467, + "time_per_iteration": 3.1590065956115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072782, + "balance_loss_mlp": 1.04467225, + "epoch": 0.6671796844940362, + "flos": 547119912960.0, + "grad_norm": 0.045698097527158366, + "language_loss": 0.84370708, + "learning_rate": 0.00026342631294746653, + "loss": 0.85443497, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.28100586, + "step": 3468, + "time_per_iteration": 3.2474896907806396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106771, + "balance_loss_mlp": 1.03933835, + "epoch": 0.6673720661792998, + "flos": 1069867547136.0, + "grad_norm": 0.048489338364625344, + "language_loss": 0.80841875, + "learning_rate": 0.0002631518953181476, + "loss": 0.81909585, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.28369141, + "step": 3469, + "time_per_iteration": 3.989240884780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020296, + "balance_loss_mlp": 1.00837493, + "epoch": 0.6675644478645633, + "flos": 1522963372032.0, + "grad_norm": 0.017053008774153198, + "language_loss": 0.76325285, + "learning_rate": 0.000262877569641418, + "loss": 0.7734558, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.11914062, + "step": 3470, + "time_per_iteration": 5.7656426429748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079857, + "balance_loss_mlp": 1.05081761, + "epoch": 0.6677568295498268, + "flos": 579410343936.0, + "grad_norm": 0.06105820471136532, + "language_loss": 0.80315661, + "learning_rate": 0.00026260333602377985, + "loss": 0.81395519, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.29003906, + "step": 3471, + "time_per_iteration": 3.3436222076416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072083, + "balance_loss_mlp": 1.04383063, + "epoch": 0.6679492112350904, + "flos": 383722458624.0, + "grad_norm": 0.05421906937668894, + "language_loss": 0.87085468, + "learning_rate": 0.0002623291945717007, + "loss": 0.88157558, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.28271484, + "step": 3472, + "time_per_iteration": 3.1183881759643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071602, + "balance_loss_mlp": 1.04234779, + "epoch": 0.668141592920354, + "flos": 1150297555968.0, + "grad_norm": 0.04666604751333496, + "language_loss": 0.84075844, + "learning_rate": 0.00026205514539161175, + "loss": 0.85147452, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.29248047, + "step": 3473, + "time_per_iteration": 3.790060043334961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072596, + "balance_loss_mlp": 1.04386711, + "epoch": 0.6683339746056175, + "flos": 560804000256.0, + "grad_norm": 0.05776060177542925, + "language_loss": 0.84147954, + "learning_rate": 0.00026178118858990773, + "loss": 0.85220551, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.28686523, + "step": 3474, + "time_per_iteration": 3.4138669967651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071797, + "balance_loss_mlp": 1.04259038, + "epoch": 0.6685263562908811, + "flos": 514051080192.0, + "grad_norm": 0.05528533566381529, + "language_loss": 0.83995008, + "learning_rate": 0.0002615073242729483, + "loss": 0.85066801, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.29223633, + "step": 3475, + "time_per_iteration": 3.199012279510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070975, + "balance_loss_mlp": 1.0421505, + "epoch": 0.6687187379761447, + "flos": 629466107904.0, + "grad_norm": 0.04758123025754447, + "language_loss": 0.84358716, + "learning_rate": 0.0002612335525470573, + "loss": 0.85429692, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.2878418, + "step": 3476, + "time_per_iteration": 3.4972333908081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074194, + "balance_loss_mlp": 1.04572678, + "epoch": 0.6689111196614083, + "flos": 535312723968.0, + "grad_norm": 0.06222514745321995, + "language_loss": 0.78151464, + "learning_rate": 0.0002609598735185221, + "loss": 0.79225659, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.28466797, + "step": 3477, + "time_per_iteration": 3.1121668815612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070595, + "balance_loss_mlp": 1.04186535, + "epoch": 0.6691035013466718, + "flos": 602758471680.0, + "grad_norm": 0.05831077718695847, + "language_loss": 0.83306509, + "learning_rate": 0.00026068628729359445, + "loss": 0.84377104, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.28686523, + "step": 3478, + "time_per_iteration": 3.4748337268829346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075594, + "balance_loss_mlp": 1.04653037, + "epoch": 0.6692958830319353, + "flos": 632539017216.0, + "grad_norm": 0.053072339735848705, + "language_loss": 0.75823909, + "learning_rate": 0.00026041279397848996, + "loss": 0.76899505, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.29003906, + "step": 3479, + "time_per_iteration": 3.3513095378875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071758, + "balance_loss_mlp": 1.04279053, + "epoch": 0.6694882647171989, + "flos": 645153721344.0, + "grad_norm": 0.11523786601732237, + "language_loss": 0.82653117, + "learning_rate": 0.00026013939367938797, + "loss": 0.83724874, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.28930664, + "step": 3480, + "time_per_iteration": 3.341496467590332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070446, + "balance_loss_mlp": 1.0417881, + "epoch": 0.6696806464024625, + "flos": 569292378624.0, + "grad_norm": 0.05240024743638074, + "language_loss": 0.81095958, + "learning_rate": 0.00025986608650243204, + "loss": 0.82166409, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.28613281, + "step": 3481, + "time_per_iteration": 3.534395933151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073143, + "balance_loss_mlp": 1.04417491, + "epoch": 0.6698730280877261, + "flos": 622386146304.0, + "grad_norm": 0.04897639091923761, + "language_loss": 0.79360926, + "learning_rate": 0.0002595928725537293, + "loss": 0.80434066, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.28930664, + "step": 3482, + "time_per_iteration": 3.4163737297058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070599, + "balance_loss_mlp": 1.04179811, + "epoch": 0.6700654097729896, + "flos": 502258447872.0, + "grad_norm": 0.05847572955345742, + "language_loss": 0.88153374, + "learning_rate": 0.0002593197519393509, + "loss": 0.89223981, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.28833008, + "step": 3483, + "time_per_iteration": 3.162363052368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067677, + "balance_loss_mlp": 1.03851843, + "epoch": 0.6702577914582531, + "flos": 623567600640.0, + "grad_norm": 0.04895962963004684, + "language_loss": 0.79643184, + "learning_rate": 0.00025904672476533165, + "loss": 0.80710858, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.29125977, + "step": 3484, + "time_per_iteration": 3.329540967941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073016, + "balance_loss_mlp": 1.0442394, + "epoch": 0.6704501731435167, + "flos": 456033646080.0, + "grad_norm": 0.055271412051917726, + "language_loss": 0.82509005, + "learning_rate": 0.0002587737911376704, + "loss": 0.8358202, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.28759766, + "step": 3485, + "time_per_iteration": 3.2979683876037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106941, + "balance_loss_mlp": 1.04063249, + "epoch": 0.6706425548287803, + "flos": 542973238272.0, + "grad_norm": 0.05525585278416293, + "language_loss": 0.8399781, + "learning_rate": 0.00025850095116232885, + "loss": 0.85067225, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.28759766, + "step": 3486, + "time_per_iteration": 3.26407790184021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069925, + "balance_loss_mlp": 1.04012239, + "epoch": 0.6708349365140439, + "flos": 633631721472.0, + "grad_norm": 0.05884470939634603, + "language_loss": 0.78008693, + "learning_rate": 0.000258228204945233, + "loss": 0.79078615, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.29760742, + "step": 3487, + "time_per_iteration": 3.2713074684143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069596, + "balance_loss_mlp": 1.04122472, + "epoch": 0.6710273181993074, + "flos": 640459989504.0, + "grad_norm": 0.08825995079793632, + "language_loss": 0.84371996, + "learning_rate": 0.00025795555259227254, + "loss": 0.85441601, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.28369141, + "step": 3488, + "time_per_iteration": 3.2798845767974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107088, + "balance_loss_mlp": 1.04253244, + "epoch": 0.671219699884571, + "flos": 553673166336.0, + "grad_norm": 0.04912618775842026, + "language_loss": 0.8368836, + "learning_rate": 0.00025768299420930046, + "loss": 0.84759241, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.28369141, + "step": 3489, + "time_per_iteration": 3.548513174057007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070862, + "balance_loss_mlp": 1.04191756, + "epoch": 0.6714120815698346, + "flos": 731191256064.0, + "grad_norm": 0.0542630721977733, + "language_loss": 0.83150196, + "learning_rate": 0.0002574105299021332, + "loss": 0.84221053, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.28930664, + "step": 3490, + "time_per_iteration": 3.264068365097046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072429, + "balance_loss_mlp": 1.04398608, + "epoch": 0.6716044632550981, + "flos": 688344901632.0, + "grad_norm": 0.04887866872345111, + "language_loss": 0.84103191, + "learning_rate": 0.00025713815977655084, + "loss": 0.85175616, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.28466797, + "step": 3491, + "time_per_iteration": 3.480595827102661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067719, + "balance_loss_mlp": 1.03848863, + "epoch": 0.6717968449403616, + "flos": 460391316480.0, + "grad_norm": 0.061790986714500215, + "language_loss": 0.84740448, + "learning_rate": 0.0002568658839382969, + "loss": 0.8580817, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.29199219, + "step": 3492, + "time_per_iteration": 3.149390935897827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071869, + "balance_loss_mlp": 1.04366422, + "epoch": 0.6719892266256252, + "flos": 501362182656.0, + "grad_norm": 0.060742623870238814, + "language_loss": 0.84422779, + "learning_rate": 0.00025659370249307814, + "loss": 0.85494649, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 0.28198242, + "step": 3493, + "time_per_iteration": 3.043328285217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067893, + "balance_loss_mlp": 1.03840065, + "epoch": 0.6721816083108888, + "flos": 683223386112.0, + "grad_norm": 0.32090754121455606, + "language_loss": 0.85042375, + "learning_rate": 0.00025632161554656473, + "loss": 0.86110264, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.29492188, + "step": 3494, + "time_per_iteration": 3.370725393295288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071555, + "balance_loss_mlp": 1.04256368, + "epoch": 0.6723739899961524, + "flos": 585544578048.0, + "grad_norm": 0.056395041319593345, + "language_loss": 0.82224226, + "learning_rate": 0.00025604962320439017, + "loss": 0.8329578, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.28955078, + "step": 3495, + "time_per_iteration": 3.1383168697357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069781, + "balance_loss_mlp": 1.04155231, + "epoch": 0.672566371681416, + "flos": 506336721408.0, + "grad_norm": 0.05570764429404915, + "language_loss": 0.82211316, + "learning_rate": 0.0002557777255721516, + "loss": 0.832811, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.2824707, + "step": 3496, + "time_per_iteration": 3.2747058868408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073188, + "balance_loss_mlp": 1.0451498, + "epoch": 0.6727587533666795, + "flos": 535405856256.0, + "grad_norm": 0.06368144256739344, + "language_loss": 0.8063888, + "learning_rate": 0.0002555059227554087, + "loss": 0.81712067, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.28027344, + "step": 3497, + "time_per_iteration": 3.241708278656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078052, + "balance_loss_mlp": 1.04920387, + "epoch": 0.672951135051943, + "flos": 602532919296.0, + "grad_norm": 0.05624574913237251, + "language_loss": 0.77828801, + "learning_rate": 0.00025523421485968453, + "loss": 0.78906852, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.28833008, + "step": 3498, + "time_per_iteration": 3.4185025691986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071507, + "balance_loss_mlp": 1.04327822, + "epoch": 0.6731435167372066, + "flos": 810976693248.0, + "grad_norm": 0.05832714819515366, + "language_loss": 0.85479802, + "learning_rate": 0.00025496260199046585, + "loss": 0.86551309, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.28271484, + "step": 3499, + "time_per_iteration": 3.398684501647949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074374, + "balance_loss_mlp": 1.04531085, + "epoch": 0.6733358984224702, + "flos": 611306486784.0, + "grad_norm": 0.0606160593453579, + "language_loss": 0.84417158, + "learning_rate": 0.000254691084253202, + "loss": 0.85491526, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.29052734, + "step": 3500, + "time_per_iteration": 3.204657554626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075309, + "balance_loss_mlp": 1.04641259, + "epoch": 0.6735282801077337, + "flos": 558636120576.0, + "grad_norm": 0.05651280486547688, + "language_loss": 0.7721619, + "learning_rate": 0.00025441966175330567, + "loss": 0.782915, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.2890625, + "step": 3501, + "time_per_iteration": 3.280398368835449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079946, + "balance_loss_mlp": 1.05078757, + "epoch": 0.6737206617929973, + "flos": 672134962176.0, + "grad_norm": 0.09712144532107508, + "language_loss": 0.79372454, + "learning_rate": 0.00025414833459615183, + "loss": 0.804524, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.29174805, + "step": 3502, + "time_per_iteration": 3.221496343612671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079859, + "balance_loss_mlp": 1.0510819, + "epoch": 0.6739130434782609, + "flos": 633148683264.0, + "grad_norm": 0.05864951358988012, + "language_loss": 0.80395651, + "learning_rate": 0.0002538771028870796, + "loss": 0.81475508, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.28759766, + "step": 3503, + "time_per_iteration": 3.3205838203430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075878, + "balance_loss_mlp": 1.04710114, + "epoch": 0.6741054251635245, + "flos": 531171841536.0, + "grad_norm": 0.060463290728931994, + "language_loss": 0.81723624, + "learning_rate": 0.0002536059667313903, + "loss": 0.827995, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.2878418, + "step": 3504, + "time_per_iteration": 3.39898419380188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107324, + "balance_loss_mlp": 1.04415321, + "epoch": 0.674297806848788, + "flos": 542343223296.0, + "grad_norm": 0.056146401144420426, + "language_loss": 0.89261472, + "learning_rate": 0.0002533349262343483, + "loss": 0.90334713, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.29077148, + "step": 3505, + "time_per_iteration": 3.3431026935577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072084, + "balance_loss_mlp": 1.04342639, + "epoch": 0.6744901885340515, + "flos": 463291107840.0, + "grad_norm": 0.0612472301672692, + "language_loss": 0.82005084, + "learning_rate": 0.0002530639815011807, + "loss": 0.83077168, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.28662109, + "step": 3506, + "time_per_iteration": 2.985283374786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070171, + "balance_loss_mlp": 1.04220426, + "epoch": 0.6746825702193151, + "flos": 631533652992.0, + "grad_norm": 0.059607136715137135, + "language_loss": 0.84537947, + "learning_rate": 0.0002527931326370781, + "loss": 0.85608113, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 0.27978516, + "step": 3507, + "time_per_iteration": 3.1282057762145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071183, + "balance_loss_mlp": 1.04271555, + "epoch": 0.6748749519045787, + "flos": 670835644416.0, + "grad_norm": 0.05533021024656612, + "language_loss": 0.82755983, + "learning_rate": 0.00025252237974719276, + "loss": 0.83827162, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.28491211, + "step": 3508, + "time_per_iteration": 3.260610580444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066579, + "balance_loss_mlp": 1.03813529, + "epoch": 0.6750673335898423, + "flos": 766756827648.0, + "grad_norm": 0.05860673503825768, + "language_loss": 0.80004764, + "learning_rate": 0.00025225172293664056, + "loss": 0.81071347, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.28442383, + "step": 3509, + "time_per_iteration": 3.373530864715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017561, + "balance_loss_mlp": 1.00540209, + "epoch": 0.6752597152751059, + "flos": 1511786198016.0, + "grad_norm": 0.014769475443499856, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.77950692, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.12158203, + "step": 3510, + "time_per_iteration": 6.158355951309204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080364, + "balance_loss_mlp": 1.05111003, + "epoch": 0.6754520969603693, + "flos": 686990329344.0, + "grad_norm": 0.06842841117996161, + "language_loss": 0.84400952, + "learning_rate": 0.00025171069797381106, + "loss": 0.8548131, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.29248047, + "step": 3511, + "time_per_iteration": 3.2980220317840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070527, + "balance_loss_mlp": 1.04234552, + "epoch": 0.6756444786456329, + "flos": 500318940672.0, + "grad_norm": 0.0575194424100886, + "language_loss": 0.81909519, + "learning_rate": 0.00025144033003157864, + "loss": 0.82980049, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.28173828, + "step": 3512, + "time_per_iteration": 3.140373706817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071116, + "balance_loss_mlp": 1.04319715, + "epoch": 0.6758368603308965, + "flos": 492357270528.0, + "grad_norm": 0.07351376561683495, + "language_loss": 0.78513837, + "learning_rate": 0.00025117005858876806, + "loss": 0.7958495, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.27978516, + "step": 3513, + "time_per_iteration": 3.3946895599365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070978, + "balance_loss_mlp": 1.04212952, + "epoch": 0.6760292420161601, + "flos": 555657753600.0, + "grad_norm": 0.056817312971520956, + "language_loss": 0.85350752, + "learning_rate": 0.000250899883750308, + "loss": 0.86421728, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.28881836, + "step": 3514, + "time_per_iteration": 3.2081196308135986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071843, + "balance_loss_mlp": 1.04368556, + "epoch": 0.6762216237014236, + "flos": 607322755584.0, + "grad_norm": 0.05856137084704242, + "language_loss": 0.81469542, + "learning_rate": 0.00025062980562109006, + "loss": 0.82541388, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.28173828, + "step": 3515, + "time_per_iteration": 3.234687566757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070317, + "balance_loss_mlp": 1.04268479, + "epoch": 0.6764140053866872, + "flos": 533501254656.0, + "grad_norm": 0.0684742974897707, + "language_loss": 0.8283475, + "learning_rate": 0.0002503598243059677, + "loss": 0.83905065, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.27685547, + "step": 3516, + "time_per_iteration": 3.276319980621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107498, + "balance_loss_mlp": 1.04684663, + "epoch": 0.6766063870719508, + "flos": 504548573184.0, + "grad_norm": 0.05816726448499056, + "language_loss": 0.80307925, + "learning_rate": 0.0002500899399097568, + "loss": 0.81382906, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.28149414, + "step": 3517, + "time_per_iteration": 3.361901044845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073931, + "balance_loss_mlp": 1.0454638, + "epoch": 0.6767987687572143, + "flos": 512923470336.0, + "grad_norm": 0.06530995059631492, + "language_loss": 0.85096681, + "learning_rate": 0.0002498201525372359, + "loss": 0.86170614, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.28491211, + "step": 3518, + "time_per_iteration": 3.10380220413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010719, + "balance_loss_mlp": 1.04421926, + "epoch": 0.6769911504424779, + "flos": 524780121600.0, + "grad_norm": 0.061284941283787836, + "language_loss": 0.83024853, + "learning_rate": 0.00024955046229314584, + "loss": 0.84096754, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.27709961, + "step": 3519, + "time_per_iteration": 3.1552722454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069226, + "balance_loss_mlp": 1.04195142, + "epoch": 0.6771835321277414, + "flos": 449662275072.0, + "grad_norm": 0.06591388650746736, + "language_loss": 0.87507355, + "learning_rate": 0.00024928086928218947, + "loss": 0.88576579, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.27307129, + "step": 3520, + "time_per_iteration": 3.176281452178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073411, + "balance_loss_mlp": 1.04553986, + "epoch": 0.677375913813005, + "flos": 709020200448.0, + "grad_norm": 0.06204053550598198, + "language_loss": 0.76553816, + "learning_rate": 0.00024901137360903216, + "loss": 0.7762723, + "num_input_tokens_seen": 291998752, + "router_z_loss_mlp": 0.27905273, + "step": 3521, + "time_per_iteration": 3.2491977214813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075413, + "balance_loss_mlp": 1.04773283, + "epoch": 0.6775682954982686, + "flos": 428189635584.0, + "grad_norm": 0.06068405228401802, + "language_loss": 0.80714798, + "learning_rate": 0.00024874197537830115, + "loss": 0.81790209, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.27734375, + "step": 3522, + "time_per_iteration": 3.2800705432891846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069929, + "balance_loss_mlp": 1.04258251, + "epoch": 0.6777606771835322, + "flos": 437677585920.0, + "grad_norm": 0.0705299171766763, + "language_loss": 0.83310688, + "learning_rate": 0.00024847267469458684, + "loss": 0.84380615, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.27392578, + "step": 3523, + "time_per_iteration": 3.044410228729248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072093, + "balance_loss_mlp": 1.04400754, + "epoch": 0.6779530588687956, + "flos": 775106993664.0, + "grad_norm": 0.05514098679922032, + "language_loss": 0.77547973, + "learning_rate": 0.00024820347166244034, + "loss": 0.78620064, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.28100586, + "step": 3524, + "time_per_iteration": 3.3789007663726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074799, + "balance_loss_mlp": 1.04697526, + "epoch": 0.6781454405540592, + "flos": 571502518272.0, + "grad_norm": 0.05352508807919392, + "language_loss": 0.84795761, + "learning_rate": 0.0002479343663863755, + "loss": 0.85870552, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.27856445, + "step": 3525, + "time_per_iteration": 3.242717742919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072571, + "balance_loss_mlp": 1.04462886, + "epoch": 0.6783378222393228, + "flos": 484788478464.0, + "grad_norm": 0.06320153638070183, + "language_loss": 0.76689994, + "learning_rate": 0.00024766535897086876, + "loss": 0.77762568, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.27929688, + "step": 3526, + "time_per_iteration": 3.28702712059021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107187, + "balance_loss_mlp": 1.04366529, + "epoch": 0.6785302039245864, + "flos": 482592895488.0, + "grad_norm": 0.06947465366955115, + "language_loss": 0.79284716, + "learning_rate": 0.0002473964495203578, + "loss": 0.80356586, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.28222656, + "step": 3527, + "time_per_iteration": 3.2413079738616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107552, + "balance_loss_mlp": 1.0474577, + "epoch": 0.67872258560985, + "flos": 524451262464.0, + "grad_norm": 0.05313281252101078, + "language_loss": 0.8542428, + "learning_rate": 0.0002471276381392425, + "loss": 0.86499798, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.28076172, + "step": 3528, + "time_per_iteration": 3.3680808544158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033221, + "balance_loss_mlp": 1.02044225, + "epoch": 0.6789149672951135, + "flos": 1551786605568.0, + "grad_norm": 0.015931191486776266, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.79221857, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.12792969, + "step": 3529, + "time_per_iteration": 5.628952741622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069556, + "balance_loss_mlp": 1.04094601, + "epoch": 0.6791073489803771, + "flos": 741088051200.0, + "grad_norm": 0.06736468086197074, + "language_loss": 0.84283829, + "learning_rate": 0.00024659031000260826, + "loss": 0.85353386, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.28588867, + "step": 3530, + "time_per_iteration": 2.8723843097686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069609, + "balance_loss_mlp": 1.04080772, + "epoch": 0.6792997306656406, + "flos": 576095915520.0, + "grad_norm": 0.0688001707056691, + "language_loss": 0.80604416, + "learning_rate": 0.0002463217934556985, + "loss": 0.81674021, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.28808594, + "step": 3531, + "time_per_iteration": 2.7234296798706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102575, + "balance_loss_mlp": 1.01316202, + "epoch": 0.6794921123509042, + "flos": 1502538356736.0, + "grad_norm": 0.012819798724274224, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.77557838, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.12597656, + "step": 3532, + "time_per_iteration": 4.774993181228638 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069098, + "balance_loss_mlp": 1.04098845, + "epoch": 0.6796844940361677, + "flos": 698620018176.0, + "grad_norm": 0.07494627627994242, + "language_loss": 0.83949304, + "learning_rate": 0.0002457850559259306, + "loss": 0.85018402, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.28125, + "step": 3533, + "time_per_iteration": 2.854862928390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069128, + "balance_loss_mlp": 1.04123271, + "epoch": 0.6798768757214313, + "flos": 552496094208.0, + "grad_norm": 0.05955036314433414, + "language_loss": 0.81432045, + "learning_rate": 0.00024551683515145275, + "loss": 0.82501173, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.27905273, + "step": 3534, + "time_per_iteration": 2.662670612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068932, + "balance_loss_mlp": 1.04084659, + "epoch": 0.6800692574066949, + "flos": 522677670912.0, + "grad_norm": 0.05698546166287553, + "language_loss": 0.86435509, + "learning_rate": 0.0002452487131761014, + "loss": 0.87504447, + "num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.28125, + "step": 3535, + "time_per_iteration": 2.7052507400512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068803, + "balance_loss_mlp": 1.0406456, + "epoch": 0.6802616390919585, + "flos": 573747563520.0, + "grad_norm": 0.2007355544417899, + "language_loss": 0.79636157, + "learning_rate": 0.00024498069010397093, + "loss": 0.80704963, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.28198242, + "step": 3536, + "time_per_iteration": 2.6741490364074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073159, + "balance_loss_mlp": 1.04452467, + "epoch": 0.6804540207772221, + "flos": 487915232256.0, + "grad_norm": 0.06175774783534356, + "language_loss": 0.85386938, + "learning_rate": 0.00024471276603911697, + "loss": 0.86460102, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.28613281, + "step": 3537, + "time_per_iteration": 2.582512378692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106927, + "balance_loss_mlp": 1.04049325, + "epoch": 0.6806464024624855, + "flos": 578307465216.0, + "grad_norm": 0.05665258990060116, + "language_loss": 0.79265833, + "learning_rate": 0.0002444449410855572, + "loss": 0.80335104, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.28759766, + "step": 3538, + "time_per_iteration": 2.7172720432281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075887, + "balance_loss_mlp": 1.04689479, + "epoch": 0.6808387841477491, + "flos": 553456378368.0, + "grad_norm": 0.04143612880488866, + "language_loss": 0.84057069, + "learning_rate": 0.00024417721534727033, + "loss": 0.85132951, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.29003906, + "step": 3539, + "time_per_iteration": 2.6684606075286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072025, + "balance_loss_mlp": 1.04322374, + "epoch": 0.6810311658330127, + "flos": 426613893120.0, + "grad_norm": 0.07425691047539493, + "language_loss": 0.82827783, + "learning_rate": 0.00024390958892819687, + "loss": 0.83899808, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.28759766, + "step": 3540, + "time_per_iteration": 2.4658186435699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107288, + "balance_loss_mlp": 1.04481781, + "epoch": 0.6812235475182763, + "flos": 571956443136.0, + "grad_norm": 0.05780068585896815, + "language_loss": 0.80981314, + "learning_rate": 0.0002436420619322381, + "loss": 0.82054192, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.28100586, + "step": 3541, + "time_per_iteration": 2.8231966495513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077487, + "balance_loss_mlp": 1.04835224, + "epoch": 0.6814159292035398, + "flos": 501648781824.0, + "grad_norm": 0.05333594930296874, + "language_loss": 0.82771194, + "learning_rate": 0.0002433746344632577, + "loss": 0.83848679, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.29101562, + "step": 3542, + "time_per_iteration": 2.6959166526794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071587, + "balance_loss_mlp": 1.04259515, + "epoch": 0.6816083108888034, + "flos": 765176702976.0, + "grad_norm": 0.224573626709811, + "language_loss": 0.80137914, + "learning_rate": 0.00024310730662508006, + "loss": 0.81209499, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.28955078, + "step": 3543, + "time_per_iteration": 3.0683388710021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075151, + "balance_loss_mlp": 1.04639745, + "epoch": 0.681800692574067, + "flos": 479205683712.0, + "grad_norm": 0.05641923702729484, + "language_loss": 0.87227619, + "learning_rate": 0.0002428400785214911, + "loss": 0.88302767, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.28759766, + "step": 3544, + "time_per_iteration": 2.602978467941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075917, + "balance_loss_mlp": 1.04830861, + "epoch": 0.6819930742593305, + "flos": 691298537472.0, + "grad_norm": 0.05415791739342902, + "language_loss": 0.82201838, + "learning_rate": 0.00024257295025623794, + "loss": 0.83277762, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.27636719, + "step": 3545, + "time_per_iteration": 2.8973493576049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079854, + "balance_loss_mlp": 1.05074358, + "epoch": 0.6821854559445941, + "flos": 677783185920.0, + "grad_norm": 0.05879535961793021, + "language_loss": 0.8075946, + "learning_rate": 0.00024230592193302892, + "loss": 0.81839317, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.29077148, + "step": 3546, + "time_per_iteration": 2.8674380779266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079529, + "balance_loss_mlp": 1.0514431, + "epoch": 0.6823778376298576, + "flos": 461956884480.0, + "grad_norm": 0.05930658835110869, + "language_loss": 0.84390098, + "learning_rate": 0.00024203899365553372, + "loss": 0.85469627, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.28100586, + "step": 3547, + "time_per_iteration": 2.570162773132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104935, + "balance_loss_mlp": 1.03785849, + "epoch": 0.6825702193151212, + "flos": 1474582427136.0, + "grad_norm": 0.024142362504210636, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.7778371, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.11474609, + "step": 3548, + "time_per_iteration": 4.54862117767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077697, + "balance_loss_mlp": 1.0492295, + "epoch": 0.6827626010003848, + "flos": 722791627776.0, + "grad_norm": 0.05396480474730288, + "language_loss": 0.82952201, + "learning_rate": 0.00024150543765216848, + "loss": 0.84029901, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.28442383, + "step": 3549, + "time_per_iteration": 2.8922061920166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081348, + "balance_loss_mlp": 1.05261874, + "epoch": 0.6829549826856484, + "flos": 558596832768.0, + "grad_norm": 0.08705135979463063, + "language_loss": 0.83172846, + "learning_rate": 0.00024123881013344352, + "loss": 0.84254193, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.28735352, + "step": 3550, + "time_per_iteration": 2.674441337585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081968, + "balance_loss_mlp": 1.05381048, + "epoch": 0.6831473643709118, + "flos": 624635573760.0, + "grad_norm": 0.052816648102186906, + "language_loss": 0.79533482, + "learning_rate": 0.00024097228307472202, + "loss": 0.80615449, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.28173828, + "step": 3551, + "time_per_iteration": 2.810211181640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108367, + "balance_loss_mlp": 1.0561564, + "epoch": 0.6833397460561754, + "flos": 713553960960.0, + "grad_norm": 0.06537057112409075, + "language_loss": 0.82174456, + "learning_rate": 0.00024070585657947846, + "loss": 0.83258128, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.27563477, + "step": 3552, + "time_per_iteration": 2.903355598449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108401, + "balance_loss_mlp": 1.05487537, + "epoch": 0.683532127741439, + "flos": 464449241088.0, + "grad_norm": 0.04571103673496298, + "language_loss": 0.85090339, + "learning_rate": 0.00024043953075114934, + "loss": 0.86174351, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.29150391, + "step": 3553, + "time_per_iteration": 2.683868169784546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085174, + "balance_loss_mlp": 1.05711174, + "epoch": 0.6837245094267026, + "flos": 581979866112.0, + "grad_norm": 0.06261928817671675, + "language_loss": 0.88604438, + "learning_rate": 0.00024017330569313128, + "loss": 0.89689612, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.28051758, + "step": 3554, + "time_per_iteration": 2.7235445976257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085006, + "balance_loss_mlp": 1.05611026, + "epoch": 0.6839168911119662, + "flos": 793836993024.0, + "grad_norm": 0.05900054168258606, + "language_loss": 0.74906945, + "learning_rate": 0.0002399071815087821, + "loss": 0.75991952, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.28857422, + "step": 3555, + "time_per_iteration": 3.0646519660949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085121, + "balance_loss_mlp": 1.05579519, + "epoch": 0.6841092727972297, + "flos": 579734820864.0, + "grad_norm": 0.06151916899658477, + "language_loss": 0.84067833, + "learning_rate": 0.00023964115830142025, + "loss": 0.85152954, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.29321289, + "step": 3556, + "time_per_iteration": 2.670454740524292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086273, + "balance_loss_mlp": 1.05785322, + "epoch": 0.6843016544824932, + "flos": 383530401792.0, + "grad_norm": 0.07044194962998349, + "language_loss": 0.87372839, + "learning_rate": 0.00023937523617432522, + "loss": 0.8845911, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.28393555, + "step": 3557, + "time_per_iteration": 2.442620038986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079062, + "balance_loss_mlp": 1.05073762, + "epoch": 0.6844940361677568, + "flos": 1438474332672.0, + "grad_norm": 0.11887051887526623, + "language_loss": 0.86776745, + "learning_rate": 0.00023910941523073705, + "loss": 0.8785581, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.28320312, + "step": 3558, + "time_per_iteration": 3.9105570316314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080627, + "balance_loss_mlp": 1.05211186, + "epoch": 0.6846864178530204, + "flos": 520614508032.0, + "grad_norm": 0.05794224336416494, + "language_loss": 0.86635411, + "learning_rate": 0.0002388436955738566, + "loss": 0.87716037, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.28540039, + "step": 3559, + "time_per_iteration": 2.7885656356811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010825, + "balance_loss_mlp": 1.05310321, + "epoch": 0.6848787995382839, + "flos": 717626442240.0, + "grad_norm": 0.06653025521174674, + "language_loss": 0.81589997, + "learning_rate": 0.00023857807730684523, + "loss": 0.82672501, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.29394531, + "step": 3560, + "time_per_iteration": 2.8988590240478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082565, + "balance_loss_mlp": 1.05378819, + "epoch": 0.6850711812235475, + "flos": 510787524096.0, + "grad_norm": 0.07668578233950803, + "language_loss": 0.82023144, + "learning_rate": 0.00023831256053282547, + "loss": 0.83105713, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.2878418, + "step": 3561, + "time_per_iteration": 2.644080877304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082938, + "balance_loss_mlp": 1.05380273, + "epoch": 0.6852635629088111, + "flos": 667832546304.0, + "grad_norm": 0.07104594234153103, + "language_loss": 0.78454512, + "learning_rate": 0.00023804714535488003, + "loss": 0.79537451, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.29150391, + "step": 3562, + "time_per_iteration": 2.8966143131256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042929, + "balance_loss_mlp": 1.03124619, + "epoch": 0.6854559445940747, + "flos": 1522136918016.0, + "grad_norm": 0.023182514695526305, + "language_loss": 0.7980963, + "learning_rate": 0.0002377818318760519, + "loss": 0.80852556, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.11669922, + "step": 3563, + "time_per_iteration": 4.932991027832031 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078302, + "balance_loss_mlp": 1.04947758, + "epoch": 0.6856483262793382, + "flos": 453970483200.0, + "grad_norm": 0.05956770996074772, + "language_loss": 0.8101843, + "learning_rate": 0.00023751662019934488, + "loss": 0.82096732, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.2878418, + "step": 3564, + "time_per_iteration": 2.49049711227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080425, + "balance_loss_mlp": 1.05214906, + "epoch": 0.6858407079646017, + "flos": 615269869056.0, + "grad_norm": 0.05086931810535688, + "language_loss": 0.78869629, + "learning_rate": 0.00023725151042772364, + "loss": 0.79950058, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.28271484, + "step": 3565, + "time_per_iteration": 2.7470548152923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079752, + "balance_loss_mlp": 1.04959226, + "epoch": 0.6860330896498653, + "flos": 465793638912.0, + "grad_norm": 0.07206608311036458, + "language_loss": 0.83451784, + "learning_rate": 0.00023698650266411276, + "loss": 0.8453154, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.30102539, + "step": 3566, + "time_per_iteration": 2.6310577392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079469, + "balance_loss_mlp": 1.04949975, + "epoch": 0.6862254713351289, + "flos": 863879814144.0, + "grad_norm": 0.05434580355598899, + "language_loss": 0.83292013, + "learning_rate": 0.00023672159701139755, + "loss": 0.84371483, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.29931641, + "step": 3567, + "time_per_iteration": 3.2131478786468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081005, + "balance_loss_mlp": 1.05160773, + "epoch": 0.6864178530203925, + "flos": 446905078272.0, + "grad_norm": 0.11905493017863943, + "language_loss": 0.8579241, + "learning_rate": 0.00023645679357242296, + "loss": 0.86873412, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.29370117, + "step": 3568, + "time_per_iteration": 2.536799192428589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079259, + "balance_loss_mlp": 1.04881263, + "epoch": 0.6866102347056561, + "flos": 424034196480.0, + "grad_norm": 0.0572051056650869, + "language_loss": 0.83415657, + "learning_rate": 0.00023619209244999534, + "loss": 0.84494913, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.30395508, + "step": 3569, + "time_per_iteration": 2.6000583171844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071372, + "balance_loss_mlp": 1.0414027, + "epoch": 0.6868026163909196, + "flos": 472134486528.0, + "grad_norm": 0.07852810593031194, + "language_loss": 0.84651816, + "learning_rate": 0.0002359274937468806, + "loss": 0.85723186, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.29931641, + "step": 3570, + "time_per_iteration": 2.57413387298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075523, + "balance_loss_mlp": 1.04479098, + "epoch": 0.6869949980761831, + "flos": 463937089536.0, + "grad_norm": 0.05388106388486604, + "language_loss": 0.77385354, + "learning_rate": 0.00023566299756580512, + "loss": 0.78460878, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.30688477, + "step": 3571, + "time_per_iteration": 2.6366066932678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076128, + "balance_loss_mlp": 1.04491949, + "epoch": 0.6871873797614467, + "flos": 426012991488.0, + "grad_norm": 0.07115585873088184, + "language_loss": 0.78295314, + "learning_rate": 0.0002353986040094551, + "loss": 0.79371446, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 0.31176758, + "step": 3572, + "time_per_iteration": 2.503833532333374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070084, + "balance_loss_mlp": 1.03882694, + "epoch": 0.6873797614467103, + "flos": 443394210816.0, + "grad_norm": 0.06984885351733894, + "language_loss": 0.79368085, + "learning_rate": 0.00023513431318047796, + "loss": 0.80438167, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.31225586, + "step": 3573, + "time_per_iteration": 2.568976879119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107429, + "balance_loss_mlp": 1.04293847, + "epoch": 0.6875721431319738, + "flos": 991927074816.0, + "grad_norm": 0.060417226210131056, + "language_loss": 0.76676512, + "learning_rate": 0.00023487012518147977, + "loss": 0.77750802, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.31323242, + "step": 3574, + "time_per_iteration": 3.229848861694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069454, + "balance_loss_mlp": 1.03836417, + "epoch": 0.6877645248172374, + "flos": 1285031900160.0, + "grad_norm": 0.06028735388663287, + "language_loss": 0.84485316, + "learning_rate": 0.00023460604011502772, + "loss": 0.85554767, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.31054688, + "step": 3575, + "time_per_iteration": 3.6276612281799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068189, + "balance_loss_mlp": 1.03640747, + "epoch": 0.687956906502501, + "flos": 876360688128.0, + "grad_norm": 0.059284706265635014, + "language_loss": 0.85573983, + "learning_rate": 0.00023434205808364845, + "loss": 0.8664217, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.31762695, + "step": 3576, + "time_per_iteration": 3.154609203338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073627, + "balance_loss_mlp": 1.04146445, + "epoch": 0.6881492881877646, + "flos": 563038871040.0, + "grad_norm": 0.06862311945477588, + "language_loss": 0.85635597, + "learning_rate": 0.00023407817918982932, + "loss": 0.86709225, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.3215332, + "step": 3577, + "time_per_iteration": 2.770382881164551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065226, + "balance_loss_mlp": 1.03480327, + "epoch": 0.6883416698730281, + "flos": 794782720512.0, + "grad_norm": 0.05501523594648703, + "language_loss": 0.78652638, + "learning_rate": 0.00023381440353601718, + "loss": 0.79717863, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.30371094, + "step": 3578, + "time_per_iteration": 3.0038936138153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068554, + "balance_loss_mlp": 1.03674912, + "epoch": 0.6885340515582916, + "flos": 723308161536.0, + "grad_norm": 0.07314782332090318, + "language_loss": 0.85671222, + "learning_rate": 0.00023355073122461822, + "loss": 0.86739773, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.31787109, + "step": 3579, + "time_per_iteration": 2.901097059249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068864, + "balance_loss_mlp": 1.03798902, + "epoch": 0.6887264332435552, + "flos": 1010529036288.0, + "grad_norm": 0.05988205540841198, + "language_loss": 0.82838941, + "learning_rate": 0.00023328716235799973, + "loss": 0.83907801, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.30834961, + "step": 3580, + "time_per_iteration": 3.3144712448120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068509, + "balance_loss_mlp": 1.03734803, + "epoch": 0.6889188149288188, + "flos": 584993138688.0, + "grad_norm": 0.05209228569629584, + "language_loss": 0.83578706, + "learning_rate": 0.00023302369703848803, + "loss": 0.84647214, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.3112793, + "step": 3581, + "time_per_iteration": 2.7352983951568604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072888, + "balance_loss_mlp": 1.04153562, + "epoch": 0.6891111966140824, + "flos": 635831686656.0, + "grad_norm": 0.06738914955836864, + "language_loss": 0.80107218, + "learning_rate": 0.00023276033536836937, + "loss": 0.81180108, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.31323242, + "step": 3582, + "time_per_iteration": 2.8315579891204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069685, + "balance_loss_mlp": 1.03849971, + "epoch": 0.6893035782993459, + "flos": 495011160576.0, + "grad_norm": 0.07822330365866909, + "language_loss": 0.84485823, + "learning_rate": 0.00023249707744988984, + "loss": 0.85555506, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.31176758, + "step": 3583, + "time_per_iteration": 2.6693801879882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070485, + "balance_loss_mlp": 1.03927565, + "epoch": 0.6894959599846094, + "flos": 457983327744.0, + "grad_norm": 0.09035135761218806, + "language_loss": 0.82157326, + "learning_rate": 0.00023223392338525529, + "loss": 0.83227813, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.31176758, + "step": 3584, + "time_per_iteration": 2.6018331050872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071087, + "balance_loss_mlp": 1.03997374, + "epoch": 0.689688341669873, + "flos": 504740630016.0, + "grad_norm": 0.07744993578546541, + "language_loss": 0.78292501, + "learning_rate": 0.00023197087327663107, + "loss": 0.79363585, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.31079102, + "step": 3585, + "time_per_iteration": 2.6550607681274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073164, + "balance_loss_mlp": 1.04259896, + "epoch": 0.6898807233551366, + "flos": 763584993792.0, + "grad_norm": 0.06125478015545225, + "language_loss": 0.80901551, + "learning_rate": 0.00023170792722614243, + "loss": 0.81974715, + "num_input_tokens_seen": 297338352, + "router_z_loss_mlp": 0.30541992, + "step": 3586, + "time_per_iteration": 2.9460513591766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071475, + "balance_loss_mlp": 1.04057574, + "epoch": 0.6900731050404002, + "flos": 583030310400.0, + "grad_norm": 0.05047941445610664, + "language_loss": 0.83664584, + "learning_rate": 0.00023144508533587377, + "loss": 0.84736061, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.30859375, + "step": 3587, + "time_per_iteration": 2.856055498123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073863, + "balance_loss_mlp": 1.04320216, + "epoch": 0.6902654867256637, + "flos": 711531495936.0, + "grad_norm": 0.06477764746614291, + "language_loss": 0.78527439, + "learning_rate": 0.0002311823477078698, + "loss": 0.796013, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.30615234, + "step": 3588, + "time_per_iteration": 3.003086805343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075947, + "balance_loss_mlp": 1.04569197, + "epoch": 0.6904578684109273, + "flos": 596816294400.0, + "grad_norm": 0.08587382139418309, + "language_loss": 0.8476119, + "learning_rate": 0.00023091971444413428, + "loss": 0.85837138, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.30224609, + "step": 3589, + "time_per_iteration": 2.81282114982605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080015, + "balance_loss_mlp": 1.04909205, + "epoch": 0.6906502500961909, + "flos": 584757411840.0, + "grad_norm": 0.06247314370450002, + "language_loss": 0.82250512, + "learning_rate": 0.00023065718564663012, + "loss": 0.83330524, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.30883789, + "step": 3590, + "time_per_iteration": 2.7536580562591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038243, + "balance_loss_mlp": 1.02656031, + "epoch": 0.6908426317814544, + "flos": 1587001559040.0, + "grad_norm": 0.017663884765429294, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.74949831, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.11669922, + "step": 3591, + "time_per_iteration": 4.997710704803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076225, + "balance_loss_mlp": 1.04732895, + "epoch": 0.6910350134667179, + "flos": 500525554176.0, + "grad_norm": 0.06051074258589463, + "language_loss": 0.80712819, + "learning_rate": 0.0002301324418579666, + "loss": 0.81789041, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.28881836, + "step": 3592, + "time_per_iteration": 2.6742522716522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034728, + "balance_loss_mlp": 1.02309299, + "epoch": 0.6912273951519815, + "flos": 1408462138368.0, + "grad_norm": 0.018187638305653092, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.79723203, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.11621094, + "step": 3593, + "time_per_iteration": 4.769122123718262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077785, + "balance_loss_mlp": 1.04865015, + "epoch": 0.6914197768372451, + "flos": 634961562624.0, + "grad_norm": 0.06768771188848043, + "language_loss": 0.80975646, + "learning_rate": 0.00022960811715677415, + "loss": 0.82053435, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.29101562, + "step": 3594, + "time_per_iteration": 2.8826262950897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073934, + "balance_loss_mlp": 1.04472804, + "epoch": 0.6916121585225087, + "flos": 557755822080.0, + "grad_norm": 0.06319085560184597, + "language_loss": 0.81575662, + "learning_rate": 0.00022934611221845608, + "loss": 0.82649601, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.29150391, + "step": 3595, + "time_per_iteration": 2.8295226097106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076251, + "balance_loss_mlp": 1.04663992, + "epoch": 0.6918045402077723, + "flos": 528887508480.0, + "grad_norm": 0.06812021191327418, + "language_loss": 0.7816391, + "learning_rate": 0.00022908421235729609, + "loss": 0.79240167, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.29589844, + "step": 3596, + "time_per_iteration": 2.6967883110046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072978, + "balance_loss_mlp": 1.04343832, + "epoch": 0.6919969218930357, + "flos": 570083927040.0, + "grad_norm": 0.05588162703096273, + "language_loss": 0.85190284, + "learning_rate": 0.0002288224176749728, + "loss": 0.86263263, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.29492188, + "step": 3597, + "time_per_iteration": 2.640408515930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076769, + "balance_loss_mlp": 1.04775333, + "epoch": 0.6921893035782993, + "flos": 683006598144.0, + "grad_norm": 0.0641823490668264, + "language_loss": 0.78313982, + "learning_rate": 0.00022856072827312385, + "loss": 0.79390752, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.28979492, + "step": 3598, + "time_per_iteration": 2.840587854385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072445, + "balance_loss_mlp": 1.0432148, + "epoch": 0.6923816852635629, + "flos": 546484105728.0, + "grad_norm": 0.07324523845521881, + "language_loss": 0.76861233, + "learning_rate": 0.00022829914425334598, + "loss": 0.77933681, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.29223633, + "step": 3599, + "time_per_iteration": 2.6705574989318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068561, + "balance_loss_mlp": 1.03871107, + "epoch": 0.6925740669488265, + "flos": 509782159872.0, + "grad_norm": 0.06707330247170458, + "language_loss": 0.80270433, + "learning_rate": 0.0002280376657171956, + "loss": 0.8133899, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.2980957, + "step": 3600, + "time_per_iteration": 2.691218852996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070739, + "balance_loss_mlp": 1.04091287, + "epoch": 0.69276644863409, + "flos": 869053764096.0, + "grad_norm": 0.05961595039117338, + "language_loss": 0.76559889, + "learning_rate": 0.00022777629276618706, + "loss": 0.77630627, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.2980957, + "step": 3601, + "time_per_iteration": 3.166266679763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073223, + "balance_loss_mlp": 1.0433017, + "epoch": 0.6929588303193536, + "flos": 625486758912.0, + "grad_norm": 0.05590734740319096, + "language_loss": 0.7759192, + "learning_rate": 0.0002275150255017947, + "loss": 0.78665143, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.29882812, + "step": 3602, + "time_per_iteration": 2.8251349925994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018234, + "balance_loss_mlp": 1.00593138, + "epoch": 0.6931512120046172, + "flos": 1544530553856.0, + "grad_norm": 0.021195340578823645, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76750904, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.12304688, + "step": 3603, + "time_per_iteration": 4.9793617725372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015265, + "balance_loss_mlp": 1.00286758, + "epoch": 0.6933435936898807, + "flos": 1447460001792.0, + "grad_norm": 0.02110962500083285, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.76142371, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.12353516, + "step": 3604, + "time_per_iteration": 4.700538873672485 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072674, + "balance_loss_mlp": 1.04251432, + "epoch": 0.6935359753751443, + "flos": 540639442944.0, + "grad_norm": 0.0788112373404933, + "language_loss": 0.8439424, + "learning_rate": 0.0002267318588424379, + "loss": 0.85466921, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.30151367, + "step": 3605, + "time_per_iteration": 2.595975160598755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067214, + "balance_loss_mlp": 1.03688753, + "epoch": 0.6937283570604078, + "flos": 719074146816.0, + "grad_norm": 0.060784014113104926, + "language_loss": 0.87543291, + "learning_rate": 0.00022647101533842845, + "loss": 0.88610506, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.30297852, + "step": 3606, + "time_per_iteration": 2.8924877643585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072933, + "balance_loss_mlp": 1.04255819, + "epoch": 0.6939207387456714, + "flos": 521909443584.0, + "grad_norm": 0.06196096561897257, + "language_loss": 0.76276547, + "learning_rate": 0.00022621027802778872, + "loss": 0.77349472, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.30322266, + "step": 3607, + "time_per_iteration": 2.625544309616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064019, + "balance_loss_mlp": 1.03402638, + "epoch": 0.694113120430935, + "flos": 535100318208.0, + "grad_norm": 0.05568531242453984, + "language_loss": 0.78539741, + "learning_rate": 0.00022594964701174586, + "loss": 0.79603761, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.29956055, + "step": 3608, + "time_per_iteration": 2.617882490158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073585, + "balance_loss_mlp": 1.04363918, + "epoch": 0.6943055021161986, + "flos": 523101072384.0, + "grad_norm": 0.06276821144872391, + "language_loss": 0.84534574, + "learning_rate": 0.00022568912239148586, + "loss": 0.8560816, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.29882812, + "step": 3609, + "time_per_iteration": 2.6177947521209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068336, + "balance_loss_mlp": 1.03836668, + "epoch": 0.694497883801462, + "flos": 484637119488.0, + "grad_norm": 0.056081647762310796, + "language_loss": 0.81555855, + "learning_rate": 0.00022542870426815344, + "loss": 0.82624191, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.29907227, + "step": 3610, + "time_per_iteration": 2.7079262733459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065817, + "balance_loss_mlp": 1.03646755, + "epoch": 0.6946902654867256, + "flos": 461238119424.0, + "grad_norm": 0.0593152321810988, + "language_loss": 0.85921854, + "learning_rate": 0.00022516839274285173, + "loss": 0.86987674, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.29321289, + "step": 3611, + "time_per_iteration": 2.5142312049865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068225, + "balance_loss_mlp": 1.03689671, + "epoch": 0.6948826471719892, + "flos": 512603375616.0, + "grad_norm": 0.07495855617451591, + "language_loss": 0.75130123, + "learning_rate": 0.00022490818791664265, + "loss": 0.76198351, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.31298828, + "step": 3612, + "time_per_iteration": 2.6149849891662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067927, + "balance_loss_mlp": 1.03771973, + "epoch": 0.6950750288572528, + "flos": 556917783552.0, + "grad_norm": 0.05072032327743767, + "language_loss": 0.85225737, + "learning_rate": 0.00022464808989054676, + "loss": 0.86293662, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.30151367, + "step": 3613, + "time_per_iteration": 2.6458423137664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062852, + "balance_loss_mlp": 1.03331208, + "epoch": 0.6952674105425164, + "flos": 542215185408.0, + "grad_norm": 0.07224132209133893, + "language_loss": 0.76020145, + "learning_rate": 0.00022438809876554284, + "loss": 0.77082992, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.29516602, + "step": 3614, + "time_per_iteration": 2.6633236408233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106639, + "balance_loss_mlp": 1.03720808, + "epoch": 0.6954597922277799, + "flos": 546465166848.0, + "grad_norm": 0.05675110425477687, + "language_loss": 0.80015868, + "learning_rate": 0.00022412821464256873, + "loss": 0.81082261, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.29174805, + "step": 3615, + "time_per_iteration": 2.726789712905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063431, + "balance_loss_mlp": 1.03396273, + "epoch": 0.6956521739130435, + "flos": 519255553536.0, + "grad_norm": 0.06271109335257424, + "language_loss": 0.82397133, + "learning_rate": 0.00022386843762252023, + "loss": 0.83460569, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.29418945, + "step": 3616, + "time_per_iteration": 2.6123175621032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106886, + "balance_loss_mlp": 1.03781807, + "epoch": 0.695844555598307, + "flos": 466029365760.0, + "grad_norm": 0.06387852157141136, + "language_loss": 0.79405069, + "learning_rate": 0.00022360876780625193, + "loss": 0.8047393, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.31030273, + "step": 3617, + "time_per_iteration": 2.548015832901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067668, + "balance_loss_mlp": 1.03798532, + "epoch": 0.6960369372835706, + "flos": 600347510784.0, + "grad_norm": 0.0476690799196669, + "language_loss": 0.7988438, + "learning_rate": 0.00022334920529457604, + "loss": 0.80952054, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.296875, + "step": 3618, + "time_per_iteration": 2.899250030517578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066336, + "balance_loss_mlp": 1.0357945, + "epoch": 0.6962293189688342, + "flos": 643927186944.0, + "grad_norm": 0.054798101167174096, + "language_loss": 0.87429041, + "learning_rate": 0.00022308975018826423, + "loss": 0.88495374, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.30517578, + "step": 3619, + "time_per_iteration": 2.96332049369812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070043, + "balance_loss_mlp": 1.04016924, + "epoch": 0.6964217006540977, + "flos": 638524864512.0, + "grad_norm": 0.06421164682139191, + "language_loss": 0.85025704, + "learning_rate": 0.00022283040258804564, + "loss": 0.86095744, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.29858398, + "step": 3620, + "time_per_iteration": 2.7818944454193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067101, + "balance_loss_mlp": 1.03703606, + "epoch": 0.6966140823393613, + "flos": 651864125952.0, + "grad_norm": 0.06644285191513807, + "language_loss": 0.83246511, + "learning_rate": 0.00022257116259460802, + "loss": 0.84313607, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.30004883, + "step": 3621, + "time_per_iteration": 2.870532989501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068386, + "balance_loss_mlp": 1.03901291, + "epoch": 0.6968064640246249, + "flos": 704160552960.0, + "grad_norm": 0.06921875901681852, + "language_loss": 0.81326395, + "learning_rate": 0.00022231203030859725, + "loss": 0.82394779, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.29321289, + "step": 3622, + "time_per_iteration": 2.980616807937622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069183, + "balance_loss_mlp": 1.03923714, + "epoch": 0.6969988457098885, + "flos": 492312190464.0, + "grad_norm": 0.06079999883636956, + "language_loss": 0.83173907, + "learning_rate": 0.00022205300583061737, + "loss": 0.84243095, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.29882812, + "step": 3623, + "time_per_iteration": 2.579345226287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040478, + "balance_loss_mlp": 1.02855718, + "epoch": 0.6971912273951519, + "flos": 1351839974400.0, + "grad_norm": 0.01990235236243219, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83878684, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.11914062, + "step": 3624, + "time_per_iteration": 4.92698335647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106745, + "balance_loss_mlp": 1.03705204, + "epoch": 0.6973836090804155, + "flos": 602182301184.0, + "grad_norm": 0.06709425474580019, + "language_loss": 0.77051836, + "learning_rate": 0.00022153528070095735, + "loss": 0.7811929, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.3034668, + "step": 3625, + "time_per_iteration": 2.732236385345459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072165, + "balance_loss_mlp": 1.04262519, + "epoch": 0.6975759907656791, + "flos": 523805280768.0, + "grad_norm": 0.06819853082306866, + "language_loss": 0.88156587, + "learning_rate": 0.00022127658025027568, + "loss": 0.89228755, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.29516602, + "step": 3626, + "time_per_iteration": 2.6894659996032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072819, + "balance_loss_mlp": 1.04275477, + "epoch": 0.6977683724509427, + "flos": 480672327168.0, + "grad_norm": 0.06462671043275556, + "language_loss": 0.84997016, + "learning_rate": 0.00022101798800962258, + "loss": 0.8606984, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.30004883, + "step": 3627, + "time_per_iteration": 2.578765392303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067981, + "balance_loss_mlp": 1.03732049, + "epoch": 0.6979607541362063, + "flos": 522372132864.0, + "grad_norm": 0.07388726632037217, + "language_loss": 0.7899543, + "learning_rate": 0.00022075950407939227, + "loss": 0.80063409, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.30639648, + "step": 3628, + "time_per_iteration": 2.615227699279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074402, + "balance_loss_mlp": 1.04519582, + "epoch": 0.6981531358214698, + "flos": 547818329088.0, + "grad_norm": 0.07136749331855524, + "language_loss": 0.82724559, + "learning_rate": 0.0002205011285599367, + "loss": 0.83798957, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.29150391, + "step": 3629, + "time_per_iteration": 2.623537063598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068631, + "balance_loss_mlp": 1.0383997, + "epoch": 0.6983455175067333, + "flos": 699747628032.0, + "grad_norm": 0.053682643938984226, + "language_loss": 0.80428958, + "learning_rate": 0.00022024286155156658, + "loss": 0.81497598, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.30224609, + "step": 3630, + "time_per_iteration": 2.8577961921691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074555, + "balance_loss_mlp": 1.04472852, + "epoch": 0.6985378991919969, + "flos": 484819001856.0, + "grad_norm": 0.05341661710184385, + "language_loss": 0.85616398, + "learning_rate": 0.00021998470315454994, + "loss": 0.8669095, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.2980957, + "step": 3631, + "time_per_iteration": 2.6452653408050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068304, + "balance_loss_mlp": 1.03902662, + "epoch": 0.6987302808772605, + "flos": 558503700480.0, + "grad_norm": 0.06182978984642289, + "language_loss": 0.86509019, + "learning_rate": 0.00021972665346911275, + "loss": 0.87577331, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.29296875, + "step": 3632, + "time_per_iteration": 2.7207632064819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073072, + "balance_loss_mlp": 1.04400849, + "epoch": 0.698922662562524, + "flos": 483350948352.0, + "grad_norm": 0.05617398494873169, + "language_loss": 0.79707497, + "learning_rate": 0.00021946871259543877, + "loss": 0.80780566, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.29052734, + "step": 3633, + "time_per_iteration": 2.574397325515747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073801, + "balance_loss_mlp": 1.04488051, + "epoch": 0.6991150442477876, + "flos": 718586726400.0, + "grad_norm": 0.05654795894092567, + "language_loss": 0.83115089, + "learning_rate": 0.00021921088063366957, + "loss": 0.8418889, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.28930664, + "step": 3634, + "time_per_iteration": 2.9441816806793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073179, + "balance_loss_mlp": 1.04452109, + "epoch": 0.6993074259330512, + "flos": 488871134208.0, + "grad_norm": 0.05955924970323312, + "language_loss": 0.8162455, + "learning_rate": 0.00021895315768390435, + "loss": 0.82697725, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.28662109, + "step": 3635, + "time_per_iteration": 2.62445068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107784, + "balance_loss_mlp": 1.04932475, + "epoch": 0.6994998076183148, + "flos": 717745715712.0, + "grad_norm": 0.054016227636185014, + "language_loss": 0.88036686, + "learning_rate": 0.00021869554384619999, + "loss": 0.89114523, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.28491211, + "step": 3636, + "time_per_iteration": 3.0029518604278564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107865, + "balance_loss_mlp": 1.05037308, + "epoch": 0.6996921893035783, + "flos": 578730866688.0, + "grad_norm": 0.06391776997203466, + "language_loss": 0.80659258, + "learning_rate": 0.00021843803922057115, + "loss": 0.81737912, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 0.28295898, + "step": 3637, + "time_per_iteration": 2.7211790084838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107883, + "balance_loss_mlp": 1.05110145, + "epoch": 0.6998845709888418, + "flos": 518369462784.0, + "grad_norm": 0.0662212795858457, + "language_loss": 0.81642038, + "learning_rate": 0.00021818064390698977, + "loss": 0.82720864, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.27758789, + "step": 3638, + "time_per_iteration": 2.5884149074554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081934, + "balance_loss_mlp": 1.05303788, + "epoch": 0.7000769526741054, + "flos": 620666399232.0, + "grad_norm": 0.06374773426861974, + "language_loss": 0.86868232, + "learning_rate": 0.0002179233580053861, + "loss": 0.8795017, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.2890625, + "step": 3639, + "time_per_iteration": 2.753732681274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076492, + "balance_loss_mlp": 1.04776227, + "epoch": 0.700269334359369, + "flos": 559670598144.0, + "grad_norm": 0.059265612347706345, + "language_loss": 0.85829276, + "learning_rate": 0.00021766618161564688, + "loss": 0.86905766, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.28710938, + "step": 3640, + "time_per_iteration": 2.7745206356048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084226, + "balance_loss_mlp": 1.05575871, + "epoch": 0.7004617160446326, + "flos": 483090490368.0, + "grad_norm": 0.15690200420977896, + "language_loss": 0.87115562, + "learning_rate": 0.00021740911483761677, + "loss": 0.88199788, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.28417969, + "step": 3641, + "time_per_iteration": 2.563645362854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080559, + "balance_loss_mlp": 1.05292678, + "epoch": 0.7006540977298961, + "flos": 696647015424.0, + "grad_norm": 0.051778810892446146, + "language_loss": 0.92034602, + "learning_rate": 0.00021715215777109837, + "loss": 0.93115163, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.27685547, + "step": 3642, + "time_per_iteration": 2.9448609352111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082689, + "balance_loss_mlp": 1.05481815, + "epoch": 0.7008464794151597, + "flos": 504528224256.0, + "grad_norm": 0.0649670876424198, + "language_loss": 0.84332794, + "learning_rate": 0.00021689531051585103, + "loss": 0.85415483, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.27905273, + "step": 3643, + "time_per_iteration": 2.5947420597076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080325, + "balance_loss_mlp": 1.05185759, + "epoch": 0.7010388611004232, + "flos": 536985980928.0, + "grad_norm": 0.05881899099988506, + "language_loss": 0.80633974, + "learning_rate": 0.00021663857317159196, + "loss": 0.81714302, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.28466797, + "step": 3644, + "time_per_iteration": 2.6077582836151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084664, + "balance_loss_mlp": 1.0568645, + "epoch": 0.7012312427856868, + "flos": 546996257280.0, + "grad_norm": 0.05176536936587348, + "language_loss": 0.81858003, + "learning_rate": 0.00021638194583799487, + "loss": 0.82942665, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.27832031, + "step": 3645, + "time_per_iteration": 2.661813735961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081277, + "balance_loss_mlp": 1.05335796, + "epoch": 0.7014236244709504, + "flos": 941020125696.0, + "grad_norm": 0.06125341159179279, + "language_loss": 0.82837009, + "learning_rate": 0.00021612542861469176, + "loss": 0.83918285, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.27954102, + "step": 3646, + "time_per_iteration": 3.218862771987915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086908, + "balance_loss_mlp": 1.05860782, + "epoch": 0.7016160061562139, + "flos": 524908159488.0, + "grad_norm": 0.06205257588419687, + "language_loss": 0.82430637, + "learning_rate": 0.00021586902160127135, + "loss": 0.83517551, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.28271484, + "step": 3647, + "time_per_iteration": 2.5945966243743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087682, + "balance_loss_mlp": 1.05938208, + "epoch": 0.7018083878414775, + "flos": 373170917376.0, + "grad_norm": 0.07384041678105348, + "language_loss": 0.74226022, + "learning_rate": 0.00021561272489727974, + "loss": 0.75313699, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.28320312, + "step": 3648, + "time_per_iteration": 2.423347234725952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090091, + "balance_loss_mlp": 1.06241107, + "epoch": 0.7020007695267411, + "flos": 527522761728.0, + "grad_norm": 0.0540045704658738, + "language_loss": 0.80522048, + "learning_rate": 0.0002153565386022199, + "loss": 0.8161214, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.27734375, + "step": 3649, + "time_per_iteration": 2.634904623031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089135, + "balance_loss_mlp": 1.06112039, + "epoch": 0.7021931512120047, + "flos": 689850832896.0, + "grad_norm": 0.1599503630973746, + "language_loss": 0.8250525, + "learning_rate": 0.00021510046281555262, + "loss": 0.83594382, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.28027344, + "step": 3650, + "time_per_iteration": 2.824385643005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087214, + "balance_loss_mlp": 1.05922353, + "epoch": 0.7023855328972681, + "flos": 639499705344.0, + "grad_norm": 0.06982952600277435, + "language_loss": 0.81099337, + "learning_rate": 0.0002148444976366949, + "loss": 0.82186544, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 0.27978516, + "step": 3651, + "time_per_iteration": 2.7480077743530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090659, + "balance_loss_mlp": 1.06297851, + "epoch": 0.7025779145825317, + "flos": 560674552320.0, + "grad_norm": 0.06340286287585739, + "language_loss": 0.82626015, + "learning_rate": 0.00021458864316502136, + "loss": 0.83716673, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.27734375, + "step": 3652, + "time_per_iteration": 2.699397087097168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085576, + "balance_loss_mlp": 1.0581578, + "epoch": 0.7027702962677953, + "flos": 447214998528.0, + "grad_norm": 0.06356802688225487, + "language_loss": 0.87087834, + "learning_rate": 0.0002143328994998634, + "loss": 0.88173407, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.2746582, + "step": 3653, + "time_per_iteration": 2.4910500049591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108223, + "balance_loss_mlp": 1.05347681, + "epoch": 0.7029626779530589, + "flos": 622198471680.0, + "grad_norm": 0.1133092603860293, + "language_loss": 0.78451055, + "learning_rate": 0.00021407726674050982, + "loss": 0.79533285, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.28735352, + "step": 3654, + "time_per_iteration": 2.8789288997650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086422, + "balance_loss_mlp": 1.0578599, + "epoch": 0.7031550596383225, + "flos": 629307546624.0, + "grad_norm": 0.054147023301355804, + "language_loss": 0.86789209, + "learning_rate": 0.0002138217449862061, + "loss": 0.87875628, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.28540039, + "step": 3655, + "time_per_iteration": 2.7385337352752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108677, + "balance_loss_mlp": 1.05932784, + "epoch": 0.703347441323586, + "flos": 530589878784.0, + "grad_norm": 0.06738898601128132, + "language_loss": 0.78017962, + "learning_rate": 0.00021356633433615403, + "loss": 0.79104733, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.2746582, + "step": 3656, + "time_per_iteration": 2.5828328132629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086039, + "balance_loss_mlp": 1.05778599, + "epoch": 0.7035398230088495, + "flos": 693264185856.0, + "grad_norm": 0.05385272242156959, + "language_loss": 0.83434522, + "learning_rate": 0.0002133110348895133, + "loss": 0.84520566, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.28271484, + "step": 3657, + "time_per_iteration": 2.978156805038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081393, + "balance_loss_mlp": 1.05316448, + "epoch": 0.7037322046941131, + "flos": 967628837376.0, + "grad_norm": 0.05837559854624073, + "language_loss": 0.84898746, + "learning_rate": 0.0002130558467453999, + "loss": 0.85980141, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.28198242, + "step": 3658, + "time_per_iteration": 3.3442087173461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087911, + "balance_loss_mlp": 1.05875289, + "epoch": 0.7039245863793767, + "flos": 502598891520.0, + "grad_norm": 0.19942638133943547, + "language_loss": 0.84606349, + "learning_rate": 0.0002128007700028865, + "loss": 0.85694265, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.29125977, + "step": 3659, + "time_per_iteration": 2.742828607559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088765, + "balance_loss_mlp": 1.06072712, + "epoch": 0.7041169680646402, + "flos": 465709271040.0, + "grad_norm": 0.06314927243304276, + "language_loss": 0.84402716, + "learning_rate": 0.00021254580476100276, + "loss": 0.85491478, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.28051758, + "step": 3660, + "time_per_iteration": 2.565272569656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087079, + "balance_loss_mlp": 1.0595659, + "epoch": 0.7043093497499038, + "flos": 631897417728.0, + "grad_norm": 0.06296941062799823, + "language_loss": 0.78639442, + "learning_rate": 0.00021229095111873497, + "loss": 0.79726517, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.27539062, + "step": 3661, + "time_per_iteration": 2.842556953430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088789, + "balance_loss_mlp": 1.06072736, + "epoch": 0.7045017314351674, + "flos": 542639996928.0, + "grad_norm": 0.05444300541547984, + "language_loss": 0.86236918, + "learning_rate": 0.0002120362091750261, + "loss": 0.87325704, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.28100586, + "step": 3662, + "time_per_iteration": 2.810499668121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082914, + "balance_loss_mlp": 1.05518591, + "epoch": 0.704694113120431, + "flos": 428012135424.0, + "grad_norm": 0.0593931077751887, + "language_loss": 0.86978149, + "learning_rate": 0.00021178157902877566, + "loss": 0.88061064, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.27758789, + "step": 3663, + "time_per_iteration": 2.4574224948883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092262, + "balance_loss_mlp": 1.06415284, + "epoch": 0.7048864948056945, + "flos": 650253477888.0, + "grad_norm": 0.0751363020635885, + "language_loss": 0.86745709, + "learning_rate": 0.0002115270607788397, + "loss": 0.87837976, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.28125, + "step": 3664, + "time_per_iteration": 2.7495899200439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087732, + "balance_loss_mlp": 1.05981338, + "epoch": 0.705078876490958, + "flos": 412330314240.0, + "grad_norm": 0.07034018625942835, + "language_loss": 0.85685182, + "learning_rate": 0.00021127265452403133, + "loss": 0.86772919, + "num_input_tokens_seen": 303856336, + "router_z_loss_mlp": 0.27954102, + "step": 3665, + "time_per_iteration": 2.5029428005218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046453, + "balance_loss_mlp": 1.03472269, + "epoch": 0.7052712581762216, + "flos": 1419266783232.0, + "grad_norm": 0.01645523461712921, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85138083, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.1171875, + "step": 3666, + "time_per_iteration": 4.882653474807739 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084641, + "balance_loss_mlp": 1.05729461, + "epoch": 0.7054636398614852, + "flos": 492795228672.0, + "grad_norm": 0.05492799595906871, + "language_loss": 0.82834661, + "learning_rate": 0.00021076417839483065, + "loss": 0.83919299, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.27392578, + "step": 3667, + "time_per_iteration": 2.8046011924743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084673, + "balance_loss_mlp": 1.05622983, + "epoch": 0.7056560215467488, + "flos": 450228271104.0, + "grad_norm": 0.057239687513416834, + "language_loss": 0.84952044, + "learning_rate": 0.00021051010871784589, + "loss": 0.86036718, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.28442383, + "step": 3668, + "time_per_iteration": 2.547053098678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084529, + "balance_loss_mlp": 1.05634761, + "epoch": 0.7058484032320124, + "flos": 565426510848.0, + "grad_norm": 0.050223334888513216, + "language_loss": 0.78893518, + "learning_rate": 0.0002102561514308045, + "loss": 0.79978049, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.28173828, + "step": 3669, + "time_per_iteration": 2.752600908279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081831, + "balance_loss_mlp": 1.05446088, + "epoch": 0.7060407849172758, + "flos": 566736003072.0, + "grad_norm": 0.06177474978046869, + "language_loss": 0.82231724, + "learning_rate": 0.00021000230663230135, + "loss": 0.8331356, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.27441406, + "step": 3670, + "time_per_iteration": 2.7295479774475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107972, + "balance_loss_mlp": 1.05213535, + "epoch": 0.7062331666025394, + "flos": 468505755648.0, + "grad_norm": 0.06597526409708185, + "language_loss": 0.82935393, + "learning_rate": 0.00020974857442088762, + "loss": 0.84015119, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.27612305, + "step": 3671, + "time_per_iteration": 2.6223764419555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087916, + "balance_loss_mlp": 1.05999768, + "epoch": 0.706425548287803, + "flos": 595042702848.0, + "grad_norm": 0.061832347037407955, + "language_loss": 0.88995802, + "learning_rate": 0.00020949495489507104, + "loss": 0.90083718, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.27954102, + "step": 3672, + "time_per_iteration": 2.6759605407714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084185, + "balance_loss_mlp": 1.0569576, + "epoch": 0.7066179299730666, + "flos": 475566778368.0, + "grad_norm": 0.08160392795168159, + "language_loss": 0.84611428, + "learning_rate": 0.00020924144815331525, + "loss": 0.85695612, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.27270508, + "step": 3673, + "time_per_iteration": 2.5533270835876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087334, + "balance_loss_mlp": 1.05991554, + "epoch": 0.7068103116583301, + "flos": 506153428992.0, + "grad_norm": 0.06771134911837604, + "language_loss": 0.8321439, + "learning_rate": 0.00020898805429404044, + "loss": 0.84301728, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.2746582, + "step": 3674, + "time_per_iteration": 2.6267168521881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086456, + "balance_loss_mlp": 1.05860853, + "epoch": 0.7070026933435937, + "flos": 679028659200.0, + "grad_norm": 0.074333129961205, + "language_loss": 0.78350407, + "learning_rate": 0.0002087347734156228, + "loss": 0.79436862, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.27880859, + "step": 3675, + "time_per_iteration": 2.879998207092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081334, + "balance_loss_mlp": 1.05415416, + "epoch": 0.7071950750288573, + "flos": 471981717504.0, + "grad_norm": 0.05100324832046891, + "language_loss": 0.79745239, + "learning_rate": 0.00020848160561639452, + "loss": 0.80826575, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.2722168, + "step": 3676, + "time_per_iteration": 2.6603164672851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084996, + "balance_loss_mlp": 1.05733955, + "epoch": 0.7073874567141208, + "flos": 473507997696.0, + "grad_norm": 0.054459225189570165, + "language_loss": 0.85905212, + "learning_rate": 0.0002082285509946445, + "loss": 0.86990213, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.27685547, + "step": 3677, + "time_per_iteration": 2.553056240081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084321, + "balance_loss_mlp": 1.05664098, + "epoch": 0.7075798383993844, + "flos": 545589250560.0, + "grad_norm": 0.062290106460759526, + "language_loss": 0.83324182, + "learning_rate": 0.00020797560964861683, + "loss": 0.84408498, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.27709961, + "step": 3678, + "time_per_iteration": 2.792145013809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087022, + "balance_loss_mlp": 1.05907917, + "epoch": 0.7077722200846479, + "flos": 661766713344.0, + "grad_norm": 0.06608494347958908, + "language_loss": 0.806409, + "learning_rate": 0.0002077227816765122, + "loss": 0.81727922, + "num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.27954102, + "step": 3679, + "time_per_iteration": 4.414989709854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047559, + "balance_loss_mlp": 1.03525627, + "epoch": 0.7079646017699115, + "flos": 1529128129536.0, + "grad_norm": 0.01304969035368713, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.77495277, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.12255859, + "step": 3680, + "time_per_iteration": 4.77666163444519 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082723, + "balance_loss_mlp": 1.05544841, + "epoch": 0.7081569834551751, + "flos": 621217838592.0, + "grad_norm": 0.07037612396181211, + "language_loss": 0.7852788, + "learning_rate": 0.00020721746624665383, + "loss": 0.7961061, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.27319336, + "step": 3681, + "time_per_iteration": 2.7164971828460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081164, + "balance_loss_mlp": 1.05338836, + "epoch": 0.7083493651404387, + "flos": 794280743424.0, + "grad_norm": 0.047491060798417466, + "language_loss": 0.80214369, + "learning_rate": 0.00020696497898508114, + "loss": 0.81295532, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.27807617, + "step": 3682, + "time_per_iteration": 3.0300755500793457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089218, + "balance_loss_mlp": 1.06165683, + "epoch": 0.7085417468257021, + "flos": 813394856448.0, + "grad_norm": 0.37225594130432843, + "language_loss": 0.77676904, + "learning_rate": 0.00020671260548979316, + "loss": 0.78766119, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.27587891, + "step": 3683, + "time_per_iteration": 3.0000338554382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108452, + "balance_loss_mlp": 1.05715001, + "epoch": 0.7087341285109657, + "flos": 700259779584.0, + "grad_norm": 0.05966278900445413, + "language_loss": 0.84945965, + "learning_rate": 0.00020646034585876982, + "loss": 0.86030483, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.27441406, + "step": 3684, + "time_per_iteration": 2.8507392406463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080546, + "balance_loss_mlp": 1.05243671, + "epoch": 0.7089265101962293, + "flos": 596211010560.0, + "grad_norm": 0.050873107987967195, + "language_loss": 0.84335744, + "learning_rate": 0.00020620820018994718, + "loss": 0.85416293, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.28125, + "step": 3685, + "time_per_iteration": 2.8229713439941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082628, + "balance_loss_mlp": 1.05385077, + "epoch": 0.7091188918814929, + "flos": 486842876928.0, + "grad_norm": 0.07162313361599233, + "language_loss": 0.82926023, + "learning_rate": 0.00020595616858121675, + "loss": 0.84008658, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.2878418, + "step": 3686, + "time_per_iteration": 2.694638967514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079578, + "balance_loss_mlp": 1.05158722, + "epoch": 0.7093112735667565, + "flos": 599833949184.0, + "grad_norm": 0.06190114046391337, + "language_loss": 0.80535042, + "learning_rate": 0.00020570425113042586, + "loss": 0.81614614, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.28027344, + "step": 3687, + "time_per_iteration": 2.7041516304016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079239, + "balance_loss_mlp": 1.05074835, + "epoch": 0.70950365525202, + "flos": 505577258496.0, + "grad_norm": 0.06733246833768769, + "language_loss": 0.85552853, + "learning_rate": 0.0002054524479353776, + "loss": 0.86632097, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.28540039, + "step": 3688, + "time_per_iteration": 2.6622695922851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079477, + "balance_loss_mlp": 1.05122447, + "epoch": 0.7096960369372836, + "flos": 731846002176.0, + "grad_norm": 0.09171480616774523, + "language_loss": 0.81669426, + "learning_rate": 0.00020520075909383063, + "loss": 0.82748902, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.28271484, + "step": 3689, + "time_per_iteration": 2.885802745819092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085524, + "balance_loss_mlp": 1.05684257, + "epoch": 0.7098884186225471, + "flos": 971685351936.0, + "grad_norm": 0.058367776122323904, + "language_loss": 0.80585086, + "learning_rate": 0.00020494918470349916, + "loss": 0.81670618, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.28662109, + "step": 3690, + "time_per_iteration": 3.297044038772583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078519, + "balance_loss_mlp": 1.05038536, + "epoch": 0.7100808003078107, + "flos": 504001516032.0, + "grad_norm": 0.0682429606540151, + "language_loss": 0.85554057, + "learning_rate": 0.00020469772486205297, + "loss": 0.8663258, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.28149414, + "step": 3691, + "time_per_iteration": 2.602031707763672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082342, + "balance_loss_mlp": 1.05354142, + "epoch": 0.7102731819930742, + "flos": 540073446912.0, + "grad_norm": 0.05487079427914329, + "language_loss": 0.81415904, + "learning_rate": 0.0002044463796671177, + "loss": 0.82498252, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.2878418, + "step": 3692, + "time_per_iteration": 2.665280342102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086192, + "balance_loss_mlp": 1.05724823, + "epoch": 0.7104655636783378, + "flos": 620066907648.0, + "grad_norm": 0.06500857460791332, + "language_loss": 0.80369031, + "learning_rate": 0.00020419514921627408, + "loss": 0.81455219, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.28930664, + "step": 3693, + "time_per_iteration": 2.83823299407959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080501, + "balance_loss_mlp": 1.05251122, + "epoch": 0.7106579453636014, + "flos": 557060378112.0, + "grad_norm": 0.05808556039270617, + "language_loss": 0.77408904, + "learning_rate": 0.00020394403360705855, + "loss": 0.78489405, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.2800293, + "step": 3694, + "time_per_iteration": 2.6939644813537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085807, + "balance_loss_mlp": 1.05569434, + "epoch": 0.710850327048865, + "flos": 512795432448.0, + "grad_norm": 0.06287788377881579, + "language_loss": 0.87703514, + "learning_rate": 0.00020369303293696228, + "loss": 0.88789326, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.30078125, + "step": 3695, + "time_per_iteration": 2.588268995285034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083208, + "balance_loss_mlp": 1.05474114, + "epoch": 0.7110427087341286, + "flos": 423398389248.0, + "grad_norm": 0.06448607356035771, + "language_loss": 0.78199911, + "learning_rate": 0.00020344214730343304, + "loss": 0.79283124, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.28466797, + "step": 3696, + "time_per_iteration": 2.6181139945983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073393, + "balance_loss_mlp": 1.04511678, + "epoch": 0.711235090419392, + "flos": 577107072000.0, + "grad_norm": 0.05437568169477665, + "language_loss": 0.79383552, + "learning_rate": 0.00020319137680387296, + "loss": 0.80456948, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.28271484, + "step": 3697, + "time_per_iteration": 2.925847291946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077248, + "balance_loss_mlp": 1.04844677, + "epoch": 0.7114274721046556, + "flos": 447830456832.0, + "grad_norm": 0.07105325547979466, + "language_loss": 0.80237764, + "learning_rate": 0.0002029407215356398, + "loss": 0.81315017, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.28808594, + "step": 3698, + "time_per_iteration": 3.9760594367980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077498, + "balance_loss_mlp": 1.04829144, + "epoch": 0.7116198537899192, + "flos": 621680527872.0, + "grad_norm": 0.06046542117195041, + "language_loss": 0.82863748, + "learning_rate": 0.00020269018159604663, + "loss": 0.83941245, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.29150391, + "step": 3699, + "time_per_iteration": 2.704861640930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071741, + "balance_loss_mlp": 1.04336905, + "epoch": 0.7118122354751828, + "flos": 498476947968.0, + "grad_norm": 0.053095463302870675, + "language_loss": 0.818941, + "learning_rate": 0.00020243975708236162, + "loss": 0.82965839, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.28393555, + "step": 3700, + "time_per_iteration": 2.6019287109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010692, + "balance_loss_mlp": 1.0402801, + "epoch": 0.7120046171604463, + "flos": 572438071296.0, + "grad_norm": 0.06895358170102628, + "language_loss": 0.86096191, + "learning_rate": 0.00020218944809180818, + "loss": 0.87165391, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.2890625, + "step": 3701, + "time_per_iteration": 2.69789719581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066667, + "balance_loss_mlp": 1.0383426, + "epoch": 0.7121969988457099, + "flos": 572388609024.0, + "grad_norm": 0.048938239682891294, + "language_loss": 0.84783876, + "learning_rate": 0.00020193925472156493, + "loss": 0.85850537, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.28320312, + "step": 3702, + "time_per_iteration": 2.7333877086639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105286, + "balance_loss_mlp": 1.04036713, + "epoch": 0.7123893805309734, + "flos": 1522585050624.0, + "grad_norm": 0.026752885046143426, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.75342035, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.125, + "step": 3703, + "time_per_iteration": 4.899750232696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066879, + "balance_loss_mlp": 1.0373385, + "epoch": 0.712581762216237, + "flos": 614779476480.0, + "grad_norm": 0.05613195068078556, + "language_loss": 0.83530253, + "learning_rate": 0.00020143921523049863, + "loss": 0.84597135, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.29467773, + "step": 3704, + "time_per_iteration": 2.9570298194885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067522, + "balance_loss_mlp": 1.03860188, + "epoch": 0.7127741439015006, + "flos": 597504536064.0, + "grad_norm": 0.05853421015843179, + "language_loss": 0.83969504, + "learning_rate": 0.00020118936930380837, + "loss": 0.85037029, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.2890625, + "step": 3705, + "time_per_iteration": 2.750566005706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068067, + "balance_loss_mlp": 1.03876543, + "epoch": 0.7129665255867641, + "flos": 537138749952.0, + "grad_norm": 0.07045372312262692, + "language_loss": 0.80809951, + "learning_rate": 0.0002009396393856932, + "loss": 0.81878018, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.29272461, + "step": 3706, + "time_per_iteration": 2.6755757331848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106429, + "balance_loss_mlp": 1.03560829, + "epoch": 0.7131589072720277, + "flos": 526173981696.0, + "grad_norm": 0.06196520847148758, + "language_loss": 0.82349885, + "learning_rate": 0.00020069002557310673, + "loss": 0.83414185, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.28662109, + "step": 3707, + "time_per_iteration": 2.737092971801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065669, + "balance_loss_mlp": 1.03734505, + "epoch": 0.7133512889572913, + "flos": 530626194432.0, + "grad_norm": 0.06289073454443639, + "language_loss": 0.77148253, + "learning_rate": 0.00020044052796295807, + "loss": 0.78213924, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.28320312, + "step": 3708, + "time_per_iteration": 2.858578681945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066902, + "balance_loss_mlp": 1.03783917, + "epoch": 0.7135436706425549, + "flos": 503282750976.0, + "grad_norm": 0.05709228954993964, + "language_loss": 0.8160665, + "learning_rate": 0.00020019114665211063, + "loss": 0.8267355, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.29052734, + "step": 3709, + "time_per_iteration": 2.6008872985839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070493, + "balance_loss_mlp": 1.04128647, + "epoch": 0.7137360523278183, + "flos": 515719954944.0, + "grad_norm": 0.05827837383265674, + "language_loss": 0.81244481, + "learning_rate": 0.00019994188173738276, + "loss": 0.82314974, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.29174805, + "step": 3710, + "time_per_iteration": 2.6042001247406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068317, + "balance_loss_mlp": 1.03861022, + "epoch": 0.7139284340130819, + "flos": 510103664640.0, + "grad_norm": 0.056315014070009634, + "language_loss": 0.80933827, + "learning_rate": 0.0001996927333155477, + "loss": 0.82002145, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.29663086, + "step": 3711, + "time_per_iteration": 2.748624086380005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010683, + "balance_loss_mlp": 1.03947508, + "epoch": 0.7141208156983455, + "flos": 889896388608.0, + "grad_norm": 0.061443099278046684, + "language_loss": 0.85405827, + "learning_rate": 0.00019944370148333346, + "loss": 0.86474121, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.2878418, + "step": 3712, + "time_per_iteration": 3.1557986736297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072206, + "balance_loss_mlp": 1.04316652, + "epoch": 0.7143131973836091, + "flos": 535504780800.0, + "grad_norm": 0.048833627959222234, + "language_loss": 0.79702485, + "learning_rate": 0.00019919478633742278, + "loss": 0.80774689, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.29052734, + "step": 3713, + "time_per_iteration": 2.667795419692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071876, + "balance_loss_mlp": 1.04252636, + "epoch": 0.7145055790688727, + "flos": 473429422080.0, + "grad_norm": 0.0703082286681538, + "language_loss": 0.85178196, + "learning_rate": 0.00019894598797445302, + "loss": 0.86250067, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.29345703, + "step": 3714, + "time_per_iteration": 2.5345022678375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107178, + "balance_loss_mlp": 1.04333699, + "epoch": 0.7146979607541362, + "flos": 570227931648.0, + "grad_norm": 0.05625862990353456, + "language_loss": 0.8199116, + "learning_rate": 0.00019869730649101615, + "loss": 0.83062935, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.28417969, + "step": 3715, + "time_per_iteration": 2.8149824142456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079135, + "balance_loss_mlp": 1.04988086, + "epoch": 0.7148903424393998, + "flos": 839299359744.0, + "grad_norm": 0.071816789410327, + "language_loss": 0.72405577, + "learning_rate": 0.00019844874198365943, + "loss": 0.73484713, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.29199219, + "step": 3716, + "time_per_iteration": 3.0852138996124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069692, + "balance_loss_mlp": 1.04070067, + "epoch": 0.7150827241246633, + "flos": 541560439296.0, + "grad_norm": 0.05756859715120925, + "language_loss": 0.83796489, + "learning_rate": 0.00019820029454888362, + "loss": 0.84866184, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.28979492, + "step": 3717, + "time_per_iteration": 2.7309763431549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031662, + "balance_loss_mlp": 1.01916921, + "epoch": 0.7152751058099269, + "flos": 1582803859968.0, + "grad_norm": 0.017203742332568887, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.75552928, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.125, + "step": 3718, + "time_per_iteration": 5.044423580169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077558, + "balance_loss_mlp": 1.04777932, + "epoch": 0.7154674874951905, + "flos": 517167659520.0, + "grad_norm": 0.056277438983796696, + "language_loss": 0.79924434, + "learning_rate": 0.0001977037512828529, + "loss": 0.81001997, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.29760742, + "step": 3719, + "time_per_iteration": 2.5888805389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069135, + "balance_loss_mlp": 1.04059625, + "epoch": 0.715659869180454, + "flos": 602246320128.0, + "grad_norm": 0.0550224121073684, + "language_loss": 0.86091673, + "learning_rate": 0.0001974556556443734, + "loss": 0.87160814, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.28540039, + "step": 3720, + "time_per_iteration": 2.7241830825805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074341, + "balance_loss_mlp": 1.04575443, + "epoch": 0.7158522508657176, + "flos": 531403186176.0, + "grad_norm": 0.06173575943164377, + "language_loss": 0.88796955, + "learning_rate": 0.00019720767746402547, + "loss": 0.89871293, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.28564453, + "step": 3721, + "time_per_iteration": 2.721775770187378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075436, + "balance_loss_mlp": 1.04725444, + "epoch": 0.7160446325509812, + "flos": 557301897216.0, + "grad_norm": 0.08488248506445442, + "language_loss": 0.79925454, + "learning_rate": 0.00019695981683808222, + "loss": 0.81000888, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.28173828, + "step": 3722, + "time_per_iteration": 2.7333226203918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077529, + "balance_loss_mlp": 1.04989624, + "epoch": 0.7162370142362448, + "flos": 690664140288.0, + "grad_norm": 0.055390897958499746, + "language_loss": 0.85177088, + "learning_rate": 0.00019671207386277225, + "loss": 0.86254621, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.27636719, + "step": 3723, + "time_per_iteration": 2.924482583999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076762, + "balance_loss_mlp": 1.04800856, + "epoch": 0.7164293959215082, + "flos": 793772974080.0, + "grad_norm": 0.06210467424192018, + "language_loss": 0.78391171, + "learning_rate": 0.0001964644486342777, + "loss": 0.79467928, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.28735352, + "step": 3724, + "time_per_iteration": 2.958444833755493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077347, + "balance_loss_mlp": 1.04926085, + "epoch": 0.7166217776067718, + "flos": 493922838528.0, + "grad_norm": 0.0530875998345761, + "language_loss": 0.86440647, + "learning_rate": 0.00019621694124873524, + "loss": 0.87518001, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.28125, + "step": 3725, + "time_per_iteration": 2.6775362491607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035969, + "balance_loss_mlp": 1.02366674, + "epoch": 0.7168141592920354, + "flos": 1400337524736.0, + "grad_norm": 0.0197496536520254, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.77576053, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.12255859, + "step": 3726, + "time_per_iteration": 4.876794338226318 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079559, + "balance_loss_mlp": 1.05085373, + "epoch": 0.717006540977299, + "flos": 792789368832.0, + "grad_norm": 0.05459811074333738, + "language_loss": 0.77077997, + "learning_rate": 0.00019572228039082428, + "loss": 0.78157556, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.28686523, + "step": 3727, + "time_per_iteration": 3.094959020614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078104, + "balance_loss_mlp": 1.04982781, + "epoch": 0.7171989226625626, + "flos": 554525761536.0, + "grad_norm": 0.05087577266454216, + "language_loss": 0.83556503, + "learning_rate": 0.0001954751271105002, + "loss": 0.84634602, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.28295898, + "step": 3728, + "time_per_iteration": 2.8009090423583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077296, + "balance_loss_mlp": 1.04956806, + "epoch": 0.717391304347826, + "flos": 555628640256.0, + "grad_norm": 0.058127871838067766, + "language_loss": 0.80794644, + "learning_rate": 0.00019522809205721687, + "loss": 0.81871945, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.27758789, + "step": 3729, + "time_per_iteration": 2.7567226886749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070359, + "balance_loss_mlp": 1.0422256, + "epoch": 0.7175836860330896, + "flos": 538582072320.0, + "grad_norm": 0.06552906350513053, + "language_loss": 0.82629025, + "learning_rate": 0.0001949811753268816, + "loss": 0.83699387, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 0.28149414, + "step": 3730, + "time_per_iteration": 2.7015092372894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074245, + "balance_loss_mlp": 1.04594445, + "epoch": 0.7177760677183532, + "flos": 515385303552.0, + "grad_norm": 0.0651237840260159, + "language_loss": 0.82088923, + "learning_rate": 0.00019473437701535634, + "loss": 0.83163166, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.28295898, + "step": 3731, + "time_per_iteration": 2.5865840911865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072269, + "balance_loss_mlp": 1.04425454, + "epoch": 0.7179684494036168, + "flos": 674414913024.0, + "grad_norm": 0.05867613657807477, + "language_loss": 0.89630008, + "learning_rate": 0.00019448769721845677, + "loss": 0.90702283, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.28051758, + "step": 3732, + "time_per_iteration": 2.800302743911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073645, + "balance_loss_mlp": 1.04503512, + "epoch": 0.7181608310888803, + "flos": 469672653312.0, + "grad_norm": 0.07249060183275255, + "language_loss": 0.85536152, + "learning_rate": 0.00019424113603195203, + "loss": 0.86609799, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.28662109, + "step": 3733, + "time_per_iteration": 2.5308837890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074406, + "balance_loss_mlp": 1.04589128, + "epoch": 0.7183532127741439, + "flos": 593645870592.0, + "grad_norm": 0.05588376049508018, + "language_loss": 0.80217636, + "learning_rate": 0.0001939946935515657, + "loss": 0.81292045, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.28515625, + "step": 3734, + "time_per_iteration": 2.8359925746917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077355, + "balance_loss_mlp": 1.04910207, + "epoch": 0.7185455944594075, + "flos": 498669004800.0, + "grad_norm": 0.0705810174200004, + "language_loss": 0.80242217, + "learning_rate": 0.0001937483698729755, + "loss": 0.81319571, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.28271484, + "step": 3735, + "time_per_iteration": 2.64072322845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108005, + "balance_loss_mlp": 1.05070114, + "epoch": 0.718737976144671, + "flos": 814590867456.0, + "grad_norm": 0.04976646958682061, + "language_loss": 0.81962895, + "learning_rate": 0.0001935021650918128, + "loss": 0.83042943, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.29321289, + "step": 3736, + "time_per_iteration": 3.0010826587677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010734, + "balance_loss_mlp": 1.04431319, + "epoch": 0.7189303578299346, + "flos": 438100987392.0, + "grad_norm": 0.062249035117782556, + "language_loss": 0.86910063, + "learning_rate": 0.0001932560793036625, + "loss": 0.87983465, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.29077148, + "step": 3737, + "time_per_iteration": 2.512890577316284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075707, + "balance_loss_mlp": 1.04766941, + "epoch": 0.7191227395151981, + "flos": 549137995776.0, + "grad_norm": 0.09579716691171304, + "language_loss": 0.86528683, + "learning_rate": 0.00019301011260406382, + "loss": 0.87604392, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.28051758, + "step": 3738, + "time_per_iteration": 2.624567985534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077112, + "balance_loss_mlp": 1.04897833, + "epoch": 0.7193151212004617, + "flos": 626653656576.0, + "grad_norm": 0.050336885468814714, + "language_loss": 0.79622293, + "learning_rate": 0.00019276426508850936, + "loss": 0.80699408, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.28149414, + "step": 3739, + "time_per_iteration": 2.719663619995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074558, + "balance_loss_mlp": 1.04597163, + "epoch": 0.7195075028857253, + "flos": 740719904256.0, + "grad_norm": 0.05223198929463843, + "language_loss": 0.80390334, + "learning_rate": 0.00019251853685244564, + "loss": 0.81464887, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.28564453, + "step": 3740, + "time_per_iteration": 3.006769895553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076457, + "balance_loss_mlp": 1.048967, + "epoch": 0.7196998845709889, + "flos": 802523220480.0, + "grad_norm": 0.08129460448533303, + "language_loss": 0.80554307, + "learning_rate": 0.00019227292799127283, + "loss": 0.81630766, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.27539062, + "step": 3741, + "time_per_iteration": 3.0326223373413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073379, + "balance_loss_mlp": 1.04560351, + "epoch": 0.7198922662562524, + "flos": 924786865152.0, + "grad_norm": 0.06791942956347788, + "language_loss": 0.78745782, + "learning_rate": 0.00019202743860034454, + "loss": 0.79819167, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.27807617, + "step": 3742, + "time_per_iteration": 3.2729034423828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072089, + "balance_loss_mlp": 1.04445601, + "epoch": 0.7200846479415159, + "flos": 579838127616.0, + "grad_norm": 0.05486250950239536, + "language_loss": 0.83459806, + "learning_rate": 0.00019178206877496873, + "loss": 0.84531891, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.27636719, + "step": 3743, + "time_per_iteration": 2.7013559341430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070767, + "balance_loss_mlp": 1.04291999, + "epoch": 0.7202770296267795, + "flos": 557410996224.0, + "grad_norm": 0.04899238240269426, + "language_loss": 0.84932864, + "learning_rate": 0.0001915368186104059, + "loss": 0.86003625, + "num_input_tokens_seen": 310479776, + "router_z_loss_mlp": 0.27880859, + "step": 3744, + "time_per_iteration": 2.726893663406372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073873, + "balance_loss_mlp": 1.04621649, + "epoch": 0.7204694113120431, + "flos": 672248443392.0, + "grad_norm": 0.06348773508617375, + "language_loss": 0.80724853, + "learning_rate": 0.0001912916882018706, + "loss": 0.81798726, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.27685547, + "step": 3745, + "time_per_iteration": 2.78125262260437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073398, + "balance_loss_mlp": 1.0459559, + "epoch": 0.7206617929973067, + "flos": 798845027328.0, + "grad_norm": 0.06464144105655711, + "language_loss": 0.79121184, + "learning_rate": 0.00019104667764453125, + "loss": 0.80194581, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.2746582, + "step": 3746, + "time_per_iteration": 3.033304214477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072935, + "balance_loss_mlp": 1.04549301, + "epoch": 0.7208541746825702, + "flos": 531638913024.0, + "grad_norm": 0.050415961986803856, + "language_loss": 0.80573905, + "learning_rate": 0.00019080178703350926, + "loss": 0.81646842, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.2746582, + "step": 3747, + "time_per_iteration": 2.6518349647521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074166, + "balance_loss_mlp": 1.0458895, + "epoch": 0.7210465563678338, + "flos": 534883530240.0, + "grad_norm": 0.07572692948457345, + "language_loss": 0.83004916, + "learning_rate": 0.00019055701646387952, + "loss": 0.84079087, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.28271484, + "step": 3748, + "time_per_iteration": 2.7013447284698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031767, + "balance_loss_mlp": 1.01970267, + "epoch": 0.7212389380530974, + "flos": 1533076955136.0, + "grad_norm": 0.013786087553885988, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.81504452, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.12060547, + "step": 3749, + "time_per_iteration": 4.794643878936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073194, + "balance_loss_mlp": 1.0453701, + "epoch": 0.7214313197383609, + "flos": 461277407232.0, + "grad_norm": 0.05812194439124776, + "language_loss": 0.86448663, + "learning_rate": 0.00019006783582886368, + "loss": 0.87521857, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.27832031, + "step": 3750, + "time_per_iteration": 2.5275614261627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075263, + "balance_loss_mlp": 1.04653358, + "epoch": 0.7216237014236244, + "flos": 1036691025408.0, + "grad_norm": 0.060767017514705764, + "language_loss": 0.82905239, + "learning_rate": 0.00018982342595339437, + "loss": 0.83980501, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.28686523, + "step": 3751, + "time_per_iteration": 3.522578239440918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070907, + "balance_loss_mlp": 1.04239237, + "epoch": 0.721816083108888, + "flos": 895578107904.0, + "grad_norm": 0.05765271863237157, + "language_loss": 0.82075769, + "learning_rate": 0.00018957913649915076, + "loss": 0.83146673, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.28515625, + "step": 3752, + "time_per_iteration": 3.1765124797821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070534, + "balance_loss_mlp": 1.04187584, + "epoch": 0.7220084647941516, + "flos": 523066166784.0, + "grad_norm": 0.07973276687690374, + "language_loss": 0.79905254, + "learning_rate": 0.00018933496756097428, + "loss": 0.80975789, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.28662109, + "step": 3753, + "time_per_iteration": 2.625577926635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074027, + "balance_loss_mlp": 1.04508317, + "epoch": 0.7222008464794152, + "flos": 815757765120.0, + "grad_norm": 0.06908288105531452, + "language_loss": 0.81582409, + "learning_rate": 0.0001890909192336603, + "loss": 0.82656443, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.28930664, + "step": 3754, + "time_per_iteration": 3.0871572494506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072767, + "balance_loss_mlp": 1.04444289, + "epoch": 0.7223932281646788, + "flos": 748725244416.0, + "grad_norm": 0.057964315435078954, + "language_loss": 0.70292032, + "learning_rate": 0.00018884699161195623, + "loss": 0.71364796, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.28320312, + "step": 3755, + "time_per_iteration": 2.9729976654052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072672, + "balance_loss_mlp": 1.0435853, + "epoch": 0.7225856098499422, + "flos": 745132829184.0, + "grad_norm": 0.07379868606686546, + "language_loss": 0.7706269, + "learning_rate": 0.00018860318479056327, + "loss": 0.78135359, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.29077148, + "step": 3756, + "time_per_iteration": 3.15751576423645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073497, + "balance_loss_mlp": 1.04491067, + "epoch": 0.7227779915352058, + "flos": 547055894016.0, + "grad_norm": 0.05587751331143294, + "language_loss": 0.83529603, + "learning_rate": 0.00018835949886413555, + "loss": 0.84603095, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.28588867, + "step": 3757, + "time_per_iteration": 2.6880505084991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074304, + "balance_loss_mlp": 1.04509711, + "epoch": 0.7229703732204694, + "flos": 530230496256.0, + "grad_norm": 0.08262826949591631, + "language_loss": 0.78295088, + "learning_rate": 0.0001881159339272806, + "loss": 0.7936939, + "num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.29150391, + "step": 3758, + "time_per_iteration": 2.636491060256958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107004, + "balance_loss_mlp": 1.04193068, + "epoch": 0.723162754905733, + "flos": 528103314432.0, + "grad_norm": 0.05735396724489517, + "language_loss": 0.78920448, + "learning_rate": 0.00018787249007455858, + "loss": 0.79990494, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.28100586, + "step": 3759, + "time_per_iteration": 2.5969340801239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070444, + "balance_loss_mlp": 1.04140496, + "epoch": 0.7233551365909965, + "flos": 654571860480.0, + "grad_norm": 0.07167982163737877, + "language_loss": 0.71580899, + "learning_rate": 0.00018762916740048302, + "loss": 0.72651339, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.28979492, + "step": 3760, + "time_per_iteration": 2.7852694988250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071982, + "balance_loss_mlp": 1.04332376, + "epoch": 0.7235475182762601, + "flos": 522097118208.0, + "grad_norm": 0.05118431145994858, + "language_loss": 0.8598392, + "learning_rate": 0.0001873859659995195, + "loss": 0.87055904, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.28637695, + "step": 3761, + "time_per_iteration": 2.719606399536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107496, + "balance_loss_mlp": 1.04639769, + "epoch": 0.7237398999615237, + "flos": 608883941376.0, + "grad_norm": 0.051413796044389046, + "language_loss": 0.83093852, + "learning_rate": 0.0001871428859660878, + "loss": 0.84168816, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.28564453, + "step": 3762, + "time_per_iteration": 2.7558627128601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107143, + "balance_loss_mlp": 1.04329658, + "epoch": 0.7239322816467872, + "flos": 658664690688.0, + "grad_norm": 0.057793734831364726, + "language_loss": 0.81882715, + "learning_rate": 0.00018689992739455975, + "loss": 0.82954144, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.28149414, + "step": 3763, + "time_per_iteration": 2.90240740776062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070949, + "balance_loss_mlp": 1.04131389, + "epoch": 0.7241246633320508, + "flos": 968869928448.0, + "grad_norm": 0.047782863980039225, + "language_loss": 0.85763133, + "learning_rate": 0.00018665709037926027, + "loss": 0.86834085, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.29614258, + "step": 3764, + "time_per_iteration": 3.3121178150177 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069943, + "balance_loss_mlp": 1.04157126, + "epoch": 0.7243170450173143, + "flos": 514745114112.0, + "grad_norm": 0.06618029737842872, + "language_loss": 0.84513265, + "learning_rate": 0.00018641437501446694, + "loss": 0.8558321, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.28417969, + "step": 3765, + "time_per_iteration": 2.5711514949798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070577, + "balance_loss_mlp": 1.04172814, + "epoch": 0.7245094267025779, + "flos": 559482923520.0, + "grad_norm": 0.0702086558887849, + "language_loss": 0.82573164, + "learning_rate": 0.0001861717813944104, + "loss": 0.83643746, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.28833008, + "step": 3766, + "time_per_iteration": 2.6380386352539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072686, + "balance_loss_mlp": 1.04386163, + "epoch": 0.7247018083878415, + "flos": 612359903232.0, + "grad_norm": 0.0720480056079547, + "language_loss": 0.79527569, + "learning_rate": 0.00018592930961327365, + "loss": 0.8060025, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.28833008, + "step": 3767, + "time_per_iteration": 2.757380723953247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071618, + "balance_loss_mlp": 1.04238808, + "epoch": 0.7248941900731051, + "flos": 634379599872.0, + "grad_norm": 0.08594162637632567, + "language_loss": 0.87979633, + "learning_rate": 0.00018568695976519273, + "loss": 0.89051247, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.29199219, + "step": 3768, + "time_per_iteration": 2.793536424636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072488, + "balance_loss_mlp": 1.04332972, + "epoch": 0.7250865717583687, + "flos": 424718055936.0, + "grad_norm": 0.06891867665937222, + "language_loss": 0.80339336, + "learning_rate": 0.00018544473194425593, + "loss": 0.81411815, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.29125977, + "step": 3769, + "time_per_iteration": 2.5053606033325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106866, + "balance_loss_mlp": 1.03942966, + "epoch": 0.7252789534436321, + "flos": 634794236928.0, + "grad_norm": 0.0628085761222727, + "language_loss": 0.78636301, + "learning_rate": 0.00018520262624450485, + "loss": 0.79704964, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.29174805, + "step": 3770, + "time_per_iteration": 2.8609566688537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073738, + "balance_loss_mlp": 1.0450325, + "epoch": 0.7254713351288957, + "flos": 616895073792.0, + "grad_norm": 0.04686882151976468, + "language_loss": 0.87040436, + "learning_rate": 0.00018496064275993324, + "loss": 0.88114178, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.28710938, + "step": 3771, + "time_per_iteration": 2.754624605178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067155, + "balance_loss_mlp": 1.03916478, + "epoch": 0.7256637168141593, + "flos": 766662285312.0, + "grad_norm": 0.06312025626452938, + "language_loss": 0.81491023, + "learning_rate": 0.00018471878158448686, + "loss": 0.82558179, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.2800293, + "step": 3772, + "time_per_iteration": 2.9370291233062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074719, + "balance_loss_mlp": 1.04641891, + "epoch": 0.7258560984994229, + "flos": 495268646400.0, + "grad_norm": 0.04821073170159266, + "language_loss": 0.83998889, + "learning_rate": 0.00018447704281206512, + "loss": 0.85073608, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.28344727, + "step": 3773, + "time_per_iteration": 2.8460988998413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073582, + "balance_loss_mlp": 1.04382753, + "epoch": 0.7260484801846864, + "flos": 529802712576.0, + "grad_norm": 0.22097506803040057, + "language_loss": 0.82744718, + "learning_rate": 0.0001842354265365191, + "loss": 0.83818305, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.29711914, + "step": 3774, + "time_per_iteration": 2.728426694869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107092, + "balance_loss_mlp": 1.04281068, + "epoch": 0.72624086186995, + "flos": 624679243776.0, + "grad_norm": 0.06612065150918205, + "language_loss": 0.8084085, + "learning_rate": 0.0001839939328516526, + "loss": 0.81911772, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.28100586, + "step": 3775, + "time_per_iteration": 2.730315923690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074711, + "balance_loss_mlp": 1.04631519, + "epoch": 0.7264332435552135, + "flos": 716203468800.0, + "grad_norm": 0.06548969982492862, + "language_loss": 0.81234205, + "learning_rate": 0.0001837525618512218, + "loss": 0.82308918, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.28369141, + "step": 3776, + "time_per_iteration": 2.8894991874694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069253, + "balance_loss_mlp": 1.04159606, + "epoch": 0.7266256252404771, + "flos": 680736821760.0, + "grad_norm": 0.059408980610910087, + "language_loss": 0.8289094, + "learning_rate": 0.00018351131362893519, + "loss": 0.83960199, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.27685547, + "step": 3777, + "time_per_iteration": 2.829299211502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072659, + "balance_loss_mlp": 1.04423952, + "epoch": 0.7268180069257407, + "flos": 518654651904.0, + "grad_norm": 0.07569647287253554, + "language_loss": 0.8052032, + "learning_rate": 0.00018327018827845364, + "loss": 0.81592977, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.28417969, + "step": 3778, + "time_per_iteration": 2.605602502822876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070858, + "balance_loss_mlp": 1.04279566, + "epoch": 0.7270103886110042, + "flos": 512411318784.0, + "grad_norm": 0.07105004265912586, + "language_loss": 0.87327212, + "learning_rate": 0.00018302918589339036, + "loss": 0.88398075, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.28051758, + "step": 3779, + "time_per_iteration": 2.644178628921509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073648, + "balance_loss_mlp": 1.04506147, + "epoch": 0.7272027702962678, + "flos": 546395355648.0, + "grad_norm": 0.05454287579555899, + "language_loss": 0.89820325, + "learning_rate": 0.00018278830656731054, + "loss": 0.90893972, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.28588867, + "step": 3780, + "time_per_iteration": 2.642853260040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067965, + "balance_loss_mlp": 1.03935504, + "epoch": 0.7273951519815314, + "flos": 592772926464.0, + "grad_norm": 0.049235223582258895, + "language_loss": 0.86383229, + "learning_rate": 0.00018254755039373222, + "loss": 0.87451196, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.28613281, + "step": 3781, + "time_per_iteration": 2.7858738899230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072596, + "balance_loss_mlp": 1.04377079, + "epoch": 0.727587533666795, + "flos": 605732456448.0, + "grad_norm": 0.06238056381578398, + "language_loss": 0.8331604, + "learning_rate": 0.0001823069174661252, + "loss": 0.84388638, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.2878418, + "step": 3782, + "time_per_iteration": 2.7796318531036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075989, + "balance_loss_mlp": 1.0479033, + "epoch": 0.7277799153520584, + "flos": 512770701312.0, + "grad_norm": 0.05705801102125677, + "language_loss": 0.78309739, + "learning_rate": 0.00018206640787791112, + "loss": 0.79385734, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.28125, + "step": 3783, + "time_per_iteration": 2.602808952331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072706, + "balance_loss_mlp": 1.04411936, + "epoch": 0.727972297037322, + "flos": 537498132480.0, + "grad_norm": 0.06294847174499694, + "language_loss": 0.85954249, + "learning_rate": 0.00018182602172246416, + "loss": 0.87026954, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.28588867, + "step": 3784, + "time_per_iteration": 2.6015853881835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076895, + "balance_loss_mlp": 1.04823709, + "epoch": 0.7281646787225856, + "flos": 534780223488.0, + "grad_norm": 0.06092859331592059, + "language_loss": 0.76170594, + "learning_rate": 0.00018158575909311075, + "loss": 0.77247488, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.28637695, + "step": 3785, + "time_per_iteration": 2.646030902862549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010794, + "balance_loss_mlp": 1.05038452, + "epoch": 0.7283570604078492, + "flos": 624767993856.0, + "grad_norm": 0.06146036016272455, + "language_loss": 0.79553497, + "learning_rate": 0.000181345620083129, + "loss": 0.80632889, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.29003906, + "step": 3786, + "time_per_iteration": 2.792757034301758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079884, + "balance_loss_mlp": 1.0520606, + "epoch": 0.7285494420931128, + "flos": 533904307200.0, + "grad_norm": 0.04915125322890423, + "language_loss": 0.86502135, + "learning_rate": 0.00018110560478574927, + "loss": 0.87582016, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.27856445, + "step": 3787, + "time_per_iteration": 2.6800973415374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074424, + "balance_loss_mlp": 1.04538465, + "epoch": 0.7287418237783763, + "flos": 666251011584.0, + "grad_norm": 0.0704647078753348, + "language_loss": 0.80134165, + "learning_rate": 0.0001808657132941533, + "loss": 0.81208593, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.2902832, + "step": 3788, + "time_per_iteration": 2.770371675491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075695, + "balance_loss_mlp": 1.04741848, + "epoch": 0.7289342054636399, + "flos": 550344181248.0, + "grad_norm": 0.07634779758427546, + "language_loss": 0.8289668, + "learning_rate": 0.00018062594570147572, + "loss": 0.83972371, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.28295898, + "step": 3789, + "time_per_iteration": 2.5850260257720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077852, + "balance_loss_mlp": 1.05000448, + "epoch": 0.7291265871489034, + "flos": 687620344320.0, + "grad_norm": 0.05162370165887138, + "language_loss": 0.85260105, + "learning_rate": 0.00018038630210080243, + "loss": 0.8633796, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.27880859, + "step": 3790, + "time_per_iteration": 2.837209701538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075748, + "balance_loss_mlp": 1.04744744, + "epoch": 0.729318968834167, + "flos": 572388609024.0, + "grad_norm": 0.05876653681305703, + "language_loss": 0.849635, + "learning_rate": 0.0001801467825851712, + "loss": 0.86039245, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.28295898, + "step": 3791, + "time_per_iteration": 2.7689332962036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076086, + "balance_loss_mlp": 1.04778624, + "epoch": 0.7295113505194305, + "flos": 585786097152.0, + "grad_norm": 0.058290229022120006, + "language_loss": 0.7850548, + "learning_rate": 0.00017990738724757172, + "loss": 0.79581565, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.28320312, + "step": 3792, + "time_per_iteration": 2.870572090148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078653, + "balance_loss_mlp": 1.05092454, + "epoch": 0.7297037322046941, + "flos": 706872669696.0, + "grad_norm": 0.05184173418469221, + "language_loss": 0.81961739, + "learning_rate": 0.00017966811618094598, + "loss": 0.83040386, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.27758789, + "step": 3793, + "time_per_iteration": 2.9314723014831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078553, + "balance_loss_mlp": 1.05044341, + "epoch": 0.7298961138899577, + "flos": 487039315968.0, + "grad_norm": 0.061838028009129596, + "language_loss": 0.8480593, + "learning_rate": 0.00017942896947818664, + "loss": 0.85884488, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.28125, + "step": 3794, + "time_per_iteration": 2.5791871547698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104707, + "balance_loss_mlp": 1.0351969, + "epoch": 0.7300884955752213, + "flos": 1365102222336.0, + "grad_norm": 0.022620155773541276, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.75872123, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.11865234, + "step": 3795, + "time_per_iteration": 4.875161647796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071538, + "balance_loss_mlp": 1.04383409, + "epoch": 0.7302808772604849, + "flos": 531550162944.0, + "grad_norm": 0.07025171922085349, + "language_loss": 0.85040843, + "learning_rate": 0.00017895104953559947, + "loss": 0.8611238, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.27734375, + "step": 3796, + "time_per_iteration": 2.625335216522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077716, + "balance_loss_mlp": 1.05027366, + "epoch": 0.7304732589457483, + "flos": 435949074432.0, + "grad_norm": 0.07017117998144913, + "language_loss": 0.89488584, + "learning_rate": 0.00017871227648131672, + "loss": 0.90566301, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.27490234, + "step": 3797, + "time_per_iteration": 2.4892690181732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075327, + "balance_loss_mlp": 1.04743159, + "epoch": 0.7306656406310119, + "flos": 451376229888.0, + "grad_norm": 0.0555809148766967, + "language_loss": 0.82792765, + "learning_rate": 0.0001784736281619907, + "loss": 0.83868086, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.27905273, + "step": 3798, + "time_per_iteration": 2.616964101791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077231, + "balance_loss_mlp": 1.04964578, + "epoch": 0.7308580223162755, + "flos": 511756572672.0, + "grad_norm": 0.06137974721906842, + "language_loss": 0.74274546, + "learning_rate": 0.00017823510467027232, + "loss": 0.75351775, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.27636719, + "step": 3799, + "time_per_iteration": 2.744365692138672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074556, + "balance_loss_mlp": 1.04558766, + "epoch": 0.7310504040015391, + "flos": 375209349120.0, + "grad_norm": 0.06884438361049809, + "language_loss": 0.78208685, + "learning_rate": 0.00017799670609876516, + "loss": 0.79283237, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.28930664, + "step": 3800, + "time_per_iteration": 2.505571126937866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072835, + "balance_loss_mlp": 1.04465413, + "epoch": 0.7312427856868026, + "flos": 549073976832.0, + "grad_norm": 0.05034282557889911, + "language_loss": 0.88874984, + "learning_rate": 0.00017775843254002366, + "loss": 0.8994782, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.28222656, + "step": 3801, + "time_per_iteration": 2.7557313442230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076377, + "balance_loss_mlp": 1.04802942, + "epoch": 0.7314351673720662, + "flos": 766880483328.0, + "grad_norm": 0.053157012048244724, + "language_loss": 0.8399632, + "learning_rate": 0.00017752028408655367, + "loss": 0.85072702, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.28344727, + "step": 3802, + "time_per_iteration": 3.03664231300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074125, + "balance_loss_mlp": 1.04551435, + "epoch": 0.7316275490573297, + "flos": 486492258816.0, + "grad_norm": 0.05941466781290568, + "language_loss": 0.85240817, + "learning_rate": 0.00017728226083081272, + "loss": 0.8631494, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.28564453, + "step": 3803, + "time_per_iteration": 2.557260513305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073414, + "balance_loss_mlp": 1.04554248, + "epoch": 0.7318199307425933, + "flos": 473183520768.0, + "grad_norm": 0.0569157917316084, + "language_loss": 0.8142879, + "learning_rate": 0.00017704436286520965, + "loss": 0.8250221, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.27929688, + "step": 3804, + "time_per_iteration": 2.531374454498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073136, + "balance_loss_mlp": 1.04500246, + "epoch": 0.7320123124278569, + "flos": 549202014720.0, + "grad_norm": 0.0615002003094314, + "language_loss": 0.84243524, + "learning_rate": 0.0001768065902821046, + "loss": 0.85316658, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.28149414, + "step": 3805, + "time_per_iteration": 2.7219231128692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070301, + "balance_loss_mlp": 1.04226291, + "epoch": 0.7322046941131204, + "flos": 570502946304.0, + "grad_norm": 0.050852375433721335, + "language_loss": 0.82159758, + "learning_rate": 0.00017656894317380907, + "loss": 0.83230054, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.28051758, + "step": 3806, + "time_per_iteration": 2.7360239028930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019748, + "balance_loss_mlp": 1.00816071, + "epoch": 0.732397075798384, + "flos": 1468334559744.0, + "grad_norm": 0.009321700757662343, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.77051014, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.11572266, + "step": 3807, + "time_per_iteration": 5.0339789390563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075379, + "balance_loss_mlp": 1.04662561, + "epoch": 0.7325894574836476, + "flos": 464620948992.0, + "grad_norm": 0.06770486672009031, + "language_loss": 0.83718252, + "learning_rate": 0.00017609402575064875, + "loss": 0.84793627, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.28710938, + "step": 3808, + "time_per_iteration": 2.5397021770477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073042, + "balance_loss_mlp": 1.04490852, + "epoch": 0.7327818391689112, + "flos": 495246887424.0, + "grad_norm": 0.07767281717141156, + "language_loss": 0.81099665, + "learning_rate": 0.00017585675562016367, + "loss": 0.8217271, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 0.28149414, + "step": 3809, + "time_per_iteration": 2.578652858734131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019398, + "balance_loss_mlp": 1.00781119, + "epoch": 0.7329742208541746, + "flos": 1432694794752.0, + "grad_norm": 0.0100864336281573, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78232253, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.11572266, + "step": 3810, + "time_per_iteration": 4.869556903839111 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069034, + "balance_loss_mlp": 1.04092479, + "epoch": 0.7331666025394382, + "flos": 496645129728.0, + "grad_norm": 0.16551466638387613, + "language_loss": 0.85115767, + "learning_rate": 0.00017538259298196474, + "loss": 0.861848, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.28100586, + "step": 3811, + "time_per_iteration": 2.5746755599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074051, + "balance_loss_mlp": 1.04551268, + "epoch": 0.7333589842247018, + "flos": 538247420928.0, + "grad_norm": 0.05568772928725353, + "language_loss": 0.81749296, + "learning_rate": 0.00017514570065833745, + "loss": 0.82823348, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.28540039, + "step": 3812, + "time_per_iteration": 2.74574613571167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073496, + "balance_loss_mlp": 1.04495704, + "epoch": 0.7335513659099654, + "flos": 490825198080.0, + "grad_norm": 0.06483425891488107, + "language_loss": 0.80511057, + "learning_rate": 0.00017490893445433426, + "loss": 0.81584549, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.28564453, + "step": 3813, + "time_per_iteration": 2.5976309776306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079068, + "balance_loss_mlp": 1.05026746, + "epoch": 0.733743747595229, + "flos": 561876355584.0, + "grad_norm": 0.07334965322780891, + "language_loss": 0.81267703, + "learning_rate": 0.00017467229446187587, + "loss": 0.82346773, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.2878418, + "step": 3814, + "time_per_iteration": 2.6907997131347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078482, + "balance_loss_mlp": 1.05044413, + "epoch": 0.7339361292804925, + "flos": 538315822080.0, + "grad_norm": 0.052639307044854956, + "language_loss": 0.81764507, + "learning_rate": 0.00017443578077283424, + "loss": 0.82842994, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.28027344, + "step": 3815, + "time_per_iteration": 2.65816593170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077176, + "balance_loss_mlp": 1.04882812, + "epoch": 0.734128510965756, + "flos": 548198060544.0, + "grad_norm": 0.062049617931530306, + "language_loss": 0.84998393, + "learning_rate": 0.0001741993934790319, + "loss": 0.86075574, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.28344727, + "step": 3816, + "time_per_iteration": 2.738459348678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074176, + "balance_loss_mlp": 1.04594707, + "epoch": 0.7343208926510196, + "flos": 539783875584.0, + "grad_norm": 0.06367069815606033, + "language_loss": 0.8424527, + "learning_rate": 0.00017396313267224273, + "loss": 0.85319448, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.2824707, + "step": 3817, + "time_per_iteration": 2.7235686779022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079748, + "balance_loss_mlp": 1.05144763, + "epoch": 0.7345132743362832, + "flos": 570827423232.0, + "grad_norm": 0.05690847114233298, + "language_loss": 0.88229644, + "learning_rate": 0.0001737269984441912, + "loss": 0.89309394, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.28320312, + "step": 3818, + "time_per_iteration": 2.664562225341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079058, + "balance_loss_mlp": 1.05140162, + "epoch": 0.7347056560215467, + "flos": 545135325696.0, + "grad_norm": 0.059530599678457814, + "language_loss": 0.85132968, + "learning_rate": 0.00017349099088655263, + "loss": 0.86212027, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.27661133, + "step": 3819, + "time_per_iteration": 2.713716506958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079811, + "balance_loss_mlp": 1.05153477, + "epoch": 0.7348980377068103, + "flos": 595668335616.0, + "grad_norm": 0.07896802475478679, + "language_loss": 0.80594087, + "learning_rate": 0.00017325511009095375, + "loss": 0.81673896, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.28271484, + "step": 3820, + "time_per_iteration": 2.729605197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075678, + "balance_loss_mlp": 1.04766417, + "epoch": 0.7350904193920739, + "flos": 538291090944.0, + "grad_norm": 0.05267126362138293, + "language_loss": 0.83587992, + "learning_rate": 0.00017301935614897113, + "loss": 0.84663677, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.28051758, + "step": 3821, + "time_per_iteration": 2.6848647594451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076838, + "balance_loss_mlp": 1.0488472, + "epoch": 0.7352828010773375, + "flos": 512712474624.0, + "grad_norm": 0.0534844061316339, + "language_loss": 0.81780893, + "learning_rate": 0.00017278372915213274, + "loss": 0.82857728, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.28027344, + "step": 3822, + "time_per_iteration": 2.650430679321289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031331, + "balance_loss_mlp": 1.01945734, + "epoch": 0.735475182762601, + "flos": 1552965087744.0, + "grad_norm": 0.013429842271997025, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.80925179, + "num_input_tokens_seen": 316967104, + "router_z_loss_mlp": 0.11865234, + "step": 3823, + "time_per_iteration": 4.986204385757446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079335, + "balance_loss_mlp": 1.05139256, + "epoch": 0.7356675644478645, + "flos": 680984133120.0, + "grad_norm": 0.05755686388123544, + "language_loss": 0.80487376, + "learning_rate": 0.00017231285635975314, + "loss": 0.81566715, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.27929688, + "step": 3824, + "time_per_iteration": 2.952411413192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107638, + "balance_loss_mlp": 1.04755485, + "epoch": 0.7358599461331281, + "flos": 514961902080.0, + "grad_norm": 0.0735633923389538, + "language_loss": 0.82809317, + "learning_rate": 0.00017207761074702115, + "loss": 0.83885694, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.28808594, + "step": 3825, + "time_per_iteration": 2.6093246936798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080498, + "balance_loss_mlp": 1.05093431, + "epoch": 0.7360523278183917, + "flos": 443739036672.0, + "grad_norm": 0.05450452025217221, + "language_loss": 0.83744037, + "learning_rate": 0.0001718424924450514, + "loss": 0.84824538, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.29516602, + "step": 3826, + "time_per_iteration": 2.625596046447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072132, + "balance_loss_mlp": 1.04387975, + "epoch": 0.7362447095036553, + "flos": 603142585344.0, + "grad_norm": 0.04900180424478287, + "language_loss": 0.85697591, + "learning_rate": 0.00017160750154512482, + "loss": 0.86769724, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.2824707, + "step": 3827, + "time_per_iteration": 4.115647554397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077067, + "balance_loss_mlp": 1.04912448, + "epoch": 0.7364370911889189, + "flos": 552807424512.0, + "grad_norm": 0.04912825481573526, + "language_loss": 0.83176559, + "learning_rate": 0.0001713726381384731, + "loss": 0.84253627, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.27954102, + "step": 3828, + "time_per_iteration": 2.794640302658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070043, + "balance_loss_mlp": 1.04140913, + "epoch": 0.7366294728741823, + "flos": 448830028800.0, + "grad_norm": 0.06936682542859615, + "language_loss": 0.80874848, + "learning_rate": 0.00017113790231627812, + "loss": 0.81944889, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.28637695, + "step": 3829, + "time_per_iteration": 2.5032026767730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023029, + "balance_loss_mlp": 1.01086962, + "epoch": 0.7368218545594459, + "flos": 1534705132032.0, + "grad_norm": 0.00938038964712245, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80281258, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.12158203, + "step": 3830, + "time_per_iteration": 4.790278911590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107551, + "balance_loss_mlp": 1.04701948, + "epoch": 0.7370142362447095, + "flos": 515164133376.0, + "grad_norm": 0.05667126288905575, + "language_loss": 0.81707335, + "learning_rate": 0.00017066881378973936, + "loss": 0.82782841, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.28491211, + "step": 3831, + "time_per_iteration": 2.6234376430511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072767, + "balance_loss_mlp": 1.0442524, + "epoch": 0.7372066179299731, + "flos": 500531346432.0, + "grad_norm": 0.05465479593854143, + "language_loss": 0.82744801, + "learning_rate": 0.00017043446126751189, + "loss": 0.83817565, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.28540039, + "step": 3832, + "time_per_iteration": 2.68343186378479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069311, + "balance_loss_mlp": 1.04089189, + "epoch": 0.7373989996152366, + "flos": 557814048768.0, + "grad_norm": 0.15091194873702685, + "language_loss": 0.76596999, + "learning_rate": 0.00017020023669397376, + "loss": 0.77666306, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.28442383, + "step": 3833, + "time_per_iteration": 2.709726572036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080144, + "balance_loss_mlp": 1.05141497, + "epoch": 0.7375913813005002, + "flos": 506527368192.0, + "grad_norm": 0.054777149599410456, + "language_loss": 0.81358391, + "learning_rate": 0.0001699661401600589, + "loss": 0.82438534, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.28759766, + "step": 3834, + "time_per_iteration": 2.5703024864196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074935, + "balance_loss_mlp": 1.04680145, + "epoch": 0.7377837629857638, + "flos": 485940819456.0, + "grad_norm": 0.05177646885601935, + "language_loss": 0.78090227, + "learning_rate": 0.00016973217175665205, + "loss": 0.79165161, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.28125, + "step": 3835, + "time_per_iteration": 2.567094564437866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033946, + "balance_loss_mlp": 1.02178645, + "epoch": 0.7379761446710273, + "flos": 1413900776448.0, + "grad_norm": 0.015599325923103721, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.8220011, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.12158203, + "step": 3836, + "time_per_iteration": 4.926120281219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079166, + "balance_loss_mlp": 1.05046034, + "epoch": 0.7381685263562909, + "flos": 629445758976.0, + "grad_norm": 0.08209233600612638, + "language_loss": 0.83787167, + "learning_rate": 0.00016926461970465047, + "loss": 0.84866333, + "num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.28710938, + "step": 3837, + "time_per_iteration": 2.8248865604400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079264, + "balance_loss_mlp": 1.0512259, + "epoch": 0.7383609080415544, + "flos": 738869147136.0, + "grad_norm": 0.0447245395908081, + "language_loss": 0.84287, + "learning_rate": 0.00016903103623757516, + "loss": 0.85366273, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.28051758, + "step": 3838, + "time_per_iteration": 3.0732860565185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076489, + "balance_loss_mlp": 1.04818845, + "epoch": 0.738553289726818, + "flos": 549945510912.0, + "grad_norm": 0.060261467227696625, + "language_loss": 0.801202, + "learning_rate": 0.00016879758126404738, + "loss": 0.8119669, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.28295898, + "step": 3839, + "time_per_iteration": 2.6999428272247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081913, + "balance_loss_mlp": 1.05420828, + "epoch": 0.7387456714120816, + "flos": 909925705728.0, + "grad_norm": 0.0717530150127342, + "language_loss": 0.80011249, + "learning_rate": 0.00016856425487470216, + "loss": 0.81093156, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.27758789, + "step": 3840, + "time_per_iteration": 3.0798532962799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075668, + "balance_loss_mlp": 1.047153, + "epoch": 0.7389380530973452, + "flos": 852308352000.0, + "grad_norm": 0.06037669736072389, + "language_loss": 0.79319191, + "learning_rate": 0.00016833105716012486, + "loss": 0.80394864, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.28540039, + "step": 3841, + "time_per_iteration": 3.125180244445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069813, + "balance_loss_mlp": 1.04144144, + "epoch": 0.7391304347826086, + "flos": 816678761472.0, + "grad_norm": 0.05821002881472178, + "language_loss": 0.84839195, + "learning_rate": 0.00016809798821085088, + "loss": 0.85909009, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.28344727, + "step": 3842, + "time_per_iteration": 2.9953746795654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082054, + "balance_loss_mlp": 1.05303824, + "epoch": 0.7393228164678722, + "flos": 572541378048.0, + "grad_norm": 0.054657255359861566, + "language_loss": 0.89063728, + "learning_rate": 0.00016786504811736565, + "loss": 0.90145791, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.28979492, + "step": 3843, + "time_per_iteration": 2.7037930488586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077429, + "balance_loss_mlp": 1.04869962, + "epoch": 0.7395151981531358, + "flos": 684903845376.0, + "grad_norm": 0.06408695288095054, + "language_loss": 0.82701367, + "learning_rate": 0.00016763223697010442, + "loss": 0.83778793, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.28710938, + "step": 3844, + "time_per_iteration": 2.9637320041656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107492, + "balance_loss_mlp": 1.0469532, + "epoch": 0.7397075798383994, + "flos": 556095711744.0, + "grad_norm": 0.05096747285284615, + "language_loss": 0.84036589, + "learning_rate": 0.00016739955485945256, + "loss": 0.85111511, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.2800293, + "step": 3845, + "time_per_iteration": 2.698608160018921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070906, + "balance_loss_mlp": 1.04255807, + "epoch": 0.739899961523663, + "flos": 546523393536.0, + "grad_norm": 0.07070386524494449, + "language_loss": 0.85914421, + "learning_rate": 0.00016716700187574513, + "loss": 0.86985326, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.28369141, + "step": 3846, + "time_per_iteration": 2.686567544937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075336, + "balance_loss_mlp": 1.04787064, + "epoch": 0.7400923432089265, + "flos": 608913054720.0, + "grad_norm": 0.09697778830761983, + "language_loss": 0.83608466, + "learning_rate": 0.0001669345781092675, + "loss": 0.846838, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.27490234, + "step": 3847, + "time_per_iteration": 2.705946445465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075753, + "balance_loss_mlp": 1.04742908, + "epoch": 0.7402847248941901, + "flos": 590715555840.0, + "grad_norm": 0.07758942034588075, + "language_loss": 0.87070894, + "learning_rate": 0.0001667022836502546, + "loss": 0.88146651, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.28320312, + "step": 3848, + "time_per_iteration": 2.727207899093628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074969, + "balance_loss_mlp": 1.04657388, + "epoch": 0.7404771065794536, + "flos": 477136728576.0, + "grad_norm": 0.06324539449596041, + "language_loss": 0.82776666, + "learning_rate": 0.00016647011858889077, + "loss": 0.83851635, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.28369141, + "step": 3849, + "time_per_iteration": 2.552164077758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074172, + "balance_loss_mlp": 1.04577661, + "epoch": 0.7406694882647172, + "flos": 496192614912.0, + "grad_norm": 0.0765277016597007, + "language_loss": 0.86005962, + "learning_rate": 0.00016623808301531056, + "loss": 0.87080133, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.28417969, + "step": 3850, + "time_per_iteration": 2.6483278274536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073128, + "balance_loss_mlp": 1.04551888, + "epoch": 0.7408618699499807, + "flos": 561925817856.0, + "grad_norm": 0.06196174014296942, + "language_loss": 0.79140496, + "learning_rate": 0.00016600617701959842, + "loss": 0.8021363, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.27636719, + "step": 3851, + "time_per_iteration": 2.850390911102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024795, + "balance_loss_mlp": 1.01268303, + "epoch": 0.7410542516352443, + "flos": 1387421512704.0, + "grad_norm": 0.012000469023036765, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.79868609, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.12109375, + "step": 3852, + "time_per_iteration": 5.050019979476929 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074852, + "balance_loss_mlp": 1.04628921, + "epoch": 0.7412466333205079, + "flos": 669697860096.0, + "grad_norm": 0.08114806024349476, + "language_loss": 0.80909729, + "learning_rate": 0.00016554275412186315, + "loss": 0.8198458, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.28564453, + "step": 3853, + "time_per_iteration": 2.866884708404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071217, + "balance_loss_mlp": 1.04265463, + "epoch": 0.7414390150057715, + "flos": 489038459904.0, + "grad_norm": 0.09161546445880692, + "language_loss": 0.80530989, + "learning_rate": 0.0001653112373997568, + "loss": 0.8160221, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.28588867, + "step": 3854, + "time_per_iteration": 2.6828300952911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075016, + "balance_loss_mlp": 1.04712129, + "epoch": 0.7416313966910351, + "flos": 599119566336.0, + "grad_norm": 0.06308625069628188, + "language_loss": 0.74284655, + "learning_rate": 0.0001650798506153517, + "loss": 0.75359672, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.27929688, + "step": 3855, + "time_per_iteration": 2.6935112476348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073152, + "balance_loss_mlp": 1.04473197, + "epoch": 0.7418237783762985, + "flos": 542279204352.0, + "grad_norm": 0.08209880324062359, + "language_loss": 0.84122801, + "learning_rate": 0.00016484859385848023, + "loss": 0.85195947, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.28442383, + "step": 3856, + "time_per_iteration": 2.620311975479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073651, + "balance_loss_mlp": 1.04501677, + "epoch": 0.7420161600615621, + "flos": 543865121280.0, + "grad_norm": 0.06689669498305581, + "language_loss": 0.76970744, + "learning_rate": 0.0001646174672189243, + "loss": 0.78044391, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.28613281, + "step": 3857, + "time_per_iteration": 2.6914920806884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106941, + "balance_loss_mlp": 1.04087138, + "epoch": 0.7422085417468257, + "flos": 526921860096.0, + "grad_norm": 0.07125061218981377, + "language_loss": 0.80480021, + "learning_rate": 0.00016438647078641488, + "loss": 0.8154943, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.28515625, + "step": 3858, + "time_per_iteration": 2.6275553703308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069955, + "balance_loss_mlp": 1.04103458, + "epoch": 0.7424009234320893, + "flos": 508404266496.0, + "grad_norm": 0.0650961492971168, + "language_loss": 0.83072245, + "learning_rate": 0.00016415560465063344, + "loss": 0.84142196, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.28930664, + "step": 3859, + "time_per_iteration": 2.732268810272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068048, + "balance_loss_mlp": 1.03886604, + "epoch": 0.7425933051173528, + "flos": 512347299840.0, + "grad_norm": 0.07578384946449068, + "language_loss": 0.78930503, + "learning_rate": 0.0001639248689012095, + "loss": 0.79998553, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.29101562, + "step": 3860, + "time_per_iteration": 2.571627378463745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071332, + "balance_loss_mlp": 1.04188704, + "epoch": 0.7427856868026164, + "flos": 458034200064.0, + "grad_norm": 0.06018469098837617, + "language_loss": 0.87730241, + "learning_rate": 0.00016369426362772271, + "loss": 0.88801575, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.29394531, + "step": 3861, + "time_per_iteration": 2.803495407104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107102, + "balance_loss_mlp": 1.04219532, + "epoch": 0.74297806848788, + "flos": 604728502272.0, + "grad_norm": 0.05947124800099814, + "language_loss": 0.80541736, + "learning_rate": 0.00016346378891970233, + "loss": 0.81612754, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.28833008, + "step": 3862, + "time_per_iteration": 2.8671751022338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071209, + "balance_loss_mlp": 1.04183578, + "epoch": 0.7431704501731435, + "flos": 890971564032.0, + "grad_norm": 0.05726542490411253, + "language_loss": 0.80970359, + "learning_rate": 0.00016323344486662633, + "loss": 0.82041574, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.29345703, + "step": 3863, + "time_per_iteration": 3.310399055480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067129, + "balance_loss_mlp": 1.03808928, + "epoch": 0.7433628318584071, + "flos": 591867896832.0, + "grad_norm": 0.05550567007056857, + "language_loss": 0.7837103, + "learning_rate": 0.00016300323155792247, + "loss": 0.79438156, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.29003906, + "step": 3864, + "time_per_iteration": 2.9007768630981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065912, + "balance_loss_mlp": 1.03658676, + "epoch": 0.7435552135436706, + "flos": 476896619520.0, + "grad_norm": 0.0566624200483065, + "language_loss": 0.8859086, + "learning_rate": 0.00016277314908296687, + "loss": 0.8965677, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.29296875, + "step": 3865, + "time_per_iteration": 2.6249654293060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066459, + "balance_loss_mlp": 1.03741968, + "epoch": 0.7437475952289342, + "flos": 672874076160.0, + "grad_norm": 0.08514855435260649, + "language_loss": 0.76358485, + "learning_rate": 0.00016254319753108604, + "loss": 0.77424943, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.2902832, + "step": 3866, + "time_per_iteration": 2.816335678100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070215, + "balance_loss_mlp": 1.04029381, + "epoch": 0.7439399769141978, + "flos": 770094577152.0, + "grad_norm": 0.06451588447838245, + "language_loss": 0.76624024, + "learning_rate": 0.00016231337699155492, + "loss": 0.77694237, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.29858398, + "step": 3867, + "time_per_iteration": 2.9624359607696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068647, + "balance_loss_mlp": 1.03965509, + "epoch": 0.7441323585994614, + "flos": 647462785536.0, + "grad_norm": 0.05724025816545972, + "language_loss": 0.78232771, + "learning_rate": 0.0001620836875535977, + "loss": 0.79301417, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.28930664, + "step": 3868, + "time_per_iteration": 2.847935199737549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064683, + "balance_loss_mlp": 1.03502417, + "epoch": 0.7443247402847248, + "flos": 565091859456.0, + "grad_norm": 0.05959682093806377, + "language_loss": 0.8083024, + "learning_rate": 0.00016185412930638766, + "loss": 0.81894922, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.29614258, + "step": 3869, + "time_per_iteration": 2.8403937816619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066357, + "balance_loss_mlp": 1.03738952, + "epoch": 0.7445171219699884, + "flos": 578243446272.0, + "grad_norm": 0.07528663769221765, + "language_loss": 0.82963836, + "learning_rate": 0.00016162470233904765, + "loss": 0.84030193, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.28955078, + "step": 3870, + "time_per_iteration": 2.7301175594329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065915, + "balance_loss_mlp": 1.03685129, + "epoch": 0.744709503655252, + "flos": 618588679680.0, + "grad_norm": 0.055174574386506046, + "language_loss": 0.8203845, + "learning_rate": 0.00016139540674064856, + "loss": 0.83104366, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.2902832, + "step": 3871, + "time_per_iteration": 2.728790760040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070553, + "balance_loss_mlp": 1.0411799, + "epoch": 0.7449018853405156, + "flos": 528355008000.0, + "grad_norm": 0.05299342012379109, + "language_loss": 0.77625883, + "learning_rate": 0.00016116624260021113, + "loss": 0.78696442, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.29321289, + "step": 3872, + "time_per_iteration": 2.7653627395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064492, + "balance_loss_mlp": 1.0351187, + "epoch": 0.7450942670257792, + "flos": 433088570880.0, + "grad_norm": 0.05882503001296847, + "language_loss": 0.8393743, + "learning_rate": 0.0001609372100067046, + "loss": 0.85001922, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.29345703, + "step": 3873, + "time_per_iteration": 2.556082010269165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062558, + "balance_loss_mlp": 1.03318477, + "epoch": 0.7452866487110427, + "flos": 696562647552.0, + "grad_norm": 0.0629532265793869, + "language_loss": 0.84404862, + "learning_rate": 0.0001607083090490475, + "loss": 0.85467416, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.29296875, + "step": 3874, + "time_per_iteration": 2.8703696727752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068049, + "balance_loss_mlp": 1.0391767, + "epoch": 0.7454790303963063, + "flos": 511944247296.0, + "grad_norm": 0.07079518805711353, + "language_loss": 0.79695952, + "learning_rate": 0.00016047953981610714, + "loss": 0.80764002, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.28857422, + "step": 3875, + "time_per_iteration": 2.7114357948303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006736, + "balance_loss_mlp": 0.99467212, + "epoch": 0.7456714120815698, + "flos": 1325221088256.0, + "grad_norm": 0.007120969619793637, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.80736375, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.12060547, + "step": 3876, + "time_per_iteration": 4.9630632400512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061746, + "balance_loss_mlp": 1.03232551, + "epoch": 0.7458637937668334, + "flos": 721397767680.0, + "grad_norm": 0.06112785741663116, + "language_loss": 0.81022239, + "learning_rate": 0.0001600223968795889, + "loss": 0.82083988, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.29394531, + "step": 3877, + "time_per_iteration": 2.8622119426727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006979, + "balance_loss_mlp": 0.99501073, + "epoch": 0.746056175452097, + "flos": 1500761793024.0, + "grad_norm": 0.005911171092350221, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.76703048, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.11962891, + "step": 3878, + "time_per_iteration": 4.92147159576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064327, + "balance_loss_mlp": 1.03521585, + "epoch": 0.7462485571373605, + "flos": 519984493056.0, + "grad_norm": 0.0740832902187226, + "language_loss": 0.81523597, + "learning_rate": 0.00015956578190706483, + "loss": 0.82587922, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.29077148, + "step": 3879, + "time_per_iteration": 2.673748016357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065097, + "balance_loss_mlp": 1.03529429, + "epoch": 0.7464409388226241, + "flos": 480967690752.0, + "grad_norm": 0.05926630999911606, + "language_loss": 0.75906825, + "learning_rate": 0.00015933767262892468, + "loss": 0.76971918, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.29760742, + "step": 3880, + "time_per_iteration": 2.7114145755767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069606, + "balance_loss_mlp": 1.03937459, + "epoch": 0.7466333205078877, + "flos": 486516989952.0, + "grad_norm": 0.07620522972756824, + "language_loss": 0.81981504, + "learning_rate": 0.00015910969560762927, + "loss": 0.83051109, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.30175781, + "step": 3881, + "time_per_iteration": 2.5965123176574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067971, + "balance_loss_mlp": 1.03790677, + "epoch": 0.7468257021931513, + "flos": 611015505408.0, + "grad_norm": 0.05603078059754119, + "language_loss": 0.83325368, + "learning_rate": 0.00015888185093168727, + "loss": 0.84393334, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.30053711, + "step": 3882, + "time_per_iteration": 2.732828378677368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067064, + "balance_loss_mlp": 1.03709519, + "epoch": 0.7470180838784147, + "flos": 533204481024.0, + "grad_norm": 0.06025549136597994, + "language_loss": 0.8122552, + "learning_rate": 0.00015865413868955581, + "loss": 0.82292587, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.29931641, + "step": 3883, + "time_per_iteration": 2.6130521297454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066146, + "balance_loss_mlp": 1.03577161, + "epoch": 0.7472104655636783, + "flos": 739005949440.0, + "grad_norm": 0.0544206071008422, + "language_loss": 0.8260529, + "learning_rate": 0.00015842655896964054, + "loss": 0.83671433, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.30322266, + "step": 3884, + "time_per_iteration": 3.0686898231506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068928, + "balance_loss_mlp": 1.03912604, + "epoch": 0.7474028472489419, + "flos": 640007474688.0, + "grad_norm": 0.07023161322090775, + "language_loss": 0.73560184, + "learning_rate": 0.00015819911186029567, + "loss": 0.7462911, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.29785156, + "step": 3885, + "time_per_iteration": 2.7895405292510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067979, + "balance_loss_mlp": 1.03808117, + "epoch": 0.7475952289342055, + "flos": 589980824064.0, + "grad_norm": 0.059238744927090525, + "language_loss": 0.86428809, + "learning_rate": 0.00015797179744982443, + "loss": 0.87496781, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.29833984, + "step": 3886, + "time_per_iteration": 2.7247395515441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068837, + "balance_loss_mlp": 1.03986931, + "epoch": 0.7477876106194691, + "flos": 487935581184.0, + "grad_norm": 0.04858811748134261, + "language_loss": 0.78711867, + "learning_rate": 0.00015774461582647765, + "loss": 0.79780704, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.28930664, + "step": 3887, + "time_per_iteration": 2.633619785308838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066597, + "balance_loss_mlp": 1.0372951, + "epoch": 0.7479799923047326, + "flos": 554470507008.0, + "grad_norm": 0.06558254439957789, + "language_loss": 0.80900019, + "learning_rate": 0.00015751756707845505, + "loss": 0.81966615, + "num_input_tokens_seen": 322472512, + "router_z_loss_mlp": 0.29272461, + "step": 3888, + "time_per_iteration": 2.606644630432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066336, + "balance_loss_mlp": 1.03703403, + "epoch": 0.7481723739899961, + "flos": 767037634560.0, + "grad_norm": 0.05503127509914209, + "language_loss": 0.88178474, + "learning_rate": 0.00015729065129390502, + "loss": 0.89244807, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.29296875, + "step": 3889, + "time_per_iteration": 2.997523784637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067289, + "balance_loss_mlp": 1.03891718, + "epoch": 0.7483647556752597, + "flos": 495926364672.0, + "grad_norm": 0.06469395023850445, + "language_loss": 0.82209432, + "learning_rate": 0.0001570638685609241, + "loss": 0.83276725, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.28369141, + "step": 3890, + "time_per_iteration": 2.569988250732422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106895, + "balance_loss_mlp": 1.03950548, + "epoch": 0.7485571373605233, + "flos": 472607350272.0, + "grad_norm": 0.06811331087467534, + "language_loss": 0.80319339, + "learning_rate": 0.00015683721896755693, + "loss": 0.81388295, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.29443359, + "step": 3891, + "time_per_iteration": 2.5164339542388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026235, + "balance_loss_mlp": 1.01455247, + "epoch": 0.7487495190457868, + "flos": 1553619833856.0, + "grad_norm": 0.016089611749753062, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.8323673, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.11669922, + "step": 3892, + "time_per_iteration": 4.94329047203064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071321, + "balance_loss_mlp": 1.04273486, + "epoch": 0.7489419007310504, + "flos": 581566639104.0, + "grad_norm": 0.05717636586120892, + "language_loss": 0.85079896, + "learning_rate": 0.00015638431955158528, + "loss": 0.86151218, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.28588867, + "step": 3893, + "time_per_iteration": 2.6895976066589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067708, + "balance_loss_mlp": 1.03823924, + "epoch": 0.749134282416314, + "flos": 567297616896.0, + "grad_norm": 0.05490928633036113, + "language_loss": 0.80953169, + "learning_rate": 0.00015615806990481186, + "loss": 0.82020867, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.29394531, + "step": 3894, + "time_per_iteration": 2.7377114295959473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066836, + "balance_loss_mlp": 1.03796339, + "epoch": 0.7493266641015776, + "flos": 532786871808.0, + "grad_norm": 0.04620973196436286, + "language_loss": 0.843225, + "learning_rate": 0.00015593195374931452, + "loss": 0.8538934, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.28808594, + "step": 3895, + "time_per_iteration": 2.7463459968566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066974, + "balance_loss_mlp": 1.03781486, + "epoch": 0.7495190457868411, + "flos": 523338209280.0, + "grad_norm": 0.06172140758760985, + "language_loss": 0.79870188, + "learning_rate": 0.00015570597117287922, + "loss": 0.80937159, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.29125977, + "step": 3896, + "time_per_iteration": 2.698322057723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065177, + "balance_loss_mlp": 1.03585148, + "epoch": 0.7497114274721046, + "flos": 513937598976.0, + "grad_norm": 0.06184521079833043, + "language_loss": 0.77818131, + "learning_rate": 0.0001554801222632406, + "loss": 0.78883302, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.29296875, + "step": 3897, + "time_per_iteration": 2.5883569717407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067337, + "balance_loss_mlp": 1.03872728, + "epoch": 0.7499038091573682, + "flos": 494759467008.0, + "grad_norm": 0.05373326836284952, + "language_loss": 0.8491286, + "learning_rate": 0.00015525440710808052, + "loss": 0.85980201, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.28588867, + "step": 3898, + "time_per_iteration": 2.628744125366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063318, + "balance_loss_mlp": 1.03415978, + "epoch": 0.7500961908426318, + "flos": 737326900224.0, + "grad_norm": 0.060715179246677825, + "language_loss": 0.77859104, + "learning_rate": 0.00015502882579502953, + "loss": 0.78922421, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.29101562, + "step": 3899, + "time_per_iteration": 2.9461636543273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106545, + "balance_loss_mlp": 1.03576672, + "epoch": 0.7502885725278954, + "flos": 533117140992.0, + "grad_norm": 0.04885018850646455, + "language_loss": 0.84403229, + "learning_rate": 0.00015480337841166592, + "loss": 0.85468674, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.29638672, + "step": 3900, + "time_per_iteration": 2.712470531463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071559, + "balance_loss_mlp": 1.04287767, + "epoch": 0.7504809542131589, + "flos": 589017567744.0, + "grad_norm": 0.062426881340490126, + "language_loss": 0.83192408, + "learning_rate": 0.00015457806504551647, + "loss": 0.84263968, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.28686523, + "step": 3901, + "time_per_iteration": 2.8195760250091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065, + "balance_loss_mlp": 1.0360322, + "epoch": 0.7506733358984224, + "flos": 511293883392.0, + "grad_norm": 0.11477974594715189, + "language_loss": 0.78299713, + "learning_rate": 0.0001543528857840554, + "loss": 0.79364717, + "num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.28955078, + "step": 3902, + "time_per_iteration": 2.630005121231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069882, + "balance_loss_mlp": 1.04155791, + "epoch": 0.750865717583686, + "flos": 538990917120.0, + "grad_norm": 0.06709872205496833, + "language_loss": 0.80052483, + "learning_rate": 0.000154127840714705, + "loss": 0.81122363, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.28320312, + "step": 3903, + "time_per_iteration": 2.7631478309631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064518, + "balance_loss_mlp": 1.03574109, + "epoch": 0.7510580992689496, + "flos": 476339387904.0, + "grad_norm": 0.0656362631946546, + "language_loss": 0.81441653, + "learning_rate": 0.00015390292992483557, + "loss": 0.82506168, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.28759766, + "step": 3904, + "time_per_iteration": 2.5295097827911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069456, + "balance_loss_mlp": 1.0401783, + "epoch": 0.7512504809542132, + "flos": 578755597824.0, + "grad_norm": 0.05357678642302426, + "language_loss": 0.84239411, + "learning_rate": 0.00015367815350176523, + "loss": 0.85308868, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.29223633, + "step": 3905, + "time_per_iteration": 2.774902582168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065573, + "balance_loss_mlp": 1.03674817, + "epoch": 0.7514428626394767, + "flos": 418435435008.0, + "grad_norm": 0.052651193007747205, + "language_loss": 0.82780552, + "learning_rate": 0.00015345351153275987, + "loss": 0.83846122, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.28808594, + "step": 3906, + "time_per_iteration": 2.514157772064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068641, + "balance_loss_mlp": 1.03964877, + "epoch": 0.7516352443247403, + "flos": 640736414208.0, + "grad_norm": 0.05447043379457725, + "language_loss": 0.80753815, + "learning_rate": 0.00015322900410503332, + "loss": 0.81822455, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.28955078, + "step": 3907, + "time_per_iteration": 2.8011515140533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070258, + "balance_loss_mlp": 1.04150474, + "epoch": 0.7518276260100039, + "flos": 580700897280.0, + "grad_norm": 0.13484252880290531, + "language_loss": 0.77137792, + "learning_rate": 0.00015300463130574703, + "loss": 0.78208047, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.28710938, + "step": 3908, + "time_per_iteration": 2.8607709407806396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068663, + "balance_loss_mlp": 1.03983819, + "epoch": 0.7520200076952674, + "flos": 687025234944.0, + "grad_norm": 0.04704882043674688, + "language_loss": 0.82268852, + "learning_rate": 0.00015278039322201033, + "loss": 0.8333751, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.28808594, + "step": 3909, + "time_per_iteration": 2.9650497436523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068127, + "balance_loss_mlp": 1.04047048, + "epoch": 0.7522123893805309, + "flos": 486196895232.0, + "grad_norm": 0.0655524275561889, + "language_loss": 0.79742765, + "learning_rate": 0.00015255628994088004, + "loss": 0.80810893, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.27685547, + "step": 3910, + "time_per_iteration": 2.5476014614105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073189, + "balance_loss_mlp": 1.04410195, + "epoch": 0.7524047710657945, + "flos": 818581800960.0, + "grad_norm": 0.059223553783327845, + "language_loss": 0.74873102, + "learning_rate": 0.00015233232154936082, + "loss": 0.75946289, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.29101562, + "step": 3911, + "time_per_iteration": 3.244593858718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070034, + "balance_loss_mlp": 1.04204392, + "epoch": 0.7525971527510581, + "flos": 699191806464.0, + "grad_norm": 0.05757806259910298, + "language_loss": 0.76233411, + "learning_rate": 0.0001521084881344048, + "loss": 0.77303445, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.27978516, + "step": 3912, + "time_per_iteration": 2.874175548553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068448, + "balance_loss_mlp": 1.03988528, + "epoch": 0.7527895344363217, + "flos": 633497891328.0, + "grad_norm": 0.058305123662607664, + "language_loss": 0.8657366, + "learning_rate": 0.00015188478978291208, + "loss": 0.87642109, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.28564453, + "step": 3913, + "time_per_iteration": 2.76914119720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072508, + "balance_loss_mlp": 1.04387414, + "epoch": 0.7529819161215853, + "flos": 562555832832.0, + "grad_norm": 0.05696914319302461, + "language_loss": 0.8621434, + "learning_rate": 0.00015166122658173014, + "loss": 0.87286842, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.28637695, + "step": 3914, + "time_per_iteration": 2.7666819095611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069582, + "balance_loss_mlp": 1.04121017, + "epoch": 0.7531742978068487, + "flos": 690344045568.0, + "grad_norm": 0.05613078933144466, + "language_loss": 0.88230741, + "learning_rate": 0.00015143779861765332, + "loss": 0.89300323, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.28369141, + "step": 3915, + "time_per_iteration": 2.9440953731536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068792, + "balance_loss_mlp": 1.04058695, + "epoch": 0.7533666794921123, + "flos": 680800840704.0, + "grad_norm": 0.0540096565314657, + "language_loss": 0.81303173, + "learning_rate": 0.00015121450597742458, + "loss": 0.82371962, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.28198242, + "step": 3916, + "time_per_iteration": 2.8476526737213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067282, + "balance_loss_mlp": 1.03871989, + "epoch": 0.7535590611773759, + "flos": 623384308224.0, + "grad_norm": 0.0625846652791648, + "language_loss": 0.78284335, + "learning_rate": 0.00015099134874773369, + "loss": 0.79351616, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.28613281, + "step": 3917, + "time_per_iteration": 2.7236275672912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066499, + "balance_loss_mlp": 1.03791249, + "epoch": 0.7537514428626395, + "flos": 519162421248.0, + "grad_norm": 0.06623718225432344, + "language_loss": 0.80174196, + "learning_rate": 0.00015076832701521793, + "loss": 0.81240696, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.28588867, + "step": 3918, + "time_per_iteration": 2.7410969734191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070943, + "balance_loss_mlp": 1.04238045, + "epoch": 0.753943824547903, + "flos": 723309571584.0, + "grad_norm": 0.06658372042006708, + "language_loss": 0.81702781, + "learning_rate": 0.000150545440866462, + "loss": 0.82773727, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.28540039, + "step": 3919, + "time_per_iteration": 2.9761922359466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069188, + "balance_loss_mlp": 1.04143584, + "epoch": 0.7541362062331666, + "flos": 437318203392.0, + "grad_norm": 0.07410111643216553, + "language_loss": 0.78494799, + "learning_rate": 0.000150322690387998, + "loss": 0.79563987, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.27758789, + "step": 3920, + "time_per_iteration": 2.516460657119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071675, + "balance_loss_mlp": 1.04316044, + "epoch": 0.7543285879184302, + "flos": 565007491584.0, + "grad_norm": 0.05131276366098942, + "language_loss": 0.74961436, + "learning_rate": 0.00015010007566630535, + "loss": 0.76033103, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.28491211, + "step": 3921, + "time_per_iteration": 2.7329115867614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071062, + "balance_loss_mlp": 1.04383469, + "epoch": 0.7545209696036937, + "flos": 520781833728.0, + "grad_norm": 0.07801712247115837, + "language_loss": 0.81558347, + "learning_rate": 0.00014987759678781077, + "loss": 0.82629412, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.27246094, + "step": 3922, + "time_per_iteration": 2.611708641052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071556, + "balance_loss_mlp": 1.04370856, + "epoch": 0.7547133512889573, + "flos": 615782020608.0, + "grad_norm": 0.05153768257221068, + "language_loss": 0.82422328, + "learning_rate": 0.00014965525383888795, + "loss": 0.83493882, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.27856445, + "step": 3923, + "time_per_iteration": 2.7729198932647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072323, + "balance_loss_mlp": 1.04433274, + "epoch": 0.7549057329742208, + "flos": 750522157056.0, + "grad_norm": 0.0575234231525959, + "language_loss": 0.7209577, + "learning_rate": 0.00014943304690585851, + "loss": 0.73168093, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.2800293, + "step": 3924, + "time_per_iteration": 2.9442129135131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071679, + "balance_loss_mlp": 1.04378402, + "epoch": 0.7550981146594844, + "flos": 514193674752.0, + "grad_norm": 0.07421500953939195, + "language_loss": 0.79421008, + "learning_rate": 0.0001492109760749908, + "loss": 0.80492687, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.27905273, + "step": 3925, + "time_per_iteration": 2.643162965774536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071892, + "balance_loss_mlp": 1.04392564, + "epoch": 0.755290496344748, + "flos": 521756674560.0, + "grad_norm": 0.059903848409534166, + "language_loss": 0.79955506, + "learning_rate": 0.00014898904143250002, + "loss": 0.81027395, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.27978516, + "step": 3926, + "time_per_iteration": 2.6683785915374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013114, + "balance_loss_mlp": 1.00157464, + "epoch": 0.7554828780300116, + "flos": 1413845521920.0, + "grad_norm": 0.014723160486699832, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76768315, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.11523438, + "step": 3927, + "time_per_iteration": 4.920205354690552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071852, + "balance_loss_mlp": 1.04331291, + "epoch": 0.7556752597152752, + "flos": 556676264448.0, + "grad_norm": 0.05563270173237852, + "language_loss": 0.80196631, + "learning_rate": 0.0001485455810572474, + "loss": 0.81268483, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.28540039, + "step": 3928, + "time_per_iteration": 2.6541106700897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073533, + "balance_loss_mlp": 1.04499388, + "epoch": 0.7558676414005386, + "flos": 563363347968.0, + "grad_norm": 0.04999178273670638, + "language_loss": 0.84088999, + "learning_rate": 0.00014832405549665236, + "loss": 0.85162532, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.28564453, + "step": 3929, + "time_per_iteration": 2.6799492835998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070648, + "balance_loss_mlp": 1.04203749, + "epoch": 0.7560600230858022, + "flos": 561089189376.0, + "grad_norm": 0.061253165396126415, + "language_loss": 0.78636932, + "learning_rate": 0.00014810266646876746, + "loss": 0.79707581, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.28613281, + "step": 3930, + "time_per_iteration": 2.7644495964050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068729, + "balance_loss_mlp": 1.03980851, + "epoch": 0.7562524047710658, + "flos": 719232708096.0, + "grad_norm": 0.0768252646204266, + "language_loss": 0.77379584, + "learning_rate": 0.00014788141405954364, + "loss": 0.78448313, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.28930664, + "step": 3931, + "time_per_iteration": 2.996284246444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072748, + "balance_loss_mlp": 1.04418492, + "epoch": 0.7564447864563294, + "flos": 543086719488.0, + "grad_norm": 0.07792136157882237, + "language_loss": 0.84719956, + "learning_rate": 0.00014766029835487865, + "loss": 0.85792696, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.28564453, + "step": 3932, + "time_per_iteration": 2.7055630683898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010723, + "balance_loss_mlp": 1.04326117, + "epoch": 0.7566371681415929, + "flos": 725484805632.0, + "grad_norm": 0.0830870815556461, + "language_loss": 0.79488772, + "learning_rate": 0.0001474393194406173, + "loss": 0.80561072, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.29052734, + "step": 3933, + "time_per_iteration": 2.8866286277770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075045, + "balance_loss_mlp": 1.04583836, + "epoch": 0.7568295498268565, + "flos": 576274825728.0, + "grad_norm": 0.06997934005865011, + "language_loss": 0.79262674, + "learning_rate": 0.00014721847740255112, + "loss": 0.80337715, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.29174805, + "step": 3934, + "time_per_iteration": 2.8177120685577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013149, + "balance_loss_mlp": 1.00151432, + "epoch": 0.75702193151212, + "flos": 1519273594368.0, + "grad_norm": 0.018539216642102736, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.74925071, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.11621094, + "step": 3935, + "time_per_iteration": 4.663410186767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070009, + "balance_loss_mlp": 1.04085028, + "epoch": 0.7572143131973836, + "flos": 525218079744.0, + "grad_norm": 0.08081636486404137, + "language_loss": 0.7884202, + "learning_rate": 0.00014677720429790526, + "loss": 0.79912031, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.29125977, + "step": 3936, + "time_per_iteration": 2.5801281929016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106807, + "balance_loss_mlp": 1.03791022, + "epoch": 0.7574066948826472, + "flos": 550467836928.0, + "grad_norm": 0.05183566311050574, + "language_loss": 0.8430894, + "learning_rate": 0.0001465567734026429, + "loss": 0.85377008, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.30126953, + "step": 3937, + "time_per_iteration": 2.711367607116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071543, + "balance_loss_mlp": 1.0420028, + "epoch": 0.7575990765679107, + "flos": 395682416640.0, + "grad_norm": 0.061048992240079196, + "language_loss": 0.82235777, + "learning_rate": 0.00014633647972621034, + "loss": 0.83307326, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.29492188, + "step": 3938, + "time_per_iteration": 2.4616081714630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067246, + "balance_loss_mlp": 1.03861201, + "epoch": 0.7577914582531743, + "flos": 584742855168.0, + "grad_norm": 0.05374365085178841, + "language_loss": 0.86112857, + "learning_rate": 0.00014611632335413354, + "loss": 0.87180108, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.28637695, + "step": 3939, + "time_per_iteration": 2.815455436706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061142, + "balance_loss_mlp": 1.03296053, + "epoch": 0.7579838399384379, + "flos": 820604265984.0, + "grad_norm": 0.05753060969911492, + "language_loss": 0.82291019, + "learning_rate": 0.00014589630437188456, + "loss": 0.8335216, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.28222656, + "step": 3940, + "time_per_iteration": 3.190596580505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065405, + "balance_loss_mlp": 1.03698504, + "epoch": 0.7581762216237015, + "flos": 443664843264.0, + "grad_norm": 0.07206463977261317, + "language_loss": 0.78593653, + "learning_rate": 0.00014567642286488253, + "loss": 0.79659057, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.28466797, + "step": 3941, + "time_per_iteration": 2.5607380867004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071035, + "balance_loss_mlp": 1.04073191, + "epoch": 0.7583686033089649, + "flos": 540624886272.0, + "grad_norm": 0.06381401552287866, + "language_loss": 0.79120469, + "learning_rate": 0.00014545667891849258, + "loss": 0.80191505, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.30249023, + "step": 3942, + "time_per_iteration": 2.6117217540740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071107, + "balance_loss_mlp": 1.04192472, + "epoch": 0.7585609849942285, + "flos": 522332845056.0, + "grad_norm": 0.05226186971292142, + "language_loss": 0.82272542, + "learning_rate": 0.00014523707261802733, + "loss": 0.83343649, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.29174805, + "step": 3943, + "time_per_iteration": 2.665384292602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072852, + "balance_loss_mlp": 1.04448068, + "epoch": 0.7587533666794921, + "flos": 541599727104.0, + "grad_norm": 0.07358446075620559, + "language_loss": 0.81266546, + "learning_rate": 0.00014501760404874527, + "loss": 0.823394, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.28344727, + "step": 3944, + "time_per_iteration": 2.723860263824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076046, + "balance_loss_mlp": 1.04698288, + "epoch": 0.7589457483647557, + "flos": 606131126784.0, + "grad_norm": 0.059139493232711386, + "language_loss": 0.85488701, + "learning_rate": 0.00014479827329585176, + "loss": 0.86564749, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.29052734, + "step": 3945, + "time_per_iteration": 2.6966402530670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070791, + "balance_loss_mlp": 1.04260945, + "epoch": 0.7591381300500193, + "flos": 554821125120.0, + "grad_norm": 0.05454852499248085, + "language_loss": 0.84753144, + "learning_rate": 0.00014457908044449846, + "loss": 0.85823941, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.28173828, + "step": 3946, + "time_per_iteration": 2.751542329788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069083, + "balance_loss_mlp": 1.0412122, + "epoch": 0.7593305117352828, + "flos": 529399660032.0, + "grad_norm": 0.057352771815407315, + "language_loss": 0.82947516, + "learning_rate": 0.00014436002557978371, + "loss": 0.84016603, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.27856445, + "step": 3947, + "time_per_iteration": 2.8199281692504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024854, + "balance_loss_mlp": 1.0139817, + "epoch": 0.7595228934205464, + "flos": 1502020412928.0, + "grad_norm": 0.01569529231199887, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77667999, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.10888672, + "step": 3948, + "time_per_iteration": 4.886767387390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071405, + "balance_loss_mlp": 1.04217458, + "epoch": 0.7597152751058099, + "flos": 455290149888.0, + "grad_norm": 0.052184618076363286, + "language_loss": 0.79761183, + "learning_rate": 0.0001439223301503945, + "loss": 0.80832583, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.29223633, + "step": 3949, + "time_per_iteration": 2.524615526199341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107458, + "balance_loss_mlp": 1.04644656, + "epoch": 0.7599076567910735, + "flos": 685135190016.0, + "grad_norm": 0.06319987538441409, + "language_loss": 0.76281846, + "learning_rate": 0.00014370368975564834, + "loss": 0.77356422, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.28112793, + "step": 3950, + "time_per_iteration": 2.9306294918060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073257, + "balance_loss_mlp": 1.045434, + "epoch": 0.760100038476337, + "flos": 532092837888.0, + "grad_norm": 0.07868227598634299, + "language_loss": 0.83049744, + "learning_rate": 0.00014348518768739766, + "loss": 0.84123003, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.27832031, + "step": 3951, + "time_per_iteration": 2.7313663959503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027286, + "balance_loss_mlp": 1.01646149, + "epoch": 0.7602924201616006, + "flos": 1470952134144.0, + "grad_norm": 0.015467940128204082, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.77755326, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.10839844, + "step": 3952, + "time_per_iteration": 4.869096994400024 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107331, + "balance_loss_mlp": 1.04593956, + "epoch": 0.7604848018468642, + "flos": 774280539648.0, + "grad_norm": 0.05530347415553069, + "language_loss": 0.86385798, + "learning_rate": 0.00014304859886964867, + "loss": 0.87459111, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 0.27441406, + "step": 3953, + "time_per_iteration": 3.04145884513855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073306, + "balance_loss_mlp": 1.04591215, + "epoch": 0.7606771835321278, + "flos": 557917355520.0, + "grad_norm": 0.05036114884340379, + "language_loss": 0.83556843, + "learning_rate": 0.00014283051228964878, + "loss": 0.8463015, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.27416992, + "step": 3954, + "time_per_iteration": 2.694143772125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072565, + "balance_loss_mlp": 1.0455761, + "epoch": 0.7608695652173914, + "flos": 525139504128.0, + "grad_norm": 0.07332559246133831, + "language_loss": 0.82520175, + "learning_rate": 0.00014261256437514197, + "loss": 0.83592749, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.27026367, + "step": 3955, + "time_per_iteration": 2.644928455352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081954, + "balance_loss_mlp": 1.05405927, + "epoch": 0.7610619469026548, + "flos": 614757717504.0, + "grad_norm": 0.0938811683144382, + "language_loss": 0.82110238, + "learning_rate": 0.0001423947552107428, + "loss": 0.83192188, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.27929688, + "step": 3956, + "time_per_iteration": 2.7390809059143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_mlp": 1.0495677, + "epoch": 0.7612543285879184, + "flos": 862992313344.0, + "grad_norm": 0.058156679645763765, + "language_loss": 0.77027428, + "learning_rate": 0.00014217708488101243, + "loss": 0.78105605, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.28637695, + "step": 3957, + "time_per_iteration": 3.068586587905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078853, + "balance_loss_mlp": 1.0505054, + "epoch": 0.761446710273182, + "flos": 553392359424.0, + "grad_norm": 0.051838175229669575, + "language_loss": 0.76812273, + "learning_rate": 0.0001419595534704579, + "loss": 0.77891129, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.28369141, + "step": 3958, + "time_per_iteration": 2.6755166053771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078829, + "balance_loss_mlp": 1.05176806, + "epoch": 0.7616390919584456, + "flos": 467107513344.0, + "grad_norm": 0.08007848421566002, + "language_loss": 0.80974507, + "learning_rate": 0.00014174216106353237, + "loss": 0.82053339, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.27124023, + "step": 3959, + "time_per_iteration": 2.6076533794403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077365, + "balance_loss_mlp": 1.04985189, + "epoch": 0.7618314736437091, + "flos": 498181584384.0, + "grad_norm": 0.05778330536162942, + "language_loss": 0.75894332, + "learning_rate": 0.00014152490774463512, + "loss": 0.76971698, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.27539062, + "step": 3960, + "time_per_iteration": 2.690720558166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079227, + "balance_loss_mlp": 1.05211914, + "epoch": 0.7620238553289727, + "flos": 434319487488.0, + "grad_norm": 0.07078023204432035, + "language_loss": 0.86778873, + "learning_rate": 0.00014130779359811135, + "loss": 0.87858105, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.27148438, + "step": 3961, + "time_per_iteration": 2.479097843170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074576, + "balance_loss_mlp": 1.04672933, + "epoch": 0.7622162370142362, + "flos": 663962296320.0, + "grad_norm": 0.053637952879954945, + "language_loss": 0.85656244, + "learning_rate": 0.0001410908187082521, + "loss": 0.86730814, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.27856445, + "step": 3962, + "time_per_iteration": 2.8493921756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073404, + "balance_loss_mlp": 1.04527116, + "epoch": 0.7624086186994998, + "flos": 557700567552.0, + "grad_norm": 0.06361910700745704, + "language_loss": 0.82962865, + "learning_rate": 0.0001408739831592949, + "loss": 0.84036273, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.28149414, + "step": 3963, + "time_per_iteration": 2.670091152191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072269, + "balance_loss_mlp": 1.04530358, + "epoch": 0.7626010003847634, + "flos": 628844857344.0, + "grad_norm": 0.06318704886131189, + "language_loss": 0.77098757, + "learning_rate": 0.0001406572870354224, + "loss": 0.78171021, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.27001953, + "step": 3964, + "time_per_iteration": 2.8136370182037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076051, + "balance_loss_mlp": 1.04758406, + "epoch": 0.7627933820700269, + "flos": 437716873728.0, + "grad_norm": 0.08123777777865493, + "language_loss": 0.87067986, + "learning_rate": 0.00014044073042076337, + "loss": 0.88144034, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.28491211, + "step": 3965, + "time_per_iteration": 2.601212739944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077118, + "balance_loss_mlp": 1.04948556, + "epoch": 0.7629857637552905, + "flos": 532456602624.0, + "grad_norm": 0.044562098322040423, + "language_loss": 0.88958192, + "learning_rate": 0.00014022431339939302, + "loss": 0.90035319, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.27636719, + "step": 3966, + "time_per_iteration": 2.6651570796966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069045, + "balance_loss_mlp": 1.04119754, + "epoch": 0.7631781454405541, + "flos": 679737249792.0, + "grad_norm": 0.09228261412980937, + "language_loss": 0.77959037, + "learning_rate": 0.00014000803605533163, + "loss": 0.79028082, + "num_input_tokens_seen": 329034960, + "router_z_loss_mlp": 0.27856445, + "step": 3967, + "time_per_iteration": 2.8413825035095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070975, + "balance_loss_mlp": 1.04367566, + "epoch": 0.7633705271258177, + "flos": 507246133248.0, + "grad_norm": 0.08332228620070425, + "language_loss": 0.83150613, + "learning_rate": 0.00013979189847254553, + "loss": 0.8422159, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.27294922, + "step": 3968, + "time_per_iteration": 2.578245162963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071362, + "balance_loss_mlp": 1.04282331, + "epoch": 0.7635629088110811, + "flos": 618574123008.0, + "grad_norm": 0.06392054280336681, + "language_loss": 0.80515426, + "learning_rate": 0.00013957590073494674, + "loss": 0.8158679, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.28540039, + "step": 3969, + "time_per_iteration": 2.7899181842803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069143, + "balance_loss_mlp": 1.04182076, + "epoch": 0.7637552904963447, + "flos": 638140750848.0, + "grad_norm": 0.08725250729100972, + "language_loss": 0.7866261, + "learning_rate": 0.0001393600429263931, + "loss": 0.7973175, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.2734375, + "step": 3970, + "time_per_iteration": 2.7429044246673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010211, + "balance_loss_mlp": 0.99867129, + "epoch": 0.7639476721816083, + "flos": 1562359905792.0, + "grad_norm": 0.0172148744606984, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.75755095, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.11523438, + "step": 3971, + "time_per_iteration": 4.9502363204956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066404, + "balance_loss_mlp": 1.03834224, + "epoch": 0.7641400538668719, + "flos": 495729925632.0, + "grad_norm": 0.05751268278268784, + "language_loss": 0.81411171, + "learning_rate": 0.0001389287474315804, + "loss": 0.8247757, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.28076172, + "step": 3972, + "time_per_iteration": 2.6566832065582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070413, + "balance_loss_mlp": 1.04213631, + "epoch": 0.7643324355521355, + "flos": 578173635072.0, + "grad_norm": 0.05008758615727923, + "language_loss": 0.8002165, + "learning_rate": 0.00013871330991276505, + "loss": 0.81092072, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.28295898, + "step": 3973, + "time_per_iteration": 2.7023086547851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071476, + "balance_loss_mlp": 1.04334247, + "epoch": 0.764524817237399, + "flos": 784472698368.0, + "grad_norm": 0.061481835950818894, + "language_loss": 0.80452615, + "learning_rate": 0.00013849801265788247, + "loss": 0.81524092, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.28149414, + "step": 3974, + "time_per_iteration": 2.997316837310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067079, + "balance_loss_mlp": 1.03861213, + "epoch": 0.7647171989226625, + "flos": 526025594880.0, + "grad_norm": 0.07226378616877399, + "language_loss": 0.82833815, + "learning_rate": 0.00013828285575051818, + "loss": 0.83900893, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.28466797, + "step": 3975, + "time_per_iteration": 2.588979721069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062123, + "balance_loss_mlp": 1.03437066, + "epoch": 0.7649095806079261, + "flos": 554589780480.0, + "grad_norm": 0.06463560472951296, + "language_loss": 0.83791184, + "learning_rate": 0.0001380678392742035, + "loss": 0.84853303, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.27783203, + "step": 3976, + "time_per_iteration": 2.734581708908081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061699, + "balance_loss_mlp": 1.03378069, + "epoch": 0.7651019622931897, + "flos": 648836296704.0, + "grad_norm": 0.05082413379641715, + "language_loss": 0.84568453, + "learning_rate": 0.00013785296331241526, + "loss": 0.85630155, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.27954102, + "step": 3977, + "time_per_iteration": 2.9020192623138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065476, + "balance_loss_mlp": 1.03727102, + "epoch": 0.7652943439784533, + "flos": 1046034971136.0, + "grad_norm": 0.0974531570465959, + "language_loss": 0.86962479, + "learning_rate": 0.00013763822794857583, + "loss": 0.88027954, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.28222656, + "step": 3978, + "time_per_iteration": 3.2940611839294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066351, + "balance_loss_mlp": 1.03847969, + "epoch": 0.7654867256637168, + "flos": 504085883904.0, + "grad_norm": 0.06678664441020601, + "language_loss": 0.89705759, + "learning_rate": 0.00013742363326605278, + "loss": 0.9077211, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.27880859, + "step": 3979, + "time_per_iteration": 2.717656135559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064419, + "balance_loss_mlp": 1.03473556, + "epoch": 0.7656791073489804, + "flos": 574422658560.0, + "grad_norm": 0.10335635669358377, + "language_loss": 0.78531003, + "learning_rate": 0.00013720917934815935, + "loss": 0.79595423, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.296875, + "step": 3980, + "time_per_iteration": 2.7627711296081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067901, + "balance_loss_mlp": 1.03960097, + "epoch": 0.765871489034244, + "flos": 492568266240.0, + "grad_norm": 0.07286561915101249, + "language_loss": 0.82861632, + "learning_rate": 0.00013699486627815344, + "loss": 0.83929539, + "num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.28295898, + "step": 3981, + "time_per_iteration": 2.612478494644165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068523, + "balance_loss_mlp": 1.04024673, + "epoch": 0.7660638707195075, + "flos": 485769111552.0, + "grad_norm": 0.05570598750158071, + "language_loss": 0.82202697, + "learning_rate": 0.00013678069413923928, + "loss": 0.83271217, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.28320312, + "step": 3982, + "time_per_iteration": 2.586998701095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067122, + "balance_loss_mlp": 1.03844047, + "epoch": 0.766256252404771, + "flos": 444059131392.0, + "grad_norm": 0.07121708811283338, + "language_loss": 0.81735259, + "learning_rate": 0.00013656666301456555, + "loss": 0.82802379, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.28662109, + "step": 3983, + "time_per_iteration": 2.574695587158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066984, + "balance_loss_mlp": 1.03906524, + "epoch": 0.7664486340900346, + "flos": 484922308608.0, + "grad_norm": 0.055314975613937604, + "language_loss": 0.83996785, + "learning_rate": 0.0001363527729872267, + "loss": 0.85063773, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.27929688, + "step": 3984, + "time_per_iteration": 2.6829311847686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069719, + "balance_loss_mlp": 1.04191911, + "epoch": 0.7666410157752982, + "flos": 645905981952.0, + "grad_norm": 0.061166263195475266, + "language_loss": 0.76441991, + "learning_rate": 0.00013613902414026207, + "loss": 0.77511704, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.27832031, + "step": 3985, + "time_per_iteration": 2.7802467346191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067154, + "balance_loss_mlp": 1.03947425, + "epoch": 0.7668333974605618, + "flos": 773964827136.0, + "grad_norm": 0.05402447635552578, + "language_loss": 0.82339627, + "learning_rate": 0.00013592541655665642, + "loss": 0.83406782, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.27709961, + "step": 3986, + "time_per_iteration": 2.9866812229156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070906, + "balance_loss_mlp": 1.04255819, + "epoch": 0.7670257791458254, + "flos": 613200913920.0, + "grad_norm": 0.07328879507268711, + "language_loss": 0.85332406, + "learning_rate": 0.00013571195031933947, + "loss": 0.86403316, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.28320312, + "step": 3987, + "time_per_iteration": 2.673912525177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016637, + "balance_loss_mlp": 1.00533557, + "epoch": 0.7672181608310888, + "flos": 1484608670208.0, + "grad_norm": 0.005208486185004438, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.81497979, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.11279297, + "step": 3988, + "time_per_iteration": 4.698279619216919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069975, + "balance_loss_mlp": 1.04217589, + "epoch": 0.7674105425163524, + "flos": 610449509376.0, + "grad_norm": 0.06677874529098146, + "language_loss": 0.85441434, + "learning_rate": 0.00013528544221501655, + "loss": 0.86511409, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.27832031, + "step": 3989, + "time_per_iteration": 2.7262814044952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079413, + "balance_loss_mlp": 1.05132711, + "epoch": 0.767602924201616, + "flos": 844857423360.0, + "grad_norm": 0.06376913662917556, + "language_loss": 0.81445122, + "learning_rate": 0.00013507240051359586, + "loss": 0.82524538, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.28100586, + "step": 3990, + "time_per_iteration": 3.0680136680603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076429, + "balance_loss_mlp": 1.04944038, + "epoch": 0.7677953058868796, + "flos": 526857841152.0, + "grad_norm": 0.06248947721820998, + "language_loss": 0.85939497, + "learning_rate": 0.00013485950048963425, + "loss": 0.87015927, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.27026367, + "step": 3991, + "time_per_iteration": 2.652700424194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072898, + "balance_loss_mlp": 1.04629004, + "epoch": 0.7679876875721431, + "flos": 923161660416.0, + "grad_norm": 0.05838140649114419, + "language_loss": 0.82813108, + "learning_rate": 0.00013464674222578643, + "loss": 0.83886003, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.26660156, + "step": 3992, + "time_per_iteration": 3.199664354324341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078059, + "balance_loss_mlp": 1.05028319, + "epoch": 0.7681800692574067, + "flos": 457855289856.0, + "grad_norm": 0.060819943301615054, + "language_loss": 0.8307544, + "learning_rate": 0.00013443412580465292, + "loss": 0.84153497, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.27783203, + "step": 3993, + "time_per_iteration": 2.6216468811035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077791, + "balance_loss_mlp": 1.04999137, + "epoch": 0.7683724509426703, + "flos": 658113251328.0, + "grad_norm": 0.05683440391019819, + "language_loss": 0.83944607, + "learning_rate": 0.00013422165130877857, + "loss": 0.85022402, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.27807617, + "step": 3994, + "time_per_iteration": 2.8932595252990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077464, + "balance_loss_mlp": 1.05011749, + "epoch": 0.7685648326279338, + "flos": 555021946368.0, + "grad_norm": 0.058104534387139244, + "language_loss": 0.80272782, + "learning_rate": 0.00013400931882065327, + "loss": 0.81350249, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.27392578, + "step": 3995, + "time_per_iteration": 2.6307244300842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107594, + "balance_loss_mlp": 1.04768717, + "epoch": 0.7687572143131974, + "flos": 687070315008.0, + "grad_norm": 0.08323850441020555, + "language_loss": 0.80980253, + "learning_rate": 0.0001337971284227118, + "loss": 0.82056189, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.28222656, + "step": 3996, + "time_per_iteration": 3.022775411605835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025127, + "balance_loss_mlp": 1.01415932, + "epoch": 0.7689495959984609, + "flos": 1488653448192.0, + "grad_norm": 0.008597329334489423, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.7714355, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.10986328, + "step": 3997, + "time_per_iteration": 4.959140777587891 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073396, + "balance_loss_mlp": 1.0458113, + "epoch": 0.7691419776837245, + "flos": 570133389312.0, + "grad_norm": 0.05719845249799778, + "language_loss": 0.80268121, + "learning_rate": 0.0001333731742268438, + "loss": 0.81341517, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.27636719, + "step": 3998, + "time_per_iteration": 2.6925253868103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078887, + "balance_loss_mlp": 1.05142081, + "epoch": 0.7693343593689881, + "flos": 519812785152.0, + "grad_norm": 0.05688018347037518, + "language_loss": 0.85395527, + "learning_rate": 0.0001331614105935109, + "loss": 0.86474419, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.27514648, + "step": 3999, + "time_per_iteration": 2.653233051300049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076072, + "balance_loss_mlp": 1.04843915, + "epoch": 0.7695267410542517, + "flos": 660086254080.0, + "grad_norm": 0.05160358655207702, + "language_loss": 0.84470475, + "learning_rate": 0.00013294978937954883, + "loss": 0.85546547, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.27685547, + "step": 4000, + "time_per_iteration": 2.776451349258423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073445, + "balance_loss_mlp": 1.04562187, + "epoch": 0.7697191227395151, + "flos": 546548124672.0, + "grad_norm": 0.08124921192431957, + "language_loss": 0.8516435, + "learning_rate": 0.00013273831066711655, + "loss": 0.862378, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.27856445, + "step": 4001, + "time_per_iteration": 2.624626874923706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075558, + "balance_loss_mlp": 1.04903352, + "epoch": 0.7699115044247787, + "flos": 540339697152.0, + "grad_norm": 0.06596404445695028, + "language_loss": 0.79911482, + "learning_rate": 0.00013252697453831747, + "loss": 0.80987036, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.26574707, + "step": 4002, + "time_per_iteration": 2.714096784591675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072085, + "balance_loss_mlp": 1.04480982, + "epoch": 0.7701038861100423, + "flos": 562635818496.0, + "grad_norm": 0.05249171180112231, + "language_loss": 0.82409763, + "learning_rate": 0.00013231578107519916, + "loss": 0.83481848, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.27319336, + "step": 4003, + "time_per_iteration": 2.8834095001220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073589, + "balance_loss_mlp": 1.04602814, + "epoch": 0.7702962677953059, + "flos": 481490016768.0, + "grad_norm": 0.06222122285204978, + "language_loss": 0.82945186, + "learning_rate": 0.00013210473035975422, + "loss": 0.84018773, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.27587891, + "step": 4004, + "time_per_iteration": 2.5676841735839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075243, + "balance_loss_mlp": 1.04756224, + "epoch": 0.7704886494805695, + "flos": 770036350464.0, + "grad_norm": 0.09382472586261968, + "language_loss": 0.85468185, + "learning_rate": 0.0001318938224739201, + "loss": 0.86543441, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.27734375, + "step": 4005, + "time_per_iteration": 3.032289743423462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072979, + "balance_loss_mlp": 1.04544115, + "epoch": 0.770681031165833, + "flos": 600912096768.0, + "grad_norm": 0.05515917324758249, + "language_loss": 0.83841556, + "learning_rate": 0.00013168305749957843, + "loss": 0.84914535, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.27587891, + "step": 4006, + "time_per_iteration": 2.739898920059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074884, + "balance_loss_mlp": 1.04765701, + "epoch": 0.7708734128510966, + "flos": 495862345728.0, + "grad_norm": 0.05387672734187661, + "language_loss": 0.8264026, + "learning_rate": 0.00013147243551855532, + "loss": 0.83715147, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.27270508, + "step": 4007, + "time_per_iteration": 2.5597212314605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071212, + "balance_loss_mlp": 1.04398441, + "epoch": 0.7710657945363601, + "flos": 567012427776.0, + "grad_norm": 0.05404718589625755, + "language_loss": 0.80486447, + "learning_rate": 0.00013126195661262148, + "loss": 0.81557661, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.27270508, + "step": 4008, + "time_per_iteration": 2.744112968444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070425, + "balance_loss_mlp": 1.043365, + "epoch": 0.7712581762216237, + "flos": 604251256320.0, + "grad_norm": 0.04619128213129889, + "language_loss": 0.86330914, + "learning_rate": 0.00013105162086349216, + "loss": 0.87401342, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.27099609, + "step": 4009, + "time_per_iteration": 2.801823616027832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072467, + "balance_loss_mlp": 1.04526305, + "epoch": 0.7714505579068872, + "flos": 530620402176.0, + "grad_norm": 0.04727817553621032, + "language_loss": 0.86132288, + "learning_rate": 0.00013084142835282687, + "loss": 0.8720476, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.2722168, + "step": 4010, + "time_per_iteration": 2.6556901931762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020489, + "balance_loss_mlp": 1.00937891, + "epoch": 0.7716429395921508, + "flos": 1421414313984.0, + "grad_norm": 0.005772893743499722, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.80904853, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.11132812, + "step": 4011, + "time_per_iteration": 4.782922744750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073052, + "balance_loss_mlp": 1.04520464, + "epoch": 0.7718353212774144, + "flos": 578140139520.0, + "grad_norm": 0.05569724258515983, + "language_loss": 0.89507568, + "learning_rate": 0.0001304214733732485, + "loss": 0.90580624, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.27832031, + "step": 4012, + "time_per_iteration": 2.715064525604248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075719, + "balance_loss_mlp": 1.04758501, + "epoch": 0.772027702962678, + "flos": 510486368256.0, + "grad_norm": 0.06797042537174566, + "language_loss": 0.82429183, + "learning_rate": 0.00013021171106737672, + "loss": 0.83504903, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.28125, + "step": 4013, + "time_per_iteration": 2.658712863922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076283, + "balance_loss_mlp": 1.04867363, + "epoch": 0.7722200846479416, + "flos": 525391197696.0, + "grad_norm": 0.05000868356723149, + "language_loss": 0.7937907, + "learning_rate": 0.00013000209232605071, + "loss": 0.80455357, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.27636719, + "step": 4014, + "time_per_iteration": 2.6712594032287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073016, + "balance_loss_mlp": 1.04535961, + "epoch": 0.772412466333205, + "flos": 479348278272.0, + "grad_norm": 0.05134661435861922, + "language_loss": 0.79622269, + "learning_rate": 0.0001297926172306519, + "loss": 0.80695289, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.27685547, + "step": 4015, + "time_per_iteration": 2.610283613204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071717, + "balance_loss_mlp": 1.04420376, + "epoch": 0.7726048480184686, + "flos": 905284256256.0, + "grad_norm": 0.05687508890981391, + "language_loss": 0.78788078, + "learning_rate": 0.0001295832858625055, + "loss": 0.79859793, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.27539062, + "step": 4016, + "time_per_iteration": 3.2706351280212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068156, + "balance_loss_mlp": 1.04054761, + "epoch": 0.7727972297037322, + "flos": 631085520384.0, + "grad_norm": 0.052610449581979135, + "language_loss": 0.69848269, + "learning_rate": 0.00012937409830288154, + "loss": 0.70916426, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.27636719, + "step": 4017, + "time_per_iteration": 2.8540306091308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069412, + "balance_loss_mlp": 1.04220808, + "epoch": 0.7729896113889958, + "flos": 414565185024.0, + "grad_norm": 0.0635987545876438, + "language_loss": 0.85103798, + "learning_rate": 0.00012916505463299362, + "loss": 0.86173213, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.27246094, + "step": 4018, + "time_per_iteration": 2.495150089263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070913, + "balance_loss_mlp": 1.0439713, + "epoch": 0.7731819930742593, + "flos": 668609538048.0, + "grad_norm": 0.05824058585066258, + "language_loss": 0.7791152, + "learning_rate": 0.00012895615493399972, + "loss": 0.78982437, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.26977539, + "step": 4019, + "time_per_iteration": 2.813354015350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070454, + "balance_loss_mlp": 1.04334593, + "epoch": 0.7733743747595229, + "flos": 489604455936.0, + "grad_norm": 0.0836786402257782, + "language_loss": 0.82400632, + "learning_rate": 0.00012874739928700192, + "loss": 0.83471084, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.27148438, + "step": 4020, + "time_per_iteration": 2.559576988220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068211, + "balance_loss_mlp": 1.0395534, + "epoch": 0.7735667564447865, + "flos": 659294705664.0, + "grad_norm": 0.06159530150970634, + "language_loss": 0.79701376, + "learning_rate": 0.00012853878777304624, + "loss": 0.80769587, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.28662109, + "step": 4021, + "time_per_iteration": 2.8569796085357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072395, + "balance_loss_mlp": 1.04528701, + "epoch": 0.77375913813005, + "flos": 533106966528.0, + "grad_norm": 0.052906319794948725, + "language_loss": 0.84479654, + "learning_rate": 0.000128330320473123, + "loss": 0.85552055, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.27172852, + "step": 4022, + "time_per_iteration": 2.715498208999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013154, + "balance_loss_mlp": 1.0020442, + "epoch": 0.7739515198153136, + "flos": 1519260447744.0, + "grad_norm": 0.015943225392078396, + "language_loss": 0.783319, + "learning_rate": 0.00012812199746816628, + "loss": 0.79345053, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.11132812, + "step": 4023, + "time_per_iteration": 4.888492107391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073117, + "balance_loss_mlp": 1.04610443, + "epoch": 0.7741439015005771, + "flos": 639819800064.0, + "grad_norm": 0.06091537077025671, + "language_loss": 0.81350756, + "learning_rate": 0.0001279138188390543, + "loss": 0.82423878, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.27050781, + "step": 4024, + "time_per_iteration": 2.766850233078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073924, + "balance_loss_mlp": 1.04622006, + "epoch": 0.7743362831858407, + "flos": 665546803200.0, + "grad_norm": 0.05776515915351722, + "language_loss": 0.86359525, + "learning_rate": 0.00012770578466660915, + "loss": 0.87433445, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.27758789, + "step": 4025, + "time_per_iteration": 2.8906335830688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074784, + "balance_loss_mlp": 1.04703164, + "epoch": 0.7745286648711043, + "flos": 562453936128.0, + "grad_norm": 0.05700523887714171, + "language_loss": 0.81593072, + "learning_rate": 0.0001274978950315968, + "loss": 0.82667857, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.27807617, + "step": 4026, + "time_per_iteration": 2.8301045894622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077832, + "balance_loss_mlp": 1.05058098, + "epoch": 0.7747210465563679, + "flos": 516651125760.0, + "grad_norm": 0.0689539575699981, + "language_loss": 0.82650018, + "learning_rate": 0.00012729015001472716, + "loss": 0.83727849, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.27258301, + "step": 4027, + "time_per_iteration": 2.6426851749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071106, + "balance_loss_mlp": 1.04371142, + "epoch": 0.7749134282416313, + "flos": 633921292800.0, + "grad_norm": 0.05627311162483831, + "language_loss": 0.81452388, + "learning_rate": 0.00012708254969665418, + "loss": 0.82523495, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.27416992, + "step": 4028, + "time_per_iteration": 2.7853105068206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107679, + "balance_loss_mlp": 1.04922891, + "epoch": 0.7751058099268949, + "flos": 495118849536.0, + "grad_norm": 0.06575328123428556, + "language_loss": 0.83176428, + "learning_rate": 0.00012687509415797526, + "loss": 0.84253216, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.27587891, + "step": 4029, + "time_per_iteration": 2.5962271690368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075768, + "balance_loss_mlp": 1.04808736, + "epoch": 0.7752981916121585, + "flos": 510048410112.0, + "grad_norm": 0.0626546531948414, + "language_loss": 0.81091148, + "learning_rate": 0.00012666778347923208, + "loss": 0.82166916, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.27709961, + "step": 4030, + "time_per_iteration": 2.647709369659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078164, + "balance_loss_mlp": 1.04998243, + "epoch": 0.7754905732974221, + "flos": 497295493632.0, + "grad_norm": 0.044509265947171146, + "language_loss": 0.83753759, + "learning_rate": 0.0001264606177409092, + "loss": 0.84831923, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.28198242, + "step": 4031, + "time_per_iteration": 2.6404576301574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071409, + "balance_loss_mlp": 1.04437256, + "epoch": 0.7756829549826857, + "flos": 480486062592.0, + "grad_norm": 0.05920145784509139, + "language_loss": 0.85917544, + "learning_rate": 0.00012625359702343609, + "loss": 0.86988962, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 0.27075195, + "step": 4032, + "time_per_iteration": 2.7071335315704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107336, + "balance_loss_mlp": 1.04641843, + "epoch": 0.7758753366679492, + "flos": 552368056320.0, + "grad_norm": 0.0993215607804505, + "language_loss": 0.84452856, + "learning_rate": 0.00012604672140718504, + "loss": 0.85526216, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.26965332, + "step": 4033, + "time_per_iteration": 2.6153743267059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075256, + "balance_loss_mlp": 1.04738498, + "epoch": 0.7760677183532128, + "flos": 703529127936.0, + "grad_norm": 0.05917686409446163, + "language_loss": 0.77727896, + "learning_rate": 0.00012583999097247233, + "loss": 0.78803158, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.27905273, + "step": 4034, + "time_per_iteration": 2.876141309738159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075852, + "balance_loss_mlp": 1.04836273, + "epoch": 0.7762601000384763, + "flos": 523218935808.0, + "grad_norm": 0.07262055787937163, + "language_loss": 0.80052263, + "learning_rate": 0.0001256334057995578, + "loss": 0.8112812, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.27514648, + "step": 4035, + "time_per_iteration": 2.7490179538726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072491, + "balance_loss_mlp": 1.04526329, + "epoch": 0.7764524817237399, + "flos": 557262609408.0, + "grad_norm": 0.050638434505961206, + "language_loss": 0.8468259, + "learning_rate": 0.000125426965968645, + "loss": 0.8575508, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.27294922, + "step": 4036, + "time_per_iteration": 2.7155818939208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077213, + "balance_loss_mlp": 1.04967546, + "epoch": 0.7766448634090035, + "flos": 579454013952.0, + "grad_norm": 0.06589986489431957, + "language_loss": 0.82292032, + "learning_rate": 0.00012522067155988092, + "loss": 0.83369249, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.27563477, + "step": 4037, + "time_per_iteration": 2.712575912475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072416, + "balance_loss_mlp": 1.0448314, + "epoch": 0.776837245094267, + "flos": 635300596224.0, + "grad_norm": 0.05822255331252486, + "language_loss": 0.75269878, + "learning_rate": 0.00012501452265335617, + "loss": 0.76342297, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.27612305, + "step": 4038, + "time_per_iteration": 2.8041534423828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_mlp": 1.04810321, + "epoch": 0.7770296267795306, + "flos": 614398334976.0, + "grad_norm": 0.05653078531335044, + "language_loss": 0.82581437, + "learning_rate": 0.0001248085193291047, + "loss": 0.83656931, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.27441406, + "step": 4039, + "time_per_iteration": 2.7838690280914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107999, + "balance_loss_mlp": 1.05230999, + "epoch": 0.7772220084647942, + "flos": 878438407680.0, + "grad_norm": 0.05606519790253506, + "language_loss": 0.82265162, + "learning_rate": 0.00012460266166710443, + "loss": 0.83345151, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.27734375, + "step": 4040, + "time_per_iteration": 3.1491823196411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077311, + "balance_loss_mlp": 1.04989266, + "epoch": 0.7774143901500578, + "flos": 839293567488.0, + "grad_norm": 0.05703190402159479, + "language_loss": 0.77674973, + "learning_rate": 0.00012439694974727633, + "loss": 0.78752279, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.27441406, + "step": 4041, + "time_per_iteration": 3.0976173877716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070951, + "balance_loss_mlp": 1.04358041, + "epoch": 0.7776067718353212, + "flos": 567878169600.0, + "grad_norm": 0.05364031630438029, + "language_loss": 0.80233228, + "learning_rate": 0.00012419138364948458, + "loss": 0.81304181, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.27392578, + "step": 4042, + "time_per_iteration": 2.7326791286468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070758, + "balance_loss_mlp": 1.04345894, + "epoch": 0.7777991535205848, + "flos": 745627603968.0, + "grad_norm": 0.0558907311125614, + "language_loss": 0.82470769, + "learning_rate": 0.00012398596345353702, + "loss": 0.83541524, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.2734375, + "step": 4043, + "time_per_iteration": 2.8787214756011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075612, + "balance_loss_mlp": 1.04824162, + "epoch": 0.7779915352058484, + "flos": 537799288320.0, + "grad_norm": 0.06132046127544376, + "language_loss": 0.83480489, + "learning_rate": 0.0001237806892391851, + "loss": 0.84556091, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.27416992, + "step": 4044, + "time_per_iteration": 2.7494754791259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072105, + "balance_loss_mlp": 1.04540193, + "epoch": 0.778183916891112, + "flos": 634497463296.0, + "grad_norm": 0.05685464217024924, + "language_loss": 0.80689287, + "learning_rate": 0.0001235755610861233, + "loss": 0.81761396, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.26757812, + "step": 4045, + "time_per_iteration": 2.812063694000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107437, + "balance_loss_mlp": 1.04711854, + "epoch": 0.7783762985763756, + "flos": 588400699392.0, + "grad_norm": 0.053935102157053175, + "language_loss": 0.85224533, + "learning_rate": 0.0001233705790739893, + "loss": 0.86298895, + "num_input_tokens_seen": 335640512, + "router_z_loss_mlp": 0.27270508, + "step": 4046, + "time_per_iteration": 2.7485461235046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074247, + "balance_loss_mlp": 1.04697168, + "epoch": 0.7785686802616391, + "flos": 930261970944.0, + "grad_norm": 0.0673066847398555, + "language_loss": 0.74977076, + "learning_rate": 0.0001231657432823643, + "loss": 0.76051325, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.27319336, + "step": 4047, + "time_per_iteration": 3.1984071731567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074697, + "balance_loss_mlp": 1.04661131, + "epoch": 0.7787610619469026, + "flos": 497679607296.0, + "grad_norm": 0.06151594222397662, + "language_loss": 0.78487623, + "learning_rate": 0.0001229610537907725, + "loss": 0.79562324, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.28100586, + "step": 4048, + "time_per_iteration": 2.6014962196350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072933, + "balance_loss_mlp": 1.04379785, + "epoch": 0.7789534436321662, + "flos": 515385303552.0, + "grad_norm": 0.0760421254177005, + "language_loss": 0.90244645, + "learning_rate": 0.00012275651067868143, + "loss": 0.91317576, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.29077148, + "step": 4049, + "time_per_iteration": 2.598532199859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069715, + "balance_loss_mlp": 1.04196286, + "epoch": 0.7791458253174298, + "flos": 988081555968.0, + "grad_norm": 0.05867585212414032, + "language_loss": 0.80266809, + "learning_rate": 0.00012255211402550182, + "loss": 0.81336522, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.27807617, + "step": 4050, + "time_per_iteration": 3.223078727722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070769, + "balance_loss_mlp": 1.04299307, + "epoch": 0.7793382070026933, + "flos": 628756107264.0, + "grad_norm": 0.07400928475776686, + "language_loss": 0.76817232, + "learning_rate": 0.00012234786391058727, + "loss": 0.77888, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.27783203, + "step": 4051, + "time_per_iteration": 4.367919683456421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073785, + "balance_loss_mlp": 1.04565179, + "epoch": 0.7795305886879569, + "flos": 531500700672.0, + "grad_norm": 0.08184044182039507, + "language_loss": 0.84765863, + "learning_rate": 0.0001221437604132352, + "loss": 0.85839653, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.28149414, + "step": 4052, + "time_per_iteration": 2.619694948196411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070872, + "balance_loss_mlp": 1.04369259, + "epoch": 0.7797229703732205, + "flos": 611690600448.0, + "grad_norm": 0.061094221003680546, + "language_loss": 0.81091797, + "learning_rate": 0.0001219398036126852, + "loss": 0.82162666, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.2722168, + "step": 4053, + "time_per_iteration": 2.7424631118774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072046, + "balance_loss_mlp": 1.04391217, + "epoch": 0.7799153520584841, + "flos": 871758526464.0, + "grad_norm": 0.051190100857480304, + "language_loss": 0.77992457, + "learning_rate": 0.00012173599358812027, + "loss": 0.790645, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.28149414, + "step": 4054, + "time_per_iteration": 3.277557849884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070645, + "balance_loss_mlp": 1.04303575, + "epoch": 0.7801077337437476, + "flos": 583348995072.0, + "grad_norm": 0.06092142653213725, + "language_loss": 0.82466495, + "learning_rate": 0.0001215323304186668, + "loss": 0.83537143, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.27587891, + "step": 4055, + "time_per_iteration": 2.7477025985717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074017, + "balance_loss_mlp": 1.0459547, + "epoch": 0.7803001154290111, + "flos": 600887365632.0, + "grad_norm": 0.06830093744875644, + "language_loss": 0.8764962, + "learning_rate": 0.00012132881418339364, + "loss": 0.88723636, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.28076172, + "step": 4056, + "time_per_iteration": 2.7418453693389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009047, + "balance_loss_mlp": 0.99779409, + "epoch": 0.7804924971142747, + "flos": 1478743506432.0, + "grad_norm": 0.016207473772952577, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.7852661, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.11230469, + "step": 4057, + "time_per_iteration": 4.85454535484314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065459, + "balance_loss_mlp": 1.03661036, + "epoch": 0.7806848787995383, + "flos": 630075773952.0, + "grad_norm": 0.062259886670719244, + "language_loss": 0.77044684, + "learning_rate": 0.00012092222283137944, + "loss": 0.78110135, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.28833008, + "step": 4058, + "time_per_iteration": 2.764766216278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008333, + "balance_loss_mlp": 0.99707937, + "epoch": 0.7808772604848019, + "flos": 1416800567808.0, + "grad_norm": 0.01618194632849119, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.79914641, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.11230469, + "step": 4059, + "time_per_iteration": 4.825545310974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069706, + "balance_loss_mlp": 1.0414772, + "epoch": 0.7810696421700654, + "flos": 731345435136.0, + "grad_norm": 0.07523837399490399, + "language_loss": 0.83462268, + "learning_rate": 0.00012051622016348856, + "loss": 0.84531975, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.2824707, + "step": 4060, + "time_per_iteration": 3.045809507369995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068871, + "balance_loss_mlp": 1.04018903, + "epoch": 0.781262023855329, + "flos": 424718055936.0, + "grad_norm": 0.06174241135408443, + "language_loss": 0.84242803, + "learning_rate": 0.00012031343978315539, + "loss": 0.85311675, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.28662109, + "step": 4061, + "time_per_iteration": 2.4845006465911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068535, + "balance_loss_mlp": 1.04099798, + "epoch": 0.7814544055405925, + "flos": 500767073280.0, + "grad_norm": 0.1392477950837379, + "language_loss": 0.82486379, + "learning_rate": 0.00012011080681021774, + "loss": 0.83554912, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.27563477, + "step": 4062, + "time_per_iteration": 2.6524341106414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070849, + "balance_loss_mlp": 1.04295421, + "epoch": 0.7816467872258561, + "flos": 462212960256.0, + "grad_norm": 0.07233679581194719, + "language_loss": 0.86375731, + "learning_rate": 0.00011990832132334512, + "loss": 0.87446582, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.27954102, + "step": 4063, + "time_per_iteration": 2.519162654876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069214, + "balance_loss_mlp": 1.04112792, + "epoch": 0.7818391689111197, + "flos": 740497324032.0, + "grad_norm": 0.07068900898467687, + "language_loss": 0.82369703, + "learning_rate": 0.00011970598340114897, + "loss": 0.83438915, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.28100586, + "step": 4064, + "time_per_iteration": 2.9242045879364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067385, + "balance_loss_mlp": 1.03875041, + "epoch": 0.7820315505963832, + "flos": 547386163200.0, + "grad_norm": 0.07366274029850052, + "language_loss": 0.83860916, + "learning_rate": 0.00011950379312218396, + "loss": 0.84928298, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.28637695, + "step": 4065, + "time_per_iteration": 2.7022647857666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070383, + "balance_loss_mlp": 1.04191554, + "epoch": 0.7822239322816468, + "flos": 728665403904.0, + "grad_norm": 0.07812712198170087, + "language_loss": 0.86016601, + "learning_rate": 0.00011930175056494719, + "loss": 0.87086987, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.28466797, + "step": 4066, + "time_per_iteration": 2.885648488998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071634, + "balance_loss_mlp": 1.04276156, + "epoch": 0.7824163139669104, + "flos": 451774900224.0, + "grad_norm": 0.0475815127648597, + "language_loss": 0.75548607, + "learning_rate": 0.00011909985580787885, + "loss": 0.76620239, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.28881836, + "step": 4067, + "time_per_iteration": 2.717013120651245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066242, + "balance_loss_mlp": 1.0379895, + "epoch": 0.782608695652174, + "flos": 540207277056.0, + "grad_norm": 0.05385008636564137, + "language_loss": 0.80856502, + "learning_rate": 0.00011889810892936137, + "loss": 0.8192274, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.28295898, + "step": 4068, + "time_per_iteration": 2.7350502014160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069144, + "balance_loss_mlp": 1.04105842, + "epoch": 0.7828010773374374, + "flos": 500029369344.0, + "grad_norm": 0.0661010913051719, + "language_loss": 0.77266741, + "learning_rate": 0.00011869651000771959, + "loss": 0.78335881, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.28100586, + "step": 4069, + "time_per_iteration": 2.8502442836761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065751, + "balance_loss_mlp": 1.03747416, + "epoch": 0.782993459022701, + "flos": 600542539776.0, + "grad_norm": 0.06957531868653906, + "language_loss": 0.82841384, + "learning_rate": 0.00011849505912122117, + "loss": 0.83907133, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.28271484, + "step": 4070, + "time_per_iteration": 2.7242653369903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069163, + "balance_loss_mlp": 1.0401957, + "epoch": 0.7831858407079646, + "flos": 809702106624.0, + "grad_norm": 0.061542243963481506, + "language_loss": 0.77626544, + "learning_rate": 0.00011829375634807654, + "loss": 0.78695703, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.28955078, + "step": 4071, + "time_per_iteration": 3.18316650390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067908, + "balance_loss_mlp": 1.03920245, + "epoch": 0.7833782223932282, + "flos": 806240701440.0, + "grad_norm": 0.06527363578820362, + "language_loss": 0.8108483, + "learning_rate": 0.00011809260176643821, + "loss": 0.82152736, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.28662109, + "step": 4072, + "time_per_iteration": 3.0564231872558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071131, + "balance_loss_mlp": 1.04318857, + "epoch": 0.7835706040784918, + "flos": 520614508032.0, + "grad_norm": 0.0688544484419534, + "language_loss": 0.83763361, + "learning_rate": 0.00011789159545440131, + "loss": 0.84834492, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.27978516, + "step": 4073, + "time_per_iteration": 2.6478123664855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070096, + "balance_loss_mlp": 1.04208159, + "epoch": 0.7837629857637552, + "flos": 505322592768.0, + "grad_norm": 0.05456504974378336, + "language_loss": 0.82081753, + "learning_rate": 0.00011769073749000348, + "loss": 0.83151847, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.2800293, + "step": 4074, + "time_per_iteration": 2.7911314964294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069906, + "balance_loss_mlp": 1.041749, + "epoch": 0.7839553674490188, + "flos": 515872723968.0, + "grad_norm": 0.07358433801147621, + "language_loss": 0.76115894, + "learning_rate": 0.0001174900279512246, + "loss": 0.77185798, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.28149414, + "step": 4075, + "time_per_iteration": 2.593980312347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070027, + "balance_loss_mlp": 1.04110718, + "epoch": 0.7841477491342824, + "flos": 506399330304.0, + "grad_norm": 0.055342987139179775, + "language_loss": 0.81843507, + "learning_rate": 0.00011728946691598707, + "loss": 0.82913536, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.2890625, + "step": 4076, + "time_per_iteration": 2.6213133335113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067587, + "balance_loss_mlp": 1.03902483, + "epoch": 0.784340130819546, + "flos": 719320048128.0, + "grad_norm": 0.06016705026128457, + "language_loss": 0.76231396, + "learning_rate": 0.00011708905446215561, + "loss": 0.77298987, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.28540039, + "step": 4077, + "time_per_iteration": 2.89338755607605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069715, + "balance_loss_mlp": 1.04110491, + "epoch": 0.7845325125048095, + "flos": 514174735872.0, + "grad_norm": 0.052498050136505506, + "language_loss": 0.80255234, + "learning_rate": 0.00011688879066753711, + "loss": 0.81324947, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.28564453, + "step": 4078, + "time_per_iteration": 2.691178560256958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067519, + "balance_loss_mlp": 1.04007649, + "epoch": 0.7847248941900731, + "flos": 465866422272.0, + "grad_norm": 0.06922222458803326, + "language_loss": 0.87530267, + "learning_rate": 0.00011668867560988122, + "loss": 0.88597786, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.2746582, + "step": 4079, + "time_per_iteration": 2.5730109214782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067247, + "balance_loss_mlp": 1.03870857, + "epoch": 0.7849172758753367, + "flos": 502766217216.0, + "grad_norm": 0.07036419305284744, + "language_loss": 0.84369481, + "learning_rate": 0.00011648870936687916, + "loss": 0.85436726, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.28540039, + "step": 4080, + "time_per_iteration": 2.763648271560669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069029, + "balance_loss_mlp": 1.04053807, + "epoch": 0.7851096575606002, + "flos": 531742219776.0, + "grad_norm": 0.07246870648451295, + "language_loss": 0.78439957, + "learning_rate": 0.00011628889201616461, + "loss": 0.79508984, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.28515625, + "step": 4081, + "time_per_iteration": 2.6238608360290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070508, + "balance_loss_mlp": 1.04208827, + "epoch": 0.7853020392458638, + "flos": 569685256704.0, + "grad_norm": 0.05558757362509338, + "language_loss": 0.81841099, + "learning_rate": 0.00011608922363531393, + "loss": 0.82911611, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.28417969, + "step": 4082, + "time_per_iteration": 2.6667022705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074509, + "balance_loss_mlp": 1.04639971, + "epoch": 0.7854944209311273, + "flos": 832228162560.0, + "grad_norm": 0.07344619623899691, + "language_loss": 0.83384395, + "learning_rate": 0.00011588970430184504, + "loss": 0.84458899, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.28100586, + "step": 4083, + "time_per_iteration": 3.0444436073303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069927, + "balance_loss_mlp": 1.04212761, + "epoch": 0.7856868026163909, + "flos": 559660423680.0, + "grad_norm": 0.045313213286836455, + "language_loss": 0.81620705, + "learning_rate": 0.00011569033409321822, + "loss": 0.82690632, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.27807617, + "step": 4084, + "time_per_iteration": 2.7107021808624268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074024, + "balance_loss_mlp": 1.04605722, + "epoch": 0.7858791843016545, + "flos": 544972382208.0, + "grad_norm": 0.06179602249028764, + "language_loss": 0.73075098, + "learning_rate": 0.00011549111308683591, + "loss": 0.7414912, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.2800293, + "step": 4085, + "time_per_iteration": 2.674802780151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.04991984, + "epoch": 0.7860715659869181, + "flos": 380787761664.0, + "grad_norm": 0.06384285931580107, + "language_loss": 0.80674589, + "learning_rate": 0.00011529204136004251, + "loss": 0.8175236, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.27905273, + "step": 4086, + "time_per_iteration": 2.485450029373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073353, + "balance_loss_mlp": 1.04600596, + "epoch": 0.7862639476721817, + "flos": 567173961216.0, + "grad_norm": 0.056474664391545235, + "language_loss": 0.84569514, + "learning_rate": 0.00011509311899012459, + "loss": 0.85642868, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.27392578, + "step": 4087, + "time_per_iteration": 2.6641156673431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072601, + "balance_loss_mlp": 1.04475415, + "epoch": 0.7864563293574451, + "flos": 544968000000.0, + "grad_norm": 0.09344860836240211, + "language_loss": 0.78010523, + "learning_rate": 0.00011489434605431053, + "loss": 0.79083121, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.27880859, + "step": 4088, + "time_per_iteration": 2.646610736846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071704, + "balance_loss_mlp": 1.04390407, + "epoch": 0.7866487110427087, + "flos": 563260041216.0, + "grad_norm": 0.06168893422677419, + "language_loss": 0.81236577, + "learning_rate": 0.0001146957226297708, + "loss": 0.8230828, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.27807617, + "step": 4089, + "time_per_iteration": 2.7216711044311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106999, + "balance_loss_mlp": 1.04147482, + "epoch": 0.7868410927279723, + "flos": 727849124352.0, + "grad_norm": 0.05015677705021027, + "language_loss": 0.76367462, + "learning_rate": 0.00011449724879361827, + "loss": 0.77437449, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.28515625, + "step": 4090, + "time_per_iteration": 2.9962027072906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070207, + "balance_loss_mlp": 1.04212117, + "epoch": 0.7870334744132359, + "flos": 521082989568.0, + "grad_norm": 0.07758144969638558, + "language_loss": 0.73733866, + "learning_rate": 0.00011429892462290687, + "loss": 0.74804068, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.28100586, + "step": 4091, + "time_per_iteration": 2.7208704948425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071413, + "balance_loss_mlp": 1.04413819, + "epoch": 0.7872258560984994, + "flos": 451173998592.0, + "grad_norm": 0.05584477685741542, + "language_loss": 0.83089757, + "learning_rate": 0.00011410075019463295, + "loss": 0.84161168, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.27319336, + "step": 4092, + "time_per_iteration": 2.608442544937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070388, + "balance_loss_mlp": 1.04168272, + "epoch": 0.787418237783763, + "flos": 514932788736.0, + "grad_norm": 0.05394381148222231, + "language_loss": 0.79899406, + "learning_rate": 0.00011390272558573461, + "loss": 0.80969799, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.28710938, + "step": 4093, + "time_per_iteration": 2.6670477390289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070092, + "balance_loss_mlp": 1.04183984, + "epoch": 0.7876106194690266, + "flos": 484837940736.0, + "grad_norm": 0.04973668631858953, + "language_loss": 0.79517233, + "learning_rate": 0.00011370485087309202, + "loss": 0.80587327, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.2824707, + "step": 4094, + "time_per_iteration": 2.651747703552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107091, + "balance_loss_mlp": 1.04229987, + "epoch": 0.7878030011542901, + "flos": 542570185728.0, + "grad_norm": 0.05872791575225344, + "language_loss": 0.78693342, + "learning_rate": 0.00011350712613352688, + "loss": 0.79764247, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.28613281, + "step": 4095, + "time_per_iteration": 2.6549277305603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069142, + "balance_loss_mlp": 1.04072237, + "epoch": 0.7879953828395537, + "flos": 516488182272.0, + "grad_norm": 0.07961293490995022, + "language_loss": 0.79440165, + "learning_rate": 0.00011330955144380283, + "loss": 0.80509305, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.28417969, + "step": 4096, + "time_per_iteration": 2.6206085681915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070121, + "balance_loss_mlp": 1.04217863, + "epoch": 0.7881877645248172, + "flos": 582004597248.0, + "grad_norm": 0.06633225025055933, + "language_loss": 0.86351848, + "learning_rate": 0.00011311212688062483, + "loss": 0.87421972, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.27929688, + "step": 4097, + "time_per_iteration": 2.781184673309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069633, + "balance_loss_mlp": 1.0408082, + "epoch": 0.7883801462100808, + "flos": 588883737600.0, + "grad_norm": 0.07192838384326647, + "language_loss": 0.77839339, + "learning_rate": 0.0001129148525206402, + "loss": 0.78908968, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.28808594, + "step": 4098, + "time_per_iteration": 2.8173389434814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067449, + "balance_loss_mlp": 1.03931606, + "epoch": 0.7885725278953444, + "flos": 481475460096.0, + "grad_norm": 0.11237603320949716, + "language_loss": 0.86339819, + "learning_rate": 0.00011271772844043759, + "loss": 0.87407273, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.28125, + "step": 4099, + "time_per_iteration": 2.7524821758270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069791, + "balance_loss_mlp": 1.04127622, + "epoch": 0.788764909580608, + "flos": 756470126592.0, + "grad_norm": 0.06946640589316219, + "language_loss": 0.75986981, + "learning_rate": 0.00011252075471654727, + "loss": 0.77056766, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.28515625, + "step": 4100, + "time_per_iteration": 2.947204351425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071355, + "balance_loss_mlp": 1.04262543, + "epoch": 0.7889572912658714, + "flos": 702225427968.0, + "grad_norm": 0.05611482280761958, + "language_loss": 0.7798807, + "learning_rate": 0.00011232393142544133, + "loss": 0.79059422, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.28710938, + "step": 4101, + "time_per_iteration": 2.95438551902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068821, + "balance_loss_mlp": 1.04037809, + "epoch": 0.789149672951135, + "flos": 736047931392.0, + "grad_norm": 0.06028554523946094, + "language_loss": 0.83136284, + "learning_rate": 0.00011212725864353323, + "loss": 0.84205109, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.28417969, + "step": 4102, + "time_per_iteration": 3.067315101623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015622, + "balance_loss_mlp": 1.00370073, + "epoch": 0.7893420546363986, + "flos": 1480626349056.0, + "grad_norm": 0.009770361918426226, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.77351552, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.11914062, + "step": 4103, + "time_per_iteration": 4.903147220611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069345, + "balance_loss_mlp": 1.04016232, + "epoch": 0.7895344363216622, + "flos": 508821875712.0, + "grad_norm": 0.06690395183564687, + "language_loss": 0.75603718, + "learning_rate": 0.00011173436491267291, + "loss": 0.76673061, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.29150391, + "step": 4104, + "time_per_iteration": 2.607632637023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064374, + "balance_loss_mlp": 1.0360018, + "epoch": 0.7897268180069258, + "flos": 541727764992.0, + "grad_norm": 0.055969758992029287, + "language_loss": 0.81935525, + "learning_rate": 0.0001115381441162554, + "loss": 0.82999897, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.28393555, + "step": 4105, + "time_per_iteration": 2.6217761039733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014508, + "balance_loss_mlp": 1.00268257, + "epoch": 0.7899191996921893, + "flos": 1411924953600.0, + "grad_norm": 0.0095479570502747, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.74598229, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.11816406, + "step": 4106, + "time_per_iteration": 4.9060986042022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063636, + "balance_loss_mlp": 1.03524053, + "epoch": 0.7901115813774529, + "flos": 622547679744.0, + "grad_norm": 0.04917500811755106, + "language_loss": 0.84986818, + "learning_rate": 0.00011114615504234465, + "loss": 0.86050451, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.28393555, + "step": 4107, + "time_per_iteration": 2.760727882385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068402, + "balance_loss_mlp": 1.03931451, + "epoch": 0.7903039630627164, + "flos": 645232296960.0, + "grad_norm": 0.062643238447281, + "language_loss": 0.81024301, + "learning_rate": 0.00011095038691703468, + "loss": 0.82092702, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.29077148, + "step": 4108, + "time_per_iteration": 2.8416430950164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065528, + "balance_loss_mlp": 1.03758597, + "epoch": 0.79049634474798, + "flos": 594054715392.0, + "grad_norm": 0.059690498019966905, + "language_loss": 0.824301, + "learning_rate": 0.00011075476983417998, + "loss": 0.83495629, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.27978516, + "step": 4109, + "time_per_iteration": 2.879164695739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106677, + "balance_loss_mlp": 1.03742075, + "epoch": 0.7906887264332435, + "flos": 715784449536.0, + "grad_norm": 0.06625307097230863, + "language_loss": 0.77845091, + "learning_rate": 0.00011055930386972579, + "loss": 0.78911859, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.29272461, + "step": 4110, + "time_per_iteration": 2.8940486907958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010668, + "balance_loss_mlp": 1.03761721, + "epoch": 0.7908811081185071, + "flos": 789553516032.0, + "grad_norm": 0.05640022184839657, + "language_loss": 0.78389466, + "learning_rate": 0.00011036398909955863, + "loss": 0.79456264, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 0.29150391, + "step": 4111, + "time_per_iteration": 2.9704418182373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066133, + "balance_loss_mlp": 1.03795147, + "epoch": 0.7910734898037707, + "flos": 641612330496.0, + "grad_norm": 0.05533152430131226, + "language_loss": 0.81315625, + "learning_rate": 0.00011016882559950648, + "loss": 0.82381761, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.28173828, + "step": 4112, + "time_per_iteration": 2.8546900749206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064394, + "balance_loss_mlp": 1.03561699, + "epoch": 0.7912658714890343, + "flos": 669057670656.0, + "grad_norm": 0.06990273723133285, + "language_loss": 0.80328232, + "learning_rate": 0.00010997381344533853, + "loss": 0.81392628, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.28759766, + "step": 4113, + "time_per_iteration": 2.7969515323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069119, + "balance_loss_mlp": 1.04031801, + "epoch": 0.7914582531742979, + "flos": 557504128512.0, + "grad_norm": 0.061948681643476444, + "language_loss": 0.80212009, + "learning_rate": 0.00010977895271276517, + "loss": 0.81281132, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.28808594, + "step": 4114, + "time_per_iteration": 2.7396297454833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064232, + "balance_loss_mlp": 1.03552604, + "epoch": 0.7916506348595613, + "flos": 569784181248.0, + "grad_norm": 0.06188955891536592, + "language_loss": 0.80402255, + "learning_rate": 0.00010958424347743807, + "loss": 0.8146649, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.28710938, + "step": 4115, + "time_per_iteration": 2.7420108318328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071293, + "balance_loss_mlp": 1.04337442, + "epoch": 0.7918430165448249, + "flos": 717966885888.0, + "grad_norm": 0.07461075198544243, + "language_loss": 0.80391407, + "learning_rate": 0.00010938968581494991, + "loss": 0.81462699, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.27929688, + "step": 4116, + "time_per_iteration": 2.941556692123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072505, + "balance_loss_mlp": 1.04418087, + "epoch": 0.7920353982300885, + "flos": 553377802752.0, + "grad_norm": 0.12071106309265658, + "language_loss": 0.78737396, + "learning_rate": 0.000109195279800835, + "loss": 0.79809904, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.28344727, + "step": 4117, + "time_per_iteration": 2.7312655448913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067391, + "balance_loss_mlp": 1.03901899, + "epoch": 0.7922277799153521, + "flos": 809766125568.0, + "grad_norm": 0.06211546650741466, + "language_loss": 0.76734632, + "learning_rate": 0.00010900102551056834, + "loss": 0.77802026, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.28344727, + "step": 4118, + "time_per_iteration": 3.061748504638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073876, + "balance_loss_mlp": 1.04590917, + "epoch": 0.7924201616006156, + "flos": 421128612864.0, + "grad_norm": 0.05658815463494319, + "language_loss": 0.84763014, + "learning_rate": 0.00010880692301956601, + "loss": 0.85836887, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.27978516, + "step": 4119, + "time_per_iteration": 2.504396677017212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070809, + "balance_loss_mlp": 1.04241323, + "epoch": 0.7926125432858792, + "flos": 617541055488.0, + "grad_norm": 0.052435339334051444, + "language_loss": 0.85989153, + "learning_rate": 0.00010861297240318518, + "loss": 0.87059963, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.28393555, + "step": 4120, + "time_per_iteration": 2.851905584335327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107072, + "balance_loss_mlp": 1.04296827, + "epoch": 0.7928049249711427, + "flos": 602207032320.0, + "grad_norm": 0.06531293240023527, + "language_loss": 0.86884111, + "learning_rate": 0.00010841917373672444, + "loss": 0.87954831, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.27783203, + "step": 4121, + "time_per_iteration": 2.72057843208313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074221, + "balance_loss_mlp": 1.04561055, + "epoch": 0.7929973066564063, + "flos": 655724201472.0, + "grad_norm": 0.0659209843425975, + "language_loss": 0.78515911, + "learning_rate": 0.00010822552709542293, + "loss": 0.7959013, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.28588867, + "step": 4122, + "time_per_iteration": 2.8345208168029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068379, + "balance_loss_mlp": 1.04067445, + "epoch": 0.7931896883416699, + "flos": 536139177984.0, + "grad_norm": 0.053977644004353675, + "language_loss": 0.86079139, + "learning_rate": 0.0001080320325544612, + "loss": 0.87147516, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.27734375, + "step": 4123, + "time_per_iteration": 2.734748601913452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073545, + "balance_loss_mlp": 1.04591262, + "epoch": 0.7933820700269334, + "flos": 497836758528.0, + "grad_norm": 0.05342076952837262, + "language_loss": 0.82945108, + "learning_rate": 0.00010783869018895997, + "loss": 0.84018654, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.27661133, + "step": 4124, + "time_per_iteration": 2.5848159790039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071922, + "balance_loss_mlp": 1.04438472, + "epoch": 0.793574451712197, + "flos": 537217325568.0, + "grad_norm": 0.05760976665940277, + "language_loss": 0.84397703, + "learning_rate": 0.00010764550007398189, + "loss": 0.85469627, + "num_input_tokens_seen": 342040496, + "router_z_loss_mlp": 0.27563477, + "step": 4125, + "time_per_iteration": 2.613123655319214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076104, + "balance_loss_mlp": 1.04797053, + "epoch": 0.7937668333974606, + "flos": 488043270144.0, + "grad_norm": 0.05267738869669298, + "language_loss": 0.81016707, + "learning_rate": 0.00010745246228452982, + "loss": 0.82092816, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.28173828, + "step": 4126, + "time_per_iteration": 2.5770304203033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072331, + "balance_loss_mlp": 1.04460263, + "epoch": 0.7939592150827242, + "flos": 527163379200.0, + "grad_norm": 0.053184738741740976, + "language_loss": 0.8170619, + "learning_rate": 0.00010725957689554771, + "loss": 0.82778513, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.27734375, + "step": 4127, + "time_per_iteration": 2.774044990539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073736, + "balance_loss_mlp": 1.04579329, + "epoch": 0.7941515967679876, + "flos": 541428019200.0, + "grad_norm": 0.047011204892956564, + "language_loss": 0.84647489, + "learning_rate": 0.00010706684398192013, + "loss": 0.85721219, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.27978516, + "step": 4128, + "time_per_iteration": 2.74668025970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070127, + "balance_loss_mlp": 1.0423516, + "epoch": 0.7943439784532512, + "flos": 518104622592.0, + "grad_norm": 0.061789852182866596, + "language_loss": 0.82038182, + "learning_rate": 0.00010687426361847313, + "loss": 0.83108312, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.27807617, + "step": 4129, + "time_per_iteration": 2.7684710025787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075571, + "balance_loss_mlp": 1.04777122, + "epoch": 0.7945363601385148, + "flos": 508768031232.0, + "grad_norm": 0.056918102150188964, + "language_loss": 0.85627353, + "learning_rate": 0.00010668183587997254, + "loss": 0.86702919, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.27807617, + "step": 4130, + "time_per_iteration": 2.6196768283843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071124, + "balance_loss_mlp": 1.04289508, + "epoch": 0.7947287418237784, + "flos": 650918398464.0, + "grad_norm": 0.052989144266830976, + "language_loss": 0.77423567, + "learning_rate": 0.0001064895608411256, + "loss": 0.78494692, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.28222656, + "step": 4131, + "time_per_iteration": 2.822084903717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070721, + "balance_loss_mlp": 1.04275465, + "epoch": 0.794921123509042, + "flos": 695726019072.0, + "grad_norm": 0.05398038812171178, + "language_loss": 0.80283594, + "learning_rate": 0.00010629743857657998, + "loss": 0.81354314, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.27954102, + "step": 4132, + "time_per_iteration": 2.9548959732055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018993, + "balance_loss_mlp": 1.00807393, + "epoch": 0.7951135051943055, + "flos": 1402161988608.0, + "grad_norm": 0.012201686903541073, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.71617663, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.109375, + "step": 4133, + "time_per_iteration": 4.596825122833252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077524, + "balance_loss_mlp": 1.04950953, + "epoch": 0.795305886879569, + "flos": 809745776640.0, + "grad_norm": 0.1291273106507343, + "language_loss": 0.82121062, + "learning_rate": 0.00010591365266868802, + "loss": 0.83198583, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.28027344, + "step": 4134, + "time_per_iteration": 2.997457981109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019784, + "balance_loss_mlp": 1.00886476, + "epoch": 0.7954982685648326, + "flos": 1425205988352.0, + "grad_norm": 0.01121858900173578, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76531565, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.109375, + "step": 4135, + "time_per_iteration": 4.933257818222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070322, + "balance_loss_mlp": 1.0421412, + "epoch": 0.7956906502500962, + "flos": 389670428160.0, + "grad_norm": 0.07786925051397248, + "language_loss": 0.78780544, + "learning_rate": 0.00010553047875229166, + "loss": 0.7985087, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.28198242, + "step": 4136, + "time_per_iteration": 2.5145680904388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072473, + "balance_loss_mlp": 1.04522216, + "epoch": 0.7958830319353598, + "flos": 515321284608.0, + "grad_norm": 0.08712242528713769, + "language_loss": 0.83510804, + "learning_rate": 0.00010533912147689328, + "loss": 0.84583282, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.27270508, + "step": 4137, + "time_per_iteration": 2.6298136711120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076228, + "balance_loss_mlp": 1.04814243, + "epoch": 0.7960754136206233, + "flos": 493695876096.0, + "grad_norm": 0.06714788693393858, + "language_loss": 0.82280171, + "learning_rate": 0.00010514791742243656, + "loss": 0.83356392, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.28100586, + "step": 4138, + "time_per_iteration": 2.5997424125671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073026, + "balance_loss_mlp": 1.04553676, + "epoch": 0.7962677953058869, + "flos": 655409899008.0, + "grad_norm": 0.06696972519058896, + "language_loss": 0.82444674, + "learning_rate": 0.00010495686666315341, + "loss": 0.83517706, + "num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.27514648, + "step": 4139, + "time_per_iteration": 2.8953542709350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074691, + "balance_loss_mlp": 1.04662871, + "epoch": 0.7964601769911505, + "flos": 542126435328.0, + "grad_norm": 0.07236671578874358, + "language_loss": 0.77130395, + "learning_rate": 0.00010476596927321635, + "loss": 0.78205085, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.28076172, + "step": 4140, + "time_per_iteration": 2.6313490867614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107407, + "balance_loss_mlp": 1.04591274, + "epoch": 0.796652558676414, + "flos": 537356947968.0, + "grad_norm": 0.07734927138109192, + "language_loss": 0.80230534, + "learning_rate": 0.00010457522532673835, + "loss": 0.81304598, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.28173828, + "step": 4141, + "time_per_iteration": 2.8211119174957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073839, + "balance_loss_mlp": 1.0459199, + "epoch": 0.7968449403616775, + "flos": 474852395520.0, + "grad_norm": 0.05569229872202348, + "language_loss": 0.83232534, + "learning_rate": 0.00010438463489777272, + "loss": 0.84306371, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.27954102, + "step": 4142, + "time_per_iteration": 2.6115970611572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074531, + "balance_loss_mlp": 1.04665971, + "epoch": 0.7970373220469411, + "flos": 567336904704.0, + "grad_norm": 0.06331690376736109, + "language_loss": 0.77703011, + "learning_rate": 0.00010419419806031316, + "loss": 0.7877754, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.27880859, + "step": 4143, + "time_per_iteration": 2.7046220302581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074634, + "balance_loss_mlp": 1.04664397, + "epoch": 0.7972297037322047, + "flos": 555924003840.0, + "grad_norm": 0.04909390704775502, + "language_loss": 0.83792174, + "learning_rate": 0.00010400391488829403, + "loss": 0.8486681, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.2800293, + "step": 4144, + "time_per_iteration": 2.790830612182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076091, + "balance_loss_mlp": 1.04788637, + "epoch": 0.7974220854174683, + "flos": 575899476480.0, + "grad_norm": 0.05483263194538034, + "language_loss": 0.86199546, + "learning_rate": 0.00010381378545558984, + "loss": 0.87275642, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.2824707, + "step": 4145, + "time_per_iteration": 2.7284913063049316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069763, + "balance_loss_mlp": 1.04203475, + "epoch": 0.7976144671027319, + "flos": 482824240128.0, + "grad_norm": 0.05322555202635646, + "language_loss": 0.84398592, + "learning_rate": 0.00010362380983601505, + "loss": 0.85468352, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.27758789, + "step": 4146, + "time_per_iteration": 2.546143054962158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067282, + "balance_loss_mlp": 1.03938699, + "epoch": 0.7978068487879953, + "flos": 1077420372480.0, + "grad_norm": 0.05187096482218071, + "language_loss": 0.78898019, + "learning_rate": 0.00010343398810332477, + "loss": 0.79965299, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.27905273, + "step": 4147, + "time_per_iteration": 3.4553234577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072596, + "balance_loss_mlp": 1.04465318, + "epoch": 0.7979992304732589, + "flos": 733421744640.0, + "grad_norm": 0.0650162065800976, + "language_loss": 0.84200764, + "learning_rate": 0.00010324432033121467, + "loss": 0.85273361, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.2800293, + "step": 4148, + "time_per_iteration": 2.9164648056030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070872, + "balance_loss_mlp": 1.04207134, + "epoch": 0.7981916121585225, + "flos": 415531261440.0, + "grad_norm": 0.06518493190513895, + "language_loss": 0.83341253, + "learning_rate": 0.00010305480659332005, + "loss": 0.84412122, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.28808594, + "step": 4149, + "time_per_iteration": 2.6469006538391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071138, + "balance_loss_mlp": 1.04290879, + "epoch": 0.7983839938437861, + "flos": 465019619328.0, + "grad_norm": 0.06242001263980543, + "language_loss": 0.83330691, + "learning_rate": 0.00010286544696321682, + "loss": 0.84401828, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.28222656, + "step": 4150, + "time_per_iteration": 2.5429742336273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073902, + "balance_loss_mlp": 1.04543519, + "epoch": 0.7985763755290496, + "flos": 510304485888.0, + "grad_norm": 0.06754113423442079, + "language_loss": 0.79446447, + "learning_rate": 0.00010267624151442073, + "loss": 0.80520344, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.28417969, + "step": 4151, + "time_per_iteration": 2.6111056804656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107675, + "balance_loss_mlp": 1.04852068, + "epoch": 0.7987687572143132, + "flos": 1010243847168.0, + "grad_norm": 0.0631421524171095, + "language_loss": 0.80901897, + "learning_rate": 0.000102487190320388, + "loss": 0.81978643, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.2824707, + "step": 4152, + "time_per_iteration": 3.323118209838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068338, + "balance_loss_mlp": 1.04015708, + "epoch": 0.7989611388995768, + "flos": 1020662968320.0, + "grad_norm": 0.0589010586848655, + "language_loss": 0.79593813, + "learning_rate": 0.00010229829345451475, + "loss": 0.80662155, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.28198242, + "step": 4153, + "time_per_iteration": 3.364107370376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071047, + "balance_loss_mlp": 1.04329467, + "epoch": 0.7991535205848403, + "flos": 1100915476992.0, + "grad_norm": 0.06516359919102382, + "language_loss": 0.79660934, + "learning_rate": 0.00010210955099013724, + "loss": 0.80731982, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.27758789, + "step": 4154, + "time_per_iteration": 3.413896322250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070605, + "balance_loss_mlp": 1.04247141, + "epoch": 0.7993459022701039, + "flos": 834454268928.0, + "grad_norm": 0.06322395894070157, + "language_loss": 0.76450896, + "learning_rate": 0.00010192096300053167, + "loss": 0.77521503, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.28149414, + "step": 4155, + "time_per_iteration": 3.1282687187194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069737, + "balance_loss_mlp": 1.04179418, + "epoch": 0.7995382839553674, + "flos": 522417212928.0, + "grad_norm": 0.4084707213419165, + "language_loss": 0.8520155, + "learning_rate": 0.00010173252955891477, + "loss": 0.8627128, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.27929688, + "step": 4156, + "time_per_iteration": 2.78415584564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074464, + "balance_loss_mlp": 1.04685545, + "epoch": 0.799730665640631, + "flos": 537562151424.0, + "grad_norm": 0.06643949206963136, + "language_loss": 0.72880185, + "learning_rate": 0.00010154425073844253, + "loss": 0.73954648, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.27612305, + "step": 4157, + "time_per_iteration": 2.73618221282959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068843, + "balance_loss_mlp": 1.04032815, + "epoch": 0.7999230473258946, + "flos": 504809031168.0, + "grad_norm": 0.05290023006148714, + "language_loss": 0.82135558, + "learning_rate": 0.00010135612661221138, + "loss": 0.83204401, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.28515625, + "step": 4158, + "time_per_iteration": 2.554800510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068433, + "balance_loss_mlp": 1.04008496, + "epoch": 0.8001154290111582, + "flos": 1026935414784.0, + "grad_norm": 0.060322834717302515, + "language_loss": 0.81768221, + "learning_rate": 0.00010116815725325751, + "loss": 0.82836652, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.28344727, + "step": 4159, + "time_per_iteration": 3.2874691486358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077912, + "balance_loss_mlp": 1.04949212, + "epoch": 0.8003078106964217, + "flos": 750567237120.0, + "grad_norm": 0.0534649619029418, + "language_loss": 0.80202901, + "learning_rate": 0.00010098034273455725, + "loss": 0.8128081, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.28417969, + "step": 4160, + "time_per_iteration": 2.9733405113220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071823, + "balance_loss_mlp": 1.04402316, + "epoch": 0.8005001923816852, + "flos": 488201831424.0, + "grad_norm": 0.059729691811872904, + "language_loss": 0.79879338, + "learning_rate": 0.00010079268312902662, + "loss": 0.80951154, + "num_input_tokens_seen": 345015392, + "router_z_loss_mlp": 0.27832031, + "step": 4161, + "time_per_iteration": 2.668815851211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075695, + "balance_loss_mlp": 1.04875386, + "epoch": 0.8006925740669488, + "flos": 512983107072.0, + "grad_norm": 0.06045129484574589, + "language_loss": 0.81970763, + "learning_rate": 0.0001006051785095215, + "loss": 0.8304646, + "num_input_tokens_seen": 345086640, + "router_z_loss_mlp": 0.26977539, + "step": 4162, + "time_per_iteration": 2.653640031814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106761, + "balance_loss_mlp": 1.03988147, + "epoch": 0.8008849557522124, + "flos": 578243446272.0, + "grad_norm": 0.0602464092340954, + "language_loss": 0.79306024, + "learning_rate": 0.0001004178289488376, + "loss": 0.80373633, + "num_input_tokens_seen": 345159616, + "router_z_loss_mlp": 0.27783203, + "step": 4163, + "time_per_iteration": 2.732161283493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069798, + "balance_loss_mlp": 1.04183149, + "epoch": 0.801077337437476, + "flos": 478466569728.0, + "grad_norm": 0.05584875383121944, + "language_loss": 0.83879602, + "learning_rate": 0.0001002306345197106, + "loss": 0.84949404, + "num_input_tokens_seen": 345225536, + "router_z_loss_mlp": 0.2800293, + "step": 4164, + "time_per_iteration": 2.541621685028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072063, + "balance_loss_mlp": 1.04419172, + "epoch": 0.8012697191227395, + "flos": 676384943616.0, + "grad_norm": 0.06393234311197828, + "language_loss": 0.79935479, + "learning_rate": 0.00010004359529481571, + "loss": 0.8100754, + "num_input_tokens_seen": 345302960, + "router_z_loss_mlp": 0.27880859, + "step": 4165, + "time_per_iteration": 2.879521369934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076388, + "balance_loss_mlp": 1.04808736, + "epoch": 0.8014621008080031, + "flos": 1294624567296.0, + "grad_norm": 0.05702716092084167, + "language_loss": 0.82164598, + "learning_rate": 9.985671134676804e-05, + "loss": 0.83240986, + "num_input_tokens_seen": 345397792, + "router_z_loss_mlp": 0.28320312, + "step": 4166, + "time_per_iteration": 3.7128407955169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074533, + "balance_loss_mlp": 1.04687643, + "epoch": 0.8016544824932667, + "flos": 511579072512.0, + "grad_norm": 0.07676481935953286, + "language_loss": 0.82921106, + "learning_rate": 9.966998274812234e-05, + "loss": 0.8399564, + "num_input_tokens_seen": 345465440, + "router_z_loss_mlp": 0.27685547, + "step": 4167, + "time_per_iteration": 2.6149368286132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074371, + "balance_loss_mlp": 1.04676175, + "epoch": 0.8018468641785302, + "flos": 535434969600.0, + "grad_norm": 0.07175891193928671, + "language_loss": 0.8114351, + "learning_rate": 9.948340957137308e-05, + "loss": 0.82217884, + "num_input_tokens_seen": 345533072, + "router_z_loss_mlp": 0.27636719, + "step": 4168, + "time_per_iteration": 2.6559274196624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079124, + "balance_loss_mlp": 1.05177772, + "epoch": 0.8020392458637937, + "flos": 1023025876992.0, + "grad_norm": 0.0825865132585856, + "language_loss": 0.7948184, + "learning_rate": 9.929699188895447e-05, + "loss": 0.80560958, + "num_input_tokens_seen": 345622208, + "router_z_loss_mlp": 0.27416992, + "step": 4169, + "time_per_iteration": 3.292508363723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036816, + "balance_loss_mlp": 1.02546716, + "epoch": 0.8022316275490573, + "flos": 1560993748992.0, + "grad_norm": 0.019591021786405507, + "language_loss": 0.78054404, + "learning_rate": 9.911072977324009e-05, + "loss": 0.79091221, + "num_input_tokens_seen": 345852544, + "router_z_loss_mlp": 0.11328125, + "step": 4170, + "time_per_iteration": 4.99972677230835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079157, + "balance_loss_mlp": 1.05030823, + "epoch": 0.8024240092343209, + "flos": 420473866752.0, + "grad_norm": 0.06556949465152317, + "language_loss": 0.83036101, + "learning_rate": 9.89246232965435e-05, + "loss": 0.84115261, + "num_input_tokens_seen": 345917328, + "router_z_loss_mlp": 0.28833008, + "step": 4171, + "time_per_iteration": 2.4891555309295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077075, + "balance_loss_mlp": 1.04839337, + "epoch": 0.8026163909195845, + "flos": 763506418176.0, + "grad_norm": 0.06284126709301016, + "language_loss": 0.78710306, + "learning_rate": 9.873867253111762e-05, + "loss": 0.7978738, + "num_input_tokens_seen": 345995936, + "router_z_loss_mlp": 0.28686523, + "step": 4172, + "time_per_iteration": 2.9779157638549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034161, + "balance_loss_mlp": 1.02285993, + "epoch": 0.8028087726048481, + "flos": 1518044087808.0, + "grad_norm": 0.018943841707913467, + "language_loss": 0.80264562, + "learning_rate": 9.855287754915503e-05, + "loss": 0.81298721, + "num_input_tokens_seen": 346232720, + "router_z_loss_mlp": 0.11279297, + "step": 4173, + "time_per_iteration": 4.92714524269104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079527, + "balance_loss_mlp": 1.051108, + "epoch": 0.8030011542901115, + "flos": 517620174336.0, + "grad_norm": 0.07028962600154551, + "language_loss": 0.8832283, + "learning_rate": 9.836723842278733e-05, + "loss": 0.8940236, + "num_input_tokens_seen": 346298208, + "router_z_loss_mlp": 0.28417969, + "step": 4174, + "time_per_iteration": 2.6065914630889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075163, + "balance_loss_mlp": 1.04717231, + "epoch": 0.8031935359753751, + "flos": 545356495872.0, + "grad_norm": 0.06309539904613753, + "language_loss": 0.7796675, + "learning_rate": 9.818175522408646e-05, + "loss": 0.7904191, + "num_input_tokens_seen": 346370080, + "router_z_loss_mlp": 0.27966309, + "step": 4175, + "time_per_iteration": 2.6612541675567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075178, + "balance_loss_mlp": 1.04694939, + "epoch": 0.8033859176606387, + "flos": 603266241024.0, + "grad_norm": 0.047657193754151006, + "language_loss": 0.84480703, + "learning_rate": 9.79964280250632e-05, + "loss": 0.85555875, + "num_input_tokens_seen": 346442432, + "router_z_loss_mlp": 0.28222656, + "step": 4176, + "time_per_iteration": 2.7781217098236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076287, + "balance_loss_mlp": 1.0484159, + "epoch": 0.8035782993459023, + "flos": 565579279872.0, + "grad_norm": 0.07387261504337528, + "language_loss": 0.81488836, + "learning_rate": 9.781125689766795e-05, + "loss": 0.82565117, + "num_input_tokens_seen": 346513088, + "router_z_loss_mlp": 0.27905273, + "step": 4177, + "time_per_iteration": 2.6964521408081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073463, + "balance_loss_mlp": 1.04540133, + "epoch": 0.8037706810311658, + "flos": 538177609728.0, + "grad_norm": 0.057863226460369684, + "language_loss": 0.84295249, + "learning_rate": 9.762624191379054e-05, + "loss": 0.85368717, + "num_input_tokens_seen": 346581376, + "router_z_loss_mlp": 0.28051758, + "step": 4178, + "time_per_iteration": 2.618422269821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070523, + "balance_loss_mlp": 1.04231787, + "epoch": 0.8039630627164294, + "flos": 514937170944.0, + "grad_norm": 0.05803558735521543, + "language_loss": 0.79554057, + "learning_rate": 9.744138314526014e-05, + "loss": 0.8062458, + "num_input_tokens_seen": 346653328, + "router_z_loss_mlp": 0.28222656, + "step": 4179, + "time_per_iteration": 2.637068510055542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023515, + "balance_loss_mlp": 1.01240516, + "epoch": 0.804155444401693, + "flos": 1478061209088.0, + "grad_norm": 0.008294306940635323, + "language_loss": 0.74733561, + "learning_rate": 9.725668066384535e-05, + "loss": 0.7575708, + "num_input_tokens_seen": 346873264, + "router_z_loss_mlp": 0.11132812, + "step": 4180, + "time_per_iteration": 4.895167827606201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070457, + "balance_loss_mlp": 1.04163229, + "epoch": 0.8043478260869565, + "flos": 520909871616.0, + "grad_norm": 0.06869839727522731, + "language_loss": 0.7746588, + "learning_rate": 9.707213454125396e-05, + "loss": 0.78536338, + "num_input_tokens_seen": 346946272, + "router_z_loss_mlp": 0.2878418, + "step": 4181, + "time_per_iteration": 2.636059045791626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071714, + "balance_loss_mlp": 1.04296076, + "epoch": 0.8045402077722201, + "flos": 545170231296.0, + "grad_norm": 0.061080671459635506, + "language_loss": 0.80578196, + "learning_rate": 9.688774484913298e-05, + "loss": 0.81649911, + "num_input_tokens_seen": 347024048, + "router_z_loss_mlp": 0.28710938, + "step": 4182, + "time_per_iteration": 2.781472682952881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073737, + "balance_loss_mlp": 1.04460168, + "epoch": 0.8047325894574836, + "flos": 678059610624.0, + "grad_norm": 0.06915536366998667, + "language_loss": 0.73871112, + "learning_rate": 9.670351165906921e-05, + "loss": 0.74944854, + "num_input_tokens_seen": 347108736, + "router_z_loss_mlp": 0.29150391, + "step": 4183, + "time_per_iteration": 2.9433372020721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069336, + "balance_loss_mlp": 1.04103541, + "epoch": 0.8049249711427472, + "flos": 586952994816.0, + "grad_norm": 0.057442229187810216, + "language_loss": 0.78591096, + "learning_rate": 9.65194350425882e-05, + "loss": 0.79660439, + "num_input_tokens_seen": 347184192, + "router_z_loss_mlp": 0.28320312, + "step": 4184, + "time_per_iteration": 2.753244400024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065285, + "balance_loss_mlp": 1.03734207, + "epoch": 0.8051173528280108, + "flos": 813824050176.0, + "grad_norm": 0.055690130588938895, + "language_loss": 0.77644128, + "learning_rate": 9.633551507115452e-05, + "loss": 0.78709412, + "num_input_tokens_seen": 347282336, + "router_z_loss_mlp": 0.27978516, + "step": 4185, + "time_per_iteration": 3.116245746612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071248, + "balance_loss_mlp": 1.04249442, + "epoch": 0.8053097345132744, + "flos": 725371324416.0, + "grad_norm": 0.05368141398175553, + "language_loss": 0.77715063, + "learning_rate": 9.615175181617259e-05, + "loss": 0.78786314, + "num_input_tokens_seen": 347364800, + "router_z_loss_mlp": 0.28735352, + "step": 4186, + "time_per_iteration": 2.9494264125823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067801, + "balance_loss_mlp": 1.03900027, + "epoch": 0.805502116198538, + "flos": 747706733568.0, + "grad_norm": 0.07244263091625658, + "language_loss": 0.81652725, + "learning_rate": 9.596814534898552e-05, + "loss": 0.8272053, + "num_input_tokens_seen": 347443328, + "router_z_loss_mlp": 0.2878418, + "step": 4187, + "time_per_iteration": 2.979442596435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061733, + "balance_loss_mlp": 1.03312325, + "epoch": 0.8056944978838014, + "flos": 639953630208.0, + "grad_norm": 0.06907450450610357, + "language_loss": 0.87470937, + "learning_rate": 9.578469574087561e-05, + "loss": 0.88532674, + "num_input_tokens_seen": 347522064, + "router_z_loss_mlp": 0.28637695, + "step": 4188, + "time_per_iteration": 2.804840564727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069501, + "balance_loss_mlp": 1.04017591, + "epoch": 0.805886879569065, + "flos": 644344796160.0, + "grad_norm": 0.0767121628935675, + "language_loss": 0.78102624, + "learning_rate": 9.560140306306436e-05, + "loss": 0.79172122, + "num_input_tokens_seen": 347597200, + "router_z_loss_mlp": 0.29296875, + "step": 4189, + "time_per_iteration": 2.763796329498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070889, + "balance_loss_mlp": 1.04175389, + "epoch": 0.8060792612543286, + "flos": 660928674816.0, + "grad_norm": 0.06679116415647134, + "language_loss": 0.81191343, + "learning_rate": 9.541826738671233e-05, + "loss": 0.8226223, + "num_input_tokens_seen": 347676928, + "router_z_loss_mlp": 0.29125977, + "step": 4190, + "time_per_iteration": 2.810873031616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106476, + "balance_loss_mlp": 1.03603029, + "epoch": 0.8062716429395922, + "flos": 454842017280.0, + "grad_norm": 0.06652333597663049, + "language_loss": 0.8252098, + "learning_rate": 9.523528878291904e-05, + "loss": 0.83585739, + "num_input_tokens_seen": 347741552, + "router_z_loss_mlp": 0.28735352, + "step": 4191, + "time_per_iteration": 2.5331108570098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071695, + "balance_loss_mlp": 1.04260826, + "epoch": 0.8064640246248557, + "flos": 526153632768.0, + "grad_norm": 0.07127869186789165, + "language_loss": 0.85161996, + "learning_rate": 9.50524673227231e-05, + "loss": 0.86233693, + "num_input_tokens_seen": 347807008, + "router_z_loss_mlp": 0.29052734, + "step": 4192, + "time_per_iteration": 2.652766466140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066547, + "balance_loss_mlp": 1.03776956, + "epoch": 0.8066564063101193, + "flos": 864726617088.0, + "grad_norm": 0.048096998874408305, + "language_loss": 0.82061756, + "learning_rate": 9.486980307710208e-05, + "loss": 0.83128297, + "num_input_tokens_seen": 347895728, + "router_z_loss_mlp": 0.28735352, + "step": 4193, + "time_per_iteration": 3.1492722034454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064872, + "balance_loss_mlp": 1.03559446, + "epoch": 0.8068487879953828, + "flos": 530261019648.0, + "grad_norm": 0.05222546458111616, + "language_loss": 0.8172397, + "learning_rate": 9.468729611697246e-05, + "loss": 0.82788843, + "num_input_tokens_seen": 347970368, + "router_z_loss_mlp": 0.29272461, + "step": 4194, + "time_per_iteration": 2.7544384002685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067388, + "balance_loss_mlp": 1.0379194, + "epoch": 0.8070411696806464, + "flos": 565918313472.0, + "grad_norm": 0.04982276198281567, + "language_loss": 0.81616491, + "learning_rate": 9.450494651319003e-05, + "loss": 0.82683873, + "num_input_tokens_seen": 348039040, + "router_z_loss_mlp": 0.29443359, + "step": 4195, + "time_per_iteration": 2.707900285720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063545, + "balance_loss_mlp": 1.03467226, + "epoch": 0.80723355136591, + "flos": 986176954368.0, + "grad_norm": 0.04761294147814613, + "language_loss": 0.79018849, + "learning_rate": 9.432275433654885e-05, + "loss": 0.80082393, + "num_input_tokens_seen": 348126064, + "router_z_loss_mlp": 0.28857422, + "step": 4196, + "time_per_iteration": 3.3168561458587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066531, + "balance_loss_mlp": 1.0380404, + "epoch": 0.8074259330511735, + "flos": 566682158592.0, + "grad_norm": 0.05760757559429525, + "language_loss": 0.82881331, + "learning_rate": 9.414071965778221e-05, + "loss": 0.83947861, + "num_input_tokens_seen": 348205888, + "router_z_loss_mlp": 0.28491211, + "step": 4197, + "time_per_iteration": 2.8094139099121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064923, + "balance_loss_mlp": 1.03605068, + "epoch": 0.8076183147364371, + "flos": 494391320064.0, + "grad_norm": 0.05415863808022291, + "language_loss": 0.79741108, + "learning_rate": 9.395884254756242e-05, + "loss": 0.80806035, + "num_input_tokens_seen": 348278608, + "router_z_loss_mlp": 0.28881836, + "step": 4198, + "time_per_iteration": 2.7344775199890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065792, + "balance_loss_mlp": 1.03694367, + "epoch": 0.8078106964217007, + "flos": 419798771712.0, + "grad_norm": 0.0525166714503648, + "language_loss": 0.79778922, + "learning_rate": 9.377712307650044e-05, + "loss": 0.80844712, + "num_input_tokens_seen": 348341312, + "router_z_loss_mlp": 0.28808594, + "step": 4199, + "time_per_iteration": 2.481445550918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065581, + "balance_loss_mlp": 1.03642273, + "epoch": 0.8080030781069643, + "flos": 527281242624.0, + "grad_norm": 0.12008878488060483, + "language_loss": 0.82967323, + "learning_rate": 9.359556131514602e-05, + "loss": 0.84032905, + "num_input_tokens_seen": 348409184, + "router_z_loss_mlp": 0.29125977, + "step": 4200, + "time_per_iteration": 2.603832960128784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069975, + "balance_loss_mlp": 1.04081631, + "epoch": 0.8081954597922277, + "flos": 543898616832.0, + "grad_norm": 0.05544324871835158, + "language_loss": 0.81466305, + "learning_rate": 9.341415733398733e-05, + "loss": 0.8253628, + "num_input_tokens_seen": 348480832, + "router_z_loss_mlp": 0.29150391, + "step": 4201, + "time_per_iteration": 2.6372344493865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066578, + "balance_loss_mlp": 1.03768134, + "epoch": 0.8083878414774913, + "flos": 640593819648.0, + "grad_norm": 0.06923511846840386, + "language_loss": 0.75673985, + "learning_rate": 9.323291120345207e-05, + "loss": 0.76740557, + "num_input_tokens_seen": 348559232, + "router_z_loss_mlp": 0.28857422, + "step": 4202, + "time_per_iteration": 2.844560384750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065127, + "balance_loss_mlp": 1.03606391, + "epoch": 0.8085802231627549, + "flos": 705292545024.0, + "grad_norm": 0.06954281652768038, + "language_loss": 0.72733068, + "learning_rate": 9.305182299390614e-05, + "loss": 0.73798198, + "num_input_tokens_seen": 348638960, + "router_z_loss_mlp": 0.2902832, + "step": 4203, + "time_per_iteration": 2.8883166313171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064496, + "balance_loss_mlp": 1.0347656, + "epoch": 0.8087726048480185, + "flos": 419538313728.0, + "grad_norm": 0.06243903224540148, + "language_loss": 0.88454056, + "learning_rate": 9.287089277565409e-05, + "loss": 0.89518553, + "num_input_tokens_seen": 348704816, + "router_z_loss_mlp": 0.296875, + "step": 4204, + "time_per_iteration": 2.5257723331451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067541, + "balance_loss_mlp": 1.03912127, + "epoch": 0.8089649865332821, + "flos": 508493016576.0, + "grad_norm": 0.055666133519853146, + "language_loss": 0.87257159, + "learning_rate": 9.269012061893922e-05, + "loss": 0.88324702, + "num_input_tokens_seen": 348783504, + "router_z_loss_mlp": 0.28417969, + "step": 4205, + "time_per_iteration": 2.764925956726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067535, + "balance_loss_mlp": 1.03842449, + "epoch": 0.8091573682185456, + "flos": 456960434688.0, + "grad_norm": 0.058789121979447925, + "language_loss": 0.84584945, + "learning_rate": 9.250950659394386e-05, + "loss": 0.85652483, + "num_input_tokens_seen": 348858272, + "router_z_loss_mlp": 0.29077148, + "step": 4206, + "time_per_iteration": 2.687206506729126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066053, + "balance_loss_mlp": 1.03734708, + "epoch": 0.8093497499038091, + "flos": 524977970688.0, + "grad_norm": 0.05245178609019049, + "language_loss": 0.76937735, + "learning_rate": 9.232905077078824e-05, + "loss": 0.78003788, + "num_input_tokens_seen": 348934432, + "router_z_loss_mlp": 0.28686523, + "step": 4207, + "time_per_iteration": 2.723975896835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068651, + "balance_loss_mlp": 1.04037452, + "epoch": 0.8095421315890727, + "flos": 489377493504.0, + "grad_norm": 0.07111499051035935, + "language_loss": 0.76618123, + "learning_rate": 9.214875321953164e-05, + "loss": 0.77686775, + "num_input_tokens_seen": 349003856, + "router_z_loss_mlp": 0.28271484, + "step": 4208, + "time_per_iteration": 2.615595817565918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069008, + "balance_loss_mlp": 1.04001641, + "epoch": 0.8097345132743363, + "flos": 624817456128.0, + "grad_norm": 0.05731599003072511, + "language_loss": 0.8059206, + "learning_rate": 9.196861401017164e-05, + "loss": 0.81661069, + "num_input_tokens_seen": 349080544, + "router_z_loss_mlp": 0.28930664, + "step": 4209, + "time_per_iteration": 2.8043084144592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106721, + "balance_loss_mlp": 1.0376935, + "epoch": 0.8099268949595998, + "flos": 615393524736.0, + "grad_norm": 0.06359903235103676, + "language_loss": 0.79155213, + "learning_rate": 9.178863321264475e-05, + "loss": 0.80222422, + "num_input_tokens_seen": 349159072, + "router_z_loss_mlp": 0.29467773, + "step": 4210, + "time_per_iteration": 2.79875111579895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065017, + "balance_loss_mlp": 1.03633547, + "epoch": 0.8101192766448634, + "flos": 479383183872.0, + "grad_norm": 0.056055581706419104, + "language_loss": 0.79616201, + "learning_rate": 9.160881089682566e-05, + "loss": 0.80681217, + "num_input_tokens_seen": 349230176, + "router_z_loss_mlp": 0.28686523, + "step": 4211, + "time_per_iteration": 2.6358375549316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065264, + "balance_loss_mlp": 1.03648686, + "epoch": 0.810311658330127, + "flos": 517078909440.0, + "grad_norm": 0.05344256107518821, + "language_loss": 0.86847901, + "learning_rate": 9.142914713252725e-05, + "loss": 0.87913167, + "num_input_tokens_seen": 349299760, + "router_z_loss_mlp": 0.28759766, + "step": 4212, + "time_per_iteration": 2.6177706718444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069065, + "balance_loss_mlp": 1.04055011, + "epoch": 0.8105040400153906, + "flos": 575481867264.0, + "grad_norm": 0.04499674927197359, + "language_loss": 0.8394531, + "learning_rate": 9.124964198950159e-05, + "loss": 0.85014379, + "num_input_tokens_seen": 349379712, + "router_z_loss_mlp": 0.28515625, + "step": 4213, + "time_per_iteration": 2.7992186546325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064046, + "balance_loss_mlp": 1.0357455, + "epoch": 0.8106964217006541, + "flos": 638658694656.0, + "grad_norm": 0.0596272682353905, + "language_loss": 0.84905821, + "learning_rate": 9.107029553743862e-05, + "loss": 0.85969865, + "num_input_tokens_seen": 349460320, + "router_z_loss_mlp": 0.28320312, + "step": 4214, + "time_per_iteration": 2.8410491943359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072599, + "balance_loss_mlp": 1.04463267, + "epoch": 0.8108888033859176, + "flos": 579237225984.0, + "grad_norm": 0.07027285717141396, + "language_loss": 0.81110525, + "learning_rate": 9.089110784596672e-05, + "loss": 0.82183123, + "num_input_tokens_seen": 349527648, + "router_z_loss_mlp": 0.2800293, + "step": 4215, + "time_per_iteration": 2.6683573722839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062091, + "balance_loss_mlp": 1.03414786, + "epoch": 0.8110811850711812, + "flos": 559612371456.0, + "grad_norm": 0.052081038567736, + "language_loss": 0.83540303, + "learning_rate": 9.071207898465284e-05, + "loss": 0.84602392, + "num_input_tokens_seen": 349606912, + "router_z_loss_mlp": 0.27978516, + "step": 4216, + "time_per_iteration": 2.7824838161468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019607, + "balance_loss_mlp": 1.00854468, + "epoch": 0.8112735667564448, + "flos": 1517160969216.0, + "grad_norm": 0.011434458590002855, + "language_loss": 0.77260417, + "learning_rate": 9.053320902300205e-05, + "loss": 0.78280026, + "num_input_tokens_seen": 349827040, + "router_z_loss_mlp": 0.11083984, + "step": 4217, + "time_per_iteration": 4.637202978134155 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065471, + "balance_loss_mlp": 1.03657508, + "epoch": 0.8114659484417084, + "flos": 616048270848.0, + "grad_norm": 0.07350914645250498, + "language_loss": 0.85149193, + "learning_rate": 9.035449803045792e-05, + "loss": 0.86214668, + "num_input_tokens_seen": 349900080, + "router_z_loss_mlp": 0.2890625, + "step": 4218, + "time_per_iteration": 2.782702684402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067391, + "balance_loss_mlp": 1.0393765, + "epoch": 0.8116583301269719, + "flos": 649624872960.0, + "grad_norm": 0.048207191207865485, + "language_loss": 0.7901873, + "learning_rate": 9.017594607640211e-05, + "loss": 0.80086124, + "num_input_tokens_seen": 349983568, + "router_z_loss_mlp": 0.27990723, + "step": 4219, + "time_per_iteration": 2.930854558944702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066977, + "balance_loss_mlp": 1.03798532, + "epoch": 0.8118507118122354, + "flos": 552811806720.0, + "grad_norm": 0.059588246465710766, + "language_loss": 0.80647886, + "learning_rate": 8.999755323015463e-05, + "loss": 0.81714863, + "num_input_tokens_seen": 350054928, + "router_z_loss_mlp": 0.28979492, + "step": 4220, + "time_per_iteration": 2.711641550064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067087, + "balance_loss_mlp": 1.03857219, + "epoch": 0.812043093497499, + "flos": 543854946816.0, + "grad_norm": 0.050033964999099186, + "language_loss": 0.87859094, + "learning_rate": 8.981931956097384e-05, + "loss": 0.88926178, + "num_input_tokens_seen": 350127872, + "router_z_loss_mlp": 0.28540039, + "step": 4221, + "time_per_iteration": 2.6416759490966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066094, + "balance_loss_mlp": 1.0373888, + "epoch": 0.8122354751827626, + "flos": 583113268224.0, + "grad_norm": 0.05826144530446981, + "language_loss": 0.83350205, + "learning_rate": 8.964124513805628e-05, + "loss": 0.844163, + "num_input_tokens_seen": 350206592, + "router_z_loss_mlp": 0.28735352, + "step": 4222, + "time_per_iteration": 2.8018221855163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020384, + "balance_loss_mlp": 1.00932121, + "epoch": 0.8124278568680262, + "flos": 1529747970048.0, + "grad_norm": 0.011965334136789936, + "language_loss": 0.78250074, + "learning_rate": 8.94633300305363e-05, + "loss": 0.79270458, + "num_input_tokens_seen": 350436048, + "router_z_loss_mlp": 0.11083984, + "step": 4223, + "time_per_iteration": 5.00577974319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067514, + "balance_loss_mlp": 1.03826034, + "epoch": 0.8126202385532897, + "flos": 432640438272.0, + "grad_norm": 0.06449105451981865, + "language_loss": 0.79671866, + "learning_rate": 8.928557430748668e-05, + "loss": 0.80739379, + "num_input_tokens_seen": 350501376, + "router_z_loss_mlp": 0.29248047, + "step": 4224, + "time_per_iteration": 2.5818302631378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018632, + "balance_loss_mlp": 1.00756931, + "epoch": 0.8128126202385533, + "flos": 1547098665984.0, + "grad_norm": 0.01031409207183129, + "language_loss": 0.76495624, + "learning_rate": 8.910797803791854e-05, + "loss": 0.77514255, + "num_input_tokens_seen": 350735232, + "router_z_loss_mlp": 0.11083984, + "step": 4225, + "time_per_iteration": 4.809314727783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069, + "balance_loss_mlp": 1.04081905, + "epoch": 0.8130050019238169, + "flos": 528064026624.0, + "grad_norm": 0.053998637656794475, + "language_loss": 0.88875234, + "learning_rate": 8.893054129078077e-05, + "loss": 0.89944232, + "num_input_tokens_seen": 350805088, + "router_z_loss_mlp": 0.28173828, + "step": 4226, + "time_per_iteration": 2.647254705429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067755, + "balance_loss_mlp": 1.0394311, + "epoch": 0.8131973836090804, + "flos": 542850992640.0, + "grad_norm": 0.06919588802005232, + "language_loss": 0.79975605, + "learning_rate": 8.875326413496037e-05, + "loss": 0.81043363, + "num_input_tokens_seen": 350876896, + "router_z_loss_mlp": 0.28320312, + "step": 4227, + "time_per_iteration": 2.726672410964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070162, + "balance_loss_mlp": 1.0421958, + "epoch": 0.8133897652943439, + "flos": 576223953408.0, + "grad_norm": 0.0543859382223631, + "language_loss": 0.82038212, + "learning_rate": 8.857614663928249e-05, + "loss": 0.83108377, + "num_input_tokens_seen": 350948400, + "router_z_loss_mlp": 0.2800293, + "step": 4228, + "time_per_iteration": 2.6778459548950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072299, + "balance_loss_mlp": 1.04404676, + "epoch": 0.8135821469796075, + "flos": 578937480192.0, + "grad_norm": 0.061060781274094984, + "language_loss": 0.78928632, + "learning_rate": 8.839918887251025e-05, + "loss": 0.80000931, + "num_input_tokens_seen": 351023328, + "router_z_loss_mlp": 0.28222656, + "step": 4229, + "time_per_iteration": 2.7937610149383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069859, + "balance_loss_mlp": 1.04208326, + "epoch": 0.8137745286648711, + "flos": 650023543296.0, + "grad_norm": 0.05733446372690566, + "language_loss": 0.83721739, + "learning_rate": 8.822239090334472e-05, + "loss": 0.84791595, + "num_input_tokens_seen": 351108672, + "router_z_loss_mlp": 0.27783203, + "step": 4230, + "time_per_iteration": 2.929072141647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068776, + "balance_loss_mlp": 1.03980827, + "epoch": 0.8139669103501347, + "flos": 701579446272.0, + "grad_norm": 0.055172445682410025, + "language_loss": 0.75769949, + "learning_rate": 8.804575280042493e-05, + "loss": 0.7683872, + "num_input_tokens_seen": 351185056, + "router_z_loss_mlp": 0.28955078, + "step": 4231, + "time_per_iteration": 2.9424638748168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107569, + "balance_loss_mlp": 1.04748487, + "epoch": 0.8141592920353983, + "flos": 649933383168.0, + "grad_norm": 0.06203096167120011, + "language_loss": 0.83420956, + "learning_rate": 8.786927463232774e-05, + "loss": 0.84496653, + "num_input_tokens_seen": 351255856, + "router_z_loss_mlp": 0.28198242, + "step": 4232, + "time_per_iteration": 2.758073091506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010716, + "balance_loss_mlp": 1.04377663, + "epoch": 0.8143516737206618, + "flos": 536577136128.0, + "grad_norm": 0.060605640781893975, + "language_loss": 0.81175333, + "learning_rate": 8.769295646756853e-05, + "loss": 0.82246929, + "num_input_tokens_seen": 351322336, + "router_z_loss_mlp": 0.27856445, + "step": 4233, + "time_per_iteration": 2.5830631256103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068573, + "balance_loss_mlp": 1.04041553, + "epoch": 0.8145440554059253, + "flos": 508117667328.0, + "grad_norm": 0.06950622119523395, + "language_loss": 0.82293272, + "learning_rate": 8.751679837459963e-05, + "loss": 0.83361846, + "num_input_tokens_seen": 351387440, + "router_z_loss_mlp": 0.28149414, + "step": 4234, + "time_per_iteration": 2.5787734985351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069076, + "balance_loss_mlp": 1.04125214, + "epoch": 0.8147364370911889, + "flos": 634720043520.0, + "grad_norm": 0.06263020713850469, + "language_loss": 0.86699188, + "learning_rate": 8.734080042181181e-05, + "loss": 0.87768269, + "num_input_tokens_seen": 351464192, + "router_z_loss_mlp": 0.27856445, + "step": 4235, + "time_per_iteration": 2.821223735809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066499, + "balance_loss_mlp": 1.03743625, + "epoch": 0.8149288187764525, + "flos": 422576317440.0, + "grad_norm": 0.0652768049797803, + "language_loss": 0.78442669, + "learning_rate": 8.716496267753343e-05, + "loss": 0.79509175, + "num_input_tokens_seen": 351528016, + "router_z_loss_mlp": 0.29052734, + "step": 4236, + "time_per_iteration": 2.4675498008728027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106622, + "balance_loss_mlp": 1.03853941, + "epoch": 0.8151212004617161, + "flos": 597150945792.0, + "grad_norm": 0.07602341505053914, + "language_loss": 0.81648099, + "learning_rate": 8.698928521003097e-05, + "loss": 0.82714319, + "num_input_tokens_seen": 351601648, + "router_z_loss_mlp": 0.27709961, + "step": 4237, + "time_per_iteration": 2.7590246200561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014981, + "balance_loss_mlp": 1.00382304, + "epoch": 0.8153135821469796, + "flos": 1478563186176.0, + "grad_norm": 0.008637050381311823, + "language_loss": 0.77852845, + "learning_rate": 8.681376808750835e-05, + "loss": 0.78867829, + "num_input_tokens_seen": 351826720, + "router_z_loss_mlp": 0.11181641, + "step": 4238, + "time_per_iteration": 4.97124171257019 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071451, + "balance_loss_mlp": 1.04236352, + "epoch": 0.8155059638322432, + "flos": 436870070784.0, + "grad_norm": 0.06276879844041765, + "language_loss": 0.82607353, + "learning_rate": 8.663841137810741e-05, + "loss": 0.83678806, + "num_input_tokens_seen": 351891760, + "router_z_loss_mlp": 0.29052734, + "step": 4239, + "time_per_iteration": 2.5108706951141357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067786, + "balance_loss_mlp": 1.04036808, + "epoch": 0.8156983455175068, + "flos": 794034842112.0, + "grad_norm": 0.05812108506294299, + "language_loss": 0.85652077, + "learning_rate": 8.646321514990763e-05, + "loss": 0.86719859, + "num_input_tokens_seen": 351977504, + "router_z_loss_mlp": 0.27490234, + "step": 4240, + "time_per_iteration": 3.0461056232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069826, + "balance_loss_mlp": 1.04166925, + "epoch": 0.8158907272027703, + "flos": 685685219328.0, + "grad_norm": 0.05086326935086867, + "language_loss": 0.81733894, + "learning_rate": 8.628817947092616e-05, + "loss": 0.8280372, + "num_input_tokens_seen": 352050176, + "router_z_loss_mlp": 0.28173828, + "step": 4241, + "time_per_iteration": 2.8256101608276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068535, + "balance_loss_mlp": 1.04071116, + "epoch": 0.8160831088880338, + "flos": 486812353536.0, + "grad_norm": 0.07447614758134384, + "language_loss": 0.84482515, + "learning_rate": 8.611330440911797e-05, + "loss": 0.85551053, + "num_input_tokens_seen": 352116848, + "router_z_loss_mlp": 0.27832031, + "step": 4242, + "time_per_iteration": 2.5818676948547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069793, + "balance_loss_mlp": 1.04144478, + "epoch": 0.8162754905732974, + "flos": 464635505664.0, + "grad_norm": 0.058835558932383195, + "language_loss": 0.80352938, + "learning_rate": 8.593859003237558e-05, + "loss": 0.81422722, + "num_input_tokens_seen": 352185056, + "router_z_loss_mlp": 0.28369141, + "step": 4243, + "time_per_iteration": 2.5835306644439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012284, + "balance_loss_mlp": 1.00117409, + "epoch": 0.816467872258561, + "flos": 1238879577600.0, + "grad_norm": 0.007644288971294211, + "language_loss": 0.75285125, + "learning_rate": 8.576403640852904e-05, + "loss": 0.76297402, + "num_input_tokens_seen": 352397648, + "router_z_loss_mlp": 0.11132812, + "step": 4244, + "time_per_iteration": 4.721221446990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072248, + "balance_loss_mlp": 1.04399562, + "epoch": 0.8166602539438246, + "flos": 686862291456.0, + "grad_norm": 0.059716392905671066, + "language_loss": 0.86529738, + "learning_rate": 8.558964360534615e-05, + "loss": 0.87601984, + "num_input_tokens_seen": 352478272, + "router_z_loss_mlp": 0.2824707, + "step": 4245, + "time_per_iteration": 2.9283206462860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013346, + "balance_loss_mlp": 1.00228322, + "epoch": 0.8168526356290882, + "flos": 1489674779136.0, + "grad_norm": 0.007574465559788524, + "language_loss": 0.72974741, + "learning_rate": 8.541541169053219e-05, + "loss": 0.73988086, + "num_input_tokens_seen": 352707104, + "router_z_loss_mlp": 0.11083984, + "step": 4246, + "time_per_iteration": 4.933375358581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070413, + "balance_loss_mlp": 1.04230392, + "epoch": 0.8170450173143516, + "flos": 577927733760.0, + "grad_norm": 0.046146442587004816, + "language_loss": 0.84699905, + "learning_rate": 8.524134073172984e-05, + "loss": 0.85770321, + "num_input_tokens_seen": 352779248, + "router_z_loss_mlp": 0.28125, + "step": 4247, + "time_per_iteration": 2.73640513420105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072398, + "balance_loss_mlp": 1.04476547, + "epoch": 0.8172373989996152, + "flos": 570985984512.0, + "grad_norm": 0.057815489386057996, + "language_loss": 0.84281337, + "learning_rate": 8.506743079651974e-05, + "loss": 0.85353732, + "num_input_tokens_seen": 352856784, + "router_z_loss_mlp": 0.27685547, + "step": 4248, + "time_per_iteration": 2.7503533363342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070358, + "balance_loss_mlp": 1.04289269, + "epoch": 0.8174297806848788, + "flos": 528576178176.0, + "grad_norm": 0.05981419977857885, + "language_loss": 0.80560964, + "learning_rate": 8.489368195241948e-05, + "loss": 0.81631327, + "num_input_tokens_seen": 352926496, + "router_z_loss_mlp": 0.27514648, + "step": 4249, + "time_per_iteration": 2.633897066116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066258, + "balance_loss_mlp": 1.03798175, + "epoch": 0.8176221623701424, + "flos": 568819514880.0, + "grad_norm": 0.05344644300420973, + "language_loss": 0.78959692, + "learning_rate": 8.47200942668846e-05, + "loss": 0.80025947, + "num_input_tokens_seen": 353005312, + "router_z_loss_mlp": 0.28295898, + "step": 4250, + "time_per_iteration": 2.801112174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106823, + "balance_loss_mlp": 1.03904736, + "epoch": 0.8178145440554059, + "flos": 656226178560.0, + "grad_norm": 0.06435055632963133, + "language_loss": 0.80169028, + "learning_rate": 8.454666780730735e-05, + "loss": 0.81237257, + "num_input_tokens_seen": 353085120, + "router_z_loss_mlp": 0.29174805, + "step": 4251, + "time_per_iteration": 2.854274272918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072819, + "balance_loss_mlp": 1.04451823, + "epoch": 0.8180069257406695, + "flos": 545643095040.0, + "grad_norm": 0.047060822290908425, + "language_loss": 0.87586474, + "learning_rate": 8.437340264101828e-05, + "loss": 0.88659286, + "num_input_tokens_seen": 353160992, + "router_z_loss_mlp": 0.28271484, + "step": 4252, + "time_per_iteration": 2.7088351249694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072153, + "balance_loss_mlp": 1.04359007, + "epoch": 0.818199307425933, + "flos": 618987350016.0, + "grad_norm": 0.07063067234583648, + "language_loss": 0.84892482, + "learning_rate": 8.420029883528474e-05, + "loss": 0.85964632, + "num_input_tokens_seen": 353233328, + "router_z_loss_mlp": 0.28588867, + "step": 4253, + "time_per_iteration": 2.7312068939208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107228, + "balance_loss_mlp": 1.04436111, + "epoch": 0.8183916891111966, + "flos": 647291077632.0, + "grad_norm": 0.06397953963457907, + "language_loss": 0.77154791, + "learning_rate": 8.402735645731157e-05, + "loss": 0.78227079, + "num_input_tokens_seen": 353310592, + "router_z_loss_mlp": 0.27929688, + "step": 4254, + "time_per_iteration": 2.9217798709869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069202, + "balance_loss_mlp": 1.0413785, + "epoch": 0.8185840707964602, + "flos": 498875618304.0, + "grad_norm": 0.06114349210328935, + "language_loss": 0.77897936, + "learning_rate": 8.385457557424098e-05, + "loss": 0.78967136, + "num_input_tokens_seen": 353376544, + "router_z_loss_mlp": 0.27856445, + "step": 4255, + "time_per_iteration": 2.5912728309631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072136, + "balance_loss_mlp": 1.04407382, + "epoch": 0.8187764524817237, + "flos": 785885497344.0, + "grad_norm": 0.04533193109086393, + "language_loss": 0.79436147, + "learning_rate": 8.368195625315251e-05, + "loss": 0.8050828, + "num_input_tokens_seen": 353461200, + "router_z_loss_mlp": 0.28051758, + "step": 4256, + "time_per_iteration": 3.0689914226531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067872, + "balance_loss_mlp": 1.03961968, + "epoch": 0.8189688341669873, + "flos": 550443105792.0, + "grad_norm": 0.04938986425067683, + "language_loss": 0.80494475, + "learning_rate": 8.350949856106283e-05, + "loss": 0.81562352, + "num_input_tokens_seen": 353538608, + "router_z_loss_mlp": 0.28271484, + "step": 4257, + "time_per_iteration": 2.8081703186035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016419, + "balance_loss_mlp": 1.00545204, + "epoch": 0.8191612158522509, + "flos": 1351247837184.0, + "grad_norm": 0.007513111853899237, + "language_loss": 0.71149343, + "learning_rate": 8.333720256492599e-05, + "loss": 0.72165769, + "num_input_tokens_seen": 353766960, + "router_z_loss_mlp": 0.10986328, + "step": 4258, + "time_per_iteration": 4.860759973526001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066755, + "balance_loss_mlp": 1.03884852, + "epoch": 0.8193535975375145, + "flos": 543997541376.0, + "grad_norm": 0.09856847418015399, + "language_loss": 0.83568203, + "learning_rate": 8.316506833163318e-05, + "loss": 0.8463496, + "num_input_tokens_seen": 353833552, + "router_z_loss_mlp": 0.27893066, + "step": 4259, + "time_per_iteration": 2.6318304538726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067228, + "balance_loss_mlp": 1.0395236, + "epoch": 0.8195459792227779, + "flos": 865361014272.0, + "grad_norm": 0.04796086797261532, + "language_loss": 0.8533324, + "learning_rate": 8.299309592801297e-05, + "loss": 0.86400461, + "num_input_tokens_seen": 353915520, + "router_z_loss_mlp": 0.27709961, + "step": 4260, + "time_per_iteration": 3.097459554672241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107212, + "balance_loss_mlp": 1.04343772, + "epoch": 0.8197383609080415, + "flos": 569015953920.0, + "grad_norm": 0.06519487649428121, + "language_loss": 0.81389135, + "learning_rate": 8.282128542083101e-05, + "loss": 0.82461256, + "num_input_tokens_seen": 353992048, + "router_z_loss_mlp": 0.28686523, + "step": 4261, + "time_per_iteration": 2.7116708755493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067185, + "balance_loss_mlp": 1.03905129, + "epoch": 0.8199307425933051, + "flos": 530546208768.0, + "grad_norm": 0.0751813797891333, + "language_loss": 0.85112655, + "learning_rate": 8.264963687678978e-05, + "loss": 0.86179835, + "num_input_tokens_seen": 354064848, + "router_z_loss_mlp": 0.28100586, + "step": 4262, + "time_per_iteration": 2.6388864517211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069284, + "balance_loss_mlp": 1.04086471, + "epoch": 0.8201231242785687, + "flos": 566781083136.0, + "grad_norm": 0.08342870078967202, + "language_loss": 0.85002542, + "learning_rate": 8.247815036252921e-05, + "loss": 0.86071831, + "num_input_tokens_seen": 354138848, + "router_z_loss_mlp": 0.28393555, + "step": 4263, + "time_per_iteration": 2.720921516418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068453, + "balance_loss_mlp": 1.04067707, + "epoch": 0.8203155059638323, + "flos": 1230037913088.0, + "grad_norm": 0.05275924450375894, + "language_loss": 0.83059227, + "learning_rate": 8.230682594462652e-05, + "loss": 0.84127676, + "num_input_tokens_seen": 354227696, + "router_z_loss_mlp": 0.27807617, + "step": 4264, + "time_per_iteration": 3.537928819656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065474, + "balance_loss_mlp": 1.03722143, + "epoch": 0.8205078876490958, + "flos": 573929445888.0, + "grad_norm": 0.07194471944274317, + "language_loss": 0.79793882, + "learning_rate": 8.213566368959558e-05, + "loss": 0.80859357, + "num_input_tokens_seen": 354298400, + "router_z_loss_mlp": 0.2824707, + "step": 4265, + "time_per_iteration": 2.677060604095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070943, + "balance_loss_mlp": 1.04238069, + "epoch": 0.8207002693343594, + "flos": 931005467136.0, + "grad_norm": 0.05368978054218888, + "language_loss": 0.78217483, + "learning_rate": 8.196466366388744e-05, + "loss": 0.79288435, + "num_input_tokens_seen": 354385024, + "router_z_loss_mlp": 0.28564453, + "step": 4266, + "time_per_iteration": 3.2091941833496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069743, + "balance_loss_mlp": 1.04175258, + "epoch": 0.8208926510196229, + "flos": 549300939264.0, + "grad_norm": 0.05227458424297275, + "language_loss": 0.80184317, + "learning_rate": 8.179382593389029e-05, + "loss": 0.81254053, + "num_input_tokens_seen": 354456384, + "router_z_loss_mlp": 0.27966309, + "step": 4267, + "time_per_iteration": 2.6503403186798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071065, + "balance_loss_mlp": 1.04224026, + "epoch": 0.8210850327048865, + "flos": 647876012544.0, + "grad_norm": 0.055684588368156915, + "language_loss": 0.81990433, + "learning_rate": 8.162315056592918e-05, + "loss": 0.83061492, + "num_input_tokens_seen": 354531296, + "router_z_loss_mlp": 0.28833008, + "step": 4268, + "time_per_iteration": 2.8474974632263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065399, + "balance_loss_mlp": 1.03712273, + "epoch": 0.82127741439015, + "flos": 601227809280.0, + "grad_norm": 0.05335039866685649, + "language_loss": 0.81779087, + "learning_rate": 8.145263762626615e-05, + "loss": 0.82844484, + "num_input_tokens_seen": 354605680, + "router_z_loss_mlp": 0.28271484, + "step": 4269, + "time_per_iteration": 2.7657508850097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072754, + "balance_loss_mlp": 1.04412019, + "epoch": 0.8214697960754136, + "flos": 474577380864.0, + "grad_norm": 0.05697164885970493, + "language_loss": 0.83394897, + "learning_rate": 8.128228718110015e-05, + "loss": 0.84467655, + "num_input_tokens_seen": 354678160, + "router_z_loss_mlp": 0.28637695, + "step": 4270, + "time_per_iteration": 2.7368545532226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069379, + "balance_loss_mlp": 1.04169905, + "epoch": 0.8216621777606772, + "flos": 903288084480.0, + "grad_norm": 0.06652407290888228, + "language_loss": 0.84682125, + "learning_rate": 8.11120992965671e-05, + "loss": 0.85751498, + "num_input_tokens_seen": 354751024, + "router_z_loss_mlp": 0.27734375, + "step": 4271, + "time_per_iteration": 3.138782024383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067361, + "balance_loss_mlp": 1.0394659, + "epoch": 0.8218545594459408, + "flos": 514203849216.0, + "grad_norm": 0.05826092076561135, + "language_loss": 0.81998187, + "learning_rate": 8.094207403873998e-05, + "loss": 0.83065546, + "num_input_tokens_seen": 354819408, + "router_z_loss_mlp": 0.27929688, + "step": 4272, + "time_per_iteration": 2.597888231277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068521, + "balance_loss_mlp": 1.03998256, + "epoch": 0.8220469411312044, + "flos": 494282221056.0, + "grad_norm": 0.05026815750554843, + "language_loss": 0.86299402, + "learning_rate": 8.077221147362829e-05, + "loss": 0.87367922, + "num_input_tokens_seen": 354887376, + "router_z_loss_mlp": 0.28515625, + "step": 4273, + "time_per_iteration": 2.562731981277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067588, + "balance_loss_mlp": 1.03883505, + "epoch": 0.8222393228164678, + "flos": 386223579648.0, + "grad_norm": 0.07057858042680534, + "language_loss": 0.89472818, + "learning_rate": 8.060251166717835e-05, + "loss": 0.90540403, + "num_input_tokens_seen": 354948288, + "router_z_loss_mlp": 0.28710938, + "step": 4274, + "time_per_iteration": 2.3851494789123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072526, + "balance_loss_mlp": 1.0439868, + "epoch": 0.8224317045017314, + "flos": 536331234816.0, + "grad_norm": 0.057023216193292044, + "language_loss": 0.87000436, + "learning_rate": 8.043297468527383e-05, + "loss": 0.88072956, + "num_input_tokens_seen": 355016912, + "router_z_loss_mlp": 0.28588867, + "step": 4275, + "time_per_iteration": 2.6285390853881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067324, + "balance_loss_mlp": 1.03897595, + "epoch": 0.822624086186995, + "flos": 554637832704.0, + "grad_norm": 0.060348854107393414, + "language_loss": 0.82261753, + "learning_rate": 8.02636005937346e-05, + "loss": 0.83329076, + "num_input_tokens_seen": 355085936, + "router_z_loss_mlp": 0.28369141, + "step": 4276, + "time_per_iteration": 2.6405022144317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064105, + "balance_loss_mlp": 1.03544676, + "epoch": 0.8228164678722586, + "flos": 539296455168.0, + "grad_norm": 0.060894679283369814, + "language_loss": 0.79943031, + "learning_rate": 8.009438945831771e-05, + "loss": 0.81007135, + "num_input_tokens_seen": 355161984, + "router_z_loss_mlp": 0.28637695, + "step": 4277, + "time_per_iteration": 2.6903491020202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069836, + "balance_loss_mlp": 1.04134488, + "epoch": 0.8230088495575221, + "flos": 473001638400.0, + "grad_norm": 0.06253294625851578, + "language_loss": 0.7949158, + "learning_rate": 7.992534134471641e-05, + "loss": 0.80561417, + "num_input_tokens_seen": 355234544, + "router_z_loss_mlp": 0.28515625, + "step": 4278, + "time_per_iteration": 2.727847099304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068873, + "balance_loss_mlp": 1.04066813, + "epoch": 0.8232012312427857, + "flos": 591403797504.0, + "grad_norm": 0.07862072734011541, + "language_loss": 0.82629663, + "learning_rate": 7.975645631856127e-05, + "loss": 0.83698535, + "num_input_tokens_seen": 355302896, + "router_z_loss_mlp": 0.28222656, + "step": 4279, + "time_per_iteration": 2.7080447673797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067816, + "balance_loss_mlp": 1.03942037, + "epoch": 0.8233936129280492, + "flos": 572359495680.0, + "grad_norm": 0.05419892783143061, + "language_loss": 0.74572438, + "learning_rate": 7.958773444541916e-05, + "loss": 0.75640255, + "num_input_tokens_seen": 355377040, + "router_z_loss_mlp": 0.28417969, + "step": 4280, + "time_per_iteration": 2.7673287391662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071165, + "balance_loss_mlp": 1.04324651, + "epoch": 0.8235859946133128, + "flos": 730986052608.0, + "grad_norm": 0.05042929375958854, + "language_loss": 0.78113925, + "learning_rate": 7.941917579079383e-05, + "loss": 0.79185092, + "num_input_tokens_seen": 355461616, + "router_z_loss_mlp": 0.27905273, + "step": 4281, + "time_per_iteration": 3.041469097137451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070743, + "balance_loss_mlp": 1.04334915, + "epoch": 0.8237783762985764, + "flos": 570044639232.0, + "grad_norm": 0.0829894991194988, + "language_loss": 0.81421649, + "learning_rate": 7.92507804201253e-05, + "loss": 0.82492399, + "num_input_tokens_seen": 355532480, + "router_z_loss_mlp": 0.27416992, + "step": 4282, + "time_per_iteration": 2.722827434539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021268, + "balance_loss_mlp": 1.01049173, + "epoch": 0.8239707579838399, + "flos": 1465437740544.0, + "grad_norm": 0.01007107364223027, + "language_loss": 0.75297678, + "learning_rate": 7.908254839879092e-05, + "loss": 0.76318944, + "num_input_tokens_seen": 355768752, + "router_z_loss_mlp": 0.10791016, + "step": 4283, + "time_per_iteration": 5.00859522819519 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064022, + "balance_loss_mlp": 1.0352931, + "epoch": 0.8241631396691035, + "flos": 467068225536.0, + "grad_norm": 0.060969567614712394, + "language_loss": 0.80811769, + "learning_rate": 7.89144797921037e-05, + "loss": 0.81875789, + "num_input_tokens_seen": 355838800, + "router_z_loss_mlp": 0.28710938, + "step": 4284, + "time_per_iteration": 2.6598501205444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019005, + "balance_loss_mlp": 1.0081805, + "epoch": 0.8243555213543671, + "flos": 1538648165376.0, + "grad_norm": 0.008520908509729544, + "language_loss": 0.77934271, + "learning_rate": 7.874657466531388e-05, + "loss": 0.78953278, + "num_input_tokens_seen": 356069280, + "router_z_loss_mlp": 0.10839844, + "step": 4285, + "time_per_iteration": 4.975403308868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106329, + "balance_loss_mlp": 1.03549051, + "epoch": 0.8245479030396307, + "flos": 797072845824.0, + "grad_norm": 0.046519887355449104, + "language_loss": 0.82528639, + "learning_rate": 7.85788330836078e-05, + "loss": 0.83591926, + "num_input_tokens_seen": 356164528, + "router_z_loss_mlp": 0.27807617, + "step": 4286, + "time_per_iteration": 3.1330010890960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068205, + "balance_loss_mlp": 1.03985691, + "epoch": 0.8247402847248941, + "flos": 645793910784.0, + "grad_norm": 0.05584365846418652, + "language_loss": 0.76650226, + "learning_rate": 7.841125511210878e-05, + "loss": 0.77718425, + "num_input_tokens_seen": 356243600, + "router_z_loss_mlp": 0.28344727, + "step": 4287, + "time_per_iteration": 2.874102830886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067715, + "balance_loss_mlp": 1.03965342, + "epoch": 0.8249326664101577, + "flos": 604123218432.0, + "grad_norm": 0.046467705900978235, + "language_loss": 0.79150665, + "learning_rate": 7.824384081587637e-05, + "loss": 0.80218387, + "num_input_tokens_seen": 356320320, + "router_z_loss_mlp": 0.28076172, + "step": 4288, + "time_per_iteration": 2.766347646713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071486, + "balance_loss_mlp": 1.04344761, + "epoch": 0.8251250480954213, + "flos": 824006034432.0, + "grad_norm": 0.07598367215213916, + "language_loss": 0.85994101, + "learning_rate": 7.807659025990637e-05, + "loss": 0.87065583, + "num_input_tokens_seen": 356406928, + "router_z_loss_mlp": 0.28076172, + "step": 4289, + "time_per_iteration": 3.083522319793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066482, + "balance_loss_mlp": 1.03877819, + "epoch": 0.8253174297806849, + "flos": 757060853760.0, + "grad_norm": 0.06810151606712053, + "language_loss": 0.78171742, + "learning_rate": 7.790950350913112e-05, + "loss": 0.79238224, + "num_input_tokens_seen": 356481456, + "router_z_loss_mlp": 0.27758789, + "step": 4290, + "time_per_iteration": 2.9262516498565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068204, + "balance_loss_mlp": 1.03983259, + "epoch": 0.8255098114659485, + "flos": 794090096640.0, + "grad_norm": 0.050696526133381645, + "language_loss": 0.87615943, + "learning_rate": 7.774258062841971e-05, + "loss": 0.88684154, + "num_input_tokens_seen": 356568736, + "router_z_loss_mlp": 0.28369141, + "step": 4291, + "time_per_iteration": 3.1552226543426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066383, + "balance_loss_mlp": 1.03846407, + "epoch": 0.825702193151212, + "flos": 710102730240.0, + "grad_norm": 0.05400695782122637, + "language_loss": 0.7710315, + "learning_rate": 7.757582168257731e-05, + "loss": 0.78169525, + "num_input_tokens_seen": 356643328, + "router_z_loss_mlp": 0.27954102, + "step": 4292, + "time_per_iteration": 2.874351739883423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066694, + "balance_loss_mlp": 1.03920412, + "epoch": 0.8258945748364755, + "flos": 683076409344.0, + "grad_norm": 0.05651405628127392, + "language_loss": 0.80610162, + "learning_rate": 7.740922673634537e-05, + "loss": 0.81676853, + "num_input_tokens_seen": 356723824, + "router_z_loss_mlp": 0.27514648, + "step": 4293, + "time_per_iteration": 2.913649559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064767, + "balance_loss_mlp": 1.03641856, + "epoch": 0.8260869565217391, + "flos": 594284649984.0, + "grad_norm": 0.0655769338996001, + "language_loss": 0.79001105, + "learning_rate": 7.724279585440186e-05, + "loss": 0.8006587, + "num_input_tokens_seen": 356796512, + "router_z_loss_mlp": 0.28369141, + "step": 4294, + "time_per_iteration": 2.6959924697875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106916, + "balance_loss_mlp": 1.0408597, + "epoch": 0.8262793382070027, + "flos": 651189030912.0, + "grad_norm": 0.06271254598374965, + "language_loss": 0.85122335, + "learning_rate": 7.707652910136098e-05, + "loss": 0.86191493, + "num_input_tokens_seen": 356868624, + "router_z_loss_mlp": 0.28320312, + "step": 4295, + "time_per_iteration": 2.778247594833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106329, + "balance_loss_mlp": 1.03472757, + "epoch": 0.8264717198922663, + "flos": 538665030144.0, + "grad_norm": 0.06229356932536235, + "language_loss": 0.84610021, + "learning_rate": 7.691042654177315e-05, + "loss": 0.85673308, + "num_input_tokens_seen": 356934368, + "router_z_loss_mlp": 0.28564453, + "step": 4296, + "time_per_iteration": 2.631758689880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066177, + "balance_loss_mlp": 1.0383538, + "epoch": 0.8266641015775298, + "flos": 538689761280.0, + "grad_norm": 0.05860018207960959, + "language_loss": 0.75458044, + "learning_rate": 7.674448824012514e-05, + "loss": 0.76524222, + "num_input_tokens_seen": 357005536, + "router_z_loss_mlp": 0.27807617, + "step": 4297, + "time_per_iteration": 2.6441447734832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066814, + "balance_loss_mlp": 1.03894281, + "epoch": 0.8268564832627934, + "flos": 585077506560.0, + "grad_norm": 0.10598149445543782, + "language_loss": 0.83691001, + "learning_rate": 7.657871426083979e-05, + "loss": 0.84757817, + "num_input_tokens_seen": 357082160, + "router_z_loss_mlp": 0.27905273, + "step": 4298, + "time_per_iteration": 2.7704553604125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063706, + "balance_loss_mlp": 1.0360496, + "epoch": 0.827048864948057, + "flos": 430434680832.0, + "grad_norm": 0.06384628613684656, + "language_loss": 0.84164608, + "learning_rate": 7.641310466827667e-05, + "loss": 0.85228312, + "num_input_tokens_seen": 357146928, + "router_z_loss_mlp": 0.27685547, + "step": 4299, + "time_per_iteration": 2.4719276428222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066203, + "balance_loss_mlp": 1.03866601, + "epoch": 0.8272412466333205, + "flos": 1387915181568.0, + "grad_norm": 0.05066688700219157, + "language_loss": 0.85216463, + "learning_rate": 7.624765952673069e-05, + "loss": 0.86282665, + "num_input_tokens_seen": 357236768, + "router_z_loss_mlp": 0.27563477, + "step": 4300, + "time_per_iteration": 3.7521331310272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066598, + "balance_loss_mlp": 1.03889418, + "epoch": 0.827433628318584, + "flos": 537952057344.0, + "grad_norm": 0.054637515745130344, + "language_loss": 0.82762563, + "learning_rate": 7.608237890043335e-05, + "loss": 0.83829165, + "num_input_tokens_seen": 357307568, + "router_z_loss_mlp": 0.27734375, + "step": 4301, + "time_per_iteration": 2.718935966491699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069831, + "balance_loss_mlp": 1.04172134, + "epoch": 0.8276260100038476, + "flos": 730404089856.0, + "grad_norm": 0.062402863690690924, + "language_loss": 0.77286649, + "learning_rate": 7.59172628535526e-05, + "loss": 0.78356481, + "num_input_tokens_seen": 357387712, + "router_z_loss_mlp": 0.28125, + "step": 4302, + "time_per_iteration": 2.979245185852051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069858, + "balance_loss_mlp": 1.04239202, + "epoch": 0.8278183916891112, + "flos": 870713874432.0, + "grad_norm": 0.0506617431069229, + "language_loss": 0.82704937, + "learning_rate": 7.575231145019196e-05, + "loss": 0.83774793, + "num_input_tokens_seen": 357473360, + "router_z_loss_mlp": 0.27490234, + "step": 4303, + "time_per_iteration": 3.2166168689727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067752, + "balance_loss_mlp": 1.04016745, + "epoch": 0.8280107733743748, + "flos": 594255536640.0, + "grad_norm": 0.04830635372046053, + "language_loss": 0.77627051, + "learning_rate": 7.558752475439134e-05, + "loss": 0.78694797, + "num_input_tokens_seen": 357548432, + "router_z_loss_mlp": 0.27612305, + "step": 4304, + "time_per_iteration": 2.784526824951172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074219, + "balance_loss_mlp": 1.04625297, + "epoch": 0.8282031550596384, + "flos": 768253994496.0, + "grad_norm": 0.06238860390142307, + "language_loss": 0.84069538, + "learning_rate": 7.542290283012653e-05, + "loss": 0.85143757, + "num_input_tokens_seen": 357625968, + "router_z_loss_mlp": 0.27978516, + "step": 4305, + "time_per_iteration": 3.015488624572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064963, + "balance_loss_mlp": 1.03675771, + "epoch": 0.8283955367449019, + "flos": 695775481344.0, + "grad_norm": 0.05683033196778672, + "language_loss": 0.77687621, + "learning_rate": 7.525844574130947e-05, + "loss": 0.78752589, + "num_input_tokens_seen": 357705824, + "router_z_loss_mlp": 0.28222656, + "step": 4306, + "time_per_iteration": 2.9644808769226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066712, + "balance_loss_mlp": 1.03919816, + "epoch": 0.8285879184301654, + "flos": 660304452096.0, + "grad_norm": 0.06215000066971459, + "language_loss": 0.82671452, + "learning_rate": 7.509415355178806e-05, + "loss": 0.83738166, + "num_input_tokens_seen": 357787040, + "router_z_loss_mlp": 0.27514648, + "step": 4307, + "time_per_iteration": 2.9103474617004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071596, + "balance_loss_mlp": 1.04320002, + "epoch": 0.828780300115429, + "flos": 558444063744.0, + "grad_norm": 0.06487976021582191, + "language_loss": 0.77909887, + "learning_rate": 7.493002632534618e-05, + "loss": 0.78981483, + "num_input_tokens_seen": 357856960, + "router_z_loss_mlp": 0.28417969, + "step": 4308, + "time_per_iteration": 2.667210340499878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067943, + "balance_loss_mlp": 1.03940439, + "epoch": 0.8289726818006926, + "flos": 830613132288.0, + "grad_norm": 0.05657563872509185, + "language_loss": 0.81739187, + "learning_rate": 7.476606412570352e-05, + "loss": 0.82807136, + "num_input_tokens_seen": 357937760, + "router_z_loss_mlp": 0.28540039, + "step": 4309, + "time_per_iteration": 3.112323760986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068169, + "balance_loss_mlp": 1.04036903, + "epoch": 0.8291650634859561, + "flos": 731974040064.0, + "grad_norm": 0.06578058701317972, + "language_loss": 0.81024778, + "learning_rate": 7.460226701651624e-05, + "loss": 0.82092953, + "num_input_tokens_seen": 358012480, + "router_z_loss_mlp": 0.27807617, + "step": 4310, + "time_per_iteration": 2.8983981609344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106771, + "balance_loss_mlp": 1.03902817, + "epoch": 0.8293574451712197, + "flos": 860521715712.0, + "grad_norm": 0.047369684545673044, + "language_loss": 0.81142193, + "learning_rate": 7.443863506137566e-05, + "loss": 0.82209897, + "num_input_tokens_seen": 358100720, + "router_z_loss_mlp": 0.28662109, + "step": 4311, + "time_per_iteration": 3.1817171573638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068307, + "balance_loss_mlp": 1.04048347, + "epoch": 0.8295498268564833, + "flos": 494874358272.0, + "grad_norm": 0.047477241670426974, + "language_loss": 0.81896996, + "learning_rate": 7.427516832380948e-05, + "loss": 0.82965302, + "num_input_tokens_seen": 358180496, + "router_z_loss_mlp": 0.27856445, + "step": 4312, + "time_per_iteration": 2.823458671569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067657, + "balance_loss_mlp": 1.04007173, + "epoch": 0.8297422085417469, + "flos": 554176553472.0, + "grad_norm": 0.05048838223449801, + "language_loss": 0.77711129, + "learning_rate": 7.4111866867281e-05, + "loss": 0.78778785, + "num_input_tokens_seen": 358261104, + "router_z_loss_mlp": 0.27612305, + "step": 4313, + "time_per_iteration": 2.7841291427612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064359, + "balance_loss_mlp": 1.03624964, + "epoch": 0.8299345902270104, + "flos": 1247001523200.0, + "grad_norm": 0.053354105207562584, + "language_loss": 0.77411175, + "learning_rate": 7.39487307551896e-05, + "loss": 0.78475529, + "num_input_tokens_seen": 358356368, + "router_z_loss_mlp": 0.28100586, + "step": 4314, + "time_per_iteration": 3.7357640266418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071472, + "balance_loss_mlp": 1.04350495, + "epoch": 0.8301269719122739, + "flos": 584974199808.0, + "grad_norm": 0.06431532292793385, + "language_loss": 0.83130819, + "learning_rate": 7.378576005087034e-05, + "loss": 0.8420229, + "num_input_tokens_seen": 358429104, + "router_z_loss_mlp": 0.2800293, + "step": 4315, + "time_per_iteration": 2.7655749320983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065082, + "balance_loss_mlp": 1.03692484, + "epoch": 0.8303193535975375, + "flos": 509472239616.0, + "grad_norm": 0.05482661069569197, + "language_loss": 0.85277319, + "learning_rate": 7.362295481759412e-05, + "loss": 0.863424, + "num_input_tokens_seen": 358501344, + "router_z_loss_mlp": 0.28198242, + "step": 4316, + "time_per_iteration": 2.6888644695281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065395, + "balance_loss_mlp": 1.03711891, + "epoch": 0.8305117352828011, + "flos": 580375010304.0, + "grad_norm": 0.06137401051825932, + "language_loss": 0.83732426, + "learning_rate": 7.346031511856722e-05, + "loss": 0.84797823, + "num_input_tokens_seen": 358575584, + "router_z_loss_mlp": 0.28271484, + "step": 4317, + "time_per_iteration": 2.73391056060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106906, + "balance_loss_mlp": 1.04035425, + "epoch": 0.8307041169680647, + "flos": 481372153344.0, + "grad_norm": 0.368897655418688, + "language_loss": 0.78677309, + "learning_rate": 7.329784101693232e-05, + "loss": 0.79746372, + "num_input_tokens_seen": 358644304, + "router_z_loss_mlp": 0.28686523, + "step": 4318, + "time_per_iteration": 2.6239781379699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071574, + "balance_loss_mlp": 1.04284477, + "epoch": 0.8308964986533282, + "flos": 624319861248.0, + "grad_norm": 0.05860908770024719, + "language_loss": 0.83063138, + "learning_rate": 7.313553257576727e-05, + "loss": 0.84134716, + "num_input_tokens_seen": 358712384, + "router_z_loss_mlp": 0.28662109, + "step": 4319, + "time_per_iteration": 2.7097573280334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068434, + "balance_loss_mlp": 1.04025316, + "epoch": 0.8310888803385917, + "flos": 826974226944.0, + "grad_norm": 0.06711883496181308, + "language_loss": 0.78550565, + "learning_rate": 7.297338985808589e-05, + "loss": 0.79618996, + "num_input_tokens_seen": 358789264, + "router_z_loss_mlp": 0.28222656, + "step": 4320, + "time_per_iteration": 3.0357778072357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107015, + "balance_loss_mlp": 1.0416826, + "epoch": 0.8312812620238553, + "flos": 583443537408.0, + "grad_norm": 0.05319992693282762, + "language_loss": 0.81702912, + "learning_rate": 7.281141292683746e-05, + "loss": 0.82773066, + "num_input_tokens_seen": 358868976, + "router_z_loss_mlp": 0.28491211, + "step": 4321, + "time_per_iteration": 2.8347558975219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070933, + "balance_loss_mlp": 1.04227519, + "epoch": 0.8314736437091189, + "flos": 1115165560320.0, + "grad_norm": 0.06107038935899217, + "language_loss": 0.74773026, + "learning_rate": 7.26496018449071e-05, + "loss": 0.75843954, + "num_input_tokens_seen": 358953600, + "router_z_loss_mlp": 0.28613281, + "step": 4322, + "time_per_iteration": 3.407073497772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071357, + "balance_loss_mlp": 1.04262769, + "epoch": 0.8316660253943825, + "flos": 517295697408.0, + "grad_norm": 0.07290266812750479, + "language_loss": 0.8181231, + "learning_rate": 7.248795667511543e-05, + "loss": 0.82883668, + "num_input_tokens_seen": 359028768, + "router_z_loss_mlp": 0.28710938, + "step": 4323, + "time_per_iteration": 2.848313093185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070108, + "balance_loss_mlp": 1.04257011, + "epoch": 0.831858407079646, + "flos": 794989334016.0, + "grad_norm": 0.05477920158119857, + "language_loss": 0.78024107, + "learning_rate": 7.232647748021864e-05, + "loss": 0.79094219, + "num_input_tokens_seen": 359116208, + "router_z_loss_mlp": 0.27563477, + "step": 4324, + "time_per_iteration": 3.032369375228882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076179, + "balance_loss_mlp": 1.048141, + "epoch": 0.8320507887649096, + "flos": 549699609600.0, + "grad_norm": 0.05911320807574519, + "language_loss": 0.82844627, + "learning_rate": 7.216516432290843e-05, + "loss": 0.83920801, + "num_input_tokens_seen": 359189552, + "router_z_loss_mlp": 0.28076172, + "step": 4325, + "time_per_iteration": 2.675576686859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074419, + "balance_loss_mlp": 1.04580855, + "epoch": 0.8322431704501732, + "flos": 479160603648.0, + "grad_norm": 0.06505000909529828, + "language_loss": 0.81961429, + "learning_rate": 7.20040172658123e-05, + "loss": 0.83035839, + "num_input_tokens_seen": 359253008, + "router_z_loss_mlp": 0.28588867, + "step": 4326, + "time_per_iteration": 2.6014811992645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072863, + "balance_loss_mlp": 1.04430079, + "epoch": 0.8324355521354367, + "flos": 572157264384.0, + "grad_norm": 0.04659300495959616, + "language_loss": 0.8545717, + "learning_rate": 7.184303637149308e-05, + "loss": 0.86530042, + "num_input_tokens_seen": 359326368, + "router_z_loss_mlp": 0.28564453, + "step": 4327, + "time_per_iteration": 2.686389446258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070747, + "balance_loss_mlp": 1.04311395, + "epoch": 0.8326279338207002, + "flos": 503208557568.0, + "grad_norm": 0.0509990045281191, + "language_loss": 0.82115221, + "learning_rate": 7.168222170244888e-05, + "loss": 0.83185971, + "num_input_tokens_seen": 359394192, + "router_z_loss_mlp": 0.27685547, + "step": 4328, + "time_per_iteration": 2.6402134895324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070183, + "balance_loss_mlp": 1.04157257, + "epoch": 0.8328203155059638, + "flos": 605442885120.0, + "grad_norm": 0.04952821718361573, + "language_loss": 0.80924785, + "learning_rate": 7.152157332111364e-05, + "loss": 0.81994963, + "num_input_tokens_seen": 359476016, + "router_z_loss_mlp": 0.28588867, + "step": 4329, + "time_per_iteration": 2.9259705543518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068872, + "balance_loss_mlp": 1.04033327, + "epoch": 0.8330126971912274, + "flos": 697469087232.0, + "grad_norm": 0.04841901744892354, + "language_loss": 0.85735106, + "learning_rate": 7.136109128985663e-05, + "loss": 0.86803973, + "num_input_tokens_seen": 359554048, + "router_z_loss_mlp": 0.28564453, + "step": 4330, + "time_per_iteration": 2.9183027744293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107316, + "balance_loss_mlp": 1.0450027, + "epoch": 0.833205078876491, + "flos": 493799182848.0, + "grad_norm": 0.054568548047455055, + "language_loss": 0.86655569, + "learning_rate": 7.120077567098249e-05, + "loss": 0.87728733, + "num_input_tokens_seen": 359621440, + "router_z_loss_mlp": 0.28149414, + "step": 4331, + "time_per_iteration": 2.5831360816955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069703, + "balance_loss_mlp": 1.04176021, + "epoch": 0.8333974605617546, + "flos": 482568164352.0, + "grad_norm": 0.055811576976186876, + "language_loss": 0.8251605, + "learning_rate": 7.104062652673115e-05, + "loss": 0.83585751, + "num_input_tokens_seen": 359690320, + "router_z_loss_mlp": 0.27954102, + "step": 4332, + "time_per_iteration": 2.5941505432128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070936, + "balance_loss_mlp": 1.0433507, + "epoch": 0.833589842247018, + "flos": 686517465600.0, + "grad_norm": 0.06675763221573856, + "language_loss": 0.82810611, + "learning_rate": 7.088064391927818e-05, + "loss": 0.83881545, + "num_input_tokens_seen": 359759888, + "router_z_loss_mlp": 0.27612305, + "step": 4333, + "time_per_iteration": 2.8070662021636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071446, + "balance_loss_mlp": 1.04231119, + "epoch": 0.8337822239322816, + "flos": 881377486848.0, + "grad_norm": 0.06204820087732955, + "language_loss": 0.82370806, + "learning_rate": 7.072082791073419e-05, + "loss": 0.83442253, + "num_input_tokens_seen": 359836544, + "router_z_loss_mlp": 0.29101562, + "step": 4334, + "time_per_iteration": 3.121647834777832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106899, + "balance_loss_mlp": 1.0413332, + "epoch": 0.8339746056175452, + "flos": 496940493312.0, + "grad_norm": 0.0625443757441557, + "language_loss": 0.8238197, + "learning_rate": 7.056117856314531e-05, + "loss": 0.83450961, + "num_input_tokens_seen": 359903024, + "router_z_loss_mlp": 0.27685547, + "step": 4335, + "time_per_iteration": 2.6120407581329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074986, + "balance_loss_mlp": 1.04601824, + "epoch": 0.8341669873028088, + "flos": 510244849152.0, + "grad_norm": 0.06721642404221422, + "language_loss": 0.86205637, + "learning_rate": 7.040169593849289e-05, + "loss": 0.87280619, + "num_input_tokens_seen": 359971200, + "router_z_loss_mlp": 0.28979492, + "step": 4336, + "time_per_iteration": 2.663907289505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072023, + "balance_loss_mlp": 1.04348373, + "epoch": 0.8343593689880723, + "flos": 692017302528.0, + "grad_norm": 0.06048352118494476, + "language_loss": 0.84131467, + "learning_rate": 7.024238009869366e-05, + "loss": 0.85203493, + "num_input_tokens_seen": 360042560, + "router_z_loss_mlp": 0.28540039, + "step": 4337, + "time_per_iteration": 2.83551287651062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074517, + "balance_loss_mlp": 1.04602623, + "epoch": 0.8345517506733359, + "flos": 552132329472.0, + "grad_norm": 0.07231250032753044, + "language_loss": 0.78381979, + "learning_rate": 7.008323110559956e-05, + "loss": 0.79456496, + "num_input_tokens_seen": 360118048, + "router_z_loss_mlp": 0.28491211, + "step": 4338, + "time_per_iteration": 2.792090654373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073321, + "balance_loss_mlp": 1.04401958, + "epoch": 0.8347441323585995, + "flos": 591750033408.0, + "grad_norm": 0.05928271157327828, + "language_loss": 0.76391554, + "learning_rate": 6.992424902099754e-05, + "loss": 0.77464879, + "num_input_tokens_seen": 360192528, + "router_z_loss_mlp": 0.29296875, + "step": 4339, + "time_per_iteration": 2.8094851970672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071335, + "balance_loss_mlp": 1.04346359, + "epoch": 0.834936514043863, + "flos": 614625297408.0, + "grad_norm": 0.08334347707601203, + "language_loss": 0.84719282, + "learning_rate": 6.976543390660983e-05, + "loss": 0.85790616, + "num_input_tokens_seen": 360266880, + "router_z_loss_mlp": 0.27905273, + "step": 4340, + "time_per_iteration": 2.7984797954559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068548, + "balance_loss_mlp": 1.04096282, + "epoch": 0.8351288957291266, + "flos": 467590551552.0, + "grad_norm": 0.05919982272479659, + "language_loss": 0.79683816, + "learning_rate": 6.960678582409424e-05, + "loss": 0.80752361, + "num_input_tokens_seen": 360336336, + "router_z_loss_mlp": 0.27612305, + "step": 4341, + "time_per_iteration": 2.6437861919403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068887, + "balance_loss_mlp": 1.04046774, + "epoch": 0.8353212774143901, + "flos": 509063394816.0, + "grad_norm": 0.05870432477932672, + "language_loss": 0.78877068, + "learning_rate": 6.944830483504328e-05, + "loss": 0.79945958, + "num_input_tokens_seen": 360409776, + "router_z_loss_mlp": 0.28417969, + "step": 4342, + "time_per_iteration": 2.666900157928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068798, + "balance_loss_mlp": 1.04121327, + "epoch": 0.8355136590996537, + "flos": 687477749760.0, + "grad_norm": 0.05888286602994688, + "language_loss": 0.80899429, + "learning_rate": 6.928999100098483e-05, + "loss": 0.81968236, + "num_input_tokens_seen": 360486800, + "router_z_loss_mlp": 0.27612305, + "step": 4343, + "time_per_iteration": 2.825812339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070236, + "balance_loss_mlp": 1.04217434, + "epoch": 0.8357060407849173, + "flos": 984019249152.0, + "grad_norm": 0.07015017683216763, + "language_loss": 0.83694071, + "learning_rate": 6.913184438338138e-05, + "loss": 0.84764308, + "num_input_tokens_seen": 360568624, + "router_z_loss_mlp": 0.28076172, + "step": 4344, + "time_per_iteration": 3.2398900985717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071314, + "balance_loss_mlp": 1.04315686, + "epoch": 0.8358984224701809, + "flos": 842657458176.0, + "grad_norm": 0.04900467059707895, + "language_loss": 0.8505708, + "learning_rate": 6.89738650436313e-05, + "loss": 0.86128396, + "num_input_tokens_seen": 360652384, + "router_z_loss_mlp": 0.28149414, + "step": 4345, + "time_per_iteration": 3.166189432144165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071828, + "balance_loss_mlp": 1.04424298, + "epoch": 0.8360908041554445, + "flos": 625945065984.0, + "grad_norm": 0.05480008181708294, + "language_loss": 0.81788313, + "learning_rate": 6.881605304306748e-05, + "loss": 0.82860136, + "num_input_tokens_seen": 360723200, + "router_z_loss_mlp": 0.27612305, + "step": 4346, + "time_per_iteration": 2.732534170150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067997, + "balance_loss_mlp": 1.03967237, + "epoch": 0.8362831858407079, + "flos": 575781613056.0, + "grad_norm": 0.05694009909818929, + "language_loss": 0.84824663, + "learning_rate": 6.865840844295796e-05, + "loss": 0.85892665, + "num_input_tokens_seen": 360798240, + "router_z_loss_mlp": 0.28344727, + "step": 4347, + "time_per_iteration": 2.7295114994049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068483, + "balance_loss_mlp": 1.03946793, + "epoch": 0.8364755675259715, + "flos": 833434348032.0, + "grad_norm": 0.07161579074567852, + "language_loss": 0.80623019, + "learning_rate": 6.850093130450569e-05, + "loss": 0.81691504, + "num_input_tokens_seen": 360873552, + "router_z_loss_mlp": 0.29003906, + "step": 4348, + "time_per_iteration": 3.0577757358551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070469, + "balance_loss_mlp": 1.04193068, + "epoch": 0.8366679492112351, + "flos": 582211210752.0, + "grad_norm": 0.05716211740110942, + "language_loss": 0.86482334, + "learning_rate": 6.834362168884912e-05, + "loss": 0.8755281, + "num_input_tokens_seen": 360940800, + "router_z_loss_mlp": 0.28540039, + "step": 4349, + "time_per_iteration": 2.68066143989563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069343, + "balance_loss_mlp": 1.04018426, + "epoch": 0.8368603308964987, + "flos": 611434524672.0, + "grad_norm": 0.061462223772575715, + "language_loss": 0.87587225, + "learning_rate": 6.818647965706076e-05, + "loss": 0.88656569, + "num_input_tokens_seen": 361014368, + "router_z_loss_mlp": 0.29125977, + "step": 4350, + "time_per_iteration": 2.7892367839813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107107, + "balance_loss_mlp": 1.04310322, + "epoch": 0.8370527125817622, + "flos": 507014788608.0, + "grad_norm": 0.05225473338782787, + "language_loss": 0.8561269, + "learning_rate": 6.802950527014884e-05, + "loss": 0.86683762, + "num_input_tokens_seen": 361087184, + "router_z_loss_mlp": 0.2800293, + "step": 4351, + "time_per_iteration": 2.7321066856384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066463, + "balance_loss_mlp": 1.03787637, + "epoch": 0.8372450942670258, + "flos": 770621285376.0, + "grad_norm": 0.049979512165668406, + "language_loss": 0.825046, + "learning_rate": 6.787269858905603e-05, + "loss": 0.83571064, + "num_input_tokens_seen": 361160720, + "router_z_loss_mlp": 0.28564453, + "step": 4352, + "time_per_iteration": 2.9381721019744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070418, + "balance_loss_mlp": 1.04192686, + "epoch": 0.8374374759522893, + "flos": 579005881344.0, + "grad_norm": 0.053029874390874816, + "language_loss": 0.84654623, + "learning_rate": 6.771605967466033e-05, + "loss": 0.85725045, + "num_input_tokens_seen": 361234432, + "router_z_loss_mlp": 0.28491211, + "step": 4353, + "time_per_iteration": 2.691183090209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066821, + "balance_loss_mlp": 1.03847289, + "epoch": 0.8376298576375529, + "flos": 787781334528.0, + "grad_norm": 0.08828757782414506, + "language_loss": 0.82668114, + "learning_rate": 6.755958858777434e-05, + "loss": 0.83734941, + "num_input_tokens_seen": 361309376, + "router_z_loss_mlp": 0.28344727, + "step": 4354, + "time_per_iteration": 2.9823262691497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067001, + "balance_loss_mlp": 1.03827119, + "epoch": 0.8378222393228165, + "flos": 577337006592.0, + "grad_norm": 0.05380745974011456, + "language_loss": 0.80749297, + "learning_rate": 6.74032853891452e-05, + "loss": 0.81816292, + "num_input_tokens_seen": 361386768, + "router_z_loss_mlp": 0.28710938, + "step": 4355, + "time_per_iteration": 2.7626209259033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067012, + "balance_loss_mlp": 1.03883111, + "epoch": 0.83801462100808, + "flos": 480618482688.0, + "grad_norm": 0.05633813219245277, + "language_loss": 0.81979787, + "learning_rate": 6.724715013945548e-05, + "loss": 0.83046794, + "num_input_tokens_seen": 361456704, + "router_z_loss_mlp": 0.28198242, + "step": 4356, + "time_per_iteration": 2.6264963150024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069337, + "balance_loss_mlp": 1.04089344, + "epoch": 0.8382070026933436, + "flos": 550523091456.0, + "grad_norm": 0.05081476567396691, + "language_loss": 0.89207625, + "learning_rate": 6.709118289932226e-05, + "loss": 0.90276963, + "num_input_tokens_seen": 361533648, + "router_z_loss_mlp": 0.28442383, + "step": 4357, + "time_per_iteration": 2.842620849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107024, + "balance_loss_mlp": 1.04205918, + "epoch": 0.8383993843786072, + "flos": 624655922688.0, + "grad_norm": 0.07040298629212442, + "language_loss": 0.8180182, + "learning_rate": 6.693538372929725e-05, + "loss": 0.82872057, + "num_input_tokens_seen": 361614256, + "router_z_loss_mlp": 0.28198242, + "step": 4358, + "time_per_iteration": 2.916688919067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063253, + "balance_loss_mlp": 1.03504848, + "epoch": 0.8385917660638708, + "flos": 490928504832.0, + "grad_norm": 0.06176298645937789, + "language_loss": 0.86094594, + "learning_rate": 6.677975268986719e-05, + "loss": 0.87157845, + "num_input_tokens_seen": 361679008, + "router_z_loss_mlp": 0.28222656, + "step": 4359, + "time_per_iteration": 2.5380067825317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065272, + "balance_loss_mlp": 1.03675675, + "epoch": 0.8387841477491342, + "flos": 466659380736.0, + "grad_norm": 0.05670082707538084, + "language_loss": 0.86943793, + "learning_rate": 6.662428984145336e-05, + "loss": 0.88009059, + "num_input_tokens_seen": 361747600, + "router_z_loss_mlp": 0.28515625, + "step": 4360, + "time_per_iteration": 2.5779833793640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013763, + "balance_loss_mlp": 1.00289118, + "epoch": 0.8389765294343978, + "flos": 1563339128832.0, + "grad_norm": 0.010559991711123677, + "language_loss": 0.71780187, + "learning_rate": 6.646899524441175e-05, + "loss": 0.72793949, + "num_input_tokens_seen": 361983104, + "router_z_loss_mlp": 0.10888672, + "step": 4361, + "time_per_iteration": 4.992979526519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065453, + "balance_loss_mlp": 1.03729582, + "epoch": 0.8391689111196614, + "flos": 601849059840.0, + "grad_norm": 0.04961232743748672, + "language_loss": 0.8271215, + "learning_rate": 6.631386895903308e-05, + "loss": 0.83777601, + "num_input_tokens_seen": 362065824, + "router_z_loss_mlp": 0.28125, + "step": 4362, + "time_per_iteration": 2.8584516048431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064915, + "balance_loss_mlp": 1.0364244, + "epoch": 0.839361292804925, + "flos": 442818040320.0, + "grad_norm": 0.06952447203418213, + "language_loss": 0.80247456, + "learning_rate": 6.615891104554261e-05, + "loss": 0.8131237, + "num_input_tokens_seen": 362128240, + "router_z_loss_mlp": 0.28491211, + "step": 4363, + "time_per_iteration": 2.55979585647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065438, + "balance_loss_mlp": 1.03654134, + "epoch": 0.8395536744901886, + "flos": 593885979648.0, + "grad_norm": 0.05610159931926655, + "language_loss": 0.82741809, + "learning_rate": 6.600412156410057e-05, + "loss": 0.83807242, + "num_input_tokens_seen": 362198256, + "router_z_loss_mlp": 0.28881836, + "step": 4364, + "time_per_iteration": 2.7487361431121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065297, + "balance_loss_mlp": 1.03752112, + "epoch": 0.8397460561754521, + "flos": 889462812672.0, + "grad_norm": 0.05813866241406409, + "language_loss": 0.85143423, + "learning_rate": 6.58495005748016e-05, + "loss": 0.86208725, + "num_input_tokens_seen": 362279792, + "router_z_loss_mlp": 0.27783203, + "step": 4365, + "time_per_iteration": 3.1682748794555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066254, + "balance_loss_mlp": 1.03759646, + "epoch": 0.8399384378607156, + "flos": 553239590400.0, + "grad_norm": 0.056651294792781116, + "language_loss": 0.89333951, + "learning_rate": 6.569504813767463e-05, + "loss": 0.90400201, + "num_input_tokens_seen": 362351712, + "router_z_loss_mlp": 0.28637695, + "step": 4366, + "time_per_iteration": 2.639616012573242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062691, + "balance_loss_mlp": 1.03386617, + "epoch": 0.8401308195459792, + "flos": 518664826368.0, + "grad_norm": 0.04871038923450433, + "language_loss": 0.83355534, + "learning_rate": 6.554076431268341e-05, + "loss": 0.84418219, + "num_input_tokens_seen": 362423424, + "router_z_loss_mlp": 0.28808594, + "step": 4367, + "time_per_iteration": 2.6365461349487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067679, + "balance_loss_mlp": 1.03925979, + "epoch": 0.8403232012312428, + "flos": 684593925120.0, + "grad_norm": 0.053676716876516345, + "language_loss": 0.80734771, + "learning_rate": 6.538664915972648e-05, + "loss": 0.81802452, + "num_input_tokens_seen": 362514704, + "router_z_loss_mlp": 0.28417969, + "step": 4368, + "time_per_iteration": 3.066606044769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067558, + "balance_loss_mlp": 1.03925812, + "epoch": 0.8405155829165063, + "flos": 577424346624.0, + "grad_norm": 0.06042544525246531, + "language_loss": 0.77456969, + "learning_rate": 6.523270273863652e-05, + "loss": 0.78524524, + "num_input_tokens_seen": 362581296, + "router_z_loss_mlp": 0.28320312, + "step": 4369, + "time_per_iteration": 2.682929515838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064765, + "balance_loss_mlp": 1.03591669, + "epoch": 0.8407079646017699, + "flos": 456393028608.0, + "grad_norm": 0.061853619977902334, + "language_loss": 0.87804818, + "learning_rate": 6.507892510918079e-05, + "loss": 0.88869584, + "num_input_tokens_seen": 362648304, + "router_z_loss_mlp": 0.28857422, + "step": 4370, + "time_per_iteration": 2.565526008605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068309, + "balance_loss_mlp": 1.03977025, + "epoch": 0.8409003462870335, + "flos": 534647803392.0, + "grad_norm": 0.06142629372428209, + "language_loss": 0.81581974, + "learning_rate": 6.492531633106114e-05, + "loss": 0.8265028, + "num_input_tokens_seen": 362721264, + "router_z_loss_mlp": 0.28515625, + "step": 4371, + "time_per_iteration": 2.7487144470214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065942, + "balance_loss_mlp": 1.03757024, + "epoch": 0.8410927279722971, + "flos": 556475443200.0, + "grad_norm": 0.0604641524505276, + "language_loss": 0.77816391, + "learning_rate": 6.477187646391374e-05, + "loss": 0.78882331, + "num_input_tokens_seen": 362795312, + "router_z_loss_mlp": 0.28369141, + "step": 4372, + "time_per_iteration": 2.717592477798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011247, + "balance_loss_mlp": 1.00027978, + "epoch": 0.8412851096575606, + "flos": 1548963979776.0, + "grad_norm": 0.008659597915800551, + "language_loss": 0.77679121, + "learning_rate": 6.461860556730925e-05, + "loss": 0.78690368, + "num_input_tokens_seen": 363026272, + "router_z_loss_mlp": 0.10986328, + "step": 4373, + "time_per_iteration": 4.928239583969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065882, + "balance_loss_mlp": 1.03791547, + "epoch": 0.8414774913428241, + "flos": 551777329152.0, + "grad_norm": 0.06413098641466736, + "language_loss": 0.78880799, + "learning_rate": 6.446550370075271e-05, + "loss": 0.79946685, + "num_input_tokens_seen": 363098384, + "router_z_loss_mlp": 0.27978516, + "step": 4374, + "time_per_iteration": 2.7013869285583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066394, + "balance_loss_mlp": 1.0385704, + "epoch": 0.8416698730280877, + "flos": 572752373760.0, + "grad_norm": 0.061373783777205176, + "language_loss": 0.77249122, + "learning_rate": 6.431257092368336e-05, + "loss": 0.78315514, + "num_input_tokens_seen": 363170960, + "router_z_loss_mlp": 0.27832031, + "step": 4375, + "time_per_iteration": 2.693763017654419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066918, + "balance_loss_mlp": 1.03818846, + "epoch": 0.8418622547133513, + "flos": 758405251584.0, + "grad_norm": 0.06754827285553786, + "language_loss": 0.79854172, + "learning_rate": 6.415980729547543e-05, + "loss": 0.8092109, + "num_input_tokens_seen": 363242000, + "router_z_loss_mlp": 0.28710938, + "step": 4376, + "time_per_iteration": 2.9242076873779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063623, + "balance_loss_mlp": 1.03532255, + "epoch": 0.8420546363986149, + "flos": 1073717448192.0, + "grad_norm": 0.06121521331908178, + "language_loss": 0.72551686, + "learning_rate": 6.40072128754366e-05, + "loss": 0.73615307, + "num_input_tokens_seen": 363340288, + "router_z_loss_mlp": 0.28295898, + "step": 4377, + "time_per_iteration": 3.4273428916931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106422, + "balance_loss_mlp": 1.03601491, + "epoch": 0.8422470180838784, + "flos": 525632716800.0, + "grad_norm": 0.0571677989475448, + "language_loss": 0.82702553, + "learning_rate": 6.385478772280933e-05, + "loss": 0.83766776, + "num_input_tokens_seen": 363416208, + "router_z_loss_mlp": 0.28198242, + "step": 4378, + "time_per_iteration": 2.815692901611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069501, + "balance_loss_mlp": 1.04043794, + "epoch": 0.842439399769142, + "flos": 600552714240.0, + "grad_norm": 0.05646259355458583, + "language_loss": 0.82160503, + "learning_rate": 6.370253189677038e-05, + "loss": 0.83230007, + "num_input_tokens_seen": 363492864, + "router_z_loss_mlp": 0.2902832, + "step": 4379, + "time_per_iteration": 2.7401773929595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067014, + "balance_loss_mlp": 1.03783143, + "epoch": 0.8426317814544055, + "flos": 551935890432.0, + "grad_norm": 0.05535422523937343, + "language_loss": 0.86565614, + "learning_rate": 6.355044545643073e-05, + "loss": 0.87632632, + "num_input_tokens_seen": 363572000, + "router_z_loss_mlp": 0.29150391, + "step": 4380, + "time_per_iteration": 2.7968811988830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063585, + "balance_loss_mlp": 1.03526044, + "epoch": 0.8428241631396691, + "flos": 678531064320.0, + "grad_norm": 0.06626388248762789, + "language_loss": 0.77773583, + "learning_rate": 6.33985284608356e-05, + "loss": 0.78837168, + "num_input_tokens_seen": 363646480, + "router_z_loss_mlp": 0.28320312, + "step": 4381, + "time_per_iteration": 2.8227858543395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064902, + "balance_loss_mlp": 1.03676867, + "epoch": 0.8430165448249327, + "flos": 753365131776.0, + "grad_norm": 0.04710188444733999, + "language_loss": 0.79544091, + "learning_rate": 6.324678096896435e-05, + "loss": 0.80608988, + "num_input_tokens_seen": 363737552, + "router_z_loss_mlp": 0.28149414, + "step": 4382, + "time_per_iteration": 3.1052286624908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067685, + "balance_loss_mlp": 1.03943205, + "epoch": 0.8432089265101962, + "flos": 698817867264.0, + "grad_norm": 0.054966422102889954, + "language_loss": 0.8069393, + "learning_rate": 6.30952030397306e-05, + "loss": 0.81761611, + "num_input_tokens_seen": 363816016, + "router_z_loss_mlp": 0.28271484, + "step": 4383, + "time_per_iteration": 2.9371731281280518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065323, + "balance_loss_mlp": 1.03690386, + "epoch": 0.8434013081954598, + "flos": 485513035776.0, + "grad_norm": 0.061244567985189666, + "language_loss": 0.84519708, + "learning_rate": 6.294379473198208e-05, + "loss": 0.85585028, + "num_input_tokens_seen": 363888192, + "router_z_loss_mlp": 0.28393555, + "step": 4384, + "time_per_iteration": 2.651392936706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067975, + "balance_loss_mlp": 1.03957903, + "epoch": 0.8435936898807234, + "flos": 520372988928.0, + "grad_norm": 0.06241169246185324, + "language_loss": 0.85226697, + "learning_rate": 6.279255610450068e-05, + "loss": 0.86294675, + "num_input_tokens_seen": 363953904, + "router_z_loss_mlp": 0.28442383, + "step": 4385, + "time_per_iteration": 2.6139471530914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069425, + "balance_loss_mlp": 1.04119599, + "epoch": 0.843786071565987, + "flos": 785604690432.0, + "grad_norm": 0.05527502128053877, + "language_loss": 0.80296469, + "learning_rate": 6.264148721600254e-05, + "loss": 0.81365895, + "num_input_tokens_seen": 364031552, + "router_z_loss_mlp": 0.2824707, + "step": 4386, + "time_per_iteration": 2.9919278621673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005481, + "balance_loss_mlp": 0.9944663, + "epoch": 0.8439784532512504, + "flos": 1445472442368.0, + "grad_norm": 0.008775712178222237, + "language_loss": 0.75836509, + "learning_rate": 6.24905881251378e-05, + "loss": 0.76841992, + "num_input_tokens_seen": 364256480, + "router_z_loss_mlp": 0.11035156, + "step": 4387, + "time_per_iteration": 4.946225166320801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065228, + "balance_loss_mlp": 1.03621244, + "epoch": 0.844170834936514, + "flos": 708384393216.0, + "grad_norm": 0.057945273917654624, + "language_loss": 0.82541668, + "learning_rate": 6.23398588904906e-05, + "loss": 0.83606899, + "num_input_tokens_seen": 364329696, + "router_z_loss_mlp": 0.28955078, + "step": 4388, + "time_per_iteration": 2.8812713623046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067854, + "balance_loss_mlp": 1.03983974, + "epoch": 0.8443632166217776, + "flos": 483183622656.0, + "grad_norm": 0.057167711375516135, + "language_loss": 0.79827619, + "learning_rate": 6.218929957057922e-05, + "loss": 0.80895472, + "num_input_tokens_seen": 364400944, + "router_z_loss_mlp": 0.28027344, + "step": 4389, + "time_per_iteration": 2.6971452236175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070953, + "balance_loss_mlp": 1.04274869, + "epoch": 0.8445555983070412, + "flos": 678388469760.0, + "grad_norm": 0.12061127187216408, + "language_loss": 0.80305707, + "learning_rate": 6.2038910223856e-05, + "loss": 0.81376654, + "num_input_tokens_seen": 364475744, + "router_z_loss_mlp": 0.2824707, + "step": 4390, + "time_per_iteration": 2.8095004558563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072504, + "balance_loss_mlp": 1.04401278, + "epoch": 0.8447479799923048, + "flos": 741143305728.0, + "grad_norm": 0.05847375315335963, + "language_loss": 0.74079317, + "learning_rate": 6.18886909087073e-05, + "loss": 0.75151819, + "num_input_tokens_seen": 364557248, + "router_z_loss_mlp": 0.28466797, + "step": 4391, + "time_per_iteration": 2.9872703552246094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010687, + "balance_loss_mlp": 1.04137695, + "epoch": 0.8449403616775683, + "flos": 952897125888.0, + "grad_norm": 0.05269410537387695, + "language_loss": 0.80129778, + "learning_rate": 6.173864168345344e-05, + "loss": 0.81198478, + "num_input_tokens_seen": 364647856, + "router_z_loss_mlp": 0.2734375, + "step": 4392, + "time_per_iteration": 3.261303186416626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071231, + "balance_loss_mlp": 1.04235804, + "epoch": 0.8451327433628318, + "flos": 657054042624.0, + "grad_norm": 0.06670363382703816, + "language_loss": 0.71812409, + "learning_rate": 6.158876260634871e-05, + "loss": 0.72883642, + "num_input_tokens_seen": 364728848, + "router_z_loss_mlp": 0.28857422, + "step": 4393, + "time_per_iteration": 2.943547010421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067618, + "balance_loss_mlp": 1.04055786, + "epoch": 0.8453251250480954, + "flos": 445880775168.0, + "grad_norm": 0.0616456163749573, + "language_loss": 0.83441478, + "learning_rate": 6.143905373558112e-05, + "loss": 0.84509093, + "num_input_tokens_seen": 364794032, + "router_z_loss_mlp": 0.27124023, + "step": 4394, + "time_per_iteration": 2.5297040939331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073842, + "balance_loss_mlp": 1.04475522, + "epoch": 0.845517506733359, + "flos": 542491610112.0, + "grad_norm": 0.0736590019033433, + "language_loss": 0.70597637, + "learning_rate": 6.128951512927305e-05, + "loss": 0.7167148, + "num_input_tokens_seen": 364868624, + "router_z_loss_mlp": 0.2902832, + "step": 4395, + "time_per_iteration": 2.6587178707122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068217, + "balance_loss_mlp": 1.04053688, + "epoch": 0.8457098884186226, + "flos": 502175490048.0, + "grad_norm": 0.050987666257807054, + "language_loss": 0.84470797, + "learning_rate": 6.114014684548046e-05, + "loss": 0.85539019, + "num_input_tokens_seen": 364938208, + "router_z_loss_mlp": 0.27709961, + "step": 4396, + "time_per_iteration": 2.6455705165863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069179, + "balance_loss_mlp": 1.04106975, + "epoch": 0.8459022701038861, + "flos": 448643764224.0, + "grad_norm": 0.05256963604665797, + "language_loss": 0.79372364, + "learning_rate": 6.099094894219326e-05, + "loss": 0.80441546, + "num_input_tokens_seen": 365009440, + "router_z_loss_mlp": 0.28125, + "step": 4397, + "time_per_iteration": 2.692250967025757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068183, + "balance_loss_mlp": 1.040622, + "epoch": 0.8460946517891497, + "flos": 742855850496.0, + "grad_norm": 0.060494887314633476, + "language_loss": 0.74907923, + "learning_rate": 6.0841921477335194e-05, + "loss": 0.7597611, + "num_input_tokens_seen": 365085904, + "router_z_loss_mlp": 0.27587891, + "step": 4398, + "time_per_iteration": 2.89249849319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066532, + "balance_loss_mlp": 1.03763604, + "epoch": 0.8462870334744133, + "flos": 552939844608.0, + "grad_norm": 0.04890785740935349, + "language_loss": 0.79848468, + "learning_rate": 6.069306450876389e-05, + "loss": 0.80915004, + "num_input_tokens_seen": 365163600, + "router_z_loss_mlp": 0.28833008, + "step": 4399, + "time_per_iteration": 2.771097421646118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008457, + "balance_loss_mlp": 0.99753761, + "epoch": 0.8464794151596768, + "flos": 1564033162752.0, + "grad_norm": 0.008986072179428414, + "language_loss": 0.81708568, + "learning_rate": 6.054437809427071e-05, + "loss": 0.82717025, + "num_input_tokens_seen": 365384528, + "router_z_loss_mlp": 0.109375, + "step": 4400, + "time_per_iteration": 4.860820055007935 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065203, + "balance_loss_mlp": 1.03702164, + "epoch": 0.8466717968449403, + "flos": 549930954240.0, + "grad_norm": 0.05293623699929889, + "language_loss": 0.79682398, + "learning_rate": 6.039586229158084e-05, + "loss": 0.80747598, + "num_input_tokens_seen": 365453760, + "router_z_loss_mlp": 0.28222656, + "step": 4401, + "time_per_iteration": 2.6743388175964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067709, + "balance_loss_mlp": 1.03919387, + "epoch": 0.8468641785302039, + "flos": 551625970176.0, + "grad_norm": 0.06578160446582347, + "language_loss": 0.8447904, + "learning_rate": 6.024751715835314e-05, + "loss": 0.85546756, + "num_input_tokens_seen": 365532416, + "router_z_loss_mlp": 0.28515625, + "step": 4402, + "time_per_iteration": 2.833575963973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107146, + "balance_loss_mlp": 1.0428741, + "epoch": 0.8470565602154675, + "flos": 572384226816.0, + "grad_norm": 0.06284331857121975, + "language_loss": 0.87002754, + "learning_rate": 6.009934275218049e-05, + "loss": 0.88074219, + "num_input_tokens_seen": 365603776, + "router_z_loss_mlp": 0.28588867, + "step": 4403, + "time_per_iteration": 2.729248285293579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072184, + "balance_loss_mlp": 1.04428864, + "epoch": 0.8472489419007311, + "flos": 472597175808.0, + "grad_norm": 0.06686068658621137, + "language_loss": 0.84025908, + "learning_rate": 5.995133913058936e-05, + "loss": 0.85098088, + "num_input_tokens_seen": 365670432, + "router_z_loss_mlp": 0.27929688, + "step": 4404, + "time_per_iteration": 2.5385451316833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066659, + "balance_loss_mlp": 1.03804839, + "epoch": 0.8474413235859947, + "flos": 797682511872.0, + "grad_norm": 0.061353729013317905, + "language_loss": 0.79223871, + "learning_rate": 5.980350635103954e-05, + "loss": 0.80290532, + "num_input_tokens_seen": 365741584, + "router_z_loss_mlp": 0.28613281, + "step": 4405, + "time_per_iteration": 2.973203420639038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072442, + "balance_loss_mlp": 1.04457116, + "epoch": 0.8476337052712581, + "flos": 502130409984.0, + "grad_norm": 0.06582777621595964, + "language_loss": 0.80370855, + "learning_rate": 5.9655844470924866e-05, + "loss": 0.81443298, + "num_input_tokens_seen": 365805344, + "router_z_loss_mlp": 0.27929688, + "step": 4406, + "time_per_iteration": 2.5676045417785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106656, + "balance_loss_mlp": 1.03864169, + "epoch": 0.8478260869565217, + "flos": 931586019840.0, + "grad_norm": 0.04644248356743638, + "language_loss": 0.83144867, + "learning_rate": 5.9508353547573e-05, + "loss": 0.84211433, + "num_input_tokens_seen": 365890976, + "router_z_loss_mlp": 0.27954102, + "step": 4407, + "time_per_iteration": 3.197460412979126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067365, + "balance_loss_mlp": 1.03937507, + "epoch": 0.8480184686417853, + "flos": 708502256640.0, + "grad_norm": 0.05623164949383599, + "language_loss": 0.80978203, + "learning_rate": 5.9361033638244855e-05, + "loss": 0.82045567, + "num_input_tokens_seen": 365968912, + "router_z_loss_mlp": 0.2800293, + "step": 4408, + "time_per_iteration": 2.885713815689087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066856, + "balance_loss_mlp": 1.03819788, + "epoch": 0.8482108503270489, + "flos": 614152433664.0, + "grad_norm": 0.052126844540241135, + "language_loss": 0.82540518, + "learning_rate": 5.9213884800135066e-05, + "loss": 0.83607376, + "num_input_tokens_seen": 366047680, + "router_z_loss_mlp": 0.28686523, + "step": 4409, + "time_per_iteration": 2.788428783416748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071329, + "balance_loss_mlp": 1.04326701, + "epoch": 0.8484032320123124, + "flos": 530752822272.0, + "grad_norm": 0.07423031491114718, + "language_loss": 0.81877828, + "learning_rate": 5.906690709037194e-05, + "loss": 0.82949162, + "num_input_tokens_seen": 366118720, + "router_z_loss_mlp": 0.28100586, + "step": 4410, + "time_per_iteration": 2.687079429626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101158, + "balance_loss_mlp": 1.00056553, + "epoch": 0.848595613697576, + "flos": 1541930508288.0, + "grad_norm": 0.009407978937322712, + "language_loss": 0.76296914, + "learning_rate": 5.892010056601726e-05, + "loss": 0.77308494, + "num_input_tokens_seen": 366346928, + "router_z_loss_mlp": 0.11035156, + "step": 4411, + "time_per_iteration": 4.916358232498169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066224, + "balance_loss_mlp": 1.0385201, + "epoch": 0.8487879953828396, + "flos": 677025133056.0, + "grad_norm": 0.05688304553915402, + "language_loss": 0.73515522, + "learning_rate": 5.877346528406635e-05, + "loss": 0.74581748, + "num_input_tokens_seen": 366422848, + "router_z_loss_mlp": 0.27734375, + "step": 4412, + "time_per_iteration": 2.943319797515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066758, + "balance_loss_mlp": 1.03905368, + "epoch": 0.8489803770681031, + "flos": 503425345536.0, + "grad_norm": 0.06238044069939686, + "language_loss": 0.79501128, + "learning_rate": 5.8627001301448105e-05, + "loss": 0.80567884, + "num_input_tokens_seen": 366492016, + "router_z_loss_mlp": 0.27734375, + "step": 4413, + "time_per_iteration": 2.701700448989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069998, + "balance_loss_mlp": 1.04212689, + "epoch": 0.8491727587533667, + "flos": 562896276480.0, + "grad_norm": 0.056348175066762846, + "language_loss": 0.76581597, + "learning_rate": 5.84807086750247e-05, + "loss": 0.77651596, + "num_input_tokens_seen": 366566400, + "router_z_loss_mlp": 0.27905273, + "step": 4414, + "time_per_iteration": 2.7571372985839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071841, + "balance_loss_mlp": 1.04325449, + "epoch": 0.8493651404386302, + "flos": 459544513536.0, + "grad_norm": 0.06822958630668063, + "language_loss": 0.77977884, + "learning_rate": 5.833458746159243e-05, + "loss": 0.79049724, + "num_input_tokens_seen": 366634016, + "router_z_loss_mlp": 0.28588867, + "step": 4415, + "time_per_iteration": 2.5809409618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071242, + "balance_loss_mlp": 1.04294157, + "epoch": 0.8495575221238938, + "flos": 460928199168.0, + "grad_norm": 0.07027331723408024, + "language_loss": 0.81720734, + "learning_rate": 5.818863771788013e-05, + "loss": 0.82791978, + "num_input_tokens_seen": 366704384, + "router_z_loss_mlp": 0.28320312, + "step": 4416, + "time_per_iteration": 2.6256165504455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069693, + "balance_loss_mlp": 1.04189312, + "epoch": 0.8497499038091574, + "flos": 870353081856.0, + "grad_norm": 0.06359252463002799, + "language_loss": 0.81217146, + "learning_rate": 5.8042859500550604e-05, + "loss": 0.82286835, + "num_input_tokens_seen": 366785456, + "router_z_loss_mlp": 0.27807617, + "step": 4417, + "time_per_iteration": 3.1099212169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071152, + "balance_loss_mlp": 1.04246998, + "epoch": 0.849942285494421, + "flos": 779258050560.0, + "grad_norm": 0.0545072417760316, + "language_loss": 0.77756029, + "learning_rate": 5.789725286620018e-05, + "loss": 0.78827178, + "num_input_tokens_seen": 366862848, + "router_z_loss_mlp": 0.28637695, + "step": 4418, + "time_per_iteration": 2.990246534347534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067448, + "balance_loss_mlp": 1.03909969, + "epoch": 0.8501346671796844, + "flos": 513544720896.0, + "grad_norm": 0.06431104376952325, + "language_loss": 0.84794027, + "learning_rate": 5.775181787135819e-05, + "loss": 0.85861474, + "num_input_tokens_seen": 366934800, + "router_z_loss_mlp": 0.28369141, + "step": 4419, + "time_per_iteration": 2.6921567916870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072793, + "balance_loss_mlp": 1.04392087, + "epoch": 0.850327048864948, + "flos": 621149437440.0, + "grad_norm": 0.05225981984620765, + "language_loss": 0.83629984, + "learning_rate": 5.76065545724877e-05, + "loss": 0.84702778, + "num_input_tokens_seen": 367015152, + "router_z_loss_mlp": 0.28833008, + "step": 4420, + "time_per_iteration": 2.843939781188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070105, + "balance_loss_mlp": 1.04254413, + "epoch": 0.8505194305502116, + "flos": 773890633728.0, + "grad_norm": 0.056819561081510095, + "language_loss": 0.79734492, + "learning_rate": 5.746146302598454e-05, + "loss": 0.80804604, + "num_input_tokens_seen": 367092192, + "router_z_loss_mlp": 0.27587891, + "step": 4421, + "time_per_iteration": 3.0194528102874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069285, + "balance_loss_mlp": 1.04024506, + "epoch": 0.8507118122354752, + "flos": 465019619328.0, + "grad_norm": 0.05129689451431866, + "language_loss": 0.86400151, + "learning_rate": 5.731654328817859e-05, + "loss": 0.87469435, + "num_input_tokens_seen": 367159744, + "router_z_loss_mlp": 0.29052734, + "step": 4422, + "time_per_iteration": 2.5654053688049316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068694, + "balance_loss_mlp": 1.04001236, + "epoch": 0.8509041939207388, + "flos": 534150208512.0, + "grad_norm": 0.05425122775065133, + "language_loss": 0.84819269, + "learning_rate": 5.717179541533257e-05, + "loss": 0.85887969, + "num_input_tokens_seen": 367226384, + "router_z_loss_mlp": 0.28662109, + "step": 4423, + "time_per_iteration": 2.692744255065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068587, + "balance_loss_mlp": 1.04031062, + "epoch": 0.8510965756060023, + "flos": 583466858496.0, + "grad_norm": 0.05733874896237715, + "language_loss": 0.84372598, + "learning_rate": 5.702721946364264e-05, + "loss": 0.85441184, + "num_input_tokens_seen": 367294768, + "router_z_loss_mlp": 0.28295898, + "step": 4424, + "time_per_iteration": 2.7339928150177 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106796, + "balance_loss_mlp": 1.03923082, + "epoch": 0.8512889572912659, + "flos": 600548332032.0, + "grad_norm": 0.05652647663414663, + "language_loss": 0.77350199, + "learning_rate": 5.688281548923796e-05, + "loss": 0.78418159, + "num_input_tokens_seen": 367372368, + "router_z_loss_mlp": 0.28735352, + "step": 4425, + "time_per_iteration": 2.757702589035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068547, + "balance_loss_mlp": 1.03977025, + "epoch": 0.8514813389765294, + "flos": 654474345984.0, + "grad_norm": 0.056980152168257754, + "language_loss": 0.78801835, + "learning_rate": 5.673858354818151e-05, + "loss": 0.79870379, + "num_input_tokens_seen": 367452656, + "router_z_loss_mlp": 0.28759766, + "step": 4426, + "time_per_iteration": 2.8438169956207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065367, + "balance_loss_mlp": 1.03742433, + "epoch": 0.851673720661793, + "flos": 429538415616.0, + "grad_norm": 0.06363966604968568, + "language_loss": 0.78092206, + "learning_rate": 5.6594523696468726e-05, + "loss": 0.79157573, + "num_input_tokens_seen": 367517808, + "router_z_loss_mlp": 0.27954102, + "step": 4427, + "time_per_iteration": 2.528083562850952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069727, + "balance_loss_mlp": 1.04075956, + "epoch": 0.8518661023470565, + "flos": 641277679104.0, + "grad_norm": 0.06417237419479298, + "language_loss": 0.79616511, + "learning_rate": 5.645063599002875e-05, + "loss": 0.80686241, + "num_input_tokens_seen": 367591728, + "router_z_loss_mlp": 0.28979492, + "step": 4428, + "time_per_iteration": 2.7901835441589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067232, + "balance_loss_mlp": 1.03828812, + "epoch": 0.8520584840323201, + "flos": 561880737792.0, + "grad_norm": 0.0607366331567848, + "language_loss": 0.79741931, + "learning_rate": 5.630692048472363e-05, + "loss": 0.80809164, + "num_input_tokens_seen": 367664496, + "router_z_loss_mlp": 0.28930664, + "step": 4429, + "time_per_iteration": 2.685030698776245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071453, + "balance_loss_mlp": 1.04329574, + "epoch": 0.8522508657175837, + "flos": 526793822208.0, + "grad_norm": 0.06567497707339605, + "language_loss": 0.78606403, + "learning_rate": 5.61633772363489e-05, + "loss": 0.7967785, + "num_input_tokens_seen": 367735584, + "router_z_loss_mlp": 0.28198242, + "step": 4430, + "time_per_iteration": 2.594611644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063898, + "balance_loss_mlp": 1.03497767, + "epoch": 0.8524432474028473, + "flos": 498875618304.0, + "grad_norm": 0.05326350302130372, + "language_loss": 0.80760658, + "learning_rate": 5.602000630063298e-05, + "loss": 0.81824553, + "num_input_tokens_seen": 367801136, + "router_z_loss_mlp": 0.2890625, + "step": 4431, + "time_per_iteration": 2.5856552124023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106631, + "balance_loss_mlp": 1.03834355, + "epoch": 0.8526356290881109, + "flos": 421089325056.0, + "grad_norm": 0.073571565136352, + "language_loss": 0.79417092, + "learning_rate": 5.587680773323706e-05, + "loss": 0.80483401, + "num_input_tokens_seen": 367865312, + "router_z_loss_mlp": 0.27954102, + "step": 4432, + "time_per_iteration": 2.480302095413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069706, + "balance_loss_mlp": 1.0413816, + "epoch": 0.8528280107733743, + "flos": 507078807552.0, + "grad_norm": 0.05899053033855359, + "language_loss": 0.80417913, + "learning_rate": 5.5733781589756115e-05, + "loss": 0.8148762, + "num_input_tokens_seen": 367931104, + "router_z_loss_mlp": 0.28320312, + "step": 4433, + "time_per_iteration": 2.5961601734161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067688, + "balance_loss_mlp": 1.03950715, + "epoch": 0.8530203924586379, + "flos": 445663987200.0, + "grad_norm": 0.07402673493705796, + "language_loss": 0.82934564, + "learning_rate": 5.5590927925717684e-05, + "loss": 0.84002256, + "num_input_tokens_seen": 367995520, + "router_z_loss_mlp": 0.28198242, + "step": 4434, + "time_per_iteration": 2.504897356033325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067566, + "balance_loss_mlp": 1.03945613, + "epoch": 0.8532127741439015, + "flos": 657452712960.0, + "grad_norm": 0.06775200512771863, + "language_loss": 0.83294642, + "learning_rate": 5.54482467965825e-05, + "loss": 0.84362209, + "num_input_tokens_seen": 368073664, + "router_z_loss_mlp": 0.28100586, + "step": 4435, + "time_per_iteration": 2.8722753524780273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060812, + "balance_loss_mlp": 1.03201151, + "epoch": 0.8534051558291651, + "flos": 535750682112.0, + "grad_norm": 0.049124463523354554, + "language_loss": 0.83115995, + "learning_rate": 5.5305738257744264e-05, + "loss": 0.84176803, + "num_input_tokens_seen": 368147536, + "router_z_loss_mlp": 0.2878418, + "step": 4436, + "time_per_iteration": 2.7586135864257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069476, + "balance_loss_mlp": 1.04081857, + "epoch": 0.8535975375144286, + "flos": 532741791744.0, + "grad_norm": 0.07253609135717012, + "language_loss": 0.78917527, + "learning_rate": 5.5163402364529655e-05, + "loss": 0.79987001, + "num_input_tokens_seen": 368218672, + "router_z_loss_mlp": 0.28637695, + "step": 4437, + "time_per_iteration": 2.665250301361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064267, + "balance_loss_mlp": 1.03591907, + "epoch": 0.8537899191996922, + "flos": 573861044736.0, + "grad_norm": 0.06315024185021119, + "language_loss": 0.82323515, + "learning_rate": 5.502123917219848e-05, + "loss": 0.83387786, + "num_input_tokens_seen": 368287056, + "router_z_loss_mlp": 0.28344727, + "step": 4438, + "time_per_iteration": 2.6837167739868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068421, + "balance_loss_mlp": 1.04019177, + "epoch": 0.8539823008849557, + "flos": 464759161344.0, + "grad_norm": 0.0810478140018265, + "language_loss": 0.83188379, + "learning_rate": 5.48792487359433e-05, + "loss": 0.84256798, + "num_input_tokens_seen": 368358400, + "router_z_loss_mlp": 0.28271484, + "step": 4439, + "time_per_iteration": 2.6771581172943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065679, + "balance_loss_mlp": 1.03687835, + "epoch": 0.8541746825702193, + "flos": 554441393664.0, + "grad_norm": 0.05580742758143019, + "language_loss": 0.8114894, + "learning_rate": 5.4737431110889745e-05, + "loss": 0.82214624, + "num_input_tokens_seen": 368427168, + "router_z_loss_mlp": 0.2878418, + "step": 4440, + "time_per_iteration": 2.703986644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066517, + "balance_loss_mlp": 1.0385263, + "epoch": 0.8543670642554829, + "flos": 546101402112.0, + "grad_norm": 0.07237493250834019, + "language_loss": 0.77604347, + "learning_rate": 5.4595786352096165e-05, + "loss": 0.78670859, + "num_input_tokens_seen": 368503584, + "router_z_loss_mlp": 0.28027344, + "step": 4441, + "time_per_iteration": 2.809252977371216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106809, + "balance_loss_mlp": 1.03988481, + "epoch": 0.8545594459407464, + "flos": 511766747136.0, + "grad_norm": 0.05090580444418766, + "language_loss": 0.82180196, + "learning_rate": 5.4454314514554236e-05, + "loss": 0.83248281, + "num_input_tokens_seen": 368576976, + "router_z_loss_mlp": 0.28222656, + "step": 4442, + "time_per_iteration": 2.6570944786071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069096, + "balance_loss_mlp": 1.04098618, + "epoch": 0.85475182762601, + "flos": 420961287168.0, + "grad_norm": 0.060096294700318055, + "language_loss": 0.81646609, + "learning_rate": 5.431301565318786e-05, + "loss": 0.82715702, + "num_input_tokens_seen": 368641664, + "router_z_loss_mlp": 0.28149414, + "step": 4443, + "time_per_iteration": 2.5243723392486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065725, + "balance_loss_mlp": 1.03792512, + "epoch": 0.8549442093112736, + "flos": 389222295552.0, + "grad_norm": 0.06469608643018868, + "language_loss": 0.773826, + "learning_rate": 5.41718898228542e-05, + "loss": 0.78448325, + "num_input_tokens_seen": 368705616, + "router_z_loss_mlp": 0.27807617, + "step": 4444, + "time_per_iteration": 2.51920223236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063699, + "balance_loss_mlp": 1.03558922, + "epoch": 0.8551365909965372, + "flos": 605620385280.0, + "grad_norm": 0.059194132325457664, + "language_loss": 0.79776013, + "learning_rate": 5.403093707834334e-05, + "loss": 0.80839705, + "num_input_tokens_seen": 368779664, + "router_z_loss_mlp": 0.28125, + "step": 4445, + "time_per_iteration": 2.801943063735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066012, + "balance_loss_mlp": 1.03671026, + "epoch": 0.8553289726818007, + "flos": 503912765952.0, + "grad_norm": 0.05844778654273943, + "language_loss": 0.78704023, + "learning_rate": 5.3890157474377865e-05, + "loss": 0.79770029, + "num_input_tokens_seen": 368846656, + "router_z_loss_mlp": 0.29272461, + "step": 4446, + "time_per_iteration": 2.6274378299713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067948, + "balance_loss_mlp": 1.03914738, + "epoch": 0.8555213543670642, + "flos": 556735901184.0, + "grad_norm": 0.0545348209619519, + "language_loss": 0.759287, + "learning_rate": 5.374955106561324e-05, + "loss": 0.76996648, + "num_input_tokens_seen": 368923712, + "router_z_loss_mlp": 0.28808594, + "step": 4447, + "time_per_iteration": 2.781522274017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066487, + "balance_loss_mlp": 1.03852105, + "epoch": 0.8557137360523278, + "flos": 547843060224.0, + "grad_norm": 0.05508059918721569, + "language_loss": 0.74790716, + "learning_rate": 5.360911790663775e-05, + "loss": 0.7585721, + "num_input_tokens_seen": 368994496, + "router_z_loss_mlp": 0.2800293, + "step": 4448, + "time_per_iteration": 2.681140184402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106353, + "balance_loss_mlp": 1.03518176, + "epoch": 0.8559061177375914, + "flos": 727853506560.0, + "grad_norm": 0.05884214790792896, + "language_loss": 0.78717124, + "learning_rate": 5.346885805197238e-05, + "loss": 0.7978065, + "num_input_tokens_seen": 369077088, + "router_z_loss_mlp": 0.28369141, + "step": 4449, + "time_per_iteration": 3.0732901096343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068527, + "balance_loss_mlp": 1.03967822, + "epoch": 0.856098499422855, + "flos": 535608087552.0, + "grad_norm": 0.06758405280159155, + "language_loss": 0.82919681, + "learning_rate": 5.332877155607085e-05, + "loss": 0.83988202, + "num_input_tokens_seen": 369147680, + "router_z_loss_mlp": 0.28857422, + "step": 4450, + "time_per_iteration": 2.658113479614258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071731, + "balance_loss_mlp": 1.04352653, + "epoch": 0.8562908811081185, + "flos": 573388180992.0, + "grad_norm": 0.06293317417138165, + "language_loss": 0.83193231, + "learning_rate": 5.3188858473319504e-05, + "loss": 0.84264964, + "num_input_tokens_seen": 369224320, + "router_z_loss_mlp": 0.28222656, + "step": 4451, + "time_per_iteration": 2.7371931076049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106554, + "balance_loss_mlp": 1.03700137, + "epoch": 0.856483262793382, + "flos": 781391024640.0, + "grad_norm": 0.06311736302267067, + "language_loss": 0.80342978, + "learning_rate": 5.3049118858037426e-05, + "loss": 0.81408519, + "num_input_tokens_seen": 369315744, + "router_z_loss_mlp": 0.28564453, + "step": 4452, + "time_per_iteration": 3.095228433609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106368, + "balance_loss_mlp": 1.03533196, + "epoch": 0.8566756444786456, + "flos": 455585513472.0, + "grad_norm": 0.057168939084114495, + "language_loss": 0.84728843, + "learning_rate": 5.290955276447651e-05, + "loss": 0.85792524, + "num_input_tokens_seen": 369382800, + "router_z_loss_mlp": 0.28344727, + "step": 4453, + "time_per_iteration": 2.595768690109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068236, + "balance_loss_mlp": 1.04072213, + "epoch": 0.8568680261639092, + "flos": 449150123520.0, + "grad_norm": 0.058366089298651294, + "language_loss": 0.8424089, + "learning_rate": 5.277016024682091e-05, + "loss": 0.85309124, + "num_input_tokens_seen": 369447312, + "router_z_loss_mlp": 0.27514648, + "step": 4454, + "time_per_iteration": 2.5411229133605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068899, + "balance_loss_mlp": 1.04107571, + "epoch": 0.8570604078491728, + "flos": 479736774144.0, + "grad_norm": 0.07223117728599122, + "language_loss": 0.82632047, + "learning_rate": 5.2630941359187665e-05, + "loss": 0.83700949, + "num_input_tokens_seen": 369512800, + "router_z_loss_mlp": 0.27856445, + "step": 4455, + "time_per_iteration": 2.5366690158843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066311, + "balance_loss_mlp": 1.03765273, + "epoch": 0.8572527895344363, + "flos": 505695121920.0, + "grad_norm": 0.061147295474926186, + "language_loss": 0.84813732, + "learning_rate": 5.249189615562627e-05, + "loss": 0.85880041, + "num_input_tokens_seen": 369580720, + "router_z_loss_mlp": 0.28613281, + "step": 4456, + "time_per_iteration": 2.5954558849334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065891, + "balance_loss_mlp": 1.03771043, + "epoch": 0.8574451712196999, + "flos": 786688630272.0, + "grad_norm": 0.05061557722226465, + "language_loss": 0.83000439, + "learning_rate": 5.235302469011905e-05, + "loss": 0.84066331, + "num_input_tokens_seen": 369672544, + "router_z_loss_mlp": 0.28222656, + "step": 4457, + "time_per_iteration": 3.1139042377471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065238, + "balance_loss_mlp": 1.03629398, + "epoch": 0.8576375529049635, + "flos": 508980436992.0, + "grad_norm": 0.05994421710631203, + "language_loss": 0.75134158, + "learning_rate": 5.2214327016580575e-05, + "loss": 0.761994, + "num_input_tokens_seen": 369745776, + "router_z_loss_mlp": 0.28881836, + "step": 4458, + "time_per_iteration": 2.6730198860168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007078, + "balance_loss_mlp": 0.99591976, + "epoch": 0.857829934590227, + "flos": 1459996130304.0, + "grad_norm": 0.007822702191887595, + "language_loss": 0.84767288, + "learning_rate": 5.207580318885802e-05, + "loss": 0.85774368, + "num_input_tokens_seen": 369975200, + "router_z_loss_mlp": 0.11181641, + "step": 4459, + "time_per_iteration": 4.979666233062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067045, + "balance_loss_mlp": 1.03779101, + "epoch": 0.8580223162754905, + "flos": 479057296896.0, + "grad_norm": 0.05274398336577564, + "language_loss": 0.89012241, + "learning_rate": 5.193745326073118e-05, + "loss": 0.90079284, + "num_input_tokens_seen": 370043296, + "router_z_loss_mlp": 0.29223633, + "step": 4460, + "time_per_iteration": 2.6836555004119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067615, + "balance_loss_mlp": 1.03931475, + "epoch": 0.8582146979607541, + "flos": 705926942208.0, + "grad_norm": 0.07596315948303173, + "language_loss": 0.79420805, + "learning_rate": 5.179927728591227e-05, + "loss": 0.8048842, + "num_input_tokens_seen": 370111152, + "router_z_loss_mlp": 0.28295898, + "step": 4461, + "time_per_iteration": 2.853403329849243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066518, + "balance_loss_mlp": 1.03807497, + "epoch": 0.8584070796460177, + "flos": 764826084864.0, + "grad_norm": 0.06387758845808282, + "language_loss": 0.82548052, + "learning_rate": 5.1661275318045874e-05, + "loss": 0.83614576, + "num_input_tokens_seen": 370190272, + "router_z_loss_mlp": 0.28442383, + "step": 4462, + "time_per_iteration": 2.9871556758880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070395, + "balance_loss_mlp": 1.04204726, + "epoch": 0.8585994613312813, + "flos": 586535385600.0, + "grad_norm": 0.057586538294609683, + "language_loss": 0.8564322, + "learning_rate": 5.152344741070919e-05, + "loss": 0.86713612, + "num_input_tokens_seen": 370267056, + "router_z_loss_mlp": 0.28369141, + "step": 4463, + "time_per_iteration": 2.8135032653808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065659, + "balance_loss_mlp": 1.03678679, + "epoch": 0.8587918430165449, + "flos": 607993468416.0, + "grad_norm": 0.05234014265771045, + "language_loss": 0.78836596, + "learning_rate": 5.138579361741169e-05, + "loss": 0.79902256, + "num_input_tokens_seen": 370344176, + "router_z_loss_mlp": 0.28881836, + "step": 4464, + "time_per_iteration": 2.7817888259887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068079, + "balance_loss_mlp": 1.03963614, + "epoch": 0.8589842247018084, + "flos": 588710619648.0, + "grad_norm": 0.06230218500152689, + "language_loss": 0.8085956, + "learning_rate": 5.124831399159535e-05, + "loss": 0.81927645, + "num_input_tokens_seen": 370414224, + "router_z_loss_mlp": 0.28466797, + "step": 4465, + "time_per_iteration": 2.691600799560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065653, + "balance_loss_mlp": 1.03768635, + "epoch": 0.8591766063870719, + "flos": 543609045504.0, + "grad_norm": 0.07971528973299408, + "language_loss": 0.78662705, + "learning_rate": 5.1111008586634475e-05, + "loss": 0.79728359, + "num_input_tokens_seen": 370484736, + "router_z_loss_mlp": 0.27978516, + "step": 4466, + "time_per_iteration": 2.647693157196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106499, + "balance_loss_mlp": 1.03661847, + "epoch": 0.8593689880723355, + "flos": 493499437056.0, + "grad_norm": 0.057340189460979636, + "language_loss": 0.80966145, + "learning_rate": 5.0973877455835816e-05, + "loss": 0.82031131, + "num_input_tokens_seen": 370556512, + "router_z_loss_mlp": 0.28369141, + "step": 4467, + "time_per_iteration": 2.670189619064331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070601, + "balance_loss_mlp": 1.04294395, + "epoch": 0.8595613697575991, + "flos": 533652613632.0, + "grad_norm": 0.07143678371041538, + "language_loss": 0.83760196, + "learning_rate": 5.083692065243822e-05, + "loss": 0.84830797, + "num_input_tokens_seen": 370622880, + "router_z_loss_mlp": 0.27685547, + "step": 4468, + "time_per_iteration": 2.6147608757019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068362, + "balance_loss_mlp": 1.04034781, + "epoch": 0.8597537514428626, + "flos": 617347588608.0, + "grad_norm": 0.061866552118211966, + "language_loss": 0.75730455, + "learning_rate": 5.070013822961328e-05, + "loss": 0.7679882, + "num_input_tokens_seen": 370691632, + "router_z_loss_mlp": 0.28027344, + "step": 4469, + "time_per_iteration": 2.7232584953308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064035, + "balance_loss_mlp": 1.03580678, + "epoch": 0.8599461331281262, + "flos": 608450365440.0, + "grad_norm": 0.05685660271928497, + "language_loss": 0.83694613, + "learning_rate": 5.056353024046462e-05, + "loss": 0.84758651, + "num_input_tokens_seen": 370764848, + "router_z_loss_mlp": 0.2824707, + "step": 4470, + "time_per_iteration": 2.777681827545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068517, + "balance_loss_mlp": 1.04009736, + "epoch": 0.8601385148133898, + "flos": 550979988480.0, + "grad_norm": 0.05506266431023708, + "language_loss": 0.82577848, + "learning_rate": 5.042709673802786e-05, + "loss": 0.83646369, + "num_input_tokens_seen": 370832496, + "router_z_loss_mlp": 0.28417969, + "step": 4471, + "time_per_iteration": 2.6651957035064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106752, + "balance_loss_mlp": 1.03836131, + "epoch": 0.8603308964986534, + "flos": 580907510784.0, + "grad_norm": 0.06361138287055206, + "language_loss": 0.8119573, + "learning_rate": 5.0290837775271494e-05, + "loss": 0.82263255, + "num_input_tokens_seen": 370917104, + "router_z_loss_mlp": 0.29125977, + "step": 4472, + "time_per_iteration": 2.867305278778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068349, + "balance_loss_mlp": 1.04002476, + "epoch": 0.8605232781839169, + "flos": 628731376128.0, + "grad_norm": 0.061969617336128574, + "language_loss": 0.75447845, + "learning_rate": 5.0154753405095846e-05, + "loss": 0.76516187, + "num_input_tokens_seen": 370984512, + "router_z_loss_mlp": 0.28344727, + "step": 4473, + "time_per_iteration": 2.791969060897827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064299, + "balance_loss_mlp": 1.03607023, + "epoch": 0.8607156598691804, + "flos": 467904854016.0, + "grad_norm": 0.06996386665919671, + "language_loss": 0.77089655, + "learning_rate": 5.0018843680333604e-05, + "loss": 0.78153956, + "num_input_tokens_seen": 371049664, + "router_z_loss_mlp": 0.2824707, + "step": 4474, + "time_per_iteration": 2.5247669219970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064708, + "balance_loss_mlp": 1.03655124, + "epoch": 0.860908041554444, + "flos": 488142194688.0, + "grad_norm": 0.07527102079674898, + "language_loss": 0.82489771, + "learning_rate": 4.988310865374945e-05, + "loss": 0.83554482, + "num_input_tokens_seen": 371120704, + "router_z_loss_mlp": 0.28149414, + "step": 4475, + "time_per_iteration": 2.6851634979248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067248, + "balance_loss_mlp": 1.03987718, + "epoch": 0.8611004232397076, + "flos": 591827198976.0, + "grad_norm": 0.06066793633900129, + "language_loss": 0.80281663, + "learning_rate": 4.974754837804057e-05, + "loss": 0.81348914, + "num_input_tokens_seen": 371189376, + "router_z_loss_mlp": 0.27392578, + "step": 4476, + "time_per_iteration": 2.7129712104797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068054, + "balance_loss_mlp": 1.04018247, + "epoch": 0.8612928049249712, + "flos": 773857138176.0, + "grad_norm": 0.055176333782017764, + "language_loss": 0.85914743, + "learning_rate": 4.9612162905836036e-05, + "loss": 0.86982793, + "num_input_tokens_seen": 371275184, + "router_z_loss_mlp": 0.27905273, + "step": 4477, + "time_per_iteration": 3.055014133453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067412, + "balance_loss_mlp": 1.03868282, + "epoch": 0.8614851866102347, + "flos": 537291518976.0, + "grad_norm": 0.06049058254958562, + "language_loss": 0.82140207, + "learning_rate": 4.947695228969718e-05, + "loss": 0.83207619, + "num_input_tokens_seen": 371347920, + "router_z_loss_mlp": 0.28710938, + "step": 4478, + "time_per_iteration": 2.6869184970855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066452, + "balance_loss_mlp": 1.03889072, + "epoch": 0.8616775682954982, + "flos": 565647681024.0, + "grad_norm": 0.08549280129733618, + "language_loss": 0.79003942, + "learning_rate": 4.934191658211729e-05, + "loss": 0.800704, + "num_input_tokens_seen": 371419728, + "router_z_loss_mlp": 0.27587891, + "step": 4479, + "time_per_iteration": 2.6531260013580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065181, + "balance_loss_mlp": 1.03638005, + "epoch": 0.8618699499807618, + "flos": 481351804416.0, + "grad_norm": 0.11855450332692621, + "language_loss": 0.81331623, + "learning_rate": 4.92070558355221e-05, + "loss": 0.82396805, + "num_input_tokens_seen": 371488768, + "router_z_loss_mlp": 0.2878418, + "step": 4480, + "time_per_iteration": 2.6510956287384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066433, + "balance_loss_mlp": 1.03684497, + "epoch": 0.8620623316660254, + "flos": 649214618112.0, + "grad_norm": 0.07320616066460611, + "language_loss": 0.74202549, + "learning_rate": 4.9072370102269226e-05, + "loss": 0.75268984, + "num_input_tokens_seen": 371560144, + "router_z_loss_mlp": 0.2956543, + "step": 4481, + "time_per_iteration": 2.761094331741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065154, + "balance_loss_mlp": 1.03706789, + "epoch": 0.862254713351289, + "flos": 751457710080.0, + "grad_norm": 0.06277275556700077, + "language_loss": 0.8580991, + "learning_rate": 4.893785943464801e-05, + "loss": 0.86875063, + "num_input_tokens_seen": 371635920, + "router_z_loss_mlp": 0.28100586, + "step": 4482, + "time_per_iteration": 2.967822790145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069099, + "balance_loss_mlp": 1.03998828, + "epoch": 0.8624470950365525, + "flos": 841147144704.0, + "grad_norm": 0.06010002710742802, + "language_loss": 0.77420175, + "learning_rate": 4.880352388488024e-05, + "loss": 0.78489274, + "num_input_tokens_seen": 371727664, + "router_z_loss_mlp": 0.29101562, + "step": 4483, + "time_per_iteration": 3.2577385902404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067825, + "balance_loss_mlp": 1.03957295, + "epoch": 0.8626394767218161, + "flos": 754470982656.0, + "grad_norm": 0.07300953897576297, + "language_loss": 0.82941705, + "learning_rate": 4.866936350511969e-05, + "loss": 0.84009528, + "num_input_tokens_seen": 371800832, + "router_z_loss_mlp": 0.28222656, + "step": 4484, + "time_per_iteration": 2.9013171195983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067464, + "balance_loss_mlp": 1.03856742, + "epoch": 0.8628318584070797, + "flos": 703268669952.0, + "grad_norm": 0.06168064749637158, + "language_loss": 0.82346129, + "learning_rate": 4.853537834745203e-05, + "loss": 0.83413589, + "num_input_tokens_seen": 371871472, + "router_z_loss_mlp": 0.28881836, + "step": 4485, + "time_per_iteration": 2.921997308731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068391, + "balance_loss_mlp": 1.03954196, + "epoch": 0.8630242400923432, + "flos": 471006876672.0, + "grad_norm": 0.061195678734605755, + "language_loss": 0.77577496, + "learning_rate": 4.840156846389487e-05, + "loss": 0.78645885, + "num_input_tokens_seen": 371936512, + "router_z_loss_mlp": 0.28808594, + "step": 4486, + "time_per_iteration": 2.5501646995544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067068, + "balance_loss_mlp": 1.0388155, + "epoch": 0.8632166217776067, + "flos": 963965200896.0, + "grad_norm": 0.07614172176482971, + "language_loss": 0.77287424, + "learning_rate": 4.826793390639783e-05, + "loss": 0.7835449, + "num_input_tokens_seen": 372018032, + "router_z_loss_mlp": 0.2824707, + "step": 4487, + "time_per_iteration": 3.2161014080047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065505, + "balance_loss_mlp": 1.03665614, + "epoch": 0.8634090034628703, + "flos": 767583281664.0, + "grad_norm": 0.06353304542331387, + "language_loss": 0.78799319, + "learning_rate": 4.813447472684246e-05, + "loss": 0.79864818, + "num_input_tokens_seen": 372092176, + "router_z_loss_mlp": 0.28833008, + "step": 4488, + "time_per_iteration": 2.9450225830078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065894, + "balance_loss_mlp": 1.03697419, + "epoch": 0.8636013851481339, + "flos": 520310380032.0, + "grad_norm": 0.06251575685184195, + "language_loss": 0.82971573, + "learning_rate": 4.800119097704214e-05, + "loss": 0.84037471, + "num_input_tokens_seen": 372166880, + "router_z_loss_mlp": 0.28881836, + "step": 4489, + "time_per_iteration": 2.740370512008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067401, + "balance_loss_mlp": 1.03917265, + "epoch": 0.8637937668333975, + "flos": 631858129920.0, + "grad_norm": 0.06333852042335102, + "language_loss": 0.80451763, + "learning_rate": 4.7868082708742324e-05, + "loss": 0.81519163, + "num_input_tokens_seen": 372234608, + "router_z_loss_mlp": 0.28198242, + "step": 4490, + "time_per_iteration": 2.7359256744384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068762, + "balance_loss_mlp": 1.04010427, + "epoch": 0.8639861485186611, + "flos": 855739233792.0, + "grad_norm": 0.05047353967061317, + "language_loss": 0.76060426, + "learning_rate": 4.773514997362e-05, + "loss": 0.77129185, + "num_input_tokens_seen": 372314704, + "router_z_loss_mlp": 0.28662109, + "step": 4491, + "time_per_iteration": 3.0797441005706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071015, + "balance_loss_mlp": 1.04261971, + "epoch": 0.8641785302039245, + "flos": 481017153024.0, + "grad_norm": 0.05674318342180607, + "language_loss": 0.77455688, + "learning_rate": 4.7602392823284605e-05, + "loss": 0.785267, + "num_input_tokens_seen": 372374848, + "router_z_loss_mlp": 0.28417969, + "step": 4492, + "time_per_iteration": 2.520038366317749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070128, + "balance_loss_mlp": 1.04135144, + "epoch": 0.8643709118891881, + "flos": 504385629696.0, + "grad_norm": 0.06254727528350278, + "language_loss": 0.80063522, + "learning_rate": 4.746981130927675e-05, + "loss": 0.81133652, + "num_input_tokens_seen": 372442432, + "router_z_loss_mlp": 0.2878418, + "step": 4493, + "time_per_iteration": 2.5938947200775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065619, + "balance_loss_mlp": 1.03712773, + "epoch": 0.8645632935744517, + "flos": 552074102784.0, + "grad_norm": 0.055240372629072394, + "language_loss": 0.82212245, + "learning_rate": 4.733740548306908e-05, + "loss": 0.83277869, + "num_input_tokens_seen": 372520048, + "router_z_loss_mlp": 0.28466797, + "step": 4494, + "time_per_iteration": 2.77341365814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066304, + "balance_loss_mlp": 1.03869498, + "epoch": 0.8647556752597153, + "flos": 524489140224.0, + "grad_norm": 0.15546869391129756, + "language_loss": 0.84280682, + "learning_rate": 4.7205175396066336e-05, + "loss": 0.85346985, + "num_input_tokens_seen": 372587968, + "router_z_loss_mlp": 0.27636719, + "step": 4495, + "time_per_iteration": 2.574237108230591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070237, + "balance_loss_mlp": 1.04196072, + "epoch": 0.8649480569449788, + "flos": 787403013120.0, + "grad_norm": 0.05684902230614366, + "language_loss": 0.81967145, + "learning_rate": 4.707312109960471e-05, + "loss": 0.83037388, + "num_input_tokens_seen": 372672544, + "router_z_loss_mlp": 0.28271484, + "step": 4496, + "time_per_iteration": 3.0772690773010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066128, + "balance_loss_mlp": 1.03770816, + "epoch": 0.8651404386302424, + "flos": 763531149312.0, + "grad_norm": 0.05956401155270589, + "language_loss": 0.76680404, + "learning_rate": 4.694124264495225e-05, + "loss": 0.77746534, + "num_input_tokens_seen": 372751296, + "router_z_loss_mlp": 0.28417969, + "step": 4497, + "time_per_iteration": 3.0376369953155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064599, + "balance_loss_mlp": 1.03603673, + "epoch": 0.865332820315506, + "flos": 539620932096.0, + "grad_norm": 0.05886756519779109, + "language_loss": 0.82413983, + "learning_rate": 4.680954008330851e-05, + "loss": 0.83478582, + "num_input_tokens_seen": 372825264, + "router_z_loss_mlp": 0.28564453, + "step": 4498, + "time_per_iteration": 2.8252370357513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004967, + "balance_loss_mlp": 0.99366641, + "epoch": 0.8655252020007695, + "flos": 1475874390528.0, + "grad_norm": 0.009480995024256391, + "language_loss": 0.79174447, + "learning_rate": 4.667801346580519e-05, + "loss": 0.80179417, + "num_input_tokens_seen": 373052000, + "router_z_loss_mlp": 0.11279297, + "step": 4499, + "time_per_iteration": 4.7803168296813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066203, + "balance_loss_mlp": 1.03714013, + "epoch": 0.8657175836860331, + "flos": 517094876160.0, + "grad_norm": 0.05771110198912223, + "language_loss": 0.82750368, + "learning_rate": 4.6546662843505396e-05, + "loss": 0.83816576, + "num_input_tokens_seen": 373124128, + "router_z_loss_mlp": 0.29052734, + "step": 4500, + "time_per_iteration": 2.737542152404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067037, + "balance_loss_mlp": 1.03892779, + "epoch": 0.8659099653712966, + "flos": 590247074304.0, + "grad_norm": 0.05908664540109528, + "language_loss": 0.80244732, + "learning_rate": 4.641548826740394e-05, + "loss": 0.81311762, + "num_input_tokens_seen": 373195472, + "router_z_loss_mlp": 0.28149414, + "step": 4501, + "time_per_iteration": 2.7165422439575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064384, + "balance_loss_mlp": 1.03613114, + "epoch": 0.8661023470565602, + "flos": 590168498688.0, + "grad_norm": 0.06739029778355735, + "language_loss": 0.87976968, + "learning_rate": 4.628448978842731e-05, + "loss": 0.89041352, + "num_input_tokens_seen": 373273504, + "router_z_loss_mlp": 0.28271484, + "step": 4502, + "time_per_iteration": 2.880788803100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062872, + "balance_loss_mlp": 1.03440487, + "epoch": 0.8662947287418238, + "flos": 567405305856.0, + "grad_norm": 0.04997855335218525, + "language_loss": 0.79264534, + "learning_rate": 4.61536674574336e-05, + "loss": 0.80327404, + "num_input_tokens_seen": 373346032, + "router_z_loss_mlp": 0.28491211, + "step": 4503, + "time_per_iteration": 2.7816219329833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065958, + "balance_loss_mlp": 1.03703749, + "epoch": 0.8664871104270874, + "flos": 515661728256.0, + "grad_norm": 0.046072741879525626, + "language_loss": 0.82059586, + "learning_rate": 4.6023021325212636e-05, + "loss": 0.83125544, + "num_input_tokens_seen": 373419968, + "router_z_loss_mlp": 0.2890625, + "step": 4504, + "time_per_iteration": 2.8134889602661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068809, + "balance_loss_mlp": 1.04010344, + "epoch": 0.866679492112351, + "flos": 556973038080.0, + "grad_norm": 0.052643351801927495, + "language_loss": 0.78038937, + "learning_rate": 4.589255144248561e-05, + "loss": 0.79107749, + "num_input_tokens_seen": 373502448, + "router_z_loss_mlp": 0.28710938, + "step": 4505, + "time_per_iteration": 2.845855712890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062538, + "balance_loss_mlp": 1.03399956, + "epoch": 0.8668718737976144, + "flos": 722145646080.0, + "grad_norm": 0.07179310361545532, + "language_loss": 0.81647635, + "learning_rate": 4.57622578599054e-05, + "loss": 0.82710177, + "num_input_tokens_seen": 373581184, + "router_z_loss_mlp": 0.28515625, + "step": 4506, + "time_per_iteration": 2.9011623859405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065183, + "balance_loss_mlp": 1.03633463, + "epoch": 0.867064255482878, + "flos": 600424676352.0, + "grad_norm": 0.07537486330186009, + "language_loss": 0.84679854, + "learning_rate": 4.5632140628056705e-05, + "loss": 0.85745037, + "num_input_tokens_seen": 373652272, + "router_z_loss_mlp": 0.28833008, + "step": 4507, + "time_per_iteration": 2.6858415603637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059998, + "balance_loss_mlp": 1.03184044, + "epoch": 0.8672566371681416, + "flos": 803177966592.0, + "grad_norm": 0.05593310912213693, + "language_loss": 0.76031673, + "learning_rate": 4.550219979745529e-05, + "loss": 0.7709167, + "num_input_tokens_seen": 373734896, + "router_z_loss_mlp": 0.28125, + "step": 4508, + "time_per_iteration": 3.0288636684417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063296, + "balance_loss_mlp": 1.03439939, + "epoch": 0.8674490188534052, + "flos": 627072675840.0, + "grad_norm": 0.06601583141232006, + "language_loss": 0.83780628, + "learning_rate": 4.5372435418548905e-05, + "loss": 0.84843922, + "num_input_tokens_seen": 373806960, + "router_z_loss_mlp": 0.2890625, + "step": 4509, + "time_per_iteration": 2.739122152328491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106569, + "balance_loss_mlp": 1.0366981, + "epoch": 0.8676414005386687, + "flos": 727489741824.0, + "grad_norm": 0.058760625067754736, + "language_loss": 0.86417365, + "learning_rate": 4.524284754171615e-05, + "loss": 0.87483048, + "num_input_tokens_seen": 373888352, + "router_z_loss_mlp": 0.28979492, + "step": 4510, + "time_per_iteration": 2.9747283458709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065116, + "balance_loss_mlp": 1.03671992, + "epoch": 0.8678337822239323, + "flos": 539676186624.0, + "grad_norm": 0.06474660971141381, + "language_loss": 0.80936235, + "learning_rate": 4.5113436217267765e-05, + "loss": 0.82001352, + "num_input_tokens_seen": 373962112, + "router_z_loss_mlp": 0.28393555, + "step": 4511, + "time_per_iteration": 2.756804943084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064399, + "balance_loss_mlp": 1.03576529, + "epoch": 0.8680261639091958, + "flos": 507270864384.0, + "grad_norm": 0.06943904903889057, + "language_loss": 0.79382515, + "learning_rate": 4.4984201495445744e-05, + "loss": 0.80446917, + "num_input_tokens_seen": 374028256, + "router_z_loss_mlp": 0.28613281, + "step": 4512, + "time_per_iteration": 2.5936288833618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066971, + "balance_loss_mlp": 1.03824139, + "epoch": 0.8682185455944594, + "flos": 486871990272.0, + "grad_norm": 0.05745344948747144, + "language_loss": 0.80955023, + "learning_rate": 4.4855143426423275e-05, + "loss": 0.82021987, + "num_input_tokens_seen": 374100080, + "router_z_loss_mlp": 0.28710938, + "step": 4513, + "time_per_iteration": 2.6303818225860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061644, + "balance_loss_mlp": 1.03324854, + "epoch": 0.868410927279723, + "flos": 603413217792.0, + "grad_norm": 0.0600055800011045, + "language_loss": 0.80860162, + "learning_rate": 4.472626206030528e-05, + "loss": 0.8192181, + "num_input_tokens_seen": 374174368, + "router_z_loss_mlp": 0.28417969, + "step": 4514, + "time_per_iteration": 2.6981005668640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065518, + "balance_loss_mlp": 1.03612089, + "epoch": 0.8686033089649865, + "flos": 1118552772096.0, + "grad_norm": 0.07859483635461387, + "language_loss": 0.8481617, + "learning_rate": 4.4597557447127846e-05, + "loss": 0.85881692, + "num_input_tokens_seen": 374257328, + "router_z_loss_mlp": 0.29370117, + "step": 4515, + "time_per_iteration": 3.3843672275543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106426, + "balance_loss_mlp": 1.03491068, + "epoch": 0.8687956906502501, + "flos": 567750131712.0, + "grad_norm": 0.06750496140695705, + "language_loss": 0.83204174, + "learning_rate": 4.446902963685862e-05, + "loss": 0.84268439, + "num_input_tokens_seen": 374327936, + "router_z_loss_mlp": 0.29321289, + "step": 4516, + "time_per_iteration": 2.724592447280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066807, + "balance_loss_mlp": 1.03798199, + "epoch": 0.8689880723355137, + "flos": 544071734784.0, + "grad_norm": 0.055889262061819295, + "language_loss": 0.8429358, + "learning_rate": 4.4340678679396454e-05, + "loss": 0.85360384, + "num_input_tokens_seen": 374400496, + "router_z_loss_mlp": 0.28833008, + "step": 4517, + "time_per_iteration": 2.689141035079956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062359, + "balance_loss_mlp": 1.0338918, + "epoch": 0.8691804540207773, + "flos": 457185987072.0, + "grad_norm": 0.05116361101584782, + "language_loss": 0.86430299, + "learning_rate": 4.4212504624571495e-05, + "loss": 0.87492657, + "num_input_tokens_seen": 374470528, + "router_z_loss_mlp": 0.28466797, + "step": 4518, + "time_per_iteration": 2.6708133220672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067852, + "balance_loss_mlp": 1.03909898, + "epoch": 0.8693728357060407, + "flos": 591591472128.0, + "grad_norm": 0.055626041012955325, + "language_loss": 0.79863721, + "learning_rate": 4.40845075221456e-05, + "loss": 0.80931574, + "num_input_tokens_seen": 374542656, + "router_z_loss_mlp": 0.28735352, + "step": 4519, + "time_per_iteration": 2.6947073936462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063641, + "balance_loss_mlp": 1.03498292, + "epoch": 0.8695652173913043, + "flos": 679949655552.0, + "grad_norm": 0.0650046136300286, + "language_loss": 0.79432595, + "learning_rate": 4.395668742181164e-05, + "loss": 0.8049624, + "num_input_tokens_seen": 374617232, + "router_z_loss_mlp": 0.28662109, + "step": 4520, + "time_per_iteration": 2.923346519470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065559, + "balance_loss_mlp": 1.03740191, + "epoch": 0.8697575990765679, + "flos": 492120133632.0, + "grad_norm": 0.06406228380921414, + "language_loss": 0.78534073, + "learning_rate": 4.38290443731934e-05, + "loss": 0.79599631, + "num_input_tokens_seen": 374681888, + "router_z_loss_mlp": 0.28149414, + "step": 4521, + "time_per_iteration": 2.5783751010894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066142, + "balance_loss_mlp": 1.03819942, + "epoch": 0.8699499807618315, + "flos": 526690515456.0, + "grad_norm": 0.06561086282942073, + "language_loss": 0.8186453, + "learning_rate": 4.370157842584671e-05, + "loss": 0.82930666, + "num_input_tokens_seen": 374750464, + "router_z_loss_mlp": 0.27954102, + "step": 4522, + "time_per_iteration": 2.690821647644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066904, + "balance_loss_mlp": 1.03915191, + "epoch": 0.8701423624470951, + "flos": 813981201408.0, + "grad_norm": 0.0550322760692221, + "language_loss": 0.79950142, + "learning_rate": 4.357428962925808e-05, + "loss": 0.81017047, + "num_input_tokens_seen": 374836064, + "router_z_loss_mlp": 0.27783203, + "step": 4523, + "time_per_iteration": 3.158989191055298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065303, + "balance_loss_mlp": 1.03633487, + "epoch": 0.8703347441323586, + "flos": 556519113216.0, + "grad_norm": 0.052059598956666925, + "language_loss": 0.88351029, + "learning_rate": 4.344717803284542e-05, + "loss": 0.89416325, + "num_input_tokens_seen": 374903392, + "router_z_loss_mlp": 0.28979492, + "step": 4524, + "time_per_iteration": 2.662280559539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068125, + "balance_loss_mlp": 1.03982425, + "epoch": 0.8705271258176221, + "flos": 585151699968.0, + "grad_norm": 0.06832788170157987, + "language_loss": 0.84436864, + "learning_rate": 4.3320243685957825e-05, + "loss": 0.85504991, + "num_input_tokens_seen": 374985904, + "router_z_loss_mlp": 0.28295898, + "step": 4525, + "time_per_iteration": 2.825425863265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060384, + "balance_loss_mlp": 1.03210771, + "epoch": 0.8707195075028857, + "flos": 668896137216.0, + "grad_norm": 0.05033137477703021, + "language_loss": 0.85244215, + "learning_rate": 4.3193486637875536e-05, + "loss": 0.86304605, + "num_input_tokens_seen": 375062992, + "router_z_loss_mlp": 0.28271484, + "step": 4526, + "time_per_iteration": 2.86771297454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068825, + "balance_loss_mlp": 1.03997612, + "epoch": 0.8709118891881493, + "flos": 520122705408.0, + "grad_norm": 0.055440044956439165, + "language_loss": 0.83684516, + "learning_rate": 4.306690693781007e-05, + "loss": 0.8475334, + "num_input_tokens_seen": 375139296, + "router_z_loss_mlp": 0.28833008, + "step": 4527, + "time_per_iteration": 2.7739171981811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064105, + "balance_loss_mlp": 1.03594756, + "epoch": 0.8711042708734128, + "flos": 552944226816.0, + "grad_norm": 0.06369806188789202, + "language_loss": 0.8152144, + "learning_rate": 4.294050463490401e-05, + "loss": 0.82585543, + "num_input_tokens_seen": 375206576, + "router_z_loss_mlp": 0.28149414, + "step": 4528, + "time_per_iteration": 2.632436513900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069291, + "balance_loss_mlp": 1.04079986, + "epoch": 0.8712966525586764, + "flos": 501933970944.0, + "grad_norm": 0.06599031197153508, + "language_loss": 0.82279682, + "learning_rate": 4.281427977823094e-05, + "loss": 0.83348972, + "num_input_tokens_seen": 375279008, + "router_z_loss_mlp": 0.28491211, + "step": 4529, + "time_per_iteration": 2.7143640518188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066133, + "balance_loss_mlp": 1.03783274, + "epoch": 0.87148903424394, + "flos": 803739580416.0, + "grad_norm": 0.05606476399314808, + "language_loss": 0.73884034, + "learning_rate": 4.268823241679593e-05, + "loss": 0.74950159, + "num_input_tokens_seen": 375368512, + "router_z_loss_mlp": 0.28320312, + "step": 4530, + "time_per_iteration": 3.0463168621063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068204, + "balance_loss_mlp": 1.03940248, + "epoch": 0.8716814159292036, + "flos": 773088910848.0, + "grad_norm": 0.04934837250946328, + "language_loss": 0.85875851, + "learning_rate": 4.256236259953489e-05, + "loss": 0.86944056, + "num_input_tokens_seen": 375450528, + "router_z_loss_mlp": 0.2878418, + "step": 4531, + "time_per_iteration": 3.0251410007476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068315, + "balance_loss_mlp": 1.03944206, + "epoch": 0.8718737976144671, + "flos": 486595565568.0, + "grad_norm": 0.0657223096896028, + "language_loss": 0.84869027, + "learning_rate": 4.243667037531468e-05, + "loss": 0.85937339, + "num_input_tokens_seen": 375518256, + "router_z_loss_mlp": 0.28857422, + "step": 4532, + "time_per_iteration": 2.6163856983184814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061697, + "balance_loss_mlp": 1.03339648, + "epoch": 0.8720661792997306, + "flos": 583850972160.0, + "grad_norm": 0.05979867957502993, + "language_loss": 0.78658783, + "learning_rate": 4.2311155792933264e-05, + "loss": 0.79720485, + "num_input_tokens_seen": 375588112, + "router_z_loss_mlp": 0.28344727, + "step": 4533, + "time_per_iteration": 2.747187614440918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100617, + "balance_loss_mlp": 0.99477404, + "epoch": 0.8722585609849942, + "flos": 1495180560384.0, + "grad_norm": 0.008421507852118055, + "language_loss": 0.80966806, + "learning_rate": 4.2185818901119946e-05, + "loss": 0.81972969, + "num_input_tokens_seen": 375814496, + "router_z_loss_mlp": 0.11376953, + "step": 4534, + "time_per_iteration": 4.8016557693481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065965, + "balance_loss_mlp": 1.03764045, + "epoch": 0.8724509426702578, + "flos": 595885123584.0, + "grad_norm": 0.0532252497000328, + "language_loss": 0.86752987, + "learning_rate": 4.206065974853479e-05, + "loss": 0.87818944, + "num_input_tokens_seen": 375885440, + "router_z_loss_mlp": 0.28320312, + "step": 4535, + "time_per_iteration": 2.77604603767395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066314, + "balance_loss_mlp": 1.03722727, + "epoch": 0.8726433243555214, + "flos": 443408767488.0, + "grad_norm": 0.3308825948130969, + "language_loss": 0.80913717, + "learning_rate": 4.193567838376888e-05, + "loss": 0.81980032, + "num_input_tokens_seen": 375952640, + "router_z_loss_mlp": 0.29052734, + "step": 4536, + "time_per_iteration": 2.5680594444274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061952, + "balance_loss_mlp": 1.03381848, + "epoch": 0.8728357060407849, + "flos": 552919495680.0, + "grad_norm": 0.08036350588218866, + "language_loss": 0.82172877, + "learning_rate": 4.181087485534402e-05, + "loss": 0.83234823, + "num_input_tokens_seen": 376021648, + "router_z_loss_mlp": 0.28149414, + "step": 4537, + "time_per_iteration": 2.6538937091827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063406, + "balance_loss_mlp": 1.03527319, + "epoch": 0.8730280877260485, + "flos": 627506251776.0, + "grad_norm": 0.12203372842904991, + "language_loss": 0.78675759, + "learning_rate": 4.16862492117136e-05, + "loss": 0.79739171, + "num_input_tokens_seen": 376102304, + "router_z_loss_mlp": 0.28149414, + "step": 4538, + "time_per_iteration": 2.832740306854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065695, + "balance_loss_mlp": 1.03718042, + "epoch": 0.873220469411312, + "flos": 535106110464.0, + "grad_norm": 0.0606145940241532, + "language_loss": 0.80030394, + "learning_rate": 4.156180150126143e-05, + "loss": 0.81096089, + "num_input_tokens_seen": 376177072, + "router_z_loss_mlp": 0.28540039, + "step": 4539, + "time_per_iteration": 2.7213377952575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065902, + "balance_loss_mlp": 1.03745842, + "epoch": 0.8734128510965756, + "flos": 561605723136.0, + "grad_norm": 0.07538210093780918, + "language_loss": 0.84122992, + "learning_rate": 4.143753177230242e-05, + "loss": 0.8518889, + "num_input_tokens_seen": 376251376, + "router_z_loss_mlp": 0.28442383, + "step": 4540, + "time_per_iteration": 2.6960151195526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063066, + "balance_loss_mlp": 1.03531361, + "epoch": 0.8736052327818392, + "flos": 686134761984.0, + "grad_norm": 0.05595611742808352, + "language_loss": 0.79501259, + "learning_rate": 4.131344007308224e-05, + "loss": 0.8056432, + "num_input_tokens_seen": 376337104, + "router_z_loss_mlp": 0.27783203, + "step": 4541, + "time_per_iteration": 3.0171802043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106403, + "balance_loss_mlp": 1.03544354, + "epoch": 0.8737976144671027, + "flos": 531384247296.0, + "grad_norm": 0.0683699884933183, + "language_loss": 0.81357133, + "learning_rate": 4.1189526451777816e-05, + "loss": 0.8242116, + "num_input_tokens_seen": 376415456, + "router_z_loss_mlp": 0.28564453, + "step": 4542, + "time_per_iteration": 2.805901527404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062407, + "balance_loss_mlp": 1.03434491, + "epoch": 0.8739899961523663, + "flos": 575308749312.0, + "grad_norm": 0.06249925001654303, + "language_loss": 0.81543392, + "learning_rate": 4.106579095649649e-05, + "loss": 0.82605791, + "num_input_tokens_seen": 376494880, + "router_z_loss_mlp": 0.28051758, + "step": 4543, + "time_per_iteration": 2.851834774017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065841, + "balance_loss_mlp": 1.03773165, + "epoch": 0.8741823778376299, + "flos": 731009373696.0, + "grad_norm": 0.06977869245266767, + "language_loss": 0.76428318, + "learning_rate": 4.094223363527666e-05, + "loss": 0.77494162, + "num_input_tokens_seen": 376571760, + "router_z_loss_mlp": 0.28125, + "step": 4544, + "time_per_iteration": 2.925771713256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066026, + "balance_loss_mlp": 1.03803515, + "epoch": 0.8743747595228935, + "flos": 566795639808.0, + "grad_norm": 0.07306890014877584, + "language_loss": 0.83605403, + "learning_rate": 4.081885453608747e-05, + "loss": 0.84671426, + "num_input_tokens_seen": 376644464, + "router_z_loss_mlp": 0.2800293, + "step": 4545, + "time_per_iteration": 2.7709672451019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065089, + "balance_loss_mlp": 1.03702736, + "epoch": 0.8745671412081569, + "flos": 493115323392.0, + "grad_norm": 0.06204561243136525, + "language_loss": 0.82155466, + "learning_rate": 4.0695653706829095e-05, + "loss": 0.83220559, + "num_input_tokens_seen": 376709584, + "router_z_loss_mlp": 0.28076172, + "step": 4546, + "time_per_iteration": 2.565824270248413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063172, + "balance_loss_mlp": 1.03525329, + "epoch": 0.8747595228934205, + "flos": 523883856384.0, + "grad_norm": 0.05603700784394243, + "language_loss": 0.83347672, + "learning_rate": 4.057263119533233e-05, + "loss": 0.84410846, + "num_input_tokens_seen": 376779472, + "router_z_loss_mlp": 0.27929688, + "step": 4547, + "time_per_iteration": 2.639770746231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067003, + "balance_loss_mlp": 1.03803515, + "epoch": 0.8749519045786841, + "flos": 743999427072.0, + "grad_norm": 0.061070440855238696, + "language_loss": 0.79543126, + "learning_rate": 4.044978704935853e-05, + "loss": 0.80610132, + "num_input_tokens_seen": 376863408, + "router_z_loss_mlp": 0.28955078, + "step": 4548, + "time_per_iteration": 3.0035946369171143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067764, + "balance_loss_mlp": 1.04046547, + "epoch": 0.8751442862639477, + "flos": 594003843072.0, + "grad_norm": 0.05314972905968755, + "language_loss": 0.79939222, + "learning_rate": 4.032712131660027e-05, + "loss": 0.8100698, + "num_input_tokens_seen": 376942080, + "router_z_loss_mlp": 0.2734375, + "step": 4549, + "time_per_iteration": 2.8230674266815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072257, + "balance_loss_mlp": 1.04369426, + "epoch": 0.8753366679492113, + "flos": 496285747200.0, + "grad_norm": 0.05669282479345713, + "language_loss": 0.78479946, + "learning_rate": 4.020463404468055e-05, + "loss": 0.79552203, + "num_input_tokens_seen": 377015696, + "router_z_loss_mlp": 0.28564453, + "step": 4550, + "time_per_iteration": 2.7423791885375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066403, + "balance_loss_mlp": 1.03803086, + "epoch": 0.8755290496344748, + "flos": 489619012608.0, + "grad_norm": 0.06250704180116129, + "language_loss": 0.81786513, + "learning_rate": 4.0082325281153074e-05, + "loss": 0.82852924, + "num_input_tokens_seen": 377081424, + "router_z_loss_mlp": 0.28344727, + "step": 4551, + "time_per_iteration": 2.567431688308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068494, + "balance_loss_mlp": 1.04014564, + "epoch": 0.8757214313197383, + "flos": 591557976576.0, + "grad_norm": 0.06565865323727363, + "language_loss": 0.81568277, + "learning_rate": 3.9960195073502345e-05, + "loss": 0.82636774, + "num_input_tokens_seen": 377159360, + "router_z_loss_mlp": 0.28344727, + "step": 4552, + "time_per_iteration": 2.8340234756469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065547, + "balance_loss_mlp": 1.03698468, + "epoch": 0.8759138130050019, + "flos": 976456249344.0, + "grad_norm": 0.06629709551141487, + "language_loss": 0.78052419, + "learning_rate": 3.9838243469143555e-05, + "loss": 0.79117966, + "num_input_tokens_seen": 377240704, + "router_z_loss_mlp": 0.28540039, + "step": 4553, + "time_per_iteration": 3.2071568965911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065752, + "balance_loss_mlp": 1.0381906, + "epoch": 0.8761061946902655, + "flos": 802405357056.0, + "grad_norm": 0.05351335594773902, + "language_loss": 0.77677572, + "learning_rate": 3.971647051542243e-05, + "loss": 0.78743327, + "num_input_tokens_seen": 377324176, + "router_z_loss_mlp": 0.27612305, + "step": 4554, + "time_per_iteration": 3.0603485107421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067851, + "balance_loss_mlp": 1.04000342, + "epoch": 0.8762985763755291, + "flos": 698158738944.0, + "grad_norm": 0.05819539441060988, + "language_loss": 0.74314624, + "learning_rate": 3.95948762596155e-05, + "loss": 0.75382471, + "num_input_tokens_seen": 377403440, + "router_z_loss_mlp": 0.27856445, + "step": 4555, + "time_per_iteration": 2.964545249938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010664, + "balance_loss_mlp": 1.03843403, + "epoch": 0.8764909580607926, + "flos": 629416645632.0, + "grad_norm": 0.057392192725221856, + "language_loss": 0.80310047, + "learning_rate": 3.9473460748929765e-05, + "loss": 0.81376451, + "num_input_tokens_seen": 377483440, + "router_z_loss_mlp": 0.27978516, + "step": 4556, + "time_per_iteration": 2.91851806640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106548, + "balance_loss_mlp": 1.03787112, + "epoch": 0.8766833397460562, + "flos": 481297959936.0, + "grad_norm": 0.05571794139590596, + "language_loss": 0.80284274, + "learning_rate": 3.935222403050304e-05, + "loss": 0.81349754, + "num_input_tokens_seen": 377554688, + "router_z_loss_mlp": 0.27636719, + "step": 4557, + "time_per_iteration": 2.686192274093628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067446, + "balance_loss_mlp": 1.03878832, + "epoch": 0.8768757214313198, + "flos": 407514336768.0, + "grad_norm": 0.06835264680371789, + "language_loss": 0.78205043, + "learning_rate": 3.923116615140354e-05, + "loss": 0.79272485, + "num_input_tokens_seen": 377617616, + "router_z_loss_mlp": 0.28662109, + "step": 4558, + "time_per_iteration": 2.441237211227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106727, + "balance_loss_mlp": 1.03861248, + "epoch": 0.8770681031165833, + "flos": 582314517504.0, + "grad_norm": 0.057418492817462405, + "language_loss": 0.8179571, + "learning_rate": 3.9110287158630076e-05, + "loss": 0.82862979, + "num_input_tokens_seen": 377685888, + "router_z_loss_mlp": 0.28637695, + "step": 4559, + "time_per_iteration": 2.6915462017059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069669, + "balance_loss_mlp": 1.04122567, + "epoch": 0.8772604848018468, + "flos": 508437762048.0, + "grad_norm": 0.05352883186200444, + "language_loss": 0.80551112, + "learning_rate": 3.8989587099111875e-05, + "loss": 0.81620783, + "num_input_tokens_seen": 377755744, + "router_z_loss_mlp": 0.28442383, + "step": 4560, + "time_per_iteration": 2.67244029045105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068433, + "balance_loss_mlp": 1.04027581, + "epoch": 0.8774528664871104, + "flos": 408617215488.0, + "grad_norm": 0.06358979412347537, + "language_loss": 0.84776622, + "learning_rate": 3.886906601970913e-05, + "loss": 0.85845059, + "num_input_tokens_seen": 377818880, + "router_z_loss_mlp": 0.28173828, + "step": 4561, + "time_per_iteration": 2.491192102432251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069485, + "balance_loss_mlp": 1.04161429, + "epoch": 0.877645248172374, + "flos": 500589573120.0, + "grad_norm": 0.06737162506432262, + "language_loss": 0.83147556, + "learning_rate": 3.8748723967212184e-05, + "loss": 0.84217036, + "num_input_tokens_seen": 377893280, + "router_z_loss_mlp": 0.27880859, + "step": 4562, + "time_per_iteration": 2.684629440307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066768, + "balance_loss_mlp": 1.03861117, + "epoch": 0.8778376298576376, + "flos": 632857701888.0, + "grad_norm": 0.059369689749274944, + "language_loss": 0.78097963, + "learning_rate": 3.862856098834189e-05, + "loss": 0.79164732, + "num_input_tokens_seen": 377972912, + "router_z_loss_mlp": 0.28173828, + "step": 4563, + "time_per_iteration": 2.8923280239105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072367, + "balance_loss_mlp": 1.04442441, + "epoch": 0.8780300115429012, + "flos": 533707868160.0, + "grad_norm": 0.05558389562769292, + "language_loss": 0.80053449, + "learning_rate": 3.850857712974976e-05, + "loss": 0.81125814, + "num_input_tokens_seen": 378054000, + "router_z_loss_mlp": 0.27954102, + "step": 4564, + "time_per_iteration": 2.823686361312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069649, + "balance_loss_mlp": 1.04175413, + "epoch": 0.8782223932281646, + "flos": 511411746816.0, + "grad_norm": 0.05672637464801372, + "language_loss": 0.7727713, + "learning_rate": 3.838877243801758e-05, + "loss": 0.78346777, + "num_input_tokens_seen": 378120336, + "router_z_loss_mlp": 0.27929688, + "step": 4565, + "time_per_iteration": 2.5881996154785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010687, + "balance_loss_mlp": 1.04049492, + "epoch": 0.8784147749134282, + "flos": 780333225984.0, + "grad_norm": 0.05732086037838532, + "language_loss": 0.69910938, + "learning_rate": 3.826914695965766e-05, + "loss": 0.70979643, + "num_input_tokens_seen": 378216672, + "router_z_loss_mlp": 0.28222656, + "step": 4566, + "time_per_iteration": 3.1945879459381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066133, + "balance_loss_mlp": 1.03730834, + "epoch": 0.8786071565986918, + "flos": 560738571264.0, + "grad_norm": 0.06580168201373691, + "language_loss": 0.75722879, + "learning_rate": 3.814970074111279e-05, + "loss": 0.76789016, + "num_input_tokens_seen": 378287536, + "router_z_loss_mlp": 0.28808594, + "step": 4567, + "time_per_iteration": 2.7322754859924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070458, + "balance_loss_mlp": 1.04337335, + "epoch": 0.8787995382839554, + "flos": 603148377600.0, + "grad_norm": 0.05172796640220285, + "language_loss": 0.77077734, + "learning_rate": 3.8030433828755926e-05, + "loss": 0.78148192, + "num_input_tokens_seen": 378362128, + "router_z_loss_mlp": 0.27148438, + "step": 4568, + "time_per_iteration": 2.776970386505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068428, + "balance_loss_mlp": 1.04062867, + "epoch": 0.8789919199692189, + "flos": 559970343936.0, + "grad_norm": 0.059324275843292064, + "language_loss": 0.84837639, + "learning_rate": 3.7911346268890924e-05, + "loss": 0.85906065, + "num_input_tokens_seen": 378435696, + "router_z_loss_mlp": 0.27832031, + "step": 4569, + "time_per_iteration": 2.6863996982574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065069, + "balance_loss_mlp": 1.03810334, + "epoch": 0.8791843016544825, + "flos": 538857086976.0, + "grad_norm": 0.07165107779737093, + "language_loss": 0.81886643, + "learning_rate": 3.7792438107751405e-05, + "loss": 0.82951707, + "num_input_tokens_seen": 378505664, + "router_z_loss_mlp": 0.27026367, + "step": 4570, + "time_per_iteration": 2.611616611480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071174, + "balance_loss_mlp": 1.04273033, + "epoch": 0.8793766833397461, + "flos": 1008275226624.0, + "grad_norm": 0.0558494404755544, + "language_loss": 0.79366511, + "learning_rate": 3.767370939150167e-05, + "loss": 0.80437684, + "num_input_tokens_seen": 378598016, + "router_z_loss_mlp": 0.28442383, + "step": 4571, + "time_per_iteration": 3.3873000144958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073047, + "balance_loss_mlp": 1.04481804, + "epoch": 0.8795690650250096, + "flos": 678320068608.0, + "grad_norm": 0.056063442839823466, + "language_loss": 0.80823278, + "learning_rate": 3.755516016623628e-05, + "loss": 0.81896329, + "num_input_tokens_seen": 378676176, + "router_z_loss_mlp": 0.28222656, + "step": 4572, + "time_per_iteration": 2.893048048019409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066399, + "balance_loss_mlp": 1.03793228, + "epoch": 0.8797614467102732, + "flos": 453202255872.0, + "grad_norm": 0.06304464607757537, + "language_loss": 0.88333166, + "learning_rate": 3.7436790477980157e-05, + "loss": 0.89399564, + "num_input_tokens_seen": 378737952, + "router_z_loss_mlp": 0.28442383, + "step": 4573, + "time_per_iteration": 2.5377988815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071187, + "balance_loss_mlp": 1.04367304, + "epoch": 0.8799538283955367, + "flos": 550649719296.0, + "grad_norm": 0.05552176218492619, + "language_loss": 0.84267652, + "learning_rate": 3.7318600372688526e-05, + "loss": 0.85338843, + "num_input_tokens_seen": 378806704, + "router_z_loss_mlp": 0.27563477, + "step": 4574, + "time_per_iteration": 2.6662659645080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068687, + "balance_loss_mlp": 1.04053009, + "epoch": 0.8801462100808003, + "flos": 807072947712.0, + "grad_norm": 0.05728401921436289, + "language_loss": 0.83839577, + "learning_rate": 3.720058989624681e-05, + "loss": 0.84908265, + "num_input_tokens_seen": 378887616, + "router_z_loss_mlp": 0.28173828, + "step": 4575, + "time_per_iteration": 3.076876640319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070952, + "balance_loss_mlp": 1.04296148, + "epoch": 0.8803385917660639, + "flos": 768366065664.0, + "grad_norm": 0.0517828102810866, + "language_loss": 0.84589469, + "learning_rate": 3.708275909447079e-05, + "loss": 0.85660422, + "num_input_tokens_seen": 378964656, + "router_z_loss_mlp": 0.28027344, + "step": 4576, + "time_per_iteration": 2.9635534286499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068372, + "balance_loss_mlp": 1.04016733, + "epoch": 0.8805309734513275, + "flos": 567070654464.0, + "grad_norm": 0.053989075143044706, + "language_loss": 0.81054318, + "learning_rate": 3.696510801310632e-05, + "loss": 0.82122689, + "num_input_tokens_seen": 379036752, + "router_z_loss_mlp": 0.28186035, + "step": 4577, + "time_per_iteration": 2.752592086791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069025, + "balance_loss_mlp": 1.04008079, + "epoch": 0.880723355136591, + "flos": 679481174016.0, + "grad_norm": 0.06232126145742502, + "language_loss": 0.81594551, + "learning_rate": 3.6847636697829755e-05, + "loss": 0.82663572, + "num_input_tokens_seen": 379106480, + "router_z_loss_mlp": 0.28979492, + "step": 4578, + "time_per_iteration": 2.814424991607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107019, + "balance_loss_mlp": 1.04248548, + "epoch": 0.8809157368218545, + "flos": 565347935232.0, + "grad_norm": 0.0557636314762692, + "language_loss": 0.78824782, + "learning_rate": 3.673034519424734e-05, + "loss": 0.79894972, + "num_input_tokens_seen": 379182544, + "router_z_loss_mlp": 0.27734375, + "step": 4579, + "time_per_iteration": 2.785956382751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071039, + "balance_loss_mlp": 1.04309607, + "epoch": 0.8811081185071181, + "flos": 515153958912.0, + "grad_norm": 0.05030651493634772, + "language_loss": 0.75700289, + "learning_rate": 3.661323354789586e-05, + "loss": 0.76771331, + "num_input_tokens_seen": 379255856, + "router_z_loss_mlp": 0.27954102, + "step": 4580, + "time_per_iteration": 2.6824047565460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068401, + "balance_loss_mlp": 1.04019618, + "epoch": 0.8813005001923817, + "flos": 594067862016.0, + "grad_norm": 0.07015298891450013, + "language_loss": 0.8114329, + "learning_rate": 3.649630180424191e-05, + "loss": 0.82211691, + "num_input_tokens_seen": 379322704, + "router_z_loss_mlp": 0.28198242, + "step": 4581, + "time_per_iteration": 2.7086069583892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062221, + "balance_loss_mlp": 1.03425419, + "epoch": 0.8814928818776453, + "flos": 666630743040.0, + "grad_norm": 0.05665802928284555, + "language_loss": 0.79123235, + "learning_rate": 3.637955000868254e-05, + "loss": 0.80185449, + "num_input_tokens_seen": 379395008, + "router_z_loss_mlp": 0.27978516, + "step": 4582, + "time_per_iteration": 2.8371450901031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071423, + "balance_loss_mlp": 1.04393387, + "epoch": 0.8816852635629088, + "flos": 608873766912.0, + "grad_norm": 0.054118790146548024, + "language_loss": 0.8546508, + "learning_rate": 3.626297820654467e-05, + "loss": 0.86536503, + "num_input_tokens_seen": 379465824, + "router_z_loss_mlp": 0.27514648, + "step": 4583, + "time_per_iteration": 2.717241048812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067589, + "balance_loss_mlp": 1.03990829, + "epoch": 0.8818776452481724, + "flos": 480131062272.0, + "grad_norm": 0.05987245648604073, + "language_loss": 0.81967342, + "learning_rate": 3.614658644308572e-05, + "loss": 0.83034927, + "num_input_tokens_seen": 379534960, + "router_z_loss_mlp": 0.27709961, + "step": 4584, + "time_per_iteration": 2.6413609981536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071593, + "balance_loss_mlp": 1.04243433, + "epoch": 0.882070026933436, + "flos": 1044985936896.0, + "grad_norm": 0.05789017209637249, + "language_loss": 0.73687112, + "learning_rate": 3.60303747634928e-05, + "loss": 0.74758708, + "num_input_tokens_seen": 379617456, + "router_z_loss_mlp": 0.29125977, + "step": 4585, + "time_per_iteration": 3.304685592651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066651, + "balance_loss_mlp": 1.03928089, + "epoch": 0.8822624086186995, + "flos": 474153979392.0, + "grad_norm": 0.054265855941406795, + "language_loss": 0.79589009, + "learning_rate": 3.591434321288345e-05, + "loss": 0.80655658, + "num_input_tokens_seen": 379687792, + "router_z_loss_mlp": 0.27441406, + "step": 4586, + "time_per_iteration": 2.697514533996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067914, + "balance_loss_mlp": 1.04042411, + "epoch": 0.882454790303963, + "flos": 653725057536.0, + "grad_norm": 0.06096374939952472, + "language_loss": 0.81569088, + "learning_rate": 3.579849183630485e-05, + "loss": 0.82637, + "num_input_tokens_seen": 379761120, + "router_z_loss_mlp": 0.27514648, + "step": 4587, + "time_per_iteration": 2.8024706840515137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063916, + "balance_loss_mlp": 1.03544879, + "epoch": 0.8826471719892266, + "flos": 470081498112.0, + "grad_norm": 0.05869577114957185, + "language_loss": 0.78408635, + "learning_rate": 3.568282067873468e-05, + "loss": 0.79472554, + "num_input_tokens_seen": 379829008, + "router_z_loss_mlp": 0.28442383, + "step": 4588, + "time_per_iteration": 2.578707695007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068701, + "balance_loss_mlp": 1.04040098, + "epoch": 0.8828395536744902, + "flos": 468501373440.0, + "grad_norm": 0.05231035203284282, + "language_loss": 0.83738208, + "learning_rate": 3.556732978508048e-05, + "loss": 0.84806907, + "num_input_tokens_seen": 379899584, + "router_z_loss_mlp": 0.28295898, + "step": 4589, + "time_per_iteration": 2.68972110748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065866, + "balance_loss_mlp": 1.03809047, + "epoch": 0.8830319353597538, + "flos": 721044177408.0, + "grad_norm": 0.08332250868993829, + "language_loss": 0.81341159, + "learning_rate": 3.545201920017971e-05, + "loss": 0.82407022, + "num_input_tokens_seen": 379979440, + "router_z_loss_mlp": 0.27783203, + "step": 4590, + "time_per_iteration": 2.9407219886779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107042, + "balance_loss_mlp": 1.04295468, + "epoch": 0.8832243170450174, + "flos": 443049384960.0, + "grad_norm": 0.0678203863525127, + "language_loss": 0.81142139, + "learning_rate": 3.5336888968799996e-05, + "loss": 0.82212561, + "num_input_tokens_seen": 380046944, + "router_z_loss_mlp": 0.2746582, + "step": 4591, + "time_per_iteration": 2.568373680114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067355, + "balance_loss_mlp": 1.03934026, + "epoch": 0.8834166987302808, + "flos": 566293662720.0, + "grad_norm": 0.06220789514953692, + "language_loss": 0.81893933, + "learning_rate": 3.5221939135638756e-05, + "loss": 0.82961291, + "num_input_tokens_seen": 380118048, + "router_z_loss_mlp": 0.28027344, + "step": 4592, + "time_per_iteration": 2.756255626678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067482, + "balance_loss_mlp": 1.03975368, + "epoch": 0.8836090804155444, + "flos": 609022153728.0, + "grad_norm": 0.07096792150900852, + "language_loss": 0.81740928, + "learning_rate": 3.510716974532352e-05, + "loss": 0.82808411, + "num_input_tokens_seen": 380192416, + "router_z_loss_mlp": 0.27734375, + "step": 4593, + "time_per_iteration": 2.7616682052612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068676, + "balance_loss_mlp": 1.04020929, + "epoch": 0.883801462100808, + "flos": 556804302336.0, + "grad_norm": 0.06039187959757844, + "language_loss": 0.80636853, + "learning_rate": 3.4992580842411745e-05, + "loss": 0.81705528, + "num_input_tokens_seen": 380264432, + "router_z_loss_mlp": 0.28491211, + "step": 4594, + "time_per_iteration": 2.658634662628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068628, + "balance_loss_mlp": 1.03965974, + "epoch": 0.8839938437860716, + "flos": 515936742912.0, + "grad_norm": 0.07933366210250277, + "language_loss": 0.77274346, + "learning_rate": 3.487817247139064e-05, + "loss": 0.78342974, + "num_input_tokens_seen": 380334192, + "router_z_loss_mlp": 0.28955078, + "step": 4595, + "time_per_iteration": 2.599109649658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064003, + "balance_loss_mlp": 1.03620315, + "epoch": 0.8841862254713351, + "flos": 713386635264.0, + "grad_norm": 0.06401274650303065, + "language_loss": 0.7867049, + "learning_rate": 3.47639446766777e-05, + "loss": 0.79734492, + "num_input_tokens_seen": 380407504, + "router_z_loss_mlp": 0.27807617, + "step": 4596, + "time_per_iteration": 2.8454713821411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067855, + "balance_loss_mlp": 1.04062724, + "epoch": 0.8843786071565987, + "flos": 833626404864.0, + "grad_norm": 0.07003048981837431, + "language_loss": 0.82647777, + "learning_rate": 3.4649897502620095e-05, + "loss": 0.8371563, + "num_input_tokens_seen": 380486272, + "router_z_loss_mlp": 0.27270508, + "step": 4597, + "time_per_iteration": 3.039944887161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069696, + "balance_loss_mlp": 1.04196787, + "epoch": 0.8845709888418622, + "flos": 656562240000.0, + "grad_norm": 0.04759555258989633, + "language_loss": 0.82870215, + "learning_rate": 3.453603099349462e-05, + "loss": 0.8393991, + "num_input_tokens_seen": 380568480, + "router_z_loss_mlp": 0.27734375, + "step": 4598, + "time_per_iteration": 2.924360513687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067102, + "balance_loss_mlp": 1.03937411, + "epoch": 0.8847633705271258, + "flos": 523038463488.0, + "grad_norm": 0.0554469987198936, + "language_loss": 0.81217462, + "learning_rate": 3.442234519350823e-05, + "loss": 0.82284564, + "num_input_tokens_seen": 380643088, + "router_z_loss_mlp": 0.27734375, + "step": 4599, + "time_per_iteration": 2.7385177612304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069138, + "balance_loss_mlp": 1.04188693, + "epoch": 0.8849557522123894, + "flos": 548330480640.0, + "grad_norm": 0.0620233262866808, + "language_loss": 0.84279031, + "learning_rate": 3.430884014679786e-05, + "loss": 0.85348165, + "num_input_tokens_seen": 380714512, + "router_z_loss_mlp": 0.27246094, + "step": 4600, + "time_per_iteration": 2.678050994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069344, + "balance_loss_mlp": 1.04185414, + "epoch": 0.8851481338976529, + "flos": 622070433792.0, + "grad_norm": 0.051582270147677196, + "language_loss": 0.83688784, + "learning_rate": 3.4195515897429974e-05, + "loss": 0.84758127, + "num_input_tokens_seen": 380789168, + "router_z_loss_mlp": 0.27563477, + "step": 4601, + "time_per_iteration": 2.8480563163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067523, + "balance_loss_mlp": 1.03929448, + "epoch": 0.8853405155829165, + "flos": 444123150336.0, + "grad_norm": 0.056068366837892174, + "language_loss": 0.80678725, + "learning_rate": 3.408237248940088e-05, + "loss": 0.81746256, + "num_input_tokens_seen": 380856992, + "router_z_loss_mlp": 0.2824707, + "step": 4602, + "time_per_iteration": 2.5683131217956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065058, + "balance_loss_mlp": 1.03682971, + "epoch": 0.8855328972681801, + "flos": 730152396288.0, + "grad_norm": 0.05740540609560926, + "language_loss": 0.77796984, + "learning_rate": 3.396940996663683e-05, + "loss": 0.78862035, + "num_input_tokens_seen": 380930480, + "router_z_loss_mlp": 0.28222656, + "step": 4603, + "time_per_iteration": 2.897857666015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067098, + "balance_loss_mlp": 1.03936982, + "epoch": 0.8857252789534437, + "flos": 487132448256.0, + "grad_norm": 0.058129014822259635, + "language_loss": 0.79058081, + "learning_rate": 3.385662837299375e-05, + "loss": 0.80125177, + "num_input_tokens_seen": 380994192, + "router_z_loss_mlp": 0.27758789, + "step": 4604, + "time_per_iteration": 2.5698628425598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070359, + "balance_loss_mlp": 1.04284549, + "epoch": 0.8859176606387072, + "flos": 508290785280.0, + "grad_norm": 0.05786101716363267, + "language_loss": 0.81376195, + "learning_rate": 3.374402775225727e-05, + "loss": 0.82446557, + "num_input_tokens_seen": 381066848, + "router_z_loss_mlp": 0.27563477, + "step": 4605, + "time_per_iteration": 2.6911301612854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069989, + "balance_loss_mlp": 1.04142654, + "epoch": 0.8861100423239707, + "flos": 516370318848.0, + "grad_norm": 0.054307106950923195, + "language_loss": 0.85590959, + "learning_rate": 3.3631608148142925e-05, + "loss": 0.86660945, + "num_input_tokens_seen": 381138816, + "router_z_loss_mlp": 0.28588867, + "step": 4606, + "time_per_iteration": 2.6767466068267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107098, + "balance_loss_mlp": 1.04394376, + "epoch": 0.8863024240092343, + "flos": 626692944384.0, + "grad_norm": 0.0544909967817947, + "language_loss": 0.79524022, + "learning_rate": 3.3519369604295746e-05, + "loss": 0.80595005, + "num_input_tokens_seen": 381208448, + "router_z_loss_mlp": 0.27075195, + "step": 4607, + "time_per_iteration": 2.716878652572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069998, + "balance_loss_mlp": 1.0418644, + "epoch": 0.8864948056944979, + "flos": 766564770816.0, + "grad_norm": 0.1476541452919149, + "language_loss": 0.83357704, + "learning_rate": 3.340731216429083e-05, + "loss": 0.84427702, + "num_input_tokens_seen": 381289712, + "router_z_loss_mlp": 0.28173828, + "step": 4608, + "time_per_iteration": 2.9715864658355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021373, + "balance_loss_mlp": 1.00988162, + "epoch": 0.8866871873797615, + "flos": 1501500907008.0, + "grad_norm": 0.01167151488770453, + "language_loss": 0.78830957, + "learning_rate": 3.329543587163253e-05, + "loss": 0.79852331, + "num_input_tokens_seen": 381520848, + "router_z_loss_mlp": 0.11474609, + "step": 4609, + "time_per_iteration": 4.834856748580933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068118, + "balance_loss_mlp": 1.04015195, + "epoch": 0.886879569065025, + "flos": 811164367872.0, + "grad_norm": 0.08488688908533946, + "language_loss": 0.81698787, + "learning_rate": 3.3183740769755e-05, + "loss": 0.82766908, + "num_input_tokens_seen": 381603008, + "router_z_loss_mlp": 0.27978516, + "step": 4610, + "time_per_iteration": 3.034174680709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020022, + "balance_loss_mlp": 1.0085299, + "epoch": 0.8870719507502886, + "flos": 1581994934784.0, + "grad_norm": 0.010974826258400936, + "language_loss": 0.7691083, + "learning_rate": 3.307222690202238e-05, + "loss": 0.77930856, + "num_input_tokens_seen": 381844336, + "router_z_loss_mlp": 0.11474609, + "step": 4611, + "time_per_iteration": 4.9730494022369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069377, + "balance_loss_mlp": 1.04114866, + "epoch": 0.8872643324355521, + "flos": 633743792640.0, + "grad_norm": 0.0611353937593657, + "language_loss": 0.75024319, + "learning_rate": 3.296089431172811e-05, + "loss": 0.76093698, + "num_input_tokens_seen": 381918576, + "router_z_loss_mlp": 0.2824707, + "step": 4612, + "time_per_iteration": 2.746696710586548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069726, + "balance_loss_mlp": 1.04214144, + "epoch": 0.8874567141208157, + "flos": 535498988544.0, + "grad_norm": 0.06235030961125674, + "language_loss": 0.82855523, + "learning_rate": 3.284974304209532e-05, + "loss": 0.83925247, + "num_input_tokens_seen": 381987296, + "router_z_loss_mlp": 0.27636719, + "step": 4613, + "time_per_iteration": 2.637052536010742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067959, + "balance_loss_mlp": 1.03958726, + "epoch": 0.8876490958060793, + "flos": 1565700931584.0, + "grad_norm": 0.06402411256852786, + "language_loss": 0.7942912, + "learning_rate": 3.27387731362766e-05, + "loss": 0.8049708, + "num_input_tokens_seen": 382091744, + "router_z_loss_mlp": 0.28369141, + "step": 4614, + "time_per_iteration": 3.923633575439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067535, + "balance_loss_mlp": 1.03921044, + "epoch": 0.8878414774913428, + "flos": 636343838208.0, + "grad_norm": 0.05096135201935837, + "language_loss": 0.85021508, + "learning_rate": 3.2627984637354444e-05, + "loss": 0.86089039, + "num_input_tokens_seen": 382169600, + "router_z_loss_mlp": 0.28344727, + "step": 4615, + "time_per_iteration": 2.779921054840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063179, + "balance_loss_mlp": 1.03480697, + "epoch": 0.8880338591766064, + "flos": 496182440448.0, + "grad_norm": 0.06545341443379886, + "language_loss": 0.81585425, + "learning_rate": 3.251737758834084e-05, + "loss": 0.82648605, + "num_input_tokens_seen": 382238336, + "router_z_loss_mlp": 0.28393555, + "step": 4616, + "time_per_iteration": 2.616635322570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071314, + "balance_loss_mlp": 1.04289412, + "epoch": 0.88822624086187, + "flos": 542599299072.0, + "grad_norm": 0.057599266414533334, + "language_loss": 0.79628587, + "learning_rate": 3.2406952032177086e-05, + "loss": 0.80699897, + "num_input_tokens_seen": 382308560, + "router_z_loss_mlp": 0.28393555, + "step": 4617, + "time_per_iteration": 2.6929566860198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065504, + "balance_loss_mlp": 1.03744173, + "epoch": 0.8884186225471336, + "flos": 551560541184.0, + "grad_norm": 0.06545285558813568, + "language_loss": 0.84187359, + "learning_rate": 3.229670801173418e-05, + "loss": 0.85252863, + "num_input_tokens_seen": 382377504, + "router_z_loss_mlp": 0.28076172, + "step": 4618, + "time_per_iteration": 2.689328908920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018165, + "balance_loss_mlp": 1.00662541, + "epoch": 0.888611004232397, + "flos": 1564417276416.0, + "grad_norm": 0.008722298990841466, + "language_loss": 0.78512192, + "learning_rate": 3.218664556981288e-05, + "loss": 0.79530358, + "num_input_tokens_seen": 382615728, + "router_z_loss_mlp": 0.11523438, + "step": 4619, + "time_per_iteration": 5.016630172729492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071208, + "balance_loss_mlp": 1.04338467, + "epoch": 0.8888033859176606, + "flos": 766678252032.0, + "grad_norm": 0.0582454521799534, + "language_loss": 0.82567924, + "learning_rate": 3.207676474914301e-05, + "loss": 0.83639133, + "num_input_tokens_seen": 382695552, + "router_z_loss_mlp": 0.27856445, + "step": 4620, + "time_per_iteration": 3.0133860111236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068087, + "balance_loss_mlp": 1.04040623, + "epoch": 0.8889957676029242, + "flos": 933727758336.0, + "grad_norm": 0.05884213021471634, + "language_loss": 0.83990335, + "learning_rate": 3.1967065592384105e-05, + "loss": 0.85058427, + "num_input_tokens_seen": 382775824, + "router_z_loss_mlp": 0.27758789, + "step": 4621, + "time_per_iteration": 3.167980670928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068617, + "balance_loss_mlp": 1.04069793, + "epoch": 0.8891881492881878, + "flos": 589317313536.0, + "grad_norm": 0.09170475766074285, + "language_loss": 0.81411701, + "learning_rate": 3.1857548142125104e-05, + "loss": 0.82480323, + "num_input_tokens_seen": 382854464, + "router_z_loss_mlp": 0.27954102, + "step": 4622, + "time_per_iteration": 2.7863264083862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067138, + "balance_loss_mlp": 1.03888595, + "epoch": 0.8893805309734514, + "flos": 540438621696.0, + "grad_norm": 0.06461743401993036, + "language_loss": 0.82403553, + "learning_rate": 3.174821244088466e-05, + "loss": 0.8347069, + "num_input_tokens_seen": 382925088, + "router_z_loss_mlp": 0.2824707, + "step": 4623, + "time_per_iteration": 2.731494903564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106871, + "balance_loss_mlp": 1.04057622, + "epoch": 0.8895729126587149, + "flos": 559827749376.0, + "grad_norm": 0.06194328064505052, + "language_loss": 0.81727606, + "learning_rate": 3.163905853111054e-05, + "loss": 0.82796311, + "num_input_tokens_seen": 382998640, + "router_z_loss_mlp": 0.28173828, + "step": 4624, + "time_per_iteration": 2.7517242431640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070888, + "balance_loss_mlp": 1.04284978, + "epoch": 0.8897652943439784, + "flos": 609873338880.0, + "grad_norm": 0.050549853414559504, + "language_loss": 0.8105303, + "learning_rate": 3.153008645517996e-05, + "loss": 0.82123923, + "num_input_tokens_seen": 383076000, + "router_z_loss_mlp": 0.28015137, + "step": 4625, + "time_per_iteration": 2.775944948196411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068366, + "balance_loss_mlp": 1.0394454, + "epoch": 0.889957676029242, + "flos": 917455209984.0, + "grad_norm": 0.06186267612969521, + "language_loss": 0.7697686, + "learning_rate": 3.142129625539969e-05, + "loss": 0.78045225, + "num_input_tokens_seen": 383166640, + "router_z_loss_mlp": 0.2890625, + "step": 4626, + "time_per_iteration": 3.221770763397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067678, + "balance_loss_mlp": 1.03944921, + "epoch": 0.8901500577145056, + "flos": 488452114944.0, + "grad_norm": 0.0559809171048545, + "language_loss": 0.80048203, + "learning_rate": 3.131268797400588e-05, + "loss": 0.81115878, + "num_input_tokens_seen": 383232928, + "router_z_loss_mlp": 0.28222656, + "step": 4627, + "time_per_iteration": 2.563779354095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068508, + "balance_loss_mlp": 1.04042268, + "epoch": 0.8903424393997691, + "flos": 733332994560.0, + "grad_norm": 0.0592114438847255, + "language_loss": 0.80764806, + "learning_rate": 3.120426165316398e-05, + "loss": 0.81833315, + "num_input_tokens_seen": 383314352, + "router_z_loss_mlp": 0.28125, + "step": 4628, + "time_per_iteration": 2.9863662719726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066862, + "balance_loss_mlp": 1.0390867, + "epoch": 0.8905348210850327, + "flos": 519546534912.0, + "grad_norm": 0.05729478499656057, + "language_loss": 0.81872827, + "learning_rate": 3.109601733496881e-05, + "loss": 0.8293969, + "num_input_tokens_seen": 383384848, + "router_z_loss_mlp": 0.27783203, + "step": 4629, + "time_per_iteration": 2.655174970626831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064198, + "balance_loss_mlp": 1.03639805, + "epoch": 0.8907272027702963, + "flos": 578672640000.0, + "grad_norm": 0.052285049581706246, + "language_loss": 0.79457366, + "learning_rate": 3.098795506144458e-05, + "loss": 0.8052156, + "num_input_tokens_seen": 383463360, + "router_z_loss_mlp": 0.27832031, + "step": 4630, + "time_per_iteration": 2.840730667114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067701, + "balance_loss_mlp": 1.04030657, + "epoch": 0.8909195844555599, + "flos": 893258869248.0, + "grad_norm": 0.059465272064999686, + "language_loss": 0.79709071, + "learning_rate": 3.088007487454475e-05, + "loss": 0.80776775, + "num_input_tokens_seen": 383542080, + "router_z_loss_mlp": 0.27441406, + "step": 4631, + "time_per_iteration": 3.1187219619750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070472, + "balance_loss_mlp": 1.0419575, + "epoch": 0.8911119661408234, + "flos": 549596302848.0, + "grad_norm": 0.05514247139292472, + "language_loss": 0.84210968, + "learning_rate": 3.077237681615208e-05, + "loss": 0.85281444, + "num_input_tokens_seen": 383613056, + "router_z_loss_mlp": 0.28540039, + "step": 4632, + "time_per_iteration": 2.695281505584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070837, + "balance_loss_mlp": 1.04258442, + "epoch": 0.8913043478260869, + "flos": 480884732928.0, + "grad_norm": 0.07098805305903529, + "language_loss": 0.83367896, + "learning_rate": 3.066486092807874e-05, + "loss": 0.84438735, + "num_input_tokens_seen": 383683280, + "router_z_loss_mlp": 0.2824707, + "step": 4633, + "time_per_iteration": 2.674928665161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067506, + "balance_loss_mlp": 1.03934908, + "epoch": 0.8914967295113505, + "flos": 484317024768.0, + "grad_norm": 0.05060462387255462, + "language_loss": 0.85151595, + "learning_rate": 3.055752725206601e-05, + "loss": 0.86219102, + "num_input_tokens_seen": 383754624, + "router_z_loss_mlp": 0.28149414, + "step": 4634, + "time_per_iteration": 2.6783857345581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069245, + "balance_loss_mlp": 1.04096866, + "epoch": 0.8916891111966141, + "flos": 445432642560.0, + "grad_norm": 0.05975693569548975, + "language_loss": 0.81291115, + "learning_rate": 3.0450375829784714e-05, + "loss": 0.82360363, + "num_input_tokens_seen": 383821984, + "router_z_loss_mlp": 0.28295898, + "step": 4635, + "time_per_iteration": 2.5965147018432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068237, + "balance_loss_mlp": 1.04027081, + "epoch": 0.8918814928818777, + "flos": 563751843840.0, + "grad_norm": 0.049564759273153264, + "language_loss": 0.78083277, + "learning_rate": 3.034340670283453e-05, + "loss": 0.79151511, + "num_input_tokens_seen": 383890880, + "router_z_loss_mlp": 0.27978516, + "step": 4636, + "time_per_iteration": 2.771043062210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067686, + "balance_loss_mlp": 1.03971982, + "epoch": 0.8920738745671412, + "flos": 575672514048.0, + "grad_norm": 0.06662483831427483, + "language_loss": 0.80982053, + "learning_rate": 3.0236619912744513e-05, + "loss": 0.82049739, + "num_input_tokens_seen": 383962480, + "router_z_loss_mlp": 0.2800293, + "step": 4637, + "time_per_iteration": 2.6836137771606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067244, + "balance_loss_mlp": 1.04018307, + "epoch": 0.8922662562524047, + "flos": 619898171904.0, + "grad_norm": 0.055977453987363854, + "language_loss": 0.84088302, + "learning_rate": 3.0130015500973163e-05, + "loss": 0.85155547, + "num_input_tokens_seen": 384033616, + "router_z_loss_mlp": 0.27124023, + "step": 4638, + "time_per_iteration": 2.7201523780822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066044, + "balance_loss_mlp": 1.0383637, + "epoch": 0.8924586379376683, + "flos": 583330056192.0, + "grad_norm": 0.06318368415584479, + "language_loss": 0.7920469, + "learning_rate": 3.0023593508907877e-05, + "loss": 0.80270731, + "num_input_tokens_seen": 384108848, + "router_z_loss_mlp": 0.27709961, + "step": 4639, + "time_per_iteration": 2.7709860801696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072181, + "balance_loss_mlp": 1.04509687, + "epoch": 0.8926510196229319, + "flos": 524922716160.0, + "grad_norm": 0.043960833871696636, + "language_loss": 0.81677014, + "learning_rate": 2.991735397786538e-05, + "loss": 0.827492, + "num_input_tokens_seen": 384185728, + "router_z_loss_mlp": 0.27148438, + "step": 4640, + "time_per_iteration": 2.8300883769989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068463, + "balance_loss_mlp": 1.04016221, + "epoch": 0.8928434013081955, + "flos": 486428239872.0, + "grad_norm": 0.06172673252481555, + "language_loss": 0.80732042, + "learning_rate": 2.981129694909146e-05, + "loss": 0.81800508, + "num_input_tokens_seen": 384251552, + "router_z_loss_mlp": 0.28320312, + "step": 4641, + "time_per_iteration": 2.5496692657470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015529, + "balance_loss_mlp": 1.00403714, + "epoch": 0.893035782993459, + "flos": 1447580837376.0, + "grad_norm": 0.007735138982934367, + "language_loss": 0.80330861, + "learning_rate": 2.970542246376118e-05, + "loss": 0.81346381, + "num_input_tokens_seen": 384472176, + "router_z_loss_mlp": 0.11474609, + "step": 4642, + "time_per_iteration": 4.7214789390563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071184, + "balance_loss_mlp": 1.04440916, + "epoch": 0.8932281646787226, + "flos": 611040236544.0, + "grad_norm": 0.06230768103154438, + "language_loss": 0.80826664, + "learning_rate": 2.95997305629786e-05, + "loss": 0.81897843, + "num_input_tokens_seen": 384544224, + "router_z_loss_mlp": 0.26794434, + "step": 4643, + "time_per_iteration": 2.776540756225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070751, + "balance_loss_mlp": 1.04316592, + "epoch": 0.8934205463639862, + "flos": 565494912000.0, + "grad_norm": 0.0560545196954126, + "language_loss": 0.84422594, + "learning_rate": 2.9494221287776957e-05, + "loss": 0.85493338, + "num_input_tokens_seen": 384611728, + "router_z_loss_mlp": 0.27636719, + "step": 4644, + "time_per_iteration": 2.64113450050354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068593, + "balance_loss_mlp": 1.04084074, + "epoch": 0.8936129280492497, + "flos": 488181482496.0, + "grad_norm": 0.07046093085577981, + "language_loss": 0.77728665, + "learning_rate": 2.9388894679118484e-05, + "loss": 0.78797263, + "num_input_tokens_seen": 384678048, + "router_z_loss_mlp": 0.27807617, + "step": 4645, + "time_per_iteration": 2.557194232940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070503, + "balance_loss_mlp": 1.04298949, + "epoch": 0.8938053097345132, + "flos": 886095949824.0, + "grad_norm": 0.05665952535342083, + "language_loss": 0.80841428, + "learning_rate": 2.9283750777894912e-05, + "loss": 0.81911927, + "num_input_tokens_seen": 384766768, + "router_z_loss_mlp": 0.27514648, + "step": 4646, + "time_per_iteration": 3.204979181289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069223, + "balance_loss_mlp": 1.04173374, + "epoch": 0.8939976914197768, + "flos": 592999888896.0, + "grad_norm": 0.057759153184874165, + "language_loss": 0.84277451, + "learning_rate": 2.9178789624926427e-05, + "loss": 0.85346675, + "num_input_tokens_seen": 384842352, + "router_z_loss_mlp": 0.27539062, + "step": 4647, + "time_per_iteration": 2.7343509197235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067768, + "balance_loss_mlp": 1.03944361, + "epoch": 0.8941900731050404, + "flos": 522983208960.0, + "grad_norm": 0.059380857059797024, + "language_loss": 0.80891001, + "learning_rate": 2.9074011260962706e-05, + "loss": 0.81958771, + "num_input_tokens_seen": 384912048, + "router_z_loss_mlp": 0.28320312, + "step": 4648, + "time_per_iteration": 2.6367506980895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066382, + "balance_loss_mlp": 1.03853464, + "epoch": 0.894382454790304, + "flos": 800247651840.0, + "grad_norm": 0.05523040639644092, + "language_loss": 0.81081837, + "learning_rate": 2.8969415726682158e-05, + "loss": 0.82148218, + "num_input_tokens_seen": 384986560, + "router_z_loss_mlp": 0.27856445, + "step": 4649, + "time_per_iteration": 3.030062198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065779, + "balance_loss_mlp": 1.03788459, + "epoch": 0.8945748364755676, + "flos": 478782282240.0, + "grad_norm": 0.06268111355606142, + "language_loss": 0.84919488, + "learning_rate": 2.8865003062692517e-05, + "loss": 0.85985267, + "num_input_tokens_seen": 385057376, + "router_z_loss_mlp": 0.27929688, + "step": 4650, + "time_per_iteration": 2.59285569190979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068569, + "balance_loss_mlp": 1.04065061, + "epoch": 0.894767218160831, + "flos": 508507573248.0, + "grad_norm": 0.07694794065746953, + "language_loss": 0.82904601, + "learning_rate": 2.876077330953042e-05, + "loss": 0.83973163, + "num_input_tokens_seen": 385130880, + "router_z_loss_mlp": 0.27929688, + "step": 4651, + "time_per_iteration": 2.6908295154571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070543, + "balance_loss_mlp": 1.04274344, + "epoch": 0.8949595998460946, + "flos": 685557181440.0, + "grad_norm": 0.05647102417455385, + "language_loss": 0.81656528, + "learning_rate": 2.8656726507661378e-05, + "loss": 0.82727075, + "num_input_tokens_seen": 385205808, + "router_z_loss_mlp": 0.27807617, + "step": 4652, + "time_per_iteration": 2.8482308387756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066242, + "balance_loss_mlp": 1.03751302, + "epoch": 0.8951519815313582, + "flos": 799578349056.0, + "grad_norm": 0.057302160059149884, + "language_loss": 0.77321589, + "learning_rate": 2.855286269747981e-05, + "loss": 0.78387833, + "num_input_tokens_seen": 385283616, + "router_z_loss_mlp": 0.28735352, + "step": 4653, + "time_per_iteration": 3.002678632736206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066194, + "balance_loss_mlp": 1.0369159, + "epoch": 0.8953443632166218, + "flos": 666443068416.0, + "grad_norm": 0.059263332900696505, + "language_loss": 0.86105883, + "learning_rate": 2.8449181919309398e-05, + "loss": 0.87172079, + "num_input_tokens_seen": 385357488, + "router_z_loss_mlp": 0.29272461, + "step": 4654, + "time_per_iteration": 2.810746908187866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062957, + "balance_loss_mlp": 1.0353719, + "epoch": 0.8955367449018854, + "flos": 644670683136.0, + "grad_norm": 0.05592703355481017, + "language_loss": 0.83190131, + "learning_rate": 2.8345684213402556e-05, + "loss": 0.84253091, + "num_input_tokens_seen": 385431280, + "router_z_loss_mlp": 0.27636719, + "step": 4655, + "time_per_iteration": 2.8701395988464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067585, + "balance_loss_mlp": 1.03911805, + "epoch": 0.8957291265871489, + "flos": 808353326592.0, + "grad_norm": 0.06040680854300063, + "language_loss": 0.77388299, + "learning_rate": 2.8242369619940644e-05, + "loss": 0.78455889, + "num_input_tokens_seen": 385509840, + "router_z_loss_mlp": 0.28442383, + "step": 4656, + "time_per_iteration": 3.0514414310455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066606, + "balance_loss_mlp": 1.0381391, + "epoch": 0.8959215082724125, + "flos": 518664826368.0, + "grad_norm": 0.05687998716555397, + "language_loss": 0.76916766, + "learning_rate": 2.813923817903391e-05, + "loss": 0.77983367, + "num_input_tokens_seen": 385580384, + "router_z_loss_mlp": 0.28515625, + "step": 4657, + "time_per_iteration": 2.6414825916290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067184, + "balance_loss_mlp": 1.03921711, + "epoch": 0.896113889957676, + "flos": 476669657088.0, + "grad_norm": 0.0528545629927777, + "language_loss": 0.77033144, + "learning_rate": 2.8036289930721603e-05, + "loss": 0.78100324, + "num_input_tokens_seen": 385649184, + "router_z_loss_mlp": 0.28027344, + "step": 4658, + "time_per_iteration": 2.6311020851135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067219, + "balance_loss_mlp": 1.03827536, + "epoch": 0.8963062716429396, + "flos": 517911155712.0, + "grad_norm": 0.05569810882559681, + "language_loss": 0.83101171, + "learning_rate": 2.7933524914971697e-05, + "loss": 0.84168386, + "num_input_tokens_seen": 385717072, + "router_z_loss_mlp": 0.28955078, + "step": 4659, + "time_per_iteration": 2.645664930343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065748, + "balance_loss_mlp": 1.03768659, + "epoch": 0.8964986533282031, + "flos": 508231148544.0, + "grad_norm": 0.06041289923786583, + "language_loss": 0.8144539, + "learning_rate": 2.7830943171681113e-05, + "loss": 0.82511139, + "num_input_tokens_seen": 385788880, + "router_z_loss_mlp": 0.28076172, + "step": 4660, + "time_per_iteration": 2.699507713317871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066914, + "balance_loss_mlp": 1.03849435, + "epoch": 0.8966910350134667, + "flos": 535819083264.0, + "grad_norm": 0.0649780627361528, + "language_loss": 0.80980611, + "learning_rate": 2.77285447406756e-05, + "loss": 0.82047522, + "num_input_tokens_seen": 385854240, + "router_z_loss_mlp": 0.28417969, + "step": 4661, + "time_per_iteration": 2.6589531898498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066329, + "balance_loss_mlp": 1.03771877, + "epoch": 0.8968834166987303, + "flos": 722909491200.0, + "grad_norm": 0.05835442407396343, + "language_loss": 0.84337735, + "learning_rate": 2.7626329661709914e-05, + "loss": 0.85404074, + "num_input_tokens_seen": 385926080, + "router_z_loss_mlp": 0.28588867, + "step": 4662, + "time_per_iteration": 2.8895277976989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064424, + "balance_loss_mlp": 1.03702998, + "epoch": 0.8970757983839939, + "flos": 681372628992.0, + "grad_norm": 0.049817716882638224, + "language_loss": 0.83679664, + "learning_rate": 2.7524297974467372e-05, + "loss": 0.84744084, + "num_input_tokens_seen": 386005696, + "router_z_loss_mlp": 0.27392578, + "step": 4663, + "time_per_iteration": 2.90505313873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065764, + "balance_loss_mlp": 1.03763032, + "epoch": 0.8972681800692575, + "flos": 612758573568.0, + "grad_norm": 0.07386226147596868, + "language_loss": 0.75563216, + "learning_rate": 2.742244971856006e-05, + "loss": 0.76628977, + "num_input_tokens_seen": 386073248, + "router_z_loss_mlp": 0.28173828, + "step": 4664, + "time_per_iteration": 2.7474899291992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106565, + "balance_loss_mlp": 1.03689647, + "epoch": 0.8974605617545209, + "flos": 572064132096.0, + "grad_norm": 0.05719601329646282, + "language_loss": 0.8326844, + "learning_rate": 2.732078493352913e-05, + "loss": 0.84334087, + "num_input_tokens_seen": 386148528, + "router_z_loss_mlp": 0.28735352, + "step": 4665, + "time_per_iteration": 2.728703737258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064005, + "balance_loss_mlp": 1.03525186, + "epoch": 0.8976529434397845, + "flos": 520147436544.0, + "grad_norm": 0.055681345294375295, + "language_loss": 0.87152803, + "learning_rate": 2.721930365884434e-05, + "loss": 0.88216805, + "num_input_tokens_seen": 386218528, + "router_z_loss_mlp": 0.28735352, + "step": 4666, + "time_per_iteration": 2.663864850997925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066696, + "balance_loss_mlp": 1.03911066, + "epoch": 0.8978453251250481, + "flos": 471124740096.0, + "grad_norm": 0.08211330217280415, + "language_loss": 0.82403785, + "learning_rate": 2.7118005933904176e-05, + "loss": 0.83470482, + "num_input_tokens_seen": 386284704, + "router_z_loss_mlp": 0.27612305, + "step": 4667, + "time_per_iteration": 2.6915175914764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068319, + "balance_loss_mlp": 1.04042363, + "epoch": 0.8980377068103117, + "flos": 591370301952.0, + "grad_norm": 0.11004700264832698, + "language_loss": 0.81857389, + "learning_rate": 2.7016891798035904e-05, + "loss": 0.82925701, + "num_input_tokens_seen": 386356128, + "router_z_loss_mlp": 0.27929688, + "step": 4668, + "time_per_iteration": 2.777339458465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069369, + "balance_loss_mlp": 1.04102135, + "epoch": 0.8982300884955752, + "flos": 767287918080.0, + "grad_norm": 0.05198746071964672, + "language_loss": 0.82804859, + "learning_rate": 2.691596129049556e-05, + "loss": 0.83874226, + "num_input_tokens_seen": 386434048, + "router_z_loss_mlp": 0.28344727, + "step": 4669, + "time_per_iteration": 2.9581100940704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068748, + "balance_loss_mlp": 1.040519, + "epoch": 0.8984224701808388, + "flos": 844189530624.0, + "grad_norm": 0.06146517202916762, + "language_loss": 0.77403522, + "learning_rate": 2.681521445046775e-05, + "loss": 0.78472269, + "num_input_tokens_seen": 386532384, + "router_z_loss_mlp": 0.2824707, + "step": 4670, + "time_per_iteration": 3.214451789855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106956, + "balance_loss_mlp": 1.04204607, + "epoch": 0.8986148518661023, + "flos": 757303782912.0, + "grad_norm": 0.05628437855404085, + "language_loss": 0.76025915, + "learning_rate": 2.6714651317065963e-05, + "loss": 0.77095473, + "num_input_tokens_seen": 386627120, + "router_z_loss_mlp": 0.27539062, + "step": 4671, + "time_per_iteration": 3.131769895553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064602, + "balance_loss_mlp": 1.03580141, + "epoch": 0.8988072335513659, + "flos": 562801734144.0, + "grad_norm": 0.05509278789905922, + "language_loss": 0.76818681, + "learning_rate": 2.6614271929332133e-05, + "loss": 0.77883279, + "num_input_tokens_seen": 386700192, + "router_z_loss_mlp": 0.28808594, + "step": 4672, + "time_per_iteration": 2.6790685653686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066065, + "balance_loss_mlp": 1.03874218, + "epoch": 0.8989996152366295, + "flos": 492440228352.0, + "grad_norm": 0.05781833096517719, + "language_loss": 0.86723161, + "learning_rate": 2.6514076326237147e-05, + "loss": 0.87789226, + "num_input_tokens_seen": 386764256, + "router_z_loss_mlp": 0.2734375, + "step": 4673, + "time_per_iteration": 2.5404884815216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066669, + "balance_loss_mlp": 1.03865457, + "epoch": 0.899191996921893, + "flos": 542303935488.0, + "grad_norm": 0.0639009848289485, + "language_loss": 0.75673521, + "learning_rate": 2.6414064546680438e-05, + "loss": 0.76740181, + "num_input_tokens_seen": 386835792, + "router_z_loss_mlp": 0.28027344, + "step": 4674, + "time_per_iteration": 2.6745707988739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066722, + "balance_loss_mlp": 1.03916073, + "epoch": 0.8993843786071566, + "flos": 471081070080.0, + "grad_norm": 0.05958404754424956, + "language_loss": 0.79837209, + "learning_rate": 2.631423662948984e-05, + "loss": 0.80903935, + "num_input_tokens_seen": 386904368, + "router_z_loss_mlp": 0.27612305, + "step": 4675, + "time_per_iteration": 2.5648069381713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062537, + "balance_loss_mlp": 1.03449929, + "epoch": 0.8995767602924202, + "flos": 526454788608.0, + "grad_norm": 0.058069364250127556, + "language_loss": 0.82527149, + "learning_rate": 2.621459261342196e-05, + "loss": 0.83589685, + "num_input_tokens_seen": 386977872, + "router_z_loss_mlp": 0.28051758, + "step": 4676, + "time_per_iteration": 2.7322497367858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065223, + "balance_loss_mlp": 1.0367316, + "epoch": 0.8997691419776838, + "flos": 557365916160.0, + "grad_norm": 0.05633383072499603, + "language_loss": 0.84505248, + "learning_rate": 2.6115132537162245e-05, + "loss": 0.85570467, + "num_input_tokens_seen": 387052080, + "router_z_loss_mlp": 0.28491211, + "step": 4677, + "time_per_iteration": 2.6816530227661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069952, + "balance_loss_mlp": 1.04186583, + "epoch": 0.8999615236629472, + "flos": 638722713600.0, + "grad_norm": 0.058231914931515895, + "language_loss": 0.80479538, + "learning_rate": 2.601585643932436e-05, + "loss": 0.81549489, + "num_input_tokens_seen": 387129712, + "router_z_loss_mlp": 0.28076172, + "step": 4678, + "time_per_iteration": 2.8522558212280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014105, + "balance_loss_mlp": 1.002756, + "epoch": 0.9001539053482108, + "flos": 1430743703040.0, + "grad_norm": 0.00862832057213614, + "language_loss": 0.85784018, + "learning_rate": 2.5916764358450862e-05, + "loss": 0.86798131, + "num_input_tokens_seen": 387356560, + "router_z_loss_mlp": 0.11328125, + "step": 4679, + "time_per_iteration": 4.799229860305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064336, + "balance_loss_mlp": 1.0357976, + "epoch": 0.9003462870334744, + "flos": 566589026304.0, + "grad_norm": 0.0588723941053944, + "language_loss": 0.80009788, + "learning_rate": 2.5817856333012425e-05, + "loss": 0.81074125, + "num_input_tokens_seen": 387438640, + "router_z_loss_mlp": 0.28564453, + "step": 4680, + "time_per_iteration": 2.879063606262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064946, + "balance_loss_mlp": 1.03712296, + "epoch": 0.900538668718738, + "flos": 538394397696.0, + "grad_norm": 0.0697908395177343, + "language_loss": 0.7863133, + "learning_rate": 2.5719132401408883e-05, + "loss": 0.79696274, + "num_input_tokens_seen": 387507088, + "router_z_loss_mlp": 0.27832031, + "step": 4681, + "time_per_iteration": 2.651343584060669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066763, + "balance_loss_mlp": 1.03867733, + "epoch": 0.9007310504040016, + "flos": 488146576896.0, + "grad_norm": 0.06354903246037491, + "language_loss": 0.8607623, + "learning_rate": 2.5620592601968028e-05, + "loss": 0.87142992, + "num_input_tokens_seen": 387574160, + "router_z_loss_mlp": 0.28076172, + "step": 4682, + "time_per_iteration": 2.546644449234009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064911, + "balance_loss_mlp": 1.0366112, + "epoch": 0.9009234320892651, + "flos": 652593065472.0, + "grad_norm": 0.06043677066691621, + "language_loss": 0.78744268, + "learning_rate": 2.5522236972946532e-05, + "loss": 0.79809177, + "num_input_tokens_seen": 387652528, + "router_z_loss_mlp": 0.28320312, + "step": 4683, + "time_per_iteration": 2.8712375164031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068774, + "balance_loss_mlp": 1.04054499, + "epoch": 0.9011158137745287, + "flos": 545302651392.0, + "grad_norm": 0.05161746499741545, + "language_loss": 0.85312754, + "learning_rate": 2.5424065552529295e-05, + "loss": 0.86381531, + "num_input_tokens_seen": 387723520, + "router_z_loss_mlp": 0.2824707, + "step": 4684, + "time_per_iteration": 2.6262335777282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106342, + "balance_loss_mlp": 1.03588235, + "epoch": 0.9013081954597922, + "flos": 559429079040.0, + "grad_norm": 0.06544642746870727, + "language_loss": 0.82523555, + "learning_rate": 2.532607837883011e-05, + "loss": 0.83586979, + "num_input_tokens_seen": 387793664, + "router_z_loss_mlp": 0.27563477, + "step": 4685, + "time_per_iteration": 2.6898350715637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066665, + "balance_loss_mlp": 1.03848374, + "epoch": 0.9015005771450558, + "flos": 728330752512.0, + "grad_norm": 0.04796674200603937, + "language_loss": 0.8107928, + "learning_rate": 2.5228275489890706e-05, + "loss": 0.82145953, + "num_input_tokens_seen": 387871008, + "router_z_loss_mlp": 0.28173828, + "step": 4686, + "time_per_iteration": 2.9521684646606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069741, + "balance_loss_mlp": 1.04172671, + "epoch": 0.9016929588303193, + "flos": 517148720640.0, + "grad_norm": 0.05256226767629222, + "language_loss": 0.8077606, + "learning_rate": 2.5130656923681605e-05, + "loss": 0.81845802, + "num_input_tokens_seen": 387950832, + "router_z_loss_mlp": 0.2800293, + "step": 4687, + "time_per_iteration": 2.84675669670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067737, + "balance_loss_mlp": 1.04000878, + "epoch": 0.9018853405155829, + "flos": 622031145984.0, + "grad_norm": 0.04949583001041346, + "language_loss": 0.8596499, + "learning_rate": 2.503322271810171e-05, + "loss": 0.87032723, + "num_input_tokens_seen": 388029792, + "router_z_loss_mlp": 0.27734375, + "step": 4688, + "time_per_iteration": 2.883434534072876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063716, + "balance_loss_mlp": 1.03551149, + "epoch": 0.9020777222008465, + "flos": 523022496768.0, + "grad_norm": 0.05482141018442068, + "language_loss": 0.77574694, + "learning_rate": 2.4935972910978378e-05, + "loss": 0.78638411, + "num_input_tokens_seen": 388095872, + "router_z_loss_mlp": 0.28198242, + "step": 4689, + "time_per_iteration": 2.601212739944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063021, + "balance_loss_mlp": 1.03581715, + "epoch": 0.9022701038861101, + "flos": 633419315712.0, + "grad_norm": 0.04920852715445459, + "language_loss": 0.81768286, + "learning_rate": 2.4838907540067346e-05, + "loss": 0.82831311, + "num_input_tokens_seen": 388171632, + "router_z_loss_mlp": 0.27270508, + "step": 4690, + "time_per_iteration": 2.818192481994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067502, + "balance_loss_mlp": 1.03984523, + "epoch": 0.9024624855713737, + "flos": 513036951552.0, + "grad_norm": 0.055226262822308456, + "language_loss": 0.84412956, + "learning_rate": 2.474202664305253e-05, + "loss": 0.85480458, + "num_input_tokens_seen": 388242240, + "router_z_loss_mlp": 0.27685547, + "step": 4691, + "time_per_iteration": 2.6131467819213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069251, + "balance_loss_mlp": 1.04195166, + "epoch": 0.9026548672566371, + "flos": 477152695296.0, + "grad_norm": 0.05811897986593017, + "language_loss": 0.86162984, + "learning_rate": 2.464533025754673e-05, + "loss": 0.87232238, + "num_input_tokens_seen": 388310960, + "router_z_loss_mlp": 0.27368164, + "step": 4692, + "time_per_iteration": 2.6586062908172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106917, + "balance_loss_mlp": 1.04120314, + "epoch": 0.9028472489419007, + "flos": 661701284352.0, + "grad_norm": 0.1426451694737163, + "language_loss": 0.73884237, + "learning_rate": 2.454881842109058e-05, + "loss": 0.74953413, + "num_input_tokens_seen": 388387280, + "router_z_loss_mlp": 0.28027344, + "step": 4693, + "time_per_iteration": 2.838524580001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067774, + "balance_loss_mlp": 1.0398314, + "epoch": 0.9030396306271643, + "flos": 534332090880.0, + "grad_norm": 0.05783209602584723, + "language_loss": 0.81908751, + "learning_rate": 2.4452491171153445e-05, + "loss": 0.8297652, + "num_input_tokens_seen": 388456992, + "router_z_loss_mlp": 0.27978516, + "step": 4694, + "time_per_iteration": 2.674063205718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070492, + "balance_loss_mlp": 1.04264498, + "epoch": 0.9032320123124279, + "flos": 800695784448.0, + "grad_norm": 0.05843241181066569, + "language_loss": 0.82359844, + "learning_rate": 2.43563485451328e-05, + "loss": 0.83430338, + "num_input_tokens_seen": 388534896, + "router_z_loss_mlp": 0.27856445, + "step": 4695, + "time_per_iteration": 2.9802277088165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067562, + "balance_loss_mlp": 1.03997672, + "epoch": 0.9034243939976914, + "flos": 553673166336.0, + "grad_norm": 0.0750205070767636, + "language_loss": 0.76441383, + "learning_rate": 2.426039058035451e-05, + "loss": 0.77508944, + "num_input_tokens_seen": 388606640, + "router_z_loss_mlp": 0.27636719, + "step": 4696, + "time_per_iteration": 2.6411380767822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069995, + "balance_loss_mlp": 1.0430541, + "epoch": 0.903616775682955, + "flos": 503656690176.0, + "grad_norm": 0.05696319477889627, + "language_loss": 0.82816821, + "learning_rate": 2.4164617314072823e-05, + "loss": 0.83886814, + "num_input_tokens_seen": 388675920, + "router_z_loss_mlp": 0.26977539, + "step": 4697, + "time_per_iteration": 2.603687286376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070428, + "balance_loss_mlp": 1.04215193, + "epoch": 0.9038091573682185, + "flos": 436058173440.0, + "grad_norm": 0.05485008828996457, + "language_loss": 0.78603637, + "learning_rate": 2.406902878347017e-05, + "loss": 0.79674065, + "num_input_tokens_seen": 388743968, + "router_z_loss_mlp": 0.28295898, + "step": 4698, + "time_per_iteration": 2.638568162918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067091, + "balance_loss_mlp": 1.03898168, + "epoch": 0.9040015390534821, + "flos": 532648659456.0, + "grad_norm": 0.06473187414525833, + "language_loss": 0.81159961, + "learning_rate": 2.3973625025657253e-05, + "loss": 0.82227051, + "num_input_tokens_seen": 388810784, + "router_z_loss_mlp": 0.28125, + "step": 4699, + "time_per_iteration": 2.6460814476013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062513, + "balance_loss_mlp": 1.034904, + "epoch": 0.9041939207387457, + "flos": 564028268544.0, + "grad_norm": 0.06800466298182273, + "language_loss": 0.80023026, + "learning_rate": 2.3878406077673275e-05, + "loss": 0.81085545, + "num_input_tokens_seen": 388885072, + "router_z_loss_mlp": 0.27661133, + "step": 4700, + "time_per_iteration": 2.7773025035858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071491, + "balance_loss_mlp": 1.04259431, + "epoch": 0.9043863024240092, + "flos": 515257265664.0, + "grad_norm": 0.06969733527966859, + "language_loss": 0.77433765, + "learning_rate": 2.3783371976485447e-05, + "loss": 0.78505254, + "num_input_tokens_seen": 388951184, + "router_z_loss_mlp": 0.2890625, + "step": 4701, + "time_per_iteration": 2.6053738594055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017578, + "balance_loss_mlp": 1.00627708, + "epoch": 0.9045786841092728, + "flos": 1277243043840.0, + "grad_norm": 0.006832227810148578, + "language_loss": 0.72929788, + "learning_rate": 2.368852275898914e-05, + "loss": 0.73947364, + "num_input_tokens_seen": 389170752, + "router_z_loss_mlp": 0.11279297, + "step": 4702, + "time_per_iteration": 4.971631288528442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068474, + "balance_loss_mlp": 1.04062688, + "epoch": 0.9047710657945364, + "flos": 585569309184.0, + "grad_norm": 0.06435638379504488, + "language_loss": 0.82813382, + "learning_rate": 2.3593858462008178e-05, + "loss": 0.83881855, + "num_input_tokens_seen": 389239600, + "router_z_loss_mlp": 0.27856445, + "step": 4703, + "time_per_iteration": 2.6877286434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065706, + "balance_loss_mlp": 1.03797805, + "epoch": 0.9049634474798, + "flos": 571655287296.0, + "grad_norm": 0.0636995704600701, + "language_loss": 0.79728121, + "learning_rate": 2.3499379122294495e-05, + "loss": 0.80793828, + "num_input_tokens_seen": 389316032, + "router_z_loss_mlp": 0.27758789, + "step": 4704, + "time_per_iteration": 2.728874444961548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066082, + "balance_loss_mlp": 1.0389502, + "epoch": 0.9051558291650635, + "flos": 572353703424.0, + "grad_norm": 0.07413050035901024, + "language_loss": 0.74390441, + "learning_rate": 2.3405084776528307e-05, + "loss": 0.75456524, + "num_input_tokens_seen": 389383504, + "router_z_loss_mlp": 0.27172852, + "step": 4705, + "time_per_iteration": 2.6595373153686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069932, + "balance_loss_mlp": 1.04249048, + "epoch": 0.905348210850327, + "flos": 540280060416.0, + "grad_norm": 0.06136999404791905, + "language_loss": 0.7903558, + "learning_rate": 2.331097546131783e-05, + "loss": 0.80105507, + "num_input_tokens_seen": 389454592, + "router_z_loss_mlp": 0.2746582, + "step": 4706, + "time_per_iteration": 2.6509690284729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072106, + "balance_loss_mlp": 1.04435396, + "epoch": 0.9055405925355906, + "flos": 516128799744.0, + "grad_norm": 0.06082277115431439, + "language_loss": 0.81760788, + "learning_rate": 2.321705121319956e-05, + "loss": 0.82832897, + "num_input_tokens_seen": 389519696, + "router_z_loss_mlp": 0.27758789, + "step": 4707, + "time_per_iteration": 2.5796375274658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068736, + "balance_loss_mlp": 1.04031706, + "epoch": 0.9057329742208542, + "flos": 914249880576.0, + "grad_norm": 0.40882184585938774, + "language_loss": 0.84702176, + "learning_rate": 2.3123312068638104e-05, + "loss": 0.85770917, + "num_input_tokens_seen": 389603568, + "router_z_loss_mlp": 0.28393555, + "step": 4708, + "time_per_iteration": 3.1743359565734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065478, + "balance_loss_mlp": 1.03798819, + "epoch": 0.9059253559061178, + "flos": 904884175872.0, + "grad_norm": 0.056239877647307326, + "language_loss": 0.82753253, + "learning_rate": 2.3029758064026295e-05, + "loss": 0.83818728, + "num_input_tokens_seen": 389687504, + "router_z_loss_mlp": 0.27490234, + "step": 4709, + "time_per_iteration": 3.1511998176574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106885, + "balance_loss_mlp": 1.04059744, + "epoch": 0.9061177375913813, + "flos": 664218372096.0, + "grad_norm": 0.060791344660506334, + "language_loss": 0.77237535, + "learning_rate": 2.2936389235684918e-05, + "loss": 0.78306377, + "num_input_tokens_seen": 389764880, + "router_z_loss_mlp": 0.28222656, + "step": 4710, + "time_per_iteration": 2.859652519226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069474, + "balance_loss_mlp": 1.04150796, + "epoch": 0.9063101192766448, + "flos": 565318821888.0, + "grad_norm": 0.057932581818472106, + "language_loss": 0.82433301, + "learning_rate": 2.2843205619862972e-05, + "loss": 0.83502775, + "num_input_tokens_seen": 389838304, + "router_z_loss_mlp": 0.2800293, + "step": 4711, + "time_per_iteration": 2.746431589126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065463, + "balance_loss_mlp": 1.03825986, + "epoch": 0.9065025009619084, + "flos": 727064930304.0, + "grad_norm": 0.05998205753786282, + "language_loss": 0.78935313, + "learning_rate": 2.2750207252737742e-05, + "loss": 0.80000776, + "num_input_tokens_seen": 389908592, + "router_z_loss_mlp": 0.27246094, + "step": 4712, + "time_per_iteration": 2.9277284145355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066785, + "balance_loss_mlp": 1.03896201, + "epoch": 0.906694882647172, + "flos": 531254799360.0, + "grad_norm": 0.06374980878280882, + "language_loss": 0.80104047, + "learning_rate": 2.265739417041418e-05, + "loss": 0.81170833, + "num_input_tokens_seen": 389979040, + "router_z_loss_mlp": 0.27856445, + "step": 4713, + "time_per_iteration": 2.678513765335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066642, + "balance_loss_mlp": 1.03865206, + "epoch": 0.9068872643324356, + "flos": 429563146752.0, + "grad_norm": 0.06604765219045201, + "language_loss": 0.85026371, + "learning_rate": 2.2564766408925574e-05, + "loss": 0.86093009, + "num_input_tokens_seen": 390046080, + "router_z_loss_mlp": 0.27978516, + "step": 4714, + "time_per_iteration": 2.612898349761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070435, + "balance_loss_mlp": 1.04153872, + "epoch": 0.9070796460176991, + "flos": 588095161344.0, + "grad_norm": 0.06185083154796718, + "language_loss": 0.79640901, + "learning_rate": 2.2472324004233214e-05, + "loss": 0.80711341, + "num_input_tokens_seen": 390122176, + "router_z_loss_mlp": 0.28857422, + "step": 4715, + "time_per_iteration": 2.751021146774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071678, + "balance_loss_mlp": 1.04323435, + "epoch": 0.9072720277029627, + "flos": 571314843648.0, + "grad_norm": 0.0598768727197482, + "language_loss": 0.7539562, + "learning_rate": 2.2380066992226446e-05, + "loss": 0.76467299, + "num_input_tokens_seen": 390195216, + "router_z_loss_mlp": 0.28417969, + "step": 4716, + "time_per_iteration": 2.694836378097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068538, + "balance_loss_mlp": 1.04131055, + "epoch": 0.9074644093882263, + "flos": 555534097920.0, + "grad_norm": 0.057382736808796596, + "language_loss": 0.88150144, + "learning_rate": 2.2287995408722617e-05, + "loss": 0.89218676, + "num_input_tokens_seen": 390263216, + "router_z_loss_mlp": 0.27270508, + "step": 4717, + "time_per_iteration": 2.6780333518981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065963, + "balance_loss_mlp": 1.03773427, + "epoch": 0.9076567910734898, + "flos": 640701508608.0, + "grad_norm": 0.05537146753326694, + "language_loss": 0.82323325, + "learning_rate": 2.2196109289467083e-05, + "loss": 0.83389294, + "num_input_tokens_seen": 390337360, + "router_z_loss_mlp": 0.2824707, + "step": 4718, + "time_per_iteration": 2.8005425930023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071332, + "balance_loss_mlp": 1.04367542, + "epoch": 0.9078491727587533, + "flos": 733635560448.0, + "grad_norm": 0.05081373139618053, + "language_loss": 0.81615859, + "learning_rate": 2.2104408670133193e-05, + "loss": 0.82687193, + "num_input_tokens_seen": 390427728, + "router_z_loss_mlp": 0.27709961, + "step": 4719, + "time_per_iteration": 3.0667171478271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070774, + "balance_loss_mlp": 1.04249716, + "epoch": 0.9080415544440169, + "flos": 654464171520.0, + "grad_norm": 0.05232334449211869, + "language_loss": 0.86633104, + "learning_rate": 2.2012893586322245e-05, + "loss": 0.87703872, + "num_input_tokens_seen": 390504736, + "router_z_loss_mlp": 0.28295898, + "step": 4720, + "time_per_iteration": 2.834167003631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066772, + "balance_loss_mlp": 1.03842413, + "epoch": 0.9082339361292805, + "flos": 597180059136.0, + "grad_norm": 0.051475992936928554, + "language_loss": 0.7933374, + "learning_rate": 2.1921564073563604e-05, + "loss": 0.80400515, + "num_input_tokens_seen": 390582048, + "router_z_loss_mlp": 0.28344727, + "step": 4721, + "time_per_iteration": 2.7342042922973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107087, + "balance_loss_mlp": 1.0426892, + "epoch": 0.9084263178145441, + "flos": 504154285056.0, + "grad_norm": 0.05504270564610462, + "language_loss": 0.8449378, + "learning_rate": 2.183042016731457e-05, + "loss": 0.85564649, + "num_input_tokens_seen": 390652976, + "router_z_loss_mlp": 0.28198242, + "step": 4722, + "time_per_iteration": 2.6053569316864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065295, + "balance_loss_mlp": 1.03692365, + "epoch": 0.9086186994998077, + "flos": 549763628544.0, + "grad_norm": 0.05937577628275322, + "language_loss": 0.8047967, + "learning_rate": 2.1739461902960223e-05, + "loss": 0.8154496, + "num_input_tokens_seen": 390726832, + "router_z_loss_mlp": 0.28393555, + "step": 4723, + "time_per_iteration": 2.7014620304107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106455, + "balance_loss_mlp": 1.03710771, + "epoch": 0.9088110811850711, + "flos": 1133620545024.0, + "grad_norm": 0.05397670601774565, + "language_loss": 0.7509287, + "learning_rate": 2.1648689315813763e-05, + "loss": 0.76157427, + "num_input_tokens_seen": 390824480, + "router_z_loss_mlp": 0.27490234, + "step": 4724, + "time_per_iteration": 3.5497186183929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067508, + "balance_loss_mlp": 1.03966045, + "epoch": 0.9090034628703347, + "flos": 556725726720.0, + "grad_norm": 0.06845603595782776, + "language_loss": 0.77022469, + "learning_rate": 2.155810244111628e-05, + "loss": 0.78089976, + "num_input_tokens_seen": 390897552, + "router_z_loss_mlp": 0.27856445, + "step": 4725, + "time_per_iteration": 2.6870620250701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066604, + "balance_loss_mlp": 1.03894758, + "epoch": 0.9091958445555983, + "flos": 543697795584.0, + "grad_norm": 0.06585038795867323, + "language_loss": 0.84378177, + "learning_rate": 2.146770131403658e-05, + "loss": 0.85444778, + "num_input_tokens_seen": 390969008, + "router_z_loss_mlp": 0.27709961, + "step": 4726, + "time_per_iteration": 2.6953237056732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065669, + "balance_loss_mlp": 1.0382266, + "epoch": 0.9093882262408619, + "flos": 525858269184.0, + "grad_norm": 0.06459966077589527, + "language_loss": 0.8105191, + "learning_rate": 2.1377485969671594e-05, + "loss": 0.82117581, + "num_input_tokens_seen": 391038880, + "router_z_loss_mlp": 0.27490234, + "step": 4727, + "time_per_iteration": 2.6618425846099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066073, + "balance_loss_mlp": 1.03808236, + "epoch": 0.9095806079261254, + "flos": 548266461696.0, + "grad_norm": 0.06317641405801941, + "language_loss": 0.81712091, + "learning_rate": 2.1287456443046084e-05, + "loss": 0.82778162, + "num_input_tokens_seen": 391106720, + "router_z_loss_mlp": 0.28027344, + "step": 4728, + "time_per_iteration": 2.6596298217773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065232, + "balance_loss_mlp": 1.03681278, + "epoch": 0.909772989611389, + "flos": 572260571136.0, + "grad_norm": 0.058766129071798213, + "language_loss": 0.84501958, + "learning_rate": 2.1197612769112528e-05, + "loss": 0.85567194, + "num_input_tokens_seen": 391178128, + "router_z_loss_mlp": 0.28417969, + "step": 4729, + "time_per_iteration": 2.7634377479553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064416, + "balance_loss_mlp": 1.03675914, + "epoch": 0.9099653712966526, + "flos": 561546086400.0, + "grad_norm": 0.07496188001150708, + "language_loss": 0.79495102, + "learning_rate": 2.1107954982751254e-05, + "loss": 0.80559516, + "num_input_tokens_seen": 391248848, + "router_z_loss_mlp": 0.27685547, + "step": 4730, + "time_per_iteration": 2.6741318702697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068767, + "balance_loss_mlp": 1.04056227, + "epoch": 0.9101577529819161, + "flos": 1093377208320.0, + "grad_norm": 0.06415098680348416, + "language_loss": 0.80079752, + "learning_rate": 2.101848311877069e-05, + "loss": 0.81148523, + "num_input_tokens_seen": 391328000, + "router_z_loss_mlp": 0.28186035, + "step": 4731, + "time_per_iteration": 3.351849317550659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067585, + "balance_loss_mlp": 1.03904653, + "epoch": 0.9103501346671797, + "flos": 445215854592.0, + "grad_norm": 0.062092517001545014, + "language_loss": 0.81994462, + "learning_rate": 2.092919721190678e-05, + "loss": 0.83062047, + "num_input_tokens_seen": 391391616, + "router_z_loss_mlp": 0.28491211, + "step": 4732, + "time_per_iteration": 2.600543737411499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068188, + "balance_loss_mlp": 1.03950608, + "epoch": 0.9105425163524432, + "flos": 500510997504.0, + "grad_norm": 0.06287463201438012, + "language_loss": 0.77314079, + "learning_rate": 2.0840097296823346e-05, + "loss": 0.78382266, + "num_input_tokens_seen": 391461312, + "router_z_loss_mlp": 0.28662109, + "step": 4733, + "time_per_iteration": 2.6176583766937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065533, + "balance_loss_mlp": 1.03747129, + "epoch": 0.9107348980377068, + "flos": 657206811648.0, + "grad_norm": 0.06599891093057128, + "language_loss": 0.83865237, + "learning_rate": 2.0751183408112162e-05, + "loss": 0.84930772, + "num_input_tokens_seen": 391542192, + "router_z_loss_mlp": 0.28076172, + "step": 4734, + "time_per_iteration": 2.8651516437530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106646, + "balance_loss_mlp": 1.03863621, + "epoch": 0.9109272797229704, + "flos": 553406916096.0, + "grad_norm": 0.07392297365567703, + "language_loss": 0.84923166, + "learning_rate": 2.066245558029256e-05, + "loss": 0.85989624, + "num_input_tokens_seen": 391609968, + "router_z_loss_mlp": 0.27856445, + "step": 4735, + "time_per_iteration": 2.628058433532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068003, + "balance_loss_mlp": 1.04001248, + "epoch": 0.911119661408234, + "flos": 518757958656.0, + "grad_norm": 0.06239826153412266, + "language_loss": 0.84246588, + "learning_rate": 2.057391384781182e-05, + "loss": 0.85314584, + "num_input_tokens_seen": 391681264, + "router_z_loss_mlp": 0.2800293, + "step": 4736, + "time_per_iteration": 2.6526265144348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066493, + "balance_loss_mlp": 1.03790689, + "epoch": 0.9113120430934974, + "flos": 554111124480.0, + "grad_norm": 0.05558966408971301, + "language_loss": 0.83016825, + "learning_rate": 2.0485558245044834e-05, + "loss": 0.84083319, + "num_input_tokens_seen": 391751392, + "router_z_loss_mlp": 0.28564453, + "step": 4737, + "time_per_iteration": 2.6624600887298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064489, + "balance_loss_mlp": 1.03611708, + "epoch": 0.911504424778761, + "flos": 501624050688.0, + "grad_norm": 0.06145383290776928, + "language_loss": 0.8102991, + "learning_rate": 2.0397388806294216e-05, + "loss": 0.82094395, + "num_input_tokens_seen": 391823952, + "router_z_loss_mlp": 0.28369141, + "step": 4738, + "time_per_iteration": 2.7522430419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072434, + "balance_loss_mlp": 1.04391873, + "epoch": 0.9116968064640246, + "flos": 610823448576.0, + "grad_norm": 0.052355603259844785, + "language_loss": 0.82169437, + "learning_rate": 2.0309405565790527e-05, + "loss": 0.8324188, + "num_input_tokens_seen": 391895264, + "router_z_loss_mlp": 0.28515625, + "step": 4739, + "time_per_iteration": 2.7757930755615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068519, + "balance_loss_mlp": 1.04029012, + "epoch": 0.9118891881492882, + "flos": 572625745920.0, + "grad_norm": 0.06553339401592277, + "language_loss": 0.82452631, + "learning_rate": 2.0221608557691895e-05, + "loss": 0.83521152, + "num_input_tokens_seen": 391973040, + "router_z_loss_mlp": 0.28222656, + "step": 4740, + "time_per_iteration": 2.8027913570404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064254, + "balance_loss_mlp": 1.03695512, + "epoch": 0.9120815698345518, + "flos": 635659978752.0, + "grad_norm": 0.055176500742542135, + "language_loss": 0.77731133, + "learning_rate": 2.0133997816083992e-05, + "loss": 0.78795385, + "num_input_tokens_seen": 392048160, + "router_z_loss_mlp": 0.27319336, + "step": 4741, + "time_per_iteration": 2.8225715160369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066383, + "balance_loss_mlp": 1.03879797, + "epoch": 0.9122739515198153, + "flos": 701988291072.0, + "grad_norm": 0.06489997535399476, + "language_loss": 0.85749066, + "learning_rate": 2.0046573374980447e-05, + "loss": 0.86815447, + "num_input_tokens_seen": 392128960, + "router_z_loss_mlp": 0.27587891, + "step": 4742, + "time_per_iteration": 2.8944971561431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071381, + "balance_loss_mlp": 1.04346228, + "epoch": 0.9124663332050789, + "flos": 524435295744.0, + "grad_norm": 0.06824129090140331, + "language_loss": 0.8727017, + "learning_rate": 1.995933526832239e-05, + "loss": 0.88341552, + "num_input_tokens_seen": 392195008, + "router_z_loss_mlp": 0.27954102, + "step": 4743, + "time_per_iteration": 2.675344705581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063959, + "balance_loss_mlp": 1.03596842, + "epoch": 0.9126587148903424, + "flos": 563033078784.0, + "grad_norm": 0.06616707154942209, + "language_loss": 0.82495749, + "learning_rate": 1.9872283529978662e-05, + "loss": 0.83559716, + "num_input_tokens_seen": 392265168, + "router_z_loss_mlp": 0.2800293, + "step": 4744, + "time_per_iteration": 2.6696653366088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063829, + "balance_loss_mlp": 1.03543317, + "epoch": 0.912851096575606, + "flos": 505695121920.0, + "grad_norm": 0.06073199145800207, + "language_loss": 0.80086148, + "learning_rate": 1.978541819374574e-05, + "loss": 0.81149977, + "num_input_tokens_seen": 392329456, + "router_z_loss_mlp": 0.28393555, + "step": 4745, + "time_per_iteration": 2.578810930252075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067604, + "balance_loss_mlp": 1.03970885, + "epoch": 0.9130434782608695, + "flos": 550472219136.0, + "grad_norm": 0.05936218936651509, + "language_loss": 0.82134587, + "learning_rate": 1.9698739293347755e-05, + "loss": 0.83202189, + "num_input_tokens_seen": 392397792, + "router_z_loss_mlp": 0.27905273, + "step": 4746, + "time_per_iteration": 2.6668622493743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068401, + "balance_loss_mlp": 1.04005289, + "epoch": 0.9132358599461331, + "flos": 468737100288.0, + "grad_norm": 0.05782738716134406, + "language_loss": 0.83086479, + "learning_rate": 1.9612246862436456e-05, + "loss": 0.84154886, + "num_input_tokens_seen": 392462928, + "router_z_loss_mlp": 0.28344727, + "step": 4747, + "time_per_iteration": 2.540804147720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060155, + "balance_loss_mlp": 1.03197372, + "epoch": 0.9134282416313967, + "flos": 505847890944.0, + "grad_norm": 0.06478397348859542, + "language_loss": 0.79643875, + "learning_rate": 1.9525940934591148e-05, + "loss": 0.80704033, + "num_input_tokens_seen": 392531840, + "router_z_loss_mlp": 0.28173828, + "step": 4748, + "time_per_iteration": 2.716663122177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106669, + "balance_loss_mlp": 1.03808033, + "epoch": 0.9136206233166603, + "flos": 604540827648.0, + "grad_norm": 0.06062197289258145, + "language_loss": 0.84058869, + "learning_rate": 1.9439821543318748e-05, + "loss": 0.85125566, + "num_input_tokens_seen": 392602464, + "router_z_loss_mlp": 0.28613281, + "step": 4749, + "time_per_iteration": 2.7612674236297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066673, + "balance_loss_mlp": 1.03830099, + "epoch": 0.9138130050019239, + "flos": 561467510784.0, + "grad_norm": 0.05734158508663424, + "language_loss": 0.83067048, + "learning_rate": 1.9353888722053793e-05, + "loss": 0.84133726, + "num_input_tokens_seen": 392669872, + "router_z_loss_mlp": 0.28369141, + "step": 4750, + "time_per_iteration": 2.6877682209014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065085, + "balance_loss_mlp": 1.03695142, + "epoch": 0.9140053866871873, + "flos": 689811545088.0, + "grad_norm": 0.051169949793753604, + "language_loss": 0.89908755, + "learning_rate": 1.9268142504158426e-05, + "loss": 0.90973842, + "num_input_tokens_seen": 392744256, + "router_z_loss_mlp": 0.28173828, + "step": 4751, + "time_per_iteration": 2.8582828044891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064321, + "balance_loss_mlp": 1.03659272, + "epoch": 0.9141977683724509, + "flos": 550734087168.0, + "grad_norm": 0.050902490738550396, + "language_loss": 0.83958328, + "learning_rate": 1.9182582922922186e-05, + "loss": 0.85022652, + "num_input_tokens_seen": 392816832, + "router_z_loss_mlp": 0.27758789, + "step": 4752, + "time_per_iteration": 2.700676918029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067305, + "balance_loss_mlp": 1.0389812, + "epoch": 0.9143901500577145, + "flos": 539831927808.0, + "grad_norm": 0.05842457753383261, + "language_loss": 0.7560339, + "learning_rate": 1.9097210011562228e-05, + "loss": 0.76670694, + "num_input_tokens_seen": 392886304, + "router_z_loss_mlp": 0.28320312, + "step": 4753, + "time_per_iteration": 2.669036626815796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064355, + "balance_loss_mlp": 1.03615046, + "epoch": 0.9145825317429781, + "flos": 528512159232.0, + "grad_norm": 0.05637418626712114, + "language_loss": 0.80865467, + "learning_rate": 1.9012023803223366e-05, + "loss": 0.81929827, + "num_input_tokens_seen": 392955872, + "router_z_loss_mlp": 0.2824707, + "step": 4754, + "time_per_iteration": 2.6243560314178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065748, + "balance_loss_mlp": 1.03723359, + "epoch": 0.9147749134282416, + "flos": 514538500608.0, + "grad_norm": 0.06489204553695826, + "language_loss": 0.7878328, + "learning_rate": 1.892702433097776e-05, + "loss": 0.79849029, + "num_input_tokens_seen": 393025776, + "router_z_loss_mlp": 0.28515625, + "step": 4755, + "time_per_iteration": 2.6430461406707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066615, + "balance_loss_mlp": 1.03900671, + "epoch": 0.9149672951135052, + "flos": 514174735872.0, + "grad_norm": 0.05282624485424685, + "language_loss": 0.85728586, + "learning_rate": 1.8842211627825233e-05, + "loss": 0.86795199, + "num_input_tokens_seen": 393095936, + "router_z_loss_mlp": 0.27661133, + "step": 4756, + "time_per_iteration": 2.6440606117248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067698, + "balance_loss_mlp": 1.03958797, + "epoch": 0.9151596767987688, + "flos": 576781185024.0, + "grad_norm": 0.054456788510216333, + "language_loss": 0.81087077, + "learning_rate": 1.8757585726692727e-05, + "loss": 0.82154775, + "num_input_tokens_seen": 393166816, + "router_z_loss_mlp": 0.28125, + "step": 4757, + "time_per_iteration": 2.7387938499450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010674, + "balance_loss_mlp": 1.03940928, + "epoch": 0.9153520584840323, + "flos": 619051368960.0, + "grad_norm": 0.05987446215333431, + "language_loss": 0.8248508, + "learning_rate": 1.8673146660435182e-05, + "loss": 0.8355248, + "num_input_tokens_seen": 393242176, + "router_z_loss_mlp": 0.2800293, + "step": 4758, + "time_per_iteration": 2.7512242794036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066234, + "balance_loss_mlp": 1.0381248, + "epoch": 0.9155444401692959, + "flos": 468687638016.0, + "grad_norm": 0.05345227999499999, + "language_loss": 0.82700217, + "learning_rate": 1.8588894461834704e-05, + "loss": 0.83766448, + "num_input_tokens_seen": 393311792, + "router_z_loss_mlp": 0.28125, + "step": 4759, + "time_per_iteration": 2.608558177947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012165, + "balance_loss_mlp": 1.0007689, + "epoch": 0.9157368218545594, + "flos": 1409931601920.0, + "grad_norm": 0.005458035356382807, + "language_loss": 0.7481907, + "learning_rate": 1.8504829163600855e-05, + "loss": 0.75831234, + "num_input_tokens_seen": 393535648, + "router_z_loss_mlp": 0.11376953, + "step": 4760, + "time_per_iteration": 4.859896898269653 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012165, + "balance_loss_mlp": 1.00076854, + "epoch": 0.915929203539823, + "flos": 1521195572736.0, + "grad_norm": 0.005460828305516296, + "language_loss": 0.79576051, + "learning_rate": 1.8420950798370584e-05, + "loss": 0.80588222, + "num_input_tokens_seen": 393767040, + "router_z_loss_mlp": 0.11376953, + "step": 4761, + "time_per_iteration": 4.915686368942261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066102, + "balance_loss_mlp": 1.03811181, + "epoch": 0.9161215852250866, + "flos": 535480049664.0, + "grad_norm": 0.06450242723998267, + "language_loss": 0.80469358, + "learning_rate": 1.8337259398708616e-05, + "loss": 0.81535459, + "num_input_tokens_seen": 393841232, + "router_z_loss_mlp": 0.2800293, + "step": 4762, + "time_per_iteration": 2.695746421813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066403, + "balance_loss_mlp": 1.03860331, + "epoch": 0.9163139669103502, + "flos": 590350381056.0, + "grad_norm": 0.06265162160460012, + "language_loss": 0.80308187, + "learning_rate": 1.8253754997106632e-05, + "loss": 0.81374586, + "num_input_tokens_seen": 393910512, + "router_z_loss_mlp": 0.27807617, + "step": 4763, + "time_per_iteration": 2.71388840675354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064606, + "balance_loss_mlp": 1.03549504, + "epoch": 0.9165063485956138, + "flos": 821627159040.0, + "grad_norm": 0.04894944964333379, + "language_loss": 0.84645033, + "learning_rate": 1.817043762598397e-05, + "loss": 0.85709637, + "num_input_tokens_seen": 393988624, + "router_z_loss_mlp": 0.29125977, + "step": 4764, + "time_per_iteration": 3.070787191390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064261, + "balance_loss_mlp": 1.03619957, + "epoch": 0.9166987302808772, + "flos": 524932890624.0, + "grad_norm": 0.06771662225705596, + "language_loss": 0.81886947, + "learning_rate": 1.8087307317687264e-05, + "loss": 0.82951206, + "num_input_tokens_seen": 394059184, + "router_z_loss_mlp": 0.28100586, + "step": 4765, + "time_per_iteration": 2.6674678325653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066384, + "balance_loss_mlp": 1.03829777, + "epoch": 0.9168911119661408, + "flos": 654784266240.0, + "grad_norm": 0.31016783948163307, + "language_loss": 0.84433573, + "learning_rate": 1.800436410449058e-05, + "loss": 0.85499954, + "num_input_tokens_seen": 394142160, + "router_z_loss_mlp": 0.28125, + "step": 4766, + "time_per_iteration": 2.902374267578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067891, + "balance_loss_mlp": 1.03980517, + "epoch": 0.9170834936514044, + "flos": 491504675328.0, + "grad_norm": 0.06885835194999351, + "language_loss": 0.84648538, + "learning_rate": 1.7921608018595436e-05, + "loss": 0.85716426, + "num_input_tokens_seen": 394207056, + "router_z_loss_mlp": 0.28100586, + "step": 4767, + "time_per_iteration": 2.538447141647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062538, + "balance_loss_mlp": 1.03500056, + "epoch": 0.917275875336668, + "flos": 627756535296.0, + "grad_norm": 0.061905177796194595, + "language_loss": 0.80407935, + "learning_rate": 1.7839039092130415e-05, + "loss": 0.81470478, + "num_input_tokens_seen": 394275456, + "router_z_loss_mlp": 0.27587891, + "step": 4768, + "time_per_iteration": 2.7707064151763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013969, + "balance_loss_mlp": 1.00262046, + "epoch": 0.9174682570219315, + "flos": 1517176935936.0, + "grad_norm": 0.007402133526123785, + "language_loss": 0.78180236, + "learning_rate": 1.7756657357151762e-05, + "loss": 0.79194206, + "num_input_tokens_seen": 394503808, + "router_z_loss_mlp": 0.11328125, + "step": 4769, + "time_per_iteration": 4.909727096557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065476, + "balance_loss_mlp": 1.03796232, + "epoch": 0.917660638707195, + "flos": 559749173760.0, + "grad_norm": 0.05411764777194339, + "language_loss": 0.85018283, + "learning_rate": 1.7674462845642835e-05, + "loss": 0.86083758, + "num_input_tokens_seen": 394573776, + "router_z_loss_mlp": 0.27490234, + "step": 4770, + "time_per_iteration": 2.6659553050994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067691, + "balance_loss_mlp": 1.03958189, + "epoch": 0.9178530203924586, + "flos": 447022941696.0, + "grad_norm": 0.05527017290762028, + "language_loss": 0.8355031, + "learning_rate": 1.7592455589514387e-05, + "loss": 0.84617996, + "num_input_tokens_seen": 394637600, + "router_z_loss_mlp": 0.28149414, + "step": 4771, + "time_per_iteration": 2.547340154647827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064625, + "balance_loss_mlp": 1.03634822, + "epoch": 0.9180454020777222, + "flos": 465734002176.0, + "grad_norm": 0.06001441010446878, + "language_loss": 0.80642879, + "learning_rate": 1.7510635620604453e-05, + "loss": 0.81707501, + "num_input_tokens_seen": 394707344, + "router_z_loss_mlp": 0.28295898, + "step": 4772, + "time_per_iteration": 2.5512609481811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064768, + "balance_loss_mlp": 1.03744531, + "epoch": 0.9182377837629858, + "flos": 596023335936.0, + "grad_norm": 0.06645756172963627, + "language_loss": 0.87070215, + "learning_rate": 1.74290029706784e-05, + "loss": 0.8813498, + "num_input_tokens_seen": 394786368, + "router_z_loss_mlp": 0.2734375, + "step": 4773, + "time_per_iteration": 2.845562219619751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066198, + "balance_loss_mlp": 1.03706312, + "epoch": 0.9184301654482493, + "flos": 996251249664.0, + "grad_norm": 0.05732398262370566, + "language_loss": 0.82560432, + "learning_rate": 1.734755767142876e-05, + "loss": 0.83626628, + "num_input_tokens_seen": 394876976, + "router_z_loss_mlp": 0.29125977, + "step": 4774, + "time_per_iteration": 3.335674524307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064966, + "balance_loss_mlp": 1.03685653, + "epoch": 0.9186225471335129, + "flos": 508600705536.0, + "grad_norm": 0.04683275579109834, + "language_loss": 0.84353292, + "learning_rate": 1.7266299754475467e-05, + "loss": 0.8541826, + "num_input_tokens_seen": 394949024, + "router_z_loss_mlp": 0.28100586, + "step": 4775, + "time_per_iteration": 2.6537563800811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065661, + "balance_loss_mlp": 1.03690755, + "epoch": 0.9188149288187765, + "flos": 940011789312.0, + "grad_norm": 0.05975738892977174, + "language_loss": 0.7872526, + "learning_rate": 1.718522925136551e-05, + "loss": 0.79790926, + "num_input_tokens_seen": 395044352, + "router_z_loss_mlp": 0.28759766, + "step": 4776, + "time_per_iteration": 3.2783892154693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060807, + "balance_loss_mlp": 1.03284085, + "epoch": 0.91900731050404, + "flos": 583402839552.0, + "grad_norm": 0.05439818019426215, + "language_loss": 0.83903718, + "learning_rate": 1.7104346193573484e-05, + "loss": 0.84964526, + "num_input_tokens_seen": 395113824, + "router_z_loss_mlp": 0.2800293, + "step": 4777, + "time_per_iteration": 2.7296934127807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063777, + "balance_loss_mlp": 1.03614461, + "epoch": 0.9191996921893035, + "flos": 580941006336.0, + "grad_norm": 0.06874414122365366, + "language_loss": 0.79326808, + "learning_rate": 1.7023650612500828e-05, + "loss": 0.80390579, + "num_input_tokens_seen": 395184496, + "router_z_loss_mlp": 0.27661133, + "step": 4778, + "time_per_iteration": 2.688161849975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106495, + "balance_loss_mlp": 1.03626871, + "epoch": 0.9193920738745671, + "flos": 908566751232.0, + "grad_norm": 0.06089128327905484, + "language_loss": 0.80218065, + "learning_rate": 1.6943142539476374e-05, + "loss": 0.81283021, + "num_input_tokens_seen": 395263760, + "router_z_loss_mlp": 0.28710938, + "step": 4779, + "time_per_iteration": 3.1092312335968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017454, + "balance_loss_mlp": 1.00615335, + "epoch": 0.9195844555598307, + "flos": 1557557074944.0, + "grad_norm": 0.00875582721078654, + "language_loss": 0.79795396, + "learning_rate": 1.686282200575606e-05, + "loss": 0.80812848, + "num_input_tokens_seen": 395482384, + "router_z_loss_mlp": 0.11279297, + "step": 4780, + "time_per_iteration": 4.68978214263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064328, + "balance_loss_mlp": 1.03633761, + "epoch": 0.9197768372450943, + "flos": 473813535744.0, + "grad_norm": 0.07949185177818381, + "language_loss": 0.78788704, + "learning_rate": 1.678268904252317e-05, + "loss": 0.79853034, + "num_input_tokens_seen": 395550384, + "router_z_loss_mlp": 0.2800293, + "step": 4781, + "time_per_iteration": 2.57722544670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064183, + "balance_loss_mlp": 1.03614509, + "epoch": 0.9199692189303579, + "flos": 856622352384.0, + "grad_norm": 0.0600534002616839, + "language_loss": 0.84106392, + "learning_rate": 1.6702743680888088e-05, + "loss": 0.85170579, + "num_input_tokens_seen": 395632320, + "router_z_loss_mlp": 0.28051758, + "step": 4782, + "time_per_iteration": 3.200462818145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069787, + "balance_loss_mlp": 1.04131949, + "epoch": 0.9201616006156214, + "flos": 504144110592.0, + "grad_norm": 0.06544428358770026, + "language_loss": 0.7733472, + "learning_rate": 1.6622985951888327e-05, + "loss": 0.78404504, + "num_input_tokens_seen": 395703856, + "router_z_loss_mlp": 0.28442383, + "step": 4783, + "time_per_iteration": 2.631211042404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064893, + "balance_loss_mlp": 1.03697455, + "epoch": 0.9203539823008849, + "flos": 548503598592.0, + "grad_norm": 0.052623927296135346, + "language_loss": 0.84798336, + "learning_rate": 1.6543415886488554e-05, + "loss": 0.85863233, + "num_input_tokens_seen": 395779456, + "router_z_loss_mlp": 0.27929688, + "step": 4784, + "time_per_iteration": 2.7127954959869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065976, + "balance_loss_mlp": 1.03824794, + "epoch": 0.9205463639861485, + "flos": 539738795520.0, + "grad_norm": 0.10290216948314322, + "language_loss": 0.82366759, + "learning_rate": 1.6464033515580624e-05, + "loss": 0.83432734, + "num_input_tokens_seen": 395849584, + "router_z_loss_mlp": 0.27734375, + "step": 4785, + "time_per_iteration": 2.635606527328491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064956, + "balance_loss_mlp": 1.03684688, + "epoch": 0.9207387456714121, + "flos": 799367353344.0, + "grad_norm": 0.0634001865555332, + "language_loss": 0.77785552, + "learning_rate": 1.6384838869983488e-05, + "loss": 0.78850508, + "num_input_tokens_seen": 395943712, + "router_z_loss_mlp": 0.28125, + "step": 4786, + "time_per_iteration": 3.0849039554595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067889, + "balance_loss_mlp": 1.03977942, + "epoch": 0.9209311273566756, + "flos": 502607655936.0, + "grad_norm": 0.05967809159970589, + "language_loss": 0.78313106, + "learning_rate": 1.630583198044333e-05, + "loss": 0.79380995, + "num_input_tokens_seen": 396013168, + "router_z_loss_mlp": 0.28125, + "step": 4787, + "time_per_iteration": 2.667234182357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066454, + "balance_loss_mlp": 1.03841579, + "epoch": 0.9211235090419392, + "flos": 569059623936.0, + "grad_norm": 0.06420045870845124, + "language_loss": 0.82682192, + "learning_rate": 1.6227012877633173e-05, + "loss": 0.83748651, + "num_input_tokens_seen": 396082032, + "router_z_loss_mlp": 0.28051758, + "step": 4788, + "time_per_iteration": 2.7027649879455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066666, + "balance_loss_mlp": 1.03879452, + "epoch": 0.9213158907272028, + "flos": 806205795840.0, + "grad_norm": 0.07333708723290279, + "language_loss": 0.82389617, + "learning_rate": 1.6148381592153538e-05, + "loss": 0.8345629, + "num_input_tokens_seen": 396157984, + "router_z_loss_mlp": 0.27880859, + "step": 4789, + "time_per_iteration": 3.0040667057037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064674, + "balance_loss_mlp": 1.03623104, + "epoch": 0.9215082724124664, + "flos": 490441084416.0, + "grad_norm": 0.05377371866871187, + "language_loss": 0.75874245, + "learning_rate": 1.6069938154531618e-05, + "loss": 0.76938921, + "num_input_tokens_seen": 396223840, + "router_z_loss_mlp": 0.28442383, + "step": 4790, + "time_per_iteration": 2.5756027698516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011385, + "balance_loss_mlp": 1.0000844, + "epoch": 0.9217006540977299, + "flos": 1513648539648.0, + "grad_norm": 0.004042788422816454, + "language_loss": 0.77070266, + "learning_rate": 1.599168259522188e-05, + "loss": 0.7808165, + "num_input_tokens_seen": 396458288, + "router_z_loss_mlp": 0.11279297, + "step": 4791, + "time_per_iteration": 4.973644018173218 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067727, + "balance_loss_mlp": 1.03909278, + "epoch": 0.9218930357829934, + "flos": 743471308800.0, + "grad_norm": 0.057601458706605435, + "language_loss": 0.76279974, + "learning_rate": 1.5913614944605804e-05, + "loss": 0.77347702, + "num_input_tokens_seen": 396536208, + "router_z_loss_mlp": 0.28613281, + "step": 4792, + "time_per_iteration": 2.9516866207122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060038, + "balance_loss_mlp": 1.03223848, + "epoch": 0.922085417468257, + "flos": 452803585536.0, + "grad_norm": 0.05595873441646043, + "language_loss": 0.80235362, + "learning_rate": 1.5835735232992032e-05, + "loss": 0.81295407, + "num_input_tokens_seen": 396599984, + "router_z_loss_mlp": 0.27807617, + "step": 4793, + "time_per_iteration": 2.5641462802886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062665, + "balance_loss_mlp": 1.03479338, + "epoch": 0.9222777991535206, + "flos": 500003228160.0, + "grad_norm": 0.059727249910016274, + "language_loss": 0.85150099, + "learning_rate": 1.575804349061616e-05, + "loss": 0.86212766, + "num_input_tokens_seen": 396664592, + "router_z_loss_mlp": 0.27880859, + "step": 4794, + "time_per_iteration": 2.593534231185913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069605, + "balance_loss_mlp": 1.04101813, + "epoch": 0.9224701808387842, + "flos": 527704644096.0, + "grad_norm": 0.10466348651463832, + "language_loss": 0.78718358, + "learning_rate": 1.5680539747640722e-05, + "loss": 0.79787964, + "num_input_tokens_seen": 396729472, + "router_z_loss_mlp": 0.28540039, + "step": 4795, + "time_per_iteration": 2.6495256423950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066319, + "balance_loss_mlp": 1.03882968, + "epoch": 0.9226625625240477, + "flos": 874272794112.0, + "grad_norm": 0.05332169239610604, + "language_loss": 0.75038373, + "learning_rate": 1.5603224034155315e-05, + "loss": 0.76104683, + "num_input_tokens_seen": 396810384, + "router_z_loss_mlp": 0.27563477, + "step": 4796, + "time_per_iteration": 3.1437277793884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067759, + "balance_loss_mlp": 1.04029274, + "epoch": 0.9228549442093112, + "flos": 502529080320.0, + "grad_norm": 0.06430684376904929, + "language_loss": 0.88128197, + "learning_rate": 1.5526096380176657e-05, + "loss": 0.89195955, + "num_input_tokens_seen": 396875472, + "router_z_loss_mlp": 0.27490234, + "step": 4797, + "time_per_iteration": 2.5683467388153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106672, + "balance_loss_mlp": 1.03853893, + "epoch": 0.9230473258945748, + "flos": 599705911296.0, + "grad_norm": 0.05378810065548013, + "language_loss": 0.8519541, + "learning_rate": 1.544915681564829e-05, + "loss": 0.86262131, + "num_input_tokens_seen": 396949888, + "router_z_loss_mlp": 0.28173828, + "step": 4798, + "time_per_iteration": 2.7920963764190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068406, + "balance_loss_mlp": 1.04027247, + "epoch": 0.9232397075798384, + "flos": 822168423936.0, + "grad_norm": 0.056516225106437785, + "language_loss": 0.79586041, + "learning_rate": 1.5372405370440822e-05, + "loss": 0.80654448, + "num_input_tokens_seen": 397027504, + "router_z_loss_mlp": 0.28149414, + "step": 4799, + "time_per_iteration": 3.106332302093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063986, + "balance_loss_mlp": 1.03666353, + "epoch": 0.923432089265102, + "flos": 706719900672.0, + "grad_norm": 0.056508741932020275, + "language_loss": 0.84739339, + "learning_rate": 1.5295842074351805e-05, + "loss": 0.8580333, + "num_input_tokens_seen": 397101600, + "router_z_loss_mlp": 0.2734375, + "step": 4800, + "time_per_iteration": 2.9519155025482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067591, + "balance_loss_mlp": 1.03986311, + "epoch": 0.9236244709503655, + "flos": 701554715136.0, + "grad_norm": 0.066698066809805, + "language_loss": 0.76543391, + "learning_rate": 1.5219466957105798e-05, + "loss": 0.77610976, + "num_input_tokens_seen": 397170880, + "router_z_loss_mlp": 0.27758789, + "step": 4801, + "time_per_iteration": 2.848271131515503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067135, + "balance_loss_mlp": 1.03895378, + "epoch": 0.9238168526356291, + "flos": 514780019712.0, + "grad_norm": 0.06767841567088752, + "language_loss": 0.83995795, + "learning_rate": 1.5143280048354136e-05, + "loss": 0.85062933, + "num_input_tokens_seen": 397242272, + "router_z_loss_mlp": 0.28198242, + "step": 4802, + "time_per_iteration": 2.6541097164154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067162, + "balance_loss_mlp": 1.03881443, + "epoch": 0.9240092343208927, + "flos": 491789864448.0, + "grad_norm": 0.05859515366880438, + "language_loss": 0.81224668, + "learning_rate": 1.5067281377675213e-05, + "loss": 0.8229183, + "num_input_tokens_seen": 397308032, + "router_z_loss_mlp": 0.28369141, + "step": 4803, + "time_per_iteration": 2.564298391342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064444, + "balance_loss_mlp": 1.03745484, + "epoch": 0.9242016160061562, + "flos": 646915728384.0, + "grad_norm": 0.0605331222731167, + "language_loss": 0.73584902, + "learning_rate": 1.4991470974574484e-05, + "loss": 0.74649346, + "num_input_tokens_seen": 397390944, + "router_z_loss_mlp": 0.27026367, + "step": 4804, + "time_per_iteration": 2.9267265796661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065448, + "balance_loss_mlp": 1.0383395, + "epoch": 0.9243939976914197, + "flos": 729094597632.0, + "grad_norm": 0.06139671428886114, + "language_loss": 0.79408431, + "learning_rate": 1.4915848868484016e-05, + "loss": 0.80473882, + "num_input_tokens_seen": 397468128, + "router_z_loss_mlp": 0.27148438, + "step": 4805, + "time_per_iteration": 3.0169341564178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068038, + "balance_loss_mlp": 1.0397377, + "epoch": 0.9245863793766833, + "flos": 452006244864.0, + "grad_norm": 0.07511640502598389, + "language_loss": 0.90425861, + "learning_rate": 1.4840415088763048e-05, + "loss": 0.91493905, + "num_input_tokens_seen": 397538976, + "router_z_loss_mlp": 0.28320312, + "step": 4806, + "time_per_iteration": 2.5978922843933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106683, + "balance_loss_mlp": 1.03974569, + "epoch": 0.9247787610619469, + "flos": 754697945088.0, + "grad_norm": 0.06258160431282603, + "language_loss": 0.76767433, + "learning_rate": 1.476516966469732e-05, + "loss": 0.7783426, + "num_input_tokens_seen": 397612944, + "router_z_loss_mlp": 0.27124023, + "step": 4807, + "time_per_iteration": 2.956561803817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069239, + "balance_loss_mlp": 1.04010475, + "epoch": 0.9249711427472105, + "flos": 561640628736.0, + "grad_norm": 0.055778501240304965, + "language_loss": 0.84958422, + "learning_rate": 1.4690112625499908e-05, + "loss": 0.86027658, + "num_input_tokens_seen": 397690848, + "router_z_loss_mlp": 0.29125977, + "step": 4808, + "time_per_iteration": 2.7347190380096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066354, + "balance_loss_mlp": 1.03819704, + "epoch": 0.9251635244324741, + "flos": 526430057472.0, + "grad_norm": 0.06318769230203738, + "language_loss": 0.85102391, + "learning_rate": 1.4615244000310501e-05, + "loss": 0.86168742, + "num_input_tokens_seen": 397761008, + "router_z_loss_mlp": 0.28173828, + "step": 4809, + "time_per_iteration": 2.6991071701049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065383, + "balance_loss_mlp": 1.03724957, + "epoch": 0.9253559061177375, + "flos": 610982009856.0, + "grad_norm": 0.10096377008479462, + "language_loss": 0.7862674, + "learning_rate": 1.4540563818195685e-05, + "loss": 0.79692125, + "num_input_tokens_seen": 397840640, + "router_z_loss_mlp": 0.28149414, + "step": 4810, + "time_per_iteration": 2.8498053550720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101132, + "balance_loss_mlp": 1.00006664, + "epoch": 0.9255482878030011, + "flos": 1550461146624.0, + "grad_norm": 0.00391096047645047, + "language_loss": 0.76925391, + "learning_rate": 1.446607210814882e-05, + "loss": 0.77936709, + "num_input_tokens_seen": 398060096, + "router_z_loss_mlp": 0.11230469, + "step": 4811, + "time_per_iteration": 4.764047861099243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106661, + "balance_loss_mlp": 1.03804743, + "epoch": 0.9257406694882647, + "flos": 766008949248.0, + "grad_norm": 0.06895968648743506, + "language_loss": 0.80879593, + "learning_rate": 1.4391768899090219e-05, + "loss": 0.81946206, + "num_input_tokens_seen": 398143680, + "router_z_loss_mlp": 0.28588867, + "step": 4812, + "time_per_iteration": 3.061112880706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063531, + "balance_loss_mlp": 1.03599334, + "epoch": 0.9259330511735283, + "flos": 497748008448.0, + "grad_norm": 0.058143604426202734, + "language_loss": 0.83249688, + "learning_rate": 1.431765421986686e-05, + "loss": 0.8431322, + "num_input_tokens_seen": 398207056, + "router_z_loss_mlp": 0.27563477, + "step": 4813, + "time_per_iteration": 2.643853187561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067291, + "balance_loss_mlp": 1.03941989, + "epoch": 0.9261254328587919, + "flos": 626589637632.0, + "grad_norm": 0.061231728055144215, + "language_loss": 0.79372674, + "learning_rate": 1.424372809925273e-05, + "loss": 0.80439967, + "num_input_tokens_seen": 398277472, + "router_z_loss_mlp": 0.27880859, + "step": 4814, + "time_per_iteration": 2.793656349182129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067503, + "balance_loss_mlp": 1.03970385, + "epoch": 0.9263178145440554, + "flos": 597105865728.0, + "grad_norm": 0.05834470087321101, + "language_loss": 0.85063499, + "learning_rate": 1.416999056594831e-05, + "loss": 0.86131001, + "num_input_tokens_seen": 398346544, + "router_z_loss_mlp": 0.27856445, + "step": 4815, + "time_per_iteration": 2.7759041786193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066213, + "balance_loss_mlp": 1.03779304, + "epoch": 0.926510196229319, + "flos": 388350761472.0, + "grad_norm": 0.07585655013860047, + "language_loss": 0.83582151, + "learning_rate": 1.4096441648581259e-05, + "loss": 0.84648359, + "num_input_tokens_seen": 398409344, + "router_z_loss_mlp": 0.28393555, + "step": 4816, + "time_per_iteration": 2.5199952125549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066914, + "balance_loss_mlp": 1.03937626, + "epoch": 0.9267025779145825, + "flos": 545533996032.0, + "grad_norm": 0.07628608002460177, + "language_loss": 0.8428371, + "learning_rate": 1.4023081375705737e-05, + "loss": 0.85350621, + "num_input_tokens_seen": 398478816, + "router_z_loss_mlp": 0.27563477, + "step": 4817, + "time_per_iteration": 2.6322243213653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069066, + "balance_loss_mlp": 1.04109931, + "epoch": 0.9268949595998461, + "flos": 499540538880.0, + "grad_norm": 0.056009628743491614, + "language_loss": 0.81786913, + "learning_rate": 1.3949909775802682e-05, + "loss": 0.82855976, + "num_input_tokens_seen": 398550384, + "router_z_loss_mlp": 0.27978516, + "step": 4818, + "time_per_iteration": 2.652061700820923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064068, + "balance_loss_mlp": 1.03619719, + "epoch": 0.9270873412851096, + "flos": 432601150464.0, + "grad_norm": 0.060612630955757626, + "language_loss": 0.82984769, + "learning_rate": 1.3876926877279817e-05, + "loss": 0.84048837, + "num_input_tokens_seen": 398620832, + "router_z_loss_mlp": 0.27880859, + "step": 4819, + "time_per_iteration": 2.693363666534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063444, + "balance_loss_mlp": 1.03633547, + "epoch": 0.9272797229703732, + "flos": 466512403968.0, + "grad_norm": 0.05620589668035287, + "language_loss": 0.85918975, + "learning_rate": 1.380413270847164e-05, + "loss": 0.86982417, + "num_input_tokens_seen": 398689776, + "router_z_loss_mlp": 0.27148438, + "step": 4820, + "time_per_iteration": 2.6197123527526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063733, + "balance_loss_mlp": 1.03567159, + "epoch": 0.9274721046556368, + "flos": 704486439936.0, + "grad_norm": 0.05510862949217126, + "language_loss": 0.78793794, + "learning_rate": 1.373152729763938e-05, + "loss": 0.79857528, + "num_input_tokens_seen": 398775072, + "router_z_loss_mlp": 0.28076172, + "step": 4821, + "time_per_iteration": 3.0157546997070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009196, + "balance_loss_mlp": 0.99789476, + "epoch": 0.9276644863409004, + "flos": 1401486893568.0, + "grad_norm": 0.004102053243796757, + "language_loss": 0.82380462, + "learning_rate": 1.3659110672970931e-05, + "loss": 0.83389658, + "num_input_tokens_seen": 399002016, + "router_z_loss_mlp": 0.11279297, + "step": 4822, + "time_per_iteration": 4.890833854675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065915, + "balance_loss_mlp": 1.03830612, + "epoch": 0.927856868026164, + "flos": 741370268160.0, + "grad_norm": 0.06118264568584961, + "language_loss": 0.80120695, + "learning_rate": 1.3586882862580917e-05, + "loss": 0.8118661, + "num_input_tokens_seen": 399085808, + "router_z_loss_mlp": 0.27636719, + "step": 4823, + "time_per_iteration": 3.035834550857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068353, + "balance_loss_mlp": 1.04052949, + "epoch": 0.9280492497114274, + "flos": 412000045056.0, + "grad_norm": 0.06263010480012486, + "language_loss": 0.7379303, + "learning_rate": 1.3514843894510686e-05, + "loss": 0.74861383, + "num_input_tokens_seen": 399146768, + "router_z_loss_mlp": 0.27832031, + "step": 4824, + "time_per_iteration": 2.461954355239868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067965, + "balance_loss_mlp": 1.03964114, + "epoch": 0.928241631396691, + "flos": 646215902208.0, + "grad_norm": 0.0608824957877414, + "language_loss": 0.84066081, + "learning_rate": 1.3442993796728254e-05, + "loss": 0.85134053, + "num_input_tokens_seen": 399220192, + "router_z_loss_mlp": 0.28320312, + "step": 4825, + "time_per_iteration": 2.787639617919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064738, + "balance_loss_mlp": 1.03674757, + "epoch": 0.9284340130819546, + "flos": 696537916416.0, + "grad_norm": 0.05281576196945734, + "language_loss": 0.80929303, + "learning_rate": 1.3371332597128249e-05, + "loss": 0.81994045, + "num_input_tokens_seen": 399300064, + "router_z_loss_mlp": 0.27978516, + "step": 4826, + "time_per_iteration": 2.9434425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106758, + "balance_loss_mlp": 1.03951824, + "epoch": 0.9286263947672182, + "flos": 758780600832.0, + "grad_norm": 0.05077416980052692, + "language_loss": 0.8357712, + "learning_rate": 1.3299860323532032e-05, + "loss": 0.84644705, + "num_input_tokens_seen": 399383200, + "router_z_loss_mlp": 0.28076172, + "step": 4827, + "time_per_iteration": 3.032938241958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065405, + "balance_loss_mlp": 1.03722429, + "epoch": 0.9288187764524817, + "flos": 672495754752.0, + "grad_norm": 0.05731069309896637, + "language_loss": 0.80015826, + "learning_rate": 1.3228577003687681e-05, + "loss": 0.81081235, + "num_input_tokens_seen": 399466400, + "router_z_loss_mlp": 0.28198242, + "step": 4828, + "time_per_iteration": 2.917309522628784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064578, + "balance_loss_mlp": 1.03715968, + "epoch": 0.9290111581377453, + "flos": 500220016128.0, + "grad_norm": 0.059623319858346985, + "language_loss": 0.83761036, + "learning_rate": 1.3157482665269727e-05, + "loss": 0.84825623, + "num_input_tokens_seen": 399533504, + "router_z_loss_mlp": 0.27490234, + "step": 4829, + "time_per_iteration": 2.6183080673217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007339, + "balance_loss_mlp": 0.99599022, + "epoch": 0.9292035398230089, + "flos": 1562773132800.0, + "grad_norm": 0.004808076693014282, + "language_loss": 0.72122061, + "learning_rate": 1.3086577335879424e-05, + "loss": 0.73129404, + "num_input_tokens_seen": 399769872, + "router_z_loss_mlp": 0.11328125, + "step": 4830, + "time_per_iteration": 4.92920446395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100734, + "balance_loss_mlp": 0.99599123, + "epoch": 0.9293959215082724, + "flos": 1517828709888.0, + "grad_norm": 0.004800596533171945, + "language_loss": 0.79511833, + "learning_rate": 1.3015861043044753e-05, + "loss": 0.80519176, + "num_input_tokens_seen": 399997760, + "router_z_loss_mlp": 0.11328125, + "step": 4831, + "time_per_iteration": 4.897861003875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068474, + "balance_loss_mlp": 1.04098451, + "epoch": 0.929588303193536, + "flos": 557572529664.0, + "grad_norm": 0.08301907154555206, + "language_loss": 0.84292293, + "learning_rate": 1.2945333814220195e-05, + "loss": 0.85360765, + "num_input_tokens_seen": 400063872, + "router_z_loss_mlp": 0.27514648, + "step": 4832, + "time_per_iteration": 2.660130023956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066483, + "balance_loss_mlp": 1.03892231, + "epoch": 0.9297806848787995, + "flos": 478338531840.0, + "grad_norm": 0.07295774266444127, + "language_loss": 0.79771066, + "learning_rate": 1.2874995676786905e-05, + "loss": 0.80837542, + "num_input_tokens_seen": 400126064, + "router_z_loss_mlp": 0.27612305, + "step": 4833, + "time_per_iteration": 2.603111982345581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064962, + "balance_loss_mlp": 1.03737652, + "epoch": 0.9299730665640631, + "flos": 564259613184.0, + "grad_norm": 0.05026331549954224, + "language_loss": 0.80081427, + "learning_rate": 1.2804846658052372e-05, + "loss": 0.81146395, + "num_input_tokens_seen": 400201776, + "router_z_loss_mlp": 0.27612305, + "step": 4834, + "time_per_iteration": 2.769982099533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069532, + "balance_loss_mlp": 1.04094601, + "epoch": 0.9301654482493267, + "flos": 559883003904.0, + "grad_norm": 0.06905746243332382, + "language_loss": 0.82341313, + "learning_rate": 1.2734886785251032e-05, + "loss": 0.83410847, + "num_input_tokens_seen": 400279504, + "router_z_loss_mlp": 0.28564453, + "step": 4835, + "time_per_iteration": 2.8149759769439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009283, + "balance_loss_mlp": 0.9979341, + "epoch": 0.9303578299345903, + "flos": 1519251683328.0, + "grad_norm": 0.00409976409094526, + "language_loss": 0.76852441, + "learning_rate": 1.2665116085543715e-05, + "loss": 0.77861726, + "num_input_tokens_seen": 400514800, + "router_z_loss_mlp": 0.11328125, + "step": 4836, + "time_per_iteration": 4.970911979675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063815, + "balance_loss_mlp": 1.03622973, + "epoch": 0.9305502116198537, + "flos": 530589878784.0, + "grad_norm": 0.06739289017975023, + "language_loss": 0.83095217, + "learning_rate": 1.2595534586017698e-05, + "loss": 0.84159029, + "num_input_tokens_seen": 400582640, + "router_z_loss_mlp": 0.27612305, + "step": 4837, + "time_per_iteration": 2.6105446815490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063157, + "balance_loss_mlp": 1.03542876, + "epoch": 0.9307425933051173, + "flos": 474660338688.0, + "grad_norm": 0.08448682706583123, + "language_loss": 0.81358898, + "learning_rate": 1.2526142313686983e-05, + "loss": 0.82422054, + "num_input_tokens_seen": 400646912, + "router_z_loss_mlp": 0.27758789, + "step": 4838, + "time_per_iteration": 2.5410094261169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066167, + "balance_loss_mlp": 1.03817677, + "epoch": 0.9309349749903809, + "flos": 584600260608.0, + "grad_norm": 0.056392712397074544, + "language_loss": 0.8662045, + "learning_rate": 1.245693929549213e-05, + "loss": 0.8768661, + "num_input_tokens_seen": 400722128, + "router_z_loss_mlp": 0.27978516, + "step": 4839, + "time_per_iteration": 2.7685422897338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068427, + "balance_loss_mlp": 1.04088926, + "epoch": 0.9311273566756445, + "flos": 861298707456.0, + "grad_norm": 0.05211302264948049, + "language_loss": 0.76617467, + "learning_rate": 1.2387925558299984e-05, + "loss": 0.77685893, + "num_input_tokens_seen": 400801440, + "router_z_loss_mlp": 0.27587891, + "step": 4840, + "time_per_iteration": 3.0811336040496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063666, + "balance_loss_mlp": 1.03512692, + "epoch": 0.9313197383609081, + "flos": 547828503552.0, + "grad_norm": 0.05632257898904223, + "language_loss": 0.82257402, + "learning_rate": 1.231910112890411e-05, + "loss": 0.83321071, + "num_input_tokens_seen": 400873008, + "router_z_loss_mlp": 0.28564453, + "step": 4841, + "time_per_iteration": 2.6456518173217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063146, + "balance_loss_mlp": 1.03546572, + "epoch": 0.9315121200461716, + "flos": 468520312320.0, + "grad_norm": 0.06835542650299899, + "language_loss": 0.81116635, + "learning_rate": 1.2250466034024522e-05, + "loss": 0.82179779, + "num_input_tokens_seen": 400935328, + "router_z_loss_mlp": 0.27709961, + "step": 4842, + "time_per_iteration": 2.51662015914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065674, + "balance_loss_mlp": 1.03777909, + "epoch": 0.9317045017314352, + "flos": 417435863040.0, + "grad_norm": 0.08457749327119382, + "language_loss": 0.77490675, + "learning_rate": 1.2182020300307684e-05, + "loss": 0.78556347, + "num_input_tokens_seen": 401000720, + "router_z_loss_mlp": 0.27929688, + "step": 4843, + "time_per_iteration": 2.5226502418518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067033, + "balance_loss_mlp": 1.03863692, + "epoch": 0.9318968834166987, + "flos": 540207277056.0, + "grad_norm": 0.0629878065349501, + "language_loss": 0.77101374, + "learning_rate": 1.2113763954326729e-05, + "loss": 0.78168404, + "num_input_tokens_seen": 401079664, + "router_z_loss_mlp": 0.28393555, + "step": 4844, + "time_per_iteration": 2.7928385734558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066678, + "balance_loss_mlp": 1.03959417, + "epoch": 0.9320892651019623, + "flos": 521077197312.0, + "grad_norm": 0.07893659024382914, + "language_loss": 0.80772531, + "learning_rate": 1.2045697022581015e-05, + "loss": 0.81839204, + "num_input_tokens_seen": 401146160, + "router_z_loss_mlp": 0.27148438, + "step": 4845, + "time_per_iteration": 2.6249451637268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066098, + "balance_loss_mlp": 1.03848934, + "epoch": 0.9322816467872258, + "flos": 581779044864.0, + "grad_norm": 0.073776710078104, + "language_loss": 0.80633116, + "learning_rate": 1.1977819531496348e-05, + "loss": 0.8169921, + "num_input_tokens_seen": 401223264, + "router_z_loss_mlp": 0.27636719, + "step": 4846, + "time_per_iteration": 2.76720929145813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066342, + "balance_loss_mlp": 1.03830385, + "epoch": 0.9324740284724894, + "flos": 484484350464.0, + "grad_norm": 0.0546236987081132, + "language_loss": 0.8169229, + "learning_rate": 1.191013150742537e-05, + "loss": 0.82758635, + "num_input_tokens_seen": 401296368, + "router_z_loss_mlp": 0.28051758, + "step": 4847, + "time_per_iteration": 2.705033540725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064662, + "balance_loss_mlp": 1.03544426, + "epoch": 0.932666410157753, + "flos": 732227143680.0, + "grad_norm": 0.05343238410156727, + "language_loss": 0.82291055, + "learning_rate": 1.1842632976646672e-05, + "loss": 0.83355719, + "num_input_tokens_seen": 401383936, + "router_z_loss_mlp": 0.29162598, + "step": 4848, + "time_per_iteration": 3.050323009490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062876, + "balance_loss_mlp": 1.03505254, + "epoch": 0.9328587918430166, + "flos": 965127716352.0, + "grad_norm": 0.05192892613374428, + "language_loss": 0.78535151, + "learning_rate": 1.1775323965365681e-05, + "loss": 0.79598027, + "num_input_tokens_seen": 401468784, + "router_z_loss_mlp": 0.27832031, + "step": 4849, + "time_per_iteration": 3.2585856914520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065264, + "balance_loss_mlp": 1.03708267, + "epoch": 0.9330511735282802, + "flos": 614270297088.0, + "grad_norm": 0.06344871555323196, + "language_loss": 0.80523133, + "learning_rate": 1.1708204499713936e-05, + "loss": 0.81588399, + "num_input_tokens_seen": 401539712, + "router_z_loss_mlp": 0.28173828, + "step": 4850, + "time_per_iteration": 2.7109756469726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067141, + "balance_loss_mlp": 1.03857827, + "epoch": 0.9332435552135436, + "flos": 558823795200.0, + "grad_norm": 0.056851249126662895, + "language_loss": 0.85547817, + "learning_rate": 1.1641274605749653e-05, + "loss": 0.86614954, + "num_input_tokens_seen": 401610432, + "router_z_loss_mlp": 0.28540039, + "step": 4851, + "time_per_iteration": 2.687770128250122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063314, + "balance_loss_mlp": 1.0353713, + "epoch": 0.9334359368988072, + "flos": 515281996800.0, + "grad_norm": 0.052446357449260315, + "language_loss": 0.81798899, + "learning_rate": 1.1574534309457208e-05, + "loss": 0.82862216, + "num_input_tokens_seen": 401677344, + "router_z_loss_mlp": 0.27954102, + "step": 4852, + "time_per_iteration": 2.627295970916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064942, + "balance_loss_mlp": 1.03778601, + "epoch": 0.9336283185840708, + "flos": 539527799808.0, + "grad_norm": 0.048937576786060644, + "language_loss": 0.82746959, + "learning_rate": 1.1507983636747488e-05, + "loss": 0.83811903, + "num_input_tokens_seen": 401756864, + "router_z_loss_mlp": 0.2722168, + "step": 4853, + "time_per_iteration": 2.7801096439361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007035, + "balance_loss_mlp": 0.99568605, + "epoch": 0.9338207002693344, + "flos": 1562003495424.0, + "grad_norm": 0.003590229468680035, + "language_loss": 0.78455019, + "learning_rate": 1.1441622613457824e-05, + "loss": 0.79462051, + "num_input_tokens_seen": 401983664, + "router_z_loss_mlp": 0.11328125, + "step": 4854, + "time_per_iteration": 4.905395746231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106343, + "balance_loss_mlp": 1.03570247, + "epoch": 0.9340130819545979, + "flos": 644951490048.0, + "grad_norm": 0.10383389048571988, + "language_loss": 0.81319606, + "learning_rate": 1.1375451265351833e-05, + "loss": 0.82383037, + "num_input_tokens_seen": 402065744, + "router_z_loss_mlp": 0.27758789, + "step": 4855, + "time_per_iteration": 2.9259116649627686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063971, + "balance_loss_mlp": 1.03652906, + "epoch": 0.9342054636398615, + "flos": 503175062016.0, + "grad_norm": 0.058422853939071095, + "language_loss": 0.76883429, + "learning_rate": 1.1309469618119516e-05, + "loss": 0.77947402, + "num_input_tokens_seen": 402137728, + "router_z_loss_mlp": 0.27429199, + "step": 4856, + "time_per_iteration": 2.6962661743164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066201, + "balance_loss_mlp": 1.03816259, + "epoch": 0.934397845325125, + "flos": 592724874240.0, + "grad_norm": 0.04989142835749334, + "language_loss": 0.84225118, + "learning_rate": 1.1243677697377109e-05, + "loss": 0.85291314, + "num_input_tokens_seen": 402220160, + "router_z_loss_mlp": 0.28051758, + "step": 4857, + "time_per_iteration": 2.89798903465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065399, + "balance_loss_mlp": 1.03750432, + "epoch": 0.9345902270103886, + "flos": 499643845632.0, + "grad_norm": 0.059259414346205984, + "language_loss": 0.80253309, + "learning_rate": 1.1178075528667453e-05, + "loss": 0.81318712, + "num_input_tokens_seen": 402285168, + "router_z_loss_mlp": 0.27905273, + "step": 4858, + "time_per_iteration": 2.6194543838500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007919, + "balance_loss_mlp": 0.99657035, + "epoch": 0.9347826086956522, + "flos": 1519563165696.0, + "grad_norm": 0.0037344003597183113, + "language_loss": 0.7598772, + "learning_rate": 1.1112663137459566e-05, + "loss": 0.76995641, + "num_input_tokens_seen": 402504912, + "router_z_loss_mlp": 0.11328125, + "step": 4859, + "time_per_iteration": 4.687377452850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063938, + "balance_loss_mlp": 1.0351609, + "epoch": 0.9349749903809157, + "flos": 504273558528.0, + "grad_norm": 0.05342173918778132, + "language_loss": 0.80887705, + "learning_rate": 1.1047440549148636e-05, + "loss": 0.81951642, + "num_input_tokens_seen": 402582032, + "router_z_loss_mlp": 0.28759766, + "step": 4860, + "time_per_iteration": 2.833953857421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065793, + "balance_loss_mlp": 1.03792143, + "epoch": 0.9351673720661793, + "flos": 568636222464.0, + "grad_norm": 0.07222784760329864, + "language_loss": 0.78340459, + "learning_rate": 1.0982407789056514e-05, + "loss": 0.79406255, + "num_input_tokens_seen": 402650144, + "router_z_loss_mlp": 0.27880859, + "step": 4861, + "time_per_iteration": 2.6537110805511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106113, + "balance_loss_mlp": 1.0336163, + "epoch": 0.9353597537514429, + "flos": 544342367232.0, + "grad_norm": 0.06260642991207148, + "language_loss": 0.86519629, + "learning_rate": 1.0917564882430952e-05, + "loss": 0.87580758, + "num_input_tokens_seen": 402720368, + "router_z_loss_mlp": 0.27539062, + "step": 4862, + "time_per_iteration": 2.633547067642212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063681, + "balance_loss_mlp": 1.03521395, + "epoch": 0.9355521354367065, + "flos": 518743401984.0, + "grad_norm": 0.0581520502605348, + "language_loss": 0.84730381, + "learning_rate": 1.0852911854446368e-05, + "loss": 0.85794055, + "num_input_tokens_seen": 402795568, + "router_z_loss_mlp": 0.28442383, + "step": 4863, + "time_per_iteration": 2.7387821674346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064, + "balance_loss_mlp": 1.0354135, + "epoch": 0.93574451712197, + "flos": 446087388672.0, + "grad_norm": 0.06553565178892076, + "language_loss": 0.78680766, + "learning_rate": 1.0788448730203237e-05, + "loss": 0.79744768, + "num_input_tokens_seen": 402858784, + "router_z_loss_mlp": 0.28564453, + "step": 4864, + "time_per_iteration": 2.493662118911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063373, + "balance_loss_mlp": 1.03504932, + "epoch": 0.9359368988072335, + "flos": 480273656832.0, + "grad_norm": 0.06767871177547606, + "language_loss": 0.7636739, + "learning_rate": 1.072417553472832e-05, + "loss": 0.77430761, + "num_input_tokens_seen": 402924144, + "router_z_loss_mlp": 0.28295898, + "step": 4865, + "time_per_iteration": 2.520573854446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062064, + "balance_loss_mlp": 1.03369164, + "epoch": 0.9361292804924971, + "flos": 496876474368.0, + "grad_norm": 0.06249909871095247, + "language_loss": 0.84898299, + "learning_rate": 1.0660092292974766e-05, + "loss": 0.85960364, + "num_input_tokens_seen": 402987488, + "router_z_loss_mlp": 0.28417969, + "step": 4866, + "time_per_iteration": 2.6384427547454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059595, + "balance_loss_mlp": 1.03227186, + "epoch": 0.9363216621777607, + "flos": 617830626816.0, + "grad_norm": 0.05655870704984646, + "language_loss": 0.84264755, + "learning_rate": 1.059619902982184e-05, + "loss": 0.85324347, + "num_input_tokens_seen": 403058224, + "router_z_loss_mlp": 0.27368164, + "step": 4867, + "time_per_iteration": 2.7363386154174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005928, + "balance_loss_mlp": 0.99457914, + "epoch": 0.9365140438630243, + "flos": 1415169570816.0, + "grad_norm": 0.004522338300868298, + "language_loss": 0.79203337, + "learning_rate": 1.053249577007509e-05, + "loss": 0.80209267, + "num_input_tokens_seen": 403289072, + "router_z_loss_mlp": 0.11328125, + "step": 4868, + "time_per_iteration": 4.889655828475952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066052, + "balance_loss_mlp": 1.03765643, + "epoch": 0.9367064255482878, + "flos": 590217960960.0, + "grad_norm": 0.06477461898432092, + "language_loss": 0.81238163, + "learning_rate": 1.0468982538466287e-05, + "loss": 0.82304209, + "num_input_tokens_seen": 403361728, + "router_z_loss_mlp": 0.28393555, + "step": 4869, + "time_per_iteration": 2.7326934337615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063402, + "balance_loss_mlp": 1.03545952, + "epoch": 0.9368988072335513, + "flos": 526384977408.0, + "grad_norm": 0.06786641398202575, + "language_loss": 0.82115895, + "learning_rate": 1.0405659359653597e-05, + "loss": 0.83179295, + "num_input_tokens_seen": 403431536, + "router_z_loss_mlp": 0.27978516, + "step": 4870, + "time_per_iteration": 2.7047648429870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064712, + "balance_loss_mlp": 1.03648376, + "epoch": 0.9370911889188149, + "flos": 742880581632.0, + "grad_norm": 0.05856438164101436, + "language_loss": 0.78791976, + "learning_rate": 1.034252625822113e-05, + "loss": 0.79856682, + "num_input_tokens_seen": 403504768, + "router_z_loss_mlp": 0.28222656, + "step": 4871, + "time_per_iteration": 2.939244270324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061374, + "balance_loss_mlp": 1.03388393, + "epoch": 0.9372835706040785, + "flos": 545779897344.0, + "grad_norm": 0.051186143222515454, + "language_loss": 0.78588909, + "learning_rate": 1.0279583258679448e-05, + "loss": 0.79650283, + "num_input_tokens_seen": 403575584, + "router_z_loss_mlp": 0.27539062, + "step": 4872, + "time_per_iteration": 2.620211362838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062731, + "balance_loss_mlp": 1.03459811, + "epoch": 0.9374759522893421, + "flos": 491367873024.0, + "grad_norm": 0.057187231677836646, + "language_loss": 0.81548411, + "learning_rate": 1.0216830385465003e-05, + "loss": 0.82611144, + "num_input_tokens_seen": 403648720, + "router_z_loss_mlp": 0.28149414, + "step": 4873, + "time_per_iteration": 2.6956076622009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064273, + "balance_loss_mlp": 1.03685474, + "epoch": 0.9376683339746056, + "flos": 578144521728.0, + "grad_norm": 0.07694020765815146, + "language_loss": 0.82509339, + "learning_rate": 1.0154267662940809e-05, + "loss": 0.8357361, + "num_input_tokens_seen": 403721392, + "router_z_loss_mlp": 0.2746582, + "step": 4874, + "time_per_iteration": 2.6782383918762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063517, + "balance_loss_mlp": 1.0343821, + "epoch": 0.9378607156598692, + "flos": 506039947776.0, + "grad_norm": 0.06987708910160345, + "language_loss": 0.80854172, + "learning_rate": 1.0091895115395766e-05, + "loss": 0.81917691, + "num_input_tokens_seen": 403792112, + "router_z_loss_mlp": 0.29125977, + "step": 4875, + "time_per_iteration": 2.655191659927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062623, + "balance_loss_mlp": 1.03413165, + "epoch": 0.9380530973451328, + "flos": 519753148416.0, + "grad_norm": 0.06331858245443533, + "language_loss": 0.77412724, + "learning_rate": 1.0029712767045062e-05, + "loss": 0.7847535, + "num_input_tokens_seen": 403860928, + "router_z_loss_mlp": 0.28466797, + "step": 4876, + "time_per_iteration": 2.658334255218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063725, + "balance_loss_mlp": 1.03606796, + "epoch": 0.9382454790303963, + "flos": 557533241856.0, + "grad_norm": 0.061367616716062376, + "language_loss": 0.8458181, + "learning_rate": 9.967720642029999e-06, + "loss": 0.85645533, + "num_input_tokens_seen": 403928240, + "router_z_loss_mlp": 0.27661133, + "step": 4877, + "time_per_iteration": 2.6817362308502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065641, + "balance_loss_mlp": 1.03722119, + "epoch": 0.9384378607156598, + "flos": 695149848576.0, + "grad_norm": 0.05571055907247939, + "language_loss": 0.8157208, + "learning_rate": 9.905918764418153e-06, + "loss": 0.82637721, + "num_input_tokens_seen": 404004320, + "router_z_loss_mlp": 0.28417969, + "step": 4878, + "time_per_iteration": 2.924126386642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063872, + "balance_loss_mlp": 1.03533387, + "epoch": 0.9386302424009234, + "flos": 554480681472.0, + "grad_norm": 0.06557840572929766, + "language_loss": 0.80646306, + "learning_rate": 9.844307158203058e-06, + "loss": 0.81710184, + "num_input_tokens_seen": 404077040, + "router_z_loss_mlp": 0.28540039, + "step": 4879, + "time_per_iteration": 2.68676495552063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062372, + "balance_loss_mlp": 1.0342859, + "epoch": 0.938822624086187, + "flos": 566711271936.0, + "grad_norm": 0.05994430498236734, + "language_loss": 0.79781514, + "learning_rate": 9.782885847304469e-06, + "loss": 0.80843884, + "num_input_tokens_seen": 404145248, + "router_z_loss_mlp": 0.28100586, + "step": 4880, + "time_per_iteration": 2.6532607078552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063834, + "balance_loss_mlp": 1.03615308, + "epoch": 0.9390150057714506, + "flos": 417367461888.0, + "grad_norm": 0.05571972818867672, + "language_loss": 0.80477625, + "learning_rate": 9.721654855568196e-06, + "loss": 0.81541461, + "num_input_tokens_seen": 404212000, + "router_z_loss_mlp": 0.27709961, + "step": 4881, + "time_per_iteration": 2.5952963829040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060824, + "balance_loss_mlp": 1.03240418, + "epoch": 0.9392073874567142, + "flos": 1553281256448.0, + "grad_norm": 0.057309564525933866, + "language_loss": 0.76139069, + "learning_rate": 9.660614206766394e-06, + "loss": 0.77199888, + "num_input_tokens_seen": 404305408, + "router_z_loss_mlp": 0.28417969, + "step": 4882, + "time_per_iteration": 3.7136471271514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065968, + "balance_loss_mlp": 1.03766727, + "epoch": 0.9393997691419776, + "flos": 652238065152.0, + "grad_norm": 0.05812086572072492, + "language_loss": 0.78156579, + "learning_rate": 9.59976392459705e-06, + "loss": 0.79222548, + "num_input_tokens_seen": 404383248, + "router_z_loss_mlp": 0.28295898, + "step": 4883, + "time_per_iteration": 2.781167984008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005014, + "balance_loss_mlp": 0.99371332, + "epoch": 0.9395921508272412, + "flos": 1552480639488.0, + "grad_norm": 0.004750950893681344, + "language_loss": 0.78170681, + "learning_rate": 9.539104032684209e-06, + "loss": 0.79175687, + "num_input_tokens_seen": 404615264, + "router_z_loss_mlp": 0.11279297, + "step": 4884, + "time_per_iteration": 4.841533899307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065675, + "balance_loss_mlp": 1.0382328, + "epoch": 0.9397845325125048, + "flos": 497881838592.0, + "grad_norm": 0.05445625931005124, + "language_loss": 0.78697509, + "learning_rate": 9.478634554578314e-06, + "loss": 0.79763186, + "num_input_tokens_seen": 404684656, + "router_z_loss_mlp": 0.27441406, + "step": 4885, + "time_per_iteration": 2.627277135848999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010635, + "balance_loss_mlp": 1.03550982, + "epoch": 0.9399769141977684, + "flos": 498348910080.0, + "grad_norm": 0.08093151667786662, + "language_loss": 0.83377492, + "learning_rate": 9.418355513755638e-06, + "loss": 0.84440994, + "num_input_tokens_seen": 404752096, + "router_z_loss_mlp": 0.2800293, + "step": 4886, + "time_per_iteration": 2.5997188091278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005008, + "balance_loss_mlp": 0.99370664, + "epoch": 0.9401692958830319, + "flos": 1401709473792.0, + "grad_norm": 0.004746445275638401, + "language_loss": 0.79332191, + "learning_rate": 9.358266933618575e-06, + "loss": 0.80337197, + "num_input_tokens_seen": 404980944, + "router_z_loss_mlp": 0.11279297, + "step": 4887, + "time_per_iteration": 4.797895431518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106761, + "balance_loss_mlp": 1.03883338, + "epoch": 0.9403616775682955, + "flos": 539852276736.0, + "grad_norm": 0.04356257563048395, + "language_loss": 0.84935153, + "learning_rate": 9.298368837495575e-06, + "loss": 0.86002755, + "num_input_tokens_seen": 405056688, + "router_z_loss_mlp": 0.28808594, + "step": 4888, + "time_per_iteration": 2.6986100673675537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004991, + "balance_loss_mlp": 0.99369013, + "epoch": 0.9405540592535591, + "flos": 1321340663808.0, + "grad_norm": 0.004744801636887555, + "language_loss": 0.75169432, + "learning_rate": 9.238661248641089e-06, + "loss": 0.76174426, + "num_input_tokens_seen": 405284656, + "router_z_loss_mlp": 0.11279297, + "step": 4889, + "time_per_iteration": 4.883483648300171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066327, + "balance_loss_mlp": 1.03843164, + "epoch": 0.9407464409388226, + "flos": 572097627648.0, + "grad_norm": 0.09796723375615995, + "language_loss": 0.82906234, + "learning_rate": 9.179144190235799e-06, + "loss": 0.83972561, + "num_input_tokens_seen": 405351584, + "router_z_loss_mlp": 0.27929688, + "step": 4890, + "time_per_iteration": 2.6586780548095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064949, + "balance_loss_mlp": 1.03652978, + "epoch": 0.9409388226240862, + "flos": 510994137600.0, + "grad_norm": 0.050284661991451346, + "language_loss": 0.76816261, + "learning_rate": 9.119817685386112e-06, + "loss": 0.77881205, + "num_input_tokens_seen": 405425712, + "router_z_loss_mlp": 0.28393555, + "step": 4891, + "time_per_iteration": 2.7174863815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006369, + "balance_loss_mlp": 0.99506766, + "epoch": 0.9411312043093497, + "flos": 1569060135936.0, + "grad_norm": 0.0038787350067584375, + "language_loss": 0.80241883, + "learning_rate": 9.06068175712471e-06, + "loss": 0.81248254, + "num_input_tokens_seen": 405655760, + "router_z_loss_mlp": 0.11279297, + "step": 4892, + "time_per_iteration": 4.926965236663818 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064183, + "balance_loss_mlp": 1.03666997, + "epoch": 0.9413235859946133, + "flos": 569197836288.0, + "grad_norm": 0.07544734388954553, + "language_loss": 0.78440136, + "learning_rate": 9.001736428410234e-06, + "loss": 0.79504317, + "num_input_tokens_seen": 405731664, + "router_z_loss_mlp": 0.27514648, + "step": 4893, + "time_per_iteration": 2.810511827468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069598, + "balance_loss_mlp": 1.04094052, + "epoch": 0.9415159676798769, + "flos": 781567114752.0, + "grad_norm": 0.062357889378770605, + "language_loss": 0.80282962, + "learning_rate": 8.942981722127263e-06, + "loss": 0.81352556, + "num_input_tokens_seen": 405808128, + "router_z_loss_mlp": 0.28662109, + "step": 4894, + "time_per_iteration": 3.0641191005706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068179, + "balance_loss_mlp": 1.0392108, + "epoch": 0.9417083493651405, + "flos": 848960428032.0, + "grad_norm": 0.05312406489604803, + "language_loss": 0.79909003, + "learning_rate": 8.884417661086331e-06, + "loss": 0.80977184, + "num_input_tokens_seen": 405892448, + "router_z_loss_mlp": 0.28955078, + "step": 4895, + "time_per_iteration": 3.1742238998413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064713, + "balance_loss_mlp": 1.0362221, + "epoch": 0.941900731050404, + "flos": 529054834176.0, + "grad_norm": 0.053131997206903085, + "language_loss": 0.85986912, + "learning_rate": 8.826044268024025e-06, + "loss": 0.87051624, + "num_input_tokens_seen": 405966736, + "router_z_loss_mlp": 0.28491211, + "step": 4896, + "time_per_iteration": 2.68365740776062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066337, + "balance_loss_mlp": 1.03803682, + "epoch": 0.9420931127356675, + "flos": 556799920128.0, + "grad_norm": 0.05603051952986068, + "language_loss": 0.8033452, + "learning_rate": 8.767861565602997e-06, + "loss": 0.81400859, + "num_input_tokens_seen": 406043264, + "router_z_loss_mlp": 0.28320312, + "step": 4897, + "time_per_iteration": 2.789910078048706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064812, + "balance_loss_mlp": 1.03653598, + "epoch": 0.9422854944209311, + "flos": 652233682944.0, + "grad_norm": 0.06641212670378875, + "language_loss": 0.86446559, + "learning_rate": 8.709869576411733e-06, + "loss": 0.87511379, + "num_input_tokens_seen": 406119552, + "router_z_loss_mlp": 0.28295898, + "step": 4898, + "time_per_iteration": 2.854471206665039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062929, + "balance_loss_mlp": 1.03436613, + "epoch": 0.9424778761061947, + "flos": 553417090560.0, + "grad_norm": 0.05214304226628579, + "language_loss": 0.84051895, + "learning_rate": 8.65206832296478e-06, + "loss": 0.85114825, + "num_input_tokens_seen": 406192464, + "router_z_loss_mlp": 0.28588867, + "step": 4899, + "time_per_iteration": 2.7259373664855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062683, + "balance_loss_mlp": 1.03505075, + "epoch": 0.9426702577914583, + "flos": 588287218176.0, + "grad_norm": 0.067020017244683, + "language_loss": 0.79881752, + "learning_rate": 8.594457827702406e-06, + "loss": 0.80944431, + "num_input_tokens_seen": 406262640, + "router_z_loss_mlp": 0.27685547, + "step": 4900, + "time_per_iteration": 2.6749682426452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071741, + "balance_loss_mlp": 1.04353571, + "epoch": 0.9428626394767218, + "flos": 616329077760.0, + "grad_norm": 0.06073739740547212, + "language_loss": 0.7828182, + "learning_rate": 8.537038112991114e-06, + "loss": 0.79353559, + "num_input_tokens_seen": 406341328, + "router_z_loss_mlp": 0.28222656, + "step": 4901, + "time_per_iteration": 2.8038489818573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063214, + "balance_loss_mlp": 1.03515244, + "epoch": 0.9430550211619854, + "flos": 610129414656.0, + "grad_norm": 0.06189608765953851, + "language_loss": 0.81724429, + "learning_rate": 8.479809201123178e-06, + "loss": 0.82787645, + "num_input_tokens_seen": 406418864, + "router_z_loss_mlp": 0.28125, + "step": 4902, + "time_per_iteration": 2.716689109802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066777, + "balance_loss_mlp": 1.03890562, + "epoch": 0.943247402847249, + "flos": 565726256640.0, + "grad_norm": 0.06175079679005683, + "language_loss": 0.78484106, + "learning_rate": 8.422771114316885e-06, + "loss": 0.79550886, + "num_input_tokens_seen": 406492320, + "router_z_loss_mlp": 0.27905273, + "step": 4903, + "time_per_iteration": 2.7039542198181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062834, + "balance_loss_mlp": 1.03515339, + "epoch": 0.9434397845325125, + "flos": 526779265536.0, + "grad_norm": 0.06498343136748494, + "language_loss": 0.81188715, + "learning_rate": 8.365923874716297e-06, + "loss": 0.82251555, + "num_input_tokens_seen": 406560448, + "router_z_loss_mlp": 0.27734375, + "step": 4904, + "time_per_iteration": 2.6787548065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064293, + "balance_loss_mlp": 1.03608775, + "epoch": 0.943632166217776, + "flos": 593167214592.0, + "grad_norm": 0.054946869384208306, + "language_loss": 0.82257801, + "learning_rate": 8.309267504391593e-06, + "loss": 0.83322096, + "num_input_tokens_seen": 406631376, + "router_z_loss_mlp": 0.2824707, + "step": 4905, + "time_per_iteration": 2.7595133781433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067293, + "balance_loss_mlp": 1.03875446, + "epoch": 0.9438245479030396, + "flos": 572468594688.0, + "grad_norm": 0.05251720800952187, + "language_loss": 0.85584581, + "learning_rate": 8.252802025338623e-06, + "loss": 0.86651874, + "num_input_tokens_seen": 406713728, + "router_z_loss_mlp": 0.28515625, + "step": 4906, + "time_per_iteration": 2.860182523727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063847, + "balance_loss_mlp": 1.03592801, + "epoch": 0.9440169295883032, + "flos": 488018539008.0, + "grad_norm": 0.06069717631166294, + "language_loss": 0.81664246, + "learning_rate": 8.196527459479242e-06, + "loss": 0.82728094, + "num_input_tokens_seen": 406779168, + "router_z_loss_mlp": 0.27905273, + "step": 4907, + "time_per_iteration": 2.527818202972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065327, + "balance_loss_mlp": 1.03717005, + "epoch": 0.9442093112735668, + "flos": 731399279616.0, + "grad_norm": 0.05466017438310119, + "language_loss": 0.7315473, + "learning_rate": 8.140443828661137e-06, + "loss": 0.74220055, + "num_input_tokens_seen": 406860816, + "router_z_loss_mlp": 0.28173828, + "step": 4908, + "time_per_iteration": 2.998216390609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067308, + "balance_loss_mlp": 1.03867412, + "epoch": 0.9444016929588304, + "flos": 570763404288.0, + "grad_norm": 0.0622325125694981, + "language_loss": 0.82240564, + "learning_rate": 8.084551154658004e-06, + "loss": 0.83307874, + "num_input_tokens_seen": 406929888, + "router_z_loss_mlp": 0.28637695, + "step": 4909, + "time_per_iteration": 2.6756813526153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106833, + "balance_loss_mlp": 1.03957677, + "epoch": 0.9445940746440938, + "flos": 509038663680.0, + "grad_norm": 0.06423421312294773, + "language_loss": 0.85805643, + "learning_rate": 8.028849459169318e-06, + "loss": 0.86873972, + "num_input_tokens_seen": 406998224, + "router_z_loss_mlp": 0.28735352, + "step": 4910, + "time_per_iteration": 2.6203274726867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065056, + "balance_loss_mlp": 1.03773332, + "epoch": 0.9447864563293574, + "flos": 624247077888.0, + "grad_norm": 0.0551617966572636, + "language_loss": 0.80864727, + "learning_rate": 7.97333876382028e-06, + "loss": 0.81929785, + "num_input_tokens_seen": 407075088, + "router_z_loss_mlp": 0.2734375, + "step": 4911, + "time_per_iteration": 2.824267864227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065097, + "balance_loss_mlp": 1.03674865, + "epoch": 0.944978838014621, + "flos": 505011262464.0, + "grad_norm": 0.05688652138029108, + "language_loss": 0.80638492, + "learning_rate": 7.918019090162098e-06, + "loss": 0.81703591, + "num_input_tokens_seen": 407147792, + "router_z_loss_mlp": 0.28344727, + "step": 4912, + "time_per_iteration": 2.713205575942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006738, + "balance_loss_mlp": 0.99548489, + "epoch": 0.9451712196998846, + "flos": 1483371809280.0, + "grad_norm": 0.003561934378017574, + "language_loss": 0.78287339, + "learning_rate": 7.862890459671812e-06, + "loss": 0.7929408, + "num_input_tokens_seen": 407387216, + "router_z_loss_mlp": 0.11230469, + "step": 4913, + "time_per_iteration": 4.960860013961792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066622, + "balance_loss_mlp": 1.03829789, + "epoch": 0.9453636013851482, + "flos": 520885140480.0, + "grad_norm": 0.07082362828499891, + "language_loss": 0.90111738, + "learning_rate": 7.80795289375219e-06, + "loss": 0.91178358, + "num_input_tokens_seen": 407457664, + "router_z_loss_mlp": 0.28344727, + "step": 4914, + "time_per_iteration": 2.635700225830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007926, + "balance_loss_mlp": 0.99667293, + "epoch": 0.9455559830704117, + "flos": 1496060706816.0, + "grad_norm": 0.003268488212439821, + "language_loss": 0.8356235, + "learning_rate": 7.75320641373195e-06, + "loss": 0.84570277, + "num_input_tokens_seen": 407700256, + "router_z_loss_mlp": 0.11230469, + "step": 4915, + "time_per_iteration": 4.9683918952941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072161, + "balance_loss_mlp": 1.04393244, + "epoch": 0.9457483647556753, + "flos": 497871664128.0, + "grad_norm": 0.047809814034212056, + "language_loss": 0.81528771, + "learning_rate": 7.698651040865534e-06, + "loss": 0.82600927, + "num_input_tokens_seen": 407770080, + "router_z_loss_mlp": 0.2824707, + "step": 4916, + "time_per_iteration": 2.639611005783081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065204, + "balance_loss_mlp": 1.0374043, + "epoch": 0.9459407464409388, + "flos": 1018979536896.0, + "grad_norm": 0.054134558028750085, + "language_loss": 0.82247525, + "learning_rate": 7.644286796333222e-06, + "loss": 0.83312732, + "num_input_tokens_seen": 407854640, + "router_z_loss_mlp": 0.27807617, + "step": 4917, + "time_per_iteration": 3.4140350818634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065583, + "balance_loss_mlp": 1.03830767, + "epoch": 0.9461331281262024, + "flos": 513332315136.0, + "grad_norm": 0.0625234500918243, + "language_loss": 0.8119607, + "learning_rate": 7.590113701241075e-06, + "loss": 0.82261658, + "num_input_tokens_seen": 407922704, + "router_z_loss_mlp": 0.27319336, + "step": 4918, + "time_per_iteration": 2.5912117958068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065806, + "balance_loss_mlp": 1.03698182, + "epoch": 0.9463255098114659, + "flos": 527768663040.0, + "grad_norm": 0.06365242386186536, + "language_loss": 0.78204429, + "learning_rate": 7.536131776620936e-06, + "loss": 0.79270232, + "num_input_tokens_seen": 407991136, + "router_z_loss_mlp": 0.2878418, + "step": 4919, + "time_per_iteration": 2.6376984119415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067142, + "balance_loss_mlp": 1.0388658, + "epoch": 0.9465178914967295, + "flos": 505798428672.0, + "grad_norm": 0.07305820868603019, + "language_loss": 0.83628333, + "learning_rate": 7.482341043430485e-06, + "loss": 0.8469547, + "num_input_tokens_seen": 408056576, + "router_z_loss_mlp": 0.28271484, + "step": 4920, + "time_per_iteration": 2.559981107711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060734, + "balance_loss_mlp": 1.03264809, + "epoch": 0.9467102731819931, + "flos": 659934895104.0, + "grad_norm": 0.055619804981278775, + "language_loss": 0.85643375, + "learning_rate": 7.428741522553184e-06, + "loss": 0.86704111, + "num_input_tokens_seen": 408136960, + "router_z_loss_mlp": 0.28100586, + "step": 4921, + "time_per_iteration": 2.871453046798706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064091, + "balance_loss_mlp": 1.0357666, + "epoch": 0.9469026548672567, + "flos": 674854281216.0, + "grad_norm": 0.055827613473534016, + "language_loss": 0.89702082, + "learning_rate": 7.375333234798054e-06, + "loss": 0.90766174, + "num_input_tokens_seen": 408218304, + "router_z_loss_mlp": 0.28295898, + "step": 4922, + "time_per_iteration": 2.930006980895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064475, + "balance_loss_mlp": 1.03593636, + "epoch": 0.9470950365525203, + "flos": 513701872128.0, + "grad_norm": 0.062350678546594374, + "language_loss": 0.79667199, + "learning_rate": 7.32211620090012e-06, + "loss": 0.80731678, + "num_input_tokens_seen": 408287936, + "router_z_loss_mlp": 0.28540039, + "step": 4923, + "time_per_iteration": 2.703117847442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065466, + "balance_loss_mlp": 1.03783345, + "epoch": 0.9472874182377837, + "flos": 549823265280.0, + "grad_norm": 0.050108759523029664, + "language_loss": 0.81262392, + "learning_rate": 7.269090441520132e-06, + "loss": 0.82327855, + "num_input_tokens_seen": 408365568, + "router_z_loss_mlp": 0.27661133, + "step": 4924, + "time_per_iteration": 2.7624218463897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067551, + "balance_loss_mlp": 1.04025233, + "epoch": 0.9474797999230473, + "flos": 542510548992.0, + "grad_norm": 0.061582025232696735, + "language_loss": 0.79940867, + "learning_rate": 7.216255977244457e-06, + "loss": 0.81008416, + "num_input_tokens_seen": 408431248, + "router_z_loss_mlp": 0.2734375, + "step": 4925, + "time_per_iteration": 2.63815975189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070421, + "balance_loss_mlp": 1.04262114, + "epoch": 0.9476721816083109, + "flos": 844291427328.0, + "grad_norm": 0.064300432731251, + "language_loss": 0.85653675, + "learning_rate": 7.163612828585242e-06, + "loss": 0.86724097, + "num_input_tokens_seen": 408514112, + "router_z_loss_mlp": 0.27807617, + "step": 4926, + "time_per_iteration": 3.1508045196533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063425, + "balance_loss_mlp": 1.03569698, + "epoch": 0.9478645632935745, + "flos": 637717349376.0, + "grad_norm": 0.059256067654064305, + "language_loss": 0.79014599, + "learning_rate": 7.1111610159803605e-06, + "loss": 0.80078024, + "num_input_tokens_seen": 408585968, + "router_z_loss_mlp": 0.27758789, + "step": 4927, + "time_per_iteration": 2.7674243450164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067601, + "balance_loss_mlp": 1.03920519, + "epoch": 0.948056944978838, + "flos": 656531716608.0, + "grad_norm": 0.05522948680571442, + "language_loss": 0.75659686, + "learning_rate": 7.058900559793469e-06, + "loss": 0.76727289, + "num_input_tokens_seen": 408665456, + "router_z_loss_mlp": 0.28417969, + "step": 4928, + "time_per_iteration": 2.807382583618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067187, + "balance_loss_mlp": 1.03938746, + "epoch": 0.9482493266641016, + "flos": 440676301824.0, + "grad_norm": 0.061938965827223864, + "language_loss": 0.83113259, + "learning_rate": 7.00683148031378e-06, + "loss": 0.8418045, + "num_input_tokens_seen": 408730192, + "router_z_loss_mlp": 0.27832031, + "step": 4929, + "time_per_iteration": 2.523789882659912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065424, + "balance_loss_mlp": 1.03771996, + "epoch": 0.9484417083493651, + "flos": 545707113984.0, + "grad_norm": 0.06503778908082132, + "language_loss": 0.77616841, + "learning_rate": 6.9549537977564024e-06, + "loss": 0.78682268, + "num_input_tokens_seen": 408807616, + "router_z_loss_mlp": 0.27709961, + "step": 4930, + "time_per_iteration": 2.8400285243988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067128, + "balance_loss_mlp": 1.03930449, + "epoch": 0.9486340900346287, + "flos": 538325996544.0, + "grad_norm": 0.049505853011934595, + "language_loss": 0.79665405, + "learning_rate": 6.903267532262003e-06, + "loss": 0.80732536, + "num_input_tokens_seen": 408883552, + "router_z_loss_mlp": 0.27856445, + "step": 4931, + "time_per_iteration": 2.6870524883270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068871, + "balance_loss_mlp": 1.03999853, + "epoch": 0.9488264717198923, + "flos": 681362454528.0, + "grad_norm": 0.054233592359025716, + "language_loss": 0.85670519, + "learning_rate": 6.851772703896975e-06, + "loss": 0.86739385, + "num_input_tokens_seen": 408956400, + "router_z_loss_mlp": 0.28881836, + "step": 4932, + "time_per_iteration": 2.857355833053589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069295, + "balance_loss_mlp": 1.04092288, + "epoch": 0.9490188534051558, + "flos": 462365729280.0, + "grad_norm": 0.064073251907137, + "language_loss": 0.87887645, + "learning_rate": 6.8004693326533805e-06, + "loss": 0.8895694, + "num_input_tokens_seen": 409019904, + "router_z_loss_mlp": 0.28369141, + "step": 4933, + "time_per_iteration": 2.540163993835449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068966, + "balance_loss_mlp": 1.04067755, + "epoch": 0.9492112350904194, + "flos": 542865549312.0, + "grad_norm": 0.05372716069283064, + "language_loss": 0.82574224, + "learning_rate": 6.7493574384489e-06, + "loss": 0.83643186, + "num_input_tokens_seen": 409094288, + "router_z_loss_mlp": 0.28259277, + "step": 4934, + "time_per_iteration": 2.6855287551879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063867, + "balance_loss_mlp": 1.0359726, + "epoch": 0.949403616775683, + "flos": 550040053248.0, + "grad_norm": 0.052069086168931376, + "language_loss": 0.8394295, + "learning_rate": 6.698437041126992e-06, + "loss": 0.85006821, + "num_input_tokens_seen": 409169120, + "router_z_loss_mlp": 0.27929688, + "step": 4935, + "time_per_iteration": 2.7103271484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106989, + "balance_loss_mlp": 1.0421617, + "epoch": 0.9495959984609466, + "flos": 598105437696.0, + "grad_norm": 0.05030032999954777, + "language_loss": 0.82814801, + "learning_rate": 6.647708160456678e-06, + "loss": 0.83884692, + "num_input_tokens_seen": 409243200, + "router_z_loss_mlp": 0.27758789, + "step": 4936, + "time_per_iteration": 2.770634889602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065498, + "balance_loss_mlp": 1.0376029, + "epoch": 0.94978838014621, + "flos": 608130270720.0, + "grad_norm": 0.06038472870984303, + "language_loss": 0.82238394, + "learning_rate": 6.597170816132702e-06, + "loss": 0.83303893, + "num_input_tokens_seen": 409319264, + "router_z_loss_mlp": 0.27929688, + "step": 4937, + "time_per_iteration": 2.81235408782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068864, + "balance_loss_mlp": 1.04106474, + "epoch": 0.9499807618314736, + "flos": 540575424000.0, + "grad_norm": 0.07012929733727388, + "language_loss": 0.86437929, + "learning_rate": 6.546825027775427e-06, + "loss": 0.87506789, + "num_input_tokens_seen": 409389840, + "router_z_loss_mlp": 0.27832031, + "step": 4938, + "time_per_iteration": 2.6199066638946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066003, + "balance_loss_mlp": 1.03789318, + "epoch": 0.9501731435167372, + "flos": 594323937792.0, + "grad_norm": 0.0557301660975644, + "language_loss": 0.82896394, + "learning_rate": 6.496670814930717e-06, + "loss": 0.83962405, + "num_input_tokens_seen": 409458752, + "router_z_loss_mlp": 0.28125, + "step": 4939, + "time_per_iteration": 2.7088263034820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064245, + "balance_loss_mlp": 1.0366118, + "epoch": 0.9503655252020008, + "flos": 453906464256.0, + "grad_norm": 0.057760924764302495, + "language_loss": 0.80044109, + "learning_rate": 6.446708197070161e-06, + "loss": 0.81108356, + "num_input_tokens_seen": 409525008, + "router_z_loss_mlp": 0.27685547, + "step": 4940, + "time_per_iteration": 2.6036903858184814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066709, + "balance_loss_mlp": 1.03814626, + "epoch": 0.9505579068872644, + "flos": 667649253888.0, + "grad_norm": 0.06216222313569856, + "language_loss": 0.84629482, + "learning_rate": 6.396937193591079e-06, + "loss": 0.85696185, + "num_input_tokens_seen": 409603376, + "router_z_loss_mlp": 0.28540039, + "step": 4941, + "time_per_iteration": 2.8155128955841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070131, + "balance_loss_mlp": 1.04249859, + "epoch": 0.9507502885725279, + "flos": 401989768704.0, + "grad_norm": 0.07429194359954051, + "language_loss": 0.81656432, + "learning_rate": 6.347357823816235e-06, + "loss": 0.82726562, + "num_input_tokens_seen": 409667168, + "router_z_loss_mlp": 0.27661133, + "step": 4942, + "time_per_iteration": 2.4733738899230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064756, + "balance_loss_mlp": 1.03709936, + "epoch": 0.9509426702577914, + "flos": 700015288320.0, + "grad_norm": 0.05479314794150921, + "language_loss": 0.7956689, + "learning_rate": 6.297970106994011e-06, + "loss": 0.80631644, + "num_input_tokens_seen": 409746832, + "router_z_loss_mlp": 0.27685547, + "step": 4943, + "time_per_iteration": 2.9666907787323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066084, + "balance_loss_mlp": 1.03816581, + "epoch": 0.951135051943055, + "flos": 501170125824.0, + "grad_norm": 0.05425923566819056, + "language_loss": 0.82572865, + "learning_rate": 6.2487740622985126e-06, + "loss": 0.83638954, + "num_input_tokens_seen": 409813792, + "router_z_loss_mlp": 0.27954102, + "step": 4944, + "time_per_iteration": 2.5886473655700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068167, + "balance_loss_mlp": 1.03998554, + "epoch": 0.9513274336283186, + "flos": 614310994944.0, + "grad_norm": 0.049572920738515824, + "language_loss": 0.81490457, + "learning_rate": 6.1997697088292395e-06, + "loss": 0.8255862, + "num_input_tokens_seen": 409898848, + "router_z_loss_mlp": 0.28149414, + "step": 4945, + "time_per_iteration": 2.938873767852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068372, + "balance_loss_mlp": 1.04088187, + "epoch": 0.9515198153135821, + "flos": 519334129152.0, + "grad_norm": 0.07213408654984042, + "language_loss": 0.81845057, + "learning_rate": 6.150957065611363e-06, + "loss": 0.82913423, + "num_input_tokens_seen": 409966368, + "router_z_loss_mlp": 0.27490234, + "step": 4946, + "time_per_iteration": 2.5683889389038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067353, + "balance_loss_mlp": 1.03955328, + "epoch": 0.9517121969988457, + "flos": 664622834688.0, + "grad_norm": 0.05349359226162988, + "language_loss": 0.76608801, + "learning_rate": 6.102336151595667e-06, + "loss": 0.77676153, + "num_input_tokens_seen": 410048496, + "router_z_loss_mlp": 0.27807617, + "step": 4947, + "time_per_iteration": 2.931286573410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067386, + "balance_loss_mlp": 1.0390383, + "epoch": 0.9519045786841093, + "flos": 676108518912.0, + "grad_norm": 0.0631556824358652, + "language_loss": 0.75756991, + "learning_rate": 6.053906985658553e-06, + "loss": 0.76824379, + "num_input_tokens_seen": 410121840, + "router_z_loss_mlp": 0.28344727, + "step": 4948, + "time_per_iteration": 2.8119544982910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065174, + "balance_loss_mlp": 1.03739834, + "epoch": 0.9520969603693729, + "flos": 652593065472.0, + "grad_norm": 0.05176605196525789, + "language_loss": 0.80436432, + "learning_rate": 6.005669586601814e-06, + "loss": 0.81501603, + "num_input_tokens_seen": 410199152, + "router_z_loss_mlp": 0.27832031, + "step": 4949, + "time_per_iteration": 2.8334755897521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068678, + "balance_loss_mlp": 1.04095006, + "epoch": 0.9522893420546364, + "flos": 742935836160.0, + "grad_norm": 0.04702530547499014, + "language_loss": 0.83160955, + "learning_rate": 5.957623973152748e-06, + "loss": 0.84229636, + "num_input_tokens_seen": 410285392, + "router_z_loss_mlp": 0.27783203, + "step": 4950, + "time_per_iteration": 3.024099111557007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066354, + "balance_loss_mlp": 1.0383395, + "epoch": 0.9524817237398999, + "flos": 761364679680.0, + "grad_norm": 0.055590433220462955, + "language_loss": 0.80557394, + "learning_rate": 5.909770163964545e-06, + "loss": 0.81623745, + "num_input_tokens_seen": 410359872, + "router_z_loss_mlp": 0.28027344, + "step": 4951, + "time_per_iteration": 2.9261345863342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063968, + "balance_loss_mlp": 1.03624022, + "epoch": 0.9526741054251635, + "flos": 528871541760.0, + "grad_norm": 0.06028024445787797, + "language_loss": 0.81832278, + "learning_rate": 5.8621081776155105e-06, + "loss": 0.82896245, + "num_input_tokens_seen": 410425728, + "router_z_loss_mlp": 0.27783203, + "step": 4952, + "time_per_iteration": 2.601012706756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067093, + "balance_loss_mlp": 1.03907871, + "epoch": 0.9528664871104271, + "flos": 488196039168.0, + "grad_norm": 0.08007516789791078, + "language_loss": 0.80964506, + "learning_rate": 5.814638032609787e-06, + "loss": 0.82031596, + "num_input_tokens_seen": 410496080, + "router_z_loss_mlp": 0.28051758, + "step": 4953, + "time_per_iteration": 2.6237566471099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010662, + "balance_loss_mlp": 1.03804255, + "epoch": 0.9530588687956907, + "flos": 517464433152.0, + "grad_norm": 0.06660623394003432, + "language_loss": 0.85304189, + "learning_rate": 5.76735974737691e-06, + "loss": 0.86370385, + "num_input_tokens_seen": 410576448, + "router_z_loss_mlp": 0.28198242, + "step": 4954, + "time_per_iteration": 2.7781050205230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066312, + "balance_loss_mlp": 1.03772521, + "epoch": 0.9532512504809542, + "flos": 674833932288.0, + "grad_norm": 0.056693610090972645, + "language_loss": 0.8034358, + "learning_rate": 5.720273340271864e-06, + "loss": 0.81409889, + "num_input_tokens_seen": 410655792, + "router_z_loss_mlp": 0.28588867, + "step": 4955, + "time_per_iteration": 2.8433279991149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067264, + "balance_loss_mlp": 1.03934515, + "epoch": 0.9534436321662177, + "flos": 489269804544.0, + "grad_norm": 0.05291619762333268, + "language_loss": 0.83936781, + "learning_rate": 5.673378829575249e-06, + "loss": 0.85004044, + "num_input_tokens_seen": 410725440, + "router_z_loss_mlp": 0.27905273, + "step": 4956, + "time_per_iteration": 2.5828912258148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066496, + "balance_loss_mlp": 1.03788543, + "epoch": 0.9536360138514813, + "flos": 496335209472.0, + "grad_norm": 0.06235854492095354, + "language_loss": 0.81562638, + "learning_rate": 5.626676233493167e-06, + "loss": 0.82629132, + "num_input_tokens_seen": 410797552, + "router_z_loss_mlp": 0.28613281, + "step": 4957, + "time_per_iteration": 2.669546127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066638, + "balance_loss_mlp": 1.03895712, + "epoch": 0.9538283955367449, + "flos": 801114803712.0, + "grad_norm": 0.053809767335559436, + "language_loss": 0.84141076, + "learning_rate": 5.580165570157114e-06, + "loss": 0.85207713, + "num_input_tokens_seen": 410876736, + "router_z_loss_mlp": 0.27685547, + "step": 4958, + "time_per_iteration": 3.0569889545440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064573, + "balance_loss_mlp": 1.0356288, + "epoch": 0.9540207772220085, + "flos": 556386693120.0, + "grad_norm": 0.04933735095263698, + "language_loss": 0.79818612, + "learning_rate": 5.533846857624203e-06, + "loss": 0.80883187, + "num_input_tokens_seen": 410955632, + "router_z_loss_mlp": 0.2890625, + "step": 4959, + "time_per_iteration": 2.7846500873565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066182, + "balance_loss_mlp": 1.03797746, + "epoch": 0.954213158907272, + "flos": 684193844736.0, + "grad_norm": 0.0573976228182319, + "language_loss": 0.81808335, + "learning_rate": 5.487720113876882e-06, + "loss": 0.82874513, + "num_input_tokens_seen": 411038480, + "router_z_loss_mlp": 0.28198242, + "step": 4960, + "time_per_iteration": 2.916274070739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067802, + "balance_loss_mlp": 1.03940582, + "epoch": 0.9544055405925356, + "flos": 535480049664.0, + "grad_norm": 0.12489923707729335, + "language_loss": 0.82927817, + "learning_rate": 5.441785356823214e-06, + "loss": 0.83995616, + "num_input_tokens_seen": 411109744, + "router_z_loss_mlp": 0.28393555, + "step": 4961, + "time_per_iteration": 2.715878486633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068036, + "balance_loss_mlp": 1.03980756, + "epoch": 0.9545979222777992, + "flos": 825025955328.0, + "grad_norm": 0.06585354044225371, + "language_loss": 0.80001307, + "learning_rate": 5.3960426042965476e-06, + "loss": 0.81069338, + "num_input_tokens_seen": 411202192, + "router_z_loss_mlp": 0.2824707, + "step": 4962, + "time_per_iteration": 3.1188926696777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070168, + "balance_loss_mlp": 1.04198718, + "epoch": 0.9547903039630627, + "flos": 761326801920.0, + "grad_norm": 0.06050362430741012, + "language_loss": 0.76945174, + "learning_rate": 5.3504918740558405e-06, + "loss": 0.78015339, + "num_input_tokens_seen": 411289248, + "router_z_loss_mlp": 0.28198242, + "step": 4963, + "time_per_iteration": 3.081576347351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069409, + "balance_loss_mlp": 1.041991, + "epoch": 0.9549826856483262, + "flos": 515050652160.0, + "grad_norm": 0.06425554688456968, + "language_loss": 0.82589138, + "learning_rate": 5.3051331837855045e-06, + "loss": 0.83658552, + "num_input_tokens_seen": 411355232, + "router_z_loss_mlp": 0.2746582, + "step": 4964, + "time_per_iteration": 2.5883357524871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068863, + "balance_loss_mlp": 1.04189777, + "epoch": 0.9551750673335898, + "flos": 642818515968.0, + "grad_norm": 0.061836123206944746, + "language_loss": 0.82252514, + "learning_rate": 5.259966551095341e-06, + "loss": 0.83321381, + "num_input_tokens_seen": 411432288, + "router_z_loss_mlp": 0.27001953, + "step": 4965, + "time_per_iteration": 2.807131767272949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068448, + "balance_loss_mlp": 1.03993297, + "epoch": 0.9553674490188534, + "flos": 471967160832.0, + "grad_norm": 0.05936300763457571, + "language_loss": 0.82923341, + "learning_rate": 5.214991993520546e-06, + "loss": 0.8399179, + "num_input_tokens_seen": 411499376, + "router_z_loss_mlp": 0.28491211, + "step": 4966, + "time_per_iteration": 2.5980896949768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068636, + "balance_loss_mlp": 1.04150367, + "epoch": 0.955559830704117, + "flos": 528064026624.0, + "grad_norm": 0.08134141951074082, + "language_loss": 0.81711161, + "learning_rate": 5.170209528521763e-06, + "loss": 0.82779801, + "num_input_tokens_seen": 411564976, + "router_z_loss_mlp": 0.27197266, + "step": 4967, + "time_per_iteration": 2.6179404258728027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064015, + "balance_loss_mlp": 1.0356431, + "epoch": 0.9557522123893806, + "flos": 547907079168.0, + "grad_norm": 0.06225562192484809, + "language_loss": 0.84138858, + "learning_rate": 5.125619173485196e-06, + "loss": 0.85202879, + "num_input_tokens_seen": 411636464, + "router_z_loss_mlp": 0.28393555, + "step": 4968, + "time_per_iteration": 2.667945384979248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066382, + "balance_loss_mlp": 1.0384872, + "epoch": 0.955944594074644, + "flos": 509201607168.0, + "grad_norm": 0.05278263186963013, + "language_loss": 0.81962323, + "learning_rate": 5.08122094572222e-06, + "loss": 0.8302871, + "num_input_tokens_seen": 411710672, + "router_z_loss_mlp": 0.27905273, + "step": 4969, + "time_per_iteration": 2.6727488040924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067, + "balance_loss_mlp": 1.03881836, + "epoch": 0.9561369757599076, + "flos": 527297209344.0, + "grad_norm": 0.052104090263610174, + "language_loss": 0.79543424, + "learning_rate": 5.037014862469824e-06, + "loss": 0.8061043, + "num_input_tokens_seen": 411785616, + "router_z_loss_mlp": 0.28198242, + "step": 4970, + "time_per_iteration": 2.760735511779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063905, + "balance_loss_mlp": 1.03610492, + "epoch": 0.9563293574451712, + "flos": 497950239744.0, + "grad_norm": 0.0557276302945241, + "language_loss": 0.80518448, + "learning_rate": 4.993000940890391e-06, + "loss": 0.81582344, + "num_input_tokens_seen": 411854832, + "router_z_loss_mlp": 0.27807617, + "step": 4971, + "time_per_iteration": 2.605243444442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008683, + "balance_loss_mlp": 0.99752527, + "epoch": 0.9565217391304348, + "flos": 1408160982528.0, + "grad_norm": 0.004764129085001868, + "language_loss": 0.81773561, + "learning_rate": 4.949179198071585e-06, + "loss": 0.82782245, + "num_input_tokens_seen": 412081856, + "router_z_loss_mlp": 0.11181641, + "step": 4972, + "time_per_iteration": 4.912391901016235 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064245, + "balance_loss_mlp": 1.03580165, + "epoch": 0.9567141208156984, + "flos": 503588289024.0, + "grad_norm": 0.05341140785738964, + "language_loss": 0.78160602, + "learning_rate": 4.905549651026464e-06, + "loss": 0.79224843, + "num_input_tokens_seen": 412155600, + "router_z_loss_mlp": 0.28442383, + "step": 4973, + "time_per_iteration": 2.7303390502929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065708, + "balance_loss_mlp": 1.03771734, + "epoch": 0.9569065025009619, + "flos": 432985264128.0, + "grad_norm": 0.07164961386667579, + "language_loss": 0.79847026, + "learning_rate": 4.86211231669359e-06, + "loss": 0.80912733, + "num_input_tokens_seen": 412219584, + "router_z_loss_mlp": 0.2800293, + "step": 4974, + "time_per_iteration": 2.531446933746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067013, + "balance_loss_mlp": 1.03947544, + "epoch": 0.9570988841862255, + "flos": 589662139392.0, + "grad_norm": 0.06516120913599614, + "language_loss": 0.78293043, + "learning_rate": 4.818867211936806e-06, + "loss": 0.79360056, + "num_input_tokens_seen": 412295088, + "router_z_loss_mlp": 0.27563477, + "step": 4975, + "time_per_iteration": 2.7902753353118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106743, + "balance_loss_mlp": 1.0391773, + "epoch": 0.957291265871489, + "flos": 766938710016.0, + "grad_norm": 0.06514295533680022, + "language_loss": 0.78948712, + "learning_rate": 4.7758143535454045e-06, + "loss": 0.80016142, + "num_input_tokens_seen": 412376992, + "router_z_loss_mlp": 0.2824707, + "step": 4976, + "time_per_iteration": 3.0192434787750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067896, + "balance_loss_mlp": 1.03983414, + "epoch": 0.9574836475567526, + "flos": 638820228096.0, + "grad_norm": 0.06668158886140403, + "language_loss": 0.844226, + "learning_rate": 4.732953758233849e-06, + "loss": 0.85490495, + "num_input_tokens_seen": 412450064, + "router_z_loss_mlp": 0.28051758, + "step": 4977, + "time_per_iteration": 2.796856641769409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010322, + "balance_loss_mlp": 0.99916387, + "epoch": 0.9576760292420161, + "flos": 1575077916672.0, + "grad_norm": 0.005308637901779806, + "language_loss": 0.78607261, + "learning_rate": 4.690285442642272e-06, + "loss": 0.79617584, + "num_input_tokens_seen": 412676896, + "router_z_loss_mlp": 0.11181641, + "step": 4978, + "time_per_iteration": 4.921823978424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067898, + "balance_loss_mlp": 1.03912115, + "epoch": 0.9578684109272797, + "flos": 496089308160.0, + "grad_norm": 0.05441807345174081, + "language_loss": 0.87236488, + "learning_rate": 4.6478094233358695e-06, + "loss": 0.88304389, + "num_input_tokens_seen": 412746848, + "router_z_loss_mlp": 0.28759766, + "step": 4979, + "time_per_iteration": 2.70119047164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071898, + "balance_loss_mlp": 1.04264426, + "epoch": 0.9580607926125433, + "flos": 429730472448.0, + "grad_norm": 0.06759599092224589, + "language_loss": 0.85242122, + "learning_rate": 4.605525716805337e-06, + "loss": 0.86314023, + "num_input_tokens_seen": 412810144, + "router_z_loss_mlp": 0.29223633, + "step": 4980, + "time_per_iteration": 2.492082357406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064793, + "balance_loss_mlp": 1.03682661, + "epoch": 0.9582531742978069, + "flos": 1126796659200.0, + "grad_norm": 0.056689820580710266, + "language_loss": 0.79991627, + "learning_rate": 4.563434339466599e-06, + "loss": 0.81056416, + "num_input_tokens_seen": 412904768, + "router_z_loss_mlp": 0.27978516, + "step": 4981, + "time_per_iteration": 3.57839298248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065826, + "balance_loss_mlp": 1.0384798, + "epoch": 0.9584455559830705, + "flos": 524185012224.0, + "grad_norm": 0.0491084118280761, + "language_loss": 0.79095042, + "learning_rate": 4.521535307661085e-06, + "loss": 0.80160868, + "num_input_tokens_seen": 412974592, + "router_z_loss_mlp": 0.27392578, + "step": 4982, + "time_per_iteration": 2.6562139987945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067885, + "balance_loss_mlp": 1.04049063, + "epoch": 0.9586379376683339, + "flos": 633873240576.0, + "grad_norm": 0.05909810114288763, + "language_loss": 0.80548841, + "learning_rate": 4.479828637655392e-06, + "loss": 0.81616724, + "num_input_tokens_seen": 413052848, + "router_z_loss_mlp": 0.27416992, + "step": 4983, + "time_per_iteration": 2.884284019470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064802, + "balance_loss_mlp": 1.03683555, + "epoch": 0.9588303193535975, + "flos": 415831007232.0, + "grad_norm": 0.06012496552815453, + "language_loss": 0.83002317, + "learning_rate": 4.438314345641459e-06, + "loss": 0.84067118, + "num_input_tokens_seen": 413118000, + "router_z_loss_mlp": 0.2800293, + "step": 4984, + "time_per_iteration": 2.531792640686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066004, + "balance_loss_mlp": 1.03756058, + "epoch": 0.9590227010388611, + "flos": 481440554496.0, + "grad_norm": 0.059119169486773586, + "language_loss": 0.77985901, + "learning_rate": 4.3969924477365585e-06, + "loss": 0.790519, + "num_input_tokens_seen": 413185616, + "router_z_loss_mlp": 0.28442383, + "step": 4985, + "time_per_iteration": 2.565157651901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066791, + "balance_loss_mlp": 1.03853846, + "epoch": 0.9592150827241247, + "flos": 684214193664.0, + "grad_norm": 0.05981675805708547, + "language_loss": 0.80249083, + "learning_rate": 4.355862959983359e-06, + "loss": 0.81315875, + "num_input_tokens_seen": 413265616, + "router_z_loss_mlp": 0.28271484, + "step": 4986, + "time_per_iteration": 2.948621988296509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063768, + "balance_loss_mlp": 1.03556311, + "epoch": 0.9594074644093882, + "flos": 574205870592.0, + "grad_norm": 0.053606231340170674, + "language_loss": 0.71040821, + "learning_rate": 4.314925898349642e-06, + "loss": 0.72104591, + "num_input_tokens_seen": 413341248, + "router_z_loss_mlp": 0.28222656, + "step": 4987, + "time_per_iteration": 2.713947296142578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067964, + "balance_loss_mlp": 1.03992605, + "epoch": 0.9595998460946518, + "flos": 546593204736.0, + "grad_norm": 0.06105815634499886, + "language_loss": 0.78293216, + "learning_rate": 4.2741812787286395e-06, + "loss": 0.79361176, + "num_input_tokens_seen": 413416080, + "router_z_loss_mlp": 0.28051758, + "step": 4988, + "time_per_iteration": 2.7715773582458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064589, + "balance_loss_mlp": 1.03650284, + "epoch": 0.9597922277799154, + "flos": 473798979072.0, + "grad_norm": 0.08864611353116542, + "language_loss": 0.78130996, + "learning_rate": 4.233629116938809e-06, + "loss": 0.79195589, + "num_input_tokens_seen": 413482336, + "router_z_loss_mlp": 0.28100586, + "step": 4989, + "time_per_iteration": 2.594235897064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061985, + "balance_loss_mlp": 1.03401875, + "epoch": 0.9599846094651789, + "flos": 514435193856.0, + "grad_norm": 0.05622217854933262, + "language_loss": 0.8567155, + "learning_rate": 4.193269428723889e-06, + "loss": 0.86733532, + "num_input_tokens_seen": 413553248, + "router_z_loss_mlp": 0.28027344, + "step": 4990, + "time_per_iteration": 2.6104650497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063049, + "balance_loss_mlp": 1.03498709, + "epoch": 0.9601769911504425, + "flos": 594689112576.0, + "grad_norm": 0.08881428945062002, + "language_loss": 0.78393328, + "learning_rate": 4.1531022297529035e-06, + "loss": 0.79456377, + "num_input_tokens_seen": 413625776, + "router_z_loss_mlp": 0.28076172, + "step": 4991, + "time_per_iteration": 2.7895936965942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066773, + "balance_loss_mlp": 1.03847301, + "epoch": 0.960369372835706, + "flos": 492755940864.0, + "grad_norm": 0.04867717103170429, + "language_loss": 0.79372609, + "learning_rate": 4.1131275356201536e-06, + "loss": 0.80439377, + "num_input_tokens_seen": 413693056, + "router_z_loss_mlp": 0.28320312, + "step": 4992, + "time_per_iteration": 2.6212775707244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066933, + "balance_loss_mlp": 1.03891885, + "epoch": 0.9605617545209696, + "flos": 579016055808.0, + "grad_norm": 0.05457191695661726, + "language_loss": 0.82460308, + "learning_rate": 4.073345361845171e-06, + "loss": 0.83527243, + "num_input_tokens_seen": 413765616, + "router_z_loss_mlp": 0.28051758, + "step": 4993, + "time_per_iteration": 2.759636640548706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106556, + "balance_loss_mlp": 1.03795075, + "epoch": 0.9607541362062332, + "flos": 927312717312.0, + "grad_norm": 0.048885736648258904, + "language_loss": 0.86471546, + "learning_rate": 4.033755723872767e-06, + "loss": 0.87537098, + "num_input_tokens_seen": 413850976, + "router_z_loss_mlp": 0.27661133, + "step": 4994, + "time_per_iteration": 3.2854697704315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070175, + "balance_loss_mlp": 1.04208946, + "epoch": 0.9609465178914968, + "flos": 572832359424.0, + "grad_norm": 0.05525276534284053, + "language_loss": 0.75332189, + "learning_rate": 3.994358637073036e-06, + "loss": 0.7640236, + "num_input_tokens_seen": 413931648, + "router_z_loss_mlp": 0.28100586, + "step": 4995, + "time_per_iteration": 2.8103957176208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067578, + "balance_loss_mlp": 1.03918266, + "epoch": 0.9611388995767602, + "flos": 530585496576.0, + "grad_norm": 0.055668397628729924, + "language_loss": 0.85367101, + "learning_rate": 3.955154116741244e-06, + "loss": 0.86434674, + "num_input_tokens_seen": 414003216, + "router_z_loss_mlp": 0.28393555, + "step": 4996, + "time_per_iteration": 2.683131217956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058352, + "balance_loss_mlp": 1.0309813, + "epoch": 0.9613312812620238, + "flos": 645959826432.0, + "grad_norm": 0.06526424456428359, + "language_loss": 0.82228351, + "learning_rate": 3.916142178097881e-06, + "loss": 0.83286703, + "num_input_tokens_seen": 414077072, + "router_z_loss_mlp": 0.27416992, + "step": 4997, + "time_per_iteration": 2.7618255615234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066452, + "balance_loss_mlp": 1.03891504, + "epoch": 0.9615236629472874, + "flos": 495897251328.0, + "grad_norm": 0.05310297597854665, + "language_loss": 0.77744323, + "learning_rate": 3.877322836288888e-06, + "loss": 0.78810775, + "num_input_tokens_seen": 414157600, + "router_z_loss_mlp": 0.27563477, + "step": 4998, + "time_per_iteration": 2.863664388656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106706, + "balance_loss_mlp": 1.03799713, + "epoch": 0.961716044632551, + "flos": 512716856832.0, + "grad_norm": 0.0744319505918789, + "language_loss": 0.75606596, + "learning_rate": 3.838696106385153e-06, + "loss": 0.76673657, + "num_input_tokens_seen": 414224880, + "router_z_loss_mlp": 0.29052734, + "step": 4999, + "time_per_iteration": 2.659785032272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072002, + "balance_loss_mlp": 1.04420233, + "epoch": 0.9619084263178146, + "flos": 500835474432.0, + "grad_norm": 0.06374446062108947, + "language_loss": 0.8034153, + "learning_rate": 3.800262003382904e-06, + "loss": 0.81413531, + "num_input_tokens_seen": 414291728, + "router_z_loss_mlp": 0.27832031, + "step": 5000, + "time_per_iteration": 2.5630085468292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063755, + "balance_loss_mlp": 1.03512073, + "epoch": 0.9621008080030781, + "flos": 595343858688.0, + "grad_norm": 0.08471732085322128, + "language_loss": 0.7496736, + "learning_rate": 3.7620205422035923e-06, + "loss": 0.76031113, + "num_input_tokens_seen": 414369568, + "router_z_loss_mlp": 0.28613281, + "step": 5001, + "time_per_iteration": 2.7929296493530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066628, + "balance_loss_mlp": 1.03811276, + "epoch": 0.9622931896883417, + "flos": 502002372096.0, + "grad_norm": 0.0587872194005596, + "language_loss": 0.82325351, + "learning_rate": 3.723971737693899e-06, + "loss": 0.83391976, + "num_input_tokens_seen": 414441424, + "router_z_loss_mlp": 0.28491211, + "step": 5002, + "time_per_iteration": 2.629521131515503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064966, + "balance_loss_mlp": 1.0366652, + "epoch": 0.9624855713736052, + "flos": 606998278656.0, + "grad_norm": 0.06325172707319822, + "language_loss": 0.80725789, + "learning_rate": 3.6861156046256728e-06, + "loss": 0.81790757, + "num_input_tokens_seen": 414512960, + "router_z_loss_mlp": 0.28320312, + "step": 5003, + "time_per_iteration": 2.839571952819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065227, + "balance_loss_mlp": 1.03747535, + "epoch": 0.9626779530588688, + "flos": 510461637120.0, + "grad_norm": 0.06727283899575592, + "language_loss": 0.84707081, + "learning_rate": 3.648452157695936e-06, + "loss": 0.85772312, + "num_input_tokens_seen": 414577392, + "router_z_loss_mlp": 0.27758789, + "step": 5004, + "time_per_iteration": 2.6041605472564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010647, + "balance_loss_mlp": 1.03730631, + "epoch": 0.9628703347441323, + "flos": 626994100224.0, + "grad_norm": 0.055831199103682276, + "language_loss": 0.8231709, + "learning_rate": 3.610981411526937e-06, + "loss": 0.83381784, + "num_input_tokens_seen": 414655152, + "router_z_loss_mlp": 0.27441406, + "step": 5005, + "time_per_iteration": 2.8136613368988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066818, + "balance_loss_mlp": 1.03806448, + "epoch": 0.9630627164293959, + "flos": 630474444288.0, + "grad_norm": 0.05495272478085719, + "language_loss": 0.774104, + "learning_rate": 3.573703380666149e-06, + "loss": 0.78477216, + "num_input_tokens_seen": 414730432, + "router_z_loss_mlp": 0.28735352, + "step": 5006, + "time_per_iteration": 2.769972324371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067484, + "balance_loss_mlp": 1.03994679, + "epoch": 0.9632550981146595, + "flos": 570267219456.0, + "grad_norm": 0.05396101102886816, + "language_loss": 0.78515279, + "learning_rate": 3.5366180795861622e-06, + "loss": 0.79582763, + "num_input_tokens_seen": 414810688, + "router_z_loss_mlp": 0.27587891, + "step": 5007, + "time_per_iteration": 2.833217144012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062742, + "balance_loss_mlp": 1.03394079, + "epoch": 0.9634474797999231, + "flos": 465857657856.0, + "grad_norm": 0.05608554449489955, + "language_loss": 0.80852854, + "learning_rate": 3.4997255226847937e-06, + "loss": 0.81915593, + "num_input_tokens_seen": 414880544, + "router_z_loss_mlp": 0.28808594, + "step": 5008, + "time_per_iteration": 2.6398768424987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064534, + "balance_loss_mlp": 1.03694844, + "epoch": 0.9636398614851867, + "flos": 526345689600.0, + "grad_norm": 0.05947227512115279, + "language_loss": 0.85232651, + "learning_rate": 3.463025724284974e-06, + "loss": 0.8629719, + "num_input_tokens_seen": 414949920, + "router_z_loss_mlp": 0.27612305, + "step": 5009, + "time_per_iteration": 2.6193339824676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064175, + "balance_loss_mlp": 1.03592229, + "epoch": 0.9638322431704501, + "flos": 564554976768.0, + "grad_norm": 0.057419474894705454, + "language_loss": 0.75136191, + "learning_rate": 3.4265186986348618e-06, + "loss": 0.76200366, + "num_input_tokens_seen": 415024288, + "router_z_loss_mlp": 0.28271484, + "step": 5010, + "time_per_iteration": 2.8186190128326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066701, + "balance_loss_mlp": 1.03835249, + "epoch": 0.9640246248557137, + "flos": 477531016704.0, + "grad_norm": 0.11381050052221461, + "language_loss": 0.84410369, + "learning_rate": 3.3902044599076754e-06, + "loss": 0.85477066, + "num_input_tokens_seen": 415092032, + "router_z_loss_mlp": 0.28320312, + "step": 5011, + "time_per_iteration": 2.607623338699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065872, + "balance_loss_mlp": 1.03831065, + "epoch": 0.9642170065409773, + "flos": 539063700480.0, + "grad_norm": 0.057233359656352366, + "language_loss": 0.88353223, + "learning_rate": 3.354083022201859e-06, + "loss": 0.89419091, + "num_input_tokens_seen": 415158544, + "router_z_loss_mlp": 0.27563477, + "step": 5012, + "time_per_iteration": 2.6468701362609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063888, + "balance_loss_mlp": 1.03625488, + "epoch": 0.9644093882262409, + "flos": 523499742720.0, + "grad_norm": 0.056318288112839024, + "language_loss": 0.83765054, + "learning_rate": 3.3181543995410843e-06, + "loss": 0.84828949, + "num_input_tokens_seen": 415225088, + "router_z_loss_mlp": 0.27636719, + "step": 5013, + "time_per_iteration": 2.6283926963806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067391, + "balance_loss_mlp": 1.03994918, + "epoch": 0.9646017699115044, + "flos": 574018195968.0, + "grad_norm": 0.06680838012319379, + "language_loss": 0.78578639, + "learning_rate": 3.2824186058740268e-06, + "loss": 0.79646027, + "num_input_tokens_seen": 415300224, + "router_z_loss_mlp": 0.2746582, + "step": 5014, + "time_per_iteration": 2.757387638092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065697, + "balance_loss_mlp": 1.03832626, + "epoch": 0.964794151596768, + "flos": 636511163904.0, + "grad_norm": 0.06470310275941542, + "language_loss": 0.8431797, + "learning_rate": 3.246875655074588e-06, + "loss": 0.85383666, + "num_input_tokens_seen": 415368784, + "router_z_loss_mlp": 0.27416992, + "step": 5015, + "time_per_iteration": 2.7526612281799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066344, + "balance_loss_mlp": 1.03885484, + "epoch": 0.9649865332820315, + "flos": 617155531776.0, + "grad_norm": 0.07525409199590156, + "language_loss": 0.86100334, + "learning_rate": 3.211525560941675e-06, + "loss": 0.87166679, + "num_input_tokens_seen": 415440752, + "router_z_loss_mlp": 0.27539062, + "step": 5016, + "time_per_iteration": 2.711585283279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069407, + "balance_loss_mlp": 1.04113102, + "epoch": 0.9651789149672951, + "flos": 515898865152.0, + "grad_norm": 0.05644315482934111, + "language_loss": 0.8094486, + "learning_rate": 3.1763683371994754e-06, + "loss": 0.82014269, + "num_input_tokens_seen": 415516128, + "router_z_loss_mlp": 0.28295898, + "step": 5017, + "time_per_iteration": 2.783452033996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106516, + "balance_loss_mlp": 1.03750336, + "epoch": 0.9653712966525587, + "flos": 492696304128.0, + "grad_norm": 0.05618640768914361, + "language_loss": 0.79814726, + "learning_rate": 3.1414039974972385e-06, + "loss": 0.80879885, + "num_input_tokens_seen": 415583744, + "router_z_loss_mlp": 0.27661133, + "step": 5018, + "time_per_iteration": 2.5714142322540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066299, + "balance_loss_mlp": 1.03776038, + "epoch": 0.9655636783378222, + "flos": 536287564800.0, + "grad_norm": 0.3262600560454796, + "language_loss": 0.821886, + "learning_rate": 3.106632555409328e-06, + "loss": 0.83254898, + "num_input_tokens_seen": 415659856, + "router_z_loss_mlp": 0.28540039, + "step": 5019, + "time_per_iteration": 2.7656137943267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065674, + "balance_loss_mlp": 1.03787422, + "epoch": 0.9657560600230858, + "flos": 458790842880.0, + "grad_norm": 0.07131607326554101, + "language_loss": 0.81939691, + "learning_rate": 3.072054024435167e-06, + "loss": 0.83005363, + "num_input_tokens_seen": 415731792, + "router_z_loss_mlp": 0.27832031, + "step": 5020, + "time_per_iteration": 2.7424540519714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064491, + "balance_loss_mlp": 1.03724039, + "epoch": 0.9659484417083494, + "flos": 685877276160.0, + "grad_norm": 0.06616301345736442, + "language_loss": 0.8344838, + "learning_rate": 3.0376684179994064e-06, + "loss": 0.84512877, + "num_input_tokens_seen": 415809536, + "router_z_loss_mlp": 0.27246094, + "step": 5021, + "time_per_iteration": 2.813933849334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009634, + "balance_loss_mlp": 0.99847621, + "epoch": 0.966140823393613, + "flos": 1501503879168.0, + "grad_norm": 0.004878091827342763, + "language_loss": 0.80694246, + "learning_rate": 3.0034757494516453e-06, + "loss": 0.81703877, + "num_input_tokens_seen": 416027600, + "router_z_loss_mlp": 0.11181641, + "step": 5022, + "time_per_iteration": 4.681534767150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066627, + "balance_loss_mlp": 1.03932786, + "epoch": 0.9663332050788765, + "flos": 464660236800.0, + "grad_norm": 0.0669391834816262, + "language_loss": 0.81136465, + "learning_rate": 2.9694760320667093e-06, + "loss": 0.8220309, + "num_input_tokens_seen": 416096128, + "router_z_loss_mlp": 0.27319336, + "step": 5023, + "time_per_iteration": 2.6037216186523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071183, + "balance_loss_mlp": 1.04376459, + "epoch": 0.96652558676414, + "flos": 500575016448.0, + "grad_norm": 0.05770087966529414, + "language_loss": 0.85576534, + "learning_rate": 2.9356692790444283e-06, + "loss": 0.86647713, + "num_input_tokens_seen": 416164256, + "router_z_loss_mlp": 0.2746582, + "step": 5024, + "time_per_iteration": 2.648139715194702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062992, + "balance_loss_mlp": 1.03476286, + "epoch": 0.9667179684494036, + "flos": 424614749184.0, + "grad_norm": 0.07347131630745982, + "language_loss": 0.82613868, + "learning_rate": 2.9020555035097484e-06, + "loss": 0.83676857, + "num_input_tokens_seen": 416227296, + "router_z_loss_mlp": 0.28222656, + "step": 5025, + "time_per_iteration": 2.4518802165985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067612, + "balance_loss_mlp": 1.03919196, + "epoch": 0.9669103501346672, + "flos": 516744258048.0, + "grad_norm": 0.056054793456989736, + "language_loss": 0.85796893, + "learning_rate": 2.8686347185127305e-06, + "loss": 0.86864507, + "num_input_tokens_seen": 416297184, + "router_z_loss_mlp": 0.28417969, + "step": 5026, + "time_per_iteration": 2.6519358158111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064191, + "balance_loss_mlp": 1.03581882, + "epoch": 0.9671027318199308, + "flos": 456008914944.0, + "grad_norm": 0.07661244422718277, + "language_loss": 0.75568247, + "learning_rate": 2.8354069370284396e-06, + "loss": 0.7663244, + "num_input_tokens_seen": 416363056, + "router_z_loss_mlp": 0.28393555, + "step": 5027, + "time_per_iteration": 2.6021740436553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063493, + "balance_loss_mlp": 1.03574109, + "epoch": 0.9672951135051943, + "flos": 524809234944.0, + "grad_norm": 0.05603975558530982, + "language_loss": 0.79859215, + "learning_rate": 2.802372171957057e-06, + "loss": 0.80922711, + "num_input_tokens_seen": 416430688, + "router_z_loss_mlp": 0.27783203, + "step": 5028, + "time_per_iteration": 2.653294086456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062661, + "balance_loss_mlp": 1.03440905, + "epoch": 0.9674874951904578, + "flos": 573708275712.0, + "grad_norm": 0.05632883535344154, + "language_loss": 0.79708344, + "learning_rate": 2.7695304361237682e-06, + "loss": 0.80771005, + "num_input_tokens_seen": 416505248, + "router_z_loss_mlp": 0.2824707, + "step": 5029, + "time_per_iteration": 2.8485989570617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064168, + "balance_loss_mlp": 1.03603494, + "epoch": 0.9676798768757214, + "flos": 628875380736.0, + "grad_norm": 0.05249570789540728, + "language_loss": 0.79920137, + "learning_rate": 2.7368817422789848e-06, + "loss": 0.80984306, + "num_input_tokens_seen": 416592640, + "router_z_loss_mlp": 0.28125, + "step": 5030, + "time_per_iteration": 2.9783546924591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008577, + "balance_loss_mlp": 0.99732375, + "epoch": 0.967872258560985, + "flos": 1463074831872.0, + "grad_norm": 0.0047902064985316075, + "language_loss": 0.75563359, + "learning_rate": 2.7044261030979566e-06, + "loss": 0.76571935, + "num_input_tokens_seen": 416808560, + "router_z_loss_mlp": 0.11230469, + "step": 5031, + "time_per_iteration": 4.6512672901153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068689, + "balance_loss_mlp": 1.04088974, + "epoch": 0.9680646402462486, + "flos": 565238836224.0, + "grad_norm": 0.06439989216377716, + "language_loss": 0.78775156, + "learning_rate": 2.672163531181049e-06, + "loss": 0.79843849, + "num_input_tokens_seen": 416878208, + "router_z_loss_mlp": 0.27832031, + "step": 5032, + "time_per_iteration": 2.711900234222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008099, + "balance_loss_mlp": 0.9968459, + "epoch": 0.9682570219315121, + "flos": 1433669635584.0, + "grad_norm": 0.004495052904339459, + "language_loss": 0.78074801, + "learning_rate": 2.6400940390537976e-06, + "loss": 0.79082906, + "num_input_tokens_seen": 417105968, + "router_z_loss_mlp": 0.11230469, + "step": 5033, + "time_per_iteration": 4.819545030593872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064787, + "balance_loss_mlp": 1.03727293, + "epoch": 0.9684494036167757, + "flos": 584338392576.0, + "grad_norm": 0.0648336777486898, + "language_loss": 0.81837499, + "learning_rate": 2.608217639166688e-06, + "loss": 0.82902288, + "num_input_tokens_seen": 417175168, + "router_z_loss_mlp": 0.27539062, + "step": 5034, + "time_per_iteration": 2.6948647499084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066131, + "balance_loss_mlp": 1.03728223, + "epoch": 0.9686417853020393, + "flos": 558784507392.0, + "grad_norm": 0.09835762498909857, + "language_loss": 0.84009242, + "learning_rate": 2.5765343438950982e-06, + "loss": 0.85075378, + "num_input_tokens_seen": 417247760, + "router_z_loss_mlp": 0.28833008, + "step": 5035, + "time_per_iteration": 2.681332588195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064729, + "balance_loss_mlp": 1.03638136, + "epoch": 0.9688341669873028, + "flos": 784594944000.0, + "grad_norm": 0.07041823637158158, + "language_loss": 0.83102357, + "learning_rate": 2.545044165539745e-06, + "loss": 0.84167081, + "num_input_tokens_seen": 417324080, + "router_z_loss_mlp": 0.28369141, + "step": 5036, + "time_per_iteration": 2.968900203704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067686, + "balance_loss_mlp": 1.03979087, + "epoch": 0.9690265486725663, + "flos": 395682416640.0, + "grad_norm": 0.058095180811742086, + "language_loss": 0.79474586, + "learning_rate": 2.513747116326126e-06, + "loss": 0.80542266, + "num_input_tokens_seen": 417386416, + "router_z_loss_mlp": 0.27929688, + "step": 5037, + "time_per_iteration": 2.4735050201416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070976, + "balance_loss_mlp": 1.0435822, + "epoch": 0.9692189303578299, + "flos": 476113835520.0, + "grad_norm": 0.06775264722732154, + "language_loss": 0.77614433, + "learning_rate": 2.4826432084048002e-06, + "loss": 0.78685409, + "num_input_tokens_seen": 417459648, + "router_z_loss_mlp": 0.27416992, + "step": 5038, + "time_per_iteration": 2.728487253189087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063713, + "balance_loss_mlp": 1.0362711, + "epoch": 0.9694113120430935, + "flos": 597297922560.0, + "grad_norm": 0.06545146976604356, + "language_loss": 0.78883851, + "learning_rate": 2.451732453851385e-06, + "loss": 0.79947555, + "num_input_tokens_seen": 417530512, + "router_z_loss_mlp": 0.27490234, + "step": 5039, + "time_per_iteration": 2.747343063354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061735, + "balance_loss_mlp": 1.03448391, + "epoch": 0.9696036937283571, + "flos": 500628860928.0, + "grad_norm": 0.05842492714952315, + "language_loss": 0.82463217, + "learning_rate": 2.4210148646665598e-06, + "loss": 0.83524954, + "num_input_tokens_seen": 417597600, + "router_z_loss_mlp": 0.27294922, + "step": 5040, + "time_per_iteration": 2.5741090774536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065285, + "balance_loss_mlp": 1.036937, + "epoch": 0.9697960754136207, + "flos": 432049711104.0, + "grad_norm": 0.06899414406463689, + "language_loss": 0.87255681, + "learning_rate": 2.3904904527758952e-06, + "loss": 0.88320959, + "num_input_tokens_seen": 417659616, + "router_z_loss_mlp": 0.28369141, + "step": 5041, + "time_per_iteration": 2.4628825187683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069848, + "balance_loss_mlp": 1.04042697, + "epoch": 0.9699884570988841, + "flos": 568257901056.0, + "grad_norm": 0.050873067640045296, + "language_loss": 0.85379595, + "learning_rate": 2.3601592300300235e-06, + "loss": 0.86449444, + "num_input_tokens_seen": 417730896, + "router_z_loss_mlp": 0.29418945, + "step": 5042, + "time_per_iteration": 2.7318944931030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069353, + "balance_loss_mlp": 1.04193473, + "epoch": 0.9701808387841477, + "flos": 515961474048.0, + "grad_norm": 0.06956938188967421, + "language_loss": 0.81409943, + "learning_rate": 2.33002120820458e-06, + "loss": 0.82479298, + "num_input_tokens_seen": 417803296, + "router_z_loss_mlp": 0.27441406, + "step": 5043, + "time_per_iteration": 2.65865421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065727, + "balance_loss_mlp": 1.0380702, + "epoch": 0.9703732204694113, + "flos": 491273330688.0, + "grad_norm": 0.07297009392614884, + "language_loss": 0.75900912, + "learning_rate": 2.300076399000206e-06, + "loss": 0.76966637, + "num_input_tokens_seen": 417870208, + "router_z_loss_mlp": 0.27661133, + "step": 5044, + "time_per_iteration": 2.5922508239746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064819, + "balance_loss_mlp": 1.03687608, + "epoch": 0.9705656021546749, + "flos": 625831584768.0, + "grad_norm": 0.058526336154578064, + "language_loss": 0.79872143, + "learning_rate": 2.2703248140424348e-06, + "loss": 0.80936968, + "num_input_tokens_seen": 417944464, + "router_z_loss_mlp": 0.27978516, + "step": 5045, + "time_per_iteration": 2.785860061645508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066292, + "balance_loss_mlp": 1.03796744, + "epoch": 0.9707579838399384, + "flos": 471198933504.0, + "grad_norm": 0.05613638274130696, + "language_loss": 0.82710165, + "learning_rate": 2.2407664648819715e-06, + "loss": 0.83776456, + "num_input_tokens_seen": 418010480, + "router_z_loss_mlp": 0.28320312, + "step": 5046, + "time_per_iteration": 2.6305992603302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063938, + "balance_loss_mlp": 1.0362339, + "epoch": 0.970950365525202, + "flos": 491845118976.0, + "grad_norm": 0.08794096275346511, + "language_loss": 0.80495691, + "learning_rate": 2.2114013629942475e-06, + "loss": 0.81559622, + "num_input_tokens_seen": 418083952, + "router_z_loss_mlp": 0.27709961, + "step": 5047, + "time_per_iteration": 2.671323299407959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064085, + "balance_loss_mlp": 1.03623736, + "epoch": 0.9711427472104656, + "flos": 557060378112.0, + "grad_norm": 0.060777831648666195, + "language_loss": 0.80575037, + "learning_rate": 2.1822295197799213e-06, + "loss": 0.81639123, + "num_input_tokens_seen": 418156672, + "router_z_loss_mlp": 0.27880859, + "step": 5048, + "time_per_iteration": 2.6912620067596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067773, + "balance_loss_mlp": 1.04013991, + "epoch": 0.9713351288957291, + "flos": 625527456768.0, + "grad_norm": 0.05303633777946519, + "language_loss": 0.8379271, + "learning_rate": 2.153250946564489e-06, + "loss": 0.84860486, + "num_input_tokens_seen": 418242160, + "router_z_loss_mlp": 0.27661133, + "step": 5049, + "time_per_iteration": 2.930760622024536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067092, + "balance_loss_mlp": 1.04017484, + "epoch": 0.9715275105809927, + "flos": 498821773824.0, + "grad_norm": 0.05437490593151225, + "language_loss": 0.80818999, + "learning_rate": 2.1244656545983397e-06, + "loss": 0.81886101, + "num_input_tokens_seen": 418316960, + "router_z_loss_mlp": 0.26977539, + "step": 5050, + "time_per_iteration": 2.7364494800567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064775, + "balance_loss_mlp": 1.03714252, + "epoch": 0.9717198922662562, + "flos": 477274940928.0, + "grad_norm": 0.06615210996370888, + "language_loss": 0.77408063, + "learning_rate": 2.0958736550570345e-06, + "loss": 0.78472841, + "num_input_tokens_seen": 418383888, + "router_z_loss_mlp": 0.27685547, + "step": 5051, + "time_per_iteration": 2.6002469062805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064465, + "balance_loss_mlp": 1.03578305, + "epoch": 0.9719122739515198, + "flos": 553171189248.0, + "grad_norm": 0.05273950962157412, + "language_loss": 0.78674865, + "learning_rate": 2.067474959040916e-06, + "loss": 0.79739332, + "num_input_tokens_seen": 418453776, + "router_z_loss_mlp": 0.28710938, + "step": 5052, + "time_per_iteration": 2.708221197128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062301, + "balance_loss_mlp": 1.03540766, + "epoch": 0.9721046556367834, + "flos": 565583662080.0, + "grad_norm": 0.2131840589845717, + "language_loss": 0.79749233, + "learning_rate": 2.0392695775753312e-06, + "loss": 0.80811536, + "num_input_tokens_seen": 418521984, + "router_z_loss_mlp": 0.26940918, + "step": 5053, + "time_per_iteration": 2.6769378185272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067739, + "balance_loss_mlp": 1.03965354, + "epoch": 0.972297037322047, + "flos": 560044537344.0, + "grad_norm": 0.06115965069395946, + "language_loss": 0.7824676, + "learning_rate": 2.0112575216105766e-06, + "loss": 0.79314494, + "num_input_tokens_seen": 418598768, + "router_z_loss_mlp": 0.28051758, + "step": 5054, + "time_per_iteration": 2.780709981918335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064914, + "balance_loss_mlp": 1.03682876, + "epoch": 0.9724894190073105, + "flos": 512175591936.0, + "grad_norm": 0.06000493777868893, + "language_loss": 0.79179239, + "learning_rate": 1.9834388020218974e-06, + "loss": 0.8024416, + "num_input_tokens_seen": 418670064, + "router_z_loss_mlp": 0.28100586, + "step": 5055, + "time_per_iteration": 2.679389238357544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065232, + "balance_loss_mlp": 1.03743291, + "epoch": 0.972681800692574, + "flos": 613532593152.0, + "grad_norm": 0.09595549516744886, + "language_loss": 0.80428839, + "learning_rate": 1.9558134296094875e-06, + "loss": 0.81494069, + "num_input_tokens_seen": 418745216, + "router_z_loss_mlp": 0.27832031, + "step": 5056, + "time_per_iteration": 2.790769338607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067435, + "balance_loss_mlp": 1.03927755, + "epoch": 0.9728741823778376, + "flos": 833562385920.0, + "grad_norm": 0.05624518415206626, + "language_loss": 0.83850849, + "learning_rate": 1.92838141509849e-06, + "loss": 0.84918284, + "num_input_tokens_seen": 418824224, + "router_z_loss_mlp": 0.28149414, + "step": 5057, + "time_per_iteration": 3.0661802291870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062382, + "balance_loss_mlp": 1.03324711, + "epoch": 0.9730665640631012, + "flos": 571167866880.0, + "grad_norm": 0.061543355235248995, + "language_loss": 0.84603822, + "learning_rate": 1.9011427691389415e-06, + "loss": 0.85666203, + "num_input_tokens_seen": 418899712, + "router_z_loss_mlp": 0.29077148, + "step": 5058, + "time_per_iteration": 2.7378501892089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063259, + "balance_loss_mlp": 1.03510189, + "epoch": 0.9732589457483648, + "flos": 506271292416.0, + "grad_norm": 0.05909256512343959, + "language_loss": 0.7731396, + "learning_rate": 1.8740975023057715e-06, + "loss": 0.78377223, + "num_input_tokens_seen": 418964912, + "router_z_loss_mlp": 0.28173828, + "step": 5059, + "time_per_iteration": 2.597114324569702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067045, + "balance_loss_mlp": 1.039222, + "epoch": 0.9734513274336283, + "flos": 926602716672.0, + "grad_norm": 0.05633726130728, + "language_loss": 0.80202436, + "learning_rate": 1.84724562509897e-06, + "loss": 0.81269479, + "num_input_tokens_seen": 419040032, + "router_z_loss_mlp": 0.27856445, + "step": 5060, + "time_per_iteration": 3.1069252490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066984, + "balance_loss_mlp": 1.0390172, + "epoch": 0.9736437091188919, + "flos": 491682175488.0, + "grad_norm": 0.04955029488996079, + "language_loss": 0.78345102, + "learning_rate": 1.8205871479433089e-06, + "loss": 0.79412079, + "num_input_tokens_seen": 419112672, + "router_z_loss_mlp": 0.2800293, + "step": 5061, + "time_per_iteration": 2.7237606048583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068144, + "balance_loss_mlp": 1.04027295, + "epoch": 0.9738360908041555, + "flos": 613039380480.0, + "grad_norm": 0.0670398565669916, + "language_loss": 0.83701253, + "learning_rate": 1.7941220811885096e-06, + "loss": 0.84769392, + "num_input_tokens_seen": 419183408, + "router_z_loss_mlp": 0.27856445, + "step": 5062, + "time_per_iteration": 2.705859422683716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008012, + "balance_loss_mlp": 0.99685371, + "epoch": 0.974028472489419, + "flos": 1548771922944.0, + "grad_norm": 0.004487982197495449, + "language_loss": 0.75992095, + "learning_rate": 1.7678504351092972e-06, + "loss": 0.77000105, + "num_input_tokens_seen": 419415472, + "router_z_loss_mlp": 0.11181641, + "step": 5063, + "time_per_iteration": 4.964916229248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008017, + "balance_loss_mlp": 0.99685866, + "epoch": 0.9742208541746825, + "flos": 1410403055616.0, + "grad_norm": 0.004488989410680284, + "language_loss": 0.79677713, + "learning_rate": 1.7417722199051245e-06, + "loss": 0.80685735, + "num_input_tokens_seen": 419651840, + "router_z_loss_mlp": 0.11181641, + "step": 5064, + "time_per_iteration": 4.9454896450042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065951, + "balance_loss_mlp": 1.03760338, + "epoch": 0.9744132358599461, + "flos": 674582238720.0, + "grad_norm": 0.047234166285075166, + "language_loss": 0.76724768, + "learning_rate": 1.7158874457005592e-06, + "loss": 0.77790713, + "num_input_tokens_seen": 419729424, + "router_z_loss_mlp": 0.28344727, + "step": 5065, + "time_per_iteration": 2.85241961479187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063551, + "balance_loss_mlp": 1.03520298, + "epoch": 0.9746056175452097, + "flos": 598111229952.0, + "grad_norm": 0.05284767537793641, + "language_loss": 0.77460915, + "learning_rate": 1.690196122544896e-06, + "loss": 0.7852447, + "num_input_tokens_seen": 419803616, + "router_z_loss_mlp": 0.28344727, + "step": 5066, + "time_per_iteration": 2.778743028640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106678, + "balance_loss_mlp": 1.03852713, + "epoch": 0.9747979992304733, + "flos": 731837237760.0, + "grad_norm": 0.056939987463815324, + "language_loss": 0.82497215, + "learning_rate": 1.6646982604123784e-06, + "loss": 0.83563995, + "num_input_tokens_seen": 419883536, + "router_z_loss_mlp": 0.2824707, + "step": 5067, + "time_per_iteration": 2.985030174255371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069279, + "balance_loss_mlp": 1.04045403, + "epoch": 0.9749903809157369, + "flos": 616219978752.0, + "grad_norm": 0.06983302671327438, + "language_loss": 0.76487023, + "learning_rate": 1.6393938692022548e-06, + "loss": 0.775563, + "num_input_tokens_seen": 419956816, + "router_z_loss_mlp": 0.28808594, + "step": 5068, + "time_per_iteration": 2.6938107013702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106283, + "balance_loss_mlp": 1.03483963, + "epoch": 0.9751827626010003, + "flos": 468160929792.0, + "grad_norm": 0.05087339231856929, + "language_loss": 0.83533263, + "learning_rate": 1.6142829587384443e-06, + "loss": 0.84596097, + "num_input_tokens_seen": 420022096, + "router_z_loss_mlp": 0.2800293, + "step": 5069, + "time_per_iteration": 2.602464437484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106988, + "balance_loss_mlp": 1.04174674, + "epoch": 0.9753751442862639, + "flos": 598918745088.0, + "grad_norm": 0.06914796858468633, + "language_loss": 0.85062265, + "learning_rate": 1.5893655387698713e-06, + "loss": 0.86132151, + "num_input_tokens_seen": 420097008, + "router_z_loss_mlp": 0.28149414, + "step": 5070, + "time_per_iteration": 2.7954771518707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066097, + "balance_loss_mlp": 1.03777337, + "epoch": 0.9755675259715275, + "flos": 650486232576.0, + "grad_norm": 0.051994743985587635, + "language_loss": 0.82142699, + "learning_rate": 1.5646416189704637e-06, + "loss": 0.83208799, + "num_input_tokens_seen": 420174960, + "router_z_loss_mlp": 0.28344727, + "step": 5071, + "time_per_iteration": 2.8875765800476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063694, + "balance_loss_mlp": 1.03646636, + "epoch": 0.9757599076567911, + "flos": 563392461312.0, + "grad_norm": 0.10792269115759393, + "language_loss": 0.79117143, + "learning_rate": 1.5401112089387659e-06, + "loss": 0.80180836, + "num_input_tokens_seen": 420245248, + "router_z_loss_mlp": 0.27294922, + "step": 5072, + "time_per_iteration": 2.6715874671936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066623, + "balance_loss_mlp": 1.03808391, + "epoch": 0.9759522893420547, + "flos": 504385629696.0, + "grad_norm": 0.061963410624624696, + "language_loss": 0.80203068, + "learning_rate": 1.5157743181983819e-06, + "loss": 0.81269693, + "num_input_tokens_seen": 420310688, + "router_z_loss_mlp": 0.28540039, + "step": 5073, + "time_per_iteration": 2.589348316192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062771, + "balance_loss_mlp": 1.03513861, + "epoch": 0.9761446710273182, + "flos": 583452301824.0, + "grad_norm": 0.07055046629147509, + "language_loss": 0.81962037, + "learning_rate": 1.4916309561976982e-06, + "loss": 0.83024812, + "num_input_tokens_seen": 420379008, + "router_z_loss_mlp": 0.27636719, + "step": 5074, + "time_per_iteration": 2.688120126724243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060628, + "balance_loss_mlp": 1.03256583, + "epoch": 0.9763370527125818, + "flos": 481967262720.0, + "grad_norm": 0.0708195540805075, + "language_loss": 0.82147515, + "learning_rate": 1.4676811323099947e-06, + "loss": 0.83208144, + "num_input_tokens_seen": 420445504, + "router_z_loss_mlp": 0.28051758, + "step": 5075, + "time_per_iteration": 2.5922911167144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065842, + "balance_loss_mlp": 1.03761315, + "epoch": 0.9765294343978453, + "flos": 618706543104.0, + "grad_norm": 0.050628843291049455, + "language_loss": 0.78722847, + "learning_rate": 1.4439248558335561e-06, + "loss": 0.79788685, + "num_input_tokens_seen": 420520528, + "router_z_loss_mlp": 0.28222656, + "step": 5076, + "time_per_iteration": 2.792860746383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106741, + "balance_loss_mlp": 1.03901386, + "epoch": 0.9767218160831089, + "flos": 526320958464.0, + "grad_norm": 0.055930544938087315, + "language_loss": 0.84628701, + "learning_rate": 1.4203621359911712e-06, + "loss": 0.85696107, + "num_input_tokens_seen": 420586224, + "router_z_loss_mlp": 0.28417969, + "step": 5077, + "time_per_iteration": 2.5855977535247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061522, + "balance_loss_mlp": 1.03350818, + "epoch": 0.9769141977683724, + "flos": 524932890624.0, + "grad_norm": 0.06238733542479722, + "language_loss": 0.83731985, + "learning_rate": 1.3969929819308557e-06, + "loss": 0.84793508, + "num_input_tokens_seen": 420655456, + "router_z_loss_mlp": 0.28027344, + "step": 5078, + "time_per_iteration": 2.65868878364563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068059, + "balance_loss_mlp": 1.03925812, + "epoch": 0.977106579453636, + "flos": 457359105024.0, + "grad_norm": 0.06822539995554136, + "language_loss": 0.80723315, + "learning_rate": 1.3738174027252416e-06, + "loss": 0.81791377, + "num_input_tokens_seen": 420733216, + "router_z_loss_mlp": 0.28759766, + "step": 5079, + "time_per_iteration": 2.923542022705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062951, + "balance_loss_mlp": 1.03467441, + "epoch": 0.9772989611388996, + "flos": 531830969856.0, + "grad_norm": 0.06212480282272471, + "language_loss": 0.8100605, + "learning_rate": 1.3508354073719642e-06, + "loss": 0.82069004, + "num_input_tokens_seen": 420803376, + "router_z_loss_mlp": 0.28271484, + "step": 5080, + "time_per_iteration": 2.5942180156707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064744, + "balance_loss_mlp": 1.03665841, + "epoch": 0.9774913428241632, + "flos": 754999100928.0, + "grad_norm": 0.06018369502974455, + "language_loss": 0.85829055, + "learning_rate": 1.3280470047933313e-06, + "loss": 0.86893803, + "num_input_tokens_seen": 420886256, + "router_z_loss_mlp": 0.28076172, + "step": 5081, + "time_per_iteration": 3.0458836555480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007999, + "balance_loss_mlp": 0.99684066, + "epoch": 0.9776837245094268, + "flos": 1553486003712.0, + "grad_norm": 0.004489173292933679, + "language_loss": 0.78895497, + "learning_rate": 1.3054522038366544e-06, + "loss": 0.79903495, + "num_input_tokens_seen": 421123728, + "router_z_loss_mlp": 0.11181641, + "step": 5082, + "time_per_iteration": 4.958382844924927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067914, + "balance_loss_mlp": 1.0406152, + "epoch": 0.9778761061946902, + "flos": 592260774912.0, + "grad_norm": 0.07104268119711042, + "language_loss": 0.83867311, + "learning_rate": 1.2830510132739725e-06, + "loss": 0.84935224, + "num_input_tokens_seen": 421192576, + "router_z_loss_mlp": 0.2734375, + "step": 5083, + "time_per_iteration": 2.6874279975891113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064965, + "balance_loss_mlp": 1.03678381, + "epoch": 0.9780684878799538, + "flos": 414732510720.0, + "grad_norm": 0.05557596006136318, + "language_loss": 0.81895953, + "learning_rate": 1.2608434418022175e-06, + "loss": 0.82960916, + "num_input_tokens_seen": 421256272, + "router_z_loss_mlp": 0.28173828, + "step": 5084, + "time_per_iteration": 2.479989767074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064918, + "balance_loss_mlp": 1.03676116, + "epoch": 0.9782608695652174, + "flos": 568129863168.0, + "grad_norm": 0.06367156527837714, + "language_loss": 0.84807253, + "learning_rate": 1.2388294980431036e-06, + "loss": 0.85872167, + "num_input_tokens_seen": 421332880, + "router_z_loss_mlp": 0.28173828, + "step": 5085, + "time_per_iteration": 2.7060656547546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070809, + "balance_loss_mlp": 1.04272389, + "epoch": 0.978453251250481, + "flos": 690151988736.0, + "grad_norm": 0.05907609333285634, + "language_loss": 0.82935727, + "learning_rate": 1.217009190543239e-06, + "loss": 0.84006536, + "num_input_tokens_seen": 421406160, + "router_z_loss_mlp": 0.28076172, + "step": 5086, + "time_per_iteration": 2.8707125186920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063057, + "balance_loss_mlp": 1.03611541, + "epoch": 0.9786456329357445, + "flos": 502239508992.0, + "grad_norm": 0.05371323925728747, + "language_loss": 0.77593768, + "learning_rate": 1.1953825277740694e-06, + "loss": 0.78656816, + "num_input_tokens_seen": 421476208, + "router_z_loss_mlp": 0.26977539, + "step": 5087, + "time_per_iteration": 2.6420705318450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063721, + "balance_loss_mlp": 1.03577852, + "epoch": 0.9788380146210081, + "flos": 862829369856.0, + "grad_norm": 0.06417447661678208, + "language_loss": 0.8063373, + "learning_rate": 1.1739495181317117e-06, + "loss": 0.81697452, + "num_input_tokens_seen": 421549232, + "router_z_loss_mlp": 0.27978516, + "step": 5088, + "time_per_iteration": 3.069293737411499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106397, + "balance_loss_mlp": 1.03669453, + "epoch": 0.9790303963062716, + "flos": 512460781056.0, + "grad_norm": 0.06576556043594046, + "language_loss": 0.8408463, + "learning_rate": 1.1527101699371767e-06, + "loss": 0.85148597, + "num_input_tokens_seen": 421617056, + "router_z_loss_mlp": 0.27319336, + "step": 5089, + "time_per_iteration": 2.5954997539520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064223, + "balance_loss_mlp": 1.03673351, + "epoch": 0.9792227779915352, + "flos": 494183296512.0, + "grad_norm": 0.06743703328576649, + "language_loss": 0.86218363, + "learning_rate": 1.1316644914364237e-06, + "loss": 0.87282586, + "num_input_tokens_seen": 421683424, + "router_z_loss_mlp": 0.27539062, + "step": 5090, + "time_per_iteration": 2.5840415954589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064681, + "balance_loss_mlp": 1.03685737, + "epoch": 0.9794151596767988, + "flos": 608037138432.0, + "grad_norm": 0.06524237500562691, + "language_loss": 0.81397247, + "learning_rate": 1.1108124908000838e-06, + "loss": 0.82461935, + "num_input_tokens_seen": 421761200, + "router_z_loss_mlp": 0.27856445, + "step": 5091, + "time_per_iteration": 2.840353012084961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064016, + "balance_loss_mlp": 1.0356915, + "epoch": 0.9796075413620623, + "flos": 477979149312.0, + "grad_norm": 0.062315389345704714, + "language_loss": 0.86601949, + "learning_rate": 1.09015417612357e-06, + "loss": 0.87665963, + "num_input_tokens_seen": 421829600, + "router_z_loss_mlp": 0.28320312, + "step": 5092, + "time_per_iteration": 2.6200978755950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066682, + "balance_loss_mlp": 1.03800082, + "epoch": 0.9797999230473259, + "flos": 591936297984.0, + "grad_norm": 0.06641731956014876, + "language_loss": 0.84266961, + "learning_rate": 1.0696895554271335e-06, + "loss": 0.85333645, + "num_input_tokens_seen": 421904928, + "router_z_loss_mlp": 0.28686523, + "step": 5093, + "time_per_iteration": 2.746304750442505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064074, + "balance_loss_mlp": 1.03651321, + "epoch": 0.9799923047325895, + "flos": 556086947328.0, + "grad_norm": 0.05417277505387902, + "language_loss": 0.81640154, + "learning_rate": 1.049418636655919e-06, + "loss": 0.82704222, + "num_input_tokens_seen": 421989616, + "router_z_loss_mlp": 0.27612305, + "step": 5094, + "time_per_iteration": 2.923612356185913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066797, + "balance_loss_mlp": 1.03766191, + "epoch": 0.9801846864178531, + "flos": 579164442624.0, + "grad_norm": 0.051233862308683015, + "language_loss": 0.84678006, + "learning_rate": 1.0293414276797974e-06, + "loss": 0.85744798, + "num_input_tokens_seen": 422067088, + "router_z_loss_mlp": 0.29101562, + "step": 5095, + "time_per_iteration": 2.808309316635132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066717, + "balance_loss_mlp": 1.03941762, + "epoch": 0.9803770681031165, + "flos": 514825099776.0, + "grad_norm": 0.07168178318211277, + "language_loss": 0.79702234, + "learning_rate": 1.0094579362933677e-06, + "loss": 0.80768943, + "num_input_tokens_seen": 422141136, + "router_z_loss_mlp": 0.2734375, + "step": 5096, + "time_per_iteration": 2.6654510498046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064751, + "balance_loss_mlp": 1.03690398, + "epoch": 0.9805694497883801, + "flos": 566706889728.0, + "grad_norm": 0.0522945877997543, + "language_loss": 0.78104866, + "learning_rate": 9.897681702160654e-07, + "loss": 0.79169619, + "num_input_tokens_seen": 422216400, + "router_z_loss_mlp": 0.27880859, + "step": 5097, + "time_per_iteration": 2.7318952083587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106322, + "balance_loss_mlp": 1.03561127, + "epoch": 0.9807618314736437, + "flos": 479106759168.0, + "grad_norm": 0.05974567880983708, + "language_loss": 0.73509181, + "learning_rate": 9.702721370922208e-07, + "loss": 0.74572396, + "num_input_tokens_seen": 422287664, + "router_z_loss_mlp": 0.27636719, + "step": 5098, + "time_per_iteration": 2.634428024291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066119, + "balance_loss_mlp": 1.03765178, + "epoch": 0.9809542131589073, + "flos": 545021844480.0, + "grad_norm": 0.0637255746549638, + "language_loss": 0.80092281, + "learning_rate": 9.509698444908344e-07, + "loss": 0.811584, + "num_input_tokens_seen": 422357552, + "router_z_loss_mlp": 0.28466797, + "step": 5099, + "time_per_iteration": 2.6950488090515137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106464, + "balance_loss_mlp": 1.0366019, + "epoch": 0.9811465948441709, + "flos": 520589776896.0, + "grad_norm": 0.07183511235342367, + "language_loss": 0.79666537, + "learning_rate": 9.318612999057452e-07, + "loss": 0.80731177, + "num_input_tokens_seen": 422425872, + "router_z_loss_mlp": 0.28076172, + "step": 5100, + "time_per_iteration": 2.612643003463745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062328, + "balance_loss_mlp": 1.03431392, + "epoch": 0.9813389765294344, + "flos": 541023556608.0, + "grad_norm": 0.05365704847096246, + "language_loss": 0.79934072, + "learning_rate": 9.129465107554635e-07, + "loss": 0.80996406, + "num_input_tokens_seen": 422495760, + "router_z_loss_mlp": 0.28051758, + "step": 5101, + "time_per_iteration": 2.675701856613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063332, + "balance_loss_mlp": 1.03608108, + "epoch": 0.981531358214698, + "flos": 567080828928.0, + "grad_norm": 0.053968731352124745, + "language_loss": 0.84537339, + "learning_rate": 8.942254843834485e-07, + "loss": 0.85600674, + "num_input_tokens_seen": 422568112, + "router_z_loss_mlp": 0.27294922, + "step": 5102, + "time_per_iteration": 2.696805000305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068977, + "balance_loss_mlp": 1.04117751, + "epoch": 0.9817237398999615, + "flos": 576987798528.0, + "grad_norm": 0.04658045278323515, + "language_loss": 0.81048197, + "learning_rate": 8.756982280578307e-07, + "loss": 0.82117176, + "num_input_tokens_seen": 422641280, + "router_z_loss_mlp": 0.27832031, + "step": 5103, + "time_per_iteration": 2.717839241027832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063782, + "balance_loss_mlp": 1.03536224, + "epoch": 0.9819161215852251, + "flos": 701172011520.0, + "grad_norm": 0.05020668582678838, + "language_loss": 0.81720734, + "learning_rate": 8.573647489714676e-07, + "loss": 0.82784516, + "num_input_tokens_seen": 422720416, + "router_z_loss_mlp": 0.28417969, + "step": 5104, + "time_per_iteration": 2.9835586547851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068728, + "balance_loss_mlp": 1.04111898, + "epoch": 0.9821085032704886, + "flos": 623873138688.0, + "grad_norm": 0.056311692471421905, + "language_loss": 0.84119457, + "learning_rate": 8.392250542421653e-07, + "loss": 0.85188186, + "num_input_tokens_seen": 422800384, + "router_z_loss_mlp": 0.27636719, + "step": 5105, + "time_per_iteration": 2.865739345550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066478, + "balance_loss_mlp": 1.03872645, + "epoch": 0.9823008849557522, + "flos": 499259731968.0, + "grad_norm": 0.0633138190007986, + "language_loss": 0.81195086, + "learning_rate": 8.212791509122353e-07, + "loss": 0.82261562, + "num_input_tokens_seen": 422870768, + "router_z_loss_mlp": 0.27807617, + "step": 5106, + "time_per_iteration": 2.659518241882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066087, + "balance_loss_mlp": 1.03788257, + "epoch": 0.9824932666410158, + "flos": 523544822784.0, + "grad_norm": 0.07527269875681585, + "language_loss": 0.72561419, + "learning_rate": 8.035270459489929e-07, + "loss": 0.73627502, + "num_input_tokens_seen": 422942864, + "router_z_loss_mlp": 0.28173828, + "step": 5107, + "time_per_iteration": 2.6718273162841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064766, + "balance_loss_mlp": 1.03632259, + "epoch": 0.9826856483262794, + "flos": 502411216896.0, + "grad_norm": 0.05421046095674237, + "language_loss": 0.8271212, + "learning_rate": 7.859687462443698e-07, + "loss": 0.83776885, + "num_input_tokens_seen": 423013600, + "router_z_loss_mlp": 0.28442383, + "step": 5108, + "time_per_iteration": 2.67730712890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068766, + "balance_loss_mlp": 1.04068017, + "epoch": 0.982878030011543, + "flos": 561768666624.0, + "grad_norm": 0.05263815336663701, + "language_loss": 0.84345829, + "learning_rate": 7.686042586151354e-07, + "loss": 0.854146, + "num_input_tokens_seen": 423093680, + "router_z_loss_mlp": 0.28125, + "step": 5109, + "time_per_iteration": 2.8377928733825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064493, + "balance_loss_mlp": 1.03690767, + "epoch": 0.9830704116968064, + "flos": 536824447488.0, + "grad_norm": 0.05247784776124124, + "language_loss": 0.827075, + "learning_rate": 7.514335898027857e-07, + "loss": 0.83771992, + "num_input_tokens_seen": 423168608, + "router_z_loss_mlp": 0.27612305, + "step": 5110, + "time_per_iteration": 2.7975401878356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065466, + "balance_loss_mlp": 1.03649783, + "epoch": 0.98326279338207, + "flos": 458712267264.0, + "grad_norm": 0.09015714109116883, + "language_loss": 0.83963883, + "learning_rate": 7.344567464735441e-07, + "loss": 0.85029346, + "num_input_tokens_seen": 423233552, + "router_z_loss_mlp": 0.28955078, + "step": 5111, + "time_per_iteration": 2.629821538925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064126, + "balance_loss_mlp": 1.03639817, + "epoch": 0.9834551750673336, + "flos": 640672395264.0, + "grad_norm": 0.0626478349182541, + "language_loss": 0.79414022, + "learning_rate": 7.17673735218416e-07, + "loss": 0.8047815, + "num_input_tokens_seen": 423307440, + "router_z_loss_mlp": 0.27758789, + "step": 5112, + "time_per_iteration": 2.8147006034851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066315, + "balance_loss_mlp": 1.03918338, + "epoch": 0.9836475567525972, + "flos": 1071373478400.0, + "grad_norm": 0.05679930986703107, + "language_loss": 0.79416007, + "learning_rate": 7.010845625530782e-07, + "loss": 0.80482322, + "num_input_tokens_seen": 423394880, + "router_z_loss_mlp": 0.27172852, + "step": 5113, + "time_per_iteration": 3.4686007499694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066539, + "balance_loss_mlp": 1.03871512, + "epoch": 0.9838399384378607, + "flos": 564943472640.0, + "grad_norm": 0.0695922230285022, + "language_loss": 0.76262808, + "learning_rate": 6.846892349181566e-07, + "loss": 0.7732935, + "num_input_tokens_seen": 423461792, + "router_z_loss_mlp": 0.27832031, + "step": 5114, + "time_per_iteration": 2.670605421066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067777, + "balance_loss_mlp": 1.03995383, + "epoch": 0.9840323201231242, + "flos": 772463278080.0, + "grad_norm": 0.060418718595467394, + "language_loss": 0.79443765, + "learning_rate": 6.684877586787819e-07, + "loss": 0.80511546, + "num_input_tokens_seen": 423539952, + "router_z_loss_mlp": 0.27880859, + "step": 5115, + "time_per_iteration": 2.9948134422302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065258, + "balance_loss_mlp": 1.03714883, + "epoch": 0.9842247018083878, + "flos": 472016623104.0, + "grad_norm": 0.06012665719169644, + "language_loss": 0.85382408, + "learning_rate": 6.524801401249225e-07, + "loss": 0.86447668, + "num_input_tokens_seen": 423607184, + "router_z_loss_mlp": 0.28125, + "step": 5116, + "time_per_iteration": 2.572911262512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068184, + "balance_loss_mlp": 1.03983617, + "epoch": 0.9844170834936514, + "flos": 524996909568.0, + "grad_norm": 0.05269181316920123, + "language_loss": 0.8446027, + "learning_rate": 6.366663854713295e-07, + "loss": 0.85528451, + "num_input_tokens_seen": 423676528, + "router_z_loss_mlp": 0.28369141, + "step": 5117, + "time_per_iteration": 2.621640682220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007971, + "balance_loss_mlp": 0.99681312, + "epoch": 0.984609465178915, + "flos": 1566406245888.0, + "grad_norm": 0.00448658618358924, + "language_loss": 0.77162516, + "learning_rate": 6.210465008574251e-07, + "loss": 0.7817049, + "num_input_tokens_seen": 423905856, + "router_z_loss_mlp": 0.11181641, + "step": 5118, + "time_per_iteration": 4.9339916706085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068461, + "balance_loss_mlp": 1.04085207, + "epoch": 0.9848018468641785, + "flos": 519294841344.0, + "grad_norm": 0.07090239298528411, + "language_loss": 0.81994283, + "learning_rate": 6.056204923473584e-07, + "loss": 0.83062744, + "num_input_tokens_seen": 423972496, + "router_z_loss_mlp": 0.27661133, + "step": 5119, + "time_per_iteration": 2.609553337097168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066341, + "balance_loss_mlp": 1.0383265, + "epoch": 0.9849942285494421, + "flos": 492760323072.0, + "grad_norm": 0.06400427825607695, + "language_loss": 0.83007431, + "learning_rate": 5.903883659301167e-07, + "loss": 0.8407377, + "num_input_tokens_seen": 424039968, + "router_z_loss_mlp": 0.28027344, + "step": 5120, + "time_per_iteration": 2.5743188858032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062999, + "balance_loss_mlp": 1.03477073, + "epoch": 0.9851866102347057, + "flos": 545740609536.0, + "grad_norm": 0.07031157312266538, + "language_loss": 0.80597335, + "learning_rate": 5.753501275193029e-07, + "loss": 0.81660336, + "num_input_tokens_seen": 424108096, + "router_z_loss_mlp": 0.2824707, + "step": 5121, + "time_per_iteration": 2.6467745304107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064653, + "balance_loss_mlp": 1.03656745, + "epoch": 0.9853789919199692, + "flos": 476019293184.0, + "grad_norm": 0.08630519258977648, + "language_loss": 0.80286318, + "learning_rate": 5.605057829531912e-07, + "loss": 0.8135097, + "num_input_tokens_seen": 424172256, + "router_z_loss_mlp": 0.28100586, + "step": 5122, + "time_per_iteration": 2.5414161682128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064091, + "balance_loss_mlp": 1.03586268, + "epoch": 0.9855713736052328, + "flos": 1032199524864.0, + "grad_norm": 0.07104979661639406, + "language_loss": 0.75887775, + "learning_rate": 5.458553379950049e-07, + "loss": 0.76951861, + "num_input_tokens_seen": 424261088, + "router_z_loss_mlp": 0.2824707, + "step": 5123, + "time_per_iteration": 3.3723208904266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068718, + "balance_loss_mlp": 1.04125214, + "epoch": 0.9857637552904963, + "flos": 494794372608.0, + "grad_norm": 0.05613599300487702, + "language_loss": 0.82546532, + "learning_rate": 5.31398798332472e-07, + "loss": 0.83615249, + "num_input_tokens_seen": 424329168, + "router_z_loss_mlp": 0.27490234, + "step": 5124, + "time_per_iteration": 2.6383025646209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067982, + "balance_loss_mlp": 1.03984904, + "epoch": 0.9859561369757599, + "flos": 591990142464.0, + "grad_norm": 0.06759103008670121, + "language_loss": 0.83886242, + "learning_rate": 5.17136169578103e-07, + "loss": 0.84954226, + "num_input_tokens_seen": 424399392, + "router_z_loss_mlp": 0.28149414, + "step": 5125, + "time_per_iteration": 2.72212553024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064404, + "balance_loss_mlp": 1.0372963, + "epoch": 0.9861485186610235, + "flos": 486719221248.0, + "grad_norm": 0.06666021262428576, + "language_loss": 0.78677505, + "learning_rate": 5.030674572691907e-07, + "loss": 0.79741907, + "num_input_tokens_seen": 424470080, + "router_z_loss_mlp": 0.27148438, + "step": 5126, + "time_per_iteration": 2.6846718788146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066171, + "balance_loss_mlp": 1.03858638, + "epoch": 0.9863409003462871, + "flos": 518536788480.0, + "grad_norm": 0.058638408725860694, + "language_loss": 0.82465839, + "learning_rate": 4.891926668676994e-07, + "loss": 0.83532012, + "num_input_tokens_seen": 424541824, + "router_z_loss_mlp": 0.27661133, + "step": 5127, + "time_per_iteration": 2.7298150062561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008067, + "balance_loss_mlp": 0.99681342, + "epoch": 0.9865332820315506, + "flos": 1485212391936.0, + "grad_norm": 0.004487140552061121, + "language_loss": 0.79182732, + "learning_rate": 4.755118037602646e-07, + "loss": 0.80190802, + "num_input_tokens_seen": 424773408, + "router_z_loss_mlp": 0.11230469, + "step": 5128, + "time_per_iteration": 4.89987587928772 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066434, + "balance_loss_mlp": 1.03872991, + "epoch": 0.9867256637168141, + "flos": 581837271552.0, + "grad_norm": 0.05732406131486392, + "language_loss": 0.78990746, + "learning_rate": 4.620248732582488e-07, + "loss": 0.8005718, + "num_input_tokens_seen": 424840608, + "router_z_loss_mlp": 0.27734375, + "step": 5129, + "time_per_iteration": 2.705324649810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066403, + "balance_loss_mlp": 1.03869843, + "epoch": 0.9869180454020777, + "flos": 958898939904.0, + "grad_norm": 0.05618301860540118, + "language_loss": 0.86019075, + "learning_rate": 4.487318805977969e-07, + "loss": 0.87085474, + "num_input_tokens_seen": 424926128, + "router_z_loss_mlp": 0.27758789, + "step": 5130, + "time_per_iteration": 3.2497148513793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068506, + "balance_loss_mlp": 1.04037285, + "epoch": 0.9871104270873413, + "flos": 770385558528.0, + "grad_norm": 0.06199778445898079, + "language_loss": 0.82707268, + "learning_rate": 4.3563283093966954e-07, + "loss": 0.83775777, + "num_input_tokens_seen": 425005744, + "router_z_loss_mlp": 0.28173828, + "step": 5131, + "time_per_iteration": 2.9684877395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061826, + "balance_loss_mlp": 1.03326333, + "epoch": 0.9873028087726049, + "flos": 446215426560.0, + "grad_norm": 0.08982074856332944, + "language_loss": 0.77832627, + "learning_rate": 4.2272772936940986e-07, + "loss": 0.78894454, + "num_input_tokens_seen": 425068112, + "router_z_loss_mlp": 0.28564453, + "step": 5132, + "time_per_iteration": 2.482541084289551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064048, + "balance_loss_mlp": 1.03629649, + "epoch": 0.9874951904578684, + "flos": 507359614464.0, + "grad_norm": 0.06533508581456446, + "language_loss": 0.86547804, + "learning_rate": 4.1001658089717676e-07, + "loss": 0.87611854, + "num_input_tokens_seen": 425137408, + "router_z_loss_mlp": 0.27758789, + "step": 5133, + "time_per_iteration": 2.606316089630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106598, + "balance_loss_mlp": 1.03815663, + "epoch": 0.987687572143132, + "flos": 716420256768.0, + "grad_norm": 0.05462139219845756, + "language_loss": 0.82088351, + "learning_rate": 3.9749939045791164e-07, + "loss": 0.83154333, + "num_input_tokens_seen": 425213504, + "router_z_loss_mlp": 0.27832031, + "step": 5134, + "time_per_iteration": 2.9544055461883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007967, + "balance_loss_mlp": 0.99680901, + "epoch": 0.9878799538283956, + "flos": 1537823121408.0, + "grad_norm": 0.004485925131120654, + "language_loss": 0.79817951, + "learning_rate": 3.851761629111716e-07, + "loss": 0.80825919, + "num_input_tokens_seen": 425451296, + "router_z_loss_mlp": 0.11181641, + "step": 5135, + "time_per_iteration": 4.916072368621826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071371, + "balance_loss_mlp": 1.04330945, + "epoch": 0.9880723355136591, + "flos": 721098021888.0, + "grad_norm": 0.05079199609455142, + "language_loss": 0.81718385, + "learning_rate": 3.730469030412964e-07, + "loss": 0.82789761, + "num_input_tokens_seen": 425527536, + "router_z_loss_mlp": 0.28076172, + "step": 5136, + "time_per_iteration": 2.918941020965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064402, + "balance_loss_mlp": 1.03743672, + "epoch": 0.9882647171989226, + "flos": 557085109248.0, + "grad_norm": 0.04769574235406856, + "language_loss": 0.84221953, + "learning_rate": 3.611116155572969e-07, + "loss": 0.85286361, + "num_input_tokens_seen": 425596608, + "router_z_loss_mlp": 0.27001953, + "step": 5137, + "time_per_iteration": 2.659917116165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069145, + "balance_loss_mlp": 1.04108286, + "epoch": 0.9884570988841862, + "flos": 562541276160.0, + "grad_norm": 0.06700563698780587, + "language_loss": 0.80401492, + "learning_rate": 3.493703050927999e-07, + "loss": 0.81470633, + "num_input_tokens_seen": 425667280, + "router_z_loss_mlp": 0.28076172, + "step": 5138, + "time_per_iteration": 2.7219605445861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065757, + "balance_loss_mlp": 1.03707492, + "epoch": 0.9886494805694498, + "flos": 431537559552.0, + "grad_norm": 0.05937359119861329, + "language_loss": 0.861534, + "learning_rate": 3.378229762062146e-07, + "loss": 0.87219155, + "num_input_tokens_seen": 425730736, + "router_z_loss_mlp": 0.28662109, + "step": 5139, + "time_per_iteration": 2.475071907043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066242, + "balance_loss_mlp": 1.03777456, + "epoch": 0.9888418622547134, + "flos": 591793703424.0, + "grad_norm": 0.06601069931668228, + "language_loss": 0.90451717, + "learning_rate": 3.264696333806771e-07, + "loss": 0.91517955, + "num_input_tokens_seen": 425807616, + "router_z_loss_mlp": 0.28442383, + "step": 5140, + "time_per_iteration": 2.7885544300079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068428, + "balance_loss_mlp": 1.04008031, + "epoch": 0.989034243939977, + "flos": 1134526984704.0, + "grad_norm": 0.05211766509967625, + "language_loss": 0.79793286, + "learning_rate": 3.1531028102388394e-07, + "loss": 0.80861717, + "num_input_tokens_seen": 425900880, + "router_z_loss_mlp": 0.28369141, + "step": 5141, + "time_per_iteration": 3.5274829864501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068155, + "balance_loss_mlp": 1.03997421, + "epoch": 0.9892266256252404, + "flos": 566405733888.0, + "grad_norm": 0.06930719912471439, + "language_loss": 0.82036865, + "learning_rate": 3.0434492346825824e-07, + "loss": 0.83105016, + "num_input_tokens_seen": 425973632, + "router_z_loss_mlp": 0.28173828, + "step": 5142, + "time_per_iteration": 2.703993320465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066028, + "balance_loss_mlp": 1.03787088, + "epoch": 0.989419007310504, + "flos": 640254786048.0, + "grad_norm": 0.055317623782820756, + "language_loss": 0.83511734, + "learning_rate": 2.9357356497095033e-07, + "loss": 0.84577763, + "num_input_tokens_seen": 426057088, + "router_z_loss_mlp": 0.28198242, + "step": 5143, + "time_per_iteration": 2.893228530883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066108, + "balance_loss_mlp": 1.03823721, + "epoch": 0.9896113889957676, + "flos": 455236305408.0, + "grad_norm": 0.08393848861396483, + "language_loss": 0.81569672, + "learning_rate": 2.829962097138372e-07, + "loss": 0.82635784, + "num_input_tokens_seen": 426124336, + "router_z_loss_mlp": 0.27929688, + "step": 5144, + "time_per_iteration": 2.6225786209106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062804, + "balance_loss_mlp": 1.03600597, + "epoch": 0.9898037706810312, + "flos": 567070654464.0, + "grad_norm": 0.06173682560666289, + "language_loss": 0.80544829, + "learning_rate": 2.726128618033008e-07, + "loss": 0.81607634, + "num_input_tokens_seen": 426191888, + "router_z_loss_mlp": 0.26843262, + "step": 5145, + "time_per_iteration": 2.654784917831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007962, + "balance_loss_mlp": 0.99680388, + "epoch": 0.9899961523662947, + "flos": 1549476131328.0, + "grad_norm": 0.004486567151540307, + "language_loss": 0.78146422, + "learning_rate": 2.624235252706164e-07, + "loss": 0.79154384, + "num_input_tokens_seen": 426425840, + "router_z_loss_mlp": 0.11181641, + "step": 5146, + "time_per_iteration": 4.91846489906311 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069812, + "balance_loss_mlp": 1.04148769, + "epoch": 0.9901885340515583, + "flos": 610401457152.0, + "grad_norm": 0.05716126832378814, + "language_loss": 0.85056078, + "learning_rate": 2.524282040715642e-07, + "loss": 0.86125898, + "num_input_tokens_seen": 426506080, + "router_z_loss_mlp": 0.28344727, + "step": 5147, + "time_per_iteration": 2.931447982788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065851, + "balance_loss_mlp": 1.03790796, + "epoch": 0.9903809157368219, + "flos": 517231678464.0, + "grad_norm": 0.0532065355074075, + "language_loss": 0.83003807, + "learning_rate": 2.426269020866512e-07, + "loss": 0.84069657, + "num_input_tokens_seen": 426573936, + "router_z_loss_mlp": 0.27978516, + "step": 5148, + "time_per_iteration": 2.582853317260742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067847, + "balance_loss_mlp": 1.04069078, + "epoch": 0.9905732974220854, + "flos": 1099985716224.0, + "grad_norm": 0.061881592272325446, + "language_loss": 0.8030684, + "learning_rate": 2.3301962312122226e-07, + "loss": 0.81374693, + "num_input_tokens_seen": 426657472, + "router_z_loss_mlp": 0.27197266, + "step": 5149, + "time_per_iteration": 3.4220290184020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106185, + "balance_loss_mlp": 1.03354931, + "epoch": 0.990765679107349, + "flos": 857630688768.0, + "grad_norm": 0.06728374369522626, + "language_loss": 0.84112859, + "learning_rate": 2.2360637090496073e-07, + "loss": 0.8517471, + "num_input_tokens_seen": 426740560, + "router_z_loss_mlp": 0.28271484, + "step": 5150, + "time_per_iteration": 3.148772716522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066189, + "balance_loss_mlp": 1.03838968, + "epoch": 0.9909580607926125, + "flos": 491041986048.0, + "grad_norm": 0.07292631958649022, + "language_loss": 0.79760653, + "learning_rate": 2.143871490925542e-07, + "loss": 0.80826843, + "num_input_tokens_seen": 426809296, + "router_z_loss_mlp": 0.27856445, + "step": 5151, + "time_per_iteration": 2.616525888442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062289, + "balance_loss_mlp": 1.03401303, + "epoch": 0.9911504424778761, + "flos": 584786525184.0, + "grad_norm": 0.054098688428558285, + "language_loss": 0.79339308, + "learning_rate": 2.0536196126319519e-07, + "loss": 0.80401593, + "num_input_tokens_seen": 426881056, + "router_z_loss_mlp": 0.28271484, + "step": 5152, + "time_per_iteration": 2.697601318359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063445, + "balance_loss_mlp": 1.03431082, + "epoch": 0.9913428241631397, + "flos": 569763832320.0, + "grad_norm": 0.05798999078782896, + "language_loss": 0.81267428, + "learning_rate": 1.9653081092074753e-07, + "loss": 0.82330877, + "num_input_tokens_seen": 426949664, + "router_z_loss_mlp": 0.29125977, + "step": 5153, + "time_per_iteration": 2.695401430130005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069116, + "balance_loss_mlp": 1.04124546, + "epoch": 0.9915352058484033, + "flos": 489505531392.0, + "grad_norm": 0.05056334219694486, + "language_loss": 0.86464977, + "learning_rate": 1.8789370149374652e-07, + "loss": 0.87534094, + "num_input_tokens_seen": 427018816, + "router_z_loss_mlp": 0.27880859, + "step": 5154, + "time_per_iteration": 2.605447292327881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066078, + "balance_loss_mlp": 1.03827786, + "epoch": 0.9917275875336667, + "flos": 743708445696.0, + "grad_norm": 0.05862021077509536, + "language_loss": 0.82721972, + "learning_rate": 1.7945063633545423e-07, + "loss": 0.83788049, + "num_input_tokens_seen": 427097984, + "router_z_loss_mlp": 0.27832031, + "step": 5155, + "time_per_iteration": 2.9818990230560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063222, + "balance_loss_mlp": 1.03580356, + "epoch": 0.9919199692189303, + "flos": 508009978368.0, + "grad_norm": 0.06232653880915019, + "language_loss": 0.80101055, + "learning_rate": 1.7120161872380412e-07, + "loss": 0.81164277, + "num_input_tokens_seen": 427169280, + "router_z_loss_mlp": 0.27441406, + "step": 5156, + "time_per_iteration": 2.7161529064178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065636, + "balance_loss_mlp": 1.03790784, + "epoch": 0.9921123509041939, + "flos": 543702177792.0, + "grad_norm": 0.05760646667889777, + "language_loss": 0.83967817, + "learning_rate": 1.6314665186123457e-07, + "loss": 0.85033458, + "num_input_tokens_seen": 427237312, + "router_z_loss_mlp": 0.27758789, + "step": 5157, + "time_per_iteration": 2.682803153991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067348, + "balance_loss_mlp": 1.03947723, + "epoch": 0.9923047325894575, + "flos": 671263428096.0, + "grad_norm": 0.06367529404568006, + "language_loss": 0.77228302, + "learning_rate": 1.5528573887507724e-07, + "loss": 0.78295648, + "num_input_tokens_seen": 427305008, + "router_z_loss_mlp": 0.27905273, + "step": 5158, + "time_per_iteration": 4.246589660644531 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065306, + "balance_loss_mlp": 1.03667164, + "epoch": 0.9924971142747211, + "flos": 466291233792.0, + "grad_norm": 0.05221936128324597, + "language_loss": 0.80749053, + "learning_rate": 1.4761888281711322e-07, + "loss": 0.81814361, + "num_input_tokens_seen": 427377008, + "router_z_loss_mlp": 0.28637695, + "step": 5159, + "time_per_iteration": 2.6864054203033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067543, + "balance_loss_mlp": 1.03955257, + "epoch": 0.9926894959599846, + "flos": 491337349632.0, + "grad_norm": 0.05537471302604842, + "language_loss": 0.82565844, + "learning_rate": 1.4014608666390594e-07, + "loss": 0.83633387, + "num_input_tokens_seen": 427444528, + "router_z_loss_mlp": 0.2800293, + "step": 5160, + "time_per_iteration": 2.582225799560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067776, + "balance_loss_mlp": 1.04052472, + "epoch": 0.9928818776452482, + "flos": 492144864768.0, + "grad_norm": 0.060411904501977205, + "language_loss": 0.81547213, + "learning_rate": 1.328673533166902e-07, + "loss": 0.82614988, + "num_input_tokens_seen": 427509808, + "router_z_loss_mlp": 0.27294922, + "step": 5161, + "time_per_iteration": 2.583430290222168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066178, + "balance_loss_mlp": 1.03759193, + "epoch": 0.9930742593305117, + "flos": 546081053184.0, + "grad_norm": 0.05578659951307412, + "language_loss": 0.84225255, + "learning_rate": 1.2578268560131666e-07, + "loss": 0.85291433, + "num_input_tokens_seen": 427587936, + "router_z_loss_mlp": 0.28588867, + "step": 5162, + "time_per_iteration": 2.784080743789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065962, + "balance_loss_mlp": 1.0385437, + "epoch": 0.9932666410157753, + "flos": 585234657792.0, + "grad_norm": 0.06485983604948299, + "language_loss": 0.85919869, + "learning_rate": 1.1889208626825188e-07, + "loss": 0.86985826, + "num_input_tokens_seen": 427662224, + "router_z_loss_mlp": 0.27441406, + "step": 5163, + "time_per_iteration": 2.8364484310150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068157, + "balance_loss_mlp": 1.04088211, + "epoch": 0.9934590227010388, + "flos": 536833211904.0, + "grad_norm": 0.05493620654926138, + "language_loss": 0.83572662, + "learning_rate": 1.1219555799268921e-07, + "loss": 0.84640813, + "num_input_tokens_seen": 427730544, + "router_z_loss_mlp": 0.27319336, + "step": 5164, + "time_per_iteration": 2.6437575817108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067967, + "balance_loss_mlp": 1.03973818, + "epoch": 0.9936514043863024, + "flos": 517754004480.0, + "grad_norm": 0.056425868204336455, + "language_loss": 0.86519146, + "learning_rate": 1.0569310337443794e-07, + "loss": 0.87587112, + "num_input_tokens_seen": 427799760, + "router_z_loss_mlp": 0.2824707, + "step": 5165, + "time_per_iteration": 2.6676025390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065235, + "balance_loss_mlp": 1.0375309, + "epoch": 0.993843786071566, + "flos": 744284616192.0, + "grad_norm": 0.0490013156900056, + "language_loss": 0.80073357, + "learning_rate": 9.938472493803419e-08, + "loss": 0.81138593, + "num_input_tokens_seen": 427881936, + "router_z_loss_mlp": 0.27734375, + "step": 5166, + "time_per_iteration": 3.0390608310699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106152, + "balance_loss_mlp": 1.0341022, + "epoch": 0.9940361677568296, + "flos": 525647273472.0, + "grad_norm": 0.06372482687070874, + "language_loss": 0.81947267, + "learning_rate": 9.327042513251893e-08, + "loss": 0.83008784, + "num_input_tokens_seen": 427951648, + "router_z_loss_mlp": 0.27441406, + "step": 5167, + "time_per_iteration": 2.7436130046844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106557, + "balance_loss_mlp": 1.03769922, + "epoch": 0.9942285494420932, + "flos": 555376946688.0, + "grad_norm": 0.058129004246180074, + "language_loss": 0.79914057, + "learning_rate": 8.735020633177104e-08, + "loss": 0.80979621, + "num_input_tokens_seen": 428031184, + "router_z_loss_mlp": 0.27880859, + "step": 5168, + "time_per_iteration": 2.7534189224243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061589, + "balance_loss_mlp": 1.03426659, + "epoch": 0.9944209311273566, + "flos": 585722078208.0, + "grad_norm": 0.055875518940578246, + "language_loss": 0.82051367, + "learning_rate": 8.162407083411872e-08, + "loss": 0.83112955, + "num_input_tokens_seen": 428107296, + "router_z_loss_mlp": 0.2734375, + "step": 5169, + "time_per_iteration": 2.6998350620269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066913, + "balance_loss_mlp": 1.03873193, + "epoch": 0.9946133128126202, + "flos": 735185161728.0, + "grad_norm": 0.0553680742338806, + "language_loss": 0.81735945, + "learning_rate": 7.609202086272804e-08, + "loss": 0.82802856, + "num_input_tokens_seen": 428187904, + "router_z_loss_mlp": 0.28173828, + "step": 5170, + "time_per_iteration": 2.974087953567505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067505, + "balance_loss_mlp": 1.04006255, + "epoch": 0.9948056944978838, + "flos": 645728481792.0, + "grad_norm": 0.05803029199457052, + "language_loss": 0.82077813, + "learning_rate": 7.075405856526995e-08, + "loss": 0.8314532, + "num_input_tokens_seen": 428255856, + "router_z_loss_mlp": 0.2746582, + "step": 5171, + "time_per_iteration": 2.802490711212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063573, + "balance_loss_mlp": 1.03567767, + "epoch": 0.9949980761831474, + "flos": 445610142720.0, + "grad_norm": 0.051666066315458954, + "language_loss": 0.8596555, + "learning_rate": 6.561018601414226e-08, + "loss": 0.87029123, + "num_input_tokens_seen": 428321872, + "router_z_loss_mlp": 0.27929688, + "step": 5172, + "time_per_iteration": 2.5076162815093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065596, + "balance_loss_mlp": 1.03736687, + "epoch": 0.995190457868411, + "flos": 435407809536.0, + "grad_norm": 0.05785345995526832, + "language_loss": 0.85552263, + "learning_rate": 6.066040520641414e-08, + "loss": 0.86617857, + "num_input_tokens_seen": 428389232, + "router_z_loss_mlp": 0.28198242, + "step": 5173, + "time_per_iteration": 2.560850143432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065028, + "balance_loss_mlp": 1.03730011, + "epoch": 0.9953828395536745, + "flos": 513937598976.0, + "grad_norm": 0.06879105592605711, + "language_loss": 0.81177318, + "learning_rate": 5.590471806377062e-08, + "loss": 0.82242346, + "num_input_tokens_seen": 428456128, + "router_z_loss_mlp": 0.27783203, + "step": 5174, + "time_per_iteration": 2.5707099437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068403, + "balance_loss_mlp": 1.04010248, + "epoch": 0.995575221238938, + "flos": 479608736256.0, + "grad_norm": 0.05793677097308627, + "language_loss": 0.81645823, + "learning_rate": 5.134312643245709e-08, + "loss": 0.82714224, + "num_input_tokens_seen": 428523504, + "router_z_loss_mlp": 0.28295898, + "step": 5175, + "time_per_iteration": 2.5351579189300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066274, + "balance_loss_mlp": 1.03816414, + "epoch": 0.9957676029242016, + "flos": 587500051968.0, + "grad_norm": 0.06414761636339157, + "language_loss": 0.76212519, + "learning_rate": 4.6975632083445793e-08, + "loss": 0.77278793, + "num_input_tokens_seen": 428596880, + "router_z_loss_mlp": 0.28125, + "step": 5176, + "time_per_iteration": 2.732705593109131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067516, + "balance_loss_mlp": 1.03833389, + "epoch": 0.9959599846094652, + "flos": 426244336128.0, + "grad_norm": 0.06423852980167365, + "language_loss": 0.80285561, + "learning_rate": 4.280223671243588e-08, + "loss": 0.8135308, + "num_input_tokens_seen": 428659472, + "router_z_loss_mlp": 0.29150391, + "step": 5177, + "time_per_iteration": 2.4773876667022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066877, + "balance_loss_mlp": 1.03843391, + "epoch": 0.9961523662947287, + "flos": 611312279040.0, + "grad_norm": 0.06073091502053508, + "language_loss": 0.80718446, + "learning_rate": 3.8822941939575804e-08, + "loss": 0.81785315, + "num_input_tokens_seen": 428736704, + "router_z_loss_mlp": 0.28442383, + "step": 5178, + "time_per_iteration": 2.8103115558624268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106861, + "balance_loss_mlp": 1.04052424, + "epoch": 0.9963447479799923, + "flos": 550521681408.0, + "grad_norm": 0.06916799430968669, + "language_loss": 0.73766887, + "learning_rate": 3.5037749309851927e-08, + "loss": 0.74835497, + "num_input_tokens_seen": 428808560, + "router_z_loss_mlp": 0.28076172, + "step": 5179, + "time_per_iteration": 2.703822135925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068044, + "balance_loss_mlp": 1.03933787, + "epoch": 0.9965371296652559, + "flos": 625590065664.0, + "grad_norm": 0.06211677936352167, + "language_loss": 0.8883329, + "learning_rate": 3.1446660292755446e-08, + "loss": 0.89901328, + "num_input_tokens_seen": 428880688, + "router_z_loss_mlp": 0.28710938, + "step": 5180, + "time_per_iteration": 2.718750238418579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067675, + "balance_loss_mlp": 1.03987575, + "epoch": 0.9967295113505195, + "flos": 639205751808.0, + "grad_norm": 0.0597666169713144, + "language_loss": 0.81612909, + "learning_rate": 2.8049676282504433e-08, + "loss": 0.82680583, + "num_input_tokens_seen": 428960096, + "router_z_loss_mlp": 0.27807617, + "step": 5181, + "time_per_iteration": 2.8592679500579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067652, + "balance_loss_mlp": 1.03949463, + "epoch": 0.996921893035783, + "flos": 607101585408.0, + "grad_norm": 0.06266480255691725, + "language_loss": 0.76737624, + "learning_rate": 2.484679859793282e-08, + "loss": 0.77805281, + "num_input_tokens_seen": 429031296, + "router_z_loss_mlp": 0.28149414, + "step": 5182, + "time_per_iteration": 2.7209362983703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066053, + "balance_loss_mlp": 1.03768075, + "epoch": 0.9971142747210465, + "flos": 643867550208.0, + "grad_norm": 0.06216190234164215, + "language_loss": 0.81831425, + "learning_rate": 2.183802848243488e-08, + "loss": 0.82897472, + "num_input_tokens_seen": 429103312, + "router_z_loss_mlp": 0.28393555, + "step": 5183, + "time_per_iteration": 2.774146795272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064034, + "balance_loss_mlp": 1.03599548, + "epoch": 0.9973066564063101, + "flos": 1040353251840.0, + "grad_norm": 0.06556543117014918, + "language_loss": 0.81132638, + "learning_rate": 1.9023367104187285e-08, + "loss": 0.82196677, + "num_input_tokens_seen": 429194896, + "router_z_loss_mlp": 0.28076172, + "step": 5184, + "time_per_iteration": 3.3546173572540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068636, + "balance_loss_mlp": 1.04109859, + "epoch": 0.9974990380915737, + "flos": 664784368128.0, + "grad_norm": 0.05954721572785247, + "language_loss": 0.82821018, + "learning_rate": 1.640281555587153e-08, + "loss": 0.83889651, + "num_input_tokens_seen": 429267664, + "router_z_loss_mlp": 0.27539062, + "step": 5185, + "time_per_iteration": 2.8433001041412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106356, + "balance_loss_mlp": 1.03559387, + "epoch": 0.9976914197768373, + "flos": 717808324608.0, + "grad_norm": 0.06389741277259232, + "language_loss": 0.77631515, + "learning_rate": 1.3976374855007024e-08, + "loss": 0.78695071, + "num_input_tokens_seen": 429343472, + "router_z_loss_mlp": 0.2800293, + "step": 5186, + "time_per_iteration": 2.853642225265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106937, + "balance_loss_mlp": 1.04040217, + "epoch": 0.9978838014621008, + "flos": 518078481408.0, + "grad_norm": 0.06367119653517782, + "language_loss": 0.78993869, + "learning_rate": 1.1744045943451464e-08, + "loss": 0.80063242, + "num_input_tokens_seen": 429411472, + "router_z_loss_mlp": 0.28979492, + "step": 5187, + "time_per_iteration": 2.597912073135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063481, + "balance_loss_mlp": 1.03594351, + "epoch": 0.9980761831473643, + "flos": 603138203136.0, + "grad_norm": 0.05419942237508798, + "language_loss": 0.84304327, + "learning_rate": 9.70582968801148e-09, + "loss": 0.85367805, + "num_input_tokens_seen": 429486704, + "router_z_loss_mlp": 0.27587891, + "step": 5188, + "time_per_iteration": 2.840768337249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065682, + "balance_loss_mlp": 1.03759623, + "epoch": 0.9982685648326279, + "flos": 453291005952.0, + "grad_norm": 0.06002889752243125, + "language_loss": 0.89368796, + "learning_rate": 7.861726879943021e-09, + "loss": 0.9043448, + "num_input_tokens_seen": 429554736, + "router_z_loss_mlp": 0.28125, + "step": 5189, + "time_per_iteration": 2.552727222442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062986, + "balance_loss_mlp": 1.03518605, + "epoch": 0.9984609465178915, + "flos": 481165539840.0, + "grad_norm": 0.06316155939206874, + "language_loss": 0.7862134, + "learning_rate": 6.211738235173403e-09, + "loss": 0.79684329, + "num_input_tokens_seen": 429623216, + "router_z_loss_mlp": 0.2779541, + "step": 5190, + "time_per_iteration": 2.6379244327545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066899, + "balance_loss_mlp": 1.03917098, + "epoch": 0.9986533282031551, + "flos": 476675449344.0, + "grad_norm": 0.05665801971866078, + "language_loss": 0.83808017, + "learning_rate": 4.755864394301312e-09, + "loss": 0.84874916, + "num_input_tokens_seen": 429695808, + "router_z_loss_mlp": 0.27734375, + "step": 5191, + "time_per_iteration": 2.6630475521087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069637, + "balance_loss_mlp": 1.04109824, + "epoch": 0.9988457098884186, + "flos": 641647236096.0, + "grad_norm": 0.060865154735589906, + "language_loss": 0.86545348, + "learning_rate": 3.494105922541291e-09, + "loss": 0.87614989, + "num_input_tokens_seen": 429774464, + "router_z_loss_mlp": 0.28540039, + "step": 5192, + "time_per_iteration": 2.8011183738708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064989, + "balance_loss_mlp": 1.03661728, + "epoch": 0.9990380915736822, + "flos": 396105818112.0, + "grad_norm": 0.06899373764772466, + "language_loss": 0.88023686, + "learning_rate": 2.4264633097237365e-09, + "loss": 0.89088672, + "num_input_tokens_seen": 429835872, + "router_z_loss_mlp": 0.28417969, + "step": 5193, + "time_per_iteration": 2.444704532623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063986, + "balance_loss_mlp": 1.0359, + "epoch": 0.9992304732589458, + "flos": 575831075328.0, + "grad_norm": 0.05722056537854718, + "language_loss": 0.84702891, + "learning_rate": 1.552936970405927e-09, + "loss": 0.85766876, + "num_input_tokens_seen": 429911440, + "router_z_loss_mlp": 0.28100586, + "step": 5194, + "time_per_iteration": 2.7448079586029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064391, + "balance_loss_mlp": 1.03680563, + "epoch": 0.9994228549442093, + "flos": 544017890304.0, + "grad_norm": 0.06097238370641317, + "language_loss": 0.75467938, + "learning_rate": 8.735272437054853e-10, + "loss": 0.76532328, + "num_input_tokens_seen": 429982512, + "router_z_loss_mlp": 0.27636719, + "step": 5195, + "time_per_iteration": 2.657932996749878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065274, + "balance_loss_mlp": 1.03723598, + "epoch": 0.9996152366294728, + "flos": 1470777910272.0, + "grad_norm": 0.07343124053049398, + "language_loss": 0.80373323, + "learning_rate": 3.882343933003796e-10, + "loss": 0.81438601, + "num_input_tokens_seen": 430070944, + "router_z_loss_mlp": 0.28051758, + "step": 5196, + "time_per_iteration": 3.7275376319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052349, + "balance_loss_mlp": 1.02769601, + "epoch": 0.9998076183147364, + "flos": 618667255296.0, + "grad_norm": 0.1051352637453268, + "language_loss": 0.69885886, + "learning_rate": 9.70586077619906e-11, + "loss": 0.70938236, + "num_input_tokens_seen": 430164864, + "router_z_loss_mlp": 0.24664307, + "step": 5197, + "time_per_iteration": 4.0517966747283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028654, + "balance_loss_mlp": 1.01130903, + "epoch": 1.0, + "flos": 1289959492608.0, + "grad_norm": 0.02771641462610759, + "language_loss": 0.84174764, + "learning_rate": 0.0, + "loss": 0.85203409, + "num_input_tokens_seen": 430340944, + "router_z_loss_mlp": 0.17364502, + "step": 5198, + "time_per_iteration": 5.574992895126343 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 430340944, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1713320035811328e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/training_args.bin b/sft_pretrain/Full_smoe_share/checkpoint-5198/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2c6286920da78be894d16b2c1ec77f899cd590e0 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b25bd416aaf59aaeb5c9268446dadaf85f4d00dfc3ac3dfec454141b47f814d1 +size 7992 diff --git a/sft_pretrain/Full_smoe_share/checkpoint-5198/zero_to_fp32.py b/sft_pretrain/Full_smoe_share/checkpoint-5198/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/checkpoint-5198/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe_share/config.json b/sft_pretrain/Full_smoe_share/config.json new file mode 100644 index 0000000000000000000000000000000000000000..778bd73ae4d3d4cc48e23fabca3003593286ecbd --- /dev/null +++ b/sft_pretrain/Full_smoe_share/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_share", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": true, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe_share/generation_config.json b/sft_pretrain/Full_smoe_share/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe_share/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe_share/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe_share/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe_share/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3e2d0c762bdd31468ec17feac5bdd62d38e82ad7 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a4a3130c658203a17c37b38def7719edce7c1fef2b626c71523c71c342ff486 +size 3759020544 diff --git a/sft_pretrain/Full_smoe_share/model.safetensors.index.json b/sft_pretrain/Full_smoe_share/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..90a56cf0153ed9062ff35ee2b372f7ab9e796c6a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731420128 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe_share/special_tokens_map.json b/sft_pretrain/Full_smoe_share/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe_share/tokenizer.model b/sft_pretrain/Full_smoe_share/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe_share/tokenizer_config.json b/sft_pretrain/Full_smoe_share/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe_share/trainer_state.json b/sft_pretrain/Full_smoe_share/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f6fd7709e022dd684a4b8204c27d6adb1815921a --- /dev/null +++ b/sft_pretrain/Full_smoe_share/trainer_state.json @@ -0,0 +1,78013 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5198, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03958175, + "balance_loss_mlp": 3.00755191, + "epoch": 0.00019238168526356292, + "flos": 470464353792.0, + "grad_norm": 28.914608756113072, + "language_loss": 3.87018156, + "learning_rate": 0.0, + "loss": 2.58113432, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 9.5, + "step": 1, + "time_per_iteration": 23.802019834518433 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01915335, + "balance_loss_mlp": 1.25005209, + "epoch": 0.00038476337052712584, + "flos": 504311436288.0, + "grad_norm": 4.8593923560988435, + "language_loss": 2.35405588, + "learning_rate": 0.00013726078121135892, + "loss": 2.37320924, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.65625, + "step": 2, + "time_per_iteration": 2.8532886505126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01920846, + "balance_loss_mlp": 1.25708926, + "epoch": 0.0005771450557906887, + "flos": 598869282816.0, + "grad_norm": 3.0028031994213777, + "language_loss": 1.96315837, + "learning_rate": 0.00021755319103969496, + "loss": 1.9823668, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.640625, + "step": 3, + "time_per_iteration": 2.841437339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01900548, + "balance_loss_mlp": 1.26196778, + "epoch": 0.0007695267410542517, + "flos": 580133491200.0, + "grad_norm": 1.731178632358193, + "language_loss": 1.51703906, + "learning_rate": 0.00027452156242271784, + "loss": 1.53604448, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.3828125, + "step": 4, + "time_per_iteration": 2.7456114292144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01846218, + "balance_loss_mlp": 1.25188851, + "epoch": 0.0009619084263178145, + "flos": 485857861632.0, + "grad_norm": 2.5417144067747603, + "language_loss": 1.52625787, + "learning_rate": 0.0003187096642208417, + "loss": 1.54472005, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 5.93359375, + "step": 5, + "time_per_iteration": 2.6199026107788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0183984, + "balance_loss_mlp": 1.27068734, + "epoch": 0.0011542901115813775, + "flos": 559744791552.0, + "grad_norm": 1.334824335042464, + "language_loss": 1.40782702, + "learning_rate": 0.0003548139722510539, + "loss": 1.42622542, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 5.69921875, + "step": 6, + "time_per_iteration": 2.747270107269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0199186, + "balance_loss_mlp": 1.44254375, + "epoch": 0.0013466717968449403, + "flos": 533721014784.0, + "grad_norm": 1.092177996343933, + "language_loss": 1.36706996, + "learning_rate": 0.00038533972973918044, + "loss": 1.38698864, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.5, + "step": 7, + "time_per_iteration": 2.6748878955841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02260733, + "balance_loss_mlp": 1.72209811, + "epoch": 0.0015390534821085034, + "flos": 492037175808.0, + "grad_norm": 0.8384078813871362, + "language_loss": 1.30779457, + "learning_rate": 0.0004117823436340768, + "loss": 1.3304019, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.390625, + "step": 8, + "time_per_iteration": 2.6130077838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02549259, + "balance_loss_mlp": 2.01024222, + "epoch": 0.0017314351673720662, + "flos": 564402207744.0, + "grad_norm": 0.9225645938984937, + "language_loss": 1.40127456, + "learning_rate": 0.00043510638207938993, + "loss": 1.42676711, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.39453125, + "step": 9, + "time_per_iteration": 2.8516194820404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02769124, + "balance_loss_mlp": 2.22057033, + "epoch": 0.001923816852635629, + "flos": 593132308992.0, + "grad_norm": 2.3673640139094667, + "language_loss": 1.25222194, + "learning_rate": 0.00045597044543220066, + "loss": 1.27991319, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.4921875, + "step": 10, + "time_per_iteration": 2.6775431632995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02889683, + "balance_loss_mlp": 2.31366348, + "epoch": 0.002116198537899192, + "flos": 609308752896.0, + "grad_norm": 3.9279002976271125, + "language_loss": 1.24874163, + "learning_rate": 0.00047484428652143135, + "loss": 1.27763844, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 5.765625, + "step": 11, + "time_per_iteration": 2.978304386138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0309849, + "balance_loss_mlp": 2.49538684, + "epoch": 0.002308580223162755, + "flos": 544869075456.0, + "grad_norm": 1.4997276509751025, + "language_loss": 1.30425894, + "learning_rate": 0.0004920747534624128, + "loss": 1.33524382, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 6.01953125, + "step": 12, + "time_per_iteration": 2.660757064819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0325611, + "balance_loss_mlp": 2.63698483, + "epoch": 0.002500961908426318, + "flos": 644458277376.0, + "grad_norm": 0.27573519674031227, + "language_loss": 1.29333067, + "learning_rate": 0.0005079252465375872, + "loss": 1.32589173, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 6.1875, + "step": 13, + "time_per_iteration": 2.905634880065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03517619, + "balance_loss_mlp": 2.87789392, + "epoch": 0.0026933435936898806, + "flos": 487605312000.0, + "grad_norm": 0.5949349515444387, + "language_loss": 1.16881835, + "learning_rate": 0.0005226005109505393, + "loss": 1.20399451, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 6.39453125, + "step": 14, + "time_per_iteration": 2.6116466522216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03647219, + "balance_loss_mlp": 2.99872088, + "epoch": 0.0028857252789534437, + "flos": 434368949760.0, + "grad_norm": 0.7718254129229014, + "language_loss": 1.22867727, + "learning_rate": 0.0005362628552605367, + "loss": 1.26514947, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 6.484375, + "step": 15, + "time_per_iteration": 2.80147123336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03485084, + "balance_loss_mlp": 2.81407928, + "epoch": 0.0030781069642170067, + "flos": 596465676288.0, + "grad_norm": 0.7401604798059911, + "language_loss": 1.27103257, + "learning_rate": 0.0005490431248454357, + "loss": 1.30588341, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 6.71484375, + "step": 16, + "time_per_iteration": 2.72638201713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03091961, + "balance_loss_mlp": 2.46329856, + "epoch": 0.0032704886494805694, + "flos": 1537360432128.0, + "grad_norm": 0.30683115050750837, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78797078, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 6.28125, + "step": 17, + "time_per_iteration": 6.094223260879517 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03189654, + "balance_loss_mlp": 2.50453377, + "epoch": 0.0034628703347441324, + "flos": 473720403456.0, + "grad_norm": 0.3045463524910074, + "language_loss": 1.13145232, + "learning_rate": 0.0005723671632907488, + "loss": 1.16334891, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 6.859375, + "step": 18, + "time_per_iteration": 2.6759910583496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03092663, + "balance_loss_mlp": 2.39648056, + "epoch": 0.0036552520200076955, + "flos": 448303320576.0, + "grad_norm": 0.23602477180386344, + "language_loss": 1.18155861, + "learning_rate": 0.0005830738490244919, + "loss": 1.21248519, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 6.97265625, + "step": 19, + "time_per_iteration": 2.505410671234131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03039888, + "balance_loss_mlp": 2.32653999, + "epoch": 0.003847633705271258, + "flos": 635881148928.0, + "grad_norm": 0.24009706761990102, + "language_loss": 1.19359791, + "learning_rate": 0.0005932312266435596, + "loss": 1.22399676, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 7.12890625, + "step": 20, + "time_per_iteration": 2.78657603263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03040938, + "balance_loss_mlp": 2.32339382, + "epoch": 0.004040015390534821, + "flos": 589222771200.0, + "grad_norm": 0.17079239690828452, + "language_loss": 1.14516783, + "learning_rate": 0.0006028929207788754, + "loss": 1.17557728, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 7.171875, + "step": 21, + "time_per_iteration": 2.7249202728271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03095818, + "balance_loss_mlp": 2.35843754, + "epoch": 0.004232397075798384, + "flos": 756253338624.0, + "grad_norm": 0.14242736472953105, + "language_loss": 1.17636526, + "learning_rate": 0.0006121050677327902, + "loss": 1.20732355, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 7.3671875, + "step": 22, + "time_per_iteration": 2.8816165924072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03158898, + "balance_loss_mlp": 2.41388798, + "epoch": 0.004424778761061947, + "flos": 526434439680.0, + "grad_norm": 0.2087285570273359, + "language_loss": 1.07450879, + "learning_rate": 0.0006209076479463684, + "loss": 1.10609782, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 7.44140625, + "step": 23, + "time_per_iteration": 2.6234865188598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03183939, + "balance_loss_mlp": 2.43282533, + "epoch": 0.00461716044632551, + "flos": 547907079168.0, + "grad_norm": 0.1648031444861348, + "language_loss": 1.17208815, + "learning_rate": 0.0006293355346737718, + "loss": 1.20392752, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 7.50390625, + "step": 24, + "time_per_iteration": 2.6747982501983643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03230874, + "balance_loss_mlp": 2.47976065, + "epoch": 0.004809542131589073, + "flos": 567293234688.0, + "grad_norm": 0.19727819873357916, + "language_loss": 1.13454294, + "learning_rate": 0.0006374193284416834, + "loss": 1.16685176, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 7.5078125, + "step": 25, + "time_per_iteration": 2.778822898864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0329228, + "balance_loss_mlp": 2.5568068, + "epoch": 0.005001923816852636, + "flos": 470391418368.0, + "grad_norm": 0.1350276315355779, + "language_loss": 1.11706781, + "learning_rate": 0.0006451860277489461, + "loss": 1.14999056, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 7.34765625, + "step": 26, + "time_per_iteration": 2.595344305038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03271905, + "balance_loss_mlp": 2.55016398, + "epoch": 0.005194305502116198, + "flos": 415283950080.0, + "grad_norm": 0.16347516382600882, + "language_loss": 1.19968891, + "learning_rate": 0.0006526595731190848, + "loss": 1.23240781, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 7.21484375, + "step": 27, + "time_per_iteration": 2.4664127826690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03288089, + "balance_loss_mlp": 2.59610367, + "epoch": 0.005386687187379761, + "flos": 628466535936.0, + "grad_norm": 0.1428829159478278, + "language_loss": 1.13108253, + "learning_rate": 0.0006598612921618983, + "loss": 1.16396332, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 6.92578125, + "step": 28, + "time_per_iteration": 2.804295778274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03294075, + "balance_loss_mlp": 2.62612176, + "epoch": 0.005579068872643324, + "flos": 886100332032.0, + "grad_norm": 0.20851883498814452, + "language_loss": 1.0600431, + "learning_rate": 0.0006668102665011454, + "loss": 1.09298372, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 6.68359375, + "step": 29, + "time_per_iteration": 3.255702495574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03199031, + "balance_loss_mlp": 2.59096837, + "epoch": 0.005771450557906887, + "flos": 547287238656.0, + "grad_norm": 0.2979528071454863, + "language_loss": 1.15479767, + "learning_rate": 0.0006735236364718957, + "loss": 1.18678796, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 6.0703125, + "step": 30, + "time_per_iteration": 2.7074596881866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03173184, + "balance_loss_mlp": 2.61356831, + "epoch": 0.00596383224317045, + "flos": 531766950912.0, + "grad_norm": 0.19339065750569648, + "language_loss": 1.13838637, + "learning_rate": 0.0006800168558381346, + "loss": 1.17011821, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 5.60546875, + "step": 31, + "time_per_iteration": 2.6867663860321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03044372, + "balance_loss_mlp": 2.54197669, + "epoch": 0.0061562139284340135, + "flos": 588813926400.0, + "grad_norm": 0.19192711986346297, + "language_loss": 1.17224455, + "learning_rate": 0.0006863039060567947, + "loss": 1.20268822, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 5.01953125, + "step": 32, + "time_per_iteration": 2.7029900550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02954172, + "balance_loss_mlp": 2.48954153, + "epoch": 0.006348595613697576, + "flos": 617929551360.0, + "grad_norm": 0.18120318877382763, + "language_loss": 1.09236336, + "learning_rate": 0.0006923974775611263, + "loss": 1.12190521, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 4.640625, + "step": 33, + "time_per_iteration": 2.7966651916503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02822322, + "balance_loss_mlp": 2.40728283, + "epoch": 0.006540977298961139, + "flos": 777564444672.0, + "grad_norm": 0.145871801521796, + "language_loss": 1.05915022, + "learning_rate": 0.0006983091239737814, + "loss": 1.0873735, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 4.15625, + "step": 34, + "time_per_iteration": 2.9987330436706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02690136, + "balance_loss_mlp": 2.31496024, + "epoch": 0.006733358984224702, + "flos": 666837356544.0, + "grad_norm": 0.3134152992972928, + "language_loss": 1.04935622, + "learning_rate": 0.0007040493939600222, + "loss": 1.07625759, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 3.75, + "step": 35, + "time_per_iteration": 2.8552193641662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02568493, + "balance_loss_mlp": 2.22154617, + "epoch": 0.006925740669488265, + "flos": 564092287488.0, + "grad_norm": 0.17701612022333574, + "language_loss": 1.05792356, + "learning_rate": 0.0007096279445021078, + "loss": 1.08360851, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 3.47070312, + "step": 36, + "time_per_iteration": 2.7224435806274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02489254, + "balance_loss_mlp": 2.16557646, + "epoch": 0.007118122354751828, + "flos": 549583156224.0, + "grad_norm": 0.13856321956275922, + "language_loss": 1.12953377, + "learning_rate": 0.0007150536386503726, + "loss": 1.15442634, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 3.23632812, + "step": 37, + "time_per_iteration": 2.868824005126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02371099, + "balance_loss_mlp": 2.08385229, + "epoch": 0.007310504040015391, + "flos": 702161409024.0, + "grad_norm": 0.1045684718913455, + "language_loss": 1.04885924, + "learning_rate": 0.0007203346302358509, + "loss": 1.0725702, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 2.87304688, + "step": 38, + "time_per_iteration": 2.9964613914489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.022844, + "balance_loss_mlp": 2.01431966, + "epoch": 0.007502885725278953, + "flos": 599022051840.0, + "grad_norm": 0.11457879899925279, + "language_loss": 1.09371829, + "learning_rate": 0.000725478437577282, + "loss": 1.11656225, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 2.703125, + "step": 39, + "time_per_iteration": 2.7697911262512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02209938, + "balance_loss_mlp": 1.9577868, + "epoch": 0.007695267410542516, + "flos": 560000867328.0, + "grad_norm": 0.09741634912607965, + "language_loss": 1.05106318, + "learning_rate": 0.0007304920078549186, + "loss": 1.07316256, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 2.51953125, + "step": 40, + "time_per_iteration": 2.6858811378479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02127988, + "balance_loss_mlp": 1.89738917, + "epoch": 0.007887649095806078, + "flos": 507906671616.0, + "grad_norm": 0.1027173821952558, + "language_loss": 1.0668, + "learning_rate": 0.0007353817735343603, + "loss": 1.08807993, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 2.30273438, + "step": 41, + "time_per_iteration": 2.7466464042663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0203117, + "balance_loss_mlp": 1.82136178, + "epoch": 0.008080030781069641, + "flos": 503642133504.0, + "grad_norm": 0.13433083641106106, + "language_loss": 1.02085233, + "learning_rate": 0.0007401537019902344, + "loss": 1.04116416, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 2.10058594, + "step": 42, + "time_per_iteration": 2.6472368240356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01955875, + "balance_loss_mlp": 1.77000403, + "epoch": 0.008272412466333205, + "flos": 517764178944.0, + "grad_norm": 0.1211736659455407, + "language_loss": 1.05737603, + "learning_rate": 0.0007448133392900729, + "loss": 1.07693481, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 1.85742188, + "step": 43, + "time_per_iteration": 2.716550588607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01955604, + "balance_loss_mlp": 1.78737581, + "epoch": 0.008464794151596768, + "flos": 607673373696.0, + "grad_norm": 0.16872872054008078, + "language_loss": 1.01187599, + "learning_rate": 0.0007493658489441491, + "loss": 1.03143215, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 1.68261719, + "step": 44, + "time_per_iteration": 2.875014066696167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01891991, + "balance_loss_mlp": 1.7426461, + "epoch": 0.00865717583686033, + "flos": 537661075968.0, + "grad_norm": 0.13908928982797317, + "language_loss": 1.04866791, + "learning_rate": 0.0007538160463002316, + "loss": 1.06758785, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 1.4921875, + "step": 45, + "time_per_iteration": 2.6579010486602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01770341, + "balance_loss_mlp": 1.64674437, + "epoch": 0.008849557522123894, + "flos": 507758284800.0, + "grad_norm": 0.10189568444589565, + "language_loss": 1.07831812, + "learning_rate": 0.0007581684291577274, + "loss": 1.09602141, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 1.234375, + "step": 46, + "time_per_iteration": 2.640967845916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01721967, + "balance_loss_mlp": 1.61086416, + "epoch": 0.009041939207387457, + "flos": 625048800768.0, + "grad_norm": 0.13316435244960997, + "language_loss": 1.10805786, + "learning_rate": 0.0007624272050891776, + "loss": 1.12527752, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 1.11230469, + "step": 47, + "time_per_iteration": 2.8335459232330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01578117, + "balance_loss_mlp": 1.4876132, + "epoch": 0.00923432089265102, + "flos": 549124849152.0, + "grad_norm": 0.11283146306838601, + "language_loss": 1.0112282, + "learning_rate": 0.0007665963158851307, + "loss": 1.02700949, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.90478516, + "step": 48, + "time_per_iteration": 2.8267853260040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01494271, + "balance_loss_mlp": 1.41659403, + "epoch": 0.009426702577914583, + "flos": 562202242560.0, + "grad_norm": 0.11438710989386189, + "language_loss": 1.09804726, + "learning_rate": 0.0007706794594783609, + "loss": 1.11299002, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.77587891, + "step": 49, + "time_per_iteration": 2.767359495162964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01450716, + "balance_loss_mlp": 1.37876153, + "epoch": 0.009619084263178146, + "flos": 616486228992.0, + "grad_norm": 0.12814906604020712, + "language_loss": 1.08643568, + "learning_rate": 0.0007746801096530423, + "loss": 1.10094285, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.71972656, + "step": 50, + "time_per_iteration": 2.8213155269622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0143922, + "balance_loss_mlp": 1.37599134, + "epoch": 0.009811465948441709, + "flos": 541176325632.0, + "grad_norm": 0.19317362931311696, + "language_loss": 1.13336241, + "learning_rate": 0.0007786015338021173, + "loss": 1.14775467, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.63183594, + "step": 51, + "time_per_iteration": 2.670414924621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01421394, + "balance_loss_mlp": 1.36116982, + "epoch": 0.010003847633705272, + "flos": 535608087552.0, + "grad_norm": 0.10636608126159033, + "language_loss": 1.06046486, + "learning_rate": 0.0007824468089603051, + "loss": 1.0746789, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.60205078, + "step": 52, + "time_per_iteration": 2.650749683380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01398771, + "balance_loss_mlp": 1.34627175, + "epoch": 0.010196229318968833, + "flos": 908867907072.0, + "grad_norm": 0.08734537144859746, + "language_loss": 1.05057502, + "learning_rate": 0.0007862188363098669, + "loss": 1.0645628, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.52587891, + "step": 53, + "time_per_iteration": 3.1914114952087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339892, + "balance_loss_mlp": 1.29123116, + "epoch": 0.010388611004232396, + "flos": 585594040320.0, + "grad_norm": 0.12892942806844523, + "language_loss": 1.05977488, + "learning_rate": 0.0007899203543304438, + "loss": 1.07317376, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.48608398, + "step": 54, + "time_per_iteration": 2.7370150089263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129116, + "balance_loss_mlp": 1.24609876, + "epoch": 0.01058099268949596, + "flos": 502233716736.0, + "grad_norm": 0.10351520483586135, + "language_loss": 1.19524932, + "learning_rate": 0.0007935539507422731, + "loss": 1.20816088, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.45068359, + "step": 55, + "time_per_iteration": 2.5938258171081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241218, + "balance_loss_mlp": 1.19842196, + "epoch": 0.010773374374759523, + "flos": 544170659328.0, + "grad_norm": 0.14579553174668378, + "language_loss": 1.11398613, + "learning_rate": 0.0007971220733732573, + "loss": 1.12639832, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.42822266, + "step": 56, + "time_per_iteration": 2.69441556930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214647, + "balance_loss_mlp": 1.1754272, + "epoch": 0.010965756060023086, + "flos": 525874235904.0, + "grad_norm": 0.08690334212617827, + "language_loss": 1.05753016, + "learning_rate": 0.0008006270400641869, + "loss": 1.06967664, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.39208984, + "step": 57, + "time_per_iteration": 2.72200345993042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172174, + "balance_loss_mlp": 1.13638771, + "epoch": 0.011158137745286649, + "flos": 576653147136.0, + "grad_norm": 0.1589230608581115, + "language_loss": 1.07195449, + "learning_rate": 0.0008040710477125043, + "loss": 1.08367622, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.35791016, + "step": 58, + "time_per_iteration": 2.7268636226654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116856, + "balance_loss_mlp": 1.13193893, + "epoch": 0.011350519430550212, + "flos": 529024310784.0, + "grad_norm": 0.10215076611006164, + "language_loss": 1.07557666, + "learning_rate": 0.0008074561805429771, + "loss": 1.08726227, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.36645508, + "step": 59, + "time_per_iteration": 2.6336522102355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116508, + "balance_loss_mlp": 1.13067603, + "epoch": 0.011542901115813775, + "flos": 555608291328.0, + "grad_norm": 0.1141641229712409, + "language_loss": 1.06040812, + "learning_rate": 0.0008107844176832545, + "loss": 1.07205892, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.34399414, + "step": 60, + "time_per_iteration": 2.6922121047973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181661, + "balance_loss_mlp": 1.14883125, + "epoch": 0.011735282801077338, + "flos": 571826995200.0, + "grad_norm": 0.13546354224487772, + "language_loss": 1.07509732, + "learning_rate": 0.0008140576401132568, + "loss": 1.08691382, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.32836914, + "step": 61, + "time_per_iteration": 2.632707357406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183098, + "balance_loss_mlp": 1.15415382, + "epoch": 0.0119276644863409, + "flos": 615309156864.0, + "grad_norm": 0.21921646489667587, + "language_loss": 1.08552384, + "learning_rate": 0.0008172776370494935, + "loss": 1.09735489, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.28955078, + "step": 62, + "time_per_iteration": 2.736295700073242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169478, + "balance_loss_mlp": 1.14103436, + "epoch": 0.012120046171604464, + "flos": 500835474432.0, + "grad_norm": 0.08851801033761798, + "language_loss": 1.15278125, + "learning_rate": 0.0008204461118185703, + "loss": 1.16447616, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.28417969, + "step": 63, + "time_per_iteration": 2.6189370155334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164028, + "balance_loss_mlp": 1.13801682, + "epoch": 0.012312427856868027, + "flos": 473109327360.0, + "grad_norm": 0.09949063345381139, + "language_loss": 1.0443747, + "learning_rate": 0.0008235646872681536, + "loss": 1.05601501, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.26025391, + "step": 64, + "time_per_iteration": 2.5901291370391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163729, + "balance_loss_mlp": 1.13857555, + "epoch": 0.012504809542131588, + "flos": 538094651904.0, + "grad_norm": 0.13431360680602436, + "language_loss": 1.04092753, + "learning_rate": 0.0008266349107584288, + "loss": 1.05256474, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.25146484, + "step": 65, + "time_per_iteration": 2.6860554218292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162931, + "balance_loss_mlp": 1.13891053, + "epoch": 0.012697191227395151, + "flos": 608450365440.0, + "grad_norm": 0.1102068865315058, + "language_loss": 1.07257366, + "learning_rate": 0.0008296582587724851, + "loss": 1.08420289, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.24023438, + "step": 66, + "time_per_iteration": 2.7269198894500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160507, + "balance_loss_mlp": 1.1370945, + "epoch": 0.012889572912658714, + "flos": 767750607360.0, + "grad_norm": 0.08100484164865049, + "language_loss": 1.05156851, + "learning_rate": 0.0008326361411800136, + "loss": 1.06317365, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.23400879, + "step": 67, + "time_per_iteration": 2.984511613845825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164469, + "balance_loss_mlp": 1.14209354, + "epoch": 0.013081954597922277, + "flos": 533604561408.0, + "grad_norm": 0.7331609098323609, + "language_loss": 1.05716372, + "learning_rate": 0.0008355699051851403, + "loss": 1.06880832, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.22363281, + "step": 68, + "time_per_iteration": 2.7606749534606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236513, + "balance_loss_mlp": 1.21256447, + "epoch": 0.01327433628318584, + "flos": 572826567168.0, + "grad_norm": 0.09768789722348739, + "language_loss": 1.12206995, + "learning_rate": 0.0008384608389860635, + "loss": 1.13443518, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.23950195, + "step": 69, + "time_per_iteration": 2.687361001968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308666, + "balance_loss_mlp": 1.28513408, + "epoch": 0.013466717968449404, + "flos": 497029243392.0, + "grad_norm": 0.20600635395561306, + "language_loss": 1.02831006, + "learning_rate": 0.000841310175171381, + "loss": 1.04139686, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.23510742, + "step": 70, + "time_per_iteration": 2.5935816764831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01326501, + "balance_loss_mlp": 1.30259991, + "epoch": 0.013659099653712967, + "flos": 565234454016.0, + "grad_norm": 0.21749814226597305, + "language_loss": 1.00826097, + "learning_rate": 0.000844119093875517, + "loss": 1.0215261, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.2388916, + "step": 71, + "time_per_iteration": 2.706749439239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327473, + "balance_loss_mlp": 1.30280876, + "epoch": 0.01385148133897653, + "flos": 573540950016.0, + "grad_norm": 0.15663283615990556, + "language_loss": 1.06174731, + "learning_rate": 0.0008468887257134666, + "loss": 1.0750221, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.24682617, + "step": 72, + "time_per_iteration": 2.6893503665924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01307936, + "balance_loss_mlp": 1.28290248, + "epoch": 0.014043863024240093, + "flos": 576539665920.0, + "grad_norm": 0.165113983041647, + "language_loss": 1.08480573, + "learning_rate": 0.0008496201545131264, + "loss": 1.09788513, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.25012207, + "step": 73, + "time_per_iteration": 2.722555637359619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228575, + "balance_loss_mlp": 1.20456624, + "epoch": 0.014236244709503656, + "flos": 938287660032.0, + "grad_norm": 0.08819174949442792, + "language_loss": 1.05711758, + "learning_rate": 0.0008523144198617317, + "loss": 1.06940317, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.2401123, + "step": 74, + "time_per_iteration": 3.1970512866973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197377, + "balance_loss_mlp": 1.17341638, + "epoch": 0.014428626394767219, + "flos": 528231352320.0, + "grad_norm": 0.4509181854760719, + "language_loss": 1.05384588, + "learning_rate": 0.0008549725194813783, + "loss": 1.06581974, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.23962402, + "step": 75, + "time_per_iteration": 2.6595916748046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153799, + "balance_loss_mlp": 1.13126826, + "epoch": 0.014621008080030782, + "flos": 803371433472.0, + "grad_norm": 0.13717241934186405, + "language_loss": 1.0561651, + "learning_rate": 0.0008575954114472099, + "loss": 1.06770301, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.2253418, + "step": 76, + "time_per_iteration": 3.126678943634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146131, + "balance_loss_mlp": 1.12299228, + "epoch": 0.014813389765294343, + "flos": 696588788736.0, + "grad_norm": 0.24880809118993477, + "language_loss": 1.04725742, + "learning_rate": 0.0008601840162606118, + "loss": 1.05871868, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.23132324, + "step": 77, + "time_per_iteration": 3.0479044914245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125538, + "balance_loss_mlp": 1.10244715, + "epoch": 0.015005771450557906, + "flos": 596702813184.0, + "grad_norm": 0.18599993070264256, + "language_loss": 1.10793126, + "learning_rate": 0.000862739218788641, + "loss": 1.11918664, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.23083496, + "step": 78, + "time_per_iteration": 2.8093104362487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206073, + "balance_loss_mlp": 1.18093228, + "epoch": 0.01519815313582147, + "flos": 549148170240.0, + "grad_norm": 0.1007392116308827, + "language_loss": 1.07089067, + "learning_rate": 0.0008652618700799138, + "loss": 1.08295143, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.25146484, + "step": 79, + "time_per_iteration": 2.657278060913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312607, + "balance_loss_mlp": 1.28511751, + "epoch": 0.015390534821085032, + "flos": 430306642944.0, + "grad_norm": 0.10464806869950885, + "language_loss": 1.06340718, + "learning_rate": 0.0008677527890662774, + "loss": 1.07653332, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.27514648, + "step": 80, + "time_per_iteration": 2.541733741760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01403725, + "balance_loss_mlp": 1.37456632, + "epoch": 0.015582916506348595, + "flos": 523854743040.0, + "grad_norm": 0.15378710965831335, + "language_loss": 1.0758636, + "learning_rate": 0.0008702127641587799, + "loss": 1.08990085, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.29125977, + "step": 81, + "time_per_iteration": 2.6628620624542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01387899, + "balance_loss_mlp": 1.36045754, + "epoch": 0.015775298191612157, + "flos": 575151598080.0, + "grad_norm": 0.16587297874586884, + "language_loss": 1.02605438, + "learning_rate": 0.0008726425547457192, + "loss": 1.03993344, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.27490234, + "step": 82, + "time_per_iteration": 2.774146795272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01365036, + "balance_loss_mlp": 1.34021688, + "epoch": 0.01596767987687572, + "flos": 610040664576.0, + "grad_norm": 0.16158882984955267, + "language_loss": 1.02648211, + "learning_rate": 0.0008750428925998964, + "loss": 1.04013252, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.24829102, + "step": 83, + "time_per_iteration": 2.745786190032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01321379, + "balance_loss_mlp": 1.29746556, + "epoch": 0.016160061562139283, + "flos": 566864040960.0, + "grad_norm": 0.12210664974135504, + "language_loss": 1.08113122, + "learning_rate": 0.0008774144832015932, + "loss": 1.09434509, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.23937988, + "step": 84, + "time_per_iteration": 2.695239543914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01701738, + "balance_loss_mlp": 1.6791358, + "epoch": 0.016352443247402846, + "flos": 1410557234688.0, + "grad_norm": 0.2213803749296612, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76476049, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.22558594, + "step": 85, + "time_per_iteration": 4.597177982330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228049, + "balance_loss_mlp": 1.20597172, + "epoch": 0.01654482493266641, + "flos": 730177127424.0, + "grad_norm": 0.08119704963525505, + "language_loss": 1.03748381, + "learning_rate": 0.0008820741205014318, + "loss": 1.04976428, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.22070312, + "step": 86, + "time_per_iteration": 2.881804943084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193732, + "balance_loss_mlp": 1.17282319, + "epoch": 0.016737206617929972, + "flos": 536016932352.0, + "grad_norm": 0.06752942516789381, + "language_loss": 1.04735541, + "learning_rate": 0.0008843634575408404, + "loss": 1.05929279, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.20922852, + "step": 87, + "time_per_iteration": 2.681497812271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197059, + "balance_loss_mlp": 1.17523217, + "epoch": 0.016929588303193535, + "flos": 536706584064.0, + "grad_norm": 0.068849585693396, + "language_loss": 1.06270838, + "learning_rate": 0.0008866266301555082, + "loss": 1.0746789, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.21826172, + "step": 88, + "time_per_iteration": 2.7393336296081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188909, + "balance_loss_mlp": 1.16813099, + "epoch": 0.017121969988457098, + "flos": 526498458624.0, + "grad_norm": 0.11163273932728453, + "language_loss": 1.06937528, + "learning_rate": 0.0008888642296509615, + "loss": 1.08126438, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.20776367, + "step": 89, + "time_per_iteration": 2.5859603881835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190142, + "balance_loss_mlp": 1.16919696, + "epoch": 0.01731435167372066, + "flos": 625304876544.0, + "grad_norm": 0.08151329596812326, + "language_loss": 1.11272717, + "learning_rate": 0.0008910768275115906, + "loss": 1.12462866, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.20947266, + "step": 90, + "time_per_iteration": 2.7672746181488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188461, + "balance_loss_mlp": 1.16750431, + "epoch": 0.017506733358984224, + "flos": 496157709312.0, + "grad_norm": 0.10059554630111206, + "language_loss": 1.06862557, + "learning_rate": 0.0008932649762767675, + "loss": 1.08051026, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.20947266, + "step": 91, + "time_per_iteration": 2.5685906410217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164871, + "balance_loss_mlp": 1.14348471, + "epoch": 0.017699115044247787, + "flos": 745613047296.0, + "grad_norm": 0.10996439779682221, + "language_loss": 1.10012543, + "learning_rate": 0.0008954292103690864, + "loss": 1.11177421, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.21398926, + "step": 92, + "time_per_iteration": 2.974438428878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164995, + "balance_loss_mlp": 1.14351392, + "epoch": 0.01789149672951135, + "flos": 515257265664.0, + "grad_norm": 0.07660536936337886, + "language_loss": 1.12072349, + "learning_rate": 0.0008975700468778296, + "loss": 1.13237333, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.21496582, + "step": 93, + "time_per_iteration": 2.5806186199188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162029, + "balance_loss_mlp": 1.14116728, + "epoch": 0.018083878414774913, + "flos": 585850116096.0, + "grad_norm": 0.0766138268717318, + "language_loss": 1.04864383, + "learning_rate": 0.0008996879863005366, + "loss": 1.06026423, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.20874023, + "step": 94, + "time_per_iteration": 2.6688339710235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153951, + "balance_loss_mlp": 1.13311303, + "epoch": 0.018276260100038477, + "flos": 497103436800.0, + "grad_norm": 0.05852633811132637, + "language_loss": 1.05006421, + "learning_rate": 0.0009017835132453337, + "loss": 1.06160367, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.20849609, + "step": 95, + "time_per_iteration": 2.5905888080596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168717, + "balance_loss_mlp": 1.14677107, + "epoch": 0.01846864178530204, + "flos": 639765955584.0, + "grad_norm": 0.10434292302548942, + "language_loss": 1.05011988, + "learning_rate": 0.0009038570970964896, + "loss": 1.06180692, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.21960449, + "step": 96, + "time_per_iteration": 2.819176197052002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143672, + "balance_loss_mlp": 1.12233388, + "epoch": 0.018661023470565603, + "flos": 511411746816.0, + "grad_norm": 0.06578690538752763, + "language_loss": 1.02219808, + "learning_rate": 0.0009059091926454854, + "loss": 1.0336349, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.21362305, + "step": 97, + "time_per_iteration": 2.6332285404205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128399, + "balance_loss_mlp": 1.10685802, + "epoch": 0.018853405155829166, + "flos": 930710103552.0, + "grad_norm": 0.06319745463615938, + "language_loss": 1.01510525, + "learning_rate": 0.0009079402406897198, + "loss": 1.02638912, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.2154541, + "step": 98, + "time_per_iteration": 3.231128454208374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115682, + "balance_loss_mlp": 1.09365261, + "epoch": 0.01904578684109273, + "flos": 576209396736.0, + "grad_norm": 0.08014689887623593, + "language_loss": 1.0309999, + "learning_rate": 0.0009099506686008212, + "loss": 1.0421567, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.22045898, + "step": 99, + "time_per_iteration": 2.7899162769317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108676, + "balance_loss_mlp": 1.08750439, + "epoch": 0.019238168526356292, + "flos": 558173431296.0, + "grad_norm": 0.07479046847477189, + "language_loss": 1.06245041, + "learning_rate": 0.0009119408908644013, + "loss": 1.07353711, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.21179199, + "step": 100, + "time_per_iteration": 2.76654314994812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111244, + "balance_loss_mlp": 1.09094632, + "epoch": 0.019430550211619855, + "flos": 723539506176.0, + "grad_norm": 0.1293510891653682, + "language_loss": 1.11089611, + "learning_rate": 0.0009139113095929519, + "loss": 1.12202048, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.21496582, + "step": 101, + "time_per_iteration": 2.9448165893554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113187, + "balance_loss_mlp": 1.09134769, + "epoch": 0.019622931896883418, + "flos": 499235000832.0, + "grad_norm": 0.0662757157914564, + "language_loss": 1.05513644, + "learning_rate": 0.0009158623150134762, + "loss": 1.06626844, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.21838379, + "step": 102, + "time_per_iteration": 2.561089277267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132871, + "balance_loss_mlp": 1.11103153, + "epoch": 0.01981531358214698, + "flos": 508916418048.0, + "grad_norm": 0.12924626158025887, + "language_loss": 1.05462444, + "learning_rate": 0.000917794285931332, + "loss": 1.06595314, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.21850586, + "step": 103, + "time_per_iteration": 2.6556921005249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150426, + "balance_loss_mlp": 1.12918282, + "epoch": 0.020007695267410544, + "flos": 521087371776.0, + "grad_norm": 0.12259017558591545, + "language_loss": 0.9774698, + "learning_rate": 0.0009197075901716639, + "loss": 0.98897398, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.21264648, + "step": 104, + "time_per_iteration": 2.721444845199585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141997, + "balance_loss_mlp": 1.12036085, + "epoch": 0.020200076952674107, + "flos": 533013834240.0, + "grad_norm": 0.06848283791602199, + "language_loss": 1.07568073, + "learning_rate": 0.0009216025849997171, + "loss": 1.08710074, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.21655273, + "step": 105, + "time_per_iteration": 2.785515785217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138939, + "balance_loss_mlp": 1.11743319, + "epoch": 0.020392458637937667, + "flos": 684430981632.0, + "grad_norm": 0.05548353541402364, + "language_loss": 1.02272427, + "learning_rate": 0.0009234796175212258, + "loss": 1.03411365, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.21520996, + "step": 106, + "time_per_iteration": 2.917363166809082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131099, + "balance_loss_mlp": 1.10940301, + "epoch": 0.02058484032320123, + "flos": 701791852032.0, + "grad_norm": 0.08012311925806644, + "language_loss": 1.06108189, + "learning_rate": 0.000925339025064007, + "loss": 1.07239294, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.21691895, + "step": 107, + "time_per_iteration": 2.9934780597686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137249, + "balance_loss_mlp": 1.11515951, + "epoch": 0.020777222008464793, + "flos": 638772175872.0, + "grad_norm": 0.050481524705402105, + "language_loss": 0.98984301, + "learning_rate": 0.0009271811355418027, + "loss": 1.00121546, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.2208252, + "step": 108, + "time_per_iteration": 2.8833272457122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119308, + "balance_loss_mlp": 1.09725404, + "epoch": 0.020969603693728356, + "flos": 681785856000.0, + "grad_norm": 0.04498034405706927, + "language_loss": 1.05478954, + "learning_rate": 0.0009290062678013548, + "loss": 1.06598258, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.22058105, + "step": 109, + "time_per_iteration": 2.839287042617798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126832, + "balance_loss_mlp": 1.1043849, + "epoch": 0.02116198537899192, + "flos": 533140462080.0, + "grad_norm": 0.08965534617549129, + "language_loss": 1.03900754, + "learning_rate": 0.0009308147319536321, + "loss": 1.0502758, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.2244873, + "step": 110, + "time_per_iteration": 2.664785385131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127322, + "balance_loss_mlp": 1.10527992, + "epoch": 0.021354367064255482, + "flos": 717168135168.0, + "grad_norm": 0.07991094573250712, + "language_loss": 1.10446882, + "learning_rate": 0.0009326068296900676, + "loss": 1.11574197, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.22045898, + "step": 111, + "time_per_iteration": 2.826704502105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118777, + "balance_loss_mlp": 1.09644949, + "epoch": 0.021546748749519045, + "flos": 519290459136.0, + "grad_norm": 0.05764113319631223, + "language_loss": 1.01306438, + "learning_rate": 0.0009343828545846161, + "loss": 1.02425218, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.2232666, + "step": 112, + "time_per_iteration": 2.774557113647461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130616, + "balance_loss_mlp": 1.10844338, + "epoch": 0.021739130434782608, + "flos": 504912337920.0, + "grad_norm": 0.11711254624088742, + "language_loss": 1.04517794, + "learning_rate": 0.0009361430923823841, + "loss": 1.0564841, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.22192383, + "step": 113, + "time_per_iteration": 2.5728189945220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143235, + "balance_loss_mlp": 1.12140775, + "epoch": 0.02193151212004617, + "flos": 463251820032.0, + "grad_norm": 0.09177669908726471, + "language_loss": 1.08950138, + "learning_rate": 0.0009378878212755459, + "loss": 1.10093367, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.21826172, + "step": 114, + "time_per_iteration": 2.4959068298339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118923, + "balance_loss_mlp": 1.09746575, + "epoch": 0.022123893805309734, + "flos": 552008673792.0, + "grad_norm": 0.05600308486582556, + "language_loss": 0.98889154, + "learning_rate": 0.0009396173121672103, + "loss": 1.00008082, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.21472168, + "step": 115, + "time_per_iteration": 2.687619924545288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131221, + "balance_loss_mlp": 1.11031187, + "epoch": 0.022316275490573297, + "flos": 635920436736.0, + "grad_norm": 0.06813536890625224, + "language_loss": 1.0438683, + "learning_rate": 0.0009413318289238633, + "loss": 1.05518055, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.20922852, + "step": 116, + "time_per_iteration": 2.7658987045288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115598, + "balance_loss_mlp": 1.09498656, + "epoch": 0.02250865717583686, + "flos": 798535107072.0, + "grad_norm": 0.10996119273554948, + "language_loss": 0.97187698, + "learning_rate": 0.0009430316286169771, + "loss": 0.98303294, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.20617676, + "step": 117, + "time_per_iteration": 3.027139186859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120875, + "balance_loss_mlp": 1.10050249, + "epoch": 0.022701038861100423, + "flos": 455851763712.0, + "grad_norm": 0.06369887166042827, + "language_loss": 1.02379179, + "learning_rate": 0.0009447169617543361, + "loss": 1.03500056, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.20373535, + "step": 118, + "time_per_iteration": 2.619460344314575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114928, + "balance_loss_mlp": 1.09472179, + "epoch": 0.022893420546363986, + "flos": 582812112384.0, + "grad_norm": 0.07832492020107534, + "language_loss": 1.08849907, + "learning_rate": 0.0009463880725016029, + "loss": 1.09964836, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.20214844, + "step": 119, + "time_per_iteration": 2.689627170562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108375, + "balance_loss_mlp": 1.08852673, + "epoch": 0.02308580223162755, + "flos": 561010613760.0, + "grad_norm": 0.05815728344132157, + "language_loss": 1.03645778, + "learning_rate": 0.0009480451988946134, + "loss": 1.0475415, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.19848633, + "step": 120, + "time_per_iteration": 2.8202247619628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111521, + "balance_loss_mlp": 1.09197092, + "epoch": 0.023278183916891113, + "flos": 770966111232.0, + "grad_norm": 0.09156908943756899, + "language_loss": 1.05033565, + "learning_rate": 0.0009496885730428627, + "loss": 1.06145096, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.1953125, + "step": 121, + "time_per_iteration": 3.060826539993286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111873, + "balance_loss_mlp": 1.09195304, + "epoch": 0.023470565602154676, + "flos": 553111552512.0, + "grad_norm": 0.07227042142752892, + "language_loss": 1.03125668, + "learning_rate": 0.0009513184213246156, + "loss": 1.04237533, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.19909668, + "step": 122, + "time_per_iteration": 2.693777322769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116574, + "balance_loss_mlp": 1.09648705, + "epoch": 0.02366294728741824, + "flos": 559744791552.0, + "grad_norm": 0.10676768106860933, + "language_loss": 1.06918037, + "learning_rate": 0.0009529349645740552, + "loss": 1.08034611, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.20080566, + "step": 123, + "time_per_iteration": 2.7788801193237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108243, + "balance_loss_mlp": 1.0888958, + "epoch": 0.0238553289726818, + "flos": 468313698816.0, + "grad_norm": 0.06448608913203197, + "language_loss": 1.05440235, + "learning_rate": 0.0009545384182608524, + "loss": 1.06548476, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.19335938, + "step": 124, + "time_per_iteration": 2.542592763900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125435, + "balance_loss_mlp": 1.10582459, + "epoch": 0.024047710657945365, + "flos": 559763730432.0, + "grad_norm": 0.07866021425619718, + "language_loss": 1.03027701, + "learning_rate": 0.0009561289926625252, + "loss": 1.04153132, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.19604492, + "step": 125, + "time_per_iteration": 2.790811538696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114447, + "balance_loss_mlp": 1.09582675, + "epoch": 0.024240092343208928, + "flos": 504528224256.0, + "grad_norm": 0.05023162105608455, + "language_loss": 1.0775013, + "learning_rate": 0.0009577068930299292, + "loss": 1.08864582, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.1862793, + "step": 126, + "time_per_iteration": 2.634744882583618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131099, + "balance_loss_mlp": 1.11309838, + "epoch": 0.02443247402847249, + "flos": 435516908544.0, + "grad_norm": 0.11313548721486262, + "language_loss": 1.02903807, + "learning_rate": 0.0009592723197462087, + "loss": 1.04034901, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.18017578, + "step": 127, + "time_per_iteration": 2.673091411590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135261, + "balance_loss_mlp": 1.11693859, + "epoch": 0.024624855713736054, + "flos": 683445966336.0, + "grad_norm": 0.09449576280815732, + "language_loss": 0.99720573, + "learning_rate": 0.0009608254684795125, + "loss": 1.00855827, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.18334961, + "step": 128, + "time_per_iteration": 2.9315080642700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125442, + "balance_loss_mlp": 1.10695267, + "epoch": 0.024817237398999614, + "flos": 524721894912.0, + "grad_norm": 0.06510984253988934, + "language_loss": 1.02999425, + "learning_rate": 0.0009623665303297678, + "loss": 1.04124868, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.18493652, + "step": 129, + "time_per_iteration": 2.7419071197509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109401, + "balance_loss_mlp": 1.09171033, + "epoch": 0.025009619084263177, + "flos": 655350262272.0, + "grad_norm": 0.11817944884573778, + "language_loss": 1.06827164, + "learning_rate": 0.0009638956919697878, + "loss": 1.07936561, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.17712402, + "step": 130, + "time_per_iteration": 2.898789405822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109182, + "balance_loss_mlp": 1.09152734, + "epoch": 0.02520200076952674, + "flos": 454187271168.0, + "grad_norm": 0.08339763042198223, + "language_loss": 0.98782563, + "learning_rate": 0.0009654131357809714, + "loss": 0.99891746, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.17663574, + "step": 131, + "time_per_iteration": 2.5997226238250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110117, + "balance_loss_mlp": 1.09165168, + "epoch": 0.025394382454790303, + "flos": 839427397632.0, + "grad_norm": 0.07600036723868295, + "language_loss": 1.07807457, + "learning_rate": 0.0009669190399838441, + "loss": 1.08917582, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.18469238, + "step": 132, + "time_per_iteration": 3.099355459213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123354, + "balance_loss_mlp": 1.10540128, + "epoch": 0.025586764140053866, + "flos": 580725628416.0, + "grad_norm": 0.1018451896089413, + "language_loss": 1.01215065, + "learning_rate": 0.0009684135787636724, + "loss": 1.02338421, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.17956543, + "step": 133, + "time_per_iteration": 2.8484303951263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110859, + "balance_loss_mlp": 1.09306097, + "epoch": 0.02577914582531743, + "flos": 789893959680.0, + "grad_norm": 0.0768854449505878, + "language_loss": 1.05274129, + "learning_rate": 0.0009698969223913726, + "loss": 1.06384993, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.17822266, + "step": 134, + "time_per_iteration": 3.0583713054656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099954, + "balance_loss_mlp": 1.08200145, + "epoch": 0.025971527510580992, + "flos": 594683320320.0, + "grad_norm": 0.06563028697143787, + "language_loss": 1.07862437, + "learning_rate": 0.0009713692373399265, + "loss": 1.08962393, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.1796875, + "step": 135, + "time_per_iteration": 2.6854658126831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01638015, + "balance_loss_mlp": 1.62485397, + "epoch": 0.026163909195844555, + "flos": 1576771522560.0, + "grad_norm": 0.19726256755033653, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81094241, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.13183594, + "step": 136, + "time_per_iteration": 5.296766042709351 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01420299, + "balance_loss_mlp": 1.40761507, + "epoch": 0.026356290881108118, + "flos": 1501306030080.0, + "grad_norm": 0.11305854818728235, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.7923134, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.12695312, + "step": 137, + "time_per_iteration": 4.982319355010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113827, + "balance_loss_mlp": 1.12156892, + "epoch": 0.02654867256637168, + "flos": 596841025536.0, + "grad_norm": 0.17869099152539902, + "language_loss": 1.01327038, + "learning_rate": 0.0009757216201974225, + "loss": 1.02465308, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.16699219, + "step": 138, + "time_per_iteration": 2.8622727394104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186505, + "balance_loss_mlp": 1.16889763, + "epoch": 0.026741054251635244, + "flos": 544761386496.0, + "grad_norm": 0.08591345057859309, + "language_loss": 1.05914044, + "learning_rate": 0.0009771514130396581, + "loss": 1.07100558, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.17614746, + "step": 139, + "time_per_iteration": 2.67812442779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120454, + "balance_loss_mlp": 1.18700433, + "epoch": 0.026933435936898807, + "flos": 506591387136.0, + "grad_norm": 0.10724594122721719, + "language_loss": 1.05634308, + "learning_rate": 0.00097857095638274, + "loss": 1.06838858, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.17541504, + "step": 140, + "time_per_iteration": 2.597321033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120509, + "balance_loss_mlp": 1.1880548, + "epoch": 0.02712581762216237, + "flos": 740513290752.0, + "grad_norm": 0.08882077115516282, + "language_loss": 0.97595245, + "learning_rate": 0.0009799803961288726, + "loss": 0.98800337, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.17053223, + "step": 141, + "time_per_iteration": 3.017937421798706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177855, + "balance_loss_mlp": 1.16135645, + "epoch": 0.027318199307425933, + "flos": 848023464960.0, + "grad_norm": 0.07711499257167788, + "language_loss": 1.03052521, + "learning_rate": 0.000981379875086876, + "loss": 1.04230392, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.16491699, + "step": 142, + "time_per_iteration": 3.0336825847625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154055, + "balance_loss_mlp": 1.13728189, + "epoch": 0.027510580992689496, + "flos": 575288400384.0, + "grad_norm": 0.06449204224600169, + "language_loss": 0.98759103, + "learning_rate": 0.0009827695330590185, + "loss": 0.99913156, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.16784668, + "step": 143, + "time_per_iteration": 2.635596990585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131533, + "balance_loss_mlp": 1.11474872, + "epoch": 0.02770296267795306, + "flos": 772079164416.0, + "grad_norm": 0.07528415949234718, + "language_loss": 0.98083055, + "learning_rate": 0.0009841495069248256, + "loss": 0.9921459, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.16796875, + "step": 144, + "time_per_iteration": 2.9648232460021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123928, + "balance_loss_mlp": 1.10686922, + "epoch": 0.027895344363216622, + "flos": 569123642880.0, + "grad_norm": 0.10995634154815045, + "language_loss": 0.97452384, + "learning_rate": 0.0009855199307219871, + "loss": 0.98576319, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.1706543, + "step": 145, + "time_per_iteration": 2.6601850986480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113445, + "balance_loss_mlp": 1.09632671, + "epoch": 0.028087726048480186, + "flos": 547099564032.0, + "grad_norm": 0.09468853295775125, + "language_loss": 0.98972148, + "learning_rate": 0.0009868809357244854, + "loss": 1.00085592, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.17138672, + "step": 146, + "time_per_iteration": 2.7714684009552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109349, + "balance_loss_mlp": 1.09192085, + "epoch": 0.02828010773374375, + "flos": 524519663616.0, + "grad_norm": 0.08177620360389791, + "language_loss": 1.02921426, + "learning_rate": 0.0009882326505180556, + "loss": 1.04030776, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.17443848, + "step": 147, + "time_per_iteration": 2.6678037643432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121816, + "balance_loss_mlp": 1.10459065, + "epoch": 0.02847248941900731, + "flos": 772108277760.0, + "grad_norm": 0.15200564524835, + "language_loss": 1.01768231, + "learning_rate": 0.0009895752010730906, + "loss": 1.02890062, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.17236328, + "step": 148, + "time_per_iteration": 2.944622755050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139991, + "balance_loss_mlp": 1.12333786, + "epoch": 0.028664871104270875, + "flos": 534150208512.0, + "grad_norm": 0.10043611919636293, + "language_loss": 1.0762012, + "learning_rate": 0.0009909087108150867, + "loss": 1.08760118, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.16662598, + "step": 149, + "time_per_iteration": 2.730631113052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123808, + "balance_loss_mlp": 1.10708272, + "epoch": 0.028857252789534438, + "flos": 367557599232.0, + "grad_norm": 0.08772923811196923, + "language_loss": 1.08558857, + "learning_rate": 0.0009922333006927371, + "loss": 1.09682679, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.1673584, + "step": 150, + "time_per_iteration": 2.5662901401519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108027, + "balance_loss_mlp": 1.09107542, + "epoch": 0.029049634474798, + "flos": 515232534528.0, + "grad_norm": 0.10678098958344774, + "language_loss": 1.02281368, + "learning_rate": 0.0009935490892437632, + "loss": 1.03389382, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.16967773, + "step": 151, + "time_per_iteration": 2.573842763900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110516, + "balance_loss_mlp": 1.0892458, + "epoch": 0.029242016160061564, + "flos": 587840495616.0, + "grad_norm": 0.07022496172976629, + "language_loss": 1.00216019, + "learning_rate": 0.0009948561926585687, + "loss": 1.01321173, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.15905762, + "step": 152, + "time_per_iteration": 2.762035608291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101658, + "balance_loss_mlp": 1.08582664, + "epoch": 0.029434397845325123, + "flos": 551816616960.0, + "grad_norm": 0.08132441134663608, + "language_loss": 1.04400539, + "learning_rate": 0.0009961547248418122, + "loss": 1.05502188, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.15820312, + "step": 153, + "time_per_iteration": 2.6525814533233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092159, + "balance_loss_mlp": 1.07619703, + "epoch": 0.029626779530588686, + "flos": 603221160960.0, + "grad_norm": 0.064379562707883, + "language_loss": 1.01020789, + "learning_rate": 0.0009974447974719707, + "loss": 1.02112949, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.1595459, + "step": 154, + "time_per_iteration": 2.814805746078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011076, + "balance_loss_mlp": 1.09136379, + "epoch": 0.02981916121585225, + "flos": 620808993792.0, + "grad_norm": 0.09363682514066085, + "language_loss": 1.02673674, + "learning_rate": 0.0009987265200589763, + "loss": 1.03781271, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.16235352, + "step": 155, + "time_per_iteration": 2.7394251823425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083912, + "balance_loss_mlp": 1.06821227, + "epoch": 0.030011542901115813, + "flos": 661322962944.0, + "grad_norm": 0.05837038305695058, + "language_loss": 1.02287054, + "learning_rate": 0.001, + "loss": 1.03370976, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.15686035, + "step": 156, + "time_per_iteration": 2.875622272491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091789, + "balance_loss_mlp": 1.07507551, + "epoch": 0.030203924586379376, + "flos": 651258842112.0, + "grad_norm": 0.08525763952586639, + "language_loss": 1.00171304, + "learning_rate": 0.0009999999029413921, + "loss": 1.01263094, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.16723633, + "step": 157, + "time_per_iteration": 2.8360915184020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110493, + "balance_loss_mlp": 1.09382772, + "epoch": 0.03039630627164294, + "flos": 531083091456.0, + "grad_norm": 0.08254544257661527, + "language_loss": 1.01840436, + "learning_rate": 0.0009999996117656068, + "loss": 1.02950931, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.16674805, + "step": 158, + "time_per_iteration": 2.801180124282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096617, + "balance_loss_mlp": 1.08086896, + "epoch": 0.030588687956906502, + "flos": 585914135040.0, + "grad_norm": 0.070993780506174, + "language_loss": 0.95558536, + "learning_rate": 0.0009999991264727564, + "loss": 0.96655154, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.15734863, + "step": 159, + "time_per_iteration": 2.818821668624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096163, + "balance_loss_mlp": 1.08046305, + "epoch": 0.030781069642170065, + "flos": 513026777088.0, + "grad_norm": 0.07077353312716703, + "language_loss": 1.06054807, + "learning_rate": 0.0009999984470630296, + "loss": 1.0715096, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.15686035, + "step": 160, + "time_per_iteration": 2.6040687561035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109657, + "balance_loss_mlp": 1.08113289, + "epoch": 0.030973451327433628, + "flos": 717766064640.0, + "grad_norm": 0.055279151578571405, + "language_loss": 0.94481659, + "learning_rate": 0.0009999975735366902, + "loss": 0.95578229, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.1541748, + "step": 161, + "time_per_iteration": 3.1012368202209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096261, + "balance_loss_mlp": 1.08034658, + "epoch": 0.03116583301269719, + "flos": 1109312133120.0, + "grad_norm": 0.0762466753512266, + "language_loss": 0.96279925, + "learning_rate": 0.0009999965058940775, + "loss": 0.97376186, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.15905762, + "step": 162, + "time_per_iteration": 3.5481724739074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092073, + "balance_loss_mlp": 1.07657552, + "epoch": 0.031358214697960754, + "flos": 450676403712.0, + "grad_norm": 0.0783935068916601, + "language_loss": 1.02822053, + "learning_rate": 0.0009999952441356057, + "loss": 1.03914118, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.15490723, + "step": 163, + "time_per_iteration": 2.513836145401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104121, + "balance_loss_mlp": 1.08844459, + "epoch": 0.031550596383224314, + "flos": 1254701325312.0, + "grad_norm": 0.06003254057509557, + "language_loss": 1.03039443, + "learning_rate": 0.000999993788261765, + "loss": 1.04143572, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.15661621, + "step": 164, + "time_per_iteration": 3.625434398651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097956, + "balance_loss_mlp": 1.08191097, + "epoch": 0.03174297806848788, + "flos": 667841310720.0, + "grad_norm": 0.071706058438464, + "language_loss": 1.04424524, + "learning_rate": 0.00099999213827312, + "loss": 1.0552249, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.16040039, + "step": 165, + "time_per_iteration": 2.7834768295288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111157, + "balance_loss_mlp": 1.09566009, + "epoch": 0.03193535975375144, + "flos": 551033832960.0, + "grad_norm": 0.12829100736108065, + "language_loss": 0.99657446, + "learning_rate": 0.000999990294170312, + "loss": 1.00768602, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.15478516, + "step": 166, + "time_per_iteration": 2.637387752532959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101169, + "balance_loss_mlp": 1.08545709, + "epoch": 0.032127741439015006, + "flos": 543377700864.0, + "grad_norm": 0.06852414366650764, + "language_loss": 1.03638864, + "learning_rate": 0.0009999882559540566, + "loss": 1.04740036, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.15698242, + "step": 167, + "time_per_iteration": 2.6875667572021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098336, + "balance_loss_mlp": 1.0833509, + "epoch": 0.032320123124278566, + "flos": 548104928256.0, + "grad_norm": 0.05076681603646914, + "language_loss": 1.00191641, + "learning_rate": 0.000999986023625145, + "loss": 1.01289976, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.14953613, + "step": 168, + "time_per_iteration": 2.7518744468688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03792956, + "balance_loss_mlp": 3.75500011, + "epoch": 0.03251250480954213, + "flos": 1305156865536.0, + "grad_norm": 0.6529032341502935, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.82717371, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 0.37890625, + "step": 169, + "time_per_iteration": 4.917760133743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167126, + "balance_loss_mlp": 1.15130675, + "epoch": 0.03270488649480569, + "flos": 560866609152.0, + "grad_norm": 0.09865002272530259, + "language_loss": 1.00644767, + "learning_rate": 0.0009999809766328958, + "loss": 1.01811886, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.15808105, + "step": 170, + "time_per_iteration": 2.65771746635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120248, + "balance_loss_mlp": 1.18527782, + "epoch": 0.03289726818006926, + "flos": 482120031744.0, + "grad_norm": 0.08799874436989415, + "language_loss": 1.02774751, + "learning_rate": 0.0009999781619715177, + "loss": 1.03977239, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.17211914, + "step": 171, + "time_per_iteration": 2.562926769256592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122766, + "balance_loss_mlp": 1.21033943, + "epoch": 0.03308964986533282, + "flos": 674355276288.0, + "grad_norm": 0.08542539222295185, + "language_loss": 1.02671802, + "learning_rate": 0.000999975153201402, + "loss": 1.03899455, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.17321777, + "step": 172, + "time_per_iteration": 2.8269002437591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267174, + "balance_loss_mlp": 1.24883962, + "epoch": 0.033282031550596385, + "flos": 608937785856.0, + "grad_norm": 0.120181629337785, + "language_loss": 1.00698161, + "learning_rate": 0.0009999719503237174, + "loss": 1.01965332, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.18347168, + "step": 173, + "time_per_iteration": 2.758136749267578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254087, + "balance_loss_mlp": 1.23402381, + "epoch": 0.033474413235859944, + "flos": 467801547264.0, + "grad_norm": 0.13932237496235436, + "language_loss": 1.08850026, + "learning_rate": 0.0009999685533397073, + "loss": 1.10104108, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.20056152, + "step": 174, + "time_per_iteration": 2.6060163974761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268538, + "balance_loss_mlp": 1.24870133, + "epoch": 0.03366679492112351, + "flos": 579365263872.0, + "grad_norm": 0.0855521850526334, + "language_loss": 1.01282525, + "learning_rate": 0.00099996496225069, + "loss": 1.02551055, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.19824219, + "step": 175, + "time_per_iteration": 2.6688973903656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312845, + "balance_loss_mlp": 1.29124486, + "epoch": 0.03385917660638707, + "flos": 637378315776.0, + "grad_norm": 0.0738431594221532, + "language_loss": 1.03378773, + "learning_rate": 0.0009999611770580604, + "loss": 1.04691625, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.21606445, + "step": 176, + "time_per_iteration": 2.8642566204071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01345291, + "balance_loss_mlp": 1.32329679, + "epoch": 0.03405155829165064, + "flos": 441587123712.0, + "grad_norm": 0.09985791713424727, + "language_loss": 1.02061462, + "learning_rate": 0.0009999571977632876, + "loss": 1.03406763, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.21984863, + "step": 177, + "time_per_iteration": 2.620537757873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0133899, + "balance_loss_mlp": 1.31619775, + "epoch": 0.034243939976914196, + "flos": 466097766912.0, + "grad_norm": 0.09257746092300488, + "language_loss": 1.05255055, + "learning_rate": 0.0009999530243679166, + "loss": 1.06594038, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.2277832, + "step": 178, + "time_per_iteration": 2.5526390075683594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01321119, + "balance_loss_mlp": 1.29928029, + "epoch": 0.03443632166217776, + "flos": 778919016960.0, + "grad_norm": 0.07612740556433409, + "language_loss": 1.00229979, + "learning_rate": 0.0009999486568735675, + "loss": 1.0155108, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.21850586, + "step": 179, + "time_per_iteration": 3.084320068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01314096, + "balance_loss_mlp": 1.29238796, + "epoch": 0.03462870334744132, + "flos": 1263284246016.0, + "grad_norm": 0.08380095909791664, + "language_loss": 1.00181103, + "learning_rate": 0.0009999440952819362, + "loss": 1.01495194, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.21716309, + "step": 180, + "time_per_iteration": 3.6467599868774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288371, + "balance_loss_mlp": 1.26746202, + "epoch": 0.03482108503270489, + "flos": 606899354112.0, + "grad_norm": 0.10452638314540276, + "language_loss": 1.00434995, + "learning_rate": 0.0009999393395947935, + "loss": 1.01723361, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.20935059, + "step": 181, + "time_per_iteration": 2.8092122077941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271899, + "balance_loss_mlp": 1.25226557, + "epoch": 0.03501346671796845, + "flos": 538010284032.0, + "grad_norm": 0.1078936362641923, + "language_loss": 1.03725255, + "learning_rate": 0.0009999343898139858, + "loss": 1.04997146, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.19616699, + "step": 182, + "time_per_iteration": 2.6274633407592773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260451, + "balance_loss_mlp": 1.23960137, + "epoch": 0.035205848403232015, + "flos": 518231250432.0, + "grad_norm": 0.13163794074334914, + "language_loss": 1.02352095, + "learning_rate": 0.0009999292459414348, + "loss": 1.03612542, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.20849609, + "step": 183, + "time_per_iteration": 2.5587446689605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241243, + "balance_loss_mlp": 1.22079897, + "epoch": 0.035398230088495575, + "flos": 472134486528.0, + "grad_norm": 0.11087783412260319, + "language_loss": 1.06915629, + "learning_rate": 0.0009999239079791374, + "loss": 1.08156872, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.2043457, + "step": 184, + "time_per_iteration": 2.5720386505126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264671, + "balance_loss_mlp": 1.24370217, + "epoch": 0.03559061177375914, + "flos": 511820591616.0, + "grad_norm": 0.08935796417892215, + "language_loss": 0.99749458, + "learning_rate": 0.0009999183759291659, + "loss": 1.01014113, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.2097168, + "step": 185, + "time_per_iteration": 2.7049641609191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01283684, + "balance_loss_mlp": 1.26222682, + "epoch": 0.0357829934590227, + "flos": 477146903040.0, + "grad_norm": 0.1506087846083958, + "language_loss": 1.02522779, + "learning_rate": 0.0009999126497936682, + "loss": 1.03806448, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.21459961, + "step": 186, + "time_per_iteration": 2.5040838718414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266329, + "balance_loss_mlp": 1.24443007, + "epoch": 0.03597537514428627, + "flos": 644350588416.0, + "grad_norm": 0.07597181242921475, + "language_loss": 1.04941225, + "learning_rate": 0.0009999067295748676, + "loss": 1.0620755, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.21899414, + "step": 187, + "time_per_iteration": 2.8635194301605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01276828, + "balance_loss_mlp": 1.25491714, + "epoch": 0.03616775682954983, + "flos": 580916275200.0, + "grad_norm": 0.10348177684206804, + "language_loss": 1.02588224, + "learning_rate": 0.000999900615275062, + "loss": 1.03865051, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.21911621, + "step": 188, + "time_per_iteration": 2.6797780990600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272596, + "balance_loss_mlp": 1.25078082, + "epoch": 0.03636013851481339, + "flos": 382210735104.0, + "grad_norm": 0.11548780673963775, + "language_loss": 1.08482468, + "learning_rate": 0.0009998943068966256, + "loss": 1.09755063, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.21826172, + "step": 189, + "time_per_iteration": 2.446465253829956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282253, + "balance_loss_mlp": 1.25919747, + "epoch": 0.03655252020007695, + "flos": 582954706944.0, + "grad_norm": 0.10548213053156746, + "language_loss": 1.03159523, + "learning_rate": 0.0009998878044420072, + "loss": 1.04441762, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.23071289, + "step": 190, + "time_per_iteration": 2.715607166290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01282371, + "balance_loss_mlp": 1.2598052, + "epoch": 0.03674490188534051, + "flos": 471376433664.0, + "grad_norm": 0.11932481378659279, + "language_loss": 0.98991239, + "learning_rate": 0.0009998811079137318, + "loss": 1.00273609, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.22558594, + "step": 191, + "time_per_iteration": 2.6401267051696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260121, + "balance_loss_mlp": 1.2387228, + "epoch": 0.03693728357060408, + "flos": 528113488896.0, + "grad_norm": 0.10247339740719702, + "language_loss": 1.0056088, + "learning_rate": 0.0009998742173143987, + "loss": 1.01821005, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.21411133, + "step": 192, + "time_per_iteration": 2.6355819702148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261897, + "balance_loss_mlp": 1.24107122, + "epoch": 0.03712966525586764, + "flos": 798657352704.0, + "grad_norm": 0.19022984523402262, + "language_loss": 1.00051641, + "learning_rate": 0.0009998671326466833, + "loss": 1.01313543, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.20837402, + "step": 193, + "time_per_iteration": 3.009938955307007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264185, + "balance_loss_mlp": 1.24278712, + "epoch": 0.037322046941131205, + "flos": 829628116992.0, + "grad_norm": 0.16347382701944235, + "language_loss": 1.01202989, + "learning_rate": 0.0009998598539133362, + "loss": 1.02467179, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.21386719, + "step": 194, + "time_per_iteration": 3.032041311264038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01320641, + "balance_loss_mlp": 1.29752648, + "epoch": 0.037514428626394765, + "flos": 437460797952.0, + "grad_norm": 0.09447382654807665, + "language_loss": 1.02349281, + "learning_rate": 0.0009998523811171828, + "loss": 1.0366993, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.23132324, + "step": 195, + "time_per_iteration": 2.5140883922576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01385941, + "balance_loss_mlp": 1.36191988, + "epoch": 0.03770681031165833, + "flos": 511372459008.0, + "grad_norm": 0.174477259749112, + "language_loss": 1.02751505, + "learning_rate": 0.0009998447142611248, + "loss": 1.04137444, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.24047852, + "step": 196, + "time_per_iteration": 2.6540584564208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01374932, + "balance_loss_mlp": 1.3512454, + "epoch": 0.03789919199692189, + "flos": 807102061056.0, + "grad_norm": 0.19785353386832685, + "language_loss": 0.95925725, + "learning_rate": 0.0009998368533481387, + "loss": 0.97300661, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.23657227, + "step": 197, + "time_per_iteration": 3.0361931324005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132819, + "balance_loss_mlp": 1.30602896, + "epoch": 0.03809157368218546, + "flos": 690274234368.0, + "grad_norm": 0.07201942870831356, + "language_loss": 0.98943031, + "learning_rate": 0.0009998287983812762, + "loss": 1.00271225, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.22155762, + "step": 198, + "time_per_iteration": 2.8737523555755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316145, + "balance_loss_mlp": 1.2943778, + "epoch": 0.03828395536744902, + "flos": 517675428864.0, + "grad_norm": 0.07974969111573339, + "language_loss": 1.04380584, + "learning_rate": 0.0009998205493636646, + "loss": 1.05696738, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.21789551, + "step": 199, + "time_per_iteration": 2.6439247131347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323551, + "balance_loss_mlp": 1.30098474, + "epoch": 0.038476337052712584, + "flos": 581389138944.0, + "grad_norm": 0.08769997267084173, + "language_loss": 0.97346306, + "learning_rate": 0.0009998121062985063, + "loss": 0.98669851, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.22583008, + "step": 200, + "time_per_iteration": 2.738266944885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01342622, + "balance_loss_mlp": 1.3199718, + "epoch": 0.03866871873797614, + "flos": 576791359488.0, + "grad_norm": 0.1288031319123161, + "language_loss": 0.99576765, + "learning_rate": 0.0009998034691890794, + "loss": 1.0091939, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.2265625, + "step": 201, + "time_per_iteration": 2.815068244934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01322045, + "balance_loss_mlp": 1.29940701, + "epoch": 0.03886110042323971, + "flos": 540472117248.0, + "grad_norm": 0.1480539814519598, + "language_loss": 1.04135096, + "learning_rate": 0.0009997946380387369, + "loss": 1.05457139, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.22619629, + "step": 202, + "time_per_iteration": 2.6735482215881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272248, + "balance_loss_mlp": 1.24913371, + "epoch": 0.03905348210850327, + "flos": 717694843392.0, + "grad_norm": 0.10058314649993264, + "language_loss": 1.06271195, + "learning_rate": 0.0009997856128509076, + "loss": 1.07543445, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.23132324, + "step": 203, + "time_per_iteration": 2.858497142791748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238452, + "balance_loss_mlp": 1.21574211, + "epoch": 0.039245863793766836, + "flos": 427268639232.0, + "grad_norm": 0.07713628959924962, + "language_loss": 1.01241136, + "learning_rate": 0.0009997763936290952, + "loss": 1.02479577, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.22705078, + "step": 204, + "time_per_iteration": 2.5389275550842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01254542, + "balance_loss_mlp": 1.22998452, + "epoch": 0.039438245479030395, + "flos": 662804163072.0, + "grad_norm": 0.10588145989282294, + "language_loss": 1.06408, + "learning_rate": 0.0009997669803768789, + "loss": 1.07662535, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.24560547, + "step": 205, + "time_per_iteration": 2.8234684467315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249653, + "balance_loss_mlp": 1.2262044, + "epoch": 0.03963062716429396, + "flos": 635063459328.0, + "grad_norm": 0.1260931618436919, + "language_loss": 1.01299226, + "learning_rate": 0.0009997573730979134, + "loss": 1.02548885, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.23461914, + "step": 206, + "time_per_iteration": 2.7586512565612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03194186, + "balance_loss_mlp": 2.85391545, + "epoch": 0.03982300884955752, + "flos": 1417813286400.0, + "grad_norm": 0.3208039945146043, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.82387388, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 3.40625, + "step": 207, + "time_per_iteration": 4.668841123580933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287285, + "balance_loss_mlp": 1.26177394, + "epoch": 0.04001539053482109, + "flos": 688769713152.0, + "grad_norm": 0.15196225676568717, + "language_loss": 1.00590456, + "learning_rate": 0.0009997375764747294, + "loss": 1.01877737, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.25512695, + "step": 208, + "time_per_iteration": 3.0460121631622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275833, + "balance_loss_mlp": 1.25076318, + "epoch": 0.04020777222008465, + "flos": 533363042304.0, + "grad_norm": 0.09666220749273949, + "language_loss": 0.97800297, + "learning_rate": 0.0009997273871381967, + "loss": 0.99076128, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.25085449, + "step": 209, + "time_per_iteration": 2.7027134895324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126115, + "balance_loss_mlp": 1.23683095, + "epoch": 0.040400153905348214, + "flos": 567661381632.0, + "grad_norm": 0.09901686865787228, + "language_loss": 1.02878523, + "learning_rate": 0.0009997170037902862, + "loss": 1.04139662, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.2434082, + "step": 210, + "time_per_iteration": 2.7203080654144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228259, + "balance_loss_mlp": 1.20371389, + "epoch": 0.040592535590611774, + "flos": 713130559488.0, + "grad_norm": 0.11653422944125434, + "language_loss": 1.0505805, + "learning_rate": 0.0009997064264350292, + "loss": 1.06286311, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.24536133, + "step": 211, + "time_per_iteration": 2.8774335384368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239303, + "balance_loss_mlp": 1.21149194, + "epoch": 0.04078491727587533, + "flos": 577824427008.0, + "grad_norm": 0.06455145782580095, + "language_loss": 0.99545413, + "learning_rate": 0.0009996956550765317, + "loss": 1.00784707, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.27770996, + "step": 212, + "time_per_iteration": 2.6957452297210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222017, + "balance_loss_mlp": 1.19556475, + "epoch": 0.0409772989611389, + "flos": 552033404928.0, + "grad_norm": 0.1270361519775568, + "language_loss": 0.94278163, + "learning_rate": 0.0009996846897189762, + "loss": 0.95500183, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.26452637, + "step": 213, + "time_per_iteration": 2.6380836963653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223712, + "balance_loss_mlp": 1.19798708, + "epoch": 0.04116968064640246, + "flos": 555347833344.0, + "grad_norm": 0.1000627367739684, + "language_loss": 1.00583601, + "learning_rate": 0.0009996735303666193, + "loss": 1.01807308, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.25720215, + "step": 214, + "time_per_iteration": 2.7703840732574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205703, + "balance_loss_mlp": 1.18167019, + "epoch": 0.041362062331666026, + "flos": 578204158464.0, + "grad_norm": 0.10044224354438386, + "language_loss": 1.02544665, + "learning_rate": 0.0009996621770237937, + "loss": 1.0375036, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.24035645, + "step": 215, + "time_per_iteration": 2.747954845428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194773, + "balance_loss_mlp": 1.17049026, + "epoch": 0.041554444016929586, + "flos": 611130396672.0, + "grad_norm": 0.07439915791739656, + "language_loss": 0.98184484, + "learning_rate": 0.0009996506296949073, + "loss": 0.99379259, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.24267578, + "step": 216, + "time_per_iteration": 2.957000732421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178169, + "balance_loss_mlp": 1.15088165, + "epoch": 0.04174682570219315, + "flos": 527857413120.0, + "grad_norm": 0.07228572223559625, + "language_loss": 0.98363817, + "learning_rate": 0.0009996388883844428, + "loss": 0.99541986, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.27294922, + "step": 217, + "time_per_iteration": 2.625004529953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163207, + "balance_loss_mlp": 1.13855505, + "epoch": 0.04193920738745671, + "flos": 511258977792.0, + "grad_norm": 0.0709878545566638, + "language_loss": 1.02471972, + "learning_rate": 0.0009996269530969588, + "loss": 1.0363518, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.24645996, + "step": 218, + "time_per_iteration": 2.577202796936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153651, + "balance_loss_mlp": 1.13201451, + "epoch": 0.04213158907272028, + "flos": 571226093568.0, + "grad_norm": 0.081462998095588, + "language_loss": 1.00934064, + "learning_rate": 0.0009996148238370888, + "loss": 1.02087712, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.21655273, + "step": 219, + "time_per_iteration": 2.75849986076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128401, + "balance_loss_mlp": 1.10447621, + "epoch": 0.04232397075798384, + "flos": 963803667456.0, + "grad_norm": 0.08476688765369866, + "language_loss": 0.96862441, + "learning_rate": 0.0009996025006095421, + "loss": 0.97990847, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.23962402, + "step": 220, + "time_per_iteration": 3.316199541091919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03366003, + "balance_loss_mlp": 3.11881113, + "epoch": 0.042516352443247404, + "flos": 1468814777856.0, + "grad_norm": 0.3512460928075295, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.81149149, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 2.46875, + "step": 221, + "time_per_iteration": 5.585368633270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113685, + "balance_loss_mlp": 1.11290038, + "epoch": 0.042708734128510964, + "flos": 654419091456.0, + "grad_norm": 0.07993960649684186, + "language_loss": 0.97486591, + "learning_rate": 0.0009995772722706307, + "loss": 0.98623443, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.23950195, + "step": 222, + "time_per_iteration": 2.8408098220825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182736, + "balance_loss_mlp": 1.15682042, + "epoch": 0.04290111581377453, + "flos": 431601578496.0, + "grad_norm": 0.11511868264512252, + "language_loss": 1.11370254, + "learning_rate": 0.0009995643671690604, + "loss": 1.12553, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.25927734, + "step": 223, + "time_per_iteration": 2.4770917892456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194194, + "balance_loss_mlp": 1.16939855, + "epoch": 0.04309349749903809, + "flos": 644379701760.0, + "grad_norm": 0.13725027562770867, + "language_loss": 0.98326594, + "learning_rate": 0.0009995512681194023, + "loss": 0.99520785, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.24804688, + "step": 224, + "time_per_iteration": 2.901346445083618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011754, + "balance_loss_mlp": 1.14950812, + "epoch": 0.04328587918430166, + "flos": 830861853696.0, + "grad_norm": 0.06929706927237234, + "language_loss": 0.96731412, + "learning_rate": 0.0009995379751267417, + "loss": 0.97906816, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.25891113, + "step": 225, + "time_per_iteration": 3.238084316253662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170568, + "balance_loss_mlp": 1.14375746, + "epoch": 0.043478260869565216, + "flos": 524804852736.0, + "grad_norm": 0.07435013646684872, + "language_loss": 0.98210657, + "learning_rate": 0.0009995244881962398, + "loss": 0.99381226, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.26843262, + "step": 226, + "time_per_iteration": 2.6193530559539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162667, + "balance_loss_mlp": 1.1352731, + "epoch": 0.04367064255482878, + "flos": 439253328384.0, + "grad_norm": 0.08505882003862496, + "language_loss": 0.98532695, + "learning_rate": 0.0009995108073331323, + "loss": 0.99695361, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.27416992, + "step": 227, + "time_per_iteration": 2.621875524520874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167442, + "balance_loss_mlp": 1.13921285, + "epoch": 0.04386302424009234, + "flos": 507109330944.0, + "grad_norm": 0.06754882710561792, + "language_loss": 1.01820612, + "learning_rate": 0.0009994969325427309, + "loss": 1.02988064, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.28222656, + "step": 228, + "time_per_iteration": 2.6876742839813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182366, + "balance_loss_mlp": 1.1523968, + "epoch": 0.04405540592535591, + "flos": 540432829440.0, + "grad_norm": 0.06680156886068128, + "language_loss": 0.97377843, + "learning_rate": 0.0009994828638304218, + "loss": 0.98560202, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.29980469, + "step": 229, + "time_per_iteration": 2.6631240844726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198543, + "balance_loss_mlp": 1.16969442, + "epoch": 0.04424778761061947, + "flos": 446136850944.0, + "grad_norm": 0.08411507650901279, + "language_loss": 1.03665459, + "learning_rate": 0.0009994686012016675, + "loss": 1.04864001, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.28833008, + "step": 230, + "time_per_iteration": 2.499721050262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122651, + "balance_loss_mlp": 1.19675517, + "epoch": 0.044440169295883035, + "flos": 700383435264.0, + "grad_norm": 0.09876086989002084, + "language_loss": 1.02814984, + "learning_rate": 0.000999454144662005, + "loss": 1.04041505, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.29711914, + "step": 231, + "time_per_iteration": 2.911175489425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224486, + "balance_loss_mlp": 1.19466019, + "epoch": 0.044632550981146595, + "flos": 588055873536.0, + "grad_norm": 0.10057378611284366, + "language_loss": 0.96611959, + "learning_rate": 0.0009994394942170468, + "loss": 0.97836453, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.2980957, + "step": 232, + "time_per_iteration": 2.7470107078552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012208, + "balance_loss_mlp": 1.19083118, + "epoch": 0.04482493266641016, + "flos": 554534525952.0, + "grad_norm": 0.06893435559553937, + "language_loss": 0.94648588, + "learning_rate": 0.0009994246498724808, + "loss": 0.95869386, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.29956055, + "step": 233, + "time_per_iteration": 2.7436845302581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206879, + "balance_loss_mlp": 1.17860246, + "epoch": 0.04501731435167372, + "flos": 722500646400.0, + "grad_norm": 0.08371813790363081, + "language_loss": 0.97381985, + "learning_rate": 0.00099940961163407, + "loss": 0.9858886, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.28295898, + "step": 234, + "time_per_iteration": 2.876405715942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119556, + "balance_loss_mlp": 1.16654444, + "epoch": 0.04520969603693728, + "flos": 511539784704.0, + "grad_norm": 0.08201306351282911, + "language_loss": 1.00061524, + "learning_rate": 0.0009993943795076528, + "loss": 1.01257086, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.2902832, + "step": 235, + "time_per_iteration": 2.6432723999023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168701, + "balance_loss_mlp": 1.13873136, + "epoch": 0.04540207772220085, + "flos": 364854246912.0, + "grad_norm": 0.12052684551098608, + "language_loss": 1.01575673, + "learning_rate": 0.0009993789534991427, + "loss": 1.02744377, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.29907227, + "step": 236, + "time_per_iteration": 2.4240100383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136514, + "balance_loss_mlp": 1.10954857, + "epoch": 0.045594459407464406, + "flos": 522407038464.0, + "grad_norm": 0.0561052231541492, + "language_loss": 0.96778214, + "learning_rate": 0.0009993633336145287, + "loss": 0.97914726, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.26977539, + "step": 237, + "time_per_iteration": 2.638850688934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130626, + "balance_loss_mlp": 1.10363674, + "epoch": 0.04578684109272797, + "flos": 671442338304.0, + "grad_norm": 0.06334524880145487, + "language_loss": 1.0125159, + "learning_rate": 0.0009993475198598752, + "loss": 1.02382219, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.26989746, + "step": 238, + "time_per_iteration": 3.0008814334869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109235, + "balance_loss_mlp": 1.08395052, + "epoch": 0.04597922277799153, + "flos": 541387321344.0, + "grad_norm": 0.08922144233736891, + "language_loss": 0.97379184, + "learning_rate": 0.0009993315122413212, + "loss": 0.98488414, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.25305176, + "step": 239, + "time_per_iteration": 2.620474100112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121059, + "balance_loss_mlp": 1.09551263, + "epoch": 0.0461716044632551, + "flos": 458732616192.0, + "grad_norm": 0.09980166654849132, + "language_loss": 0.97848725, + "learning_rate": 0.0009993153107650818, + "loss": 0.98969781, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.25537109, + "step": 240, + "time_per_iteration": 2.5547702312469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111352, + "balance_loss_mlp": 1.08719897, + "epoch": 0.04636398614851866, + "flos": 455009342976.0, + "grad_norm": 0.09180653876933564, + "language_loss": 0.96700346, + "learning_rate": 0.0009992989154374468, + "loss": 0.97813869, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.2635498, + "step": 241, + "time_per_iteration": 2.5366051197052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105107, + "balance_loss_mlp": 1.07833242, + "epoch": 0.046556367833782225, + "flos": 556558401024.0, + "grad_norm": 0.07962621760937992, + "language_loss": 1.03585958, + "learning_rate": 0.0009992823262647817, + "loss": 1.04691052, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26782227, + "step": 242, + "time_per_iteration": 2.726482391357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100596, + "balance_loss_mlp": 1.07384586, + "epoch": 0.046748749519045785, + "flos": 592625949696.0, + "grad_norm": 0.0814561151731407, + "language_loss": 0.97787237, + "learning_rate": 0.0009992655432535264, + "loss": 0.98887837, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.26782227, + "step": 243, + "time_per_iteration": 2.765273332595825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098204, + "balance_loss_mlp": 1.07214487, + "epoch": 0.04694113120430935, + "flos": 569596506624.0, + "grad_norm": 0.0750228199707575, + "language_loss": 0.98452473, + "learning_rate": 0.0009992485664101973, + "loss": 0.99550676, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.26037598, + "step": 244, + "time_per_iteration": 2.696781635284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_mlp": 1.08732188, + "epoch": 0.04713351288957291, + "flos": 863401158144.0, + "grad_norm": 0.08629455000399752, + "language_loss": 1.00806224, + "learning_rate": 0.000999231395741385, + "loss": 1.01922584, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.2902832, + "step": 245, + "time_per_iteration": 3.1403207778930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117221, + "balance_loss_mlp": 1.08958876, + "epoch": 0.04732589457483648, + "flos": 536961249792.0, + "grad_norm": 0.07729478564770192, + "language_loss": 0.986202, + "learning_rate": 0.0009992140312537557, + "loss": 0.99737418, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.27661133, + "step": 246, + "time_per_iteration": 2.7038867473602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111137, + "balance_loss_mlp": 1.08410013, + "epoch": 0.04751827626010004, + "flos": 761566910976.0, + "grad_norm": 0.08592122791377885, + "language_loss": 0.93525487, + "learning_rate": 0.000999196472954051, + "loss": 0.94636625, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.27050781, + "step": 247, + "time_per_iteration": 2.9575722217559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0471772, + "balance_loss_mlp": 4.51020002, + "epoch": 0.0477106579453636, + "flos": 1578961313280.0, + "grad_norm": 0.4683520251238934, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.84142572, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 2.078125, + "step": 248, + "time_per_iteration": 5.452638387680054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200681, + "balance_loss_mlp": 1.17252362, + "epoch": 0.04790303963062716, + "flos": 457535195136.0, + "grad_norm": 0.13106789232715058, + "language_loss": 1.01118052, + "learning_rate": 0.0009991607749457578, + "loss": 1.02318728, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.28173828, + "step": 249, + "time_per_iteration": 2.5066423416137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256525, + "balance_loss_mlp": 1.22541094, + "epoch": 0.04809542131589073, + "flos": 782079266304.0, + "grad_norm": 0.1327983626735717, + "language_loss": 0.98959935, + "learning_rate": 0.0009991426352510286, + "loss": 1.0021646, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.31103516, + "step": 250, + "time_per_iteration": 3.0130999088287354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250303, + "balance_loss_mlp": 1.22023845, + "epoch": 0.04828780300115429, + "flos": 558995503104.0, + "grad_norm": 0.11435576550904086, + "language_loss": 1.00191545, + "learning_rate": 0.0009991243017719422, + "loss": 1.01441836, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.30053711, + "step": 251, + "time_per_iteration": 2.6584134101867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190864, + "balance_loss_mlp": 1.16108572, + "epoch": 0.048480184686417856, + "flos": 501682277376.0, + "grad_norm": 0.08343855539664048, + "language_loss": 0.94829702, + "learning_rate": 0.0009991057745156165, + "loss": 0.96020567, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.29760742, + "step": 252, + "time_per_iteration": 2.6125926971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03043524, + "balance_loss_mlp": 2.97905564, + "epoch": 0.048672566371681415, + "flos": 1535585430528.0, + "grad_norm": 0.48807257564671885, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.84954512, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 0.64453125, + "step": 253, + "time_per_iteration": 5.0318169593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205448, + "balance_loss_mlp": 1.17426276, + "epoch": 0.04886494805694498, + "flos": 537665458176.0, + "grad_norm": 0.15081419889398517, + "language_loss": 1.02692831, + "learning_rate": 0.0009990681387000943, + "loss": 1.03898275, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.31152344, + "step": 254, + "time_per_iteration": 2.7569847106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231579, + "balance_loss_mlp": 1.20053661, + "epoch": 0.04905732974220854, + "flos": 679841966592.0, + "grad_norm": 0.10308088004196624, + "language_loss": 0.98562324, + "learning_rate": 0.0009990490301555093, + "loss": 0.99793905, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.31054688, + "step": 255, + "time_per_iteration": 2.9826152324676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01973911, + "balance_loss_mlp": 1.89609146, + "epoch": 0.04924971142747211, + "flos": 1420408949760.0, + "grad_norm": 0.14603633134579833, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.8118906, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.77734375, + "step": 256, + "time_per_iteration": 4.873262643814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01994546, + "balance_loss_mlp": 1.91596293, + "epoch": 0.04944209311273567, + "flos": 1557202074624.0, + "grad_norm": 0.1290240934598827, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.81237286, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.78515625, + "step": 257, + "time_per_iteration": 4.981585502624512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01945028, + "balance_loss_mlp": 1.87979627, + "epoch": 0.04963447479799923, + "flos": 1569985514496.0, + "grad_norm": 0.10634084131038181, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71920907, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.65234375, + "step": 258, + "time_per_iteration": 4.869063138961792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231874, + "balance_loss_mlp": 1.20192897, + "epoch": 0.049826856483262794, + "flos": 625063357440.0, + "grad_norm": 0.1721871775998346, + "language_loss": 0.93400717, + "learning_rate": 0.0009989706585723202, + "loss": 0.9463259, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.29956055, + "step": 259, + "time_per_iteration": 2.828618049621582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226271, + "balance_loss_mlp": 1.1963017, + "epoch": 0.05001923816852635, + "flos": 503912765952.0, + "grad_norm": 0.13941406884376095, + "language_loss": 0.9926306, + "learning_rate": 0.0009989505813633442, + "loss": 1.0048933, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.29931641, + "step": 260, + "time_per_iteration": 2.7033097743988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167993, + "balance_loss_mlp": 1.13833416, + "epoch": 0.05021161985378992, + "flos": 587066476032.0, + "grad_norm": 0.078052738900574, + "language_loss": 0.99695522, + "learning_rate": 0.000998930310444573, + "loss": 1.00863528, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.29663086, + "step": 261, + "time_per_iteration": 2.739182949066162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120344, + "balance_loss_mlp": 1.09104276, + "epoch": 0.05040400153905348, + "flos": 633029409792.0, + "grad_norm": 0.10502347912179442, + "language_loss": 0.97120214, + "learning_rate": 0.0009989098458238765, + "loss": 0.98240554, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.29296875, + "step": 262, + "time_per_iteration": 2.81984806060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109888, + "balance_loss_mlp": 1.07910872, + "epoch": 0.050596383224317046, + "flos": 553344307200.0, + "grad_norm": 0.1022419163820973, + "language_loss": 0.96531391, + "learning_rate": 0.0009988891875091998, + "loss": 0.97641277, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.30761719, + "step": 263, + "time_per_iteration": 2.816471576690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119537, + "balance_loss_mlp": 1.08949661, + "epoch": 0.050788764909580605, + "flos": 549389689344.0, + "grad_norm": 0.07930699495869925, + "language_loss": 0.91512978, + "learning_rate": 0.0009988683355085636, + "loss": 0.92632508, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.30004883, + "step": 264, + "time_per_iteration": 2.7963876724243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116935, + "balance_loss_mlp": 1.1386174, + "epoch": 0.05098114659484417, + "flos": 604812870144.0, + "grad_norm": 0.1164382368145933, + "language_loss": 1.00062299, + "learning_rate": 0.000998847289830063, + "loss": 1.01231647, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.30688477, + "step": 265, + "time_per_iteration": 2.8219666481018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180582, + "balance_loss_mlp": 1.14922965, + "epoch": 0.05117352828010773, + "flos": 438317775360.0, + "grad_norm": 0.14769195776656788, + "language_loss": 0.92838919, + "learning_rate": 0.0009988260504818682, + "loss": 0.94019508, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.31323242, + "step": 266, + "time_per_iteration": 2.527987480163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159175, + "balance_loss_mlp": 1.12753642, + "epoch": 0.0513659099653713, + "flos": 504784300032.0, + "grad_norm": 0.1223822648996979, + "language_loss": 0.99088645, + "learning_rate": 0.000998804617472226, + "loss": 1.00247824, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.31616211, + "step": 267, + "time_per_iteration": 2.6469640731811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129085, + "balance_loss_mlp": 1.09735131, + "epoch": 0.05155829165063486, + "flos": 695183344128.0, + "grad_norm": 0.09065118463065669, + "language_loss": 0.94319087, + "learning_rate": 0.0009987829908094568, + "loss": 0.95448172, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.31713867, + "step": 268, + "time_per_iteration": 2.821777105331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131208, + "balance_loss_mlp": 1.10014248, + "epoch": 0.051750673335898424, + "flos": 1347751830528.0, + "grad_norm": 0.11182301329739544, + "language_loss": 1.00247467, + "learning_rate": 0.0009987611705019569, + "loss": 1.01378679, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.31030273, + "step": 269, + "time_per_iteration": 4.288902521133423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117374, + "balance_loss_mlp": 1.08509207, + "epoch": 0.051943055021161984, + "flos": 489362936832.0, + "grad_norm": 0.06856601771993416, + "language_loss": 0.99786204, + "learning_rate": 0.0009987391565581978, + "loss": 1.00903583, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.32275391, + "step": 270, + "time_per_iteration": 2.634683132171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119573, + "balance_loss_mlp": 1.08681393, + "epoch": 0.05213543670642555, + "flos": 545504882688.0, + "grad_norm": 0.08930504281721281, + "language_loss": 0.92515171, + "learning_rate": 0.000998716948986726, + "loss": 0.93634748, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.32763672, + "step": 271, + "time_per_iteration": 2.7899389266967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120606, + "balance_loss_mlp": 1.08970654, + "epoch": 0.05232781839168911, + "flos": 603285179904.0, + "grad_norm": 0.10701715244821809, + "language_loss": 0.94677854, + "learning_rate": 0.0009986945477961633, + "loss": 0.95798463, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.30859375, + "step": 272, + "time_per_iteration": 2.703118324279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108581, + "balance_loss_mlp": 1.07789683, + "epoch": 0.052520200076952676, + "flos": 538218307584.0, + "grad_norm": 0.050944004487463904, + "language_loss": 1.00078344, + "learning_rate": 0.0009986719529952066, + "loss": 1.01186931, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.30639648, + "step": 273, + "time_per_iteration": 2.85548734664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097085, + "balance_loss_mlp": 1.06668699, + "epoch": 0.052712581762216236, + "flos": 463148513280.0, + "grad_norm": 0.06235958359183371, + "language_loss": 0.99016273, + "learning_rate": 0.000998649164592628, + "loss": 1.00113368, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.3034668, + "step": 274, + "time_per_iteration": 2.5841050148010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104755, + "balance_loss_mlp": 1.07507145, + "epoch": 0.0529049634474798, + "flos": 547749927936.0, + "grad_norm": 0.10062534885586208, + "language_loss": 0.96764064, + "learning_rate": 0.0009986261825972748, + "loss": 0.97868812, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.29663086, + "step": 275, + "time_per_iteration": 2.6752514839172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107504, + "balance_loss_mlp": 1.07798743, + "epoch": 0.05309734513274336, + "flos": 617727320064.0, + "grad_norm": 0.08071716286169645, + "language_loss": 0.98941195, + "learning_rate": 0.000998603007018069, + "loss": 1.00048697, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.29541016, + "step": 276, + "time_per_iteration": 2.8236005306243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118213, + "balance_loss_mlp": 1.08767152, + "epoch": 0.05328972681800693, + "flos": 605220304896.0, + "grad_norm": 0.07622563991542974, + "language_loss": 0.96909779, + "learning_rate": 0.0009985796378640089, + "loss": 0.98027998, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.30517578, + "step": 277, + "time_per_iteration": 2.7089598178863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110773, + "balance_loss_mlp": 1.07940567, + "epoch": 0.05348210850327049, + "flos": 604197411840.0, + "grad_norm": 0.07841820465234402, + "language_loss": 0.95740211, + "learning_rate": 0.0009985560751441665, + "loss": 0.96847939, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.28320312, + "step": 278, + "time_per_iteration": 2.834015369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108783, + "balance_loss_mlp": 1.07831299, + "epoch": 0.053674490188534055, + "flos": 630480236544.0, + "grad_norm": 0.07361828218816212, + "language_loss": 0.9799974, + "learning_rate": 0.00099853231886769, + "loss": 0.99108523, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.30444336, + "step": 279, + "time_per_iteration": 2.8200764656066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108628, + "balance_loss_mlp": 1.07937431, + "epoch": 0.053866871873797614, + "flos": 478939433472.0, + "grad_norm": 0.07512382427920342, + "language_loss": 0.98746061, + "learning_rate": 0.0009985083690438024, + "loss": 0.99854696, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.29223633, + "step": 280, + "time_per_iteration": 2.75639271736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113716, + "balance_loss_mlp": 1.08310306, + "epoch": 0.054059253559061174, + "flos": 787673645568.0, + "grad_norm": 0.09326847112688041, + "language_loss": 0.89231437, + "learning_rate": 0.0009984842256818016, + "loss": 0.90345156, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.3059082, + "step": 281, + "time_per_iteration": 3.0839526653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121888, + "balance_loss_mlp": 1.09182298, + "epoch": 0.05425163524432474, + "flos": 628076630016.0, + "grad_norm": 0.062071298051891176, + "language_loss": 0.99695373, + "learning_rate": 0.0009984598887910613, + "loss": 1.00817263, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.30029297, + "step": 282, + "time_per_iteration": 2.7197024822235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123523, + "balance_loss_mlp": 1.09283888, + "epoch": 0.0544440169295883, + "flos": 615453161472.0, + "grad_norm": 0.08448232068887077, + "language_loss": 0.95169044, + "learning_rate": 0.0009984353583810297, + "loss": 0.96292561, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.30664062, + "step": 283, + "time_per_iteration": 2.8440537452697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127605, + "balance_loss_mlp": 1.09811282, + "epoch": 0.05463639861485187, + "flos": 647471549952.0, + "grad_norm": 0.07597313108733957, + "language_loss": 0.97190034, + "learning_rate": 0.0009984106344612302, + "loss": 0.98317641, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.29492188, + "step": 284, + "time_per_iteration": 2.7592926025390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139329, + "balance_loss_mlp": 1.10843039, + "epoch": 0.054828780300115426, + "flos": 796845883392.0, + "grad_norm": 0.08116128158624439, + "language_loss": 0.93187618, + "learning_rate": 0.0009983857170412615, + "loss": 0.94326949, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.30859375, + "step": 285, + "time_per_iteration": 2.99845027923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151704, + "balance_loss_mlp": 1.12080526, + "epoch": 0.05502116198537899, + "flos": 549414420480.0, + "grad_norm": 0.07339397608587311, + "language_loss": 0.92728812, + "learning_rate": 0.000998360606130798, + "loss": 0.93880516, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.30859375, + "step": 286, + "time_per_iteration": 2.835510492324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.020519, + "balance_loss_mlp": 2.03492451, + "epoch": 0.05521354367064255, + "flos": 1406967791616.0, + "grad_norm": 0.132236598943482, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71125019, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.16992188, + "step": 287, + "time_per_iteration": 4.860529184341431 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144586, + "balance_loss_mlp": 1.11304367, + "epoch": 0.05540592535590612, + "flos": 645123197952.0, + "grad_norm": 0.09086643312306038, + "language_loss": 0.98494267, + "learning_rate": 0.0009983098038774552, + "loss": 0.99638855, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.31518555, + "step": 288, + "time_per_iteration": 2.7743642330169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0188948, + "balance_loss_mlp": 1.87336278, + "epoch": 0.05559830704116968, + "flos": 1510293413376.0, + "grad_norm": 0.09551417356683237, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.80059707, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.16113281, + "step": 289, + "time_per_iteration": 4.792251348495483 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132847, + "balance_loss_mlp": 1.10242462, + "epoch": 0.055790688726433245, + "flos": 508078379520.0, + "grad_norm": 0.0647793178171594, + "language_loss": 0.95675349, + "learning_rate": 0.0009982582277800948, + "loss": 0.96808195, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.30371094, + "step": 290, + "time_per_iteration": 2.6280908584594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129981, + "balance_loss_mlp": 1.09931993, + "epoch": 0.055983070411696804, + "flos": 657570576384.0, + "grad_norm": 0.06216394577533418, + "language_loss": 1.02967191, + "learning_rate": 0.0009982321495648908, + "loss": 1.04097176, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.30639648, + "step": 291, + "time_per_iteration": 2.823817491531372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152465, + "balance_loss_mlp": 1.11880052, + "epoch": 0.05617545209696037, + "flos": 587051919360.0, + "grad_norm": 0.0720353654192766, + "language_loss": 0.94905466, + "learning_rate": 0.0009982058779188115, + "loss": 0.96057928, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.33666992, + "step": 292, + "time_per_iteration": 2.716226577758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143466, + "balance_loss_mlp": 1.11175609, + "epoch": 0.05636783378222393, + "flos": 611331217920.0, + "grad_norm": 0.0752196942414692, + "language_loss": 1.02053797, + "learning_rate": 0.0009981794128520567, + "loss": 1.03197265, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.31689453, + "step": 293, + "time_per_iteration": 2.80366587638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140969, + "balance_loss_mlp": 1.10878265, + "epoch": 0.0565602154674875, + "flos": 667847102976.0, + "grad_norm": 0.08694547176554791, + "language_loss": 0.9927811, + "learning_rate": 0.000998152754374901, + "loss": 1.0041908, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.32202148, + "step": 294, + "time_per_iteration": 2.8787214756011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125522, + "balance_loss_mlp": 1.09493268, + "epoch": 0.05675259715275106, + "flos": 616963474944.0, + "grad_norm": 0.06320951422559969, + "language_loss": 0.95261526, + "learning_rate": 0.0009981259024976943, + "loss": 0.96387053, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.30566406, + "step": 295, + "time_per_iteration": 2.709763765335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130922, + "balance_loss_mlp": 1.1013341, + "epoch": 0.05694497883801462, + "flos": 751424214528.0, + "grad_norm": 0.09363516749561916, + "language_loss": 0.92460728, + "learning_rate": 0.0009980988572308612, + "loss": 0.93591654, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.2956543, + "step": 296, + "time_per_iteration": 2.975036859512329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106731, + "balance_loss_mlp": 1.07781124, + "epoch": 0.05713736052327818, + "flos": 711669708288.0, + "grad_norm": 0.09684297288520326, + "language_loss": 0.95852935, + "learning_rate": 0.0009980716185849015, + "loss": 0.96959662, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.28881836, + "step": 297, + "time_per_iteration": 2.9913201332092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121697, + "balance_loss_mlp": 1.09196591, + "epoch": 0.05732974220854175, + "flos": 468737100288.0, + "grad_norm": 0.06404931541311756, + "language_loss": 0.92133576, + "learning_rate": 0.0009980441865703904, + "loss": 0.9325527, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.29711914, + "step": 298, + "time_per_iteration": 2.660911798477173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118174, + "balance_loss_mlp": 1.08896804, + "epoch": 0.05752212389380531, + "flos": 601143441408.0, + "grad_norm": 0.07725734784298466, + "language_loss": 1.00405884, + "learning_rate": 0.000998016561197978, + "loss": 1.01524067, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.29150391, + "step": 299, + "time_per_iteration": 2.7028987407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115684, + "balance_loss_mlp": 1.0875026, + "epoch": 0.057714505579068875, + "flos": 678344799744.0, + "grad_norm": 0.0924919324941274, + "language_loss": 0.92369866, + "learning_rate": 0.0009979887424783895, + "loss": 0.93485552, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.28173828, + "step": 300, + "time_per_iteration": 2.920323610305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121145, + "balance_loss_mlp": 1.09222448, + "epoch": 0.057906887264332435, + "flos": 595604316672.0, + "grad_norm": 0.08285851214595771, + "language_loss": 0.91748977, + "learning_rate": 0.0009979607304224248, + "loss": 0.92870122, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.2890625, + "step": 301, + "time_per_iteration": 2.725109815597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124265, + "balance_loss_mlp": 1.09512997, + "epoch": 0.058099268949596, + "flos": 551855904768.0, + "grad_norm": 0.08389393001078431, + "language_loss": 0.98122084, + "learning_rate": 0.000997932525040959, + "loss": 0.99246347, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.29101562, + "step": 302, + "time_per_iteration": 2.6472513675689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102086, + "balance_loss_mlp": 1.07419097, + "epoch": 0.05829165063485956, + "flos": 507906671616.0, + "grad_norm": 0.09664842170862178, + "language_loss": 1.00482607, + "learning_rate": 0.000997904126344943, + "loss": 1.01584697, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.27880859, + "step": 303, + "time_per_iteration": 2.6413466930389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108073, + "balance_loss_mlp": 1.07920086, + "epoch": 0.05848403232012313, + "flos": 614949774336.0, + "grad_norm": 0.07742483031734765, + "language_loss": 0.96304786, + "learning_rate": 0.0009978755343454018, + "loss": 0.9741286, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.28881836, + "step": 304, + "time_per_iteration": 2.7825212478637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108843, + "balance_loss_mlp": 1.0789448, + "epoch": 0.05867641400538669, + "flos": 499835902464.0, + "grad_norm": 0.09214287188489759, + "language_loss": 0.97051907, + "learning_rate": 0.0009978467490534355, + "loss": 0.98160744, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.29858398, + "step": 305, + "time_per_iteration": 2.625734806060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105646, + "balance_loss_mlp": 1.0759151, + "epoch": 0.05886879569065025, + "flos": 531019072512.0, + "grad_norm": 0.07804737007565601, + "language_loss": 0.94819117, + "learning_rate": 0.00099781777048022, + "loss": 0.95924759, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.296875, + "step": 306, + "time_per_iteration": 2.71183180809021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095659, + "balance_loss_mlp": 1.06554723, + "epoch": 0.05906117737591381, + "flos": 488811497472.0, + "grad_norm": 0.08882969665455022, + "language_loss": 0.96051329, + "learning_rate": 0.0009977885986370057, + "loss": 0.97146988, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.30126953, + "step": 307, + "time_per_iteration": 2.551680088043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101228, + "balance_loss_mlp": 1.0711869, + "epoch": 0.05925355906117737, + "flos": 591213150720.0, + "grad_norm": 0.07969081592203556, + "language_loss": 0.92546368, + "learning_rate": 0.000997759233535118, + "loss": 0.93647587, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.30029297, + "step": 308, + "time_per_iteration": 2.81258487701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118732, + "balance_loss_mlp": 1.08861959, + "epoch": 0.05944594074644094, + "flos": 563373522432.0, + "grad_norm": 0.08786467203130244, + "language_loss": 0.97749913, + "learning_rate": 0.0009977296751859576, + "loss": 0.98868644, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.30102539, + "step": 309, + "time_per_iteration": 2.7263362407684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105319, + "balance_loss_mlp": 1.07611227, + "epoch": 0.0596383224317045, + "flos": 538483147776.0, + "grad_norm": 0.06446924521708428, + "language_loss": 1.00202072, + "learning_rate": 0.0009976999236009998, + "loss": 1.01307392, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.29174805, + "step": 310, + "time_per_iteration": 2.762798309326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104, + "balance_loss_mlp": 1.0751754, + "epoch": 0.059830704116968066, + "flos": 560684726784.0, + "grad_norm": 0.07707725190270151, + "language_loss": 1.00980616, + "learning_rate": 0.0009976699787917955, + "loss": 1.02084613, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.2878418, + "step": 311, + "time_per_iteration": 2.681075096130371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02018517, + "balance_loss_mlp": 1.99772644, + "epoch": 0.060023085802231625, + "flos": 1569759962112.0, + "grad_norm": 0.13809188064678232, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.75461507, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.20800781, + "step": 312, + "time_per_iteration": 4.931787014007568 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115143, + "balance_loss_mlp": 1.08445871, + "epoch": 0.06021546748749519, + "flos": 482415395328.0, + "grad_norm": 0.08749443672960691, + "language_loss": 0.93570709, + "learning_rate": 0.0009976095095472243, + "loss": 0.94685858, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.30688477, + "step": 313, + "time_per_iteration": 2.5869529247283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101349, + "balance_loss_mlp": 1.07152247, + "epoch": 0.06040784917275875, + "flos": 619889407488.0, + "grad_norm": 0.1052711311589574, + "language_loss": 0.94373065, + "learning_rate": 0.0009975789851353334, + "loss": 0.95474416, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.29785156, + "step": 314, + "time_per_iteration": 2.825021505355835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091881, + "balance_loss_mlp": 1.06434321, + "epoch": 0.06060023085802232, + "flos": 483292721664.0, + "grad_norm": 0.0790023799752532, + "language_loss": 0.96930784, + "learning_rate": 0.0009975482675461487, + "loss": 0.98022664, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.27563477, + "step": 315, + "time_per_iteration": 2.657176971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092449, + "balance_loss_mlp": 1.06493592, + "epoch": 0.06079261254328588, + "flos": 581620483584.0, + "grad_norm": 0.08103250083402935, + "language_loss": 0.94523442, + "learning_rate": 0.0009975173567915952, + "loss": 0.95615894, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.27502441, + "step": 316, + "time_per_iteration": 2.7485179901123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087945, + "balance_loss_mlp": 1.06031179, + "epoch": 0.060984994228549444, + "flos": 687492306432.0, + "grad_norm": 0.09749512289660646, + "language_loss": 0.88217789, + "learning_rate": 0.000997486252883674, + "loss": 0.89305735, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.27685547, + "step": 317, + "time_per_iteration": 2.848203420639038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083831, + "balance_loss_mlp": 1.05665123, + "epoch": 0.061177375913813004, + "flos": 1314284327424.0, + "grad_norm": 0.0666962391969605, + "language_loss": 0.94262481, + "learning_rate": 0.0009974549558344602, + "loss": 0.95346314, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.27197266, + "step": 318, + "time_per_iteration": 3.6451311111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095985, + "balance_loss_mlp": 1.06921029, + "epoch": 0.06136975759907657, + "flos": 574072040448.0, + "grad_norm": 0.08376464388690433, + "language_loss": 1.02536392, + "learning_rate": 0.000997423465656105, + "loss": 1.03632367, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.26831055, + "step": 319, + "time_per_iteration": 2.7345075607299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091659, + "balance_loss_mlp": 1.06395483, + "epoch": 0.06156213928434013, + "flos": 527281242624.0, + "grad_norm": 0.0893807265100656, + "language_loss": 1.00347686, + "learning_rate": 0.0009973917823608335, + "loss": 1.01439345, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.27734375, + "step": 320, + "time_per_iteration": 2.59576153755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092889, + "balance_loss_mlp": 1.0656141, + "epoch": 0.061754520969603696, + "flos": 495238123008.0, + "grad_norm": 0.0805868867251315, + "language_loss": 0.95831037, + "learning_rate": 0.0009973599059609462, + "loss": 0.96923929, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.27294922, + "step": 321, + "time_per_iteration": 2.7188515663146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098538, + "balance_loss_mlp": 1.07090497, + "epoch": 0.061946902654867256, + "flos": 439839673344.0, + "grad_norm": 0.07327098118113982, + "language_loss": 0.93067813, + "learning_rate": 0.000997327836468819, + "loss": 0.94166344, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.27685547, + "step": 322, + "time_per_iteration": 2.6020476818084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112868, + "balance_loss_mlp": 1.08469939, + "epoch": 0.06213928434013082, + "flos": 598490961408.0, + "grad_norm": 0.08699924077148347, + "language_loss": 0.95677376, + "learning_rate": 0.000997295573896902, + "loss": 0.96790254, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.28137207, + "step": 323, + "time_per_iteration": 2.829726457595825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01600081, + "balance_loss_mlp": 1.58253336, + "epoch": 0.06233166602539438, + "flos": 1449393716736.0, + "grad_norm": 0.0733345350087818, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.82796121, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.17578125, + "step": 324, + "time_per_iteration": 4.711047410964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01522296, + "balance_loss_mlp": 1.50503409, + "epoch": 0.06252404771065795, + "flos": 1462504453632.0, + "grad_norm": 0.05691363452686859, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80094236, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.17285156, + "step": 325, + "time_per_iteration": 4.9186623096466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221563, + "balance_loss_mlp": 1.19023478, + "epoch": 0.06271642939592151, + "flos": 464059335168.0, + "grad_norm": 0.14041524981394118, + "language_loss": 0.90815508, + "learning_rate": 0.000997197627828043, + "loss": 0.9203707, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.31323242, + "step": 326, + "time_per_iteration": 2.5453081130981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200774, + "balance_loss_mlp": 1.17032802, + "epoch": 0.06290881108118507, + "flos": 532111776768.0, + "grad_norm": 0.12119005069833769, + "language_loss": 0.85965139, + "learning_rate": 0.0009971645930629716, + "loss": 0.87165916, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.30419922, + "step": 327, + "time_per_iteration": 2.7031009197235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169691, + "balance_loss_mlp": 1.13969803, + "epoch": 0.06310119276644863, + "flos": 673262572032.0, + "grad_norm": 0.07816671551275867, + "language_loss": 0.99088198, + "learning_rate": 0.0009971313652814872, + "loss": 1.00257885, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.29956055, + "step": 328, + "time_per_iteration": 2.8222203254699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157161, + "balance_loss_mlp": 1.12542796, + "epoch": 0.0632935744517122, + "flos": 770404497408.0, + "grad_norm": 0.09350719298211221, + "language_loss": 0.96469927, + "learning_rate": 0.0009970979444964903, + "loss": 0.97627091, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.31713867, + "step": 329, + "time_per_iteration": 2.965010643005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142135, + "balance_loss_mlp": 1.11214232, + "epoch": 0.06348595613697576, + "flos": 561649393152.0, + "grad_norm": 0.10929900711039164, + "language_loss": 0.9773742, + "learning_rate": 0.0009970643307209556, + "loss": 0.98879552, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.29980469, + "step": 330, + "time_per_iteration": 2.816967248916626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122149, + "balance_loss_mlp": 1.09279943, + "epoch": 0.06367833782223932, + "flos": 675891730944.0, + "grad_norm": 0.09151857562667157, + "language_loss": 0.94555062, + "learning_rate": 0.0009970305239679334, + "loss": 0.95677209, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.29321289, + "step": 331, + "time_per_iteration": 2.8171606063842773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103955, + "balance_loss_mlp": 1.07594109, + "epoch": 0.06387071950750288, + "flos": 495035891712.0, + "grad_norm": 0.0852127129346853, + "language_loss": 0.98894572, + "learning_rate": 0.0009969965242505483, + "loss": 0.99998534, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.28027344, + "step": 332, + "time_per_iteration": 2.663892984390259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110112, + "balance_loss_mlp": 1.08111989, + "epoch": 0.06406310119276645, + "flos": 533170985472.0, + "grad_norm": 0.06505292490812643, + "language_loss": 0.94837928, + "learning_rate": 0.0009969623315820007, + "loss": 0.9594804, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.28979492, + "step": 333, + "time_per_iteration": 2.7053513526916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100319, + "balance_loss_mlp": 1.07256722, + "epoch": 0.06425548287803001, + "flos": 455940513792.0, + "grad_norm": 0.09842187194277592, + "language_loss": 0.95016736, + "learning_rate": 0.000996927945975565, + "loss": 0.96117055, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.27758789, + "step": 334, + "time_per_iteration": 2.599308490753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113066, + "balance_loss_mlp": 1.08405077, + "epoch": 0.06444786456329357, + "flos": 559817574912.0, + "grad_norm": 0.0758688902805758, + "language_loss": 0.9173829, + "learning_rate": 0.0009968933674445906, + "loss": 0.92851353, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.29003906, + "step": 335, + "time_per_iteration": 2.6885735988616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117272, + "balance_loss_mlp": 1.08863783, + "epoch": 0.06464024624855713, + "flos": 665769383424.0, + "grad_norm": 0.08483114639707492, + "language_loss": 0.94787967, + "learning_rate": 0.0009968585960025028, + "loss": 0.95905232, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.28613281, + "step": 336, + "time_per_iteration": 3.0145304203033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01664619, + "balance_loss_mlp": 1.64468718, + "epoch": 0.0648326279338207, + "flos": 1520578704384.0, + "grad_norm": 0.07989076612991787, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.79317814, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.19921875, + "step": 337, + "time_per_iteration": 4.812415361404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113857, + "balance_loss_mlp": 1.08729684, + "epoch": 0.06502500961908426, + "flos": 1142872768512.0, + "grad_norm": 0.10710041073234706, + "language_loss": 0.93311036, + "learning_rate": 0.0009967884744390583, + "loss": 0.94424891, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.26611328, + "step": 338, + "time_per_iteration": 3.551198959350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010994, + "balance_loss_mlp": 1.07226825, + "epoch": 0.06521739130434782, + "flos": 582339248640.0, + "grad_norm": 0.09192445713744875, + "language_loss": 0.93620086, + "learning_rate": 0.0009967531243449256, + "loss": 0.94719481, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.27148438, + "step": 339, + "time_per_iteration": 2.659802198410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093825, + "balance_loss_mlp": 1.06592965, + "epoch": 0.06540977298961138, + "flos": 497398800384.0, + "grad_norm": 0.08159898153834201, + "language_loss": 1.01212323, + "learning_rate": 0.000996717581394126, + "loss": 1.02306151, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.27905273, + "step": 340, + "time_per_iteration": 2.570789337158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085985, + "balance_loss_mlp": 1.05887651, + "epoch": 0.06560215467487496, + "flos": 542613855744.0, + "grad_norm": 0.08632134404445381, + "language_loss": 1.01338696, + "learning_rate": 0.000996681845600459, + "loss": 1.02424693, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.27124023, + "step": 341, + "time_per_iteration": 2.676576852798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092801, + "balance_loss_mlp": 1.06526327, + "epoch": 0.06579453636013852, + "flos": 413230961664.0, + "grad_norm": 0.09337377055156564, + "language_loss": 0.93410671, + "learning_rate": 0.0009966459169777982, + "loss": 0.94503474, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.27563477, + "step": 342, + "time_per_iteration": 2.5015692710876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093959, + "balance_loss_mlp": 1.06565928, + "epoch": 0.06598691804540208, + "flos": 560354457600.0, + "grad_norm": 0.06741983677161045, + "language_loss": 1.02151966, + "learning_rate": 0.0009966097955400924, + "loss": 1.03245926, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.28320312, + "step": 343, + "time_per_iteration": 2.679197311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108203, + "balance_loss_mlp": 1.054111, + "epoch": 0.06617929973066564, + "flos": 571789117440.0, + "grad_norm": 0.10243167176705169, + "language_loss": 0.95901835, + "learning_rate": 0.0009965734813013652, + "loss": 0.96983862, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.27954102, + "step": 344, + "time_per_iteration": 2.7926278114318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094237, + "balance_loss_mlp": 1.06638968, + "epoch": 0.06637168141592921, + "flos": 490234470912.0, + "grad_norm": 0.07573309355987462, + "language_loss": 0.97904384, + "learning_rate": 0.0009965369742757151, + "loss": 0.98998624, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.27856445, + "step": 345, + "time_per_iteration": 2.5709216594696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094661, + "balance_loss_mlp": 1.06564522, + "epoch": 0.06656406310119277, + "flos": 1078735656960.0, + "grad_norm": 0.07452264052062355, + "language_loss": 0.94766545, + "learning_rate": 0.0009965002744773152, + "loss": 0.95861208, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.28979492, + "step": 346, + "time_per_iteration": 3.500114679336548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110333, + "balance_loss_mlp": 1.0740993, + "epoch": 0.06675644478645633, + "flos": 513421065216.0, + "grad_norm": 0.06770544307121987, + "language_loss": 0.92343372, + "learning_rate": 0.0009964633819204139, + "loss": 0.93446708, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.29223633, + "step": 347, + "time_per_iteration": 2.660534143447876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01495519, + "balance_loss_mlp": 1.47739971, + "epoch": 0.06694882647171989, + "flos": 1446359943168.0, + "grad_norm": 0.07316018638585145, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83296633, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.18164062, + "step": 348, + "time_per_iteration": 4.936125040054321 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01453408, + "balance_loss_mlp": 1.43557465, + "epoch": 0.06714120815698346, + "flos": 1551230784000.0, + "grad_norm": 0.05966333264944154, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76607287, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.17871094, + "step": 349, + "time_per_iteration": 4.916368722915649 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121413, + "balance_loss_mlp": 1.09161115, + "epoch": 0.06733358984224702, + "flos": 879689673216.0, + "grad_norm": 0.09818918049538049, + "language_loss": 0.91932184, + "learning_rate": 0.000996351547842304, + "loss": 0.93053597, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.29760742, + "step": 350, + "time_per_iteration": 3.1482698917388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116118, + "balance_loss_mlp": 1.08686399, + "epoch": 0.06752597152751058, + "flos": 518654651904.0, + "grad_norm": 0.08574695638310478, + "language_loss": 0.9006294, + "learning_rate": 0.0009963138843953744, + "loss": 0.91179061, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.29223633, + "step": 351, + "time_per_iteration": 2.6286985874176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112662, + "balance_loss_mlp": 1.09572136, + "epoch": 0.06771835321277414, + "flos": 539366266368.0, + "grad_norm": 0.062103550545623463, + "language_loss": 0.94588864, + "learning_rate": 0.000996276028262306, + "loss": 0.95715487, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.30859375, + "step": 352, + "time_per_iteration": 2.8076047897338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118319, + "balance_loss_mlp": 1.08899331, + "epoch": 0.0679107348980377, + "flos": 460430604288.0, + "grad_norm": 0.08848881047736162, + "language_loss": 1.00543904, + "learning_rate": 0.0009962379794577964, + "loss": 1.01662219, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.29296875, + "step": 353, + "time_per_iteration": 2.6257643699645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126251, + "balance_loss_mlp": 1.09525669, + "epoch": 0.06810311658330127, + "flos": 635601752064.0, + "grad_norm": 0.07023516682391727, + "language_loss": 0.91387081, + "learning_rate": 0.000996199737996617, + "loss": 0.92513329, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.31005859, + "step": 354, + "time_per_iteration": 2.9115777015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108023, + "balance_loss_mlp": 1.07862616, + "epoch": 0.06829549826856483, + "flos": 464443448832.0, + "grad_norm": 0.10590106261560671, + "language_loss": 0.99111325, + "learning_rate": 0.0009961613038936149, + "loss": 1.00219345, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.29345703, + "step": 355, + "time_per_iteration": 2.632269859313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106903, + "balance_loss_mlp": 1.07848334, + "epoch": 0.06848787995382839, + "flos": 634335929856.0, + "grad_norm": 0.06351615461114794, + "language_loss": 0.92452097, + "learning_rate": 0.000996122677163711, + "loss": 0.93559003, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.28417969, + "step": 356, + "time_per_iteration": 2.8401455879211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116364, + "balance_loss_mlp": 1.08880246, + "epoch": 0.06868026163909195, + "flos": 806023913472.0, + "grad_norm": 0.08494375059258584, + "language_loss": 0.98204505, + "learning_rate": 0.000996083857821902, + "loss": 0.99320877, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.27612305, + "step": 357, + "time_per_iteration": 3.0309877395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123871, + "balance_loss_mlp": 1.09387815, + "epoch": 0.06887264332435553, + "flos": 438997252608.0, + "grad_norm": 0.09643576242322613, + "language_loss": 0.95811963, + "learning_rate": 0.0009960448458832588, + "loss": 0.96935833, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.30004883, + "step": 358, + "time_per_iteration": 2.6974990367889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119311, + "balance_loss_mlp": 1.09053433, + "epoch": 0.06906502500961909, + "flos": 484513463808.0, + "grad_norm": 0.08018524599206517, + "language_loss": 0.95721531, + "learning_rate": 0.000996005641362927, + "loss": 0.96840835, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.28735352, + "step": 359, + "time_per_iteration": 2.589519739151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125244, + "balance_loss_mlp": 1.09663391, + "epoch": 0.06925740669488265, + "flos": 733293706752.0, + "grad_norm": 0.08939873306910956, + "language_loss": 0.98375708, + "learning_rate": 0.0009959662442761274, + "loss": 0.99500948, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.28613281, + "step": 360, + "time_per_iteration": 2.9202845096588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121734, + "balance_loss_mlp": 1.09360027, + "epoch": 0.0694497883801462, + "flos": 552127947264.0, + "grad_norm": 0.08129648248307358, + "language_loss": 0.92418718, + "learning_rate": 0.000995926654638155, + "loss": 0.93540448, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.28149414, + "step": 361, + "time_per_iteration": 2.807333469390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125246, + "balance_loss_mlp": 1.09706521, + "epoch": 0.06964217006540978, + "flos": 677708992512.0, + "grad_norm": 0.09207283388165423, + "language_loss": 0.94086993, + "learning_rate": 0.00099588687246438, + "loss": 0.95212233, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.28222656, + "step": 362, + "time_per_iteration": 2.841938018798828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144139, + "balance_loss_mlp": 1.1155293, + "epoch": 0.06983455175067334, + "flos": 523987163136.0, + "grad_norm": 0.09456174795196681, + "language_loss": 1.01274741, + "learning_rate": 0.0009958468977702471, + "loss": 1.02418876, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.28588867, + "step": 363, + "time_per_iteration": 2.633852958679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01648964, + "balance_loss_mlp": 1.62617075, + "epoch": 0.0700269334359369, + "flos": 1575943658496.0, + "grad_norm": 0.13616610145697036, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81383669, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.22753906, + "step": 364, + "time_per_iteration": 4.863068580627441 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011272, + "balance_loss_mlp": 1.09961534, + "epoch": 0.07021931512120046, + "flos": 1012848274944.0, + "grad_norm": 0.09005148424800312, + "language_loss": 0.90165555, + "learning_rate": 0.0009957663708830612, + "loss": 0.91292757, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.27612305, + "step": 365, + "time_per_iteration": 3.281414031982422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123586, + "balance_loss_mlp": 1.09442711, + "epoch": 0.07041169680646403, + "flos": 822622348800.0, + "grad_norm": 0.09334468540758137, + "language_loss": 0.91653895, + "learning_rate": 0.0009957258187212714, + "loss": 0.92777479, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.29174805, + "step": 366, + "time_per_iteration": 3.038696050643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01445219, + "balance_loss_mlp": 1.42652738, + "epoch": 0.07060407849172759, + "flos": 1413670993920.0, + "grad_norm": 0.06427367616648676, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80640084, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.18652344, + "step": 367, + "time_per_iteration": 4.7983925342559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116044, + "balance_loss_mlp": 1.08788657, + "epoch": 0.07079646017699115, + "flos": 512652837888.0, + "grad_norm": 0.13146714334583684, + "language_loss": 0.89768213, + "learning_rate": 0.0009956441370400167, + "loss": 0.90884256, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.28173828, + "step": 368, + "time_per_iteration": 2.6321308612823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119626, + "balance_loss_mlp": 1.09201741, + "epoch": 0.07098884186225471, + "flos": 540240772608.0, + "grad_norm": 0.12272393932614807, + "language_loss": 0.9541142, + "learning_rate": 0.0009956030075522636, + "loss": 0.96531045, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.27636719, + "step": 369, + "time_per_iteration": 2.772404909133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114007, + "balance_loss_mlp": 1.08685124, + "epoch": 0.07118122354751828, + "flos": 548419230720.0, + "grad_norm": 0.09366652552108264, + "language_loss": 0.95805156, + "learning_rate": 0.0009955616856543587, + "loss": 0.96919167, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.2722168, + "step": 370, + "time_per_iteration": 2.628877878189087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113669, + "balance_loss_mlp": 1.08608413, + "epoch": 0.07137360523278184, + "flos": 620612554752.0, + "grad_norm": 0.08609469252939483, + "language_loss": 0.88399851, + "learning_rate": 0.0009955201713623448, + "loss": 0.89513522, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.27612305, + "step": 371, + "time_per_iteration": 2.7591450214385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328242, + "balance_loss_mlp": 1.31155288, + "epoch": 0.0715659869180454, + "flos": 1501850115072.0, + "grad_norm": 0.05190160953718325, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78000963, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.16699219, + "step": 372, + "time_per_iteration": 4.995140552520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101786, + "balance_loss_mlp": 1.07563186, + "epoch": 0.07175836860330896, + "flos": 495246887424.0, + "grad_norm": 0.13457072532657127, + "language_loss": 1.02136469, + "learning_rate": 0.0009954365656605333, + "loss": 1.03238261, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.26184082, + "step": 373, + "time_per_iteration": 2.56646990776062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106565, + "balance_loss_mlp": 1.07979035, + "epoch": 0.07195075028857253, + "flos": 785387902464.0, + "grad_norm": 0.08663326270818063, + "language_loss": 0.94899744, + "learning_rate": 0.0009953944742831947, + "loss": 0.96006304, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.26831055, + "step": 374, + "time_per_iteration": 2.9695053100585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102573, + "balance_loss_mlp": 1.07596529, + "epoch": 0.0721431319738361, + "flos": 592799067648.0, + "grad_norm": 0.09289035836035217, + "language_loss": 0.97933537, + "learning_rate": 0.0009953521905766642, + "loss": 0.99036103, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.26647949, + "step": 375, + "time_per_iteration": 2.942178249359131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113342, + "balance_loss_mlp": 1.08630502, + "epoch": 0.07233551365909965, + "flos": 547981272576.0, + "grad_norm": 0.10463311528366259, + "language_loss": 0.97135454, + "learning_rate": 0.0009953097145573577, + "loss": 0.98248798, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.27075195, + "step": 376, + "time_per_iteration": 2.6447842121124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115303, + "balance_loss_mlp": 1.08645439, + "epoch": 0.07252789534436321, + "flos": 957170428416.0, + "grad_norm": 0.10778381820568583, + "language_loss": 0.93408906, + "learning_rate": 0.000995267046241766, + "loss": 0.94524205, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.28808594, + "step": 377, + "time_per_iteration": 3.281200647354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106472, + "balance_loss_mlp": 1.07807684, + "epoch": 0.07272027702962677, + "flos": 507398902272.0, + "grad_norm": 0.08395054735439604, + "language_loss": 0.93929148, + "learning_rate": 0.0009952241856464547, + "loss": 0.95035625, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.28393555, + "step": 378, + "time_per_iteration": 2.6047444343566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011332, + "balance_loss_mlp": 1.10265875, + "epoch": 0.07291265871489035, + "flos": 612128558592.0, + "grad_norm": 0.10390894184481733, + "language_loss": 0.9941417, + "learning_rate": 0.0009951811327880632, + "loss": 1.00547373, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.30541992, + "step": 379, + "time_per_iteration": 2.726473331451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142545, + "balance_loss_mlp": 1.11162257, + "epoch": 0.0731050404001539, + "flos": 495502963200.0, + "grad_norm": 0.10097597522795056, + "language_loss": 0.93640876, + "learning_rate": 0.0009951378876833063, + "loss": 0.94783425, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.30908203, + "step": 380, + "time_per_iteration": 2.5623717308044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113686, + "balance_loss_mlp": 1.10598469, + "epoch": 0.07329742208541747, + "flos": 639677205504.0, + "grad_norm": 0.09709945532148136, + "language_loss": 1.0008266, + "learning_rate": 0.0009950944503489736, + "loss": 1.01219511, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.30834961, + "step": 381, + "time_per_iteration": 2.7424862384796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125905, + "balance_loss_mlp": 1.0951966, + "epoch": 0.07348980377068103, + "flos": 815999284224.0, + "grad_norm": 0.08729931882910318, + "language_loss": 0.94688666, + "learning_rate": 0.0009950508208019285, + "loss": 0.95814574, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.30664062, + "step": 382, + "time_per_iteration": 3.011807441711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115566, + "balance_loss_mlp": 1.08612156, + "epoch": 0.0736821854559446, + "flos": 508383917568.0, + "grad_norm": 0.09192641530722392, + "language_loss": 0.98937929, + "learning_rate": 0.0009950069990591096, + "loss": 1.00053501, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.29418945, + "step": 383, + "time_per_iteration": 2.634744644165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266456, + "balance_loss_mlp": 1.25081599, + "epoch": 0.07387456714120816, + "flos": 1553801716224.0, + "grad_norm": 0.07157218635827683, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77667826, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.15625, + "step": 384, + "time_per_iteration": 4.909826993942261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123479, + "balance_loss_mlp": 1.093009, + "epoch": 0.07406694882647172, + "flos": 525219489792.0, + "grad_norm": 0.09152581134979716, + "language_loss": 0.9216727, + "learning_rate": 0.0009949187790542777, + "loss": 0.93290746, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.30419922, + "step": 385, + "time_per_iteration": 2.7053511142730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126669, + "balance_loss_mlp": 1.09605598, + "epoch": 0.07425933051173528, + "flos": 497468611584.0, + "grad_norm": 0.0847962235917395, + "language_loss": 0.87653643, + "learning_rate": 0.0009948743808265148, + "loss": 0.88780314, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.30566406, + "step": 386, + "time_per_iteration": 2.678089141845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138501, + "balance_loss_mlp": 1.10865068, + "epoch": 0.07445171219699885, + "flos": 504740630016.0, + "grad_norm": 0.08492617281736899, + "language_loss": 0.97336739, + "learning_rate": 0.0009948297904714782, + "loss": 0.98475236, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.29833984, + "step": 387, + "time_per_iteration": 2.7185778617858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146366, + "balance_loss_mlp": 1.11620593, + "epoch": 0.07464409388226241, + "flos": 553693515264.0, + "grad_norm": 0.07151378861674496, + "language_loss": 0.90523744, + "learning_rate": 0.0009947850080064796, + "loss": 0.91670114, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.30151367, + "step": 388, + "time_per_iteration": 2.799166202545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158036, + "balance_loss_mlp": 1.12689841, + "epoch": 0.07483647556752597, + "flos": 776511028224.0, + "grad_norm": 0.11664332596196766, + "language_loss": 0.94951898, + "learning_rate": 0.0009947400334489047, + "loss": 0.96109939, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.31103516, + "step": 389, + "time_per_iteration": 3.0231211185455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146122, + "balance_loss_mlp": 1.11646235, + "epoch": 0.07502885725278953, + "flos": 612256596480.0, + "grad_norm": 0.09913116245985863, + "language_loss": 0.85822582, + "learning_rate": 0.0009946948668162145, + "loss": 0.86968708, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.29638672, + "step": 390, + "time_per_iteration": 2.8080904483795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129912, + "balance_loss_mlp": 1.09910846, + "epoch": 0.0752212389380531, + "flos": 688324552704.0, + "grad_norm": 0.1060751216039937, + "language_loss": 0.91006148, + "learning_rate": 0.0009946495081259441, + "loss": 0.92136061, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.30786133, + "step": 391, + "time_per_iteration": 2.853335380554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125709, + "balance_loss_mlp": 1.09528649, + "epoch": 0.07541362062331666, + "flos": 765362967552.0, + "grad_norm": 0.10996734320487103, + "language_loss": 0.93701887, + "learning_rate": 0.0009946039573957035, + "loss": 0.94827592, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.30371094, + "step": 392, + "time_per_iteration": 2.926420211791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107845, + "balance_loss_mlp": 1.07887673, + "epoch": 0.07560600230858022, + "flos": 588460336128.0, + "grad_norm": 0.10253812696642157, + "language_loss": 0.91059798, + "learning_rate": 0.000994558214643177, + "loss": 0.92167646, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.28979492, + "step": 393, + "time_per_iteration": 2.783536434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103583, + "balance_loss_mlp": 1.07344699, + "epoch": 0.07579838399384378, + "flos": 749508028416.0, + "grad_norm": 0.08274248346409746, + "language_loss": 0.91916323, + "learning_rate": 0.000994512279886123, + "loss": 0.93019903, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.30078125, + "step": 394, + "time_per_iteration": 3.0799474716186523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099135, + "balance_loss_mlp": 1.06902301, + "epoch": 0.07599076567910736, + "flos": 523185440256.0, + "grad_norm": 0.06927054930208885, + "language_loss": 0.93251747, + "learning_rate": 0.0009944661531423758, + "loss": 0.9435088, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.30078125, + "step": 395, + "time_per_iteration": 2.6641883850097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103492, + "balance_loss_mlp": 1.07383251, + "epoch": 0.07618314736437092, + "flos": 550812662784.0, + "grad_norm": 0.09904896099194287, + "language_loss": 0.91404933, + "learning_rate": 0.000994419834429843, + "loss": 0.92508423, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.29638672, + "step": 396, + "time_per_iteration": 2.661850690841675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114049, + "balance_loss_mlp": 1.08257747, + "epoch": 0.07637552904963447, + "flos": 697901253120.0, + "grad_norm": 0.10979610845710805, + "language_loss": 0.93416023, + "learning_rate": 0.0009943733237665069, + "loss": 0.94530076, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.31445312, + "step": 397, + "time_per_iteration": 2.854339361190796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111663, + "balance_loss_mlp": 1.08561158, + "epoch": 0.07656791073489803, + "flos": 579066928128.0, + "grad_norm": 0.07380051857889673, + "language_loss": 0.9521122, + "learning_rate": 0.0009943266211704248, + "loss": 0.96327847, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.30981445, + "step": 398, + "time_per_iteration": 2.958059787750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110413, + "balance_loss_mlp": 1.0786798, + "epoch": 0.0767602924201616, + "flos": 416923711488.0, + "grad_norm": 0.09100164928673704, + "language_loss": 0.97291386, + "learning_rate": 0.000994279726659728, + "loss": 0.98401797, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.31713867, + "step": 399, + "time_per_iteration": 2.5242953300476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128703, + "balance_loss_mlp": 1.09413218, + "epoch": 0.07695267410542517, + "flos": 482671471104.0, + "grad_norm": 0.09258616119375639, + "language_loss": 0.92782032, + "learning_rate": 0.0009942326402526231, + "loss": 0.93910736, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.34594727, + "step": 400, + "time_per_iteration": 2.5207695960998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143337, + "balance_loss_mlp": 1.10955346, + "epoch": 0.07714505579068873, + "flos": 530742647808.0, + "grad_norm": 0.07710774358121592, + "language_loss": 0.92332727, + "learning_rate": 0.0009941853619673902, + "loss": 0.93476063, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.33789062, + "step": 401, + "time_per_iteration": 2.6304752826690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142379, + "balance_loss_mlp": 1.10947704, + "epoch": 0.07733743747595229, + "flos": 804635845632.0, + "grad_norm": 0.09709488616354546, + "language_loss": 0.95104444, + "learning_rate": 0.0009941378918223844, + "loss": 0.96246827, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.32885742, + "step": 402, + "time_per_iteration": 3.0903730392456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136269, + "balance_loss_mlp": 1.10186553, + "epoch": 0.07752981916121585, + "flos": 622192679424.0, + "grad_norm": 0.09176808059924663, + "language_loss": 0.88839906, + "learning_rate": 0.0009940902298360354, + "loss": 0.89976174, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.34423828, + "step": 403, + "time_per_iteration": 2.7252347469329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129357, + "balance_loss_mlp": 1.09478593, + "epoch": 0.07772220084647942, + "flos": 727961195520.0, + "grad_norm": 0.08094022735558755, + "language_loss": 0.96807957, + "learning_rate": 0.0009940423760268473, + "loss": 0.9793731, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.34619141, + "step": 404, + "time_per_iteration": 2.912560224533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136255, + "balance_loss_mlp": 1.0998956, + "epoch": 0.07791458253174298, + "flos": 555149984256.0, + "grad_norm": 0.1131644160055788, + "language_loss": 0.90535253, + "learning_rate": 0.0009939943304133982, + "loss": 0.91671515, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.36352539, + "step": 405, + "time_per_iteration": 2.691524028778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128492, + "balance_loss_mlp": 1.09301567, + "epoch": 0.07810696421700654, + "flos": 552919495680.0, + "grad_norm": 0.0877419108538044, + "language_loss": 0.97356665, + "learning_rate": 0.0009939460930143416, + "loss": 0.9848516, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.35522461, + "step": 406, + "time_per_iteration": 2.624150276184082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130223, + "balance_loss_mlp": 1.09484172, + "epoch": 0.0782993459022701, + "flos": 650323289088.0, + "grad_norm": 0.0945833964014614, + "language_loss": 0.92588282, + "learning_rate": 0.0009938976638484043, + "loss": 0.93718511, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.35400391, + "step": 407, + "time_per_iteration": 2.943443775177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132372, + "balance_loss_mlp": 1.09625125, + "epoch": 0.07849172758753367, + "flos": 495926364672.0, + "grad_norm": 0.11302097827133319, + "language_loss": 0.90334702, + "learning_rate": 0.0009938490429343887, + "loss": 0.91467071, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.36157227, + "step": 408, + "time_per_iteration": 2.5614538192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156856, + "balance_loss_mlp": 1.11870956, + "epoch": 0.07868410927279723, + "flos": 577696389120.0, + "grad_norm": 0.08706398753077066, + "language_loss": 0.9151262, + "learning_rate": 0.0009938002302911709, + "loss": 0.92669487, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.38134766, + "step": 409, + "time_per_iteration": 2.7606911659240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185423, + "balance_loss_mlp": 1.14758611, + "epoch": 0.07887649095806079, + "flos": 522698019840.0, + "grad_norm": 0.11763043112663725, + "language_loss": 0.93195748, + "learning_rate": 0.0009937512259377015, + "loss": 0.94381177, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.37841797, + "step": 410, + "time_per_iteration": 2.664318323135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187972, + "balance_loss_mlp": 1.15199518, + "epoch": 0.07906887264332435, + "flos": 556958481408.0, + "grad_norm": 0.10450629225071802, + "language_loss": 0.93972069, + "learning_rate": 0.000993702029893006, + "loss": 0.95160043, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.359375, + "step": 411, + "time_per_iteration": 2.78944730758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182604, + "balance_loss_mlp": 1.14679348, + "epoch": 0.07926125432858792, + "flos": 821641715712.0, + "grad_norm": 0.0999267349206771, + "language_loss": 0.93036819, + "learning_rate": 0.0009936526421761838, + "loss": 0.94219422, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.3581543, + "step": 412, + "time_per_iteration": 3.070317268371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138713, + "balance_loss_mlp": 1.1031884, + "epoch": 0.07945363601385148, + "flos": 562072794624.0, + "grad_norm": 0.103699157973277, + "language_loss": 0.95454085, + "learning_rate": 0.000993603062806409, + "loss": 0.96592796, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.35546875, + "step": 413, + "time_per_iteration": 2.6778509616851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111475, + "balance_loss_mlp": 1.080966, + "epoch": 0.07964601769911504, + "flos": 517615792128.0, + "grad_norm": 0.1031900517026183, + "language_loss": 0.96687901, + "learning_rate": 0.0009935532918029298, + "loss": 0.97802651, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.33813477, + "step": 414, + "time_per_iteration": 2.598691701889038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115143, + "balance_loss_mlp": 1.08016729, + "epoch": 0.0798383993843786, + "flos": 538956011520.0, + "grad_norm": 0.10374121868926973, + "language_loss": 0.91896659, + "learning_rate": 0.0009935033291850694, + "loss": 0.93011802, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.35009766, + "step": 415, + "time_per_iteration": 2.6626100540161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136571, + "balance_loss_mlp": 1.10121322, + "epoch": 0.08003078106964218, + "flos": 484901959680.0, + "grad_norm": 0.1007950470797911, + "language_loss": 0.94399852, + "learning_rate": 0.0009934531749722247, + "loss": 0.95536423, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.35351562, + "step": 416, + "time_per_iteration": 2.6062543392181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161455, + "balance_loss_mlp": 1.12671685, + "epoch": 0.08022316275490574, + "flos": 517999905792.0, + "grad_norm": 0.14193661609984684, + "language_loss": 0.91743952, + "learning_rate": 0.0009934028291838672, + "loss": 0.92905408, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.34790039, + "step": 417, + "time_per_iteration": 2.7159759998321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170119, + "balance_loss_mlp": 1.134166, + "epoch": 0.0804155444401693, + "flos": 493755512832.0, + "grad_norm": 0.12060272101738621, + "language_loss": 0.87969685, + "learning_rate": 0.0009933522918395433, + "loss": 0.89139807, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.35961914, + "step": 418, + "time_per_iteration": 2.6525259017944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288605, + "balance_loss_mlp": 1.26361907, + "epoch": 0.08060792612543285, + "flos": 1580567579136.0, + "grad_norm": 0.05680606480361405, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79539704, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.24902344, + "step": 419, + "time_per_iteration": 4.8565216064453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147061, + "balance_loss_mlp": 1.11074984, + "epoch": 0.08080030781069643, + "flos": 525090041856.0, + "grad_norm": 0.12828879348175987, + "language_loss": 1.03302395, + "learning_rate": 0.000993250642561551, + "loss": 1.04449451, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.36279297, + "step": 420, + "time_per_iteration": 2.6118712425231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139372, + "balance_loss_mlp": 1.10284615, + "epoch": 0.08099268949595999, + "flos": 546459374592.0, + "grad_norm": 0.09279765906948532, + "language_loss": 0.90646845, + "learning_rate": 0.0009931995306673466, + "loss": 0.91786218, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.36499023, + "step": 421, + "time_per_iteration": 2.7097063064575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137947, + "balance_loss_mlp": 1.10170722, + "epoch": 0.08118507118122355, + "flos": 510116811264.0, + "grad_norm": 0.12264346802799699, + "language_loss": 0.9584164, + "learning_rate": 0.000993148227296103, + "loss": 0.96979594, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.36254883, + "step": 422, + "time_per_iteration": 2.6224865913391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112855, + "balance_loss_mlp": 1.093431, + "epoch": 0.08137745286648711, + "flos": 720339969024.0, + "grad_norm": 0.09272021371299098, + "language_loss": 0.85445499, + "learning_rate": 0.000993096732467738, + "loss": 0.86574042, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.3515625, + "step": 423, + "time_per_iteration": 2.9733965396881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140481, + "balance_loss_mlp": 1.10407472, + "epoch": 0.08156983455175067, + "flos": 679313848320.0, + "grad_norm": 0.12206645659912072, + "language_loss": 0.90398526, + "learning_rate": 0.0009930450462022435, + "loss": 0.91539013, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.36376953, + "step": 424, + "time_per_iteration": 2.8079323768615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300116, + "balance_loss_mlp": 1.2751298, + "epoch": 0.08176221623701424, + "flos": 1452577135104.0, + "grad_norm": 0.07506497844528874, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80489922, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.24902344, + "step": 425, + "time_per_iteration": 4.905512809753418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121946, + "balance_loss_mlp": 1.08668423, + "epoch": 0.0819545979222778, + "flos": 1556034071040.0, + "grad_norm": 0.10499242287280508, + "language_loss": 0.89529157, + "learning_rate": 0.0009929410994402065, + "loss": 0.90651101, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.35327148, + "step": 426, + "time_per_iteration": 3.7398970127105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141941, + "balance_loss_mlp": 1.1045804, + "epoch": 0.08214697960754136, + "flos": 512456398848.0, + "grad_norm": 0.10023640482449404, + "language_loss": 0.93921095, + "learning_rate": 0.0009928888389840196, + "loss": 0.95063031, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.3737793, + "step": 427, + "time_per_iteration": 2.71114182472229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119992, + "balance_loss_mlp": 1.08430111, + "epoch": 0.08233936129280492, + "flos": 594850646016.0, + "grad_norm": 0.11276239209208863, + "language_loss": 0.96473306, + "learning_rate": 0.0009928363871714147, + "loss": 0.97593296, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.35742188, + "step": 428, + "time_per_iteration": 2.719052314758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118917, + "balance_loss_mlp": 1.0826056, + "epoch": 0.08253174297806849, + "flos": 571758594048.0, + "grad_norm": 0.08720961611908505, + "language_loss": 0.91275012, + "learning_rate": 0.0009927837440227556, + "loss": 0.92393929, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.36303711, + "step": 429, + "time_per_iteration": 2.854044198989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098875, + "balance_loss_mlp": 1.06418514, + "epoch": 0.08272412466333205, + "flos": 623065623552.0, + "grad_norm": 0.07075242488451733, + "language_loss": 0.87952864, + "learning_rate": 0.0009927309095584798, + "loss": 0.89051735, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.34692383, + "step": 430, + "time_per_iteration": 2.9898674488067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102101, + "balance_loss_mlp": 1.06748247, + "epoch": 0.08291650634859561, + "flos": 513745542144.0, + "grad_norm": 0.11797379038125863, + "language_loss": 0.97102249, + "learning_rate": 0.0009926778837991, + "loss": 0.9820435, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.34643555, + "step": 431, + "time_per_iteration": 2.577531099319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111279, + "balance_loss_mlp": 1.07582581, + "epoch": 0.08310888803385917, + "flos": 667073083392.0, + "grad_norm": 0.09137951270996447, + "language_loss": 0.95161557, + "learning_rate": 0.000992624666765202, + "loss": 0.96272832, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.35498047, + "step": 432, + "time_per_iteration": 2.841384172439575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141665, + "balance_loss_mlp": 1.10540199, + "epoch": 0.08330126971912274, + "flos": 582995404800.0, + "grad_norm": 0.1226792169188856, + "language_loss": 0.92907685, + "learning_rate": 0.000992571258477447, + "loss": 0.94049346, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.36279297, + "step": 433, + "time_per_iteration": 2.810683250427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131869, + "balance_loss_mlp": 1.0957005, + "epoch": 0.0834936514043863, + "flos": 561064458240.0, + "grad_norm": 0.09107414958413955, + "language_loss": 0.88094407, + "learning_rate": 0.0009925176589565695, + "loss": 0.8922627, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.36206055, + "step": 434, + "time_per_iteration": 2.7925446033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112767, + "balance_loss_mlp": 1.09081006, + "epoch": 0.08368603308964986, + "flos": 494272046592.0, + "grad_norm": 0.12869710653201102, + "language_loss": 0.96048987, + "learning_rate": 0.0009924638682233791, + "loss": 0.97176659, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.36865234, + "step": 435, + "time_per_iteration": 2.578301191329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293384, + "balance_loss_mlp": 1.26963747, + "epoch": 0.08387841477491342, + "flos": 1388322312192.0, + "grad_norm": 0.05787730041443156, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80857974, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.23730469, + "step": 436, + "time_per_iteration": 4.577009201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105613, + "balance_loss_mlp": 1.07092249, + "epoch": 0.084070796460177, + "flos": 798642796032.0, + "grad_norm": 0.09893423016048233, + "language_loss": 0.86262441, + "learning_rate": 0.0009923557132036668, + "loss": 0.87368047, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.34716797, + "step": 437, + "time_per_iteration": 3.0512332916259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111937, + "balance_loss_mlp": 1.07641208, + "epoch": 0.08426317814544056, + "flos": 558681200640.0, + "grad_norm": 0.08022134137003532, + "language_loss": 0.92201281, + "learning_rate": 0.0009923013489591345, + "loss": 0.93313217, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.35571289, + "step": 438, + "time_per_iteration": 2.74950909614563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101532, + "balance_loss_mlp": 1.06724763, + "epoch": 0.08445555983070412, + "flos": 810057106944.0, + "grad_norm": 0.100162941065544, + "language_loss": 0.90520388, + "learning_rate": 0.0009922467935862681, + "loss": 0.91621923, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.34326172, + "step": 439, + "time_per_iteration": 3.0904464721679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117546, + "balance_loss_mlp": 1.08307123, + "epoch": 0.08464794151596768, + "flos": 509939311104.0, + "grad_norm": 0.0868598025723284, + "language_loss": 0.93269211, + "learning_rate": 0.0009921920471062478, + "loss": 0.94386756, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.34521484, + "step": 440, + "time_per_iteration": 2.5794718265533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129423, + "balance_loss_mlp": 1.09458995, + "epoch": 0.08484032320123125, + "flos": 556149556224.0, + "grad_norm": 0.08760481485615552, + "language_loss": 0.90004873, + "learning_rate": 0.0009921371095403281, + "loss": 0.91134298, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.34863281, + "step": 441, + "time_per_iteration": 2.6602251529693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146723, + "balance_loss_mlp": 1.11010158, + "epoch": 0.08503270488649481, + "flos": 527103742464.0, + "grad_norm": 0.0774335957746243, + "language_loss": 0.93349928, + "learning_rate": 0.0009920819809098379, + "loss": 0.9449665, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.3659668, + "step": 442, + "time_per_iteration": 2.601776123046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154219, + "balance_loss_mlp": 1.11693072, + "epoch": 0.08522508657175837, + "flos": 613989490176.0, + "grad_norm": 0.07362842569129122, + "language_loss": 0.88841242, + "learning_rate": 0.0009920266612361798, + "loss": 0.89995468, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.37255859, + "step": 443, + "time_per_iteration": 2.730400800704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134029, + "balance_loss_mlp": 1.09712195, + "epoch": 0.08541746825702193, + "flos": 619495119360.0, + "grad_norm": 0.07691784169579122, + "language_loss": 0.90311241, + "learning_rate": 0.0009919711505408308, + "loss": 0.91445279, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.36889648, + "step": 444, + "time_per_iteration": 2.784175395965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136596, + "balance_loss_mlp": 1.0992831, + "epoch": 0.08560984994228549, + "flos": 482671471104.0, + "grad_norm": 0.10632405925705127, + "language_loss": 0.87768185, + "learning_rate": 0.000991915448845342, + "loss": 0.8890478, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.37329102, + "step": 445, + "time_per_iteration": 2.5208120346069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131693, + "balance_loss_mlp": 1.09581065, + "epoch": 0.08580223162754906, + "flos": 516897027072.0, + "grad_norm": 0.08773057765175464, + "language_loss": 0.96764338, + "learning_rate": 0.000991859556171339, + "loss": 0.97896028, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.35888672, + "step": 446, + "time_per_iteration": 2.62111759185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121751, + "balance_loss_mlp": 1.08582091, + "epoch": 0.08599461331281262, + "flos": 531215511552.0, + "grad_norm": 0.09700121256693707, + "language_loss": 0.97393352, + "learning_rate": 0.000991803472540521, + "loss": 0.98515099, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.359375, + "step": 447, + "time_per_iteration": 2.6023707389831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106051, + "balance_loss_mlp": 1.07155204, + "epoch": 0.08618699499807618, + "flos": 789966743040.0, + "grad_norm": 0.08203891217845936, + "language_loss": 0.9339667, + "learning_rate": 0.0009917471979746615, + "loss": 0.94502723, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.34521484, + "step": 448, + "time_per_iteration": 3.032045841217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108032, + "balance_loss_mlp": 1.07288861, + "epoch": 0.08637937668333974, + "flos": 565707317760.0, + "grad_norm": 0.07141468257554369, + "language_loss": 0.93266523, + "learning_rate": 0.0009916907324956086, + "loss": 0.94374555, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.35180664, + "step": 449, + "time_per_iteration": 2.7145769596099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124616, + "balance_loss_mlp": 1.08820987, + "epoch": 0.08657175836860331, + "flos": 444930665472.0, + "grad_norm": 0.07969277456361384, + "language_loss": 0.88546509, + "learning_rate": 0.0009916340761252837, + "loss": 0.89671123, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.36376953, + "step": 450, + "time_per_iteration": 2.623152017593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137489, + "balance_loss_mlp": 1.10108209, + "epoch": 0.08676414005386687, + "flos": 843789450240.0, + "grad_norm": 0.11402885145068274, + "language_loss": 0.86408567, + "learning_rate": 0.0009915772288856832, + "loss": 0.87546057, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.36474609, + "step": 451, + "time_per_iteration": 3.069053888320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137973, + "balance_loss_mlp": 1.10178065, + "epoch": 0.08695652173913043, + "flos": 602995608576.0, + "grad_norm": 0.09443027615205003, + "language_loss": 0.88496101, + "learning_rate": 0.000991520190798877, + "loss": 0.89634073, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.36206055, + "step": 452, + "time_per_iteration": 2.8196520805358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146235, + "balance_loss_mlp": 1.10906577, + "epoch": 0.08714890342439399, + "flos": 730423028736.0, + "grad_norm": 0.10286670415776202, + "language_loss": 0.95532084, + "learning_rate": 0.0009914629618870089, + "loss": 0.96678317, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.37158203, + "step": 453, + "time_per_iteration": 2.8787243366241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247018, + "balance_loss_mlp": 1.22422564, + "epoch": 0.08734128510965757, + "flos": 1481518232064.0, + "grad_norm": 0.049899161357568285, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79922891, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.22753906, + "step": 454, + "time_per_iteration": 4.787290811538696 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212398, + "balance_loss_mlp": 1.19036818, + "epoch": 0.08753366679492113, + "flos": 1522214083584.0, + "grad_norm": 0.0324381166824538, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82640362, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.22070312, + "step": 455, + "time_per_iteration": 4.818731784820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120368, + "balance_loss_mlp": 1.08324623, + "epoch": 0.08772604848018468, + "flos": 720935078400.0, + "grad_norm": 0.09487211541236003, + "language_loss": 0.89355373, + "learning_rate": 0.0009912901304235883, + "loss": 0.90475744, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.37133789, + "step": 456, + "time_per_iteration": 2.8851993083953857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011174, + "balance_loss_mlp": 1.08108902, + "epoch": 0.08791843016544824, + "flos": 707926086144.0, + "grad_norm": 0.09303414624011808, + "language_loss": 0.85744059, + "learning_rate": 0.000991232138434397, + "loss": 0.86861455, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.36352539, + "step": 457, + "time_per_iteration": 2.8450586795806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118359, + "balance_loss_mlp": 1.08126163, + "epoch": 0.08811081185071182, + "flos": 472799407104.0, + "grad_norm": 0.11356405017629323, + "language_loss": 0.91543031, + "learning_rate": 0.000991173955731976, + "loss": 0.92661393, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.37084961, + "step": 458, + "time_per_iteration": 2.6324169635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117714, + "balance_loss_mlp": 1.08190393, + "epoch": 0.08830319353597538, + "flos": 684647769600.0, + "grad_norm": 0.08091220448679284, + "language_loss": 0.98039645, + "learning_rate": 0.0009911155823389137, + "loss": 0.99157357, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.3581543, + "step": 459, + "time_per_iteration": 2.9783670902252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121069, + "balance_loss_mlp": 1.08451915, + "epoch": 0.08849557522123894, + "flos": 573235411968.0, + "grad_norm": 0.0940583187075056, + "language_loss": 0.93095994, + "learning_rate": 0.000991057018277873, + "loss": 0.94217062, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.36499023, + "step": 460, + "time_per_iteration": 2.742830276489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112002, + "balance_loss_mlp": 1.08380461, + "epoch": 0.0886879569065025, + "flos": 564303283200.0, + "grad_norm": 0.10556048763009983, + "language_loss": 0.92411214, + "learning_rate": 0.0009909982635715898, + "loss": 0.93531239, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.36279297, + "step": 461, + "time_per_iteration": 2.613490581512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111287, + "balance_loss_mlp": 1.07595301, + "epoch": 0.08888033859176607, + "flos": 563609249280.0, + "grad_norm": 0.07908948831956038, + "language_loss": 0.92236221, + "learning_rate": 0.0009909393182428751, + "loss": 0.93347514, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.35351562, + "step": 462, + "time_per_iteration": 2.654144048690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109293, + "balance_loss_mlp": 1.07331538, + "epoch": 0.08907272027702963, + "flos": 465517214208.0, + "grad_norm": 0.06646518051532449, + "language_loss": 0.87202108, + "learning_rate": 0.000990880182314614, + "loss": 0.88311398, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.359375, + "step": 463, + "time_per_iteration": 2.705138921737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108897, + "balance_loss_mlp": 1.07473207, + "epoch": 0.08926510196229319, + "flos": 681200921088.0, + "grad_norm": 0.06803924695737752, + "language_loss": 0.88676465, + "learning_rate": 0.0009908208558097643, + "loss": 0.89785367, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.34204102, + "step": 464, + "time_per_iteration": 2.971322536468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120032, + "balance_loss_mlp": 1.08412576, + "epoch": 0.08945748364755675, + "flos": 596411831808.0, + "grad_norm": 0.15708102336048957, + "language_loss": 0.90012753, + "learning_rate": 0.000990761338751359, + "loss": 0.91132784, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.35913086, + "step": 465, + "time_per_iteration": 2.7719008922576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01301625, + "balance_loss_mlp": 1.28073931, + "epoch": 0.08964986533282032, + "flos": 1585082400768.0, + "grad_norm": 0.06799997970585842, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74961245, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.20898438, + "step": 466, + "time_per_iteration": 4.991540193557739 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131475, + "balance_loss_mlp": 1.09637952, + "epoch": 0.08984224701808388, + "flos": 533268499968.0, + "grad_norm": 0.10779867371948758, + "language_loss": 0.9214865, + "learning_rate": 0.0009906417330663815, + "loss": 0.93280125, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.35131836, + "step": 467, + "time_per_iteration": 2.7089412212371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124394, + "balance_loss_mlp": 1.08917928, + "epoch": 0.09003462870334744, + "flos": 478702296576.0, + "grad_norm": 0.08471126953208015, + "language_loss": 0.88495421, + "learning_rate": 0.0009905816444862442, + "loss": 0.89619815, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.35253906, + "step": 468, + "time_per_iteration": 2.616262435913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129147, + "balance_loss_mlp": 1.09371758, + "epoch": 0.090227010388611, + "flos": 653307448320.0, + "grad_norm": 0.07702844129808738, + "language_loss": 0.87126988, + "learning_rate": 0.0009905213654454216, + "loss": 0.88256133, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.35473633, + "step": 469, + "time_per_iteration": 2.9097750186920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143119, + "balance_loss_mlp": 1.10678387, + "epoch": 0.09041939207387456, + "flos": 617894645760.0, + "grad_norm": 0.09194049655048094, + "language_loss": 0.92914081, + "learning_rate": 0.0009904608959673158, + "loss": 0.9405719, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.36328125, + "step": 470, + "time_per_iteration": 2.8030929565429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141965, + "balance_loss_mlp": 1.10491443, + "epoch": 0.09061177375913813, + "flos": 454137808896.0, + "grad_norm": 0.10933441897375067, + "language_loss": 0.92262268, + "learning_rate": 0.000990400236075403, + "loss": 0.93404239, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.37036133, + "step": 471, + "time_per_iteration": 2.4859976768493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117092, + "balance_loss_mlp": 1.08183014, + "epoch": 0.0908041554444017, + "flos": 543982984704.0, + "grad_norm": 0.08808088949589198, + "language_loss": 0.90884256, + "learning_rate": 0.0009903393857932338, + "loss": 0.92001355, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.35302734, + "step": 472, + "time_per_iteration": 2.6540582180023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115458, + "balance_loss_mlp": 1.07933736, + "epoch": 0.09099653712966525, + "flos": 564052999680.0, + "grad_norm": 0.08261940405294126, + "language_loss": 0.88272375, + "learning_rate": 0.0009902783451444317, + "loss": 0.89387828, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.36108398, + "step": 473, + "time_per_iteration": 2.7061197757720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116197, + "balance_loss_mlp": 1.0812211, + "epoch": 0.09118891881492881, + "flos": 474300956160.0, + "grad_norm": 0.11656166861680099, + "language_loss": 0.93563545, + "learning_rate": 0.0009902171141526956, + "loss": 0.94679749, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.34960938, + "step": 474, + "time_per_iteration": 2.524653911590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111201, + "balance_loss_mlp": 1.0760566, + "epoch": 0.09138130050019239, + "flos": 545579076096.0, + "grad_norm": 0.07692578036886621, + "language_loss": 0.81933677, + "learning_rate": 0.000990155692841797, + "loss": 0.83045685, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.35961914, + "step": 475, + "time_per_iteration": 2.9645543098449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106767, + "balance_loss_mlp": 1.07281613, + "epoch": 0.09157368218545595, + "flos": 732397441536.0, + "grad_norm": 0.08052092373184025, + "language_loss": 0.93009984, + "learning_rate": 0.0009900940812355818, + "loss": 0.94116753, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.33959961, + "step": 476, + "time_per_iteration": 2.8816893100738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107557, + "balance_loss_mlp": 1.07289076, + "epoch": 0.0917660638707195, + "flos": 610709967360.0, + "grad_norm": 0.14442514829584613, + "language_loss": 0.87309504, + "learning_rate": 0.00099003227935797, + "loss": 0.88417065, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.34716797, + "step": 477, + "time_per_iteration": 2.760615825653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122629, + "balance_loss_mlp": 1.08827257, + "epoch": 0.09195844555598306, + "flos": 655561257984.0, + "grad_norm": 0.12539398809889843, + "language_loss": 0.9113583, + "learning_rate": 0.000989970287232955, + "loss": 0.92258459, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.34399414, + "step": 478, + "time_per_iteration": 2.826150894165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119533, + "balance_loss_mlp": 1.08720374, + "epoch": 0.09215082724124664, + "flos": 476339387904.0, + "grad_norm": 0.06731886459053077, + "language_loss": 0.89701962, + "learning_rate": 0.0009899081048846043, + "loss": 0.90821493, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.32324219, + "step": 479, + "time_per_iteration": 2.580028772354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143549, + "balance_loss_mlp": 1.1092639, + "epoch": 0.0923432089265102, + "flos": 524051182080.0, + "grad_norm": 0.1155425244176876, + "language_loss": 0.9372611, + "learning_rate": 0.0009898457323370593, + "loss": 0.94869661, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.34301758, + "step": 480, + "time_per_iteration": 2.6090288162231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135277, + "balance_loss_mlp": 1.10132647, + "epoch": 0.09253559061177376, + "flos": 545302651392.0, + "grad_norm": 0.08946460297910715, + "language_loss": 0.92488086, + "learning_rate": 0.000989783169614535, + "loss": 0.93623364, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.33984375, + "step": 481, + "time_per_iteration": 2.6434848308563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130787, + "balance_loss_mlp": 1.28212094, + "epoch": 0.09272797229703732, + "flos": 1537222219776.0, + "grad_norm": 0.06384431456169105, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80060625, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.2578125, + "step": 482, + "time_per_iteration": 4.903714656829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120119, + "balance_loss_mlp": 1.08695483, + "epoch": 0.09292035398230089, + "flos": 689501624832.0, + "grad_norm": 0.0974321715773629, + "language_loss": 0.90389109, + "learning_rate": 0.000989657473741779, + "loss": 0.91509223, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.33178711, + "step": 483, + "time_per_iteration": 2.841749668121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132007, + "balance_loss_mlp": 1.09858036, + "epoch": 0.09311273566756445, + "flos": 509482414080.0, + "grad_norm": 0.07196755449742197, + "language_loss": 0.91361248, + "learning_rate": 0.0009895943406403465, + "loss": 0.9249326, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.33447266, + "step": 484, + "time_per_iteration": 2.728733539581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146209, + "balance_loss_mlp": 1.11137581, + "epoch": 0.09330511735282801, + "flos": 659111413248.0, + "grad_norm": 0.10097789553078372, + "language_loss": 0.84299308, + "learning_rate": 0.0009895310174615338, + "loss": 0.85445517, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.34863281, + "step": 485, + "time_per_iteration": 2.74460506439209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214233, + "balance_loss_mlp": 1.19239426, + "epoch": 0.09349749903809157, + "flos": 1452054809088.0, + "grad_norm": 0.04007792490845654, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76932752, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.21875, + "step": 486, + "time_per_iteration": 4.653090715408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135045, + "balance_loss_mlp": 1.10161829, + "epoch": 0.09368988072335514, + "flos": 520614508032.0, + "grad_norm": 0.07938978312310574, + "language_loss": 0.89514428, + "learning_rate": 0.0009894038009701782, + "loss": 0.90649474, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.33447266, + "step": 487, + "time_per_iteration": 2.6534616947174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145632, + "balance_loss_mlp": 1.1106087, + "epoch": 0.0938822624086187, + "flos": 497502107136.0, + "grad_norm": 0.09344776572677456, + "language_loss": 0.87733328, + "learning_rate": 0.0009893399077070253, + "loss": 0.88878953, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.35083008, + "step": 488, + "time_per_iteration": 2.5616586208343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130599, + "balance_loss_mlp": 1.09702933, + "epoch": 0.09407464409388226, + "flos": 532948405248.0, + "grad_norm": 0.08887912188605798, + "language_loss": 0.87485397, + "learning_rate": 0.0009892758244652718, + "loss": 0.8861599, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.3359375, + "step": 489, + "time_per_iteration": 2.6878652572631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114876, + "balance_loss_mlp": 1.08078194, + "epoch": 0.09426702577914582, + "flos": 585736634880.0, + "grad_norm": 0.08770205653150476, + "language_loss": 0.91117108, + "learning_rate": 0.0009892115512697968, + "loss": 0.92231989, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.34130859, + "step": 490, + "time_per_iteration": 2.67647123336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113824, + "balance_loss_mlp": 1.0808506, + "epoch": 0.0944594074644094, + "flos": 503081929728.0, + "grad_norm": 0.06826247830552083, + "language_loss": 0.94586283, + "learning_rate": 0.0009891470881455537, + "loss": 0.95700109, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.32983398, + "step": 491, + "time_per_iteration": 2.7388105392456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109252, + "balance_loss_mlp": 1.07627821, + "epoch": 0.09465178914967295, + "flos": 570748847616.0, + "grad_norm": 0.08083030362482532, + "language_loss": 0.90903842, + "learning_rate": 0.0009890824351175692, + "loss": 0.92013097, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.32983398, + "step": 492, + "time_per_iteration": 2.710557222366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108934, + "balance_loss_mlp": 1.07586551, + "epoch": 0.09484417083493651, + "flos": 549098707968.0, + "grad_norm": 0.07986708443523517, + "language_loss": 0.96040058, + "learning_rate": 0.0009890175922109435, + "loss": 0.97148991, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.33081055, + "step": 493, + "time_per_iteration": 2.748145341873169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119937, + "balance_loss_mlp": 1.08686852, + "epoch": 0.09503655252020007, + "flos": 823552109568.0, + "grad_norm": 0.1003982234968368, + "language_loss": 0.93827844, + "learning_rate": 0.0009889525594508513, + "loss": 0.94947779, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.33081055, + "step": 494, + "time_per_iteration": 2.9940547943115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113416, + "balance_loss_mlp": 1.08037138, + "epoch": 0.09522893420546363, + "flos": 404397757440.0, + "grad_norm": 0.06206488721584602, + "language_loss": 0.88783181, + "learning_rate": 0.0009888873368625404, + "loss": 0.89896601, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.33056641, + "step": 495, + "time_per_iteration": 2.5122387409210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129, + "balance_loss_mlp": 1.09557426, + "epoch": 0.0954213158907272, + "flos": 690707810304.0, + "grad_norm": 0.08099902604416225, + "language_loss": 0.9180485, + "learning_rate": 0.0009888219244713326, + "loss": 0.92933846, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.33447266, + "step": 496, + "time_per_iteration": 2.8516368865966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145932, + "balance_loss_mlp": 1.11152768, + "epoch": 0.09561369757599077, + "flos": 518739019776.0, + "grad_norm": 0.09295440988952328, + "language_loss": 0.91113585, + "learning_rate": 0.0009887563223026229, + "loss": 0.92259514, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.34423828, + "step": 497, + "time_per_iteration": 2.7165610790252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226975, + "balance_loss_mlp": 1.20780587, + "epoch": 0.09580607926125433, + "flos": 1384825849344.0, + "grad_norm": 0.04473280554485948, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80295134, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.19140625, + "step": 498, + "time_per_iteration": 4.9133830070495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158093, + "balance_loss_mlp": 1.12261629, + "epoch": 0.09599846094651789, + "flos": 717090969600.0, + "grad_norm": 0.0716278208231272, + "language_loss": 0.91129965, + "learning_rate": 0.0009886245487346482, + "loss": 0.92288053, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.35522461, + "step": 499, + "time_per_iteration": 3.074453353881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151408, + "balance_loss_mlp": 1.1164794, + "epoch": 0.09619084263178146, + "flos": 385824909312.0, + "grad_norm": 0.09258819117654143, + "language_loss": 0.93041325, + "learning_rate": 0.0009885583773865422, + "loss": 0.94192737, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.34912109, + "step": 500, + "time_per_iteration": 2.4206974506378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129637, + "balance_loss_mlp": 1.09482849, + "epoch": 0.09638322431704502, + "flos": 533869401600.0, + "grad_norm": 0.08421486249996342, + "language_loss": 0.90840685, + "learning_rate": 0.0009884920163632524, + "loss": 0.9197033, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.34814453, + "step": 501, + "time_per_iteration": 2.653083324432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133899, + "balance_loss_mlp": 1.09875655, + "epoch": 0.09657560600230858, + "flos": 500426629632.0, + "grad_norm": 0.08831216016047307, + "language_loss": 0.92406952, + "learning_rate": 0.000988425465690543, + "loss": 0.93540847, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.35180664, + "step": 502, + "time_per_iteration": 2.5902318954467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129227, + "balance_loss_mlp": 1.09363079, + "epoch": 0.09676798768757214, + "flos": 528995197440.0, + "grad_norm": 0.08884204924947281, + "language_loss": 0.89819443, + "learning_rate": 0.0009883587253942505, + "loss": 0.90948665, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.35595703, + "step": 503, + "time_per_iteration": 2.7927231788635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134871, + "balance_loss_mlp": 1.09956098, + "epoch": 0.09696036937283571, + "flos": 463379857920.0, + "grad_norm": 0.08422879575374595, + "language_loss": 0.96091402, + "learning_rate": 0.0009882917955002862, + "loss": 0.97226262, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.35302734, + "step": 504, + "time_per_iteration": 2.538280963897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117737, + "balance_loss_mlp": 1.08297515, + "epoch": 0.09715275105809927, + "flos": 534716204544.0, + "grad_norm": 0.07639016770494517, + "language_loss": 0.89420688, + "learning_rate": 0.0009882246760346343, + "loss": 0.9053843, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.34790039, + "step": 505, + "time_per_iteration": 2.6242942810058594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124058, + "balance_loss_mlp": 1.08834267, + "epoch": 0.09734513274336283, + "flos": 454713979392.0, + "grad_norm": 0.11518068103281653, + "language_loss": 0.92468822, + "learning_rate": 0.0009881573670233533, + "loss": 0.93592882, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.35742188, + "step": 506, + "time_per_iteration": 2.516587734222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114247, + "balance_loss_mlp": 1.08074903, + "epoch": 0.09753751442862639, + "flos": 508551243264.0, + "grad_norm": 0.07574597822432369, + "language_loss": 0.8811729, + "learning_rate": 0.0009880898684925747, + "loss": 0.89231527, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.33520508, + "step": 507, + "time_per_iteration": 2.693880081176758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107378, + "balance_loss_mlp": 1.07402313, + "epoch": 0.09772989611388996, + "flos": 484030425600.0, + "grad_norm": 0.07603441014422499, + "language_loss": 0.86951101, + "learning_rate": 0.0009880221804685037, + "loss": 0.88058472, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.33374023, + "step": 508, + "time_per_iteration": 2.5847270488739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01468428, + "balance_loss_mlp": 1.44983101, + "epoch": 0.09792227779915352, + "flos": 1565306339328.0, + "grad_norm": 0.12348847609036423, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80812848, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.18554688, + "step": 509, + "time_per_iteration": 4.756307601928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123963, + "balance_loss_mlp": 1.09103727, + "epoch": 0.09811465948441708, + "flos": 587529165312.0, + "grad_norm": 0.08757433726580034, + "language_loss": 0.93106389, + "learning_rate": 0.0009878862360456733, + "loss": 0.9423036, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.32910156, + "step": 510, + "time_per_iteration": 2.6813509464263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110714, + "balance_loss_mlp": 1.07759809, + "epoch": 0.09830704116968064, + "flos": 612719285760.0, + "grad_norm": 0.08240718915912659, + "language_loss": 0.86918676, + "learning_rate": 0.0009878179796996922, + "loss": 0.88029397, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.33129883, + "step": 511, + "time_per_iteration": 2.7128310203552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113196, + "balance_loss_mlp": 1.08112836, + "epoch": 0.09849942285494422, + "flos": 538528227840.0, + "grad_norm": 0.07802243599022093, + "language_loss": 0.90101254, + "learning_rate": 0.0009877495339659754, + "loss": 0.91214454, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.32055664, + "step": 512, + "time_per_iteration": 2.8097684383392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102205, + "balance_loss_mlp": 1.07035255, + "epoch": 0.09869180454020778, + "flos": 620193535488.0, + "grad_norm": 0.09144065810451378, + "language_loss": 0.850245, + "learning_rate": 0.000987680898871096, + "loss": 0.86126709, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.31835938, + "step": 513, + "time_per_iteration": 2.7469441890716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108259, + "balance_loss_mlp": 1.07502341, + "epoch": 0.09888418622547133, + "flos": 811375363584.0, + "grad_norm": 0.10540688433367246, + "language_loss": 0.85520494, + "learning_rate": 0.0009876120744417, + "loss": 0.86628759, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.33251953, + "step": 514, + "time_per_iteration": 2.9515652656555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101037, + "balance_loss_mlp": 1.06818295, + "epoch": 0.0990765679107349, + "flos": 535548450816.0, + "grad_norm": 0.09508855922632749, + "language_loss": 0.93521011, + "learning_rate": 0.0009875430607045078, + "loss": 0.94622052, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.32861328, + "step": 515, + "time_per_iteration": 2.7193381786346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109471, + "balance_loss_mlp": 1.06164145, + "epoch": 0.09926894959599845, + "flos": 587607740928.0, + "grad_norm": 0.07449645219133615, + "language_loss": 0.90591514, + "learning_rate": 0.000987473857686313, + "loss": 0.91686225, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.33081055, + "step": 516, + "time_per_iteration": 2.7179975509643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115382, + "balance_loss_mlp": 1.08100188, + "epoch": 0.09946133128126203, + "flos": 640947409920.0, + "grad_norm": 0.10856360121839106, + "language_loss": 0.92182052, + "learning_rate": 0.0009874044654139824, + "loss": 0.9329744, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.34423828, + "step": 517, + "time_per_iteration": 2.7596991062164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135092, + "balance_loss_mlp": 1.10104585, + "epoch": 0.09965371296652559, + "flos": 465546327552.0, + "grad_norm": 0.10414801938878855, + "language_loss": 0.9130857, + "learning_rate": 0.0009873348839144563, + "loss": 0.92443669, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.34082031, + "step": 518, + "time_per_iteration": 2.5228705406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117267, + "balance_loss_mlp": 1.1381228, + "epoch": 0.09984609465178915, + "flos": 483365505024.0, + "grad_norm": 0.09626367264756285, + "language_loss": 0.94683075, + "learning_rate": 0.000987265113214749, + "loss": 0.95855749, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.34545898, + "step": 519, + "time_per_iteration": 2.5458812713623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118966, + "balance_loss_mlp": 1.15339625, + "epoch": 0.1000384763370527, + "flos": 568764260352.0, + "grad_norm": 0.12320854939875277, + "language_loss": 0.94298297, + "learning_rate": 0.0009871951533419476, + "loss": 0.95487958, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.36279297, + "step": 520, + "time_per_iteration": 2.663461208343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156318, + "balance_loss_mlp": 1.12010193, + "epoch": 0.10023085802231628, + "flos": 545515057152.0, + "grad_norm": 0.08720896475780489, + "language_loss": 0.86881042, + "learning_rate": 0.0009871250043232132, + "loss": 0.8803736, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.36206055, + "step": 521, + "time_per_iteration": 2.7820796966552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140376, + "balance_loss_mlp": 1.1049943, + "epoch": 0.10042323970757984, + "flos": 503208557568.0, + "grad_norm": 0.08876661910472074, + "language_loss": 0.85204661, + "learning_rate": 0.0009870546661857797, + "loss": 0.86345041, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.35375977, + "step": 522, + "time_per_iteration": 2.634274482727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152268, + "balance_loss_mlp": 1.11583781, + "epoch": 0.1006156213928434, + "flos": 770084402688.0, + "grad_norm": 0.08623162465623763, + "language_loss": 0.92886114, + "learning_rate": 0.0009869841389569553, + "loss": 0.94038385, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.36401367, + "step": 523, + "time_per_iteration": 3.0027353763580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151068, + "balance_loss_mlp": 1.11571026, + "epoch": 0.10080800307810696, + "flos": 489786338304.0, + "grad_norm": 0.07820731611640971, + "language_loss": 0.86882633, + "learning_rate": 0.0009869134226641206, + "loss": 0.880337, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.35424805, + "step": 524, + "time_per_iteration": 2.5850446224212646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159546, + "balance_loss_mlp": 1.12330627, + "epoch": 0.10100038476337053, + "flos": 454478252544.0, + "grad_norm": 0.07931950894681525, + "language_loss": 0.86448371, + "learning_rate": 0.0009868425173347303, + "loss": 0.8760792, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.36254883, + "step": 525, + "time_per_iteration": 2.6873726844787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171885, + "balance_loss_mlp": 1.13617015, + "epoch": 0.10119276644863409, + "flos": 556155348480.0, + "grad_norm": 0.09671662269899156, + "language_loss": 0.94872439, + "learning_rate": 0.0009867714229963125, + "loss": 0.96044326, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.35717773, + "step": 526, + "time_per_iteration": 2.697547197341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155786, + "balance_loss_mlp": 1.12083411, + "epoch": 0.10138514813389765, + "flos": 515990587392.0, + "grad_norm": 0.10324452979849556, + "language_loss": 0.9236598, + "learning_rate": 0.000986700139676468, + "loss": 0.93521762, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.34960938, + "step": 527, + "time_per_iteration": 2.5702626705169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171342, + "balance_loss_mlp": 1.1346494, + "epoch": 0.10157752981916121, + "flos": 500323322880.0, + "grad_norm": 0.08227699709590157, + "language_loss": 0.89510548, + "learning_rate": 0.0009866286674028717, + "loss": 0.90681893, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.36694336, + "step": 528, + "time_per_iteration": 2.699542284011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141961, + "balance_loss_mlp": 1.1081537, + "epoch": 0.10176991150442478, + "flos": 656444376576.0, + "grad_norm": 0.0843490367773928, + "language_loss": 0.8638742, + "learning_rate": 0.0009865570062032717, + "loss": 0.87529385, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.33837891, + "step": 529, + "time_per_iteration": 2.941728353500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114882, + "balance_loss_mlp": 1.11420166, + "epoch": 0.10196229318968834, + "flos": 572974953984.0, + "grad_norm": 0.07671472850746988, + "language_loss": 0.9148134, + "learning_rate": 0.0009864851561054893, + "loss": 0.9263016, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.34643555, + "step": 530, + "time_per_iteration": 2.7894959449768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147452, + "balance_loss_mlp": 1.1134541, + "epoch": 0.1021546748749519, + "flos": 517946061312.0, + "grad_norm": 0.08702044825545475, + "language_loss": 0.90471494, + "learning_rate": 0.0009864131171374191, + "loss": 0.91618943, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.34033203, + "step": 531, + "time_per_iteration": 2.6681158542633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144724, + "balance_loss_mlp": 1.11139297, + "epoch": 0.10234705656021546, + "flos": 609470286336.0, + "grad_norm": 0.0664826941787488, + "language_loss": 0.89538574, + "learning_rate": 0.0009863408893270292, + "loss": 0.90683293, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.33349609, + "step": 532, + "time_per_iteration": 2.7965428829193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129812, + "balance_loss_mlp": 1.09576535, + "epoch": 0.10253943824547904, + "flos": 601473710592.0, + "grad_norm": 0.08878024025613328, + "language_loss": 0.84706688, + "learning_rate": 0.0009862684727023605, + "loss": 0.858365, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.34082031, + "step": 533, + "time_per_iteration": 2.7238268852233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116987, + "balance_loss_mlp": 1.08453798, + "epoch": 0.1027318199307426, + "flos": 662647011840.0, + "grad_norm": 0.1682383439962665, + "language_loss": 0.87668955, + "learning_rate": 0.0009861958672915283, + "loss": 0.8878594, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.32446289, + "step": 534, + "time_per_iteration": 2.7945988178253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096267, + "balance_loss_mlp": 1.06415248, + "epoch": 0.10292420161600616, + "flos": 682962928128.0, + "grad_norm": 0.0654465541126679, + "language_loss": 0.88598454, + "learning_rate": 0.0009861230731227201, + "loss": 0.89694726, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.32104492, + "step": 535, + "time_per_iteration": 2.8504462242126465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094849, + "balance_loss_mlp": 1.06180418, + "epoch": 0.10311658330126972, + "flos": 490042414080.0, + "grad_norm": 0.09703481929017231, + "language_loss": 0.90092826, + "learning_rate": 0.0009860500902241973, + "loss": 0.91187674, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.33056641, + "step": 536, + "time_per_iteration": 2.6230618953704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093921, + "balance_loss_mlp": 1.06028032, + "epoch": 0.10330896498653329, + "flos": 431508446208.0, + "grad_norm": 0.07541190921269121, + "language_loss": 0.94890571, + "learning_rate": 0.0009859769186242942, + "loss": 0.95984495, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.33642578, + "step": 537, + "time_per_iteration": 2.5023155212402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090493, + "balance_loss_mlp": 1.05802083, + "epoch": 0.10350134667179685, + "flos": 549330052608.0, + "grad_norm": 0.08038513642950565, + "language_loss": 0.87629044, + "learning_rate": 0.0009859035583514187, + "loss": 0.88719535, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.32470703, + "step": 538, + "time_per_iteration": 2.617408514022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102021, + "balance_loss_mlp": 1.06885695, + "epoch": 0.10369372835706041, + "flos": 640327569408.0, + "grad_norm": 0.08463096218018039, + "language_loss": 0.88947332, + "learning_rate": 0.0009858300094340517, + "loss": 0.9004935, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.33178711, + "step": 539, + "time_per_iteration": 2.7788918018341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102321, + "balance_loss_mlp": 1.06989646, + "epoch": 0.10388611004232397, + "flos": 521500598784.0, + "grad_norm": 0.08363201697238119, + "language_loss": 0.84166092, + "learning_rate": 0.0009857562719007473, + "loss": 0.85268414, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.32421875, + "step": 540, + "time_per_iteration": 2.6021273136138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106344, + "balance_loss_mlp": 1.07349014, + "epoch": 0.10407849172758753, + "flos": 702111946752.0, + "grad_norm": 0.07699058030721453, + "language_loss": 0.86313522, + "learning_rate": 0.0009856823457801331, + "loss": 0.87419868, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.32861328, + "step": 541, + "time_per_iteration": 2.898247003555298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121037, + "balance_loss_mlp": 1.0881114, + "epoch": 0.1042708734128511, + "flos": 502652736000.0, + "grad_norm": 0.09427475874312204, + "language_loss": 0.92884254, + "learning_rate": 0.00098560823110091, + "loss": 0.94005299, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.3293457, + "step": 542, + "time_per_iteration": 2.628246784210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117484, + "balance_loss_mlp": 1.08441556, + "epoch": 0.10446325509811466, + "flos": 485331153408.0, + "grad_norm": 0.09038961872332987, + "language_loss": 0.93836176, + "learning_rate": 0.000985533927891851, + "loss": 0.94953668, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.33081055, + "step": 543, + "time_per_iteration": 2.6802377700805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104521, + "balance_loss_mlp": 1.07114232, + "epoch": 0.10465563678337822, + "flos": 568365590016.0, + "grad_norm": 0.07979198382497373, + "language_loss": 0.91847962, + "learning_rate": 0.0009854594361818044, + "loss": 0.9295249, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.33398438, + "step": 544, + "time_per_iteration": 2.6934244632720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097218, + "balance_loss_mlp": 1.06372046, + "epoch": 0.10484801846864178, + "flos": 625806853632.0, + "grad_norm": 0.070981397623147, + "language_loss": 0.91175914, + "learning_rate": 0.0009853847559996897, + "loss": 0.92273128, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.33520508, + "step": 545, + "time_per_iteration": 2.7615010738372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121637, + "balance_loss_mlp": 1.08713746, + "epoch": 0.10504040015390535, + "flos": 743063874048.0, + "grad_norm": 0.07225830349373973, + "language_loss": 0.90024251, + "learning_rate": 0.0009853098873745, + "loss": 0.91145885, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.34545898, + "step": 546, + "time_per_iteration": 2.995853900909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128427, + "balance_loss_mlp": 1.09407067, + "epoch": 0.10523278183916891, + "flos": 586382616576.0, + "grad_norm": 0.08430865527250554, + "language_loss": 0.89361405, + "learning_rate": 0.0009852348303353027, + "loss": 0.90489835, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.34399414, + "step": 547, + "time_per_iteration": 2.7888100147247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141582, + "balance_loss_mlp": 1.106511, + "epoch": 0.10542516352443247, + "flos": 869270552064.0, + "grad_norm": 0.07123259169118071, + "language_loss": 0.82929194, + "learning_rate": 0.000985159584911237, + "loss": 0.84070778, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.35107422, + "step": 548, + "time_per_iteration": 3.11181902885437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141867, + "balance_loss_mlp": 1.10658062, + "epoch": 0.10561754520969603, + "flos": 505182970368.0, + "grad_norm": 0.1040806422735416, + "language_loss": 0.89825702, + "learning_rate": 0.0009850841511315162, + "loss": 0.90967572, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.35327148, + "step": 549, + "time_per_iteration": 2.638000726699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130362, + "balance_loss_mlp": 1.09493339, + "epoch": 0.1058099268949596, + "flos": 559690947072.0, + "grad_norm": 0.07056487851665215, + "language_loss": 0.9078036, + "learning_rate": 0.0009850085290254256, + "loss": 0.9191072, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.35424805, + "step": 550, + "time_per_iteration": 2.774028778076172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117666, + "balance_loss_mlp": 1.08273757, + "epoch": 0.10600230858022316, + "flos": 561773048832.0, + "grad_norm": 0.06745406591759516, + "language_loss": 0.87385082, + "learning_rate": 0.0009849327186223246, + "loss": 0.88502753, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.34936523, + "step": 551, + "time_per_iteration": 2.7669272422790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110145, + "balance_loss_mlp": 1.06845236, + "epoch": 0.10619469026548672, + "flos": 494079989760.0, + "grad_norm": 0.0691737715515626, + "language_loss": 0.94504517, + "learning_rate": 0.000984856719951646, + "loss": 0.95605963, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.33007812, + "step": 552, + "time_per_iteration": 2.5428550243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111854, + "balance_loss_mlp": 1.07747412, + "epoch": 0.10638707195075028, + "flos": 675843678720.0, + "grad_norm": 0.09712099675981889, + "language_loss": 0.91101605, + "learning_rate": 0.0009847805330428943, + "loss": 0.92213452, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.34399414, + "step": 553, + "time_per_iteration": 2.9055614471435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122894, + "balance_loss_mlp": 1.08846664, + "epoch": 0.10657945363601386, + "flos": 487811925504.0, + "grad_norm": 0.09294887941398464, + "language_loss": 0.92195344, + "learning_rate": 0.0009847041579256481, + "loss": 0.93318236, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.34448242, + "step": 554, + "time_per_iteration": 2.5995588302612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124167, + "balance_loss_mlp": 1.08859539, + "epoch": 0.10677183532127742, + "flos": 482706376704.0, + "grad_norm": 0.08058010800108027, + "language_loss": 0.94049567, + "learning_rate": 0.0009846275946295592, + "loss": 0.9517374, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.35595703, + "step": 555, + "time_per_iteration": 2.6564345359802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114817, + "balance_loss_mlp": 1.07919669, + "epoch": 0.10696421700654098, + "flos": 655917668352.0, + "grad_norm": 0.06398894491712905, + "language_loss": 0.86843902, + "learning_rate": 0.0009845508431843518, + "loss": 0.87958717, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.35620117, + "step": 556, + "time_per_iteration": 3.0014877319335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112252, + "balance_loss_mlp": 1.07675159, + "epoch": 0.10715659869180454, + "flos": 567483881472.0, + "grad_norm": 0.06905237280169106, + "language_loss": 0.87712479, + "learning_rate": 0.0009844739036198233, + "loss": 0.88824731, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.35522461, + "step": 557, + "time_per_iteration": 2.6663765907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126883, + "balance_loss_mlp": 1.09026217, + "epoch": 0.10734898037706811, + "flos": 540432829440.0, + "grad_norm": 0.08117667522677224, + "language_loss": 0.94649851, + "learning_rate": 0.0009843967759658448, + "loss": 0.95776731, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.36621094, + "step": 558, + "time_per_iteration": 2.6776351928710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01325803, + "balance_loss_mlp": 1.29795551, + "epoch": 0.10754136206233167, + "flos": 1475870008320.0, + "grad_norm": 0.07702272040631068, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74093556, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.27929688, + "step": 559, + "time_per_iteration": 4.862372398376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112585, + "balance_loss_mlp": 1.08906162, + "epoch": 0.10773374374759523, + "flos": 512155243008.0, + "grad_norm": 0.07411063690195181, + "language_loss": 0.94592023, + "learning_rate": 0.000984241956509384, + "loss": 0.95717871, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.36767578, + "step": 560, + "time_per_iteration": 2.6602537631988525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152944, + "balance_loss_mlp": 1.11455846, + "epoch": 0.10792612543285879, + "flos": 496261016064.0, + "grad_norm": 0.08630165838839422, + "language_loss": 0.89956963, + "learning_rate": 0.0009841642647670078, + "loss": 0.91109908, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.38378906, + "step": 561, + "time_per_iteration": 2.5539767742156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153249, + "balance_loss_mlp": 1.11433935, + "epoch": 0.10811850711812235, + "flos": 735131317248.0, + "grad_norm": 0.09499730641116207, + "language_loss": 0.84606594, + "learning_rate": 0.0009840863850553944, + "loss": 0.85759842, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.38867188, + "step": 562, + "time_per_iteration": 2.972862720489502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139333, + "balance_loss_mlp": 1.10261655, + "epoch": 0.10831088880338592, + "flos": 611257024512.0, + "grad_norm": 0.08740431235801023, + "language_loss": 0.90812922, + "learning_rate": 0.0009840083174047782, + "loss": 0.91952258, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.3671875, + "step": 563, + "time_per_iteration": 2.728081464767456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133161, + "balance_loss_mlp": 1.09739876, + "epoch": 0.10850327048864948, + "flos": 556022928384.0, + "grad_norm": 0.09202985623691126, + "language_loss": 0.85552108, + "learning_rate": 0.0009839300618454685, + "loss": 0.8668527, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.35791016, + "step": 564, + "time_per_iteration": 2.833817958831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130452, + "balance_loss_mlp": 1.09538078, + "epoch": 0.10869565217391304, + "flos": 602902476288.0, + "grad_norm": 0.06834466327041812, + "language_loss": 0.90596354, + "learning_rate": 0.0009838516184078466, + "loss": 0.91726804, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.35131836, + "step": 565, + "time_per_iteration": 2.8160781860351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154605, + "balance_loss_mlp": 1.1185081, + "epoch": 0.1088880338591766, + "flos": 525922288128.0, + "grad_norm": 0.07188227567019471, + "language_loss": 0.87634718, + "learning_rate": 0.0009837729871223669, + "loss": 0.88789332, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.36083984, + "step": 566, + "time_per_iteration": 2.62117600440979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177195, + "balance_loss_mlp": 1.1406219, + "epoch": 0.10908041554444017, + "flos": 619986921984.0, + "grad_norm": 0.08533641778088655, + "language_loss": 0.88115579, + "learning_rate": 0.0009836941680195568, + "loss": 0.89292771, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.36547852, + "step": 567, + "time_per_iteration": 2.828911542892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165998, + "balance_loss_mlp": 1.12994933, + "epoch": 0.10927279722970373, + "flos": 897740195328.0, + "grad_norm": 0.08003102464580239, + "language_loss": 0.83622086, + "learning_rate": 0.0009836151611300166, + "loss": 0.84788084, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.3605957, + "step": 568, + "time_per_iteration": 3.2273471355438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114699, + "balance_loss_mlp": 1.11177564, + "epoch": 0.10946517891496729, + "flos": 528408852480.0, + "grad_norm": 0.13762061821089808, + "language_loss": 0.94344527, + "learning_rate": 0.0009835359664844194, + "loss": 0.95491517, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.35253906, + "step": 569, + "time_per_iteration": 2.61690616607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01424326, + "balance_loss_mlp": 1.39514339, + "epoch": 0.10965756060023085, + "flos": 1559944714752.0, + "grad_norm": 0.09677893451051751, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82461131, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.29101562, + "step": 570, + "time_per_iteration": 4.929012298583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129446, + "balance_loss_mlp": 1.09449339, + "epoch": 0.10984994228549443, + "flos": 512820163584.0, + "grad_norm": 0.10645850756285262, + "language_loss": 0.9142105, + "learning_rate": 0.0009833770140481118, + "loss": 0.92550498, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.34985352, + "step": 571, + "time_per_iteration": 2.6662757396698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122373, + "balance_loss_mlp": 1.08689654, + "epoch": 0.11004232397075799, + "flos": 954314307072.0, + "grad_norm": 0.12031633973381815, + "language_loss": 0.82440388, + "learning_rate": 0.000983297256319112, + "loss": 0.83562756, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.35522461, + "step": 572, + "time_per_iteration": 3.218076467514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134932, + "balance_loss_mlp": 1.09850204, + "epoch": 0.11023470565602154, + "flos": 487921024512.0, + "grad_norm": 0.08427819288291502, + "language_loss": 0.86899912, + "learning_rate": 0.000983217310957477, + "loss": 0.88034844, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.36425781, + "step": 573, + "time_per_iteration": 2.778571128845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144342, + "balance_loss_mlp": 1.10803151, + "epoch": 0.1104270873412851, + "flos": 655521970176.0, + "grad_norm": 0.06509507329480971, + "language_loss": 0.90168923, + "learning_rate": 0.000983137177994244, + "loss": 0.91313267, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.36352539, + "step": 574, + "time_per_iteration": 2.872412919998169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137496, + "balance_loss_mlp": 1.10221016, + "epoch": 0.11061946902654868, + "flos": 723097165824.0, + "grad_norm": 0.06653120926816534, + "language_loss": 0.85785711, + "learning_rate": 0.0009830568574605235, + "loss": 0.86923206, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.35302734, + "step": 575, + "time_per_iteration": 2.923383951187134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145913, + "balance_loss_mlp": 1.10984039, + "epoch": 0.11081185071181224, + "flos": 835113397248.0, + "grad_norm": 0.0865486301410286, + "language_loss": 0.87525302, + "learning_rate": 0.0009829763493874992, + "loss": 0.88671219, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.36083984, + "step": 576, + "time_per_iteration": 3.032942056655884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134136, + "balance_loss_mlp": 1.09753847, + "epoch": 0.1110042323970758, + "flos": 608776252416.0, + "grad_norm": 0.08630194081372794, + "language_loss": 0.93183506, + "learning_rate": 0.0009828956538064264, + "loss": 0.94317639, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.36621094, + "step": 577, + "time_per_iteration": 2.8152406215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125176, + "balance_loss_mlp": 1.0888648, + "epoch": 0.11119661408233936, + "flos": 595643604480.0, + "grad_norm": 0.07101537919866721, + "language_loss": 0.90824157, + "learning_rate": 0.0009828147707486344, + "loss": 0.91949332, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.36328125, + "step": 578, + "time_per_iteration": 2.724550485610962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118046, + "balance_loss_mlp": 1.08209252, + "epoch": 0.11138899576760293, + "flos": 555573385728.0, + "grad_norm": 0.08130034202286071, + "language_loss": 0.86348194, + "learning_rate": 0.0009827337002455245, + "loss": 0.8746624, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.35961914, + "step": 579, + "time_per_iteration": 2.652369976043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111199, + "balance_loss_mlp": 1.07579851, + "epoch": 0.11158137745286649, + "flos": 689418667008.0, + "grad_norm": 0.06366605788409145, + "language_loss": 0.88115346, + "learning_rate": 0.0009826524423285712, + "loss": 0.89227337, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.36181641, + "step": 580, + "time_per_iteration": 2.947925567626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108192, + "balance_loss_mlp": 1.07192874, + "epoch": 0.11177375913813005, + "flos": 762688728576.0, + "grad_norm": 0.08930617061108917, + "language_loss": 0.88938302, + "learning_rate": 0.0009825709970293218, + "loss": 0.90046495, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.36303711, + "step": 581, + "time_per_iteration": 2.8744056224823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103655, + "balance_loss_mlp": 1.06731987, + "epoch": 0.11196614082339361, + "flos": 806211588096.0, + "grad_norm": 0.07222891797599594, + "language_loss": 0.95056951, + "learning_rate": 0.0009824893643793956, + "loss": 0.96160614, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.36328125, + "step": 582, + "time_per_iteration": 3.051945209503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104834, + "balance_loss_mlp": 1.06811786, + "epoch": 0.11215852250865718, + "flos": 558350931456.0, + "grad_norm": 0.0803498647914251, + "language_loss": 0.88078201, + "learning_rate": 0.0009824075444104857, + "loss": 0.89183033, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.3671875, + "step": 583, + "time_per_iteration": 2.6833813190460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111764, + "balance_loss_mlp": 1.07507193, + "epoch": 0.11235090419392074, + "flos": 513322140672.0, + "grad_norm": 0.08148632832875594, + "language_loss": 0.93207705, + "learning_rate": 0.000982325537154357, + "loss": 0.94319463, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.36694336, + "step": 584, + "time_per_iteration": 2.582225799560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113364, + "balance_loss_mlp": 1.07574129, + "epoch": 0.1125432858791843, + "flos": 491209311744.0, + "grad_norm": 0.08313203670373176, + "language_loss": 0.93823397, + "learning_rate": 0.0009822433426428484, + "loss": 0.94936764, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.37597656, + "step": 585, + "time_per_iteration": 2.568070888519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113305, + "balance_loss_mlp": 1.07594514, + "epoch": 0.11273566756444786, + "flos": 510476193792.0, + "grad_norm": 0.07694998173228458, + "language_loss": 0.86627567, + "learning_rate": 0.0009821609609078697, + "loss": 0.87740874, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.37304688, + "step": 586, + "time_per_iteration": 2.658702850341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103349, + "balance_loss_mlp": 1.06775331, + "epoch": 0.11292804924971142, + "flos": 622149009408.0, + "grad_norm": 0.10421690738013599, + "language_loss": 0.89634144, + "learning_rate": 0.0009820783919814045, + "loss": 0.90737498, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.35620117, + "step": 587, + "time_per_iteration": 2.803866386413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109643, + "balance_loss_mlp": 1.07295036, + "epoch": 0.113120430934975, + "flos": 477811823616.0, + "grad_norm": 0.07979925286699333, + "language_loss": 0.82699567, + "learning_rate": 0.0009819956358955095, + "loss": 0.83809209, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.36669922, + "step": 588, + "time_per_iteration": 2.5929653644561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110362, + "balance_loss_mlp": 1.07433677, + "epoch": 0.11331281262023855, + "flos": 466801975296.0, + "grad_norm": 0.07216149622243874, + "language_loss": 0.83354205, + "learning_rate": 0.0009819126926823127, + "loss": 0.84464574, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.36035156, + "step": 589, + "time_per_iteration": 2.5229685306549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122798, + "balance_loss_mlp": 1.08658195, + "epoch": 0.11350519430550211, + "flos": 650164727808.0, + "grad_norm": 0.08255396626581768, + "language_loss": 0.86631322, + "learning_rate": 0.000981829562374016, + "loss": 0.87754118, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.36279297, + "step": 590, + "time_per_iteration": 2.7939858436584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124668, + "balance_loss_mlp": 1.08804727, + "epoch": 0.11369757599076567, + "flos": 557547798528.0, + "grad_norm": 0.07763031144810686, + "language_loss": 0.97565413, + "learning_rate": 0.0009817462450028933, + "loss": 0.98690081, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.3659668, + "step": 591, + "time_per_iteration": 2.651886224746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115476, + "balance_loss_mlp": 1.07918823, + "epoch": 0.11388995767602925, + "flos": 570774988800.0, + "grad_norm": 0.0679599519530346, + "language_loss": 0.85396111, + "learning_rate": 0.0009816627406012916, + "loss": 0.86511576, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.36303711, + "step": 592, + "time_per_iteration": 2.8203041553497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117009, + "balance_loss_mlp": 1.08079314, + "epoch": 0.1140823393612928, + "flos": 740069540352.0, + "grad_norm": 0.07941270182617734, + "language_loss": 0.84330916, + "learning_rate": 0.0009815790492016295, + "loss": 0.85447925, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.36254883, + "step": 593, + "time_per_iteration": 2.952115058898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111293, + "balance_loss_mlp": 1.07529223, + "epoch": 0.11427472104655637, + "flos": 698694211584.0, + "grad_norm": 0.08575724683449225, + "language_loss": 0.86948562, + "learning_rate": 0.0009814951708363993, + "loss": 0.88059855, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.35986328, + "step": 594, + "time_per_iteration": 2.851818084716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259601, + "balance_loss_mlp": 1.23633182, + "epoch": 0.11446710273181993, + "flos": 1476387952128.0, + "grad_norm": 0.04120161092279284, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79250586, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.23242188, + "step": 595, + "time_per_iteration": 4.775157928466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107308, + "balance_loss_mlp": 1.07159305, + "epoch": 0.1146594844170835, + "flos": 494641603584.0, + "grad_norm": 0.06441778711855077, + "language_loss": 0.87857854, + "learning_rate": 0.0009813268533395648, + "loss": 0.8896516, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.35717773, + "step": 596, + "time_per_iteration": 2.5812032222747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117343, + "balance_loss_mlp": 1.08096087, + "epoch": 0.11485186610234706, + "flos": 474596319744.0, + "grad_norm": 0.07680000680618568, + "language_loss": 0.87010378, + "learning_rate": 0.0009812424142733073, + "loss": 0.8812772, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.36401367, + "step": 597, + "time_per_iteration": 2.5546822547912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108076, + "balance_loss_mlp": 1.07212269, + "epoch": 0.11504424778761062, + "flos": 730858014720.0, + "grad_norm": 0.05681390422854521, + "language_loss": 0.8607024, + "learning_rate": 0.000981157788372175, + "loss": 0.87178314, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.35961914, + "step": 598, + "time_per_iteration": 3.0337140560150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111428, + "balance_loss_mlp": 1.07851696, + "epoch": 0.11523662947287418, + "flos": 545539788288.0, + "grad_norm": 0.06941688855783729, + "language_loss": 0.89018178, + "learning_rate": 0.0009810729756690223, + "loss": 0.90132457, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.35791016, + "step": 599, + "time_per_iteration": 2.7217423915863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105745, + "balance_loss_mlp": 1.06981504, + "epoch": 0.11542901115813775, + "flos": 774737436672.0, + "grad_norm": 0.06146114558588388, + "language_loss": 0.91738331, + "learning_rate": 0.0009809879761967766, + "loss": 0.92844075, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.35961914, + "step": 600, + "time_per_iteration": 2.9604732990264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111848, + "balance_loss_mlp": 1.08178735, + "epoch": 0.11562139284340131, + "flos": 730585972224.0, + "grad_norm": 0.09570347165582511, + "language_loss": 0.86368775, + "learning_rate": 0.0009809027899884378, + "loss": 0.87487245, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.36669922, + "step": 601, + "time_per_iteration": 2.9237759113311768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114897, + "balance_loss_mlp": 1.07787061, + "epoch": 0.11581377452866487, + "flos": 535589148672.0, + "grad_norm": 0.05752007897304988, + "language_loss": 0.88791043, + "learning_rate": 0.0009808174170770779, + "loss": 0.89905941, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.37036133, + "step": 602, + "time_per_iteration": 2.8171939849853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192093, + "balance_loss_mlp": 1.1680603, + "epoch": 0.11600615621392843, + "flos": 1554968613888.0, + "grad_norm": 0.017614530082332158, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86090338, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.24023438, + "step": 603, + "time_per_iteration": 4.935450077056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109032, + "balance_loss_mlp": 1.07360268, + "epoch": 0.116198537899192, + "flos": 537178037760.0, + "grad_norm": 0.08737735767926022, + "language_loss": 0.93595141, + "learning_rate": 0.0009806461112779462, + "loss": 0.94704169, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.35449219, + "step": 604, + "time_per_iteration": 2.644521951675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110331, + "balance_loss_mlp": 1.07454431, + "epoch": 0.11639091958445556, + "flos": 453970483200.0, + "grad_norm": 0.09922875403821595, + "language_loss": 0.8811909, + "learning_rate": 0.0009805601784566814, + "loss": 0.89229423, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.3581543, + "step": 605, + "time_per_iteration": 2.4736506938934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107193, + "balance_loss_mlp": 1.07209802, + "epoch": 0.11658330126971912, + "flos": 554815332864.0, + "grad_norm": 0.08013857685507157, + "language_loss": 0.95075512, + "learning_rate": 0.0009804740590654089, + "loss": 0.9618271, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.35131836, + "step": 606, + "time_per_iteration": 2.665424346923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121356, + "balance_loss_mlp": 1.08540201, + "epoch": 0.11677568295498268, + "flos": 716025968640.0, + "grad_norm": 0.09308217257663119, + "language_loss": 0.89792109, + "learning_rate": 0.0009803877531375635, + "loss": 0.90913463, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.359375, + "step": 607, + "time_per_iteration": 2.854362964630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123257, + "balance_loss_mlp": 1.08725595, + "epoch": 0.11696806464024626, + "flos": 609474668544.0, + "grad_norm": 0.12019278373574431, + "language_loss": 0.90837669, + "learning_rate": 0.0009803012607066523, + "loss": 0.91960925, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.36035156, + "step": 608, + "time_per_iteration": 2.7351131439208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132428, + "balance_loss_mlp": 1.0963558, + "epoch": 0.11716044632550981, + "flos": 520127087616.0, + "grad_norm": 0.06325710240785508, + "language_loss": 0.89651906, + "learning_rate": 0.0009802145818062543, + "loss": 0.90784335, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.36083984, + "step": 609, + "time_per_iteration": 2.706399440765381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126801, + "balance_loss_mlp": 1.09060943, + "epoch": 0.11735282801077337, + "flos": 507246133248.0, + "grad_norm": 0.08665503616765245, + "language_loss": 0.91646838, + "learning_rate": 0.0009801277164700212, + "loss": 0.9277364, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.36230469, + "step": 610, + "time_per_iteration": 2.591233730316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116515, + "balance_loss_mlp": 1.08137226, + "epoch": 0.11754520969603693, + "flos": 686339965440.0, + "grad_norm": 0.07536960859650275, + "language_loss": 0.8969053, + "learning_rate": 0.0009800406647316776, + "loss": 0.90807045, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.35180664, + "step": 611, + "time_per_iteration": 2.8590939044952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199931, + "balance_loss_mlp": 1.17360973, + "epoch": 0.1177375913813005, + "flos": 1541673022464.0, + "grad_norm": 0.02828241364524735, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.7811439, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.26367188, + "step": 612, + "time_per_iteration": 4.794836759567261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126093, + "balance_loss_mlp": 1.08999705, + "epoch": 0.11792997306656407, + "flos": 520269682176.0, + "grad_norm": 0.07086643363198573, + "language_loss": 0.88838685, + "learning_rate": 0.000979866002183916, + "loss": 0.89964771, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.36132812, + "step": 613, + "time_per_iteration": 2.6570141315460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113543, + "balance_loss_mlp": 1.07711244, + "epoch": 0.11812235475182763, + "flos": 665980379136.0, + "grad_norm": 0.0718552990374983, + "language_loss": 0.89756042, + "learning_rate": 0.0009797783914423082, + "loss": 0.90869588, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.36425781, + "step": 614, + "time_per_iteration": 2.8077588081359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104148, + "balance_loss_mlp": 1.06867135, + "epoch": 0.11831473643709119, + "flos": 621021399552.0, + "grad_norm": 0.06673690234795807, + "language_loss": 0.84267712, + "learning_rate": 0.0009796905944342094, + "loss": 0.85371858, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.35498047, + "step": 615, + "time_per_iteration": 2.848975896835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109381, + "balance_loss_mlp": 1.07271254, + "epoch": 0.11850711812235475, + "flos": 456438108672.0, + "grad_norm": 0.05638104592328917, + "language_loss": 0.88746947, + "learning_rate": 0.0009796026111937057, + "loss": 0.89856327, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.36645508, + "step": 616, + "time_per_iteration": 2.6446924209594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099952, + "balance_loss_mlp": 1.06347418, + "epoch": 0.11869949980761832, + "flos": 513598565376.0, + "grad_norm": 0.0626967176734064, + "language_loss": 0.88544255, + "learning_rate": 0.0009795144417549552, + "loss": 0.89644206, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.36474609, + "step": 617, + "time_per_iteration": 2.69419527053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103508, + "balance_loss_mlp": 1.0669111, + "epoch": 0.11889188149288188, + "flos": 534732171264.0, + "grad_norm": 0.05994069078035177, + "language_loss": 0.89591199, + "learning_rate": 0.0009794260861521883, + "loss": 0.90694714, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.36621094, + "step": 618, + "time_per_iteration": 2.771303653717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098344, + "balance_loss_mlp": 1.06262898, + "epoch": 0.11908426317814544, + "flos": 498344527872.0, + "grad_norm": 0.09079788596459537, + "language_loss": 0.86586368, + "learning_rate": 0.0009793375444197075, + "loss": 0.87684715, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.35742188, + "step": 619, + "time_per_iteration": 2.6239778995513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103508, + "balance_loss_mlp": 1.06724489, + "epoch": 0.119276644863409, + "flos": 659598833664.0, + "grad_norm": 0.07776663130635876, + "language_loss": 0.84681749, + "learning_rate": 0.000979248816591888, + "loss": 0.85785258, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.36254883, + "step": 620, + "time_per_iteration": 2.7932288646698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106745, + "balance_loss_mlp": 1.07043433, + "epoch": 0.11946902654867257, + "flos": 758396487168.0, + "grad_norm": 0.06665125523581683, + "language_loss": 0.85644066, + "learning_rate": 0.0009791599027031766, + "loss": 0.86750811, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.36303711, + "step": 621, + "time_per_iteration": 3.0138871669769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108028, + "balance_loss_mlp": 1.0721699, + "epoch": 0.11966140823393613, + "flos": 680697533952.0, + "grad_norm": 0.06722173914854768, + "language_loss": 0.85452718, + "learning_rate": 0.0009790708027880932, + "loss": 0.86560744, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.359375, + "step": 622, + "time_per_iteration": 2.8520967960357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217123, + "balance_loss_mlp": 1.192518, + "epoch": 0.11985378991919969, + "flos": 1450268070912.0, + "grad_norm": 0.04692620020290901, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78644413, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.24511719, + "step": 623, + "time_per_iteration": 4.820342302322388 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117536, + "balance_loss_mlp": 1.08251202, + "epoch": 0.12004617160446325, + "flos": 527586780672.0, + "grad_norm": 0.0795104629545964, + "language_loss": 0.93134129, + "learning_rate": 0.0009788920450172487, + "loss": 0.94251657, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.35058594, + "step": 624, + "time_per_iteration": 2.617030143737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112825, + "balance_loss_mlp": 1.09265435, + "epoch": 0.12023855328972682, + "flos": 473980861440.0, + "grad_norm": 0.07884849751459712, + "language_loss": 0.90174961, + "learning_rate": 0.0009788023872308875, + "loss": 0.91303217, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.35620117, + "step": 625, + "time_per_iteration": 2.5254392623901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218941, + "balance_loss_mlp": 1.19519401, + "epoch": 0.12043093497499038, + "flos": 1530954155520.0, + "grad_norm": 0.02704118444179952, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76647937, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.23730469, + "step": 626, + "time_per_iteration": 4.7286646366119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114575, + "balance_loss_mlp": 1.07936025, + "epoch": 0.12062331666025394, + "flos": 539571469824.0, + "grad_norm": 0.06954804859514781, + "language_loss": 0.9379338, + "learning_rate": 0.0009786225140303285, + "loss": 0.94907951, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.35253906, + "step": 627, + "time_per_iteration": 2.648557424545288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117384, + "balance_loss_mlp": 1.08155024, + "epoch": 0.1208156983455175, + "flos": 511634327040.0, + "grad_norm": 0.07877419782543724, + "language_loss": 0.91490531, + "learning_rate": 0.0009785322986859634, + "loss": 0.92607915, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.35864258, + "step": 628, + "time_per_iteration": 2.7282159328460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125787, + "balance_loss_mlp": 1.09014332, + "epoch": 0.12100808003078108, + "flos": 596195043840.0, + "grad_norm": 0.07794762914430453, + "language_loss": 0.92512405, + "learning_rate": 0.0009784418975588838, + "loss": 0.936382, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.35668945, + "step": 629, + "time_per_iteration": 2.709716320037842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117597, + "balance_loss_mlp": 1.08099949, + "epoch": 0.12120046171604464, + "flos": 522698019840.0, + "grad_norm": 0.06704717834334661, + "language_loss": 0.92910212, + "learning_rate": 0.0009783513106841862, + "loss": 0.94027811, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.3659668, + "step": 630, + "time_per_iteration": 2.7247745990753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268595, + "balance_loss_mlp": 1.24303675, + "epoch": 0.1213928434013082, + "flos": 1553605277184.0, + "grad_norm": 0.050831706918094084, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.78001297, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.25585938, + "step": 631, + "time_per_iteration": 4.973435163497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108872, + "balance_loss_mlp": 1.07263255, + "epoch": 0.12158522508657175, + "flos": 495143580672.0, + "grad_norm": 0.05936012058015608, + "language_loss": 0.87115383, + "learning_rate": 0.0009781695798326854, + "loss": 0.88224256, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.36303711, + "step": 632, + "time_per_iteration": 2.6066951751708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107124, + "balance_loss_mlp": 1.07109857, + "epoch": 0.12177760677183531, + "flos": 475335433728.0, + "grad_norm": 0.07579280109985519, + "language_loss": 0.87447512, + "learning_rate": 0.0009780784359264365, + "loss": 0.88554639, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.3605957, + "step": 633, + "time_per_iteration": 2.6627957820892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232879, + "balance_loss_mlp": 1.20541322, + "epoch": 0.12196998845709889, + "flos": 1467630351360.0, + "grad_norm": 0.035928730821781295, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75421578, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.27539062, + "step": 634, + "time_per_iteration": 4.774393796920776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097947, + "balance_loss_mlp": 1.06185055, + "epoch": 0.12216237014236245, + "flos": 586279309824.0, + "grad_norm": 0.06269897945868624, + "language_loss": 0.87202692, + "learning_rate": 0.000977895591329867, + "loss": 0.88300645, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.36108398, + "step": 635, + "time_per_iteration": 2.805889129638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108158, + "balance_loss_mlp": 1.0710839, + "epoch": 0.12235475182762601, + "flos": 597721324032.0, + "grad_norm": 0.0813284132777598, + "language_loss": 0.86332333, + "learning_rate": 0.000977803890710533, + "loss": 0.87440491, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.37060547, + "step": 636, + "time_per_iteration": 2.740208864212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104442, + "balance_loss_mlp": 1.06927526, + "epoch": 0.12254713351288957, + "flos": 497487550464.0, + "grad_norm": 0.05990721463683031, + "language_loss": 0.92840338, + "learning_rate": 0.0009777120045912774, + "loss": 0.93944776, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.35205078, + "step": 637, + "time_per_iteration": 2.599487543106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099731, + "balance_loss_mlp": 1.06246591, + "epoch": 0.12273951519815314, + "flos": 605565130752.0, + "grad_norm": 0.06926890859373311, + "language_loss": 0.89462954, + "learning_rate": 0.0009776199330077736, + "loss": 0.90562689, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.37231445, + "step": 638, + "time_per_iteration": 2.7127702236175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109285, + "balance_loss_mlp": 1.07263994, + "epoch": 0.1229318968834167, + "flos": 597578729472.0, + "grad_norm": 0.06829584029278382, + "language_loss": 0.91875821, + "learning_rate": 0.0009775276759957667, + "loss": 0.92985106, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.36645508, + "step": 639, + "time_per_iteration": 2.7092959880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109534, + "balance_loss_mlp": 1.07269859, + "epoch": 0.12312427856868026, + "flos": 678082931712.0, + "grad_norm": 0.08396579350539743, + "language_loss": 0.8972953, + "learning_rate": 0.0009774352335910745, + "loss": 0.90839064, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.36816406, + "step": 640, + "time_per_iteration": 2.810391664505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104708, + "balance_loss_mlp": 1.067729, + "epoch": 0.12331666025394382, + "flos": 608656978944.0, + "grad_norm": 0.07323302973942612, + "language_loss": 0.94222069, + "learning_rate": 0.000977342605829586, + "loss": 0.95326775, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.36962891, + "step": 641, + "time_per_iteration": 2.7107834815979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112179, + "balance_loss_mlp": 1.07624888, + "epoch": 0.12350904193920739, + "flos": 762172194816.0, + "grad_norm": 0.07665420533577341, + "language_loss": 0.85291827, + "learning_rate": 0.0009772497927472623, + "loss": 0.86404008, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.359375, + "step": 642, + "time_per_iteration": 3.0403058528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116924, + "balance_loss_mlp": 1.08006442, + "epoch": 0.12370142362447095, + "flos": 540699079680.0, + "grad_norm": 0.07222690714452404, + "language_loss": 0.84284675, + "learning_rate": 0.0009771567943801368, + "loss": 0.85401607, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.3684082, + "step": 643, + "time_per_iteration": 2.684351682662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112565, + "balance_loss_mlp": 1.07615817, + "epoch": 0.12389380530973451, + "flos": 547848852480.0, + "grad_norm": 0.07333206449495522, + "language_loss": 0.88927472, + "learning_rate": 0.0009770636107643152, + "loss": 0.9004004, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.36450195, + "step": 644, + "time_per_iteration": 2.697791337966919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124284, + "balance_loss_mlp": 1.0884738, + "epoch": 0.12408618699499807, + "flos": 540048715776.0, + "grad_norm": 0.07501614361753556, + "language_loss": 0.87213039, + "learning_rate": 0.0009769702419359738, + "loss": 0.88337326, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.35864258, + "step": 645, + "time_per_iteration": 2.614753246307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132185, + "balance_loss_mlp": 1.09604049, + "epoch": 0.12427856868026164, + "flos": 745451513856.0, + "grad_norm": 0.08258832766371556, + "language_loss": 0.88905025, + "learning_rate": 0.000976876687931362, + "loss": 0.90037215, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.36181641, + "step": 646, + "time_per_iteration": 2.9785215854644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125302, + "balance_loss_mlp": 1.08853781, + "epoch": 0.1244709503655252, + "flos": 533460556800.0, + "grad_norm": 0.0911173559535341, + "language_loss": 0.84276652, + "learning_rate": 0.0009767829487868005, + "loss": 0.85401952, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.36767578, + "step": 647, + "time_per_iteration": 2.578190326690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115883, + "balance_loss_mlp": 1.07911873, + "epoch": 0.12466333205078876, + "flos": 507847034880.0, + "grad_norm": 0.07020857762254842, + "language_loss": 0.88315135, + "learning_rate": 0.000976689024538682, + "loss": 0.89431018, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.36743164, + "step": 648, + "time_per_iteration": 2.6223652362823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111573, + "balance_loss_mlp": 1.07841754, + "epoch": 0.12485571373605232, + "flos": 681023420928.0, + "grad_norm": 0.08555408637061691, + "language_loss": 0.86419356, + "learning_rate": 0.0009765949152234716, + "loss": 0.87535083, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.37280273, + "step": 649, + "time_per_iteration": 2.882483959197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304556, + "balance_loss_mlp": 1.27480125, + "epoch": 0.1250480954213159, + "flos": 1329402668544.0, + "grad_norm": 0.07016402939707722, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79990637, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.296875, + "step": 650, + "time_per_iteration": 4.66938042640686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109485, + "balance_loss_mlp": 1.05882525, + "epoch": 0.12524047710657946, + "flos": 938140683264.0, + "grad_norm": 0.06927891842453628, + "language_loss": 0.81679136, + "learning_rate": 0.0009764061415379919, + "loss": 0.82773983, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.36035156, + "step": 651, + "time_per_iteration": 3.2698771953582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096413, + "balance_loss_mlp": 1.05874252, + "epoch": 0.12543285879184302, + "flos": 513642235392.0, + "grad_norm": 0.07412805631018828, + "language_loss": 0.88318801, + "learning_rate": 0.0009763114772410109, + "loss": 0.89415216, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.37646484, + "step": 652, + "time_per_iteration": 2.5928001403808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115398, + "balance_loss_mlp": 1.0775615, + "epoch": 0.12562524047710658, + "flos": 717991617024.0, + "grad_norm": 0.06901346528680578, + "language_loss": 0.85726613, + "learning_rate": 0.0009762166280235146, + "loss": 0.86842012, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.37817383, + "step": 653, + "time_per_iteration": 2.954763412475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135328, + "balance_loss_mlp": 1.0974437, + "epoch": 0.12581762216237014, + "flos": 563441923584.0, + "grad_norm": 0.10573688852470094, + "language_loss": 0.86465615, + "learning_rate": 0.0009761215939223267, + "loss": 0.87600946, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.37866211, + "step": 654, + "time_per_iteration": 2.715884208679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132858, + "balance_loss_mlp": 1.09599805, + "epoch": 0.1260100038476337, + "flos": 481642785792.0, + "grad_norm": 0.09937756240260763, + "language_loss": 0.85917866, + "learning_rate": 0.0009760263749743428, + "loss": 0.87050724, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.3684082, + "step": 655, + "time_per_iteration": 2.565927505493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115454, + "balance_loss_mlp": 1.07847536, + "epoch": 0.12620238553289725, + "flos": 575269461504.0, + "grad_norm": 0.07472608136964497, + "language_loss": 0.89487195, + "learning_rate": 0.0009759309712165299, + "loss": 0.90602648, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.36962891, + "step": 656, + "time_per_iteration": 2.721547842025757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095241, + "balance_loss_mlp": 1.06002665, + "epoch": 0.12639476721816084, + "flos": 530909973504.0, + "grad_norm": 0.06565081457641837, + "language_loss": 0.92494375, + "learning_rate": 0.0009758353826859272, + "loss": 0.9358961, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.3527832, + "step": 657, + "time_per_iteration": 2.6744871139526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095397, + "balance_loss_mlp": 1.05891895, + "epoch": 0.1265871489034244, + "flos": 689654393856.0, + "grad_norm": 0.09523432489761414, + "language_loss": 0.88095021, + "learning_rate": 0.0009757396094196456, + "loss": 0.89190418, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36499023, + "step": 658, + "time_per_iteration": 2.909353256225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103392, + "balance_loss_mlp": 1.06801057, + "epoch": 0.12677953058868796, + "flos": 536863735296.0, + "grad_norm": 0.06690202483268812, + "language_loss": 0.8320483, + "learning_rate": 0.0009756436514548673, + "loss": 0.84308219, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.35449219, + "step": 659, + "time_per_iteration": 2.865816831588745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096361, + "balance_loss_mlp": 1.06143236, + "epoch": 0.12697191227395152, + "flos": 518749194240.0, + "grad_norm": 0.06842887259152383, + "language_loss": 0.87790155, + "learning_rate": 0.0009755475088288466, + "loss": 0.88886517, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.34985352, + "step": 660, + "time_per_iteration": 2.727024793624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095382, + "balance_loss_mlp": 1.06145549, + "epoch": 0.12716429395921508, + "flos": 566341714944.0, + "grad_norm": 0.09688683984474739, + "language_loss": 0.89628965, + "learning_rate": 0.0009754511815789095, + "loss": 0.90724349, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.33959961, + "step": 661, + "time_per_iteration": 2.857279062271118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099651, + "balance_loss_mlp": 1.06441295, + "epoch": 0.12735667564447864, + "flos": 513844466688.0, + "grad_norm": 0.0675215866547423, + "language_loss": 0.85062414, + "learning_rate": 0.0009753546697424533, + "loss": 0.86162066, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.3527832, + "step": 662, + "time_per_iteration": 2.670924425125122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112335, + "balance_loss_mlp": 1.07750201, + "epoch": 0.1275490573297422, + "flos": 541023556608.0, + "grad_norm": 0.0877117205425541, + "language_loss": 0.89430654, + "learning_rate": 0.0009752579733569475, + "loss": 0.90542984, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.34887695, + "step": 663, + "time_per_iteration": 2.708876609802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270721, + "balance_loss_mlp": 1.24678338, + "epoch": 0.12774143901500576, + "flos": 1557872787456.0, + "grad_norm": 0.04579657173262409, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7615211, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.23925781, + "step": 664, + "time_per_iteration": 4.956411123275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112296, + "balance_loss_mlp": 1.07724893, + "epoch": 0.12793382070026935, + "flos": 613462781952.0, + "grad_norm": 0.07589772420679435, + "language_loss": 0.88920283, + "learning_rate": 0.0009750640270890217, + "loss": 0.90032578, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.35083008, + "step": 665, + "time_per_iteration": 2.7128844261169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111829, + "balance_loss_mlp": 1.08357668, + "epoch": 0.1281262023855329, + "flos": 707386231296.0, + "grad_norm": 0.09170618066625874, + "language_loss": 0.9529534, + "learning_rate": 0.0009749667772818983, + "loss": 0.9641363, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.34765625, + "step": 666, + "time_per_iteration": 3.001779794692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119074, + "balance_loss_mlp": 1.16718388, + "epoch": 0.12831858407079647, + "flos": 1424250086400.0, + "grad_norm": 0.026171542208985103, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78126681, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.23535156, + "step": 667, + "time_per_iteration": 4.816860914230347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097707, + "balance_loss_mlp": 1.06239688, + "epoch": 0.12851096575606002, + "flos": 448869316608.0, + "grad_norm": 0.08174433959814813, + "language_loss": 0.94348264, + "learning_rate": 0.0009747717245101093, + "loss": 0.95445979, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.35351562, + "step": 668, + "time_per_iteration": 2.5237252712249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092064, + "balance_loss_mlp": 1.05851901, + "epoch": 0.12870334744132358, + "flos": 479697486336.0, + "grad_norm": 0.09843416488997592, + "language_loss": 0.84683162, + "learning_rate": 0.00097467392162117, + "loss": 0.85775226, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.33544922, + "step": 669, + "time_per_iteration": 2.6030120849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103326, + "balance_loss_mlp": 1.06987596, + "epoch": 0.12889572912658714, + "flos": 638633963520.0, + "grad_norm": 0.06975318327908253, + "language_loss": 0.90683615, + "learning_rate": 0.0009745759344474708, + "loss": 0.91786939, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.3347168, + "step": 670, + "time_per_iteration": 2.81622576713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112132, + "balance_loss_mlp": 1.08779824, + "epoch": 0.1290881108118507, + "flos": 509693409792.0, + "grad_norm": 0.09191121702256037, + "language_loss": 0.88668084, + "learning_rate": 0.0009744777630270536, + "loss": 0.89789402, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.33544922, + "step": 671, + "time_per_iteration": 2.573746681213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131545, + "balance_loss_mlp": 1.09673548, + "epoch": 0.12928049249711426, + "flos": 670746894336.0, + "grad_norm": 0.0798229463492689, + "language_loss": 0.92632008, + "learning_rate": 0.000974379407398032, + "loss": 0.93763554, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.34863281, + "step": 672, + "time_per_iteration": 2.8804330825805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128596, + "balance_loss_mlp": 1.09471667, + "epoch": 0.12947287418237785, + "flos": 793158925824.0, + "grad_norm": 0.060594592327224854, + "language_loss": 0.81539643, + "learning_rate": 0.0009742808675985913, + "loss": 0.82668233, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.33911133, + "step": 673, + "time_per_iteration": 3.093003273010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144697, + "balance_loss_mlp": 1.11019778, + "epoch": 0.1296652558676414, + "flos": 485222054400.0, + "grad_norm": 0.09187527541403225, + "language_loss": 0.90132761, + "learning_rate": 0.0009741821436669876, + "loss": 0.91277468, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.34521484, + "step": 674, + "time_per_iteration": 2.585315227508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122846, + "balance_loss_mlp": 1.08925223, + "epoch": 0.12985763755290497, + "flos": 453226987008.0, + "grad_norm": 0.08498532425721701, + "language_loss": 0.91794449, + "learning_rate": 0.0009740832356415492, + "loss": 0.92917299, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.3359375, + "step": 675, + "time_per_iteration": 2.4971120357513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112081, + "balance_loss_mlp": 1.08714533, + "epoch": 0.13005001923816853, + "flos": 824719007232.0, + "grad_norm": 0.07677288344190451, + "language_loss": 0.87289226, + "learning_rate": 0.0009739841435606756, + "loss": 0.88410038, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.33691406, + "step": 676, + "time_per_iteration": 3.04789137840271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110492, + "balance_loss_mlp": 1.07670832, + "epoch": 0.1302424009234321, + "flos": 531107822592.0, + "grad_norm": 0.05631932912809994, + "language_loss": 0.89408028, + "learning_rate": 0.0009738848674628377, + "loss": 0.90518522, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.33789062, + "step": 677, + "time_per_iteration": 2.7033560276031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115712, + "balance_loss_mlp": 1.08161807, + "epoch": 0.13043478260869565, + "flos": 525626924544.0, + "grad_norm": 0.06061927769746001, + "language_loss": 0.88112855, + "learning_rate": 0.000973785407386578, + "loss": 0.8922857, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.34130859, + "step": 678, + "time_per_iteration": 2.7593955993652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111886, + "balance_loss_mlp": 1.07671893, + "epoch": 0.1306271642939592, + "flos": 625862108160.0, + "grad_norm": 0.0561156652888081, + "language_loss": 0.86748564, + "learning_rate": 0.0009736857633705103, + "loss": 0.87860453, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.35180664, + "step": 679, + "time_per_iteration": 2.859600067138672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104313, + "balance_loss_mlp": 1.07002795, + "epoch": 0.13081954597922277, + "flos": 550438723584.0, + "grad_norm": 0.058910355701146846, + "language_loss": 0.92178285, + "learning_rate": 0.0009735859354533196, + "loss": 0.93282604, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.34301758, + "step": 680, + "time_per_iteration": 2.7124130725860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097833, + "balance_loss_mlp": 1.06321418, + "epoch": 0.13101192766448633, + "flos": 536651329536.0, + "grad_norm": 0.0839399897160516, + "language_loss": 0.91048056, + "learning_rate": 0.0009734859236737628, + "loss": 0.92145896, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.34643555, + "step": 681, + "time_per_iteration": 2.627246856689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097126, + "balance_loss_mlp": 1.06102967, + "epoch": 0.13120430934974991, + "flos": 503258019840.0, + "grad_norm": 0.07457249787820815, + "language_loss": 0.92922121, + "learning_rate": 0.0009733857280706678, + "loss": 0.94019246, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.36108398, + "step": 682, + "time_per_iteration": 2.656088352203369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011015, + "balance_loss_mlp": 1.06669104, + "epoch": 0.13139669103501347, + "flos": 614014221312.0, + "grad_norm": 0.08799075641073119, + "language_loss": 0.83452725, + "learning_rate": 0.000973285348682934, + "loss": 0.84554225, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.34838867, + "step": 683, + "time_per_iteration": 2.714932441711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250838, + "balance_loss_mlp": 1.22547078, + "epoch": 0.13158907272027703, + "flos": 1484163357696.0, + "grad_norm": 0.05910904833943088, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.7914921, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.25390625, + "step": 684, + "time_per_iteration": 4.823149681091309 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102069, + "balance_loss_mlp": 1.06754637, + "epoch": 0.1317814544055406, + "flos": 985049344512.0, + "grad_norm": 0.06093749611395137, + "language_loss": 0.84928876, + "learning_rate": 0.0009730840387095046, + "loss": 0.86030942, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.34570312, + "step": 685, + "time_per_iteration": 3.2810635566711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112887, + "balance_loss_mlp": 1.07876921, + "epoch": 0.13197383609080415, + "flos": 611163892224.0, + "grad_norm": 0.0719979787644836, + "language_loss": 0.90753949, + "learning_rate": 0.0009729831082019642, + "loss": 0.91866839, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.34155273, + "step": 686, + "time_per_iteration": 2.8016421794891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121765, + "balance_loss_mlp": 1.08740878, + "epoch": 0.1321662177760677, + "flos": 494116305408.0, + "grad_norm": 0.06743381273529321, + "language_loss": 0.88199198, + "learning_rate": 0.0009728819940660958, + "loss": 0.89320958, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.34375, + "step": 687, + "time_per_iteration": 2.753110885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123642, + "balance_loss_mlp": 1.08966768, + "epoch": 0.13235859946133127, + "flos": 495591713280.0, + "grad_norm": 0.07411002639607889, + "language_loss": 0.84702134, + "learning_rate": 0.0009727806963411557, + "loss": 0.85825777, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.34008789, + "step": 688, + "time_per_iteration": 2.638277292251587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118088, + "balance_loss_mlp": 1.08342147, + "epoch": 0.13255098114659483, + "flos": 511417539072.0, + "grad_norm": 0.07589947069642403, + "language_loss": 0.86972356, + "learning_rate": 0.000972679215066471, + "loss": 0.88090444, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.34692383, + "step": 689, + "time_per_iteration": 2.6977994441986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102626, + "balance_loss_mlp": 1.06865191, + "epoch": 0.13274336283185842, + "flos": 547114120704.0, + "grad_norm": 0.07819243817703804, + "language_loss": 0.98617494, + "learning_rate": 0.0009725775502814401, + "loss": 0.99720132, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.33984375, + "step": 690, + "time_per_iteration": 2.648946523666382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094739, + "balance_loss_mlp": 1.05864239, + "epoch": 0.13293574451712198, + "flos": 640465781760.0, + "grad_norm": 0.059114915842817355, + "language_loss": 0.84878647, + "learning_rate": 0.0009724757020255327, + "loss": 0.85973388, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.36108398, + "step": 691, + "time_per_iteration": 2.8732690811157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082897, + "balance_loss_mlp": 1.04782593, + "epoch": 0.13312812620238554, + "flos": 491234042880.0, + "grad_norm": 0.07438205452368939, + "language_loss": 0.87005877, + "learning_rate": 0.0009723736703382902, + "loss": 0.88088775, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.35107422, + "step": 692, + "time_per_iteration": 2.554645299911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107941, + "balance_loss_mlp": 1.04352796, + "epoch": 0.1333205078876491, + "flos": 508693837824.0, + "grad_norm": 0.08618570028449021, + "language_loss": 0.82726276, + "learning_rate": 0.0009722714552593244, + "loss": 0.8380568, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.35888672, + "step": 693, + "time_per_iteration": 2.6300699710845947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108349, + "balance_loss_mlp": 1.04763222, + "epoch": 0.13351288957291266, + "flos": 418474722816.0, + "grad_norm": 0.09336455895373029, + "language_loss": 0.93701726, + "learning_rate": 0.000972169056828319, + "loss": 0.94785213, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.35864258, + "step": 694, + "time_per_iteration": 2.4744653701782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089355, + "balance_loss_mlp": 1.05309105, + "epoch": 0.13370527125817622, + "flos": 615614694912.0, + "grad_norm": 0.09775538219544704, + "language_loss": 0.87267971, + "learning_rate": 0.0009720664750850283, + "loss": 0.88357329, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.36279297, + "step": 695, + "time_per_iteration": 2.819199562072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087558, + "balance_loss_mlp": 1.05196249, + "epoch": 0.13389765294343978, + "flos": 625757391360.0, + "grad_norm": 0.08995446617022443, + "language_loss": 0.92670894, + "learning_rate": 0.0009719637100692784, + "loss": 0.93758452, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.35644531, + "step": 696, + "time_per_iteration": 2.710566997528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089346, + "balance_loss_mlp": 1.05460882, + "epoch": 0.13409003462870334, + "flos": 609391710720.0, + "grad_norm": 0.07471473065547057, + "language_loss": 0.82606006, + "learning_rate": 0.0009718607618209661, + "loss": 0.83695352, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.34765625, + "step": 697, + "time_per_iteration": 2.860895872116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100417, + "balance_loss_mlp": 1.06677604, + "epoch": 0.13428241631396692, + "flos": 683499810816.0, + "grad_norm": 0.06757273414028586, + "language_loss": 0.87573737, + "learning_rate": 0.0009717576303800595, + "loss": 0.88674152, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.33666992, + "step": 698, + "time_per_iteration": 3.044128894805908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105218, + "balance_loss_mlp": 1.07102871, + "epoch": 0.13447479799923048, + "flos": 508565799936.0, + "grad_norm": 0.06392403589518669, + "language_loss": 0.85563833, + "learning_rate": 0.0009716543157865975, + "loss": 0.86669052, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.34228516, + "step": 699, + "time_per_iteration": 2.6879220008850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124277, + "balance_loss_mlp": 1.08968258, + "epoch": 0.13466717968449404, + "flos": 897124737024.0, + "grad_norm": 0.10281325358067626, + "language_loss": 0.83577156, + "learning_rate": 0.0009715508180806907, + "loss": 0.84701437, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.34643555, + "step": 700, + "time_per_iteration": 3.1908302307128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132528, + "balance_loss_mlp": 1.09848189, + "epoch": 0.1348595613697576, + "flos": 989501557248.0, + "grad_norm": 0.07337445630948206, + "language_loss": 0.89328271, + "learning_rate": 0.0009714471373025202, + "loss": 0.90460801, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.34082031, + "step": 701, + "time_per_iteration": 3.438918113708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121396, + "balance_loss_mlp": 1.08704007, + "epoch": 0.13505194305502116, + "flos": 487580580864.0, + "grad_norm": 0.06971370423164719, + "language_loss": 0.88653499, + "learning_rate": 0.0009713432734923386, + "loss": 0.89774895, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.34399414, + "step": 702, + "time_per_iteration": 2.640204668045044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118965, + "balance_loss_mlp": 1.08372688, + "epoch": 0.13524432474028472, + "flos": 613103399424.0, + "grad_norm": 0.06937758634579687, + "language_loss": 0.8635335, + "learning_rate": 0.0009712392266904696, + "loss": 0.87472308, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.3527832, + "step": 703, + "time_per_iteration": 2.7081639766693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107456, + "balance_loss_mlp": 1.07381546, + "epoch": 0.13543670642554828, + "flos": 904425868800.0, + "grad_norm": 0.059624368341773884, + "language_loss": 0.8470363, + "learning_rate": 0.0009711349969373076, + "loss": 0.8581109, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.33666992, + "step": 704, + "time_per_iteration": 3.185788154602051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120367, + "balance_loss_mlp": 1.08629751, + "epoch": 0.13562908811081184, + "flos": 550335416832.0, + "grad_norm": 0.06837289886431508, + "language_loss": 0.80139232, + "learning_rate": 0.0009710305842733178, + "loss": 0.81259602, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.34106445, + "step": 705, + "time_per_iteration": 2.7622249126434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119894, + "balance_loss_mlp": 1.08534753, + "epoch": 0.1358214697960754, + "flos": 507797572608.0, + "grad_norm": 0.07938339172549091, + "language_loss": 0.89516854, + "learning_rate": 0.0009709259887390373, + "loss": 0.90636754, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.34570312, + "step": 706, + "time_per_iteration": 2.5919415950775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112775, + "balance_loss_mlp": 1.09141469, + "epoch": 0.136013851481339, + "flos": 528640197120.0, + "grad_norm": 0.10398540964391637, + "language_loss": 0.90775406, + "learning_rate": 0.0009708212103750737, + "loss": 0.9190315, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.36328125, + "step": 707, + "time_per_iteration": 2.601414680480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118489, + "balance_loss_mlp": 1.0827502, + "epoch": 0.13620623316660255, + "flos": 658772379648.0, + "grad_norm": 0.10289617102375577, + "language_loss": 0.87215245, + "learning_rate": 0.0009707162492221051, + "loss": 0.88333738, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.35766602, + "step": 708, + "time_per_iteration": 2.9150781631469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107244, + "balance_loss_mlp": 1.07193458, + "epoch": 0.1363986148518661, + "flos": 671583522816.0, + "grad_norm": 0.07053364895365258, + "language_loss": 0.88057113, + "learning_rate": 0.0009706111053208815, + "loss": 0.89164358, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.35375977, + "step": 709, + "time_per_iteration": 2.8282904624938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104491, + "balance_loss_mlp": 1.06801295, + "epoch": 0.13659099653712967, + "flos": 472828520448.0, + "grad_norm": 0.06130049777218646, + "language_loss": 0.85717642, + "learning_rate": 0.0009705057787122232, + "loss": 0.86822134, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.36499023, + "step": 710, + "time_per_iteration": 2.577875852584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115861, + "balance_loss_mlp": 1.07890666, + "epoch": 0.13678337822239323, + "flos": 452483490816.0, + "grad_norm": 0.06671527486676954, + "language_loss": 0.91032815, + "learning_rate": 0.0009704002694370216, + "loss": 0.92148674, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.36962891, + "step": 711, + "time_per_iteration": 2.5226385593414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113129, + "balance_loss_mlp": 1.09509826, + "epoch": 0.13697575990765679, + "flos": 519373416960.0, + "grad_norm": 0.06767720569390717, + "language_loss": 0.8601349, + "learning_rate": 0.0009702945775362388, + "loss": 0.8714478, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.36206055, + "step": 712, + "time_per_iteration": 2.6134419441223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129624, + "balance_loss_mlp": 1.09214449, + "epoch": 0.13716814159292035, + "flos": 480145618944.0, + "grad_norm": 0.06923332159298135, + "language_loss": 0.86543357, + "learning_rate": 0.0009701887030509086, + "loss": 0.87672985, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.37426758, + "step": 713, + "time_per_iteration": 2.6801493167877197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123971, + "balance_loss_mlp": 1.08735013, + "epoch": 0.1373605232781839, + "flos": 545376844800.0, + "grad_norm": 0.08447530320779993, + "language_loss": 0.90941691, + "learning_rate": 0.0009700826460221346, + "loss": 0.92065662, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.36645508, + "step": 714, + "time_per_iteration": 2.6499831676483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124692, + "balance_loss_mlp": 1.0878799, + "epoch": 0.1375529049634475, + "flos": 708473143296.0, + "grad_norm": 0.08158263793675288, + "language_loss": 0.92094153, + "learning_rate": 0.0009699764064910921, + "loss": 0.93218845, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.36816406, + "step": 715, + "time_per_iteration": 2.8663330078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100593, + "balance_loss_mlp": 1.0652591, + "epoch": 0.13774528664871105, + "flos": 486452971008.0, + "grad_norm": 0.0638700652453299, + "language_loss": 0.86489999, + "learning_rate": 0.0009698699844990268, + "loss": 0.87590599, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.35351562, + "step": 716, + "time_per_iteration": 2.680769443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097936, + "balance_loss_mlp": 1.06236374, + "epoch": 0.1379376683339746, + "flos": 679885636608.0, + "grad_norm": 0.06268585455781102, + "language_loss": 0.87917447, + "learning_rate": 0.0009697633800872555, + "loss": 0.89015377, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.35595703, + "step": 717, + "time_per_iteration": 2.965280532836914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095044, + "balance_loss_mlp": 1.05956769, + "epoch": 0.13813005001923817, + "flos": 610628419584.0, + "grad_norm": 0.06824665625382514, + "language_loss": 0.9079777, + "learning_rate": 0.0009696565932971655, + "loss": 0.91892809, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.35498047, + "step": 718, + "time_per_iteration": 2.896911144256592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089845, + "balance_loss_mlp": 1.05451119, + "epoch": 0.13832243170450173, + "flos": 588431222784.0, + "grad_norm": 0.09498294885790176, + "language_loss": 0.89284754, + "learning_rate": 0.0009695496241702153, + "loss": 0.90374601, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.35375977, + "step": 719, + "time_per_iteration": 2.7762036323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100053, + "balance_loss_mlp": 1.0647912, + "epoch": 0.1385148133897653, + "flos": 699674844672.0, + "grad_norm": 0.06645840883514359, + "language_loss": 0.85660797, + "learning_rate": 0.0009694424727479339, + "loss": 0.86760849, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.3527832, + "step": 720, + "time_per_iteration": 2.899481773376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105877, + "balance_loss_mlp": 1.06997156, + "epoch": 0.13870719507502885, + "flos": 597977399808.0, + "grad_norm": 0.0836580120862117, + "language_loss": 0.88687581, + "learning_rate": 0.0009693351390719213, + "loss": 0.89793456, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.35913086, + "step": 721, + "time_per_iteration": 2.7242469787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116308, + "balance_loss_mlp": 1.08071184, + "epoch": 0.1388995767602924, + "flos": 586279309824.0, + "grad_norm": 0.0677561083547336, + "language_loss": 0.90886325, + "learning_rate": 0.000969227623183848, + "loss": 0.9200263, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.35595703, + "step": 722, + "time_per_iteration": 2.819762706756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122367, + "balance_loss_mlp": 1.08719993, + "epoch": 0.139091958445556, + "flos": 650810709504.0, + "grad_norm": 0.06096675577850975, + "language_loss": 0.9079504, + "learning_rate": 0.0009691199251254554, + "loss": 0.91917408, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.35180664, + "step": 723, + "time_per_iteration": 2.9057154655456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111876, + "balance_loss_mlp": 1.08368921, + "epoch": 0.13928434013081956, + "flos": 575446961664.0, + "grad_norm": 0.07869545166834224, + "language_loss": 0.86502081, + "learning_rate": 0.0009690120449385555, + "loss": 0.87620842, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.35107422, + "step": 724, + "time_per_iteration": 2.753779411315918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117404, + "balance_loss_mlp": 1.08164096, + "epoch": 0.13947672181608312, + "flos": 562954503168.0, + "grad_norm": 0.05745765153927115, + "language_loss": 0.92949581, + "learning_rate": 0.0009689039826650312, + "loss": 0.94066983, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.35791016, + "step": 725, + "time_per_iteration": 2.7707176208496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01358579, + "balance_loss_mlp": 1.33788455, + "epoch": 0.13966910350134668, + "flos": 1520699387904.0, + "grad_norm": 0.08980106345901108, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77881646, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.20703125, + "step": 726, + "time_per_iteration": 4.990100383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122131, + "balance_loss_mlp": 1.08632064, + "epoch": 0.13986148518661023, + "flos": 499604557824.0, + "grad_norm": 0.08882129772973828, + "language_loss": 0.8687858, + "learning_rate": 0.0009686873120259941, + "loss": 0.88000709, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.35839844, + "step": 727, + "time_per_iteration": 2.598994255065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124101, + "balance_loss_mlp": 1.08914924, + "epoch": 0.1400538668718738, + "flos": 598381862400.0, + "grad_norm": 0.060515823337661194, + "language_loss": 0.86860693, + "learning_rate": 0.0009685787037446004, + "loss": 0.879848, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.34985352, + "step": 728, + "time_per_iteration": 2.818753957748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117164, + "balance_loss_mlp": 1.08252215, + "epoch": 0.14024624855713735, + "flos": 593757941760.0, + "grad_norm": 0.07103959200550099, + "language_loss": 0.86954272, + "learning_rate": 0.0009684699135448201, + "loss": 0.88071442, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.34667969, + "step": 729, + "time_per_iteration": 2.7140605449676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117139, + "balance_loss_mlp": 1.08190084, + "epoch": 0.1404386302424009, + "flos": 506335311360.0, + "grad_norm": 0.05207553557344927, + "language_loss": 0.91554511, + "learning_rate": 0.0009683609414688895, + "loss": 0.92671645, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.3527832, + "step": 730, + "time_per_iteration": 2.700392961502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115792, + "balance_loss_mlp": 1.08076811, + "epoch": 0.14063101192766447, + "flos": 573132105216.0, + "grad_norm": 0.0649489891311747, + "language_loss": 0.85963869, + "learning_rate": 0.0009682517875591154, + "loss": 0.87079668, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.35058594, + "step": 731, + "time_per_iteration": 2.7288033962249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108316, + "balance_loss_mlp": 1.07329249, + "epoch": 0.14082339361292806, + "flos": 564333806592.0, + "grad_norm": 0.08055333626892905, + "language_loss": 0.8568505, + "learning_rate": 0.0009681424518578749, + "loss": 0.86793363, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.35058594, + "step": 732, + "time_per_iteration": 2.7607100009918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098228, + "balance_loss_mlp": 1.06337106, + "epoch": 0.14101577529819162, + "flos": 463336187904.0, + "grad_norm": 0.057006483972196494, + "language_loss": 0.87377727, + "learning_rate": 0.000968032934407616, + "loss": 0.8847596, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.34912109, + "step": 733, + "time_per_iteration": 2.5924746990203857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109135, + "balance_loss_mlp": 1.05708933, + "epoch": 0.14120815698345518, + "flos": 595791991296.0, + "grad_norm": 0.06839942690263572, + "language_loss": 0.81019294, + "learning_rate": 0.0009679232352508571, + "loss": 0.82110655, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.34301758, + "step": 734, + "time_per_iteration": 2.7993721961975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100792, + "balance_loss_mlp": 1.06455231, + "epoch": 0.14140053866871874, + "flos": 534864591360.0, + "grad_norm": 0.05863508932167985, + "language_loss": 0.80278933, + "learning_rate": 0.0009678133544301871, + "loss": 0.8137973, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.36254883, + "step": 735, + "time_per_iteration": 2.673874855041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094272, + "balance_loss_mlp": 1.05881953, + "epoch": 0.1415929203539823, + "flos": 520013606400.0, + "grad_norm": 0.05551108490857041, + "language_loss": 0.91367602, + "learning_rate": 0.0009677032919882658, + "loss": 0.92461878, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.35473633, + "step": 736, + "time_per_iteration": 2.654606580734253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096366, + "balance_loss_mlp": 1.06012654, + "epoch": 0.14178530203924586, + "flos": 482095300608.0, + "grad_norm": 0.07346959128329188, + "language_loss": 0.91181809, + "learning_rate": 0.000967593047967823, + "loss": 0.92278177, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.36230469, + "step": 737, + "time_per_iteration": 2.559713125228882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096247, + "balance_loss_mlp": 1.06096137, + "epoch": 0.14197768372450942, + "flos": 676339863552.0, + "grad_norm": 0.08415375039396082, + "language_loss": 0.86267197, + "learning_rate": 0.0009674826224116593, + "loss": 0.87363446, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.35302734, + "step": 738, + "time_per_iteration": 2.809206485748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097639, + "balance_loss_mlp": 1.06197131, + "epoch": 0.14217006540977298, + "flos": 445802199552.0, + "grad_norm": 0.07057178035488912, + "language_loss": 0.86339009, + "learning_rate": 0.0009673720153626455, + "loss": 0.87436646, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.35668945, + "step": 739, + "time_per_iteration": 2.612968683242798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104371, + "balance_loss_mlp": 1.06848931, + "epoch": 0.14236244709503657, + "flos": 496261016064.0, + "grad_norm": 0.07271668848978735, + "language_loss": 0.87052834, + "learning_rate": 0.0009672612268637235, + "loss": 0.88157207, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.35913086, + "step": 740, + "time_per_iteration": 2.61069393157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110664, + "balance_loss_mlp": 1.0753777, + "epoch": 0.14255482878030012, + "flos": 648022989312.0, + "grad_norm": 0.0891355718419961, + "language_loss": 0.84501529, + "learning_rate": 0.0009671502569579048, + "loss": 0.85612196, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.35302734, + "step": 741, + "time_per_iteration": 2.735647201538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110487, + "balance_loss_mlp": 1.07122874, + "epoch": 0.14274721046556368, + "flos": 535888894464.0, + "grad_norm": 0.08695556970227908, + "language_loss": 0.89623845, + "learning_rate": 0.0009670391056882719, + "loss": 0.90728712, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.33666992, + "step": 742, + "time_per_iteration": 2.7107605934143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112128, + "balance_loss_mlp": 1.07879674, + "epoch": 0.14293959215082724, + "flos": 956677215744.0, + "grad_norm": 0.07027307452403737, + "language_loss": 0.88442421, + "learning_rate": 0.0009669277730979776, + "loss": 0.89554548, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.33349609, + "step": 743, + "time_per_iteration": 3.188511371612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106141, + "balance_loss_mlp": 1.07295275, + "epoch": 0.1431319738360908, + "flos": 692766590976.0, + "grad_norm": 0.060274127994165407, + "language_loss": 0.85487998, + "learning_rate": 0.0009668162592302449, + "loss": 0.86594141, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.33203125, + "step": 744, + "time_per_iteration": 2.912363290786743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111089, + "balance_loss_mlp": 1.07715416, + "epoch": 0.14332435552135436, + "flos": 565174817280.0, + "grad_norm": 0.05989361998422495, + "language_loss": 0.86368543, + "learning_rate": 0.0009667045641283676, + "loss": 0.8747943, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.33764648, + "step": 745, + "time_per_iteration": 2.705873489379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105291, + "balance_loss_mlp": 1.07246089, + "epoch": 0.14351673720661792, + "flos": 738045665280.0, + "grad_norm": 0.07442691981713179, + "language_loss": 0.94493437, + "learning_rate": 0.0009665926878357092, + "loss": 0.95598727, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.32836914, + "step": 746, + "time_per_iteration": 2.941594362258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112013, + "balance_loss_mlp": 1.07865858, + "epoch": 0.14370911889188148, + "flos": 548951731200.0, + "grad_norm": 0.0692560914525881, + "language_loss": 0.91247988, + "learning_rate": 0.0009664806303957043, + "loss": 0.92359996, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.33374023, + "step": 747, + "time_per_iteration": 2.70877742767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112762, + "balance_loss_mlp": 1.0790261, + "epoch": 0.14390150057714507, + "flos": 589973469696.0, + "grad_norm": 0.06347995643195156, + "language_loss": 0.87284487, + "learning_rate": 0.0009663683918518571, + "loss": 0.88397241, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.33764648, + "step": 748, + "time_per_iteration": 2.9173765182495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128804, + "balance_loss_mlp": 1.09583056, + "epoch": 0.14409388226240863, + "flos": 590773782528.0, + "grad_norm": 0.07165520049303264, + "language_loss": 0.85690349, + "learning_rate": 0.0009662559722477428, + "loss": 0.8681916, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.32983398, + "step": 749, + "time_per_iteration": 2.6703925132751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01293618, + "balance_loss_mlp": 1.26653337, + "epoch": 0.1442862639476722, + "flos": 1510418479104.0, + "grad_norm": 0.05750783583060037, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77456594, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.27148438, + "step": 750, + "time_per_iteration": 5.001406192779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114826, + "balance_loss_mlp": 1.11492896, + "epoch": 0.14447864563293575, + "flos": 496493770752.0, + "grad_norm": 0.0903406164143912, + "language_loss": 0.88906193, + "learning_rate": 0.0009660305900333632, + "loss": 0.90054452, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.33349609, + "step": 751, + "time_per_iteration": 2.6897666454315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151429, + "balance_loss_mlp": 1.11859906, + "epoch": 0.1446710273181993, + "flos": 589400271360.0, + "grad_norm": 0.07731756572669998, + "language_loss": 0.82109559, + "learning_rate": 0.0009659176275105992, + "loss": 0.83260989, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.32836914, + "step": 752, + "time_per_iteration": 2.7144923210144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156541, + "balance_loss_mlp": 1.12294829, + "epoch": 0.14486340900346287, + "flos": 585521256960.0, + "grad_norm": 0.08104938710710845, + "language_loss": 0.8584373, + "learning_rate": 0.0009658044841025701, + "loss": 0.87000269, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.33618164, + "step": 753, + "time_per_iteration": 2.7651891708374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134874, + "balance_loss_mlp": 1.10116172, + "epoch": 0.14505579068872643, + "flos": 504405978624.0, + "grad_norm": 0.06446620792536047, + "language_loss": 0.80912805, + "learning_rate": 0.0009656911598532021, + "loss": 0.82047671, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.33740234, + "step": 754, + "time_per_iteration": 2.6575491428375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113793, + "balance_loss_mlp": 1.10345459, + "epoch": 0.14524817237399, + "flos": 486566452224.0, + "grad_norm": 0.0617560649750725, + "language_loss": 0.89835, + "learning_rate": 0.0009655776548064917, + "loss": 0.90972924, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.3449707, + "step": 755, + "time_per_iteration": 2.684224843978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133332, + "balance_loss_mlp": 1.100263, + "epoch": 0.14544055405925355, + "flos": 727857888768.0, + "grad_norm": 0.0723196770544797, + "language_loss": 0.88265425, + "learning_rate": 0.0009654639690065054, + "loss": 0.89398754, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.33081055, + "step": 756, + "time_per_iteration": 2.8975589275360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133271, + "balance_loss_mlp": 1.10063124, + "epoch": 0.14563293574451713, + "flos": 593359271424.0, + "grad_norm": 0.0666179485403068, + "language_loss": 0.87639153, + "learning_rate": 0.00096535010249738, + "loss": 0.88772416, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.32641602, + "step": 757, + "time_per_iteration": 2.7852935791015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118669, + "balance_loss_mlp": 1.08555305, + "epoch": 0.1458253174297807, + "flos": 560192924160.0, + "grad_norm": 0.06671579144124269, + "language_loss": 0.82458985, + "learning_rate": 0.0009652360553233224, + "loss": 0.83577645, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.33129883, + "step": 758, + "time_per_iteration": 2.790372610092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231318, + "balance_loss_mlp": 1.20690441, + "epoch": 0.14601769911504425, + "flos": 1557025984512.0, + "grad_norm": 0.06334391267713868, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.75005066, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.24414062, + "step": 759, + "time_per_iteration": 4.9441094398498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113265, + "balance_loss_mlp": 1.08062565, + "epoch": 0.1462100808003078, + "flos": 865922628096.0, + "grad_norm": 0.06716213865762054, + "language_loss": 0.81441242, + "learning_rate": 0.0009650074191575883, + "loss": 0.82554507, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.32641602, + "step": 760, + "time_per_iteration": 3.2887775897979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109667, + "balance_loss_mlp": 1.07664585, + "epoch": 0.14640246248557137, + "flos": 522673288704.0, + "grad_norm": 0.06510043774355635, + "language_loss": 0.85560381, + "learning_rate": 0.0009648928302546766, + "loss": 0.86670047, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.33032227, + "step": 761, + "time_per_iteration": 2.6996572017669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095513, + "balance_loss_mlp": 1.06308818, + "epoch": 0.14659484417083493, + "flos": 1030121805312.0, + "grad_norm": 0.06592560206527708, + "language_loss": 0.85148716, + "learning_rate": 0.0009647780608643613, + "loss": 0.86244226, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.32421875, + "step": 762, + "time_per_iteration": 3.3860111236572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101334, + "balance_loss_mlp": 1.06843269, + "epoch": 0.1467872258560985, + "flos": 500426629632.0, + "grad_norm": 0.08422515931666542, + "language_loss": 0.87252343, + "learning_rate": 0.0009646631110312001, + "loss": 0.88353688, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.32910156, + "step": 763, + "time_per_iteration": 2.62996506690979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097579, + "balance_loss_mlp": 1.06455803, + "epoch": 0.14697960754136205, + "flos": 547514201088.0, + "grad_norm": 0.05843071383105212, + "language_loss": 0.88439989, + "learning_rate": 0.0009645479807998203, + "loss": 0.89537567, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.33032227, + "step": 764, + "time_per_iteration": 2.7762649059295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091998, + "balance_loss_mlp": 1.059955, + "epoch": 0.14717198922662564, + "flos": 517586678784.0, + "grad_norm": 0.06085607876830046, + "language_loss": 0.92027354, + "learning_rate": 0.0009644326702149196, + "loss": 0.93119353, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.3203125, + "step": 765, + "time_per_iteration": 2.7927489280700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093686, + "balance_loss_mlp": 1.0607841, + "epoch": 0.1473643709118892, + "flos": 731661147648.0, + "grad_norm": 0.07854715386493856, + "language_loss": 0.84577298, + "learning_rate": 0.0009643171793212653, + "loss": 0.85670984, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.32910156, + "step": 766, + "time_per_iteration": 3.1133480072021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092266, + "balance_loss_mlp": 1.05976951, + "epoch": 0.14755675259715276, + "flos": 620257554432.0, + "grad_norm": 0.102413583922894, + "language_loss": 0.89411926, + "learning_rate": 0.0009642015081636952, + "loss": 0.90504193, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.32495117, + "step": 767, + "time_per_iteration": 2.715090274810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098654, + "balance_loss_mlp": 1.06658697, + "epoch": 0.14774913428241632, + "flos": 451981513728.0, + "grad_norm": 0.07135930824346515, + "language_loss": 0.8782866, + "learning_rate": 0.0009640856567871166, + "loss": 0.88927317, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.32055664, + "step": 768, + "time_per_iteration": 2.550196409225464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105008, + "balance_loss_mlp": 1.07258272, + "epoch": 0.14794151596767988, + "flos": 836881196544.0, + "grad_norm": 0.05799185647214189, + "language_loss": 0.8870768, + "learning_rate": 0.0009639696252365072, + "loss": 0.8981269, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.32421875, + "step": 769, + "time_per_iteration": 3.0786449909210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100829, + "balance_loss_mlp": 1.06869006, + "epoch": 0.14813389765294344, + "flos": 685765204992.0, + "grad_norm": 0.05886019056348146, + "language_loss": 0.81861567, + "learning_rate": 0.0009638534135569144, + "loss": 0.82962394, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.32128906, + "step": 770, + "time_per_iteration": 2.9026055335998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109058, + "balance_loss_mlp": 1.07641852, + "epoch": 0.148326279338207, + "flos": 509625008640.0, + "grad_norm": 0.061687073411883335, + "language_loss": 0.89819336, + "learning_rate": 0.0009637370217934554, + "loss": 0.909284, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.32641602, + "step": 771, + "time_per_iteration": 2.651155471801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102766, + "balance_loss_mlp": 1.07062733, + "epoch": 0.14851866102347056, + "flos": 587869608960.0, + "grad_norm": 0.06890537390791286, + "language_loss": 0.82949096, + "learning_rate": 0.0009636204499913175, + "loss": 0.84051859, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.32128906, + "step": 772, + "time_per_iteration": 2.8484935760498047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109887, + "balance_loss_mlp": 1.06713676, + "epoch": 0.14871104270873411, + "flos": 690722366976.0, + "grad_norm": 0.05724303399039588, + "language_loss": 0.88008785, + "learning_rate": 0.0009635036981957581, + "loss": 0.89107656, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.31713867, + "step": 773, + "time_per_iteration": 2.875896453857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098837, + "balance_loss_mlp": 1.06586373, + "epoch": 0.1489034243939977, + "flos": 654803205120.0, + "grad_norm": 0.06792329386178385, + "language_loss": 0.90737289, + "learning_rate": 0.0009633867664521043, + "loss": 0.91836131, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.32983398, + "step": 774, + "time_per_iteration": 2.8590240478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105117, + "balance_loss_mlp": 1.07202482, + "epoch": 0.14909580607926126, + "flos": 475595891712.0, + "grad_norm": 0.07543072164382301, + "language_loss": 0.86562771, + "learning_rate": 0.0009632696548057527, + "loss": 0.87667894, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.33105469, + "step": 775, + "time_per_iteration": 2.598287343978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103717, + "balance_loss_mlp": 1.07136405, + "epoch": 0.14928818776452482, + "flos": 610789953024.0, + "grad_norm": 0.06953515395492163, + "language_loss": 0.8490293, + "learning_rate": 0.0009631523633021704, + "loss": 0.86006653, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.32348633, + "step": 776, + "time_per_iteration": 2.842017650604248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097387, + "balance_loss_mlp": 1.0640794, + "epoch": 0.14948056944978838, + "flos": 561487859712.0, + "grad_norm": 0.0785359858255581, + "language_loss": 0.87875742, + "learning_rate": 0.0009630348919868936, + "loss": 0.88973129, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.33325195, + "step": 777, + "time_per_iteration": 2.693345308303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096515, + "balance_loss_mlp": 1.06244552, + "epoch": 0.14967295113505194, + "flos": 448972623360.0, + "grad_norm": 0.0986803150049228, + "language_loss": 0.81203282, + "learning_rate": 0.0009629172409055293, + "loss": 0.82299805, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.34106445, + "step": 778, + "time_per_iteration": 2.50610613822937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100631, + "balance_loss_mlp": 1.06780052, + "epoch": 0.1498653328203155, + "flos": 571000541184.0, + "grad_norm": 0.06451123510709528, + "language_loss": 0.872877, + "learning_rate": 0.0009627994101037531, + "loss": 0.88388336, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.32836914, + "step": 779, + "time_per_iteration": 2.735919713973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093349, + "balance_loss_mlp": 1.06016171, + "epoch": 0.15005771450557906, + "flos": 630918194688.0, + "grad_norm": 0.06921626087658436, + "language_loss": 0.89007759, + "learning_rate": 0.0009626813996273114, + "loss": 0.90101105, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.33203125, + "step": 780, + "time_per_iteration": 2.8758528232574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089076, + "balance_loss_mlp": 1.05646062, + "epoch": 0.15025009619084262, + "flos": 577633780224.0, + "grad_norm": 0.07846674622794232, + "language_loss": 0.88800216, + "learning_rate": 0.0009625632095220198, + "loss": 0.89889288, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.32617188, + "step": 781, + "time_per_iteration": 2.822981357574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091834, + "balance_loss_mlp": 1.05874181, + "epoch": 0.1504424778761062, + "flos": 483646311936.0, + "grad_norm": 0.06496680151927305, + "language_loss": 0.86870086, + "learning_rate": 0.0009624448398337637, + "loss": 0.87961924, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.33105469, + "step": 782, + "time_per_iteration": 2.5370984077453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093814, + "balance_loss_mlp": 1.06022096, + "epoch": 0.15063485956136977, + "flos": 762167812608.0, + "grad_norm": 0.05765358341264215, + "language_loss": 0.89159006, + "learning_rate": 0.0009623262906084984, + "loss": 0.90252817, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.33618164, + "step": 783, + "time_per_iteration": 3.005157709121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099941, + "balance_loss_mlp": 1.06773031, + "epoch": 0.15082724124663333, + "flos": 497369687040.0, + "grad_norm": 0.06003141928684199, + "language_loss": 0.90186155, + "learning_rate": 0.0009622075618922486, + "loss": 0.91286093, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.32202148, + "step": 784, + "time_per_iteration": 2.660804510116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093154, + "balance_loss_mlp": 1.06142032, + "epoch": 0.15101962293189689, + "flos": 509476621824.0, + "grad_norm": 0.06057287359381707, + "language_loss": 0.86789852, + "learning_rate": 0.0009620886537311091, + "loss": 0.87883008, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.31713867, + "step": 785, + "time_per_iteration": 2.6273694038391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095369, + "balance_loss_mlp": 1.06210947, + "epoch": 0.15121200461716044, + "flos": 457520638464.0, + "grad_norm": 0.08138425523138582, + "language_loss": 0.84774673, + "learning_rate": 0.000961969566171244, + "loss": 0.85870039, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.33276367, + "step": 786, + "time_per_iteration": 2.5942111015319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095064, + "balance_loss_mlp": 1.06223416, + "epoch": 0.151404386302424, + "flos": 537729477120.0, + "grad_norm": 0.07863928657369654, + "language_loss": 0.90186292, + "learning_rate": 0.0009618502992588873, + "loss": 0.9128136, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.32836914, + "step": 787, + "time_per_iteration": 2.619929790496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091478, + "balance_loss_mlp": 1.05955386, + "epoch": 0.15159676798768756, + "flos": 687858891264.0, + "grad_norm": 0.0744293727729202, + "language_loss": 0.88114512, + "learning_rate": 0.0009617308530403424, + "loss": 0.89205992, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.3190918, + "step": 788, + "time_per_iteration": 2.9888041019439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093086, + "balance_loss_mlp": 1.0604943, + "epoch": 0.15178914967295112, + "flos": 545042193408.0, + "grad_norm": 0.06582928588586826, + "language_loss": 0.87262332, + "learning_rate": 0.0009616112275619825, + "loss": 0.8835541, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.32592773, + "step": 789, + "time_per_iteration": 2.7160654067993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099005, + "balance_loss_mlp": 1.0666275, + "epoch": 0.1519815313582147, + "flos": 511510671360.0, + "grad_norm": 0.05890477263154721, + "language_loss": 0.83453441, + "learning_rate": 0.0009614914228702503, + "loss": 0.84552449, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.32373047, + "step": 790, + "time_per_iteration": 2.67269229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106158, + "balance_loss_mlp": 1.07342279, + "epoch": 0.15217391304347827, + "flos": 683747122176.0, + "grad_norm": 0.05177473030839046, + "language_loss": 0.88909948, + "learning_rate": 0.0009613714390116581, + "loss": 0.90016103, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.32739258, + "step": 791, + "time_per_iteration": 2.978431224822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104946, + "balance_loss_mlp": 1.07304585, + "epoch": 0.15236629472874183, + "flos": 643873342464.0, + "grad_norm": 0.07017768347884551, + "language_loss": 0.8558737, + "learning_rate": 0.0009612512760327879, + "loss": 0.86692309, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.31884766, + "step": 792, + "time_per_iteration": 2.854128837585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108697, + "balance_loss_mlp": 1.07562804, + "epoch": 0.1525586764140054, + "flos": 412654791168.0, + "grad_norm": 0.06359759833531073, + "language_loss": 0.84205759, + "learning_rate": 0.0009611309339802909, + "loss": 0.85314453, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.33081055, + "step": 793, + "time_per_iteration": 2.46451997756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108319, + "balance_loss_mlp": 1.07510698, + "epoch": 0.15275105809926895, + "flos": 802444644864.0, + "grad_norm": 0.051071876240168755, + "language_loss": 0.84049302, + "learning_rate": 0.0009610104129008881, + "loss": 0.85157621, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.33227539, + "step": 794, + "time_per_iteration": 3.111494541168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011005, + "balance_loss_mlp": 1.06836164, + "epoch": 0.1529434397845325, + "flos": 612143115264.0, + "grad_norm": 0.06279651541206067, + "language_loss": 0.88408649, + "learning_rate": 0.0009608897128413701, + "loss": 0.89509147, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.32128906, + "step": 795, + "time_per_iteration": 2.7248153686523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103807, + "balance_loss_mlp": 1.07121563, + "epoch": 0.15313582146979607, + "flos": 614941009920.0, + "grad_norm": 0.04889604688954522, + "language_loss": 0.85449052, + "learning_rate": 0.0009607688338485965, + "loss": 0.86552852, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.32592773, + "step": 796, + "time_per_iteration": 2.8646762371063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100645, + "balance_loss_mlp": 1.06731439, + "epoch": 0.15332820315505963, + "flos": 793256440320.0, + "grad_norm": 0.057433682914461805, + "language_loss": 0.90353924, + "learning_rate": 0.0009606477759694969, + "loss": 0.91454566, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.33349609, + "step": 797, + "time_per_iteration": 3.0346486568450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108023, + "balance_loss_mlp": 1.0744772, + "epoch": 0.1535205848403232, + "flos": 549945510912.0, + "grad_norm": 0.08021572729531513, + "language_loss": 0.87206727, + "learning_rate": 0.0009605265392510703, + "loss": 0.88314748, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.33544922, + "step": 798, + "time_per_iteration": 2.6084530353546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097421, + "balance_loss_mlp": 1.065521, + "epoch": 0.15371296652558677, + "flos": 535691045376.0, + "grad_norm": 0.06650858832922667, + "language_loss": 0.91961598, + "learning_rate": 0.0009604051237403846, + "loss": 0.93059021, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.31884766, + "step": 799, + "time_per_iteration": 2.629930019378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111802, + "balance_loss_mlp": 1.07951975, + "epoch": 0.15390534821085033, + "flos": 395002939392.0, + "grad_norm": 0.12724142526344331, + "language_loss": 0.85673767, + "learning_rate": 0.0009602835294845776, + "loss": 0.86785567, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.32275391, + "step": 800, + "time_per_iteration": 2.4388976097106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116786, + "balance_loss_mlp": 1.08374119, + "epoch": 0.1540977298961139, + "flos": 535587738624.0, + "grad_norm": 0.06962057985754792, + "language_loss": 0.9036696, + "learning_rate": 0.0009601617565308565, + "loss": 0.91483742, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.33056641, + "step": 801, + "time_per_iteration": 2.6220815181732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112115, + "balance_loss_mlp": 1.08829629, + "epoch": 0.15429011158137745, + "flos": 723388147200.0, + "grad_norm": 0.07662224573984003, + "language_loss": 0.86584908, + "learning_rate": 0.0009600398049264977, + "loss": 0.87706065, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.32861328, + "step": 802, + "time_per_iteration": 2.9767894744873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122446, + "balance_loss_mlp": 1.08870947, + "epoch": 0.154482493266641, + "flos": 620209502208.0, + "grad_norm": 0.07007784052810237, + "language_loss": 0.91261709, + "learning_rate": 0.0009599176747188469, + "loss": 0.9238416, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.33764648, + "step": 803, + "time_per_iteration": 2.8329989910125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105402, + "balance_loss_mlp": 1.07242846, + "epoch": 0.15467487495190457, + "flos": 525351909888.0, + "grad_norm": 0.06284855896117353, + "language_loss": 0.82565022, + "learning_rate": 0.0009597953659553196, + "loss": 0.83670425, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.32983398, + "step": 804, + "time_per_iteration": 2.6918182373046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101021, + "balance_loss_mlp": 1.06814265, + "epoch": 0.15486725663716813, + "flos": 527473299456.0, + "grad_norm": 0.06479523616705579, + "language_loss": 0.88566583, + "learning_rate": 0.0009596728786833997, + "loss": 0.89667606, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.32885742, + "step": 805, + "time_per_iteration": 2.609287977218628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110234, + "balance_loss_mlp": 1.06829393, + "epoch": 0.1550596383224317, + "flos": 1048118482944.0, + "grad_norm": 0.07111390229237131, + "language_loss": 0.89488924, + "learning_rate": 0.0009595502129506415, + "loss": 0.90591264, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.34082031, + "step": 806, + "time_per_iteration": 3.403404951095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096653, + "balance_loss_mlp": 1.0634892, + "epoch": 0.15525202000769528, + "flos": 613438050816.0, + "grad_norm": 0.08216570532607727, + "language_loss": 0.82236785, + "learning_rate": 0.0009594273688046678, + "loss": 0.83333433, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.33178711, + "step": 807, + "time_per_iteration": 2.7215962409973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093065, + "balance_loss_mlp": 1.05968678, + "epoch": 0.15544440169295884, + "flos": 532805810688.0, + "grad_norm": 0.06904253720821768, + "language_loss": 0.85279024, + "learning_rate": 0.000959304346293171, + "loss": 0.86372089, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.33398438, + "step": 808, + "time_per_iteration": 2.6801698207855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098465, + "balance_loss_mlp": 1.06661189, + "epoch": 0.1556367833782224, + "flos": 644433546240.0, + "grad_norm": 0.09111957868284204, + "language_loss": 0.87858826, + "learning_rate": 0.0009591811454639125, + "loss": 0.88957286, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.31835938, + "step": 809, + "time_per_iteration": 2.7565882205963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094758, + "balance_loss_mlp": 1.06187963, + "epoch": 0.15582916506348596, + "flos": 543540644352.0, + "grad_norm": 0.06649225570292959, + "language_loss": 0.87746191, + "learning_rate": 0.0009590577663647234, + "loss": 0.8884095, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.32885742, + "step": 810, + "time_per_iteration": 2.7346410751342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106143, + "balance_loss_mlp": 1.07233548, + "epoch": 0.15602154674874952, + "flos": 579740613120.0, + "grad_norm": 0.0619187082363415, + "language_loss": 0.85968214, + "learning_rate": 0.0009589342090435036, + "loss": 0.87074351, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.33837891, + "step": 811, + "time_per_iteration": 2.771869421005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114592, + "balance_loss_mlp": 1.08226287, + "epoch": 0.15621392843401308, + "flos": 534982454784.0, + "grad_norm": 0.07419416671079432, + "language_loss": 0.87060148, + "learning_rate": 0.0009588104735482223, + "loss": 0.88174742, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.32324219, + "step": 812, + "time_per_iteration": 2.6792666912078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122998, + "balance_loss_mlp": 1.09007227, + "epoch": 0.15640631011927664, + "flos": 550635162624.0, + "grad_norm": 0.08530784328603107, + "language_loss": 0.83981705, + "learning_rate": 0.0009586865599269177, + "loss": 0.85104704, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.3293457, + "step": 813, + "time_per_iteration": 2.6273813247680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122852, + "balance_loss_mlp": 1.09109521, + "epoch": 0.1565986918045402, + "flos": 637190641152.0, + "grad_norm": 0.09596754940168085, + "language_loss": 0.88191104, + "learning_rate": 0.0009585624682276977, + "loss": 0.8931396, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.31738281, + "step": 814, + "time_per_iteration": 2.7389183044433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114804, + "balance_loss_mlp": 1.08361948, + "epoch": 0.15679107348980378, + "flos": 490569122304.0, + "grad_norm": 0.07403121037751308, + "language_loss": 0.87196732, + "learning_rate": 0.0009584381984987386, + "loss": 0.88311541, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.31152344, + "step": 815, + "time_per_iteration": 2.588653802871704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118789, + "balance_loss_mlp": 1.0867933, + "epoch": 0.15698345517506734, + "flos": 529689231360.0, + "grad_norm": 0.05796420471157715, + "language_loss": 0.89563668, + "learning_rate": 0.0009583137507882864, + "loss": 0.90682459, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.31982422, + "step": 816, + "time_per_iteration": 2.6771223545074463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120947, + "balance_loss_mlp": 1.08945227, + "epoch": 0.1571758368603309, + "flos": 545779897344.0, + "grad_norm": 0.06695321751464198, + "language_loss": 0.80875123, + "learning_rate": 0.000958189125144656, + "loss": 0.81996059, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.31469727, + "step": 817, + "time_per_iteration": 2.648407220840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142778, + "balance_loss_mlp": 1.11125922, + "epoch": 0.15736821854559446, + "flos": 565377048576.0, + "grad_norm": 0.07474790639920047, + "language_loss": 0.87800574, + "learning_rate": 0.0009580643216162313, + "loss": 0.8894335, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.31494141, + "step": 818, + "time_per_iteration": 2.663799285888672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140784, + "balance_loss_mlp": 1.10940814, + "epoch": 0.15756060023085802, + "flos": 500707436544.0, + "grad_norm": 0.10531827445817923, + "language_loss": 0.79636216, + "learning_rate": 0.0009579393402514652, + "loss": 0.80777001, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.31347656, + "step": 819, + "time_per_iteration": 2.5795977115631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128459, + "balance_loss_mlp": 1.09617746, + "epoch": 0.15775298191612158, + "flos": 519014034432.0, + "grad_norm": 0.06561760213255555, + "language_loss": 0.90222132, + "learning_rate": 0.0009578141810988801, + "loss": 0.91350597, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.32275391, + "step": 820, + "time_per_iteration": 2.6019015312194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120432, + "balance_loss_mlp": 1.08807814, + "epoch": 0.15794536360138514, + "flos": 465891153408.0, + "grad_norm": 0.07003821866302876, + "language_loss": 0.90498698, + "learning_rate": 0.0009576888442070668, + "loss": 0.91619134, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.32348633, + "step": 821, + "time_per_iteration": 2.5933666229248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109361, + "balance_loss_mlp": 1.07614923, + "epoch": 0.1581377452866487, + "flos": 516911583744.0, + "grad_norm": 0.06959801001512317, + "language_loss": 0.92461467, + "learning_rate": 0.0009575633296246854, + "loss": 0.93570817, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.33227539, + "step": 822, + "time_per_iteration": 2.584195375442505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104383, + "balance_loss_mlp": 1.07198191, + "epoch": 0.15833012697191226, + "flos": 549522109440.0, + "grad_norm": 0.0738821286657961, + "language_loss": 0.82797432, + "learning_rate": 0.0009574376374004652, + "loss": 0.83901811, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.32397461, + "step": 823, + "time_per_iteration": 2.6445696353912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099566, + "balance_loss_mlp": 1.0669024, + "epoch": 0.15852250865717585, + "flos": 487206641664.0, + "grad_norm": 0.07930768625104477, + "language_loss": 0.8015238, + "learning_rate": 0.000957311767583204, + "loss": 0.81251943, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.32666016, + "step": 824, + "time_per_iteration": 2.590190887451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284074, + "balance_loss_mlp": 1.26194882, + "epoch": 0.1587148903424394, + "flos": 1309041672192.0, + "grad_norm": 0.06857459467376774, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83355665, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.22167969, + "step": 825, + "time_per_iteration": 4.729644060134888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091191, + "balance_loss_mlp": 1.05766964, + "epoch": 0.15890727202770297, + "flos": 466634649600.0, + "grad_norm": 0.10530356830759573, + "language_loss": 0.91383988, + "learning_rate": 0.0009570594953650961, + "loss": 0.92475176, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.33544922, + "step": 826, + "time_per_iteration": 2.5222439765930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099421, + "balance_loss_mlp": 1.06580353, + "epoch": 0.15909965371296653, + "flos": 776733608448.0, + "grad_norm": 0.07312615216486826, + "language_loss": 0.80215907, + "learning_rate": 0.00095693309306219, + "loss": 0.81315327, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.33642578, + "step": 827, + "time_per_iteration": 3.104602098464966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091547, + "balance_loss_mlp": 1.0577873, + "epoch": 0.1592920353982301, + "flos": 1077852538368.0, + "grad_norm": 0.06629059991756085, + "language_loss": 0.87921345, + "learning_rate": 0.0009568065133621244, + "loss": 0.89012897, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.33789062, + "step": 828, + "time_per_iteration": 3.349937915802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088059, + "balance_loss_mlp": 1.05324984, + "epoch": 0.15948441708349365, + "flos": 725307305472.0, + "grad_norm": 0.06785059542129762, + "language_loss": 0.84638405, + "learning_rate": 0.0009566797563140422, + "loss": 0.85726464, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.34863281, + "step": 829, + "time_per_iteration": 2.883561849594116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096047, + "balance_loss_mlp": 1.06085658, + "epoch": 0.1596767987687572, + "flos": 578447087616.0, + "grad_norm": 0.06369088806732512, + "language_loss": 0.87693489, + "learning_rate": 0.0009565528219671547, + "loss": 0.88789535, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.35229492, + "step": 830, + "time_per_iteration": 2.929800510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098337, + "balance_loss_mlp": 1.06412435, + "epoch": 0.15986918045402077, + "flos": 528728947200.0, + "grad_norm": 0.06081537703934319, + "language_loss": 0.84958434, + "learning_rate": 0.0009564257103707418, + "loss": 0.86056769, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.3425293, + "step": 831, + "time_per_iteration": 2.631542444229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105374, + "balance_loss_mlp": 1.0715903, + "epoch": 0.16006156213928435, + "flos": 574313559552.0, + "grad_norm": 0.06950481232518824, + "language_loss": 0.91362834, + "learning_rate": 0.0009562984215741533, + "loss": 0.92468208, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.33789062, + "step": 832, + "time_per_iteration": 2.669194459915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093997, + "balance_loss_mlp": 1.05973649, + "epoch": 0.1602539438245479, + "flos": 515258675712.0, + "grad_norm": 0.06093058452920847, + "language_loss": 0.82276815, + "learning_rate": 0.0009561709556268065, + "loss": 0.83370817, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.34301758, + "step": 833, + "time_per_iteration": 2.747171401977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096504, + "balance_loss_mlp": 1.06298196, + "epoch": 0.16044632550981147, + "flos": 620730418176.0, + "grad_norm": 0.09598386402958035, + "language_loss": 0.93858409, + "learning_rate": 0.0009560433125781884, + "loss": 0.9495492, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.33544922, + "step": 834, + "time_per_iteration": 2.7381722927093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090718, + "balance_loss_mlp": 1.05645716, + "epoch": 0.16063870719507503, + "flos": 560817146880.0, + "grad_norm": 0.06748577773497036, + "language_loss": 0.92278147, + "learning_rate": 0.0009559154924778544, + "loss": 0.93368864, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.34301758, + "step": 835, + "time_per_iteration": 2.7790255546569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079826, + "balance_loss_mlp": 1.04625726, + "epoch": 0.1608310888803386, + "flos": 804778440192.0, + "grad_norm": 0.07378429569225692, + "language_loss": 0.85029173, + "learning_rate": 0.0009557874953754284, + "loss": 0.86109, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.33569336, + "step": 836, + "time_per_iteration": 3.0223195552825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082807, + "balance_loss_mlp": 1.04883218, + "epoch": 0.16102347056560215, + "flos": 600311195136.0, + "grad_norm": 0.08025480036652383, + "language_loss": 0.83386606, + "learning_rate": 0.0009556593213206038, + "loss": 0.84469414, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.34008789, + "step": 837, + "time_per_iteration": 2.7436904907226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086966, + "balance_loss_mlp": 1.05318165, + "epoch": 0.1612158522508657, + "flos": 553235208192.0, + "grad_norm": 0.0690426934286745, + "language_loss": 0.87355983, + "learning_rate": 0.0009555309703631414, + "loss": 0.88442945, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.33813477, + "step": 838, + "time_per_iteration": 2.6828954219818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097306, + "balance_loss_mlp": 1.06364167, + "epoch": 0.16140823393612927, + "flos": 555701423616.0, + "grad_norm": 0.07092577785176474, + "language_loss": 0.87526888, + "learning_rate": 0.0009554024425528722, + "loss": 0.88624191, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.33691406, + "step": 839, + "time_per_iteration": 2.6739652156829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110874, + "balance_loss_mlp": 1.07797241, + "epoch": 0.16160061562139286, + "flos": 543613427712.0, + "grad_norm": 0.09046955561085915, + "language_loss": 0.88719451, + "learning_rate": 0.0009552737379396948, + "loss": 0.89830327, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.32910156, + "step": 840, + "time_per_iteration": 2.63122820854187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110692, + "balance_loss_mlp": 1.07757533, + "epoch": 0.16179299730665642, + "flos": 603590717952.0, + "grad_norm": 0.06735134703819705, + "language_loss": 0.88063818, + "learning_rate": 0.0009551448565735767, + "loss": 0.89174509, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.33129883, + "step": 841, + "time_per_iteration": 2.741941452026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121097, + "balance_loss_mlp": 1.08790874, + "epoch": 0.16198537899191998, + "flos": 786821050368.0, + "grad_norm": 0.06426805463858033, + "language_loss": 0.84472924, + "learning_rate": 0.0009550157985045543, + "loss": 0.85594022, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.33203125, + "step": 842, + "time_per_iteration": 3.045841693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102839, + "balance_loss_mlp": 1.07041371, + "epoch": 0.16217776067718354, + "flos": 519550917120.0, + "grad_norm": 0.06545460719380305, + "language_loss": 0.89229876, + "learning_rate": 0.0009548865637827321, + "loss": 0.90332717, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.32421875, + "step": 843, + "time_per_iteration": 2.6820054054260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100372, + "balance_loss_mlp": 1.06701708, + "epoch": 0.1623701423624471, + "flos": 505015644672.0, + "grad_norm": 0.09211303705947127, + "language_loss": 0.89927554, + "learning_rate": 0.0009547571524582838, + "loss": 0.91027921, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.33374023, + "step": 844, + "time_per_iteration": 2.592280149459839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097994, + "balance_loss_mlp": 1.06587958, + "epoch": 0.16256252404771065, + "flos": 496940493312.0, + "grad_norm": 0.07125004392928289, + "language_loss": 0.91891497, + "learning_rate": 0.0009546275645814512, + "loss": 0.92989492, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.32104492, + "step": 845, + "time_per_iteration": 2.6273765563964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097571, + "balance_loss_mlp": 1.06531262, + "epoch": 0.16275490573297421, + "flos": 502110061056.0, + "grad_norm": 0.07293740056217544, + "language_loss": 0.89635444, + "learning_rate": 0.0009544978002025446, + "loss": 0.90733016, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.32250977, + "step": 846, + "time_per_iteration": 2.5906271934509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091646, + "balance_loss_mlp": 1.05821955, + "epoch": 0.16294728741823777, + "flos": 506952179712.0, + "grad_norm": 0.052168896342380144, + "language_loss": 0.86807543, + "learning_rate": 0.0009543678593719434, + "loss": 0.8789919, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.33447266, + "step": 847, + "time_per_iteration": 2.747469186782837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098148, + "balance_loss_mlp": 1.06510353, + "epoch": 0.16313966910350133, + "flos": 509418395136.0, + "grad_norm": 0.05056297173362441, + "language_loss": 0.87167078, + "learning_rate": 0.0009542377421400945, + "loss": 0.88265228, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.33056641, + "step": 848, + "time_per_iteration": 2.7777974605560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102364, + "balance_loss_mlp": 1.06950974, + "epoch": 0.16333205078876492, + "flos": 543712352256.0, + "grad_norm": 0.06627324615029867, + "language_loss": 0.83542728, + "learning_rate": 0.0009541074485575145, + "loss": 0.84645092, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.32861328, + "step": 849, + "time_per_iteration": 2.7575085163116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105099, + "balance_loss_mlp": 1.07288873, + "epoch": 0.16352443247402848, + "flos": 507477477888.0, + "grad_norm": 0.05751037996071174, + "language_loss": 0.9190414, + "learning_rate": 0.0009539769786747874, + "loss": 0.93009233, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.32202148, + "step": 850, + "time_per_iteration": 2.6389074325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109592, + "balance_loss_mlp": 1.06261301, + "epoch": 0.16371681415929204, + "flos": 541851420672.0, + "grad_norm": 0.07235435681682932, + "language_loss": 0.81106341, + "learning_rate": 0.0009538463325425665, + "loss": 0.82202262, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.33325195, + "step": 851, + "time_per_iteration": 2.7013468742370605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109998, + "balance_loss_mlp": 1.06695926, + "epoch": 0.1639091958445556, + "flos": 520501026816.0, + "grad_norm": 0.07286475265539226, + "language_loss": 0.86075503, + "learning_rate": 0.0009537155102115728, + "loss": 0.87175477, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.33032227, + "step": 852, + "time_per_iteration": 2.5927765369415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089138, + "balance_loss_mlp": 1.05668926, + "epoch": 0.16410157752981916, + "flos": 547149026304.0, + "grad_norm": 0.07079739805294577, + "language_loss": 0.83340597, + "learning_rate": 0.0009535845117325961, + "loss": 0.84429741, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.32446289, + "step": 853, + "time_per_iteration": 2.6400251388549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090734, + "balance_loss_mlp": 1.05780828, + "epoch": 0.16429395921508272, + "flos": 582561828864.0, + "grad_norm": 0.055390341552487656, + "language_loss": 0.93137228, + "learning_rate": 0.0009534533371564946, + "loss": 0.9422797, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.3293457, + "step": 854, + "time_per_iteration": 2.794569492340088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097604, + "balance_loss_mlp": 1.06424975, + "epoch": 0.16448634090034628, + "flos": 530678628864.0, + "grad_norm": 0.07789269087805807, + "language_loss": 0.88390946, + "learning_rate": 0.0009533219865341949, + "loss": 0.89488548, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.33374023, + "step": 855, + "time_per_iteration": 2.5882935523986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110169, + "balance_loss_mlp": 1.07721937, + "epoch": 0.16467872258560984, + "flos": 491623948800.0, + "grad_norm": 0.07176827599451206, + "language_loss": 0.85993397, + "learning_rate": 0.0009531904599166916, + "loss": 0.87103564, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.32958984, + "step": 856, + "time_per_iteration": 2.6384060382843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108589, + "balance_loss_mlp": 1.07585454, + "epoch": 0.16487110427087343, + "flos": 506015216640.0, + "grad_norm": 0.08966352124388614, + "language_loss": 0.84823519, + "learning_rate": 0.0009530587573550478, + "loss": 0.85932112, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.32739258, + "step": 857, + "time_per_iteration": 2.6009740829467773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139199, + "balance_loss_mlp": 1.11554801, + "epoch": 0.16506348595613698, + "flos": 1432006553088.0, + "grad_norm": 0.0480168233011906, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75458586, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.23632812, + "step": 858, + "time_per_iteration": 5.006503105163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108022, + "balance_loss_mlp": 1.07712269, + "epoch": 0.16525586764140054, + "flos": 476890827264.0, + "grad_norm": 0.08332018813054971, + "language_loss": 0.89907712, + "learning_rate": 0.0009527948246039337, + "loss": 0.91015732, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.30859375, + "step": 859, + "time_per_iteration": 2.5502097606658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113676, + "balance_loss_mlp": 1.08313441, + "epoch": 0.1654482493266641, + "flos": 880737297408.0, + "grad_norm": 0.06488618871597049, + "language_loss": 0.87213862, + "learning_rate": 0.000952662594516931, + "loss": 0.88327539, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.30493164, + "step": 860, + "time_per_iteration": 3.091632604598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112348, + "balance_loss_mlp": 1.08059049, + "epoch": 0.16564063101192766, + "flos": 626527028736.0, + "grad_norm": 0.18119016536128274, + "language_loss": 0.86193782, + "learning_rate": 0.0009525301886907234, + "loss": 0.8730613, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.31738281, + "step": 861, + "time_per_iteration": 2.8586955070495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115106, + "balance_loss_mlp": 1.08372974, + "epoch": 0.16583301269719122, + "flos": 561250722816.0, + "grad_norm": 0.06494583254435107, + "language_loss": 0.87565315, + "learning_rate": 0.0009523976071767155, + "loss": 0.88680422, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.31347656, + "step": 862, + "time_per_iteration": 2.6474006175994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113562, + "balance_loss_mlp": 1.08228135, + "epoch": 0.16602539438245478, + "flos": 567510022656.0, + "grad_norm": 0.05844730537287504, + "language_loss": 0.87850058, + "learning_rate": 0.00095226485002638, + "loss": 0.88963622, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.3125, + "step": 863, + "time_per_iteration": 2.7738211154937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101227, + "balance_loss_mlp": 1.06894565, + "epoch": 0.16621777606771834, + "flos": 574589984256.0, + "grad_norm": 0.05720313452307963, + "language_loss": 0.88969022, + "learning_rate": 0.0009521319172912576, + "loss": 0.90070248, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.32275391, + "step": 864, + "time_per_iteration": 2.762932538986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108698, + "balance_loss_mlp": 1.07624936, + "epoch": 0.16641015775298193, + "flos": 514292599296.0, + "grad_norm": 0.0631928299213439, + "language_loss": 0.94547617, + "learning_rate": 0.0009519988090229579, + "loss": 0.95656317, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.32446289, + "step": 865, + "time_per_iteration": 2.672088384628296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105777, + "balance_loss_mlp": 1.07332826, + "epoch": 0.1666025394382455, + "flos": 621395338752.0, + "grad_norm": 0.06928181027356142, + "language_loss": 0.87572587, + "learning_rate": 0.0009518655252731576, + "loss": 0.8867836, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.32446289, + "step": 866, + "time_per_iteration": 2.754418134689331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104109, + "balance_loss_mlp": 1.07049167, + "epoch": 0.16679492112350905, + "flos": 548528329728.0, + "grad_norm": 0.059497633162238536, + "language_loss": 0.90014684, + "learning_rate": 0.0009517320660936022, + "loss": 0.91118789, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.33642578, + "step": 867, + "time_per_iteration": 2.73579478263855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103888, + "balance_loss_mlp": 1.07117677, + "epoch": 0.1669873028087726, + "flos": 665379477504.0, + "grad_norm": 0.06138762269806642, + "language_loss": 0.82812411, + "learning_rate": 0.0009515984315361051, + "loss": 0.83916301, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.32714844, + "step": 868, + "time_per_iteration": 2.7929019927978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102635, + "balance_loss_mlp": 1.07016206, + "epoch": 0.16717968449403617, + "flos": 538305647616.0, + "grad_norm": 0.07711570113555911, + "language_loss": 0.8657794, + "learning_rate": 0.000951464621652548, + "loss": 0.87680572, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.32470703, + "step": 869, + "time_per_iteration": 2.6135518550872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105976, + "balance_loss_mlp": 1.07381344, + "epoch": 0.16737206617929973, + "flos": 529833235968.0, + "grad_norm": 0.07032317085354448, + "language_loss": 0.78791183, + "learning_rate": 0.0009513306364948804, + "loss": 0.79897159, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.3215332, + "step": 870, + "time_per_iteration": 2.7745420932769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101249, + "balance_loss_mlp": 1.06949186, + "epoch": 0.1675644478645633, + "flos": 480529732608.0, + "grad_norm": 0.0706094790942469, + "language_loss": 0.88557035, + "learning_rate": 0.0009511964761151197, + "loss": 0.89658284, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.31738281, + "step": 871, + "time_per_iteration": 2.570082902908325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112728, + "balance_loss_mlp": 1.08147156, + "epoch": 0.16775682954982685, + "flos": 494311334400.0, + "grad_norm": 0.06741449701936619, + "language_loss": 0.90011156, + "learning_rate": 0.0009510621405653521, + "loss": 0.91123885, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.31225586, + "step": 872, + "time_per_iteration": 2.5378525257110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098432, + "balance_loss_mlp": 1.06860542, + "epoch": 0.1679492112350904, + "flos": 751694846976.0, + "grad_norm": 0.07031527693840728, + "language_loss": 0.8401826, + "learning_rate": 0.0009509276298977309, + "loss": 0.85116696, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.29760742, + "step": 873, + "time_per_iteration": 2.9614696502685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102853, + "balance_loss_mlp": 1.07121444, + "epoch": 0.168141592920354, + "flos": 1135413075456.0, + "grad_norm": 0.07037881289732177, + "language_loss": 0.8146044, + "learning_rate": 0.0009507929441644778, + "loss": 0.82563293, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.31616211, + "step": 874, + "time_per_iteration": 3.5029537677764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105025, + "balance_loss_mlp": 1.07403064, + "epoch": 0.16833397460561755, + "flos": 632114205696.0, + "grad_norm": 0.07204378854359271, + "language_loss": 0.8568964, + "learning_rate": 0.0009506580834178826, + "loss": 0.86794662, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.30957031, + "step": 875, + "time_per_iteration": 2.738445281982422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105989, + "balance_loss_mlp": 1.07420754, + "epoch": 0.1685263562908811, + "flos": 541171943424.0, + "grad_norm": 0.06279104396907492, + "language_loss": 0.91300583, + "learning_rate": 0.0009505230477103028, + "loss": 0.92406577, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.31762695, + "step": 876, + "time_per_iteration": 2.7304844856262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121558, + "balance_loss_mlp": 1.0900147, + "epoch": 0.16871873797614467, + "flos": 619036812288.0, + "grad_norm": 0.07749651336428325, + "language_loss": 0.81126654, + "learning_rate": 0.0009503878370941641, + "loss": 0.82248211, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.31518555, + "step": 877, + "time_per_iteration": 2.7332048416137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121651, + "balance_loss_mlp": 1.09063232, + "epoch": 0.16891111966140823, + "flos": 606067107840.0, + "grad_norm": 0.08158970109830238, + "language_loss": 0.88660848, + "learning_rate": 0.0009502524516219595, + "loss": 0.897825, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.30981445, + "step": 878, + "time_per_iteration": 2.810194730758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120277, + "balance_loss_mlp": 1.08942604, + "epoch": 0.1691035013466718, + "flos": 552058136064.0, + "grad_norm": 0.08439254905993104, + "language_loss": 0.89592326, + "learning_rate": 0.0009501168913462506, + "loss": 0.90712607, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.30810547, + "step": 879, + "time_per_iteration": 2.6550278663635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181395, + "balance_loss_mlp": 1.15822113, + "epoch": 0.16929588303193535, + "flos": 1475544121344.0, + "grad_norm": 0.05511344701971209, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80303323, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.23144531, + "step": 880, + "time_per_iteration": 4.798918962478638 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120202, + "balance_loss_mlp": 1.08894515, + "epoch": 0.1694882647171989, + "flos": 925850456064.0, + "grad_norm": 0.05479331137197536, + "language_loss": 0.85038209, + "learning_rate": 0.0009498452465949042, + "loss": 0.86158419, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.31225586, + "step": 881, + "time_per_iteration": 3.2795042991638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114024, + "balance_loss_mlp": 1.08295763, + "epoch": 0.1696806464024625, + "flos": 545829359616.0, + "grad_norm": 0.06005284109203957, + "language_loss": 0.91010857, + "learning_rate": 0.0009497091622247285, + "loss": 0.92124879, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.31030273, + "step": 882, + "time_per_iteration": 2.741497755050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114536, + "balance_loss_mlp": 1.0833751, + "epoch": 0.16987302808772606, + "flos": 528970466304.0, + "grad_norm": 0.08668021784836823, + "language_loss": 0.9325586, + "learning_rate": 0.0009495729032619723, + "loss": 0.94370389, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.3112793, + "step": 883, + "time_per_iteration": 2.6621923446655273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102424, + "balance_loss_mlp": 1.07035685, + "epoch": 0.17006540977298962, + "flos": 754855096320.0, + "grad_norm": 0.06301404020698688, + "language_loss": 0.84119958, + "learning_rate": 0.0009494364697595354, + "loss": 0.85222387, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.32055664, + "step": 884, + "time_per_iteration": 2.8904953002929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102364, + "balance_loss_mlp": 1.07022548, + "epoch": 0.17025779145825318, + "flos": 558532813824.0, + "grad_norm": 0.06367673921209963, + "language_loss": 0.89062482, + "learning_rate": 0.0009492998617703867, + "loss": 0.9016484, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.32128906, + "step": 885, + "time_per_iteration": 2.65053129196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088714, + "balance_loss_mlp": 1.05779076, + "epoch": 0.17045017314351674, + "flos": 511963186176.0, + "grad_norm": 0.06771442044112419, + "language_loss": 0.87296236, + "learning_rate": 0.0009491630793475619, + "loss": 0.88384956, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.30908203, + "step": 886, + "time_per_iteration": 2.601238965988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095605, + "balance_loss_mlp": 1.06346607, + "epoch": 0.1706425548287803, + "flos": 508674898944.0, + "grad_norm": 0.064396115452368, + "language_loss": 0.85120332, + "learning_rate": 0.0009490261225441643, + "loss": 0.86215937, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.32128906, + "step": 887, + "time_per_iteration": 2.865694999694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089924, + "balance_loss_mlp": 1.05797613, + "epoch": 0.17083493651404386, + "flos": 717016776192.0, + "grad_norm": 0.06834327453619109, + "language_loss": 0.90091348, + "learning_rate": 0.0009488889914133656, + "loss": 0.91181278, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.31933594, + "step": 888, + "time_per_iteration": 3.0129144191741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092844, + "balance_loss_mlp": 1.06077635, + "epoch": 0.17102731819930742, + "flos": 558852908544.0, + "grad_norm": 0.06591248507341309, + "language_loss": 0.88667148, + "learning_rate": 0.0009487516860084047, + "loss": 0.89759994, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.32055664, + "step": 889, + "time_per_iteration": 2.738736867904663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087561, + "balance_loss_mlp": 1.05644727, + "epoch": 0.17121969988457098, + "flos": 494542679040.0, + "grad_norm": 0.07350534216298948, + "language_loss": 0.88845301, + "learning_rate": 0.0009486142063825884, + "loss": 0.89932865, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.31079102, + "step": 890, + "time_per_iteration": 2.5697011947631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117181, + "balance_loss_mlp": 1.15197396, + "epoch": 0.17141208156983456, + "flos": 1548088063488.0, + "grad_norm": 0.0550236747402086, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73598027, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.19824219, + "step": 891, + "time_per_iteration": 4.955617904663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092709, + "balance_loss_mlp": 1.06119013, + "epoch": 0.17160446325509812, + "flos": 619282713600.0, + "grad_norm": 0.06911805131577382, + "language_loss": 0.9061746, + "learning_rate": 0.0009483387246819542, + "loss": 0.91710162, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.31494141, + "step": 892, + "time_per_iteration": 2.725799798965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121181, + "balance_loss_mlp": 1.10153532, + "epoch": 0.17179684494036168, + "flos": 1381026972672.0, + "grad_norm": 0.032113973586073014, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83406758, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.19628906, + "step": 893, + "time_per_iteration": 4.664165735244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089705, + "balance_loss_mlp": 1.05813849, + "epoch": 0.17198922662562524, + "flos": 492386383872.0, + "grad_norm": 0.0574582553480054, + "language_loss": 0.89272118, + "learning_rate": 0.0009480625467392688, + "loss": 0.90361822, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.31542969, + "step": 894, + "time_per_iteration": 2.637554883956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109509, + "balance_loss_mlp": 1.08910024, + "epoch": 0.1721816083108888, + "flos": 1457529914880.0, + "grad_norm": 0.027611634873128267, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79104185, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.20410156, + "step": 895, + "time_per_iteration": 4.76848030090332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088509, + "balance_loss_mlp": 1.05822968, + "epoch": 0.17237398999615236, + "flos": 527853030912.0, + "grad_norm": 0.05350045539937067, + "language_loss": 0.87532026, + "learning_rate": 0.0009477856729834196, + "loss": 0.88620532, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.30249023, + "step": 896, + "time_per_iteration": 2.7219061851501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093646, + "balance_loss_mlp": 1.06267512, + "epoch": 0.17256637168141592, + "flos": 603644562432.0, + "grad_norm": 0.06021872133739316, + "language_loss": 0.89942896, + "learning_rate": 0.0009476469753098809, + "loss": 0.9103654, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.30932617, + "step": 897, + "time_per_iteration": 2.6990017890930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109486, + "balance_loss_mlp": 1.06398487, + "epoch": 0.17275875336667948, + "flos": 509437334016.0, + "grad_norm": 0.072864012804074, + "language_loss": 0.86893761, + "learning_rate": 0.0009475081038443738, + "loss": 0.87988615, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.30834961, + "step": 898, + "time_per_iteration": 2.5972931385040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091914, + "balance_loss_mlp": 1.06030011, + "epoch": 0.17295113505194307, + "flos": 664951693824.0, + "grad_norm": 0.07073516416365672, + "language_loss": 0.85445154, + "learning_rate": 0.0009473690586408124, + "loss": 0.86537069, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.31591797, + "step": 899, + "time_per_iteration": 2.821336507797241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085642, + "balance_loss_mlp": 1.05421829, + "epoch": 0.17314351673720663, + "flos": 555125253120.0, + "grad_norm": 0.061416888012907525, + "language_loss": 0.86083823, + "learning_rate": 0.0009472298397531792, + "loss": 0.87169468, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.31396484, + "step": 900, + "time_per_iteration": 2.7345612049102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090461, + "balance_loss_mlp": 1.058918, + "epoch": 0.17333589842247019, + "flos": 503361326592.0, + "grad_norm": 0.060849230911096945, + "language_loss": 0.86217213, + "learning_rate": 0.0009470904472355235, + "loss": 0.87307668, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.31518555, + "step": 901, + "time_per_iteration": 2.637425661087036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089284, + "balance_loss_mlp": 1.05755067, + "epoch": 0.17352828010773375, + "flos": 555924003840.0, + "grad_norm": 0.07830588235472731, + "language_loss": 0.79847336, + "learning_rate": 0.0009469508811419626, + "loss": 0.80936623, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.31713867, + "step": 902, + "time_per_iteration": 2.70833683013916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149006, + "balance_loss_mlp": 1.12678576, + "epoch": 0.1737206617929973, + "flos": 1553711556096.0, + "grad_norm": 0.05917050619752012, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72762835, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.22265625, + "step": 903, + "time_per_iteration": 4.776138782501221 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088102, + "balance_loss_mlp": 1.05562961, + "epoch": 0.17391304347826086, + "flos": 516390667776.0, + "grad_norm": 0.07262085456902109, + "language_loss": 0.83503735, + "learning_rate": 0.0009466712284439292, + "loss": 0.84591836, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.32470703, + "step": 904, + "time_per_iteration": 2.7372140884399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086515, + "balance_loss_mlp": 1.05385172, + "epoch": 0.17410542516352442, + "flos": 540773273088.0, + "grad_norm": 0.09192064511302059, + "language_loss": 0.88356638, + "learning_rate": 0.0009465311419480276, + "loss": 0.89443153, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.32666016, + "step": 905, + "time_per_iteration": 2.677032470703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109277, + "balance_loss_mlp": 1.06036901, + "epoch": 0.17429780684878798, + "flos": 623542869504.0, + "grad_norm": 0.07898220644020008, + "language_loss": 0.88434756, + "learning_rate": 0.0009463908820933622, + "loss": 0.89527524, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.32397461, + "step": 906, + "time_per_iteration": 2.8139841556549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097047, + "balance_loss_mlp": 1.06505144, + "epoch": 0.17449018853405157, + "flos": 575368386048.0, + "grad_norm": 0.0868003192310251, + "language_loss": 0.82122958, + "learning_rate": 0.0009462504489343868, + "loss": 0.83220005, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.31982422, + "step": 907, + "time_per_iteration": 2.8445968627929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103323, + "balance_loss_mlp": 1.07106495, + "epoch": 0.17468257021931513, + "flos": 533499844608.0, + "grad_norm": 0.09920963499058721, + "language_loss": 0.88653374, + "learning_rate": 0.0009461098425256222, + "loss": 0.89756691, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.32250977, + "step": 908, + "time_per_iteration": 2.5947494506835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109512, + "balance_loss_mlp": 1.07784963, + "epoch": 0.1748749519045787, + "flos": 540496848384.0, + "grad_norm": 0.09355765751058653, + "language_loss": 0.86340624, + "learning_rate": 0.0009459690629216567, + "loss": 0.87450135, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.31640625, + "step": 909, + "time_per_iteration": 2.621044874191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112453, + "balance_loss_mlp": 1.08155417, + "epoch": 0.17506733358984225, + "flos": 498373641216.0, + "grad_norm": 0.07034154505215827, + "language_loss": 0.8701601, + "learning_rate": 0.0009458281101771457, + "loss": 0.88128459, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.30859375, + "step": 910, + "time_per_iteration": 2.674091100692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115198, + "balance_loss_mlp": 1.08508539, + "epoch": 0.1752597152751058, + "flos": 622621873152.0, + "grad_norm": 0.09036058743894539, + "language_loss": 0.82642829, + "learning_rate": 0.0009456869843468122, + "loss": 0.83758032, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.30053711, + "step": 911, + "time_per_iteration": 2.830397605895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105257, + "balance_loss_mlp": 1.07378554, + "epoch": 0.17545209696036937, + "flos": 520717814784.0, + "grad_norm": 0.0879185530474863, + "language_loss": 0.78465313, + "learning_rate": 0.0009455456854854459, + "loss": 0.79570568, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.31445312, + "step": 912, + "time_per_iteration": 2.621293067932129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102771, + "balance_loss_mlp": 1.07196748, + "epoch": 0.17564447864563293, + "flos": 461750270976.0, + "grad_norm": 0.0647038307980506, + "language_loss": 0.8401655, + "learning_rate": 0.0009454042136479039, + "loss": 0.85119313, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.30786133, + "step": 913, + "time_per_iteration": 2.5675978660583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095649, + "balance_loss_mlp": 1.0655843, + "epoch": 0.1758368603308965, + "flos": 480416251392.0, + "grad_norm": 0.06520052548040499, + "language_loss": 0.82717437, + "learning_rate": 0.0009452625688891103, + "loss": 0.83813089, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.30004883, + "step": 914, + "time_per_iteration": 2.5443828105926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156407, + "balance_loss_mlp": 1.13332844, + "epoch": 0.17602924201616005, + "flos": 1478160133632.0, + "grad_norm": 0.06121421634548094, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79891145, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.23046875, + "step": 915, + "time_per_iteration": 4.5826005935668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117447, + "balance_loss_mlp": 1.08642912, + "epoch": 0.17622162370142364, + "flos": 602010593280.0, + "grad_norm": 0.07309570223890104, + "language_loss": 0.93135887, + "learning_rate": 0.0009449787608278015, + "loss": 0.94253331, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.30981445, + "step": 916, + "time_per_iteration": 2.7787418365478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120355, + "balance_loss_mlp": 1.08926511, + "epoch": 0.1764140053866872, + "flos": 442473214464.0, + "grad_norm": 0.10226900865330964, + "language_loss": 0.92397296, + "learning_rate": 0.0009448365976354704, + "loss": 0.93517655, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.31054688, + "step": 917, + "time_per_iteration": 2.5531399250030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124705, + "balance_loss_mlp": 1.09247112, + "epoch": 0.17660638707195075, + "flos": 500362610688.0, + "grad_norm": 0.07454694115091837, + "language_loss": 0.89785659, + "learning_rate": 0.0009446942617422558, + "loss": 0.90910363, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.32226562, + "step": 918, + "time_per_iteration": 2.583489418029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123972, + "balance_loss_mlp": 1.09250093, + "epoch": 0.17679876875721431, + "flos": 538621360128.0, + "grad_norm": 0.06638545773718021, + "language_loss": 0.85658622, + "learning_rate": 0.0009445517532034176, + "loss": 0.86782598, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.31445312, + "step": 919, + "time_per_iteration": 2.6830263137817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122334, + "balance_loss_mlp": 1.09107733, + "epoch": 0.17699115044247787, + "flos": 497477376000.0, + "grad_norm": 0.09547651267352689, + "language_loss": 0.88907313, + "learning_rate": 0.0009444090720742824, + "loss": 0.90029645, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.31225586, + "step": 920, + "time_per_iteration": 2.5984437465667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123289, + "balance_loss_mlp": 1.09181738, + "epoch": 0.17718353212774143, + "flos": 662444780544.0, + "grad_norm": 0.10483808909193337, + "language_loss": 0.87128365, + "learning_rate": 0.0009442662184102439, + "loss": 0.8825165, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.31445312, + "step": 921, + "time_per_iteration": 2.772568941116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097725, + "balance_loss_mlp": 1.06737399, + "epoch": 0.177375913813005, + "flos": 582340658688.0, + "grad_norm": 0.057071439682559955, + "language_loss": 0.87210095, + "learning_rate": 0.000944123192266763, + "loss": 0.88307822, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.30297852, + "step": 922, + "time_per_iteration": 2.8091742992401123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122306, + "balance_loss_mlp": 1.09004784, + "epoch": 0.17756829549826855, + "flos": 552285098496.0, + "grad_norm": 0.07267069192247201, + "language_loss": 0.83557594, + "learning_rate": 0.0009439799936993671, + "loss": 0.84679902, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.32250977, + "step": 923, + "time_per_iteration": 2.7226145267486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147891, + "balance_loss_mlp": 1.11494136, + "epoch": 0.17776067718353214, + "flos": 556060806144.0, + "grad_norm": 0.14883746036090706, + "language_loss": 0.88219315, + "learning_rate": 0.0009438366227636511, + "loss": 0.89367205, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.32958984, + "step": 924, + "time_per_iteration": 2.6409950256347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121758, + "balance_loss_mlp": 1.08950043, + "epoch": 0.1779530588687957, + "flos": 658161303552.0, + "grad_norm": 0.07347120708699749, + "language_loss": 0.85914218, + "learning_rate": 0.0009436930795152763, + "loss": 0.87035978, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.32250977, + "step": 925, + "time_per_iteration": 2.811595916748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107553, + "balance_loss_mlp": 1.07419825, + "epoch": 0.17814544055405926, + "flos": 644187644928.0, + "grad_norm": 0.07224950530739313, + "language_loss": 0.86246336, + "learning_rate": 0.0009435493640099713, + "loss": 0.87353885, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.33374023, + "step": 926, + "time_per_iteration": 2.775090456008911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098588, + "balance_loss_mlp": 1.06513751, + "epoch": 0.17833782223932282, + "flos": 460672123392.0, + "grad_norm": 0.06608942550370576, + "language_loss": 0.83981788, + "learning_rate": 0.0009434054763035314, + "loss": 0.85080379, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.3347168, + "step": 927, + "time_per_iteration": 2.621683359146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089168, + "balance_loss_mlp": 1.05559874, + "epoch": 0.17853020392458638, + "flos": 759212766720.0, + "grad_norm": 0.06566794669431841, + "language_loss": 0.85671836, + "learning_rate": 0.0009432614164518185, + "loss": 0.86761004, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.33569336, + "step": 928, + "time_per_iteration": 3.011759042739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108639, + "balance_loss_mlp": 1.05320191, + "epoch": 0.17872258560984994, + "flos": 782320785408.0, + "grad_norm": 0.06622036101375141, + "language_loss": 0.84125841, + "learning_rate": 0.000943117184510762, + "loss": 0.85212231, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.33203125, + "step": 929, + "time_per_iteration": 2.9782960414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166041, + "balance_loss_mlp": 1.14010072, + "epoch": 0.1789149672951135, + "flos": 1459095482880.0, + "grad_norm": 0.044814265222739694, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79956007, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.25976562, + "step": 930, + "time_per_iteration": 5.011061906814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086461, + "balance_loss_mlp": 1.0529635, + "epoch": 0.17910734898037706, + "flos": 503598463488.0, + "grad_norm": 0.09835801245739735, + "language_loss": 0.88482547, + "learning_rate": 0.0009428282045846674, + "loss": 0.89569014, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.33520508, + "step": 931, + "time_per_iteration": 2.700901508331299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084257, + "balance_loss_mlp": 1.04899526, + "epoch": 0.17929973066564064, + "flos": 745895264256.0, + "grad_norm": 0.0790312068568768, + "language_loss": 0.88828444, + "learning_rate": 0.0009426834567118214, + "loss": 0.89912701, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.3527832, + "step": 932, + "time_per_iteration": 3.0847127437591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108852, + "balance_loss_mlp": 1.05557072, + "epoch": 0.1794921123509042, + "flos": 712875893760.0, + "grad_norm": 0.05851377965258845, + "language_loss": 0.80669105, + "learning_rate": 0.0009425385369740155, + "loss": 0.81757629, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.32958984, + "step": 933, + "time_per_iteration": 3.0405056476593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089624, + "balance_loss_mlp": 1.05517268, + "epoch": 0.17968449403616776, + "flos": 632838763008.0, + "grad_norm": 0.08098153489662575, + "language_loss": 0.86808264, + "learning_rate": 0.0009423934454275125, + "loss": 0.87897891, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.34472656, + "step": 934, + "time_per_iteration": 2.832589626312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090759, + "balance_loss_mlp": 1.05683184, + "epoch": 0.17987687572143132, + "flos": 536060602368.0, + "grad_norm": 0.0889712704970151, + "language_loss": 0.91607213, + "learning_rate": 0.0009422481821286418, + "loss": 0.92697972, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.33935547, + "step": 935, + "time_per_iteration": 2.739004611968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099753, + "balance_loss_mlp": 1.06589735, + "epoch": 0.18006925740669488, + "flos": 537818227200.0, + "grad_norm": 0.11621731552094582, + "language_loss": 0.87764728, + "learning_rate": 0.0009421027471337998, + "loss": 0.88864481, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.33886719, + "step": 936, + "time_per_iteration": 2.663978099822998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093554, + "balance_loss_mlp": 1.06100953, + "epoch": 0.18026163909195844, + "flos": 539255757312.0, + "grad_norm": 0.08193839025260119, + "language_loss": 0.8197844, + "learning_rate": 0.0009419571404994493, + "loss": 0.83071995, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.32543945, + "step": 937, + "time_per_iteration": 2.680880308151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087323, + "balance_loss_mlp": 1.05427766, + "epoch": 0.180454020777222, + "flos": 500382959616.0, + "grad_norm": 0.08083617156557357, + "language_loss": 0.90250957, + "learning_rate": 0.00094181136228212, + "loss": 0.91338283, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.33056641, + "step": 938, + "time_per_iteration": 2.635734796524048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083703, + "balance_loss_mlp": 1.05134988, + "epoch": 0.18064640246248556, + "flos": 498689353728.0, + "grad_norm": 0.0738614516115471, + "language_loss": 0.85650909, + "learning_rate": 0.0009416654125384077, + "loss": 0.86734617, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.32348633, + "step": 939, + "time_per_iteration": 2.713120460510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092435, + "balance_loss_mlp": 1.06744874, + "epoch": 0.18083878414774912, + "flos": 1518572358144.0, + "grad_norm": 0.04310930319536216, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80864811, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.25, + "step": 940, + "time_per_iteration": 4.928712606430054 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084719, + "balance_loss_mlp": 1.05372477, + "epoch": 0.1810311658330127, + "flos": 727006703616.0, + "grad_norm": 0.06379600043785322, + "language_loss": 0.83724225, + "learning_rate": 0.000941372998698552, + "loss": 0.84808946, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.30957031, + "step": 941, + "time_per_iteration": 2.9594616889953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092193, + "balance_loss_mlp": 1.0600785, + "epoch": 0.18122354751827627, + "flos": 564643726848.0, + "grad_norm": 0.07993905082854055, + "language_loss": 0.81844771, + "learning_rate": 0.0009412265347159336, + "loss": 0.82936954, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.32104492, + "step": 942, + "time_per_iteration": 2.705883741378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089966, + "balance_loss_mlp": 1.05847049, + "epoch": 0.18141592920353983, + "flos": 519024208896.0, + "grad_norm": 0.08204750484488939, + "language_loss": 0.84816301, + "learning_rate": 0.0009410798994339829, + "loss": 0.85906267, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.31469727, + "step": 943, + "time_per_iteration": 2.606898546218872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086355, + "balance_loss_mlp": 1.0538584, + "epoch": 0.1816083108888034, + "flos": 512219261952.0, + "grad_norm": 0.06564936273566103, + "language_loss": 0.88176167, + "learning_rate": 0.000940933092909628, + "loss": 0.89262521, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.32495117, + "step": 944, + "time_per_iteration": 2.568862199783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089679, + "balance_loss_mlp": 1.058375, + "epoch": 0.18180069257406695, + "flos": 492144864768.0, + "grad_norm": 0.06967818448900699, + "language_loss": 0.83546078, + "learning_rate": 0.0009407861151998649, + "loss": 0.84635758, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.31274414, + "step": 945, + "time_per_iteration": 2.5752463340759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108574, + "balance_loss_mlp": 1.05338621, + "epoch": 0.1819930742593305, + "flos": 569891870208.0, + "grad_norm": 0.07045774982796042, + "language_loss": 0.86168265, + "learning_rate": 0.0009406389663617552, + "loss": 0.87254012, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.32348633, + "step": 946, + "time_per_iteration": 2.6582529544830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084969, + "balance_loss_mlp": 1.05299747, + "epoch": 0.18218545594459407, + "flos": 605693168640.0, + "grad_norm": 0.08074656744529311, + "language_loss": 0.8540619, + "learning_rate": 0.000940491646452427, + "loss": 0.86491156, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.31958008, + "step": 947, + "time_per_iteration": 2.7117488384246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080922, + "balance_loss_mlp": 1.04914129, + "epoch": 0.18237783762985763, + "flos": 548419230720.0, + "grad_norm": 0.0614528539730692, + "language_loss": 0.90478814, + "learning_rate": 0.000940344155529075, + "loss": 0.91559744, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.31762695, + "step": 948, + "time_per_iteration": 2.675457239151001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086239, + "balance_loss_mlp": 1.05472016, + "epoch": 0.1825702193151212, + "flos": 450509078016.0, + "grad_norm": 0.06480396750006864, + "language_loss": 0.8689037, + "learning_rate": 0.0009401964936489605, + "loss": 0.87976611, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.31494141, + "step": 949, + "time_per_iteration": 2.5517518520355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086362, + "balance_loss_mlp": 1.05446136, + "epoch": 0.18276260100038477, + "flos": 588962313216.0, + "grad_norm": 0.07386346522147075, + "language_loss": 0.84915626, + "learning_rate": 0.0009400486608694108, + "loss": 0.86001992, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.31884766, + "step": 950, + "time_per_iteration": 2.744371175765991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089338, + "balance_loss_mlp": 1.05769992, + "epoch": 0.18295498268564833, + "flos": 786988376064.0, + "grad_norm": 0.07193745080732644, + "language_loss": 0.86961377, + "learning_rate": 0.0009399006572478195, + "loss": 0.88050711, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.31616211, + "step": 951, + "time_per_iteration": 3.0956904888153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108625, + "balance_loss_mlp": 1.05427814, + "epoch": 0.1831473643709119, + "flos": 577878271488.0, + "grad_norm": 0.06892976413128309, + "language_loss": 0.90901303, + "learning_rate": 0.0009397524828416468, + "loss": 0.9198755, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.31958008, + "step": 952, + "time_per_iteration": 2.7130446434020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093503, + "balance_loss_mlp": 1.06179333, + "epoch": 0.18333974605617545, + "flos": 566622521856.0, + "grad_norm": 0.06752223069443862, + "language_loss": 0.96249408, + "learning_rate": 0.0009396041377084192, + "loss": 0.97342908, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.31689453, + "step": 953, + "time_per_iteration": 2.66972279548645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101927, + "balance_loss_mlp": 1.07043195, + "epoch": 0.183532127741439, + "flos": 526725421056.0, + "grad_norm": 0.07502219242723109, + "language_loss": 0.87290752, + "learning_rate": 0.0009394556219057295, + "loss": 0.88392681, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.31469727, + "step": 954, + "time_per_iteration": 2.659264326095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109814, + "balance_loss_mlp": 1.07810426, + "epoch": 0.18372450942670257, + "flos": 594259918848.0, + "grad_norm": 0.08651848853121004, + "language_loss": 0.8329587, + "learning_rate": 0.0009393069354912362, + "loss": 0.84405684, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.31689453, + "step": 955, + "time_per_iteration": 2.77437686920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111165, + "balance_loss_mlp": 1.080266, + "epoch": 0.18391689111196613, + "flos": 644720145408.0, + "grad_norm": 0.07817657388257933, + "language_loss": 0.82119787, + "learning_rate": 0.0009391580785226649, + "loss": 0.83230954, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.30859375, + "step": 956, + "time_per_iteration": 2.867492437362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094598, + "balance_loss_mlp": 1.06903911, + "epoch": 0.18410927279722972, + "flos": 1456246563840.0, + "grad_norm": 0.05003344342080426, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.8043505, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.25585938, + "step": 957, + "time_per_iteration": 4.762399196624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108233, + "balance_loss_mlp": 1.07757246, + "epoch": 0.18430165448249328, + "flos": 658437728256.0, + "grad_norm": 0.06311489935861506, + "language_loss": 0.86409998, + "learning_rate": 0.0009388598531545196, + "loss": 0.87518233, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.30615234, + "step": 958, + "time_per_iteration": 2.8768551349639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102291, + "balance_loss_mlp": 1.07160664, + "epoch": 0.18449403616775684, + "flos": 517679811072.0, + "grad_norm": 0.07254101069499316, + "language_loss": 0.85046387, + "learning_rate": 0.000938710484870727, + "loss": 0.86148679, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.30639648, + "step": 959, + "time_per_iteration": 2.569592237472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123147, + "balance_loss_mlp": 1.09262919, + "epoch": 0.1846864178530204, + "flos": 552481537536.0, + "grad_norm": 0.07612110690317586, + "language_loss": 0.85695219, + "learning_rate": 0.0009385609462644189, + "loss": 0.86818361, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.3046875, + "step": 960, + "time_per_iteration": 2.6880924701690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127108, + "balance_loss_mlp": 1.09682918, + "epoch": 0.18487879953828396, + "flos": 465930441216.0, + "grad_norm": 0.08874671943740564, + "language_loss": 0.85532272, + "learning_rate": 0.0009384112373936514, + "loss": 0.86659384, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.30249023, + "step": 961, + "time_per_iteration": 2.6328110694885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117939, + "balance_loss_mlp": 1.08651531, + "epoch": 0.18507118122354752, + "flos": 648200489472.0, + "grad_norm": 0.0643111022382676, + "language_loss": 0.91187119, + "learning_rate": 0.0009382613583165467, + "loss": 0.92305064, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.31396484, + "step": 962, + "time_per_iteration": 2.7885348796844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116435, + "balance_loss_mlp": 1.08522642, + "epoch": 0.18526356290881107, + "flos": 626486330880.0, + "grad_norm": 0.08357757161984174, + "language_loss": 0.89136612, + "learning_rate": 0.0009381113090912928, + "loss": 0.90253055, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.31176758, + "step": 963, + "time_per_iteration": 2.7291858196258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109665, + "balance_loss_mlp": 1.07812214, + "epoch": 0.18545594459407463, + "flos": 432497843712.0, + "grad_norm": 0.08435952646587867, + "language_loss": 0.89444733, + "learning_rate": 0.000937961089776144, + "loss": 0.90554392, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.31518555, + "step": 964, + "time_per_iteration": 2.5736470222473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102378, + "balance_loss_mlp": 1.07090628, + "epoch": 0.1856483262793382, + "flos": 748720862208.0, + "grad_norm": 0.0989838613647617, + "language_loss": 0.82349026, + "learning_rate": 0.0009378107004294208, + "loss": 0.83451402, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.31445312, + "step": 965, + "time_per_iteration": 2.980569362640381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112627, + "balance_loss_mlp": 1.07982063, + "epoch": 0.18584070796460178, + "flos": 530058788352.0, + "grad_norm": 0.07592153009574268, + "language_loss": 0.91147316, + "learning_rate": 0.0009376601411095096, + "loss": 0.92259943, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.328125, + "step": 966, + "time_per_iteration": 2.6635591983795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136435, + "balance_loss_mlp": 1.10331881, + "epoch": 0.18603308964986534, + "flos": 482863527936.0, + "grad_norm": 0.16243248674453353, + "language_loss": 0.86357069, + "learning_rate": 0.0009375094118748622, + "loss": 0.87493503, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.33129883, + "step": 967, + "time_per_iteration": 2.522481679916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157231, + "balance_loss_mlp": 1.12368488, + "epoch": 0.1862254713351289, + "flos": 800976591360.0, + "grad_norm": 0.09362045292578998, + "language_loss": 0.90268016, + "learning_rate": 0.0009373585127839976, + "loss": 0.9142524, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.33544922, + "step": 968, + "time_per_iteration": 2.97210693359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152937, + "balance_loss_mlp": 1.1203692, + "epoch": 0.18641785302039246, + "flos": 478082456064.0, + "grad_norm": 0.0858654394488603, + "language_loss": 0.90605009, + "learning_rate": 0.0009372074438954994, + "loss": 0.91757941, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.32568359, + "step": 969, + "time_per_iteration": 2.541006088256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143226, + "balance_loss_mlp": 1.11030006, + "epoch": 0.18661023470565602, + "flos": 388695587328.0, + "grad_norm": 0.08996217866854661, + "language_loss": 0.91142356, + "learning_rate": 0.0009370562052680181, + "loss": 0.92285585, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.3293457, + "step": 970, + "time_per_iteration": 2.4985642433166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113885, + "balance_loss_mlp": 1.0805068, + "epoch": 0.18680261639091958, + "flos": 564402207744.0, + "grad_norm": 0.07707645065684006, + "language_loss": 0.88999593, + "learning_rate": 0.0009369047969602695, + "loss": 0.90113479, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.33398438, + "step": 971, + "time_per_iteration": 2.7079591751098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109357, + "balance_loss_mlp": 1.05985761, + "epoch": 0.18699499807618314, + "flos": 479018009088.0, + "grad_norm": 0.28998936625974164, + "language_loss": 0.86178541, + "learning_rate": 0.0009367532190310357, + "loss": 0.87272114, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.33740234, + "step": 972, + "time_per_iteration": 2.5647881031036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090596, + "balance_loss_mlp": 1.05535769, + "epoch": 0.1871873797614467, + "flos": 553022802432.0, + "grad_norm": 0.12045660132436305, + "language_loss": 0.89086068, + "learning_rate": 0.0009366014715391644, + "loss": 0.90176666, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.3527832, + "step": 973, + "time_per_iteration": 2.670271396636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098789, + "balance_loss_mlp": 1.06316936, + "epoch": 0.18737976144671029, + "flos": 552526617600.0, + "grad_norm": 0.06161121065256625, + "language_loss": 0.83607596, + "learning_rate": 0.0009364495545435693, + "loss": 0.84706378, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.35644531, + "step": 974, + "time_per_iteration": 2.7562968730926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115355, + "balance_loss_mlp": 1.08068919, + "epoch": 0.18757214313197385, + "flos": 502002372096.0, + "grad_norm": 0.0775906753320085, + "language_loss": 0.88572645, + "learning_rate": 0.0009362974681032297, + "loss": 0.89688003, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.34692383, + "step": 975, + "time_per_iteration": 2.618015766143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115715, + "balance_loss_mlp": 1.08102489, + "epoch": 0.1877645248172374, + "flos": 674691337728.0, + "grad_norm": 0.0743374582836454, + "language_loss": 0.87880743, + "learning_rate": 0.0009361452122771907, + "loss": 0.88996458, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.34716797, + "step": 976, + "time_per_iteration": 2.8973281383514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111997, + "balance_loss_mlp": 1.07754576, + "epoch": 0.18795690650250096, + "flos": 404771696640.0, + "grad_norm": 0.09294234225416288, + "language_loss": 0.83035111, + "learning_rate": 0.0009359927871245635, + "loss": 0.84147108, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.34472656, + "step": 977, + "time_per_iteration": 2.4868054389953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113361, + "balance_loss_mlp": 1.079983, + "epoch": 0.18814928818776452, + "flos": 637599485952.0, + "grad_norm": 0.08516170058225998, + "language_loss": 0.86584175, + "learning_rate": 0.0009358401927045246, + "loss": 0.87697542, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.33398438, + "step": 978, + "time_per_iteration": 2.8482747077941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110398, + "balance_loss_mlp": 1.07160234, + "epoch": 0.18834166987302808, + "flos": 1137825446400.0, + "grad_norm": 0.09204359799181126, + "language_loss": 0.88258326, + "learning_rate": 0.0009356874290763166, + "loss": 0.89362299, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.32373047, + "step": 979, + "time_per_iteration": 3.4733643531799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097908, + "balance_loss_mlp": 1.06529236, + "epoch": 0.18853405155829164, + "flos": 504538398720.0, + "grad_norm": 0.0915662715535259, + "language_loss": 0.88419032, + "learning_rate": 0.0009355344962992474, + "loss": 0.89516938, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.32617188, + "step": 980, + "time_per_iteration": 2.650907039642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098514, + "balance_loss_mlp": 1.06606519, + "epoch": 0.1887264332435552, + "flos": 607879987200.0, + "grad_norm": 0.13079327807375027, + "language_loss": 0.87520993, + "learning_rate": 0.0009353813944326908, + "loss": 0.88619506, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.32446289, + "step": 981, + "time_per_iteration": 2.937286138534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090956, + "balance_loss_mlp": 1.05845952, + "epoch": 0.1889188149288188, + "flos": 552264749568.0, + "grad_norm": 0.0755425770798311, + "language_loss": 0.82502437, + "learning_rate": 0.0009352281235360863, + "loss": 0.83593392, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.32495117, + "step": 982, + "time_per_iteration": 2.6979949474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096297, + "balance_loss_mlp": 1.06380093, + "epoch": 0.18911119661408235, + "flos": 418332128256.0, + "grad_norm": 0.0751009418062393, + "language_loss": 0.8470037, + "learning_rate": 0.0009350746836689389, + "loss": 0.85796672, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.32495117, + "step": 983, + "time_per_iteration": 2.538175582885742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131236, + "balance_loss_mlp": 1.10624993, + "epoch": 0.1893035782993459, + "flos": 1481141320704.0, + "grad_norm": 0.036870034223354546, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82570457, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.24902344, + "step": 984, + "time_per_iteration": 4.979044198989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097045, + "balance_loss_mlp": 1.0640955, + "epoch": 0.18949595998460947, + "flos": 508220974080.0, + "grad_norm": 0.0642225711410905, + "language_loss": 0.82250404, + "learning_rate": 0.0009347672972613634, + "loss": 0.83347452, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.3293457, + "step": 985, + "time_per_iteration": 2.593069553375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086897, + "balance_loss_mlp": 1.05339909, + "epoch": 0.18968834166987303, + "flos": 530812459008.0, + "grad_norm": 0.0802805585104316, + "language_loss": 0.85205728, + "learning_rate": 0.0009346133508402735, + "loss": 0.86292624, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.33520508, + "step": 986, + "time_per_iteration": 2.68485426902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095707, + "balance_loss_mlp": 1.06216192, + "epoch": 0.1898807233551366, + "flos": 499515807744.0, + "grad_norm": 0.09481546728284458, + "language_loss": 0.84014487, + "learning_rate": 0.0009344592356873166, + "loss": 0.85110188, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.33544922, + "step": 987, + "time_per_iteration": 2.6432511806488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105999, + "balance_loss_mlp": 1.07068968, + "epoch": 0.19007310504040015, + "flos": 601936399872.0, + "grad_norm": 0.06245857415063817, + "language_loss": 0.78166318, + "learning_rate": 0.0009343049518623255, + "loss": 0.79272318, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.35327148, + "step": 988, + "time_per_iteration": 2.7121620178222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120949, + "balance_loss_mlp": 1.085639, + "epoch": 0.1902654867256637, + "flos": 601374786048.0, + "grad_norm": 0.05952536728335112, + "language_loss": 0.83312774, + "learning_rate": 0.0009341504994251985, + "loss": 0.84433722, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.35327148, + "step": 989, + "time_per_iteration": 2.852208375930786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107971, + "balance_loss_mlp": 1.05224383, + "epoch": 0.19045786841092727, + "flos": 1574925147648.0, + "grad_norm": 0.03692041129742979, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74600208, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.27539062, + "step": 990, + "time_per_iteration": 4.994582414627075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137443, + "balance_loss_mlp": 1.09991539, + "epoch": 0.19065025009619085, + "flos": 681280906752.0, + "grad_norm": 0.056855766240422066, + "language_loss": 0.81516898, + "learning_rate": 0.0009338410889544574, + "loss": 0.82654339, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.37524414, + "step": 991, + "time_per_iteration": 3.017310380935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011468, + "balance_loss_mlp": 1.10831964, + "epoch": 0.1908426317814544, + "flos": 601971305472.0, + "grad_norm": 0.07195285392178245, + "language_loss": 0.87761319, + "learning_rate": 0.000933686131040967, + "loss": 0.88908118, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.38427734, + "step": 992, + "time_per_iteration": 2.8278160095214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144526, + "balance_loss_mlp": 1.10726154, + "epoch": 0.19103501346671797, + "flos": 586027616256.0, + "grad_norm": 0.07034922378143431, + "language_loss": 0.90235877, + "learning_rate": 0.0009335310047555883, + "loss": 0.91380405, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.37255859, + "step": 993, + "time_per_iteration": 2.8100597858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114172, + "balance_loss_mlp": 1.1050992, + "epoch": 0.19122739515198153, + "flos": 545494708224.0, + "grad_norm": 0.06860817272021875, + "language_loss": 0.88542485, + "learning_rate": 0.0009333757101585467, + "loss": 0.896842, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.36621094, + "step": 994, + "time_per_iteration": 2.6616106033325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131089, + "balance_loss_mlp": 1.0961132, + "epoch": 0.1914197768372451, + "flos": 521171739648.0, + "grad_norm": 0.0687364291234037, + "language_loss": 0.9324351, + "learning_rate": 0.0009332202473101329, + "loss": 0.94374597, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.3503418, + "step": 995, + "time_per_iteration": 2.7158730030059814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128597, + "balance_loss_mlp": 1.09400272, + "epoch": 0.19161215852250865, + "flos": 610961660928.0, + "grad_norm": 0.07471533178048465, + "language_loss": 0.82843316, + "learning_rate": 0.0009330646162707028, + "loss": 0.83971918, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.34619141, + "step": 996, + "time_per_iteration": 2.7293272018432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111164, + "balance_loss_mlp": 1.07730889, + "epoch": 0.1918045402077722, + "flos": 846281806848.0, + "grad_norm": 0.05994533952598048, + "language_loss": 0.84315574, + "learning_rate": 0.0009329088171006779, + "loss": 0.85426736, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.33886719, + "step": 997, + "time_per_iteration": 3.140655517578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110059, + "balance_loss_mlp": 1.07672858, + "epoch": 0.19199692189303577, + "flos": 465699096576.0, + "grad_norm": 0.06034276327327584, + "language_loss": 0.85438752, + "learning_rate": 0.0009327528498605446, + "loss": 0.86548805, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.33349609, + "step": 998, + "time_per_iteration": 2.5440673828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111119, + "balance_loss_mlp": 1.0778836, + "epoch": 0.19218930357829936, + "flos": 531318818304.0, + "grad_norm": 0.07596013514481052, + "language_loss": 0.89179873, + "learning_rate": 0.0009325967146108548, + "loss": 0.90290987, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.33251953, + "step": 999, + "time_per_iteration": 2.658561944961548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110413, + "balance_loss_mlp": 1.07856011, + "epoch": 0.19238168526356292, + "flos": 601350054912.0, + "grad_norm": 0.07750808981236326, + "language_loss": 0.8717553, + "learning_rate": 0.0009324404114122258, + "loss": 0.88285947, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.31835938, + "step": 1000, + "time_per_iteration": 2.7275264263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107767, + "balance_loss_mlp": 1.07667685, + "epoch": 0.19257406694882648, + "flos": 571690192896.0, + "grad_norm": 0.11937061799335263, + "language_loss": 0.86227536, + "learning_rate": 0.0009322839403253397, + "loss": 0.873353, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.31054688, + "step": 1001, + "time_per_iteration": 2.788405656814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110663, + "balance_loss_mlp": 1.0798831, + "epoch": 0.19276644863409004, + "flos": 801478568448.0, + "grad_norm": 0.07054171225662055, + "language_loss": 0.84055525, + "learning_rate": 0.0009321273014109439, + "loss": 0.85166192, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.30737305, + "step": 1002, + "time_per_iteration": 2.942535877227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110952, + "balance_loss_mlp": 1.0799818, + "epoch": 0.1929588303193536, + "flos": 563024314368.0, + "grad_norm": 0.057550289991663166, + "language_loss": 0.84200853, + "learning_rate": 0.0009319704947298513, + "loss": 0.85311806, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.30932617, + "step": 1003, + "time_per_iteration": 2.919499158859253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110832, + "balance_loss_mlp": 1.07933664, + "epoch": 0.19315121200461716, + "flos": 626550349824.0, + "grad_norm": 0.07245253176429253, + "language_loss": 0.88662004, + "learning_rate": 0.0009318135203429393, + "loss": 0.89772838, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.31469727, + "step": 1004, + "time_per_iteration": 2.7168095111846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118727, + "balance_loss_mlp": 1.08770871, + "epoch": 0.19334359368988072, + "flos": 517169069568.0, + "grad_norm": 0.17670411464250102, + "language_loss": 0.8771624, + "learning_rate": 0.0009316563783111511, + "loss": 0.88834965, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.30981445, + "step": 1005, + "time_per_iteration": 2.7140395641326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116793, + "balance_loss_mlp": 1.08501196, + "epoch": 0.19353597537514428, + "flos": 693751606272.0, + "grad_norm": 0.08689807004334223, + "language_loss": 0.81857723, + "learning_rate": 0.0009314990686954943, + "loss": 0.82974517, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.31762695, + "step": 1006, + "time_per_iteration": 2.904555320739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106968, + "balance_loss_mlp": 1.07482958, + "epoch": 0.19372835706040784, + "flos": 1209665180160.0, + "grad_norm": 0.05703714693088015, + "language_loss": 0.80953801, + "learning_rate": 0.000931341591557042, + "loss": 0.82060766, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.32128906, + "step": 1007, + "time_per_iteration": 3.6937167644500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092779, + "balance_loss_mlp": 1.06078339, + "epoch": 0.19392073874567142, + "flos": 520368606720.0, + "grad_norm": 0.08309123344760973, + "language_loss": 0.87180555, + "learning_rate": 0.0009311839469569325, + "loss": 0.88273335, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.31982422, + "step": 1008, + "time_per_iteration": 2.6189959049224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099869, + "balance_loss_mlp": 1.06746829, + "epoch": 0.19411312043093498, + "flos": 588543293952.0, + "grad_norm": 0.10100018073420348, + "language_loss": 0.8730033, + "learning_rate": 0.0009310261349563687, + "loss": 0.88400197, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.32397461, + "step": 1009, + "time_per_iteration": 2.6890206336975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108302, + "balance_loss_mlp": 1.07516217, + "epoch": 0.19430550211619854, + "flos": 579085867008.0, + "grad_norm": 0.08933629042911205, + "language_loss": 0.85340321, + "learning_rate": 0.0009308681556166186, + "loss": 0.86448622, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.33154297, + "step": 1010, + "time_per_iteration": 2.824448585510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098531, + "balance_loss_mlp": 1.06546259, + "epoch": 0.1944978838014621, + "flos": 620848281600.0, + "grad_norm": 0.16096270434238172, + "language_loss": 0.87149101, + "learning_rate": 0.0009307100089990152, + "loss": 0.88247633, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.33081055, + "step": 1011, + "time_per_iteration": 2.74092173576355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105966, + "balance_loss_mlp": 1.07070398, + "epoch": 0.19469026548672566, + "flos": 598440089088.0, + "grad_norm": 0.08074644620093238, + "language_loss": 0.83646113, + "learning_rate": 0.0009305516951649568, + "loss": 0.84752083, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.3527832, + "step": 1012, + "time_per_iteration": 2.7069194316864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110286, + "balance_loss_mlp": 1.06726432, + "epoch": 0.19488264717198922, + "flos": 551890810368.0, + "grad_norm": 0.06954368088501534, + "language_loss": 0.86469871, + "learning_rate": 0.0009303932141759057, + "loss": 0.8757273, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.35595703, + "step": 1013, + "time_per_iteration": 2.7547597885131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109212, + "balance_loss_mlp": 1.07352042, + "epoch": 0.19507502885725278, + "flos": 665842166784.0, + "grad_norm": 0.08663105683367789, + "language_loss": 0.83731425, + "learning_rate": 0.0009302345660933902, + "loss": 0.84840637, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.35742188, + "step": 1014, + "time_per_iteration": 2.789421319961548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120247, + "balance_loss_mlp": 1.0850327, + "epoch": 0.19526741054251634, + "flos": 670771625472.0, + "grad_norm": 0.07248055996229082, + "language_loss": 0.85224003, + "learning_rate": 0.0009300757509790026, + "loss": 0.86344242, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.35229492, + "step": 1015, + "time_per_iteration": 2.8293235301971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138983, + "balance_loss_mlp": 1.10412574, + "epoch": 0.19545979222777993, + "flos": 446983653888.0, + "grad_norm": 0.08486300836715333, + "language_loss": 0.90133542, + "learning_rate": 0.0009299167688944005, + "loss": 0.91272521, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.34912109, + "step": 1016, + "time_per_iteration": 2.5042884349823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130604, + "balance_loss_mlp": 1.09453082, + "epoch": 0.1956521739130435, + "flos": 568813722624.0, + "grad_norm": 0.08182270058547457, + "language_loss": 0.86074531, + "learning_rate": 0.0009297576199013063, + "loss": 0.87205136, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.36108398, + "step": 1017, + "time_per_iteration": 2.678986072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01400492, + "balance_loss_mlp": 1.36921108, + "epoch": 0.19584455559830705, + "flos": 1454969157120.0, + "grad_norm": 0.11724614930420041, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74402618, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.3125, + "step": 1018, + "time_per_iteration": 4.915104627609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214569, + "balance_loss_mlp": 1.18538666, + "epoch": 0.1960369372835706, + "flos": 1590320369664.0, + "grad_norm": 0.08011150215373515, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.8064087, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.29101562, + "step": 1019, + "time_per_iteration": 5.440853834152222 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100562, + "balance_loss_mlp": 1.06565762, + "epoch": 0.19622931896883417, + "flos": 615709237248.0, + "grad_norm": 0.05949147024105531, + "language_loss": 0.86637676, + "learning_rate": 0.0009292791720892659, + "loss": 0.8773824, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.34960938, + "step": 1020, + "time_per_iteration": 2.8909873962402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110075, + "balance_loss_mlp": 1.06534433, + "epoch": 0.19642170065409773, + "flos": 465950790144.0, + "grad_norm": 0.08017401986968183, + "language_loss": 0.8851831, + "learning_rate": 0.0009291193560807218, + "loss": 0.89619064, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.35424805, + "step": 1021, + "time_per_iteration": 2.5876846313476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108676, + "balance_loss_mlp": 1.07329464, + "epoch": 0.19661408233936128, + "flos": 515040477696.0, + "grad_norm": 0.061421548763730266, + "language_loss": 0.86832839, + "learning_rate": 0.0009289593734732688, + "loss": 0.87941515, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.35400391, + "step": 1022, + "time_per_iteration": 2.5790083408355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116018, + "balance_loss_mlp": 1.08097017, + "epoch": 0.19680646402462484, + "flos": 392427624960.0, + "grad_norm": 0.06446420344630455, + "language_loss": 0.93862659, + "learning_rate": 0.0009287992243290175, + "loss": 0.94978678, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.35083008, + "step": 1023, + "time_per_iteration": 2.474393844604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126064, + "balance_loss_mlp": 1.09120703, + "epoch": 0.19699884570988843, + "flos": 626122566144.0, + "grad_norm": 0.06850198630338038, + "language_loss": 0.90312016, + "learning_rate": 0.0009286389087101435, + "loss": 0.91438079, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.34887695, + "step": 1024, + "time_per_iteration": 2.835756540298462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143856, + "balance_loss_mlp": 1.10885596, + "epoch": 0.197191227395152, + "flos": 557710742016.0, + "grad_norm": 0.06824019021489727, + "language_loss": 0.88388735, + "learning_rate": 0.0009284784266788864, + "loss": 0.8953259, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.3503418, + "step": 1025, + "time_per_iteration": 2.702479839324951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144786, + "balance_loss_mlp": 1.11033428, + "epoch": 0.19738360908041555, + "flos": 664681061376.0, + "grad_norm": 0.08832519553576638, + "language_loss": 0.92221844, + "learning_rate": 0.0009283177782975512, + "loss": 0.93366635, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.34472656, + "step": 1026, + "time_per_iteration": 2.9851789474487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132889, + "balance_loss_mlp": 1.09850955, + "epoch": 0.1975759907656791, + "flos": 522244094976.0, + "grad_norm": 0.07134152927872167, + "language_loss": 0.87642545, + "learning_rate": 0.000928156963628507, + "loss": 0.88775432, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.34423828, + "step": 1027, + "time_per_iteration": 2.61114239692688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131535, + "balance_loss_mlp": 1.09686899, + "epoch": 0.19776837245094267, + "flos": 462233309184.0, + "grad_norm": 0.0723355054215018, + "language_loss": 0.88370252, + "learning_rate": 0.0009279959827341877, + "loss": 0.8950178, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.34692383, + "step": 1028, + "time_per_iteration": 2.7794618606567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118297, + "balance_loss_mlp": 1.08248627, + "epoch": 0.19796075413620623, + "flos": 502809887232.0, + "grad_norm": 0.08314527790784168, + "language_loss": 0.87832725, + "learning_rate": 0.0009278348356770915, + "loss": 0.88951027, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.3581543, + "step": 1029, + "time_per_iteration": 2.5507349967956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111156, + "balance_loss_mlp": 1.07565451, + "epoch": 0.1981531358214698, + "flos": 507281038848.0, + "grad_norm": 0.08630189211983, + "language_loss": 0.85379845, + "learning_rate": 0.0009276735225197814, + "loss": 0.864914, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.359375, + "step": 1030, + "time_per_iteration": 2.597379207611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102081, + "balance_loss_mlp": 1.06650949, + "epoch": 0.19834551750673335, + "flos": 531275148288.0, + "grad_norm": 0.0907652175310469, + "language_loss": 0.85545719, + "learning_rate": 0.0009275120433248847, + "loss": 0.86647797, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.35571289, + "step": 1031, + "time_per_iteration": 2.687185287475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110602, + "balance_loss_mlp": 1.07545948, + "epoch": 0.1985378991919969, + "flos": 775147691520.0, + "grad_norm": 0.07461022440082729, + "language_loss": 0.85621846, + "learning_rate": 0.0009273503981550931, + "loss": 0.86732447, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.35205078, + "step": 1032, + "time_per_iteration": 3.0693371295928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101575, + "balance_loss_mlp": 1.06586027, + "epoch": 0.1987302808772605, + "flos": 434063411712.0, + "grad_norm": 0.15106160662845974, + "language_loss": 0.86904788, + "learning_rate": 0.0009271885870731626, + "loss": 0.88006359, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.35717773, + "step": 1033, + "time_per_iteration": 2.506413459777832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111392, + "balance_loss_mlp": 1.07536733, + "epoch": 0.19892266256252406, + "flos": 553342897152.0, + "grad_norm": 0.08761306204685197, + "language_loss": 0.88616383, + "learning_rate": 0.0009270266101419143, + "loss": 0.89727777, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.36035156, + "step": 1034, + "time_per_iteration": 2.6036899089813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098535, + "balance_loss_mlp": 1.06425047, + "epoch": 0.19911504424778761, + "flos": 549596302848.0, + "grad_norm": 0.06384965023316368, + "language_loss": 0.84987146, + "learning_rate": 0.0009268644674242328, + "loss": 0.86085683, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.34301758, + "step": 1035, + "time_per_iteration": 2.7015764713287354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113397, + "balance_loss_mlp": 1.07806361, + "epoch": 0.19930742593305117, + "flos": 518024636928.0, + "grad_norm": 0.07882877348480413, + "language_loss": 0.80515361, + "learning_rate": 0.0009267021589830678, + "loss": 0.81628758, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.35327148, + "step": 1036, + "time_per_iteration": 2.643951892852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01709033, + "balance_loss_mlp": 1.66611803, + "epoch": 0.19949980761831473, + "flos": 1508516849664.0, + "grad_norm": 0.11391778300632174, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.79336113, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.4296875, + "step": 1037, + "time_per_iteration": 4.949443101882935 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103323, + "balance_loss_mlp": 1.0683465, + "epoch": 0.1996921893035783, + "flos": 697803738624.0, + "grad_norm": 0.08774205983796875, + "language_loss": 0.92838657, + "learning_rate": 0.000926377045182406, + "loss": 0.93941981, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.35009766, + "step": 1038, + "time_per_iteration": 2.9512856006622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112905, + "balance_loss_mlp": 1.07821524, + "epoch": 0.19988457098884185, + "flos": 726682226688.0, + "grad_norm": 0.06255968137292814, + "language_loss": 0.87761998, + "learning_rate": 0.0009262142399491296, + "loss": 0.888749, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.34716797, + "step": 1039, + "time_per_iteration": 3.0552709102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112668, + "balance_loss_mlp": 1.09187126, + "epoch": 0.2000769526741054, + "flos": 560275881984.0, + "grad_norm": 0.06862779420362043, + "language_loss": 0.87532222, + "learning_rate": 0.0009260512692448105, + "loss": 0.88658899, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.34863281, + "step": 1040, + "time_per_iteration": 2.6962392330169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140867, + "balance_loss_mlp": 1.10615349, + "epoch": 0.200269334359369, + "flos": 571758594048.0, + "grad_norm": 0.07166596959521815, + "language_loss": 0.84091032, + "learning_rate": 0.000925888133132719, + "loss": 0.852319, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.34741211, + "step": 1041, + "time_per_iteration": 2.791015148162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01724521, + "balance_loss_mlp": 1.67225933, + "epoch": 0.20046171604463256, + "flos": 1485362340864.0, + "grad_norm": 0.16089622263247963, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.8133496, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.5234375, + "step": 1042, + "time_per_iteration": 4.978717565536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116458, + "balance_loss_mlp": 1.08169639, + "epoch": 0.20065409772989612, + "flos": 496266808320.0, + "grad_norm": 0.06766738281342395, + "language_loss": 0.80769098, + "learning_rate": 0.0009255613649386244, + "loss": 0.81885552, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.34790039, + "step": 1043, + "time_per_iteration": 2.6604766845703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122505, + "balance_loss_mlp": 1.08709943, + "epoch": 0.20084647941515968, + "flos": 579094631424.0, + "grad_norm": 0.07361728486384381, + "language_loss": 0.78999138, + "learning_rate": 0.0009253977329834838, + "loss": 0.80121642, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.35449219, + "step": 1044, + "time_per_iteration": 2.7036681175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108398, + "balance_loss_mlp": 1.07227719, + "epoch": 0.20103886110042324, + "flos": 641775273984.0, + "grad_norm": 0.08623717161971375, + "language_loss": 0.86596096, + "learning_rate": 0.0009252339358742965, + "loss": 0.87704492, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.36108398, + "step": 1045, + "time_per_iteration": 2.874620199203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118791, + "balance_loss_mlp": 1.08369565, + "epoch": 0.2012312427856868, + "flos": 441720953856.0, + "grad_norm": 0.06963930913543727, + "language_loss": 0.82984746, + "learning_rate": 0.000925069973674654, + "loss": 0.84103537, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.35107422, + "step": 1046, + "time_per_iteration": 2.628878116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106078, + "balance_loss_mlp": 1.07017231, + "epoch": 0.20142362447095036, + "flos": 554135855616.0, + "grad_norm": 0.07870556033127275, + "language_loss": 0.88610631, + "learning_rate": 0.000924905846448212, + "loss": 0.89716709, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.35913086, + "step": 1047, + "time_per_iteration": 2.747220754623413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109682, + "balance_loss_mlp": 1.0750165, + "epoch": 0.20161600615621392, + "flos": 669988841472.0, + "grad_norm": 0.10747792176710873, + "language_loss": 0.85372317, + "learning_rate": 0.0009247415542586906, + "loss": 0.86482, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.34667969, + "step": 1048, + "time_per_iteration": 2.8556973934173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119311, + "balance_loss_mlp": 1.08285666, + "epoch": 0.2018083878414775, + "flos": 572788689408.0, + "grad_norm": 0.2214820598260846, + "language_loss": 0.83177209, + "learning_rate": 0.0009245770971698735, + "loss": 0.84296525, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.36450195, + "step": 1049, + "time_per_iteration": 2.9050869941711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132964, + "balance_loss_mlp": 1.09798741, + "epoch": 0.20200076952674106, + "flos": 425624495616.0, + "grad_norm": 0.08175342307012821, + "language_loss": 0.88327754, + "learning_rate": 0.0009244124752456087, + "loss": 0.89460719, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.34985352, + "step": 1050, + "time_per_iteration": 2.5141613483428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151097, + "balance_loss_mlp": 1.11557305, + "epoch": 0.20219315121200462, + "flos": 536326852608.0, + "grad_norm": 0.06393011823673703, + "language_loss": 0.85371649, + "learning_rate": 0.0009242476885498081, + "loss": 0.86522746, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.35522461, + "step": 1051, + "time_per_iteration": 2.727687358856201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176333, + "balance_loss_mlp": 1.14171457, + "epoch": 0.20238553289726818, + "flos": 477634323456.0, + "grad_norm": 0.09914193731013146, + "language_loss": 0.80802011, + "learning_rate": 0.0009240827371464474, + "loss": 0.81978351, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.34643555, + "step": 1052, + "time_per_iteration": 2.552121877670288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191475, + "balance_loss_mlp": 1.15521157, + "epoch": 0.20257791458253174, + "flos": 1151611430400.0, + "grad_norm": 0.1023503287046967, + "language_loss": 0.83863074, + "learning_rate": 0.0009239176210995666, + "loss": 0.85054547, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.36230469, + "step": 1053, + "time_per_iteration": 3.47882342338562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190284, + "balance_loss_mlp": 1.15561819, + "epoch": 0.2027702962677953, + "flos": 666606011904.0, + "grad_norm": 0.09115683042396579, + "language_loss": 0.93677175, + "learning_rate": 0.0009237523404732695, + "loss": 0.94867456, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.34692383, + "step": 1054, + "time_per_iteration": 2.8701720237731934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173476, + "balance_loss_mlp": 1.13838029, + "epoch": 0.20296267795305886, + "flos": 641011428864.0, + "grad_norm": 0.10782024136876088, + "language_loss": 0.8421399, + "learning_rate": 0.0009235868953317235, + "loss": 0.85387468, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.3515625, + "step": 1055, + "time_per_iteration": 2.8210723400115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161281, + "balance_loss_mlp": 1.12682986, + "epoch": 0.20315505963832242, + "flos": 930187777536.0, + "grad_norm": 0.07346272336072437, + "language_loss": 0.85227096, + "learning_rate": 0.0009234212857391602, + "loss": 0.86388373, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.3449707, + "step": 1056, + "time_per_iteration": 3.2212936878204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153084, + "balance_loss_mlp": 1.11727369, + "epoch": 0.20334744132358598, + "flos": 561818128896.0, + "grad_norm": 0.054845505201833546, + "language_loss": 0.89240777, + "learning_rate": 0.000923255511759875, + "loss": 0.90393853, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.3581543, + "step": 1057, + "time_per_iteration": 2.834444522857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156175, + "balance_loss_mlp": 1.12146115, + "epoch": 0.20353982300884957, + "flos": 643902455808.0, + "grad_norm": 0.10969304378799022, + "language_loss": 0.84913409, + "learning_rate": 0.000923089573458227, + "loss": 0.86069584, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.34716797, + "step": 1058, + "time_per_iteration": 2.8832740783691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115137, + "balance_loss_mlp": 1.1168946, + "epoch": 0.20373220469411313, + "flos": 651101690880.0, + "grad_norm": 0.24205150411640483, + "language_loss": 0.83790255, + "learning_rate": 0.0009229234708986392, + "loss": 0.84941626, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.3449707, + "step": 1059, + "time_per_iteration": 2.8837289810180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01633401, + "balance_loss_mlp": 1.57885134, + "epoch": 0.2039245863793767, + "flos": 1436939136000.0, + "grad_norm": 0.08953482343612705, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.83300292, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.546875, + "step": 1060, + "time_per_iteration": 4.667459011077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158699, + "balance_loss_mlp": 1.1247009, + "epoch": 0.20411696806464025, + "flos": 596678082048.0, + "grad_norm": 0.0736942782322193, + "language_loss": 0.84963936, + "learning_rate": 0.0009225907732636548, + "loss": 0.86122632, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.34033203, + "step": 1061, + "time_per_iteration": 2.7532095909118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164545, + "balance_loss_mlp": 1.12954497, + "epoch": 0.2043093497499038, + "flos": 573530775552.0, + "grad_norm": 0.09512005659435491, + "language_loss": 0.8641578, + "learning_rate": 0.0009224241783174227, + "loss": 0.87580323, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.35009766, + "step": 1062, + "time_per_iteration": 2.683047294616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147761, + "balance_loss_mlp": 1.11347604, + "epoch": 0.20450173143516737, + "flos": 630061217280.0, + "grad_norm": 0.07955707081408017, + "language_loss": 0.85456479, + "learning_rate": 0.0009222574193715802, + "loss": 0.86604244, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.34326172, + "step": 1063, + "time_per_iteration": 2.8293161392211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139319, + "balance_loss_mlp": 1.10474837, + "epoch": 0.20469411312043093, + "flos": 573718450176.0, + "grad_norm": 0.08617592440024102, + "language_loss": 0.85715151, + "learning_rate": 0.000922090496490869, + "loss": 0.8685447, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.34619141, + "step": 1064, + "time_per_iteration": 2.749298334121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124198, + "balance_loss_mlp": 1.08865011, + "epoch": 0.20488649480569449, + "flos": 636748300800.0, + "grad_norm": 0.06572729358097257, + "language_loss": 0.89767212, + "learning_rate": 0.0009219234097400937, + "loss": 0.90891409, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.35595703, + "step": 1065, + "time_per_iteration": 2.8508355617523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107138, + "balance_loss_mlp": 1.07175696, + "epoch": 0.20507887649095807, + "flos": 975383894016.0, + "grad_norm": 0.05918330788086957, + "language_loss": 0.82970631, + "learning_rate": 0.0009217561591841237, + "loss": 0.8407777, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.35400391, + "step": 1066, + "time_per_iteration": 3.3216452598571777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102073, + "balance_loss_mlp": 1.06566656, + "epoch": 0.20527125817622163, + "flos": 485940819456.0, + "grad_norm": 0.09526156176010836, + "language_loss": 0.81088316, + "learning_rate": 0.0009215887448878913, + "loss": 0.82190394, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.36401367, + "step": 1067, + "time_per_iteration": 2.596022129058838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099777, + "balance_loss_mlp": 1.06191611, + "epoch": 0.2054636398614852, + "flos": 526921860096.0, + "grad_norm": 0.072135210200994, + "language_loss": 0.84963661, + "learning_rate": 0.0009214211669163922, + "loss": 0.86063439, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.37841797, + "step": 1068, + "time_per_iteration": 4.440082311630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096187, + "balance_loss_mlp": 1.05923223, + "epoch": 0.20565602154674875, + "flos": 557898416640.0, + "grad_norm": 0.07010547570027807, + "language_loss": 0.93398243, + "learning_rate": 0.0009212534253346862, + "loss": 0.94494426, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.36938477, + "step": 1069, + "time_per_iteration": 2.699843406677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096083, + "balance_loss_mlp": 1.05912852, + "epoch": 0.2058484032320123, + "flos": 503976784896.0, + "grad_norm": 0.07799270520419531, + "language_loss": 0.83685625, + "learning_rate": 0.0009210855202078964, + "loss": 0.84781706, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.36962891, + "step": 1070, + "time_per_iteration": 2.5999720096588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010932, + "balance_loss_mlp": 1.05810475, + "epoch": 0.20604078491727587, + "flos": 432950358528.0, + "grad_norm": 0.0723710550133871, + "language_loss": 0.86933672, + "learning_rate": 0.0009209174516012091, + "loss": 0.88026869, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.35131836, + "step": 1071, + "time_per_iteration": 2.503551483154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092567, + "balance_loss_mlp": 1.05794883, + "epoch": 0.20623316660253943, + "flos": 608421252096.0, + "grad_norm": 0.05962541016594441, + "language_loss": 0.88928151, + "learning_rate": 0.0009207492195798747, + "loss": 0.90020716, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.34667969, + "step": 1072, + "time_per_iteration": 2.8607378005981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094226, + "balance_loss_mlp": 1.05972731, + "epoch": 0.206425548287803, + "flos": 480184906752.0, + "grad_norm": 0.06398863953592046, + "language_loss": 0.84846818, + "learning_rate": 0.0009205808242092061, + "loss": 0.85941041, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.34521484, + "step": 1073, + "time_per_iteration": 2.644134044647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095063, + "balance_loss_mlp": 1.06080186, + "epoch": 0.20661792997306658, + "flos": 949007937024.0, + "grad_norm": 0.06666861242543158, + "language_loss": 0.82488537, + "learning_rate": 0.0009204122655545808, + "loss": 0.83583593, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.34277344, + "step": 1074, + "time_per_iteration": 3.3254919052124023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110011, + "balance_loss_mlp": 1.07582152, + "epoch": 0.20681031165833014, + "flos": 603206604288.0, + "grad_norm": 0.0719401545163873, + "language_loss": 0.81125832, + "learning_rate": 0.0009202435436814388, + "loss": 0.82235849, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.34228516, + "step": 1075, + "time_per_iteration": 2.704252243041992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105303, + "balance_loss_mlp": 1.0707798, + "epoch": 0.2070026933435937, + "flos": 708665200128.0, + "grad_norm": 0.06775779875999222, + "language_loss": 0.89715004, + "learning_rate": 0.0009200746586552836, + "loss": 0.90820301, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.34545898, + "step": 1076, + "time_per_iteration": 2.897177219390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102531, + "balance_loss_mlp": 1.06869972, + "epoch": 0.20719507502885726, + "flos": 829456409088.0, + "grad_norm": 0.12065235325240355, + "language_loss": 0.83624744, + "learning_rate": 0.0009199056105416825, + "loss": 0.84727275, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.33862305, + "step": 1077, + "time_per_iteration": 3.0771028995513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106086, + "balance_loss_mlp": 1.07218289, + "epoch": 0.20738745671412082, + "flos": 637993774080.0, + "grad_norm": 0.06486814220319007, + "language_loss": 0.8622663, + "learning_rate": 0.0009197363994062654, + "loss": 0.8733272, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.33935547, + "step": 1078, + "time_per_iteration": 2.807009696960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112785, + "balance_loss_mlp": 1.07914448, + "epoch": 0.20757983839938438, + "flos": 685258845696.0, + "grad_norm": 0.06985523034062016, + "language_loss": 0.84313667, + "learning_rate": 0.0009195670253147262, + "loss": 0.85426456, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.33642578, + "step": 1079, + "time_per_iteration": 2.9738564491271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114515, + "balance_loss_mlp": 1.0817802, + "epoch": 0.20777222008464794, + "flos": 519024208896.0, + "grad_norm": 0.09202653272357895, + "language_loss": 0.81912923, + "learning_rate": 0.0009193974883328216, + "loss": 0.8302744, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.32739258, + "step": 1080, + "time_per_iteration": 2.639878511428833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121501, + "balance_loss_mlp": 1.08721614, + "epoch": 0.2079646017699115, + "flos": 511136732160.0, + "grad_norm": 0.059797822691547486, + "language_loss": 0.86745334, + "learning_rate": 0.0009192277885263718, + "loss": 0.87866837, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.34326172, + "step": 1081, + "time_per_iteration": 4.060026407241821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120473, + "balance_loss_mlp": 1.08671248, + "epoch": 0.20815698345517505, + "flos": 931409929728.0, + "grad_norm": 0.0682125291941454, + "language_loss": 0.86169523, + "learning_rate": 0.0009190579259612602, + "loss": 0.87289995, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.33789062, + "step": 1082, + "time_per_iteration": 3.2795815467834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134326, + "balance_loss_mlp": 1.10132933, + "epoch": 0.20834936514043864, + "flos": 632114205696.0, + "grad_norm": 0.06852391956291448, + "language_loss": 0.86675245, + "learning_rate": 0.000918887900703433, + "loss": 0.87809569, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.33007812, + "step": 1083, + "time_per_iteration": 2.813777208328247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137242, + "balance_loss_mlp": 1.1025995, + "epoch": 0.2085417468257022, + "flos": 394170693120.0, + "grad_norm": 0.07184608102087402, + "language_loss": 0.90139276, + "learning_rate": 0.0009187177128188999, + "loss": 0.91276515, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.34667969, + "step": 1084, + "time_per_iteration": 2.4950854778289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01361857, + "balance_loss_mlp": 1.30883229, + "epoch": 0.20873412851096576, + "flos": 1401387969024.0, + "grad_norm": 0.057507491560350586, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78518397, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.53125, + "step": 1085, + "time_per_iteration": 4.9323132038116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116843, + "balance_loss_mlp": 1.08279717, + "epoch": 0.20892651019622932, + "flos": 447599112192.0, + "grad_norm": 0.0734883897044225, + "language_loss": 0.85634506, + "learning_rate": 0.000918376849434071, + "loss": 0.86751348, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.34057617, + "step": 1086, + "time_per_iteration": 2.504467487335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110856, + "balance_loss_mlp": 1.07680964, + "epoch": 0.20911889188149288, + "flos": 492863629824.0, + "grad_norm": 0.07305298195252904, + "language_loss": 0.90630972, + "learning_rate": 0.0009182061740661098, + "loss": 0.91741836, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.34057617, + "step": 1087, + "time_per_iteration": 2.5760254859924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111346, + "balance_loss_mlp": 1.0785315, + "epoch": 0.20931127356675644, + "flos": 840928946688.0, + "grad_norm": 0.05349746945174757, + "language_loss": 0.84760422, + "learning_rate": 0.0009180353363361127, + "loss": 0.85873878, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.34912109, + "step": 1088, + "time_per_iteration": 3.0988333225250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111767, + "balance_loss_mlp": 1.07593286, + "epoch": 0.20950365525202, + "flos": 756796013568.0, + "grad_norm": 0.0658577902216117, + "language_loss": 0.81715566, + "learning_rate": 0.0009178643363104044, + "loss": 0.82827336, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.35864258, + "step": 1089, + "time_per_iteration": 3.1410629749298096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106557, + "balance_loss_mlp": 1.07155704, + "epoch": 0.20969603693728356, + "flos": 472301812224.0, + "grad_norm": 0.10460691940838339, + "language_loss": 0.90569937, + "learning_rate": 0.0009176931740553735, + "loss": 0.91676497, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.35009766, + "step": 1090, + "time_per_iteration": 2.529330253601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112911, + "balance_loss_mlp": 1.07698107, + "epoch": 0.20988841862254715, + "flos": 976507121664.0, + "grad_norm": 0.07113631656774884, + "language_loss": 0.82557011, + "learning_rate": 0.0009175218496374708, + "loss": 0.83669925, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.359375, + "step": 1091, + "time_per_iteration": 3.347742795944214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110472, + "balance_loss_mlp": 1.07356465, + "epoch": 0.2100808003078107, + "flos": 1092697731072.0, + "grad_norm": 0.08284412758413852, + "language_loss": 0.85813856, + "learning_rate": 0.0009173503631232103, + "loss": 0.86924326, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.36914062, + "step": 1092, + "time_per_iteration": 3.378859758377075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103445, + "balance_loss_mlp": 1.06684804, + "epoch": 0.21027318199307427, + "flos": 1012567468032.0, + "grad_norm": 0.09413161778101656, + "language_loss": 0.81595004, + "learning_rate": 0.0009171787145791691, + "loss": 0.82698447, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.36621094, + "step": 1093, + "time_per_iteration": 3.215574026107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099602, + "balance_loss_mlp": 1.06214595, + "epoch": 0.21046556367833782, + "flos": 521141216256.0, + "grad_norm": 0.0806437411167059, + "language_loss": 0.80327773, + "learning_rate": 0.000917006904071987, + "loss": 0.81427377, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.37451172, + "step": 1094, + "time_per_iteration": 2.6117537021636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100105, + "balance_loss_mlp": 1.06377053, + "epoch": 0.21065794536360138, + "flos": 603437948928.0, + "grad_norm": 0.08991830585001004, + "language_loss": 0.87576157, + "learning_rate": 0.0009168349316683669, + "loss": 0.88676262, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.36352539, + "step": 1095, + "time_per_iteration": 2.740950345993042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101819, + "balance_loss_mlp": 1.06650949, + "epoch": 0.21085032704886494, + "flos": 603045070848.0, + "grad_norm": 0.06267137937039592, + "language_loss": 0.8218863, + "learning_rate": 0.0009166627974350741, + "loss": 0.83290446, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.35327148, + "step": 1096, + "time_per_iteration": 2.887326240539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098665, + "balance_loss_mlp": 1.06206763, + "epoch": 0.2110427087341285, + "flos": 637382697984.0, + "grad_norm": 0.07019696164219995, + "language_loss": 0.89238816, + "learning_rate": 0.0009164905014389373, + "loss": 0.90337479, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.3659668, + "step": 1097, + "time_per_iteration": 2.7609455585479736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105326, + "balance_loss_mlp": 1.06908655, + "epoch": 0.21123509041939206, + "flos": 522667496448.0, + "grad_norm": 0.06528725154368942, + "language_loss": 0.8638711, + "learning_rate": 0.0009163180437468476, + "loss": 0.87492442, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.36254883, + "step": 1098, + "time_per_iteration": 2.5998973846435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096402, + "balance_loss_mlp": 1.06009042, + "epoch": 0.21142747210465565, + "flos": 450938271744.0, + "grad_norm": 0.06547964129234486, + "language_loss": 0.85908926, + "learning_rate": 0.000916145424425759, + "loss": 0.87005323, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.36303711, + "step": 1099, + "time_per_iteration": 2.6804425716400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101989, + "balance_loss_mlp": 1.06601155, + "epoch": 0.2116198537899192, + "flos": 875813630976.0, + "grad_norm": 0.08063804967749887, + "language_loss": 0.90475744, + "learning_rate": 0.0009159726435426885, + "loss": 0.91577733, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.35986328, + "step": 1100, + "time_per_iteration": 3.1017394065856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100921, + "balance_loss_mlp": 1.06499124, + "epoch": 0.21181223547518277, + "flos": 523410992640.0, + "grad_norm": 0.08023517310436831, + "language_loss": 0.90250683, + "learning_rate": 0.0009157997011647154, + "loss": 0.9135161, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.359375, + "step": 1101, + "time_per_iteration": 2.5878560543060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096622, + "balance_loss_mlp": 1.06045425, + "epoch": 0.21200461716044633, + "flos": 572014669824.0, + "grad_norm": 0.05508329212621071, + "language_loss": 0.86001104, + "learning_rate": 0.0009156265973589817, + "loss": 0.87097728, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.36206055, + "step": 1102, + "time_per_iteration": 2.7933261394500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097006, + "balance_loss_mlp": 1.06121981, + "epoch": 0.2121969988457099, + "flos": 544869075456.0, + "grad_norm": 0.06583201442001711, + "language_loss": 0.89802408, + "learning_rate": 0.0009154533321926926, + "loss": 0.90899414, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.35791016, + "step": 1103, + "time_per_iteration": 2.647494316101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096343, + "balance_loss_mlp": 1.0598892, + "epoch": 0.21238938053097345, + "flos": 843489704448.0, + "grad_norm": 0.06603869229078199, + "language_loss": 0.87027407, + "learning_rate": 0.0009152799057331156, + "loss": 0.88123751, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.36499023, + "step": 1104, + "time_per_iteration": 3.1623916625976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097231, + "balance_loss_mlp": 1.06134939, + "epoch": 0.212581762216237, + "flos": 445984081920.0, + "grad_norm": 0.07161611233178561, + "language_loss": 0.90831178, + "learning_rate": 0.0009151063180475805, + "loss": 0.91928405, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.35913086, + "step": 1105, + "time_per_iteration": 2.5515594482421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099591, + "balance_loss_mlp": 1.06516361, + "epoch": 0.21277414390150057, + "flos": 514129655808.0, + "grad_norm": 0.08899576142412509, + "language_loss": 0.83941323, + "learning_rate": 0.0009149325692034803, + "loss": 0.85040915, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.34472656, + "step": 1106, + "time_per_iteration": 2.561875343322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300575, + "balance_loss_mlp": 1.25708735, + "epoch": 0.21296652558676413, + "flos": 1484790552576.0, + "grad_norm": 0.05662804479307553, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80504, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.43554688, + "step": 1107, + "time_per_iteration": 4.880220174789429 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104038, + "balance_loss_mlp": 1.06870413, + "epoch": 0.21315890727202771, + "flos": 845689669632.0, + "grad_norm": 0.06711298172071122, + "language_loss": 0.87037283, + "learning_rate": 0.0009145845883094678, + "loss": 0.88141322, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.35375977, + "step": 1108, + "time_per_iteration": 3.0598409175872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105103, + "balance_loss_mlp": 1.06931639, + "epoch": 0.21335128895729127, + "flos": 629086376448.0, + "grad_norm": 0.06803775359788228, + "language_loss": 0.8464098, + "learning_rate": 0.000914410356394654, + "loss": 0.85746086, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.35839844, + "step": 1109, + "time_per_iteration": 2.776258945465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103906, + "balance_loss_mlp": 1.06799972, + "epoch": 0.21354367064255483, + "flos": 710649787392.0, + "grad_norm": 0.052025780444459935, + "language_loss": 0.84733951, + "learning_rate": 0.0009142359635914709, + "loss": 0.85837853, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.35913086, + "step": 1110, + "time_per_iteration": 3.057307243347168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096278, + "balance_loss_mlp": 1.05996692, + "epoch": 0.2137360523278184, + "flos": 455950688256.0, + "grad_norm": 0.10914443694781037, + "language_loss": 0.84286684, + "learning_rate": 0.0009140614099676245, + "loss": 0.85382962, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.36328125, + "step": 1111, + "time_per_iteration": 2.6110692024230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087884, + "balance_loss_mlp": 1.0517633, + "epoch": 0.21392843401308195, + "flos": 665749034496.0, + "grad_norm": 0.09545242357915729, + "language_loss": 0.82540983, + "learning_rate": 0.0009138866955908821, + "loss": 0.83628869, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.36132812, + "step": 1112, + "time_per_iteration": 2.870765209197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100291, + "balance_loss_mlp": 1.06445658, + "epoch": 0.2141208156983455, + "flos": 748656843264.0, + "grad_norm": 0.06321568237144509, + "language_loss": 0.8048408, + "learning_rate": 0.0009137118205290738, + "loss": 0.8158437, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.35864258, + "step": 1113, + "time_per_iteration": 4.381570100784302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097039, + "balance_loss_mlp": 1.06091869, + "epoch": 0.21431319738360907, + "flos": 418898124288.0, + "grad_norm": 0.06328361159326604, + "language_loss": 0.89779603, + "learning_rate": 0.0009135367848500924, + "loss": 0.90876651, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.36157227, + "step": 1114, + "time_per_iteration": 2.511164665222168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096629, + "balance_loss_mlp": 1.06034184, + "epoch": 0.21450557906887263, + "flos": 608849035776.0, + "grad_norm": 0.08987717155463379, + "language_loss": 0.86417669, + "learning_rate": 0.0009133615886218927, + "loss": 0.87514299, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.36303711, + "step": 1115, + "time_per_iteration": 2.7101125717163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089806, + "balance_loss_mlp": 1.05337584, + "epoch": 0.21469796075413622, + "flos": 561649393152.0, + "grad_norm": 0.07119429557645003, + "language_loss": 0.87869287, + "learning_rate": 0.0009131862319124917, + "loss": 0.88959092, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.36425781, + "step": 1116, + "time_per_iteration": 2.6387155055999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092992, + "balance_loss_mlp": 1.05648971, + "epoch": 0.21489034243939978, + "flos": 594363225600.0, + "grad_norm": 0.06965010238630005, + "language_loss": 0.83447617, + "learning_rate": 0.0009130107147899691, + "loss": 0.84540606, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.36499023, + "step": 1117, + "time_per_iteration": 2.723092794418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094347, + "balance_loss_mlp": 1.05805993, + "epoch": 0.21508272412466334, + "flos": 441661317120.0, + "grad_norm": 0.055087901571477416, + "language_loss": 0.84983969, + "learning_rate": 0.0009128350373224665, + "loss": 0.8607831, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.36352539, + "step": 1118, + "time_per_iteration": 2.5449509620666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178954, + "balance_loss_mlp": 1.14500344, + "epoch": 0.2152751058099269, + "flos": 1495397348352.0, + "grad_norm": 0.021865185871831474, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82635385, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.33984375, + "step": 1119, + "time_per_iteration": 4.641271114349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103175, + "balance_loss_mlp": 1.06648207, + "epoch": 0.21546748749519046, + "flos": 493759895040.0, + "grad_norm": 0.07523243301623007, + "language_loss": 0.85678464, + "learning_rate": 0.0009124832016254005, + "loss": 0.86781639, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.36694336, + "step": 1120, + "time_per_iteration": 2.655371904373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109795, + "balance_loss_mlp": 1.06163859, + "epoch": 0.21565986918045402, + "flos": 634241387520.0, + "grad_norm": 0.07092227494936269, + "language_loss": 0.87677884, + "learning_rate": 0.0009123070435324316, + "loss": 0.88775837, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.36352539, + "step": 1121, + "time_per_iteration": 2.777632236480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166186, + "balance_loss_mlp": 1.13337982, + "epoch": 0.21585225086571758, + "flos": 1582502704128.0, + "grad_norm": 0.01899876446696313, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.7904197, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.328125, + "step": 1122, + "time_per_iteration": 4.966520547866821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089218, + "balance_loss_mlp": 1.0522635, + "epoch": 0.21604463255098114, + "flos": 683799556608.0, + "grad_norm": 0.060329223802114536, + "language_loss": 0.86415493, + "learning_rate": 0.0009119542471995752, + "loss": 0.87504709, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.36938477, + "step": 1123, + "time_per_iteration": 2.8373889923095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090311, + "balance_loss_mlp": 1.05438125, + "epoch": 0.2162370142362447, + "flos": 780660675072.0, + "grad_norm": 0.06176848453484022, + "language_loss": 0.81323773, + "learning_rate": 0.0009117776090966554, + "loss": 0.82414079, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.359375, + "step": 1124, + "time_per_iteration": 2.999127149581909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087355, + "balance_loss_mlp": 1.0507102, + "epoch": 0.21642939592150828, + "flos": 1001745294336.0, + "grad_norm": 0.07470238986110685, + "language_loss": 0.86757743, + "learning_rate": 0.0009116008111274899, + "loss": 0.87845105, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.36669922, + "step": 1125, + "time_per_iteration": 3.3534371852874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160744, + "balance_loss_mlp": 1.13022673, + "epoch": 0.21662177760677184, + "flos": 1481867440128.0, + "grad_norm": 0.021433456679081614, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80267668, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.3046875, + "step": 1126, + "time_per_iteration": 4.8522608280181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086571, + "balance_loss_mlp": 1.04975939, + "epoch": 0.2168141592920354, + "flos": 887030092800.0, + "grad_norm": 0.07895568764354688, + "language_loss": 0.85050654, + "learning_rate": 0.0009112467358650396, + "loss": 0.86137229, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.36816406, + "step": 1127, + "time_per_iteration": 3.157684803009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090796, + "balance_loss_mlp": 1.05472374, + "epoch": 0.21700654097729896, + "flos": 545682382848.0, + "grad_norm": 0.05660039583272807, + "language_loss": 0.86175025, + "learning_rate": 0.0009110694587092192, + "loss": 0.87265825, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.36108398, + "step": 1128, + "time_per_iteration": 2.755575656890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088206, + "balance_loss_mlp": 1.052562, + "epoch": 0.21719892266256252, + "flos": 509270008320.0, + "grad_norm": 0.077592311143443, + "language_loss": 0.81304091, + "learning_rate": 0.0009108920219620815, + "loss": 0.82392299, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.35693359, + "step": 1129, + "time_per_iteration": 2.639261484146118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091101, + "balance_loss_mlp": 1.05548096, + "epoch": 0.21739130434782608, + "flos": 543150738432.0, + "grad_norm": 0.06998872933736075, + "language_loss": 0.8949976, + "learning_rate": 0.0009107144256925133, + "loss": 0.90590858, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.35620117, + "step": 1130, + "time_per_iteration": 2.685058832168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096157, + "balance_loss_mlp": 1.0606091, + "epoch": 0.21758368603308964, + "flos": 616564804608.0, + "grad_norm": 0.08228743876345572, + "language_loss": 0.81527102, + "learning_rate": 0.0009105366699694638, + "loss": 0.82623267, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.35546875, + "step": 1131, + "time_per_iteration": 2.726532220840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087405, + "balance_loss_mlp": 1.0526911, + "epoch": 0.2177760677183532, + "flos": 634813175808.0, + "grad_norm": 0.05363867293402688, + "language_loss": 0.81731898, + "learning_rate": 0.0009103587548619439, + "loss": 0.82819301, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.34741211, + "step": 1132, + "time_per_iteration": 2.856782913208008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096099, + "balance_loss_mlp": 1.05978799, + "epoch": 0.2179684494036168, + "flos": 532181587968.0, + "grad_norm": 0.0659512575968049, + "language_loss": 0.85836411, + "learning_rate": 0.0009101806804390261, + "loss": 0.8693251, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.36328125, + "step": 1133, + "time_per_iteration": 2.789860725402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093043, + "balance_loss_mlp": 1.056494, + "epoch": 0.21816083108888035, + "flos": 474980433408.0, + "grad_norm": 0.06887538910693401, + "language_loss": 0.90261114, + "learning_rate": 0.0009100024467698453, + "loss": 0.91354156, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.3659668, + "step": 1134, + "time_per_iteration": 2.6074166297912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095464, + "balance_loss_mlp": 1.05786586, + "epoch": 0.2183532127741439, + "flos": 577198794240.0, + "grad_norm": 0.07516267041517319, + "language_loss": 0.82424915, + "learning_rate": 0.0009098240539235981, + "loss": 0.83520383, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.37573242, + "step": 1135, + "time_per_iteration": 2.6695401668548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095721, + "balance_loss_mlp": 1.05809808, + "epoch": 0.21854559445940747, + "flos": 593832135168.0, + "grad_norm": 0.07818229339121877, + "language_loss": 0.87811279, + "learning_rate": 0.0009096455019695423, + "loss": 0.88907003, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.3762207, + "step": 1136, + "time_per_iteration": 4.259606838226318 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088969, + "balance_loss_mlp": 1.05180001, + "epoch": 0.21873797614467103, + "flos": 408464446464.0, + "grad_norm": 0.07138569527580692, + "language_loss": 0.89539087, + "learning_rate": 0.000909466790976998, + "loss": 0.90628058, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.37182617, + "step": 1137, + "time_per_iteration": 2.4586610794067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086709, + "balance_loss_mlp": 1.0483948, + "epoch": 0.21893035782993459, + "flos": 893824865280.0, + "grad_norm": 0.07428895088203294, + "language_loss": 0.82083362, + "learning_rate": 0.0009092879210153473, + "loss": 0.83170068, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.38305664, + "step": 1138, + "time_per_iteration": 3.097928524017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087285, + "balance_loss_mlp": 1.04944801, + "epoch": 0.21912273951519814, + "flos": 467392702464.0, + "grad_norm": 0.07001266476470332, + "language_loss": 0.88581419, + "learning_rate": 0.0009091088921540333, + "loss": 0.89668703, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.37817383, + "step": 1139, + "time_per_iteration": 2.5904369354248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138075, + "balance_loss_mlp": 1.11270714, + "epoch": 0.2193151212004617, + "flos": 1531262665728.0, + "grad_norm": 0.032290681216211516, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76646751, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.25390625, + "step": 1140, + "time_per_iteration": 4.913591623306274 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090399, + "balance_loss_mlp": 1.05353999, + "epoch": 0.2195075028857253, + "flos": 590901820416.0, + "grad_norm": 0.1397659602768512, + "language_loss": 0.84288347, + "learning_rate": 0.0009087503580104985, + "loss": 0.85378748, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.36865234, + "step": 1141, + "time_per_iteration": 2.6825575828552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103171, + "balance_loss_mlp": 1.06602514, + "epoch": 0.21969988457098885, + "flos": 636033917952.0, + "grad_norm": 0.0722566511462073, + "language_loss": 0.79141879, + "learning_rate": 0.0009085708528674728, + "loss": 0.80245048, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.37133789, + "step": 1142, + "time_per_iteration": 2.8078551292419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104306, + "balance_loss_mlp": 1.06551528, + "epoch": 0.2198922662562524, + "flos": 911974311936.0, + "grad_norm": 0.06720954872782575, + "language_loss": 0.8638975, + "learning_rate": 0.0009083911891031745, + "loss": 0.87494051, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.38793945, + "step": 1143, + "time_per_iteration": 3.1356892585754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110101, + "balance_loss_mlp": 1.07328963, + "epoch": 0.22008464794151597, + "flos": 822603409920.0, + "grad_norm": 0.08162422903338651, + "language_loss": 0.91253042, + "learning_rate": 0.0009082113667873553, + "loss": 0.92363143, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.3684082, + "step": 1144, + "time_per_iteration": 3.1446304321289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112165, + "balance_loss_mlp": 1.07387483, + "epoch": 0.22027702962677953, + "flos": 459416475648.0, + "grad_norm": 0.0676762249982335, + "language_loss": 0.90471655, + "learning_rate": 0.0009080313859898283, + "loss": 0.91583818, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.38256836, + "step": 1145, + "time_per_iteration": 2.5298025608062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110814, + "balance_loss_mlp": 1.07082736, + "epoch": 0.2204694113120431, + "flos": 530998723584.0, + "grad_norm": 0.13336101787368373, + "language_loss": 0.91929018, + "learning_rate": 0.0009078512467804684, + "loss": 0.93037164, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.37304688, + "step": 1146, + "time_per_iteration": 2.6156158447265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105973, + "balance_loss_mlp": 1.06882787, + "epoch": 0.22066179299730665, + "flos": 522382307328.0, + "grad_norm": 0.06165136945539885, + "language_loss": 0.89993024, + "learning_rate": 0.0009076709492292119, + "loss": 0.91098994, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.37133789, + "step": 1147, + "time_per_iteration": 2.617534875869751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095299, + "balance_loss_mlp": 1.06032324, + "epoch": 0.2208541746825702, + "flos": 546188742144.0, + "grad_norm": 0.11177878536303132, + "language_loss": 0.88637269, + "learning_rate": 0.0009074904934060562, + "loss": 0.89732569, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.34985352, + "step": 1148, + "time_per_iteration": 2.6782190799713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086783, + "balance_loss_mlp": 1.05237889, + "epoch": 0.22104655636783377, + "flos": 708404742144.0, + "grad_norm": 0.0637571078176039, + "language_loss": 0.84905714, + "learning_rate": 0.0009073098793810607, + "loss": 0.85992491, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.34423828, + "step": 1149, + "time_per_iteration": 2.956638813018799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085311, + "balance_loss_mlp": 1.04969168, + "epoch": 0.22123893805309736, + "flos": 584594468352.0, + "grad_norm": 0.07731387173425769, + "language_loss": 0.8803097, + "learning_rate": 0.000907129107224346, + "loss": 0.89116287, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.35595703, + "step": 1150, + "time_per_iteration": 2.724456548690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081381, + "balance_loss_mlp": 1.04623771, + "epoch": 0.22143131973836092, + "flos": 492002270208.0, + "grad_norm": 0.0527541061714234, + "language_loss": 0.88156152, + "learning_rate": 0.0009069481770060939, + "loss": 0.89237529, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.35180664, + "step": 1151, + "time_per_iteration": 2.6539950370788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084376, + "balance_loss_mlp": 1.04811299, + "epoch": 0.22162370142362448, + "flos": 1079227459584.0, + "grad_norm": 0.06610336138884995, + "language_loss": 0.83768857, + "learning_rate": 0.000906767088796548, + "loss": 0.84853232, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.36279297, + "step": 1152, + "time_per_iteration": 3.4304041862487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087423, + "balance_loss_mlp": 1.05147004, + "epoch": 0.22181608310888803, + "flos": 492258345984.0, + "grad_norm": 0.06692160227790218, + "language_loss": 0.87012255, + "learning_rate": 0.0009065858426660127, + "loss": 0.88099682, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.35986328, + "step": 1153, + "time_per_iteration": 2.639326333999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089801, + "balance_loss_mlp": 1.05480099, + "epoch": 0.2220084647941516, + "flos": 723687892992.0, + "grad_norm": 0.07963844060104928, + "language_loss": 0.84658396, + "learning_rate": 0.0009064044386848543, + "loss": 0.85748196, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.3503418, + "step": 1154, + "time_per_iteration": 2.904387950897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094705, + "balance_loss_mlp": 1.05992007, + "epoch": 0.22220084647941515, + "flos": 488988997632.0, + "grad_norm": 0.07985092329826342, + "language_loss": 0.88786525, + "learning_rate": 0.0009062228769234997, + "loss": 0.89881229, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.34838867, + "step": 1155, + "time_per_iteration": 2.547041416168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095087, + "balance_loss_mlp": 1.05977738, + "epoch": 0.2223932281646787, + "flos": 536025696768.0, + "grad_norm": 0.067267193175655, + "language_loss": 0.80872244, + "learning_rate": 0.0009060411574524376, + "loss": 0.81967336, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.35327148, + "step": 1156, + "time_per_iteration": 2.647101402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100356, + "balance_loss_mlp": 1.06561852, + "epoch": 0.22258560984994227, + "flos": 931034580480.0, + "grad_norm": 0.07018019580992392, + "language_loss": 0.87947989, + "learning_rate": 0.0009058592803422178, + "loss": 0.8904835, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.34765625, + "step": 1157, + "time_per_iteration": 3.161827564239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087783, + "balance_loss_mlp": 1.05688405, + "epoch": 0.22277799153520586, + "flos": 1198998443520.0, + "grad_norm": 0.0269537140509509, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79798073, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.30859375, + "step": 1158, + "time_per_iteration": 4.827271223068237 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100633, + "balance_loss_mlp": 1.06608617, + "epoch": 0.22297037322046942, + "flos": 501052262400.0, + "grad_norm": 0.10870396219255896, + "language_loss": 0.89957273, + "learning_rate": 0.00090549505348681, + "loss": 0.91057909, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.34594727, + "step": 1159, + "time_per_iteration": 2.5724213123321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115899, + "balance_loss_mlp": 1.08144796, + "epoch": 0.22316275490573298, + "flos": 752413612032.0, + "grad_norm": 0.06607938149323832, + "language_loss": 0.83976638, + "learning_rate": 0.0009053127038830275, + "loss": 0.85092539, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.3449707, + "step": 1160, + "time_per_iteration": 2.979442834854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108838, + "balance_loss_mlp": 1.07538772, + "epoch": 0.22335513659099654, + "flos": 514553057280.0, + "grad_norm": 0.07010640296313479, + "language_loss": 0.86946774, + "learning_rate": 0.000905130196922898, + "loss": 0.88055611, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.3347168, + "step": 1161, + "time_per_iteration": 2.582780361175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114166, + "balance_loss_mlp": 1.0797379, + "epoch": 0.2235475182762601, + "flos": 484286501376.0, + "grad_norm": 0.056850955952103474, + "language_loss": 0.86954904, + "learning_rate": 0.0009049475326772769, + "loss": 0.88069069, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.34472656, + "step": 1162, + "time_per_iteration": 2.572434902191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116085, + "balance_loss_mlp": 1.08270645, + "epoch": 0.22373989996152366, + "flos": 469698794496.0, + "grad_norm": 0.07142312953148652, + "language_loss": 0.82233834, + "learning_rate": 0.0009047647112170811, + "loss": 0.83349919, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.33398438, + "step": 1163, + "time_per_iteration": 2.7467033863067627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104962, + "balance_loss_mlp": 1.07115388, + "epoch": 0.22393228164678722, + "flos": 1270512594432.0, + "grad_norm": 0.07009650422776509, + "language_loss": 0.87291974, + "learning_rate": 0.0009045817326132876, + "loss": 0.88396937, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.33837891, + "step": 1164, + "time_per_iteration": 3.6699986457824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096597, + "balance_loss_mlp": 1.06150198, + "epoch": 0.22412466333205078, + "flos": 596052449280.0, + "grad_norm": 0.07687995911666942, + "language_loss": 0.8312459, + "learning_rate": 0.0009043985969369357, + "loss": 0.84221184, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.35131836, + "step": 1165, + "time_per_iteration": 2.8716225624084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099545, + "balance_loss_mlp": 1.06461644, + "epoch": 0.22431704501731436, + "flos": 608136062976.0, + "grad_norm": 0.062241931717823204, + "language_loss": 0.84419966, + "learning_rate": 0.0009042153042591245, + "loss": 0.85519511, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.34960938, + "step": 1166, + "time_per_iteration": 2.8038439750671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094194, + "balance_loss_mlp": 1.05971861, + "epoch": 0.22450942670257792, + "flos": 906203842560.0, + "grad_norm": 0.05754676867835885, + "language_loss": 0.85229421, + "learning_rate": 0.0009040318546510146, + "loss": 0.86323619, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.3449707, + "step": 1167, + "time_per_iteration": 3.166391372680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101578, + "balance_loss_mlp": 1.06672144, + "epoch": 0.22470180838784148, + "flos": 565032222720.0, + "grad_norm": 0.06328547350255756, + "language_loss": 0.84822267, + "learning_rate": 0.0009038482481838275, + "loss": 0.85923845, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.34887695, + "step": 1168, + "time_per_iteration": 2.6582534313201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092575, + "balance_loss_mlp": 1.05726552, + "epoch": 0.22489419007310504, + "flos": 834109443072.0, + "grad_norm": 0.05398415615287821, + "language_loss": 0.8685748, + "learning_rate": 0.0009036644849288455, + "loss": 0.87950051, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.35327148, + "step": 1169, + "time_per_iteration": 3.131391763687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102043, + "balance_loss_mlp": 1.06735337, + "epoch": 0.2250865717583686, + "flos": 580788237312.0, + "grad_norm": 0.06156740204868492, + "language_loss": 0.85189641, + "learning_rate": 0.0009034805649574118, + "loss": 0.86291689, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.34716797, + "step": 1170, + "time_per_iteration": 2.662177801132202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093313, + "balance_loss_mlp": 1.05991113, + "epoch": 0.22527895344363216, + "flos": 600091435008.0, + "grad_norm": 0.07489985201842045, + "language_loss": 0.85256809, + "learning_rate": 0.0009032964883409308, + "loss": 0.86350119, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.33422852, + "step": 1171, + "time_per_iteration": 2.872305393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102332, + "balance_loss_mlp": 0.9971894, + "epoch": 0.22547133512889572, + "flos": 1440009073152.0, + "grad_norm": 0.01784679187957182, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74073857, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.26171875, + "step": 1172, + "time_per_iteration": 4.968618154525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090705, + "balance_loss_mlp": 1.05649197, + "epoch": 0.22566371681415928, + "flos": 490377065472.0, + "grad_norm": 0.05674331384718379, + "language_loss": 0.87210125, + "learning_rate": 0.0009029278654587462, + "loss": 0.88300836, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.3425293, + "step": 1173, + "time_per_iteration": 2.5812408924102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085168, + "balance_loss_mlp": 1.05043077, + "epoch": 0.22585609849942284, + "flos": 604334214144.0, + "grad_norm": 0.06970392839419266, + "language_loss": 0.82089472, + "learning_rate": 0.0009027433193361548, + "loss": 0.83174634, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.34765625, + "step": 1174, + "time_per_iteration": 2.7284860610961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090288, + "balance_loss_mlp": 1.0550499, + "epoch": 0.22604848018468643, + "flos": 635280247296.0, + "grad_norm": 0.05615396633220104, + "language_loss": 0.86867499, + "learning_rate": 0.00090255861685474, + "loss": 0.87957788, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.3527832, + "step": 1175, + "time_per_iteration": 2.7265548706054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085885, + "balance_loss_mlp": 1.05040812, + "epoch": 0.22624086186995, + "flos": 479633467392.0, + "grad_norm": 0.06159717434172949, + "language_loss": 0.91109395, + "learning_rate": 0.0009023737580862095, + "loss": 0.92195278, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.35473633, + "step": 1176, + "time_per_iteration": 2.5320050716400146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089039, + "balance_loss_mlp": 1.05468273, + "epoch": 0.22643324355521355, + "flos": 495566982144.0, + "grad_norm": 0.05820331342721636, + "language_loss": 0.82901466, + "learning_rate": 0.0009021887431023321, + "loss": 0.83990508, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.34399414, + "step": 1177, + "time_per_iteration": 2.619271755218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094278, + "balance_loss_mlp": 1.05939722, + "epoch": 0.2266256252404771, + "flos": 561271071744.0, + "grad_norm": 0.05650773027793175, + "language_loss": 0.86773884, + "learning_rate": 0.0009020035719749369, + "loss": 0.8786816, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.34912109, + "step": 1178, + "time_per_iteration": 2.7209300994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010885, + "balance_loss_mlp": 1.05536032, + "epoch": 0.22681800692574067, + "flos": 579353527296.0, + "grad_norm": 0.07505314575513819, + "language_loss": 0.77450001, + "learning_rate": 0.0009018182447759136, + "loss": 0.78538495, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.33154297, + "step": 1179, + "time_per_iteration": 2.957627534866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092549, + "balance_loss_mlp": 1.05793107, + "epoch": 0.22701038861100423, + "flos": 739842577920.0, + "grad_norm": 0.0724719412784609, + "language_loss": 0.79327267, + "learning_rate": 0.0009016327615772126, + "loss": 0.80419827, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.34619141, + "step": 1180, + "time_per_iteration": 2.9636237621307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098683, + "balance_loss_mlp": 1.06425512, + "epoch": 0.2272027702962678, + "flos": 576996562944.0, + "grad_norm": 0.06868963719018656, + "language_loss": 0.87725425, + "learning_rate": 0.0009014471224508451, + "loss": 0.88824105, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.34448242, + "step": 1181, + "time_per_iteration": 2.6756978034973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101065, + "balance_loss_mlp": 1.06725717, + "epoch": 0.22739515198153135, + "flos": 544012098048.0, + "grad_norm": 0.08625014316755293, + "language_loss": 0.8279528, + "learning_rate": 0.0009012613274688823, + "loss": 0.83896345, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.33837891, + "step": 1182, + "time_per_iteration": 2.679690361022949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106597, + "balance_loss_mlp": 1.0716213, + "epoch": 0.22758753366679493, + "flos": 439932805632.0, + "grad_norm": 0.07160666852762332, + "language_loss": 0.87420428, + "learning_rate": 0.0009010753767034565, + "loss": 0.8852703, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.35009766, + "step": 1183, + "time_per_iteration": 2.56422758102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110957, + "balance_loss_mlp": 1.07514668, + "epoch": 0.2277799153520585, + "flos": 729104772096.0, + "grad_norm": 0.07593119142071596, + "language_loss": 0.7905606, + "learning_rate": 0.0009008892702267599, + "loss": 0.80167019, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.35839844, + "step": 1184, + "time_per_iteration": 2.96954345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138099, + "balance_loss_mlp": 1.10255075, + "epoch": 0.22797229703732205, + "flos": 526641053184.0, + "grad_norm": 0.08993468677273868, + "language_loss": 0.88719535, + "learning_rate": 0.0009007030081110457, + "loss": 0.89857626, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.35571289, + "step": 1185, + "time_per_iteration": 2.639239549636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124853, + "balance_loss_mlp": 1.08923352, + "epoch": 0.2281646787225856, + "flos": 535159954944.0, + "grad_norm": 0.08461110053036625, + "language_loss": 0.84618473, + "learning_rate": 0.000900516590428627, + "loss": 0.85743326, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.35668945, + "step": 1186, + "time_per_iteration": 2.6506764888763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120731, + "balance_loss_mlp": 1.08637488, + "epoch": 0.22835706040784917, + "flos": 541107924480.0, + "grad_norm": 0.07299458038970587, + "language_loss": 0.89267749, + "learning_rate": 0.0009003300172518778, + "loss": 0.90388483, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.34399414, + "step": 1187, + "time_per_iteration": 2.6919267177581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107415, + "balance_loss_mlp": 1.07291603, + "epoch": 0.22854944209311273, + "flos": 790297012224.0, + "grad_norm": 0.06786881834878318, + "language_loss": 0.83963048, + "learning_rate": 0.0009001432886532321, + "loss": 0.85070467, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.34521484, + "step": 1188, + "time_per_iteration": 2.9668681621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103209, + "balance_loss_mlp": 1.07002091, + "epoch": 0.2287418237783763, + "flos": 469047020544.0, + "grad_norm": 0.06096375157572686, + "language_loss": 0.86560941, + "learning_rate": 0.0008999564047051843, + "loss": 0.87664151, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.33203125, + "step": 1189, + "time_per_iteration": 2.520157814025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103848, + "balance_loss_mlp": 1.07070816, + "epoch": 0.22893420546363985, + "flos": 467786990592.0, + "grad_norm": 0.07257222459915597, + "language_loss": 0.84934878, + "learning_rate": 0.0008997693654802894, + "loss": 0.86038733, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.33154297, + "step": 1190, + "time_per_iteration": 2.6376004219055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117207, + "balance_loss_mlp": 1.08375657, + "epoch": 0.22912658714890344, + "flos": 625974179328.0, + "grad_norm": 0.056681488577390256, + "language_loss": 0.86392069, + "learning_rate": 0.0008995821710511625, + "loss": 0.87509274, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.3347168, + "step": 1191, + "time_per_iteration": 2.727444887161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116667, + "balance_loss_mlp": 1.08369398, + "epoch": 0.229318968834167, + "flos": 502785156096.0, + "grad_norm": 0.06323137320540088, + "language_loss": 0.85004956, + "learning_rate": 0.0008993948214904786, + "loss": 0.86121625, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.32983398, + "step": 1192, + "time_per_iteration": 2.5774295330047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086531, + "balance_loss_mlp": 1.06097257, + "epoch": 0.22951135051943056, + "flos": 1374108544512.0, + "grad_norm": 0.030992800338245956, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79508746, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.25585938, + "step": 1193, + "time_per_iteration": 4.854384422302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122219, + "balance_loss_mlp": 1.08934152, + "epoch": 0.22970373220469412, + "flos": 644045050368.0, + "grad_norm": 0.06852039575110529, + "language_loss": 0.7808823, + "learning_rate": 0.0008990196572654427, + "loss": 0.79210448, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.32861328, + "step": 1194, + "time_per_iteration": 2.873081684112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112553, + "balance_loss_mlp": 1.07943714, + "epoch": 0.22989611388995768, + "flos": 499945001472.0, + "grad_norm": 0.05701230798072306, + "language_loss": 0.87415946, + "learning_rate": 0.0008988318427467426, + "loss": 0.88528502, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.33105469, + "step": 1195, + "time_per_iteration": 2.702685832977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109796, + "balance_loss_mlp": 1.06522477, + "epoch": 0.23008849557522124, + "flos": 1096071796224.0, + "grad_norm": 0.06940657308766013, + "language_loss": 0.85968834, + "learning_rate": 0.0008986438733877887, + "loss": 0.87066793, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.32739258, + "step": 1196, + "time_per_iteration": 3.4571969509124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096888, + "balance_loss_mlp": 1.06482017, + "epoch": 0.2302808772604848, + "flos": 683313546240.0, + "grad_norm": 0.04726997036122248, + "language_loss": 0.83756924, + "learning_rate": 0.0008984557492615576, + "loss": 0.8485381, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.32055664, + "step": 1197, + "time_per_iteration": 2.9306819438934326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090156, + "balance_loss_mlp": 1.05718327, + "epoch": 0.23047325894574835, + "flos": 528664928256.0, + "grad_norm": 0.05994921168989351, + "language_loss": 0.89349306, + "learning_rate": 0.0008982674704410854, + "loss": 0.90439463, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.32983398, + "step": 1198, + "time_per_iteration": 2.706496238708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089604, + "balance_loss_mlp": 1.05648804, + "epoch": 0.23066564063101191, + "flos": 682427455488.0, + "grad_norm": 0.06548245075345789, + "language_loss": 0.7739616, + "learning_rate": 0.0008980790369994682, + "loss": 0.78485769, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.33129883, + "step": 1199, + "time_per_iteration": 2.962169647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109754, + "balance_loss_mlp": 1.06375623, + "epoch": 0.2308580223162755, + "flos": 558247624704.0, + "grad_norm": 0.06722903582933262, + "language_loss": 0.86851013, + "learning_rate": 0.000897890449009863, + "loss": 0.87948549, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.33813477, + "step": 1200, + "time_per_iteration": 2.6820433139801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092921, + "balance_loss_mlp": 1.05877972, + "epoch": 0.23105040400153906, + "flos": 555406060032.0, + "grad_norm": 0.051980143810921, + "language_loss": 0.89933294, + "learning_rate": 0.0008977017065454853, + "loss": 0.91026211, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.34179688, + "step": 1201, + "time_per_iteration": 2.6699435710906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098988, + "balance_loss_mlp": 1.0640595, + "epoch": 0.23124278568680262, + "flos": 704474855424.0, + "grad_norm": 0.0699249838794834, + "language_loss": 0.80333388, + "learning_rate": 0.0008975128096796121, + "loss": 0.81432372, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.34936523, + "step": 1202, + "time_per_iteration": 2.891552448272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096208, + "balance_loss_mlp": 1.0627346, + "epoch": 0.23143516737206618, + "flos": 612469002240.0, + "grad_norm": 0.08096245126913681, + "language_loss": 0.85447264, + "learning_rate": 0.0008973237584855794, + "loss": 0.86543471, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.33496094, + "step": 1203, + "time_per_iteration": 2.897143840789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109386, + "balance_loss_mlp": 1.06007552, + "epoch": 0.23162754905732974, + "flos": 389030238720.0, + "grad_norm": 0.07003086272099243, + "language_loss": 0.82261837, + "learning_rate": 0.0008971345530367832, + "loss": 0.83355689, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.33789062, + "step": 1204, + "time_per_iteration": 2.4648683071136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090052, + "balance_loss_mlp": 1.05619669, + "epoch": 0.2318199307425933, + "flos": 667481928192.0, + "grad_norm": 0.0706025487590865, + "language_loss": 0.84670615, + "learning_rate": 0.0008969451934066799, + "loss": 0.85760665, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.33862305, + "step": 1205, + "time_per_iteration": 2.7628865242004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096032, + "balance_loss_mlp": 1.06274843, + "epoch": 0.23201231242785686, + "flos": 666093860352.0, + "grad_norm": 0.07866862210425928, + "language_loss": 0.79702371, + "learning_rate": 0.0008967556796687854, + "loss": 0.80798399, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.33276367, + "step": 1206, + "time_per_iteration": 2.8876569271087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099743, + "balance_loss_mlp": 1.06746101, + "epoch": 0.23220469411312042, + "flos": 748498281984.0, + "grad_norm": 0.05955020850576899, + "language_loss": 0.83383894, + "learning_rate": 0.0008965660118966752, + "loss": 0.84483635, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.32275391, + "step": 1207, + "time_per_iteration": 2.8915722370147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092945, + "balance_loss_mlp": 1.06087792, + "epoch": 0.232397075798384, + "flos": 666763163136.0, + "grad_norm": 0.05733195861059391, + "language_loss": 0.89860612, + "learning_rate": 0.0008963761901639851, + "loss": 0.90953553, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.32055664, + "step": 1208, + "time_per_iteration": 2.839872121810913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100551, + "balance_loss_mlp": 1.06843603, + "epoch": 0.23258945748364757, + "flos": 609937357824.0, + "grad_norm": 0.0677808606719883, + "language_loss": 0.83122128, + "learning_rate": 0.0008961862145444103, + "loss": 0.84222686, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.32104492, + "step": 1209, + "time_per_iteration": 2.723395824432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109998, + "balance_loss_mlp": 1.07726288, + "epoch": 0.23278183916891113, + "flos": 489397842432.0, + "grad_norm": 0.06757554355714504, + "language_loss": 0.8539983, + "learning_rate": 0.0008959960851117059, + "loss": 0.86509824, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.32739258, + "step": 1210, + "time_per_iteration": 2.5843160152435303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112763, + "balance_loss_mlp": 1.08055305, + "epoch": 0.23297422085417469, + "flos": 511314232320.0, + "grad_norm": 0.06719057665627333, + "language_loss": 0.83744979, + "learning_rate": 0.0008958058019396868, + "loss": 0.84857744, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.32202148, + "step": 1211, + "time_per_iteration": 2.790137529373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110465, + "balance_loss_mlp": 1.07865953, + "epoch": 0.23316660253943824, + "flos": 546145072128.0, + "grad_norm": 0.061561154104104274, + "language_loss": 0.86634141, + "learning_rate": 0.0008956153651022274, + "loss": 0.877446, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.31787109, + "step": 1212, + "time_per_iteration": 2.6943769454956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107151, + "balance_loss_mlp": 1.07506013, + "epoch": 0.2333589842247018, + "flos": 509998947840.0, + "grad_norm": 0.056352889191353187, + "language_loss": 0.84060359, + "learning_rate": 0.0008954247746732618, + "loss": 0.85167515, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.32080078, + "step": 1213, + "time_per_iteration": 2.635540723800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105894, + "balance_loss_mlp": 1.07504261, + "epoch": 0.23355136590996536, + "flos": 662834686464.0, + "grad_norm": 0.059598265922157306, + "language_loss": 0.90450746, + "learning_rate": 0.0008952340307267837, + "loss": 0.91556644, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.30810547, + "step": 1214, + "time_per_iteration": 2.8842196464538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098908, + "balance_loss_mlp": 1.06817579, + "epoch": 0.23374374759522892, + "flos": 508206417408.0, + "grad_norm": 0.059513387141436946, + "language_loss": 0.83485198, + "learning_rate": 0.0008950431333368468, + "loss": 0.84584105, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.30688477, + "step": 1215, + "time_per_iteration": 2.606269121170044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098575, + "balance_loss_mlp": 1.06662679, + "epoch": 0.2339361292804925, + "flos": 1293964028928.0, + "grad_norm": 0.05495395288746111, + "language_loss": 0.84313607, + "learning_rate": 0.0008948520825775634, + "loss": 0.85412186, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.31933594, + "step": 1216, + "time_per_iteration": 3.6454994678497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099032, + "balance_loss_mlp": 1.06782317, + "epoch": 0.23412851096575607, + "flos": 705617021952.0, + "grad_norm": 0.06066187191945671, + "language_loss": 0.83935732, + "learning_rate": 0.0008946608785231067, + "loss": 0.85034764, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.31176758, + "step": 1217, + "time_per_iteration": 2.9157872200012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098088, + "balance_loss_mlp": 1.06599677, + "epoch": 0.23432089265101963, + "flos": 438036968448.0, + "grad_norm": 0.058216777953853424, + "language_loss": 0.84654021, + "learning_rate": 0.0008944695212477084, + "loss": 0.85752106, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.32080078, + "step": 1218, + "time_per_iteration": 2.473067045211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103602, + "balance_loss_mlp": 1.07158232, + "epoch": 0.2345132743362832, + "flos": 480697058304.0, + "grad_norm": 0.06075167680795146, + "language_loss": 0.86133409, + "learning_rate": 0.0008942780108256599, + "loss": 0.87237012, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.32006836, + "step": 1219, + "time_per_iteration": 2.581594705581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100646, + "balance_loss_mlp": 1.06819737, + "epoch": 0.23470565602154675, + "flos": 411231817728.0, + "grad_norm": 0.07971641299609675, + "language_loss": 0.86269408, + "learning_rate": 0.0008940863473313121, + "loss": 0.87370056, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.32446289, + "step": 1220, + "time_per_iteration": 2.453798532485962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108448, + "balance_loss_mlp": 1.0764761, + "epoch": 0.2348980377068103, + "flos": 545189170176.0, + "grad_norm": 0.07248436265958902, + "language_loss": 0.87226778, + "learning_rate": 0.0008938945308390756, + "loss": 0.88335222, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.31958008, + "step": 1221, + "time_per_iteration": 2.6299164295196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092799, + "balance_loss_mlp": 1.06099391, + "epoch": 0.23509041939207387, + "flos": 575465900544.0, + "grad_norm": 0.0746326386118845, + "language_loss": 0.86801684, + "learning_rate": 0.00089370256142342, + "loss": 0.87894481, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.31787109, + "step": 1222, + "time_per_iteration": 2.7373716831207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099884, + "balance_loss_mlp": 1.0675782, + "epoch": 0.23528280107733743, + "flos": 588568025088.0, + "grad_norm": 0.06792905088784162, + "language_loss": 0.84961808, + "learning_rate": 0.0008935104391588746, + "loss": 0.86061692, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.32299805, + "step": 1223, + "time_per_iteration": 2.786801338195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101357, + "balance_loss_mlp": 1.06850326, + "epoch": 0.235475182762601, + "flos": 822948235776.0, + "grad_norm": 0.053660170998325075, + "language_loss": 0.8281433, + "learning_rate": 0.0008933181641200276, + "loss": 0.83915687, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.32861328, + "step": 1224, + "time_per_iteration": 3.1502432823181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102432, + "balance_loss_mlp": 1.06948209, + "epoch": 0.23566756444786457, + "flos": 679865287680.0, + "grad_norm": 0.06465671729424353, + "language_loss": 0.85675979, + "learning_rate": 0.0008931257363815271, + "loss": 0.86778408, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.32958984, + "step": 1225, + "time_per_iteration": 2.9370880126953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110561, + "balance_loss_mlp": 1.07370961, + "epoch": 0.23585994613312813, + "flos": 701481931776.0, + "grad_norm": 0.07282820073226746, + "language_loss": 0.89753437, + "learning_rate": 0.0008929331560180798, + "loss": 0.9085905, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.31884766, + "step": 1226, + "time_per_iteration": 2.977869749069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122954, + "balance_loss_mlp": 1.09045768, + "epoch": 0.2360523278183917, + "flos": 523923144192.0, + "grad_norm": 0.053569811561680475, + "language_loss": 0.90818799, + "learning_rate": 0.0008927404231044525, + "loss": 0.91941756, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.32495117, + "step": 1227, + "time_per_iteration": 2.683979034423828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111641, + "balance_loss_mlp": 1.07909656, + "epoch": 0.23624470950365525, + "flos": 524027860992.0, + "grad_norm": 0.06109587035495086, + "language_loss": 0.81612283, + "learning_rate": 0.0008925475377154703, + "loss": 0.82723922, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.32543945, + "step": 1228, + "time_per_iteration": 2.734614610671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119771, + "balance_loss_mlp": 1.08577275, + "epoch": 0.2364370911889188, + "flos": 596525313024.0, + "grad_norm": 0.06451716518904643, + "language_loss": 0.82344091, + "learning_rate": 0.0008923544999260183, + "loss": 0.83463866, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.34033203, + "step": 1229, + "time_per_iteration": 2.740309000015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108587, + "balance_loss_mlp": 1.07561386, + "epoch": 0.23662947287418237, + "flos": 756519588864.0, + "grad_norm": 0.0665465772726836, + "language_loss": 0.91460836, + "learning_rate": 0.00089216130981104, + "loss": 0.92569423, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.32983398, + "step": 1230, + "time_per_iteration": 3.1343088150024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103991, + "balance_loss_mlp": 1.07120848, + "epoch": 0.23682185455944593, + "flos": 545907935232.0, + "grad_norm": 0.061759964990198334, + "language_loss": 0.81970417, + "learning_rate": 0.000891967967445539, + "loss": 0.83074409, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.32788086, + "step": 1231, + "time_per_iteration": 2.67669677734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100144, + "balance_loss_mlp": 1.06829166, + "epoch": 0.2370142362447095, + "flos": 661977709056.0, + "grad_norm": 0.04660382532121484, + "language_loss": 0.88927996, + "learning_rate": 0.0008917744729045772, + "loss": 0.90028143, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.31835938, + "step": 1232, + "time_per_iteration": 2.87488055229187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098328, + "balance_loss_mlp": 1.06695223, + "epoch": 0.23720661792997308, + "flos": 683361598464.0, + "grad_norm": 0.054845027384176535, + "language_loss": 0.83439517, + "learning_rate": 0.0008915808262632757, + "loss": 0.84537846, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.31347656, + "step": 1233, + "time_per_iteration": 2.884615659713745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111142, + "balance_loss_mlp": 1.0800519, + "epoch": 0.23739899961523664, + "flos": 558631738368.0, + "grad_norm": 0.058607558308664987, + "language_loss": 0.93242431, + "learning_rate": 0.0008913870275968148, + "loss": 0.94353569, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.31054688, + "step": 1234, + "time_per_iteration": 2.7355458736419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110869, + "balance_loss_mlp": 1.07740974, + "epoch": 0.2375913813005002, + "flos": 889144128000.0, + "grad_norm": 0.0661901036623414, + "language_loss": 0.87537754, + "learning_rate": 0.0008911930769804342, + "loss": 0.88646448, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.3125, + "step": 1235, + "time_per_iteration": 3.247985363006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115104, + "balance_loss_mlp": 1.08396649, + "epoch": 0.23778376298576376, + "flos": 640810607616.0, + "grad_norm": 0.053926277509791044, + "language_loss": 0.90842855, + "learning_rate": 0.0008909989744894318, + "loss": 0.91957957, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.31103516, + "step": 1236, + "time_per_iteration": 2.8457424640655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116546, + "balance_loss_mlp": 1.08598089, + "epoch": 0.23797614467102732, + "flos": 616540073472.0, + "grad_norm": 0.07410834458794652, + "language_loss": 0.81166267, + "learning_rate": 0.0008908047201991649, + "loss": 0.82282805, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.30517578, + "step": 1237, + "time_per_iteration": 2.743232011795044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103348, + "balance_loss_mlp": 1.07218719, + "epoch": 0.23816852635629088, + "flos": 623941539840.0, + "grad_norm": 0.0897055957170317, + "language_loss": 0.8615526, + "learning_rate": 0.0008906103141850502, + "loss": 0.87258613, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.3112793, + "step": 1238, + "time_per_iteration": 2.8931751251220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103231, + "balance_loss_mlp": 1.07164085, + "epoch": 0.23836090804155444, + "flos": 521180504064.0, + "grad_norm": 0.0595559706342315, + "language_loss": 0.87583494, + "learning_rate": 0.0008904157565225621, + "loss": 0.88686728, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.31567383, + "step": 1239, + "time_per_iteration": 2.681567430496216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096601, + "balance_loss_mlp": 1.06546402, + "epoch": 0.238553289726818, + "flos": 1153527616512.0, + "grad_norm": 0.07926394914951292, + "language_loss": 0.81636947, + "learning_rate": 0.000890221047287235, + "loss": 0.82733548, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.31103516, + "step": 1240, + "time_per_iteration": 3.5042829513549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096214, + "balance_loss_mlp": 1.06450391, + "epoch": 0.23874567141208156, + "flos": 499600175616.0, + "grad_norm": 0.06383986480013222, + "language_loss": 0.90398014, + "learning_rate": 0.0008900261865546615, + "loss": 0.91494226, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.31689453, + "step": 1241, + "time_per_iteration": 2.656243324279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098327, + "balance_loss_mlp": 1.06533027, + "epoch": 0.23893805309734514, + "flos": 556657325568.0, + "grad_norm": 0.07463092576288201, + "language_loss": 0.84907639, + "learning_rate": 0.0008898311744004936, + "loss": 0.86005968, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.33007812, + "step": 1242, + "time_per_iteration": 2.7337045669555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089408, + "balance_loss_mlp": 1.05583906, + "epoch": 0.2391304347826087, + "flos": 549009957888.0, + "grad_norm": 0.057670085451747476, + "language_loss": 0.86718595, + "learning_rate": 0.0008896360109004414, + "loss": 0.87808001, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.3359375, + "step": 1243, + "time_per_iteration": 2.6334750652313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090579, + "balance_loss_mlp": 1.05667567, + "epoch": 0.23932281646787226, + "flos": 515794148352.0, + "grad_norm": 0.055695642571784755, + "language_loss": 0.84363699, + "learning_rate": 0.0008894406961302742, + "loss": 0.85454273, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.33935547, + "step": 1244, + "time_per_iteration": 2.612278699874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092282, + "balance_loss_mlp": 1.05840266, + "epoch": 0.23951519815313582, + "flos": 743353445376.0, + "grad_norm": 0.053835846346086756, + "language_loss": 0.83682489, + "learning_rate": 0.0008892452301658201, + "loss": 0.84774774, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.33911133, + "step": 1245, + "time_per_iteration": 2.999476432800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095498, + "balance_loss_mlp": 1.06169045, + "epoch": 0.23970757983839938, + "flos": 553855048704.0, + "grad_norm": 0.07830491582761978, + "language_loss": 0.83242297, + "learning_rate": 0.0008890496130829653, + "loss": 0.84337801, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.33837891, + "step": 1246, + "time_per_iteration": 2.6750991344451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093391, + "balance_loss_mlp": 1.05913019, + "epoch": 0.23989996152366294, + "flos": 480416251392.0, + "grad_norm": 0.06104300334873528, + "language_loss": 0.85340333, + "learning_rate": 0.0008888538449576555, + "loss": 0.86433721, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.34301758, + "step": 1247, + "time_per_iteration": 2.5646800994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095388, + "balance_loss_mlp": 1.06131816, + "epoch": 0.2400923432089265, + "flos": 485069285376.0, + "grad_norm": 0.05789610317969602, + "language_loss": 0.82348001, + "learning_rate": 0.0008886579258658944, + "loss": 0.83443391, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.34082031, + "step": 1248, + "time_per_iteration": 2.562016487121582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087072, + "balance_loss_mlp": 1.05283499, + "epoch": 0.24028472489419006, + "flos": 623247505920.0, + "grad_norm": 0.05381401206887855, + "language_loss": 0.84731787, + "learning_rate": 0.0008884618558837446, + "loss": 0.85818857, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.34277344, + "step": 1249, + "time_per_iteration": 2.8163750171661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093014, + "balance_loss_mlp": 1.05927801, + "epoch": 0.24047710657945365, + "flos": 601302002688.0, + "grad_norm": 0.06053052424994898, + "language_loss": 0.86413568, + "learning_rate": 0.0008882656350873273, + "loss": 0.8750658, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.33764648, + "step": 1250, + "time_per_iteration": 2.844723701477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088368, + "balance_loss_mlp": 1.05546594, + "epoch": 0.2406694882647172, + "flos": 841199579136.0, + "grad_norm": 0.06849099956300345, + "language_loss": 0.87088066, + "learning_rate": 0.0008880692635528219, + "loss": 0.88176429, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.32910156, + "step": 1251, + "time_per_iteration": 3.0528526306152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108089, + "balance_loss_mlp": 1.048823, + "epoch": 0.24086186994998077, + "flos": 526789440000.0, + "grad_norm": 0.06290905233547327, + "language_loss": 0.88876319, + "learning_rate": 0.0008878727413564669, + "loss": 0.89957213, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.32055664, + "step": 1252, + "time_per_iteration": 2.758507251739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077691, + "balance_loss_mlp": 1.05194211, + "epoch": 0.24105425163524433, + "flos": 1337464673280.0, + "grad_norm": 0.04466256972049361, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81213295, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.2578125, + "step": 1253, + "time_per_iteration": 4.847649097442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088304, + "balance_loss_mlp": 1.05616474, + "epoch": 0.24124663332050789, + "flos": 613822164480.0, + "grad_norm": 0.059681429897919615, + "language_loss": 0.78408957, + "learning_rate": 0.0008874792452834528, + "loss": 0.79497254, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.32128906, + "step": 1254, + "time_per_iteration": 2.754746198654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091821, + "balance_loss_mlp": 1.06061172, + "epoch": 0.24143901500577145, + "flos": 575278225920.0, + "grad_norm": 0.07362958371245172, + "language_loss": 0.87187612, + "learning_rate": 0.0008872822715595626, + "loss": 0.88279426, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.31176758, + "step": 1255, + "time_per_iteration": 2.662929058074951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109326, + "balance_loss_mlp": 1.06200314, + "epoch": 0.241631396691035, + "flos": 494941349376.0, + "grad_norm": 0.08064600620778418, + "language_loss": 0.86789644, + "learning_rate": 0.0008870851474793598, + "loss": 0.87882906, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.31225586, + "step": 1256, + "time_per_iteration": 2.550830841064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096341, + "balance_loss_mlp": 1.06434524, + "epoch": 0.24182377837629856, + "flos": 635891323392.0, + "grad_norm": 0.05836545436632832, + "language_loss": 0.89218223, + "learning_rate": 0.0008868878731193752, + "loss": 0.90314561, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.31982422, + "step": 1257, + "time_per_iteration": 2.850184440612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095001, + "balance_loss_mlp": 1.06400657, + "epoch": 0.24201616006156215, + "flos": 514938580992.0, + "grad_norm": 0.05536217997614851, + "language_loss": 0.89056414, + "learning_rate": 0.0008866904485561973, + "loss": 0.90151417, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.30957031, + "step": 1258, + "time_per_iteration": 2.7176461219787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107248, + "balance_loss_mlp": 1.0765636, + "epoch": 0.2422085417468257, + "flos": 614837703168.0, + "grad_norm": 0.0620425495695956, + "language_loss": 0.82697642, + "learning_rate": 0.000886492873866473, + "loss": 0.83804893, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.30639648, + "step": 1259, + "time_per_iteration": 2.881246328353882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106717, + "balance_loss_mlp": 1.07631803, + "epoch": 0.24240092343208927, + "flos": 585515464704.0, + "grad_norm": 0.0764912621319216, + "language_loss": 0.84458697, + "learning_rate": 0.000886295149126908, + "loss": 0.85565412, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.3034668, + "step": 1260, + "time_per_iteration": 2.711789846420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102095, + "balance_loss_mlp": 1.07148254, + "epoch": 0.24259330511735283, + "flos": 761930675712.0, + "grad_norm": 0.05050860424869067, + "language_loss": 0.85437667, + "learning_rate": 0.0008860972744142655, + "loss": 0.86539763, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.30566406, + "step": 1261, + "time_per_iteration": 2.924192190170288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101503, + "balance_loss_mlp": 1.07146263, + "epoch": 0.2427856868026164, + "flos": 626566316544.0, + "grad_norm": 0.05198228858732316, + "language_loss": 0.81767958, + "learning_rate": 0.0008858992498053671, + "loss": 0.82869458, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.30004883, + "step": 1262, + "time_per_iteration": 2.8300395011901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069733, + "balance_loss_mlp": 1.04455626, + "epoch": 0.24297806848787995, + "flos": 1510840470528.0, + "grad_norm": 0.04093384265265131, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77658486, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.25195312, + "step": 1263, + "time_per_iteration": 4.837641716003418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103166, + "balance_loss_mlp": 1.07217157, + "epoch": 0.2431704501731435, + "flos": 541669538304.0, + "grad_norm": 0.05948216339756903, + "language_loss": 0.83247912, + "learning_rate": 0.0008855027512063817, + "loss": 0.84351087, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.30957031, + "step": 1264, + "time_per_iteration": 2.7277276515960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102812, + "balance_loss_mlp": 1.07191277, + "epoch": 0.24336283185840707, + "flos": 523588492800.0, + "grad_norm": 0.06194442365761257, + "language_loss": 0.8589493, + "learning_rate": 0.0008853042773702292, + "loss": 0.86997747, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.30859375, + "step": 1265, + "time_per_iteration": 2.7305567264556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103281, + "balance_loss_mlp": 1.07197642, + "epoch": 0.24355521354367063, + "flos": 536839004160.0, + "grad_norm": 0.0568893751116151, + "language_loss": 0.87145638, + "learning_rate": 0.0008851056539456896, + "loss": 0.88248914, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.31274414, + "step": 1266, + "time_per_iteration": 2.6886072158813477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109953, + "balance_loss_mlp": 1.06767774, + "epoch": 0.24374759522893422, + "flos": 930050975232.0, + "grad_norm": 0.06669847345827673, + "language_loss": 0.81623918, + "learning_rate": 0.0008849068810098755, + "loss": 0.82723451, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.31835938, + "step": 1267, + "time_per_iteration": 3.302135705947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092019, + "balance_loss_mlp": 1.06049967, + "epoch": 0.24393997691419778, + "flos": 427564002816.0, + "grad_norm": 0.06302829877877653, + "language_loss": 0.82764143, + "learning_rate": 0.0008847079586399575, + "loss": 0.83856159, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.31494141, + "step": 1268, + "time_per_iteration": 2.469602584838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088761, + "balance_loss_mlp": 1.05755162, + "epoch": 0.24413235859946134, + "flos": 578582479872.0, + "grad_norm": 0.062034835544456234, + "language_loss": 0.85665154, + "learning_rate": 0.0008845088869131641, + "loss": 0.86753917, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.31176758, + "step": 1269, + "time_per_iteration": 2.6822941303253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090407, + "balance_loss_mlp": 1.05864954, + "epoch": 0.2443247402847249, + "flos": 529600481280.0, + "grad_norm": 0.06778965234687388, + "language_loss": 0.88905638, + "learning_rate": 0.0008843096659067818, + "loss": 0.8999604, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.31738281, + "step": 1270, + "time_per_iteration": 2.594064235687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087142, + "balance_loss_mlp": 1.05555153, + "epoch": 0.24451712196998845, + "flos": 695996651520.0, + "grad_norm": 0.05697237066827103, + "language_loss": 0.85987377, + "learning_rate": 0.000884110295698155, + "loss": 0.87074518, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.31567383, + "step": 1271, + "time_per_iteration": 2.974696636199951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083849, + "balance_loss_mlp": 1.0512805, + "epoch": 0.24470950365525201, + "flos": 529575750144.0, + "grad_norm": 0.06068289501227115, + "language_loss": 0.85902673, + "learning_rate": 0.0008839107763646861, + "loss": 0.86986518, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.32568359, + "step": 1272, + "time_per_iteration": 2.607771158218384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085636, + "balance_loss_mlp": 1.0507555, + "epoch": 0.24490188534051557, + "flos": 491091448320.0, + "grad_norm": 0.061464799303267155, + "language_loss": 0.9008882, + "learning_rate": 0.0008837111079838353, + "loss": 0.91174459, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.34912109, + "step": 1273, + "time_per_iteration": 2.708512306213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107918, + "balance_loss_mlp": 1.0463264, + "epoch": 0.24509426702577913, + "flos": 473916842496.0, + "grad_norm": 0.06335862765515422, + "language_loss": 0.89847112, + "learning_rate": 0.000883511290633121, + "loss": 0.9092629, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.32861328, + "step": 1274, + "time_per_iteration": 2.5415730476379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077904, + "balance_loss_mlp": 1.04423904, + "epoch": 0.24528664871104272, + "flos": 550329624576.0, + "grad_norm": 0.04937694398035677, + "language_loss": 0.92408085, + "learning_rate": 0.000883311324390119, + "loss": 0.93485993, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.33691406, + "step": 1275, + "time_per_iteration": 2.734423875808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108035, + "balance_loss_mlp": 1.0457077, + "epoch": 0.24547903039630628, + "flos": 825546871296.0, + "grad_norm": 0.07292672859625873, + "language_loss": 0.80929816, + "learning_rate": 0.0008831112093324629, + "loss": 0.82010162, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.34667969, + "step": 1276, + "time_per_iteration": 3.0507287979125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076455, + "balance_loss_mlp": 1.04209912, + "epoch": 0.24567141208156984, + "flos": 591325221888.0, + "grad_norm": 0.0707858001482728, + "language_loss": 0.88982868, + "learning_rate": 0.0008829109455378444, + "loss": 0.90059322, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.34375, + "step": 1277, + "time_per_iteration": 2.6684513092041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076963, + "balance_loss_mlp": 1.04284549, + "epoch": 0.2458637937668334, + "flos": 547611715584.0, + "grad_norm": 0.05561589900472309, + "language_loss": 0.86233819, + "learning_rate": 0.000882710533084013, + "loss": 0.87310779, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.34155273, + "step": 1278, + "time_per_iteration": 2.623353958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074564, + "balance_loss_mlp": 1.04013681, + "epoch": 0.24605617545209696, + "flos": 515641379328.0, + "grad_norm": 0.04936271772538766, + "language_loss": 0.89139968, + "learning_rate": 0.0008825099720487755, + "loss": 0.90214527, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.34448242, + "step": 1279, + "time_per_iteration": 2.6549813747406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069233, + "balance_loss_mlp": 1.04853857, + "epoch": 0.24624855713736052, + "flos": 1510953951744.0, + "grad_norm": 0.028817901818472227, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76330376, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.20703125, + "step": 1280, + "time_per_iteration": 4.85357141494751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066002, + "balance_loss_mlp": 1.04521215, + "epoch": 0.24644093882262408, + "flos": 1526826419712.0, + "grad_norm": 0.026145975527968417, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79010111, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.20800781, + "step": 1281, + "time_per_iteration": 4.780989408493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083115, + "balance_loss_mlp": 1.04983163, + "epoch": 0.24663332050788764, + "flos": 658811667456.0, + "grad_norm": 0.06975718656823436, + "language_loss": 0.89050984, + "learning_rate": 0.0008819073982335619, + "loss": 0.90134096, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.33300781, + "step": 1282, + "time_per_iteration": 2.8345205783843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085728, + "balance_loss_mlp": 1.05361331, + "epoch": 0.24682570219315123, + "flos": 541510977024.0, + "grad_norm": 0.062337694406813374, + "language_loss": 0.84269708, + "learning_rate": 0.0008817062436519235, + "loss": 0.85355437, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.32104492, + "step": 1283, + "time_per_iteration": 2.6846866607666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089504, + "balance_loss_mlp": 1.05612516, + "epoch": 0.24701808387841478, + "flos": 440455131648.0, + "grad_norm": 0.06365108043104846, + "language_loss": 0.89943874, + "learning_rate": 0.0008815049408787788, + "loss": 0.91033375, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.33398438, + "step": 1284, + "time_per_iteration": 2.5116872787475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081689, + "balance_loss_mlp": 1.04916823, + "epoch": 0.24721046556367834, + "flos": 467826278400.0, + "grad_norm": 0.059551230096427064, + "language_loss": 0.85302055, + "learning_rate": 0.0008813034899922805, + "loss": 0.86383736, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.32519531, + "step": 1285, + "time_per_iteration": 2.5286993980407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080955, + "balance_loss_mlp": 1.04931688, + "epoch": 0.2474028472489419, + "flos": 504183398400.0, + "grad_norm": 0.06660544793665324, + "language_loss": 0.89506048, + "learning_rate": 0.0008811018910706387, + "loss": 0.90586996, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.31616211, + "step": 1286, + "time_per_iteration": 2.552616834640503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079345, + "balance_loss_mlp": 1.04756403, + "epoch": 0.24759522893420546, + "flos": 479707660800.0, + "grad_norm": 0.07038813341767636, + "language_loss": 0.81879961, + "learning_rate": 0.0008809001441921211, + "loss": 0.82959306, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.31762695, + "step": 1287, + "time_per_iteration": 2.704249143600464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082412, + "balance_loss_mlp": 1.05132163, + "epoch": 0.24778761061946902, + "flos": 533446000128.0, + "grad_norm": 0.054805193397824324, + "language_loss": 0.85345185, + "learning_rate": 0.0008806982494350528, + "loss": 0.86427593, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.31054688, + "step": 1288, + "time_per_iteration": 2.65993070602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084976, + "balance_loss_mlp": 1.05359983, + "epoch": 0.24797999230473258, + "flos": 559513446912.0, + "grad_norm": 0.05430799794632807, + "language_loss": 0.90285796, + "learning_rate": 0.0008804962068778161, + "loss": 0.91370773, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.31347656, + "step": 1289, + "time_per_iteration": 2.8633711338043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086217, + "balance_loss_mlp": 1.05515075, + "epoch": 0.24817237398999614, + "flos": 623912426496.0, + "grad_norm": 0.06485439157304855, + "language_loss": 0.81069577, + "learning_rate": 0.0008802940165988511, + "loss": 0.82155788, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.31030273, + "step": 1290, + "time_per_iteration": 2.877063274383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084315, + "balance_loss_mlp": 1.05341625, + "epoch": 0.2483647556752597, + "flos": 611981581824.0, + "grad_norm": 0.058113292585204916, + "language_loss": 0.88358063, + "learning_rate": 0.000880091678676655, + "loss": 0.89442384, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.30859375, + "step": 1291, + "time_per_iteration": 2.800182342529297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088307, + "balance_loss_mlp": 1.05814719, + "epoch": 0.2485571373605233, + "flos": 583270419456.0, + "grad_norm": 0.05744202885681841, + "language_loss": 0.88709044, + "learning_rate": 0.0008798891931897821, + "loss": 0.89797354, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.30126953, + "step": 1292, + "time_per_iteration": 2.8186981678009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090758, + "balance_loss_mlp": 1.06009781, + "epoch": 0.24874951904578685, + "flos": 494503391232.0, + "grad_norm": 0.06335011869227863, + "language_loss": 0.84085584, + "learning_rate": 0.0008796865602168447, + "loss": 0.85176343, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.30615234, + "step": 1293, + "time_per_iteration": 2.5642354488372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093183, + "balance_loss_mlp": 1.06218874, + "epoch": 0.2489419007310504, + "flos": 455925957120.0, + "grad_norm": 0.055204532335327836, + "language_loss": 0.88449144, + "learning_rate": 0.0008794837798365115, + "loss": 0.89542329, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.30957031, + "step": 1294, + "time_per_iteration": 2.640967607498169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103465, + "balance_loss_mlp": 1.07256651, + "epoch": 0.24913428241631397, + "flos": 485198733312.0, + "grad_norm": 0.05342912575045942, + "language_loss": 0.88282919, + "learning_rate": 0.0008792808521275089, + "loss": 0.8938638, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.30859375, + "step": 1295, + "time_per_iteration": 2.743216037750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106969, + "balance_loss_mlp": 1.07638037, + "epoch": 0.24932666410157753, + "flos": 518654651904.0, + "grad_norm": 0.05542201073335728, + "language_loss": 0.87427896, + "learning_rate": 0.0008790777771686206, + "loss": 0.88534868, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.30541992, + "step": 1296, + "time_per_iteration": 2.5764553546905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109723, + "balance_loss_mlp": 1.07934809, + "epoch": 0.2495190457868411, + "flos": 472365831168.0, + "grad_norm": 0.061211557913471215, + "language_loss": 0.85332036, + "learning_rate": 0.0008788745550386872, + "loss": 0.86441755, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.30322266, + "step": 1297, + "time_per_iteration": 2.635064125061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111335, + "balance_loss_mlp": 1.08226037, + "epoch": 0.24971142747210465, + "flos": 745559202816.0, + "grad_norm": 0.055423812451341224, + "language_loss": 0.79893327, + "learning_rate": 0.0008786711858166063, + "loss": 0.81006682, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.31054688, + "step": 1298, + "time_per_iteration": 3.002070903778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113917, + "balance_loss_mlp": 1.08387578, + "epoch": 0.2499038091573682, + "flos": 749222839296.0, + "grad_norm": 0.06342841372026603, + "language_loss": 0.8358891, + "learning_rate": 0.0008784676695813332, + "loss": 0.84702826, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.29980469, + "step": 1299, + "time_per_iteration": 2.941793918609619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116177, + "balance_loss_mlp": 1.08573055, + "epoch": 0.2500961908426318, + "flos": 744741513216.0, + "grad_norm": 0.05313888632052142, + "language_loss": 0.84205985, + "learning_rate": 0.0008782640064118796, + "loss": 0.85322165, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.30395508, + "step": 1300, + "time_per_iteration": 2.9038445949554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113921, + "balance_loss_mlp": 1.11441469, + "epoch": 0.2502885725278953, + "flos": 1416652180992.0, + "grad_norm": 0.03742785755303804, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77323961, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.24804688, + "step": 1301, + "time_per_iteration": 4.97193169593811 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108076, + "balance_loss_mlp": 1.0781548, + "epoch": 0.2504809542131589, + "flos": 514961902080.0, + "grad_norm": 0.06725713094725487, + "language_loss": 0.86707664, + "learning_rate": 0.0008778562395867648, + "loss": 0.87815738, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.29882812, + "step": 1302, + "time_per_iteration": 2.6434335708618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109494, + "balance_loss_mlp": 1.064852, + "epoch": 0.25067333589842244, + "flos": 525562905600.0, + "grad_norm": 0.0573305289073435, + "language_loss": 0.83713615, + "learning_rate": 0.0008776521360894127, + "loss": 0.84808552, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.30029297, + "step": 1303, + "time_per_iteration": 2.664281129837036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087429, + "balance_loss_mlp": 1.06206167, + "epoch": 0.25086571758368603, + "flos": 1473085108224.0, + "grad_norm": 0.030879512397293623, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80049491, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.25390625, + "step": 1304, + "time_per_iteration": 4.7838218212127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096771, + "balance_loss_mlp": 1.06682515, + "epoch": 0.2510580992689496, + "flos": 528128045568.0, + "grad_norm": 0.05889583885024225, + "language_loss": 0.90380585, + "learning_rate": 0.0008772434893213186, + "loss": 0.91477358, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.29882812, + "step": 1305, + "time_per_iteration": 2.619591236114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092992, + "balance_loss_mlp": 1.06228364, + "epoch": 0.25125048095421315, + "flos": 517192390656.0, + "grad_norm": 0.05643683756415757, + "language_loss": 0.84055364, + "learning_rate": 0.0008770389462092276, + "loss": 0.85148358, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.30664062, + "step": 1306, + "time_per_iteration": 2.646378517150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090271, + "balance_loss_mlp": 1.05860949, + "epoch": 0.25144286263947674, + "flos": 620160039936.0, + "grad_norm": 0.07421628365380602, + "language_loss": 0.86343837, + "learning_rate": 0.0008768342567176357, + "loss": 0.87434107, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.31640625, + "step": 1307, + "time_per_iteration": 2.807349681854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089217, + "balance_loss_mlp": 1.0562675, + "epoch": 0.25163524432474027, + "flos": 503534444544.0, + "grad_norm": 0.06024308313144323, + "language_loss": 0.90521109, + "learning_rate": 0.0008766294209260107, + "loss": 0.91610324, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.32958984, + "step": 1308, + "time_per_iteration": 2.652209758758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087002, + "balance_loss_mlp": 1.05510211, + "epoch": 0.25182762601000386, + "flos": 508821875712.0, + "grad_norm": 0.07044022402077256, + "language_loss": 0.90948963, + "learning_rate": 0.0008764244389138767, + "loss": 0.92035961, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.31884766, + "step": 1309, + "time_per_iteration": 2.583214044570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086884, + "balance_loss_mlp": 1.05386305, + "epoch": 0.2520200076952674, + "flos": 633596815872.0, + "grad_norm": 0.07007920023055086, + "language_loss": 0.82157373, + "learning_rate": 0.000876219310760815, + "loss": 0.83244258, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.33032227, + "step": 1310, + "time_per_iteration": 2.8652145862579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010922, + "balance_loss_mlp": 1.05956042, + "epoch": 0.252212389380531, + "flos": 494385527808.0, + "grad_norm": 0.05921747328918915, + "language_loss": 0.81032491, + "learning_rate": 0.0008760140365464631, + "loss": 0.82124686, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.32641602, + "step": 1311, + "time_per_iteration": 2.620199203491211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090848, + "balance_loss_mlp": 1.05799365, + "epoch": 0.2524047710657945, + "flos": 490298489856.0, + "grad_norm": 0.06933033432447253, + "language_loss": 0.87204492, + "learning_rate": 0.0008758086163505156, + "loss": 0.88295335, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.32861328, + "step": 1312, + "time_per_iteration": 2.5809056758880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085438, + "balance_loss_mlp": 1.05253649, + "epoch": 0.2525971527510581, + "flos": 647136898560.0, + "grad_norm": 0.05785086559723577, + "language_loss": 0.89221275, + "learning_rate": 0.0008756030502527239, + "loss": 0.90306717, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.32910156, + "step": 1313, + "time_per_iteration": 2.8305885791778564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084056, + "balance_loss_mlp": 1.05201209, + "epoch": 0.2527895344363217, + "flos": 568991222784.0, + "grad_norm": 0.05540107069612798, + "language_loss": 0.90540659, + "learning_rate": 0.0008753973383328954, + "loss": 0.91624713, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.3203125, + "step": 1314, + "time_per_iteration": 2.8095338344573975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084266, + "balance_loss_mlp": 1.0518887, + "epoch": 0.2529819161215852, + "flos": 513795004416.0, + "grad_norm": 0.06960735937341114, + "language_loss": 0.83534479, + "learning_rate": 0.0008751914806708952, + "loss": 0.84618747, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.32373047, + "step": 1315, + "time_per_iteration": 2.6356046199798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084811, + "balance_loss_mlp": 1.05357838, + "epoch": 0.2531742978068488, + "flos": 530979784704.0, + "grad_norm": 0.05966295966929829, + "language_loss": 0.82178831, + "learning_rate": 0.0008749854773466439, + "loss": 0.83263648, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.31201172, + "step": 1316, + "time_per_iteration": 2.673590898513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083614, + "balance_loss_mlp": 1.05199969, + "epoch": 0.25336667949211233, + "flos": 596362369536.0, + "grad_norm": 0.060440864571565875, + "language_loss": 0.84378719, + "learning_rate": 0.0008747793284401192, + "loss": 0.85462332, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.31591797, + "step": 1317, + "time_per_iteration": 2.672581195831299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078628, + "balance_loss_mlp": 1.04701352, + "epoch": 0.2535590611773759, + "flos": 601764691968.0, + "grad_norm": 0.06760844062466466, + "language_loss": 0.85858786, + "learning_rate": 0.0008745730340313551, + "loss": 0.8693741, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.31591797, + "step": 1318, + "time_per_iteration": 2.7483184337615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088603, + "balance_loss_mlp": 1.05775118, + "epoch": 0.25375144286263945, + "flos": 495079561728.0, + "grad_norm": 0.06356165501521222, + "language_loss": 0.84280074, + "learning_rate": 0.0008743665942004422, + "loss": 0.85368681, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.30834961, + "step": 1319, + "time_per_iteration": 2.659477472305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094218, + "balance_loss_mlp": 1.06362879, + "epoch": 0.25394382454790304, + "flos": 512219261952.0, + "grad_norm": 0.06511177952096096, + "language_loss": 0.92719352, + "learning_rate": 0.0008741600090275277, + "loss": 0.93813574, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.30541992, + "step": 1320, + "time_per_iteration": 2.6192221641540527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088577, + "balance_loss_mlp": 1.05758274, + "epoch": 0.25413620623316663, + "flos": 958586047488.0, + "grad_norm": 0.06459884228420558, + "language_loss": 0.84290528, + "learning_rate": 0.0008739532785928151, + "loss": 0.853791, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.30957031, + "step": 1321, + "time_per_iteration": 3.438142776489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166929, + "balance_loss_mlp": 1.14528096, + "epoch": 0.25432858791843016, + "flos": 1576445635584.0, + "grad_norm": 0.062216562760273944, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.7606051, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.21679688, + "step": 1322, + "time_per_iteration": 4.881207466125488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109523, + "balance_loss_mlp": 1.06502271, + "epoch": 0.25452096960369375, + "flos": 583530877440.0, + "grad_norm": 0.0660267567978659, + "language_loss": 0.8296389, + "learning_rate": 0.0008735393822590908, + "loss": 0.84059119, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.30151367, + "step": 1323, + "time_per_iteration": 2.7254581451416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097272, + "balance_loss_mlp": 1.06723142, + "epoch": 0.2547133512889573, + "flos": 508344629760.0, + "grad_norm": 0.07409821223339019, + "language_loss": 0.87412238, + "learning_rate": 0.0008733322165207681, + "loss": 0.88509512, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.30029297, + "step": 1324, + "time_per_iteration": 2.6910648345947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102689, + "balance_loss_mlp": 1.07295775, + "epoch": 0.25490573297422087, + "flos": 782266940928.0, + "grad_norm": 0.06686348955430095, + "language_loss": 0.83012944, + "learning_rate": 0.0008731249058420247, + "loss": 0.84115636, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.29663086, + "step": 1325, + "time_per_iteration": 3.0301432609558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105708, + "balance_loss_mlp": 1.07499993, + "epoch": 0.2550981146594844, + "flos": 509610451968.0, + "grad_norm": 0.057218587703981125, + "language_loss": 0.90547103, + "learning_rate": 0.0008729174503033459, + "loss": 0.91652811, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.30664062, + "step": 1326, + "time_per_iteration": 2.668544292449951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107604, + "balance_loss_mlp": 1.07706285, + "epoch": 0.255290496344748, + "flos": 676360212480.0, + "grad_norm": 0.08872727493885958, + "language_loss": 0.82430828, + "learning_rate": 0.0008727098499852728, + "loss": 0.83538437, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.30493164, + "step": 1327, + "time_per_iteration": 2.8206427097320557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102439, + "balance_loss_mlp": 1.07175469, + "epoch": 0.2554828780300115, + "flos": 537524273664.0, + "grad_norm": 0.05995612334517853, + "language_loss": 0.8945381, + "learning_rate": 0.0008725021049684034, + "loss": 0.90556252, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.30639648, + "step": 1328, + "time_per_iteration": 2.7788021564483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110018, + "balance_loss_mlp": 1.06906641, + "epoch": 0.2556752597152751, + "flos": 823828534272.0, + "grad_norm": 0.07693053452424695, + "language_loss": 0.82675111, + "learning_rate": 0.000872294215333391, + "loss": 0.83775294, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.31079102, + "step": 1329, + "time_per_iteration": 3.208423137664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089607, + "balance_loss_mlp": 1.05820751, + "epoch": 0.2558676414005387, + "flos": 570517502976.0, + "grad_norm": 0.05833009001407562, + "language_loss": 0.83099753, + "learning_rate": 0.0008720861811609457, + "loss": 0.84189361, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.3137207, + "step": 1330, + "time_per_iteration": 2.723451614379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082701, + "balance_loss_mlp": 1.05122948, + "epoch": 0.2560600230858022, + "flos": 486419475456.0, + "grad_norm": 0.06841234134213905, + "language_loss": 0.83759737, + "learning_rate": 0.0008718780025318338, + "loss": 0.84842432, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.31445312, + "step": 1331, + "time_per_iteration": 2.7594637870788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084055, + "balance_loss_mlp": 1.05244088, + "epoch": 0.2562524047710658, + "flos": 512874008064.0, + "grad_norm": 0.059488371229756976, + "language_loss": 0.83890998, + "learning_rate": 0.0008716696795268771, + "loss": 0.84975058, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.31591797, + "step": 1332, + "time_per_iteration": 2.719435453414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086447, + "balance_loss_mlp": 1.05516648, + "epoch": 0.25644478645632934, + "flos": 634498873344.0, + "grad_norm": 0.09040651922247907, + "language_loss": 0.85621184, + "learning_rate": 0.0008714612122269538, + "loss": 0.86707628, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.3125, + "step": 1333, + "time_per_iteration": 2.846071481704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087221, + "balance_loss_mlp": 1.05517721, + "epoch": 0.25663716814159293, + "flos": 436353537024.0, + "grad_norm": 0.06079891504044088, + "language_loss": 0.8881824, + "learning_rate": 0.0008712526007129982, + "loss": 0.89905459, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.3203125, + "step": 1334, + "time_per_iteration": 2.5539238452911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084143, + "balance_loss_mlp": 1.05226636, + "epoch": 0.25682954982685646, + "flos": 497892013056.0, + "grad_norm": 0.06135189476637687, + "language_loss": 0.90600282, + "learning_rate": 0.0008710438450660003, + "loss": 0.91684425, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.31835938, + "step": 1335, + "time_per_iteration": 2.6957638263702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081701, + "balance_loss_mlp": 1.04984844, + "epoch": 0.25702193151212005, + "flos": 457471176192.0, + "grad_norm": 0.09152684925001835, + "language_loss": 0.86861122, + "learning_rate": 0.0008708349453670064, + "loss": 0.87942821, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.31835938, + "step": 1336, + "time_per_iteration": 2.569918632507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080209, + "balance_loss_mlp": 1.04854655, + "epoch": 0.2572143131973836, + "flos": 598002130944.0, + "grad_norm": 0.055029840901202824, + "language_loss": 0.91123867, + "learning_rate": 0.0008706259016971185, + "loss": 0.92204076, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.31640625, + "step": 1337, + "time_per_iteration": 2.7755186557769775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077567, + "balance_loss_mlp": 1.04554725, + "epoch": 0.25740669488264717, + "flos": 698004559872.0, + "grad_norm": 0.08019888390454845, + "language_loss": 0.82668757, + "learning_rate": 0.0008704167141374944, + "loss": 0.83746326, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.32006836, + "step": 1338, + "time_per_iteration": 2.8559322357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073148, + "balance_loss_mlp": 1.04184318, + "epoch": 0.25759907656791076, + "flos": 502130409984.0, + "grad_norm": 0.06412343972447931, + "language_loss": 0.88389909, + "learning_rate": 0.0008702073827693482, + "loss": 0.89463055, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.31274414, + "step": 1339, + "time_per_iteration": 2.725090265274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077981, + "balance_loss_mlp": 1.04662943, + "epoch": 0.2577914582531743, + "flos": 773541425664.0, + "grad_norm": 0.06471871877048396, + "language_loss": 0.88798392, + "learning_rate": 0.0008699979076739494, + "loss": 0.89876378, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.31323242, + "step": 1340, + "time_per_iteration": 2.9663493633270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074686, + "balance_loss_mlp": 1.04354882, + "epoch": 0.2579838399384379, + "flos": 459431032320.0, + "grad_norm": 0.0844279622703065, + "language_loss": 0.88438749, + "learning_rate": 0.0008697882889326234, + "loss": 0.89513433, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.31103516, + "step": 1341, + "time_per_iteration": 2.5622262954711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081353, + "balance_loss_mlp": 1.05047798, + "epoch": 0.2581762216237014, + "flos": 568917029376.0, + "grad_norm": 0.07114901487039385, + "language_loss": 0.86560714, + "learning_rate": 0.0008695785266267515, + "loss": 0.87642074, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.30834961, + "step": 1342, + "time_per_iteration": 2.7169957160949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083185, + "balance_loss_mlp": 1.05309629, + "epoch": 0.258368603308965, + "flos": 603906430464.0, + "grad_norm": 0.06303738321086937, + "language_loss": 0.82804394, + "learning_rate": 0.0008693686208377704, + "loss": 0.83887577, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.30053711, + "step": 1343, + "time_per_iteration": 2.8591935634613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090607, + "balance_loss_mlp": 1.06142426, + "epoch": 0.2585609849942285, + "flos": 491204929536.0, + "grad_norm": 0.06465186244058573, + "language_loss": 0.88812125, + "learning_rate": 0.0008691585716471733, + "loss": 0.89902723, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.29150391, + "step": 1344, + "time_per_iteration": 2.6713430881500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099449, + "balance_loss_mlp": 1.07119632, + "epoch": 0.2587533666794921, + "flos": 640455607296.0, + "grad_norm": 0.0588719911399204, + "language_loss": 0.85261089, + "learning_rate": 0.0008689483791365079, + "loss": 0.86360538, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.28271484, + "step": 1345, + "time_per_iteration": 2.820528030395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112565, + "balance_loss_mlp": 1.08457518, + "epoch": 0.2589457483647557, + "flos": 576564397056.0, + "grad_norm": 0.06280839806958106, + "language_loss": 0.89176255, + "learning_rate": 0.0008687380433873786, + "loss": 0.90288818, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.28027344, + "step": 1346, + "time_per_iteration": 2.8161351680755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122151, + "balance_loss_mlp": 1.09442306, + "epoch": 0.25913813005001923, + "flos": 535164337152.0, + "grad_norm": 0.09019918884346267, + "language_loss": 0.82469404, + "learning_rate": 0.0008685275644814448, + "loss": 0.83591551, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.27734375, + "step": 1347, + "time_per_iteration": 2.693267822265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122406, + "balance_loss_mlp": 1.09403384, + "epoch": 0.2593305117352828, + "flos": 720713908224.0, + "grad_norm": 0.0763626786758855, + "language_loss": 0.83996952, + "learning_rate": 0.0008683169425004216, + "loss": 0.85119361, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.28393555, + "step": 1348, + "time_per_iteration": 2.9267332553863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104312, + "balance_loss_mlp": 1.07582057, + "epoch": 0.25952289342054635, + "flos": 709782635520.0, + "grad_norm": 0.0999879699530973, + "language_loss": 0.82942533, + "learning_rate": 0.0008681061775260799, + "loss": 0.84046841, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.28491211, + "step": 1349, + "time_per_iteration": 2.8389806747436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104623, + "balance_loss_mlp": 1.0761795, + "epoch": 0.25971527510580994, + "flos": 455688820224.0, + "grad_norm": 0.06848449496170159, + "language_loss": 0.9182089, + "learning_rate": 0.0008678952696402458, + "loss": 0.92925513, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.28442383, + "step": 1350, + "time_per_iteration": 2.520573377609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091244, + "balance_loss_mlp": 1.06270587, + "epoch": 0.25990765679107347, + "flos": 612223100928.0, + "grad_norm": 0.06363942150358032, + "language_loss": 0.86753285, + "learning_rate": 0.000867684218924801, + "loss": 0.87844533, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.28564453, + "step": 1351, + "time_per_iteration": 2.9015109539031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094999, + "balance_loss_mlp": 1.07382762, + "epoch": 0.26010003847633706, + "flos": 1537105766400.0, + "grad_norm": 0.03643594447100183, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80042088, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.21191406, + "step": 1352, + "time_per_iteration": 4.897913217544556 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089345, + "balance_loss_mlp": 1.05987692, + "epoch": 0.2602924201616006, + "flos": 715947393024.0, + "grad_norm": 0.05004222260192376, + "language_loss": 0.8488791, + "learning_rate": 0.0008672616893328834, + "loss": 0.85977256, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.29394531, + "step": 1353, + "time_per_iteration": 2.930330991744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089197, + "balance_loss_mlp": 1.05925155, + "epoch": 0.2604848018468642, + "flos": 643241917440.0, + "grad_norm": 0.06508424080641521, + "language_loss": 0.90170342, + "learning_rate": 0.0008670502106204512, + "loss": 0.91259539, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.29882812, + "step": 1354, + "time_per_iteration": 2.8581433296203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088042, + "balance_loss_mlp": 1.05821621, + "epoch": 0.26067718353212777, + "flos": 516783545856.0, + "grad_norm": 0.07357469643966064, + "language_loss": 0.81904948, + "learning_rate": 0.0008668385894064892, + "loss": 0.82992983, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.2980957, + "step": 1355, + "time_per_iteration": 2.6258199214935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086225, + "balance_loss_mlp": 1.05565977, + "epoch": 0.2608695652173913, + "flos": 822361890816.0, + "grad_norm": 0.05598612189883674, + "language_loss": 0.88435078, + "learning_rate": 0.0008666268257731562, + "loss": 0.89521307, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.30517578, + "step": 1356, + "time_per_iteration": 3.0935704708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096188, + "balance_loss_mlp": 1.06557548, + "epoch": 0.2610619469026549, + "flos": 1007451744768.0, + "grad_norm": 0.05877228431721195, + "language_loss": 0.85582316, + "learning_rate": 0.0008664149198026662, + "loss": 0.86678505, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.3059082, + "step": 1357, + "time_per_iteration": 3.3150172233581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093826, + "balance_loss_mlp": 1.06407189, + "epoch": 0.2612543285879184, + "flos": 536523291648.0, + "grad_norm": 0.08010917030088013, + "language_loss": 0.88609982, + "learning_rate": 0.0008662028715772883, + "loss": 0.8970381, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.29736328, + "step": 1358, + "time_per_iteration": 2.652510166168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117948, + "balance_loss_mlp": 1.08781219, + "epoch": 0.261446710273182, + "flos": 519166803456.0, + "grad_norm": 0.068011575409632, + "language_loss": 0.8599565, + "learning_rate": 0.0008659906811793467, + "loss": 0.87113595, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.30078125, + "step": 1359, + "time_per_iteration": 2.6895272731781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120144, + "balance_loss_mlp": 1.08917356, + "epoch": 0.26163909195844554, + "flos": 582975055872.0, + "grad_norm": 0.06541737550876531, + "language_loss": 0.89626461, + "learning_rate": 0.0008657783486912215, + "loss": 0.90746599, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.30932617, + "step": 1360, + "time_per_iteration": 2.762763738632202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112864, + "balance_loss_mlp": 1.09752679, + "epoch": 0.2618314736437091, + "flos": 958362057216.0, + "grad_norm": 0.08393806981558949, + "language_loss": 0.89884281, + "learning_rate": 0.0008655658741953472, + "loss": 0.91012919, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.31079102, + "step": 1361, + "time_per_iteration": 3.2099156379699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108189, + "balance_loss_mlp": 1.07740927, + "epoch": 0.26202385532897265, + "flos": 574530347520.0, + "grad_norm": 0.05266132623937494, + "language_loss": 0.88221049, + "learning_rate": 0.0008653532577742136, + "loss": 0.89329231, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.30761719, + "step": 1362, + "time_per_iteration": 2.6699323654174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097872, + "balance_loss_mlp": 1.06756878, + "epoch": 0.26221623701423624, + "flos": 445240585728.0, + "grad_norm": 0.06436829867728516, + "language_loss": 0.86740243, + "learning_rate": 0.0008651404995103659, + "loss": 0.87838113, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.30273438, + "step": 1363, + "time_per_iteration": 2.5310258865356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094148, + "balance_loss_mlp": 1.06286716, + "epoch": 0.26240861869949983, + "flos": 535459700736.0, + "grad_norm": 0.05795299669830668, + "language_loss": 0.8642996, + "learning_rate": 0.0008649275994864041, + "loss": 0.87524116, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.3125, + "step": 1364, + "time_per_iteration": 2.675330638885498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110185, + "balance_loss_mlp": 1.07066512, + "epoch": 0.26260100038476336, + "flos": 564940500480.0, + "grad_norm": 0.05147405231292679, + "language_loss": 0.83778602, + "learning_rate": 0.0008647145577849834, + "loss": 0.84880447, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.31152344, + "step": 1365, + "time_per_iteration": 2.817330837249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099582, + "balance_loss_mlp": 1.06913614, + "epoch": 0.26279338207002695, + "flos": 612745426944.0, + "grad_norm": 0.05119291352940178, + "language_loss": 0.82886052, + "learning_rate": 0.0008645013744888139, + "loss": 0.83985633, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.30395508, + "step": 1366, + "time_per_iteration": 2.9056894779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093325, + "balance_loss_mlp": 1.06318903, + "epoch": 0.2629857637552905, + "flos": 522555425280.0, + "grad_norm": 0.08887633390516779, + "language_loss": 0.8772788, + "learning_rate": 0.0008642880496806607, + "loss": 0.88821203, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.30102539, + "step": 1367, + "time_per_iteration": 2.8175759315490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094726, + "balance_loss_mlp": 1.0635649, + "epoch": 0.26317814544055407, + "flos": 534273864192.0, + "grad_norm": 0.0720053964715196, + "language_loss": 0.84128964, + "learning_rate": 0.0008640745834433437, + "loss": 0.85223687, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.3112793, + "step": 1368, + "time_per_iteration": 2.7703893184661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085473, + "balance_loss_mlp": 1.05559897, + "epoch": 0.2633705271258176, + "flos": 555235762176.0, + "grad_norm": 0.058958451803685384, + "language_loss": 0.86905044, + "learning_rate": 0.000863860975859738, + "loss": 0.87990516, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.29833984, + "step": 1369, + "time_per_iteration": 2.913543224334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093496, + "balance_loss_mlp": 1.06309724, + "epoch": 0.2635629088110812, + "flos": 552136711680.0, + "grad_norm": 0.07885033776141591, + "language_loss": 0.87845421, + "learning_rate": 0.0008636472270127733, + "loss": 0.8893891, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.3034668, + "step": 1370, + "time_per_iteration": 2.6615941524505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093443, + "balance_loss_mlp": 1.06368852, + "epoch": 0.2637552904963448, + "flos": 455752839168.0, + "grad_norm": 0.06686078076555955, + "language_loss": 0.90047085, + "learning_rate": 0.0008634333369854345, + "loss": 0.91140521, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.29736328, + "step": 1371, + "time_per_iteration": 2.611501932144165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109652, + "balance_loss_mlp": 1.06666958, + "epoch": 0.2639476721816083, + "flos": 612847323648.0, + "grad_norm": 0.05135890593758564, + "language_loss": 0.87519878, + "learning_rate": 0.0008632193058607608, + "loss": 0.88616395, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.29833984, + "step": 1372, + "time_per_iteration": 2.7420408725738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096239, + "balance_loss_mlp": 1.06681848, + "epoch": 0.2641400538668719, + "flos": 571645112832.0, + "grad_norm": 0.07070265457366111, + "language_loss": 0.80896008, + "learning_rate": 0.0008630051337218466, + "loss": 0.81992251, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.29394531, + "step": 1373, + "time_per_iteration": 2.694157123565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097092, + "balance_loss_mlp": 1.06762338, + "epoch": 0.2643324355521354, + "flos": 581979866112.0, + "grad_norm": 0.06318549857397857, + "language_loss": 0.8188293, + "learning_rate": 0.0008627908206518409, + "loss": 0.82980019, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.29418945, + "step": 1374, + "time_per_iteration": 2.703380823135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023426, + "balance_loss_mlp": 1.00330341, + "epoch": 0.264524817237399, + "flos": 1543845284352.0, + "grad_norm": 0.017765090827900253, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76174676, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.20117188, + "step": 1375, + "time_per_iteration": 4.995063781738281 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092603, + "balance_loss_mlp": 1.06237197, + "epoch": 0.26471719892266254, + "flos": 517783117824.0, + "grad_norm": 0.0561933760173491, + "language_loss": 0.9114545, + "learning_rate": 0.0008623617720514241, + "loss": 0.92238057, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.30224609, + "step": 1376, + "time_per_iteration": 2.666578769683838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093572, + "balance_loss_mlp": 1.06276798, + "epoch": 0.26490958060792613, + "flos": 516936314880.0, + "grad_norm": 0.06268473823371516, + "language_loss": 0.84907627, + "learning_rate": 0.0008621470366875848, + "loss": 0.86001205, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.30761719, + "step": 1377, + "time_per_iteration": 2.576968193054199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087995, + "balance_loss_mlp": 1.05661869, + "epoch": 0.26510196229318966, + "flos": 596298350592.0, + "grad_norm": 0.05801174228437736, + "language_loss": 0.87514544, + "learning_rate": 0.0008619321607257966, + "loss": 0.88602537, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.31347656, + "step": 1378, + "time_per_iteration": 2.6873912811279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083836, + "balance_loss_mlp": 1.05396187, + "epoch": 0.26529434397845325, + "flos": 685488780288.0, + "grad_norm": 0.06612008054140536, + "language_loss": 0.81601393, + "learning_rate": 0.000861717144249482, + "loss": 0.82685226, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.2980957, + "step": 1379, + "time_per_iteration": 2.861531972885132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082319, + "balance_loss_mlp": 1.05220687, + "epoch": 0.26548672566371684, + "flos": 424127328768.0, + "grad_norm": 0.06041061044303736, + "language_loss": 0.89415485, + "learning_rate": 0.0008615019873421175, + "loss": 0.90497804, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.30053711, + "step": 1380, + "time_per_iteration": 2.4596564769744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080185, + "balance_loss_mlp": 1.04973865, + "epoch": 0.26567910734898037, + "flos": 489619012608.0, + "grad_norm": 0.12029414194163875, + "language_loss": 0.85435975, + "learning_rate": 0.0008612866900872349, + "loss": 0.86516166, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.30395508, + "step": 1381, + "time_per_iteration": 2.5492422580718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078246, + "balance_loss_mlp": 1.0483005, + "epoch": 0.26587148903424396, + "flos": 533947977216.0, + "grad_norm": 0.06111803920627532, + "language_loss": 0.87957448, + "learning_rate": 0.0008610712525684197, + "loss": 0.89035696, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.29882812, + "step": 1382, + "time_per_iteration": 2.632847309112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083959, + "balance_loss_mlp": 1.05356061, + "epoch": 0.2660638707195075, + "flos": 1017067732992.0, + "grad_norm": 0.07781171288722535, + "language_loss": 0.84130585, + "learning_rate": 0.0008608556748693121, + "loss": 0.85214543, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.3034668, + "step": 1383, + "time_per_iteration": 3.246919631958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108693, + "balance_loss_mlp": 1.05522013, + "epoch": 0.2662562524047711, + "flos": 523712148480.0, + "grad_norm": 0.052993237489823604, + "language_loss": 0.85963714, + "learning_rate": 0.000860639957073607, + "loss": 0.87050641, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.31689453, + "step": 1384, + "time_per_iteration": 2.7504889965057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086729, + "balance_loss_mlp": 1.05537665, + "epoch": 0.2664486340900346, + "flos": 552107598336.0, + "grad_norm": 0.06878538642870029, + "language_loss": 0.87610686, + "learning_rate": 0.0008604240992650534, + "loss": 0.88697416, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.31347656, + "step": 1385, + "time_per_iteration": 2.6546881198883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082661, + "balance_loss_mlp": 1.05135679, + "epoch": 0.2666410157752982, + "flos": 469895233536.0, + "grad_norm": 0.05853696199287041, + "language_loss": 0.89197159, + "learning_rate": 0.0008602081015274545, + "loss": 0.90279818, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.31274414, + "step": 1386, + "time_per_iteration": 2.7526328563690186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091919, + "balance_loss_mlp": 1.06061459, + "epoch": 0.2668333974605617, + "flos": 569645968896.0, + "grad_norm": 0.05264786586341277, + "language_loss": 0.83147365, + "learning_rate": 0.0008599919639446684, + "loss": 0.8423928, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.31274414, + "step": 1387, + "time_per_iteration": 2.6775026321411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093309, + "balance_loss_mlp": 1.06126583, + "epoch": 0.2670257791458253, + "flos": 398755325952.0, + "grad_norm": 0.06747698326814106, + "language_loss": 0.79790741, + "learning_rate": 0.000859775686600607, + "loss": 0.80884051, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.3203125, + "step": 1388, + "time_per_iteration": 2.535508871078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090634, + "balance_loss_mlp": 1.05921042, + "epoch": 0.2672181608310889, + "flos": 515587534848.0, + "grad_norm": 0.06336986871451572, + "language_loss": 0.84764999, + "learning_rate": 0.0008595592695792367, + "loss": 0.85855639, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.31396484, + "step": 1389, + "time_per_iteration": 2.6549055576324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097148, + "balance_loss_mlp": 1.06593931, + "epoch": 0.26741054251635243, + "flos": 507270864384.0, + "grad_norm": 0.055901377362424544, + "language_loss": 0.90619266, + "learning_rate": 0.0008593427129645778, + "loss": 0.91716409, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.31176758, + "step": 1390, + "time_per_iteration": 2.6070477962493896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096463, + "balance_loss_mlp": 1.06542134, + "epoch": 0.267602924201616, + "flos": 576357783552.0, + "grad_norm": 0.06788313950064188, + "language_loss": 0.85213327, + "learning_rate": 0.0008591260168407052, + "loss": 0.86309791, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.31005859, + "step": 1391, + "time_per_iteration": 2.794921398162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092372, + "balance_loss_mlp": 1.05963671, + "epoch": 0.26779530588687955, + "flos": 523731087360.0, + "grad_norm": 0.052723370404498295, + "language_loss": 0.82993329, + "learning_rate": 0.0008589091812917479, + "loss": 0.84085703, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.32739258, + "step": 1392, + "time_per_iteration": 2.634734869003296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088674, + "balance_loss_mlp": 1.05727446, + "epoch": 0.26798768757214314, + "flos": 556508938752.0, + "grad_norm": 0.06846284491975779, + "language_loss": 0.85420829, + "learning_rate": 0.0008586922064018887, + "loss": 0.86509502, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.3137207, + "step": 1393, + "time_per_iteration": 2.662095308303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108591, + "balance_loss_mlp": 1.05408156, + "epoch": 0.2681800692574067, + "flos": 930246004224.0, + "grad_norm": 0.07721778370466406, + "language_loss": 0.89049023, + "learning_rate": 0.0008584750922553651, + "loss": 0.90134937, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.31811523, + "step": 1394, + "time_per_iteration": 3.15010666847229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082121, + "balance_loss_mlp": 1.05053067, + "epoch": 0.26837245094267026, + "flos": 700771931136.0, + "grad_norm": 0.054821616219537066, + "language_loss": 0.83275163, + "learning_rate": 0.0008582578389364677, + "loss": 0.8435728, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.31567383, + "step": 1395, + "time_per_iteration": 2.9199917316436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086932, + "balance_loss_mlp": 1.05469775, + "epoch": 0.26856483262793385, + "flos": 592892199936.0, + "grad_norm": 0.049938668546041676, + "language_loss": 0.91772366, + "learning_rate": 0.0008580404465295422, + "loss": 0.92859298, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.32226562, + "step": 1396, + "time_per_iteration": 2.8488125801086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079266, + "balance_loss_mlp": 1.04891562, + "epoch": 0.2687572143131974, + "flos": 713943866880.0, + "grad_norm": 0.06204428603549851, + "language_loss": 0.87966394, + "learning_rate": 0.0008578229151189876, + "loss": 0.89045662, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.30297852, + "step": 1397, + "time_per_iteration": 2.92258620262146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081241, + "balance_loss_mlp": 1.04867268, + "epoch": 0.26894959599846097, + "flos": 467481452544.0, + "grad_norm": 0.06429333021146523, + "language_loss": 0.81249309, + "learning_rate": 0.0008576052447892573, + "loss": 0.82330555, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.32568359, + "step": 1398, + "time_per_iteration": 2.551042318344116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083897, + "balance_loss_mlp": 1.05163908, + "epoch": 0.2691419776837245, + "flos": 468470850048.0, + "grad_norm": 0.0671833421183549, + "language_loss": 0.86040235, + "learning_rate": 0.000857387435624858, + "loss": 0.87124133, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.32250977, + "step": 1399, + "time_per_iteration": 2.5816056728363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086843, + "balance_loss_mlp": 1.05382252, + "epoch": 0.2693343593689881, + "flos": 937244418048.0, + "grad_norm": 0.05003222473195782, + "language_loss": 0.87953913, + "learning_rate": 0.0008571694877103513, + "loss": 0.89040762, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.33032227, + "step": 1400, + "time_per_iteration": 3.256469488143921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108756, + "balance_loss_mlp": 1.05542135, + "epoch": 0.2695267410542516, + "flos": 577303511040.0, + "grad_norm": 0.056643414184275494, + "language_loss": 0.87665725, + "learning_rate": 0.0008569514011303515, + "loss": 0.88753277, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.32128906, + "step": 1401, + "time_per_iteration": 2.782273054122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084611, + "balance_loss_mlp": 1.05275857, + "epoch": 0.2697191227395152, + "flos": 556539462144.0, + "grad_norm": 0.06127144796082157, + "language_loss": 0.8767277, + "learning_rate": 0.0008567331759695277, + "loss": 0.88757378, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.31835938, + "step": 1402, + "time_per_iteration": 2.696514368057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084545, + "balance_loss_mlp": 1.05178595, + "epoch": 0.26991150442477874, + "flos": 529024310784.0, + "grad_norm": 0.07491599518741582, + "language_loss": 0.86524475, + "learning_rate": 0.0008565148123126023, + "loss": 0.87609023, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.32763672, + "step": 1403, + "time_per_iteration": 2.6686785221099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088194, + "balance_loss_mlp": 1.05510116, + "epoch": 0.2701038861100423, + "flos": 531737837568.0, + "grad_norm": 0.050644669708274456, + "language_loss": 0.8574301, + "learning_rate": 0.0008562963102443516, + "loss": 0.86831206, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.33105469, + "step": 1404, + "time_per_iteration": 2.693836212158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085916, + "balance_loss_mlp": 1.05232334, + "epoch": 0.2702962677953059, + "flos": 734908737024.0, + "grad_norm": 0.06951419199959312, + "language_loss": 0.84958577, + "learning_rate": 0.0008560776698496056, + "loss": 0.8604449, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.33618164, + "step": 1405, + "time_per_iteration": 2.892805814743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084222, + "balance_loss_mlp": 1.05093896, + "epoch": 0.27048864948056944, + "flos": 574453181952.0, + "grad_norm": 0.07287556066439085, + "language_loss": 0.85794389, + "learning_rate": 0.0008558588912132481, + "loss": 0.8687861, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.33300781, + "step": 1406, + "time_per_iteration": 2.821922540664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098005, + "balance_loss_mlp": 1.07587957, + "epoch": 0.27068103116583303, + "flos": 1423091953152.0, + "grad_norm": 0.044578698770804955, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77556992, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.22167969, + "step": 1407, + "time_per_iteration": 4.952622652053833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082949, + "balance_loss_mlp": 1.05016637, + "epoch": 0.27087341285109656, + "flos": 531742219776.0, + "grad_norm": 0.05991157104862915, + "language_loss": 0.82959783, + "learning_rate": 0.0008554209195555016, + "loss": 0.84042734, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.32788086, + "step": 1408, + "time_per_iteration": 2.6937575340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085788, + "balance_loss_mlp": 1.05403042, + "epoch": 0.27106579453636015, + "flos": 581108332032.0, + "grad_norm": 0.06960051295953752, + "language_loss": 0.88047969, + "learning_rate": 0.0008552017267041483, + "loss": 0.89133757, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.31738281, + "step": 1409, + "time_per_iteration": 2.7926084995269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093218, + "balance_loss_mlp": 1.06134176, + "epoch": 0.2712581762216237, + "flos": 506533160448.0, + "grad_norm": 0.07424010893339522, + "language_loss": 0.8324914, + "learning_rate": 0.0008549823959512549, + "loss": 0.8434236, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.31860352, + "step": 1410, + "time_per_iteration": 2.660325050354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098331, + "balance_loss_mlp": 1.06724083, + "epoch": 0.27145055790688727, + "flos": 997019476992.0, + "grad_norm": 0.062062202361739795, + "language_loss": 0.86755967, + "learning_rate": 0.0008547629273819728, + "loss": 0.87854296, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.31054688, + "step": 1411, + "time_per_iteration": 3.3994545936584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098737, + "balance_loss_mlp": 1.06736147, + "epoch": 0.2716429395921508, + "flos": 546420086784.0, + "grad_norm": 0.06335672358829844, + "language_loss": 0.83453959, + "learning_rate": 0.0008545433210815074, + "loss": 0.84552693, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.31347656, + "step": 1412, + "time_per_iteration": 2.644434690475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102874, + "balance_loss_mlp": 1.07123613, + "epoch": 0.2718353212774144, + "flos": 572954605056.0, + "grad_norm": 0.06340025797507488, + "language_loss": 0.87345338, + "learning_rate": 0.0008543235771351176, + "loss": 0.88448215, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.31616211, + "step": 1413, + "time_per_iteration": 2.7854721546173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098411, + "balance_loss_mlp": 1.0675596, + "epoch": 0.272027702962678, + "flos": 643986823680.0, + "grad_norm": 0.05399278560092938, + "language_loss": 0.84545946, + "learning_rate": 0.0008541036956281154, + "loss": 0.85644352, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.30834961, + "step": 1414, + "time_per_iteration": 2.8788704872131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091962, + "balance_loss_mlp": 1.06056201, + "epoch": 0.2722200846479415, + "flos": 653410755072.0, + "grad_norm": 0.07883268546047513, + "language_loss": 0.81883514, + "learning_rate": 0.0008538836766458665, + "loss": 0.82975471, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.3137207, + "step": 1415, + "time_per_iteration": 2.8526153564453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087599, + "balance_loss_mlp": 1.05732012, + "epoch": 0.2724124663332051, + "flos": 579346324992.0, + "grad_norm": 0.060849568603238105, + "language_loss": 0.84889638, + "learning_rate": 0.0008536635202737897, + "loss": 0.85977244, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.30224609, + "step": 1416, + "time_per_iteration": 2.837353467941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089772, + "balance_loss_mlp": 1.05903983, + "epoch": 0.2726048480184686, + "flos": 537178037760.0, + "grad_norm": 0.07898075745209039, + "language_loss": 0.82057679, + "learning_rate": 0.0008534432265973573, + "loss": 0.83147448, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.30688477, + "step": 1417, + "time_per_iteration": 2.5948355197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091815, + "balance_loss_mlp": 1.05891299, + "epoch": 0.2727972297037322, + "flos": 995360776704.0, + "grad_norm": 0.06605458024108496, + "language_loss": 0.87714171, + "learning_rate": 0.000853222795702095, + "loss": 0.88805991, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.32910156, + "step": 1418, + "time_per_iteration": 3.4183547496795654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109188, + "balance_loss_mlp": 1.05842948, + "epoch": 0.27298961138899575, + "flos": 605924513280.0, + "grad_norm": 0.04642939327926388, + "language_loss": 0.83471483, + "learning_rate": 0.0008530022276735813, + "loss": 0.84563363, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.33447266, + "step": 1419, + "time_per_iteration": 2.711695432662964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086506, + "balance_loss_mlp": 1.05293703, + "epoch": 0.27318199307425933, + "flos": 529059216384.0, + "grad_norm": 0.05938997521105461, + "language_loss": 0.85724676, + "learning_rate": 0.0008527815225974489, + "loss": 0.86811179, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.3359375, + "step": 1420, + "time_per_iteration": 2.648448944091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086523, + "balance_loss_mlp": 1.05407453, + "epoch": 0.2733743747595229, + "flos": 408809272320.0, + "grad_norm": 0.07492898694353861, + "language_loss": 0.87982917, + "learning_rate": 0.0008525606805593829, + "loss": 0.89069438, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.32446289, + "step": 1421, + "time_per_iteration": 2.4182560443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082421, + "balance_loss_mlp": 1.04997277, + "epoch": 0.27356675644478645, + "flos": 515976030720.0, + "grad_norm": 0.06962089633364145, + "language_loss": 0.82760686, + "learning_rate": 0.0008523397016451213, + "loss": 0.83843112, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.32446289, + "step": 1422, + "time_per_iteration": 2.587892532348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083477, + "balance_loss_mlp": 1.05021799, + "epoch": 0.27375913813005004, + "flos": 1051914539520.0, + "grad_norm": 0.053513553181154576, + "language_loss": 0.8711561, + "learning_rate": 0.0008521185859404564, + "loss": 0.88199091, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.33276367, + "step": 1423, + "time_per_iteration": 3.372192859649658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108477, + "balance_loss_mlp": 1.0513202, + "epoch": 0.27395151981531357, + "flos": 624507535872.0, + "grad_norm": 0.059986100163812936, + "language_loss": 0.89238524, + "learning_rate": 0.0008518973335312326, + "loss": 0.90323293, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.33447266, + "step": 1424, + "time_per_iteration": 2.791482448577881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082662, + "balance_loss_mlp": 1.04921198, + "epoch": 0.27414390150057716, + "flos": 550112836608.0, + "grad_norm": 0.06956472940992567, + "language_loss": 0.8333236, + "learning_rate": 0.0008516759445033477, + "loss": 0.84415025, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.3347168, + "step": 1425, + "time_per_iteration": 2.6889073848724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082757, + "balance_loss_mlp": 1.05088091, + "epoch": 0.2743362831858407, + "flos": 539596200960.0, + "grad_norm": 0.0615305422895171, + "language_loss": 0.84459686, + "learning_rate": 0.0008514544189427526, + "loss": 0.85542446, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.31860352, + "step": 1426, + "time_per_iteration": 2.797384738922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094312, + "balance_loss_mlp": 1.06143463, + "epoch": 0.2745286648711043, + "flos": 468352986624.0, + "grad_norm": 0.061840511174045036, + "language_loss": 0.86558306, + "learning_rate": 0.0008512327569354511, + "loss": 0.87652624, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.32885742, + "step": 1427, + "time_per_iteration": 2.533623695373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096344, + "balance_loss_mlp": 1.06418157, + "epoch": 0.2747210465563678, + "flos": 472617524736.0, + "grad_norm": 0.06551541099381472, + "language_loss": 0.83328068, + "learning_rate": 0.0008510109585675001, + "loss": 0.84424412, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.3215332, + "step": 1428, + "time_per_iteration": 2.623915672302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125702, + "balance_loss_mlp": 1.10653293, + "epoch": 0.2749134282416314, + "flos": 1314345070080.0, + "grad_norm": 0.06717437310459566, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82279044, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.19140625, + "step": 1429, + "time_per_iteration": 4.737167596817017 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096832, + "balance_loss_mlp": 1.06517005, + "epoch": 0.275105809926895, + "flos": 970445670912.0, + "grad_norm": 0.06718416370196487, + "language_loss": 0.80457842, + "learning_rate": 0.0008505669530941415, + "loss": 0.81554675, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.31640625, + "step": 1430, + "time_per_iteration": 3.380617141723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103498, + "balance_loss_mlp": 1.07169294, + "epoch": 0.2752981916121585, + "flos": 527089185792.0, + "grad_norm": 0.06498994038544256, + "language_loss": 0.83560073, + "learning_rate": 0.000850344746161112, + "loss": 0.8466357, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.31787109, + "step": 1431, + "time_per_iteration": 2.5917775630950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110093, + "balance_loss_mlp": 1.06883883, + "epoch": 0.2754905732974221, + "flos": 453487444992.0, + "grad_norm": 0.06649249705457211, + "language_loss": 0.87664711, + "learning_rate": 0.0008501224032121894, + "loss": 0.88765645, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.32080078, + "step": 1432, + "time_per_iteration": 2.493826150894165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101828, + "balance_loss_mlp": 1.06906962, + "epoch": 0.27568295498268564, + "flos": 497216918016.0, + "grad_norm": 0.06530156063230687, + "language_loss": 0.8172394, + "learning_rate": 0.0008498999243336946, + "loss": 0.82825768, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.32763672, + "step": 1433, + "time_per_iteration": 2.625955581665039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104254, + "balance_loss_mlp": 1.07275844, + "epoch": 0.2758753366679492, + "flos": 607890161664.0, + "grad_norm": 0.056445052388478564, + "language_loss": 0.87110436, + "learning_rate": 0.0008496773096120021, + "loss": 0.88214689, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.31469727, + "step": 1434, + "time_per_iteration": 2.8644402027130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093048, + "balance_loss_mlp": 1.06169593, + "epoch": 0.27606771835321275, + "flos": 739803290112.0, + "grad_norm": 0.07767765628739494, + "language_loss": 0.84306771, + "learning_rate": 0.0008494545591335381, + "loss": 0.85399818, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.31323242, + "step": 1435, + "time_per_iteration": 2.9069130420684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094657, + "balance_loss_mlp": 1.06366265, + "epoch": 0.27626010003847634, + "flos": 554279860224.0, + "grad_norm": 0.04344696113506711, + "language_loss": 0.86938953, + "learning_rate": 0.0008492316729847823, + "loss": 0.88033605, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.30957031, + "step": 1436, + "time_per_iteration": 2.844926595687866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091812, + "balance_loss_mlp": 1.06050754, + "epoch": 0.2764524817237399, + "flos": 542270439936.0, + "grad_norm": 0.055139322891005815, + "language_loss": 0.79749823, + "learning_rate": 0.0008490086512522664, + "loss": 0.80841637, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.31274414, + "step": 1437, + "time_per_iteration": 2.722158670425415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092682, + "balance_loss_mlp": 1.06121063, + "epoch": 0.27664486340900346, + "flos": 406027344384.0, + "grad_norm": 0.06334111858493886, + "language_loss": 0.90728873, + "learning_rate": 0.0008487854940225755, + "loss": 0.91821557, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.31445312, + "step": 1438, + "time_per_iteration": 2.43622088432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091549, + "balance_loss_mlp": 1.05991077, + "epoch": 0.27683724509426705, + "flos": 521884712448.0, + "grad_norm": 0.05907133214000555, + "language_loss": 0.89962572, + "learning_rate": 0.0008485622013823466, + "loss": 0.91054124, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.31616211, + "step": 1439, + "time_per_iteration": 2.61773943901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093806, + "balance_loss_mlp": 1.06154847, + "epoch": 0.2770296267795306, + "flos": 535085761536.0, + "grad_norm": 0.06492331678063241, + "language_loss": 0.82635379, + "learning_rate": 0.00084833877341827, + "loss": 0.83729184, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.32250977, + "step": 1440, + "time_per_iteration": 2.625870704650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092721, + "balance_loss_mlp": 1.06139278, + "epoch": 0.27722200846479417, + "flos": 487747906560.0, + "grad_norm": 0.06674971698169922, + "language_loss": 0.80478823, + "learning_rate": 0.000848115210217088, + "loss": 0.81571543, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.31298828, + "step": 1441, + "time_per_iteration": 2.577364444732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086558, + "balance_loss_mlp": 1.05410933, + "epoch": 0.2774143901500577, + "flos": 618012509184.0, + "grad_norm": 0.055312199129178424, + "language_loss": 0.81684244, + "learning_rate": 0.0008478915118655952, + "loss": 0.82770801, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.32446289, + "step": 1442, + "time_per_iteration": 2.714303493499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089692, + "balance_loss_mlp": 1.05710077, + "epoch": 0.2776067718353213, + "flos": 513563659776.0, + "grad_norm": 0.049794988647852687, + "language_loss": 0.86386287, + "learning_rate": 0.0008476676784506393, + "loss": 0.87475979, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.32592773, + "step": 1443, + "time_per_iteration": 2.68835711479187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087996, + "balance_loss_mlp": 1.05664372, + "epoch": 0.2777991535205848, + "flos": 1003985957376.0, + "grad_norm": 0.05900532389488003, + "language_loss": 0.82031631, + "learning_rate": 0.0008474437100591201, + "loss": 0.83119631, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.31323242, + "step": 1444, + "time_per_iteration": 3.3359997272491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084677, + "balance_loss_mlp": 1.05160809, + "epoch": 0.2779915352058484, + "flos": 550005147648.0, + "grad_norm": 0.054436577911169556, + "language_loss": 0.85231566, + "learning_rate": 0.0008472196067779898, + "loss": 0.86316246, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.33081055, + "step": 1445, + "time_per_iteration": 2.7946455478668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080884, + "balance_loss_mlp": 1.04850721, + "epoch": 0.278183916891112, + "flos": 873444930048.0, + "grad_norm": 0.08667298623079295, + "language_loss": 0.85239732, + "learning_rate": 0.0008469953686942531, + "loss": 0.86320615, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.32373047, + "step": 1446, + "time_per_iteration": 3.0761613845825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081651, + "balance_loss_mlp": 1.04927349, + "epoch": 0.2783762985763755, + "flos": 623782978560.0, + "grad_norm": 0.07591437330096602, + "language_loss": 0.8283245, + "learning_rate": 0.0008467709958949668, + "loss": 0.83914101, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.32373047, + "step": 1447, + "time_per_iteration": 2.7922093868255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082166, + "balance_loss_mlp": 1.0504328, + "epoch": 0.2785686802616391, + "flos": 581571021312.0, + "grad_norm": 0.0636917665663464, + "language_loss": 0.86192262, + "learning_rate": 0.0008465464884672403, + "loss": 0.8727442, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.31713867, + "step": 1448, + "time_per_iteration": 2.679574966430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108413, + "balance_loss_mlp": 1.05211091, + "epoch": 0.27876106194690264, + "flos": 587032980480.0, + "grad_norm": 0.06494062959974968, + "language_loss": 0.85664314, + "learning_rate": 0.0008463218464982348, + "loss": 0.86748445, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.32006836, + "step": 1449, + "time_per_iteration": 2.8746044635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086598, + "balance_loss_mlp": 1.05524611, + "epoch": 0.27895344363216623, + "flos": 875621574144.0, + "grad_norm": 0.05859002353759583, + "language_loss": 0.87554371, + "learning_rate": 0.0008460970700751645, + "loss": 0.88640976, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.31323242, + "step": 1450, + "time_per_iteration": 3.0630292892456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085303, + "balance_loss_mlp": 1.05447531, + "epoch": 0.27914582531742976, + "flos": 603630005760.0, + "grad_norm": 0.06644970008868617, + "language_loss": 0.8732717, + "learning_rate": 0.000845872159285295, + "loss": 0.8841247, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.30786133, + "step": 1451, + "time_per_iteration": 2.7334539890289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149095, + "balance_loss_mlp": 1.13173842, + "epoch": 0.27933820700269335, + "flos": 1496892953088.0, + "grad_norm": 0.04059568749878616, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78915942, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.17382812, + "step": 1452, + "time_per_iteration": 4.913143634796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087672, + "balance_loss_mlp": 1.05617714, + "epoch": 0.2795305886879569, + "flos": 1031445854208.0, + "grad_norm": 0.05755695164820471, + "language_loss": 0.86085773, + "learning_rate": 0.0008454219349544836, + "loss": 0.87173438, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.31469727, + "step": 1453, + "time_per_iteration": 3.3649299144744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086718, + "balance_loss_mlp": 1.05569983, + "epoch": 0.27972297037322047, + "flos": 606766934016.0, + "grad_norm": 0.059728326526783365, + "language_loss": 0.8137995, + "learning_rate": 0.000845196621588334, + "loss": 0.82466674, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.30981445, + "step": 1454, + "time_per_iteration": 2.7774734497070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082042, + "balance_loss_mlp": 1.05095196, + "epoch": 0.27991535205848406, + "flos": 630085948416.0, + "grad_norm": 0.0559695634724148, + "language_loss": 0.76184201, + "learning_rate": 0.0008449711742049706, + "loss": 0.77266252, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.31054688, + "step": 1455, + "time_per_iteration": 2.75393009185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107969, + "balance_loss_mlp": 1.04814696, + "epoch": 0.2801077337437476, + "flos": 549034689024.0, + "grad_norm": 0.06397369460964857, + "language_loss": 0.83309555, + "learning_rate": 0.0008447455928919196, + "loss": 0.84389246, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.31518555, + "step": 1456, + "time_per_iteration": 2.6542584896087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082481, + "balance_loss_mlp": 1.05177259, + "epoch": 0.2803001154290112, + "flos": 486516989952.0, + "grad_norm": 0.06274060179370718, + "language_loss": 0.86886203, + "learning_rate": 0.0008445198777367595, + "loss": 0.87968683, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.30664062, + "step": 1457, + "time_per_iteration": 2.6488282680511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089589, + "balance_loss_mlp": 1.05883336, + "epoch": 0.2804924971142747, + "flos": 521820693504.0, + "grad_norm": 0.06557026121847803, + "language_loss": 0.8106361, + "learning_rate": 0.0008442940288271208, + "loss": 0.82153201, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.30712891, + "step": 1458, + "time_per_iteration": 2.67258882522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096326, + "balance_loss_mlp": 1.06454456, + "epoch": 0.2806848787995383, + "flos": 527410690560.0, + "grad_norm": 0.07361561415976156, + "language_loss": 0.86939961, + "learning_rate": 0.0008440680462506856, + "loss": 0.88036287, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.31762695, + "step": 1459, + "time_per_iteration": 2.7335550785064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105041, + "balance_loss_mlp": 1.07354569, + "epoch": 0.2808772604848018, + "flos": 485246785536.0, + "grad_norm": 0.05419081251366802, + "language_loss": 0.86197531, + "learning_rate": 0.0008438419300951883, + "loss": 0.87302566, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.31469727, + "step": 1460, + "time_per_iteration": 2.6306796073913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106375, + "balance_loss_mlp": 1.07459426, + "epoch": 0.2810696421700654, + "flos": 617840801280.0, + "grad_norm": 0.08520166677325354, + "language_loss": 0.8634038, + "learning_rate": 0.0008436156804484148, + "loss": 0.87446761, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.31762695, + "step": 1461, + "time_per_iteration": 2.761599063873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110218, + "balance_loss_mlp": 1.0698266, + "epoch": 0.28126202385532895, + "flos": 454521922560.0, + "grad_norm": 0.06649626079325978, + "language_loss": 0.88025403, + "learning_rate": 0.0008433892973982031, + "loss": 0.89127588, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.32348633, + "step": 1462, + "time_per_iteration": 2.572810173034668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110576, + "balance_loss_mlp": 1.07333505, + "epoch": 0.28145440554059253, + "flos": 530447284224.0, + "grad_norm": 0.06397092621415032, + "language_loss": 0.85030043, + "learning_rate": 0.0008431627810324431, + "loss": 0.86135799, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.32421875, + "step": 1463, + "time_per_iteration": 2.6855740547180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109382, + "balance_loss_mlp": 1.0774579, + "epoch": 0.2816467872258561, + "flos": 451996070400.0, + "grad_norm": 0.06457367310459801, + "language_loss": 0.81006026, + "learning_rate": 0.000842936131439076, + "loss": 0.82115412, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.3190918, + "step": 1464, + "time_per_iteration": 2.5868756771087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102286, + "balance_loss_mlp": 1.07188725, + "epoch": 0.28183916891111965, + "flos": 472464755712.0, + "grad_norm": 0.06483114531916107, + "language_loss": 0.87564301, + "learning_rate": 0.0008427093487060951, + "loss": 0.88666582, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.3034668, + "step": 1465, + "time_per_iteration": 2.6775078773498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104224, + "balance_loss_mlp": 1.07294393, + "epoch": 0.28203155059638324, + "flos": 556770806784.0, + "grad_norm": 0.05163652452488039, + "language_loss": 0.84608126, + "learning_rate": 0.000842482432921545, + "loss": 0.85712349, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.3125, + "step": 1466, + "time_per_iteration": 2.844379186630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090816, + "balance_loss_mlp": 1.05955911, + "epoch": 0.28222393228164677, + "flos": 416756385792.0, + "grad_norm": 0.05726454257462379, + "language_loss": 0.86823475, + "learning_rate": 0.0008422553841735225, + "loss": 0.87914288, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.31225586, + "step": 1467, + "time_per_iteration": 2.4838902950286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087334, + "balance_loss_mlp": 1.05624461, + "epoch": 0.28241631396691036, + "flos": 604629577728.0, + "grad_norm": 0.07863392491108157, + "language_loss": 0.8442952, + "learning_rate": 0.0008420282025501757, + "loss": 0.85516858, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.31054688, + "step": 1468, + "time_per_iteration": 2.7528913021087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108248, + "balance_loss_mlp": 1.05169988, + "epoch": 0.2826086956521739, + "flos": 572698529280.0, + "grad_norm": 0.056003117579575636, + "language_loss": 0.852718, + "learning_rate": 0.0008418008881397043, + "loss": 0.86354285, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.30737305, + "step": 1469, + "time_per_iteration": 2.6801319122314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078886, + "balance_loss_mlp": 1.0479157, + "epoch": 0.2828010773374375, + "flos": 842367886848.0, + "grad_norm": 0.04937894089719141, + "language_loss": 0.82587177, + "learning_rate": 0.0008415734410303595, + "loss": 0.83666062, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.30932617, + "step": 1470, + "time_per_iteration": 3.1880481243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076388, + "balance_loss_mlp": 1.04551327, + "epoch": 0.28299345902270107, + "flos": 542402860032.0, + "grad_norm": 0.053571151454841835, + "language_loss": 0.90790403, + "learning_rate": 0.0008413458613104444, + "loss": 0.91866791, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.30834961, + "step": 1471, + "time_per_iteration": 2.6801347732543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107956, + "balance_loss_mlp": 1.04832768, + "epoch": 0.2831858407079646, + "flos": 571320635904.0, + "grad_norm": 0.054274543729309115, + "language_loss": 0.82964969, + "learning_rate": 0.0008411181490683129, + "loss": 0.84044528, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.31201172, + "step": 1472, + "time_per_iteration": 2.732304096221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107702, + "balance_loss_mlp": 1.04619205, + "epoch": 0.2833782223932282, + "flos": 763491861504.0, + "grad_norm": 0.05901735675502878, + "language_loss": 0.82318664, + "learning_rate": 0.0008408903043923707, + "loss": 0.83395684, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.30786133, + "step": 1473, + "time_per_iteration": 3.0503528118133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079988, + "balance_loss_mlp": 1.04906487, + "epoch": 0.2835706040784917, + "flos": 538793068032.0, + "grad_norm": 0.06313039437285956, + "language_loss": 0.81015414, + "learning_rate": 0.0008406623273710754, + "loss": 0.82095402, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.30883789, + "step": 1474, + "time_per_iteration": 2.606189727783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081508, + "balance_loss_mlp": 1.05008459, + "epoch": 0.2837629857637553, + "flos": 530329420800.0, + "grad_norm": 0.06295911479055617, + "language_loss": 0.82597101, + "learning_rate": 0.0008404342180929351, + "loss": 0.83678609, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.31396484, + "step": 1475, + "time_per_iteration": 2.620607614517212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073574, + "balance_loss_mlp": 1.04222226, + "epoch": 0.28395536744901884, + "flos": 539763526656.0, + "grad_norm": 0.06425181584365489, + "language_loss": 0.81938702, + "learning_rate": 0.00084020597664651, + "loss": 0.83012277, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.31323242, + "step": 1476, + "time_per_iteration": 2.7725043296813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083628, + "balance_loss_mlp": 1.05232406, + "epoch": 0.2841477491342824, + "flos": 573344510976.0, + "grad_norm": 0.06074887859321084, + "language_loss": 0.83907133, + "learning_rate": 0.0008399776031204111, + "loss": 0.84990764, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.31274414, + "step": 1477, + "time_per_iteration": 2.7300467491149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092258, + "balance_loss_mlp": 1.06081057, + "epoch": 0.28434013081954596, + "flos": 571802264064.0, + "grad_norm": 0.05838491012274946, + "language_loss": 0.80185568, + "learning_rate": 0.0008397490976033009, + "loss": 0.81277823, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.31420898, + "step": 1478, + "time_per_iteration": 2.650667905807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080543, + "balance_loss_mlp": 1.062042, + "epoch": 0.28453251250480954, + "flos": 1552554832896.0, + "grad_norm": 0.03640521186287318, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78960192, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.18457031, + "step": 1479, + "time_per_iteration": 4.764774322509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108592, + "balance_loss_mlp": 1.07654858, + "epoch": 0.28472489419007313, + "flos": 748720862208.0, + "grad_norm": 0.05702144306517339, + "language_loss": 0.85150903, + "learning_rate": 0.0008392916909509525, + "loss": 0.86259496, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.3203125, + "step": 1480, + "time_per_iteration": 3.0437960624694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104226, + "balance_loss_mlp": 1.07289815, + "epoch": 0.28491727587533666, + "flos": 489914376192.0, + "grad_norm": 0.06780557774925215, + "language_loss": 0.84802043, + "learning_rate": 0.0008390627899932954, + "loss": 0.85906273, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.31298828, + "step": 1481, + "time_per_iteration": 2.596781015396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100766, + "balance_loss_mlp": 1.0693903, + "epoch": 0.28510965756060025, + "flos": 728671196160.0, + "grad_norm": 0.07875184362779108, + "language_loss": 0.88996881, + "learning_rate": 0.000838833757399789, + "loss": 0.90097642, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.31347656, + "step": 1482, + "time_per_iteration": 2.94795560836792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084024, + "balance_loss_mlp": 1.05274367, + "epoch": 0.2853020392458638, + "flos": 551300083200.0, + "grad_norm": 0.07597770471398792, + "language_loss": 0.80484587, + "learning_rate": 0.0008386045932593515, + "loss": 0.81568611, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.3125, + "step": 1483, + "time_per_iteration": 2.6795289516448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079675, + "balance_loss_mlp": 1.0484184, + "epoch": 0.28549442093112737, + "flos": 754456425984.0, + "grad_norm": 0.05859914190414705, + "language_loss": 0.86136287, + "learning_rate": 0.0008383752976609525, + "loss": 0.8721596, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.31225586, + "step": 1484, + "time_per_iteration": 2.900468349456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080474, + "balance_loss_mlp": 1.04878783, + "epoch": 0.2856868026163909, + "flos": 538311439872.0, + "grad_norm": 0.0559282187978278, + "language_loss": 0.80215633, + "learning_rate": 0.0008381458706936123, + "loss": 0.81296104, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.31665039, + "step": 1485, + "time_per_iteration": 2.6815216541290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081909, + "balance_loss_mlp": 1.05031872, + "epoch": 0.2858791843016545, + "flos": 583487207424.0, + "grad_norm": 0.06658109550051822, + "language_loss": 0.87213105, + "learning_rate": 0.0008379163124464025, + "loss": 0.88295019, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.31567383, + "step": 1486, + "time_per_iteration": 2.7246947288513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098145, + "balance_loss_mlp": 1.06572032, + "epoch": 0.286071565986918, + "flos": 644503357440.0, + "grad_norm": 0.06266105362217729, + "language_loss": 0.76595891, + "learning_rate": 0.0008376866230084452, + "loss": 0.77694035, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.32421875, + "step": 1487, + "time_per_iteration": 2.8626444339752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102537, + "balance_loss_mlp": 1.07006407, + "epoch": 0.2862639476721816, + "flos": 491120561664.0, + "grad_norm": 0.07368717199594518, + "language_loss": 0.86109662, + "learning_rate": 0.000837456802468914, + "loss": 0.87212193, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.32470703, + "step": 1488, + "time_per_iteration": 2.5964457988739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109506, + "balance_loss_mlp": 1.07736683, + "epoch": 0.2864563293574452, + "flos": 521363796480.0, + "grad_norm": 0.0834333673185767, + "language_loss": 0.85148358, + "learning_rate": 0.0008372268509170331, + "loss": 0.86257863, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.32128906, + "step": 1489, + "time_per_iteration": 2.690129518508911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109667, + "balance_loss_mlp": 1.06500769, + "epoch": 0.2866487110427087, + "flos": 546834723840.0, + "grad_norm": 0.06354137393554884, + "language_loss": 0.84668255, + "learning_rate": 0.0008369967684420779, + "loss": 0.85764927, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.31640625, + "step": 1490, + "time_per_iteration": 2.71195912361145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084028, + "balance_loss_mlp": 1.0523901, + "epoch": 0.2868410927279723, + "flos": 481977437184.0, + "grad_norm": 0.054809792311278624, + "language_loss": 0.84395373, + "learning_rate": 0.0008367665551333736, + "loss": 0.85479403, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.31616211, + "step": 1491, + "time_per_iteration": 2.604795217514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083682, + "balance_loss_mlp": 1.05223465, + "epoch": 0.28703347441323585, + "flos": 724578365952.0, + "grad_norm": 0.06594588712207736, + "language_loss": 0.85254663, + "learning_rate": 0.0008365362110802977, + "loss": 0.86338341, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.31420898, + "step": 1492, + "time_per_iteration": 2.8853299617767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086881, + "balance_loss_mlp": 1.05619645, + "epoch": 0.28722585609849943, + "flos": 634670581248.0, + "grad_norm": 0.057648204576232445, + "language_loss": 0.82509673, + "learning_rate": 0.0008363057363722773, + "loss": 0.83596557, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.30664062, + "step": 1493, + "time_per_iteration": 2.8410117626190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088416, + "balance_loss_mlp": 1.05916238, + "epoch": 0.28741823778376296, + "flos": 509974216704.0, + "grad_norm": 0.06315135639172008, + "language_loss": 0.8381595, + "learning_rate": 0.0008360751310987906, + "loss": 0.84904373, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.29199219, + "step": 1494, + "time_per_iteration": 2.6032519340515137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088448, + "balance_loss_mlp": 1.05821633, + "epoch": 0.28761061946902655, + "flos": 603458297856.0, + "grad_norm": 0.0504042487563093, + "language_loss": 0.85491359, + "learning_rate": 0.0008358443953493666, + "loss": 0.865798, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.30175781, + "step": 1495, + "time_per_iteration": 2.859473943710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095118, + "balance_loss_mlp": 1.06586444, + "epoch": 0.28780300115429014, + "flos": 406977454080.0, + "grad_norm": 0.05765908021852543, + "language_loss": 0.87930727, + "learning_rate": 0.0008356135292135851, + "loss": 0.89025843, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.29223633, + "step": 1496, + "time_per_iteration": 2.5534088611602783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092831, + "balance_loss_mlp": 1.06357718, + "epoch": 0.28799538283955367, + "flos": 374726310912.0, + "grad_norm": 0.06886872222290924, + "language_loss": 0.91869086, + "learning_rate": 0.0008353825327810758, + "loss": 0.92961913, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.29223633, + "step": 1497, + "time_per_iteration": 2.4516804218292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109953, + "balance_loss_mlp": 1.0700376, + "epoch": 0.28818776452481726, + "flos": 591645316608.0, + "grad_norm": 0.06787386534843613, + "language_loss": 0.81638563, + "learning_rate": 0.00083515140614152, + "loss": 0.8273809, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.29467773, + "step": 1498, + "time_per_iteration": 2.6799356937408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100433, + "balance_loss_mlp": 1.07136989, + "epoch": 0.2883801462100808, + "flos": 534819511296.0, + "grad_norm": 0.07094138317708479, + "language_loss": 0.861467, + "learning_rate": 0.0008349201493846485, + "loss": 0.87247133, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.2902832, + "step": 1499, + "time_per_iteration": 2.6408841609954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101106, + "balance_loss_mlp": 1.07190013, + "epoch": 0.2885725278953444, + "flos": 479850255360.0, + "grad_norm": 0.05864167405563355, + "language_loss": 0.88756049, + "learning_rate": 0.0008346887626002432, + "loss": 0.89857149, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.29174805, + "step": 1500, + "time_per_iteration": 2.527707099914551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102169, + "balance_loss_mlp": 1.07277215, + "epoch": 0.2887649095806079, + "flos": 463798877184.0, + "grad_norm": 0.05528939811548228, + "language_loss": 0.8596012, + "learning_rate": 0.000834457245878137, + "loss": 0.87062287, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.29345703, + "step": 1501, + "time_per_iteration": 2.6287105083465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097625, + "balance_loss_mlp": 1.0678941, + "epoch": 0.2889572912658715, + "flos": 930631527936.0, + "grad_norm": 0.05829487367290223, + "language_loss": 0.81370407, + "learning_rate": 0.000834225599308212, + "loss": 0.82468033, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.296875, + "step": 1502, + "time_per_iteration": 3.2405459880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097665, + "balance_loss_mlp": 1.06762409, + "epoch": 0.28914967295113503, + "flos": 569848200192.0, + "grad_norm": 0.0632270740356206, + "language_loss": 0.85299563, + "learning_rate": 0.0008339938229804016, + "loss": 0.86397231, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.30029297, + "step": 1503, + "time_per_iteration": 2.736917495727539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238462, + "balance_loss_mlp": 1.22091448, + "epoch": 0.2893420546363986, + "flos": 1485803119104.0, + "grad_norm": 0.0713987899259734, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76673281, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.17578125, + "step": 1504, + "time_per_iteration": 4.942230701446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085898, + "balance_loss_mlp": 1.0553329, + "epoch": 0.2895344363216622, + "flos": 469938903552.0, + "grad_norm": 0.06317842242163065, + "language_loss": 0.83872586, + "learning_rate": 0.0008335298814111094, + "loss": 0.84958482, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.30517578, + "step": 1505, + "time_per_iteration": 2.552032232284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082854, + "balance_loss_mlp": 1.05138254, + "epoch": 0.28972681800692573, + "flos": 647909508096.0, + "grad_norm": 0.05888591645587949, + "language_loss": 0.87955916, + "learning_rate": 0.0008332977163497455, + "loss": 0.89038765, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.31445312, + "step": 1506, + "time_per_iteration": 2.792531728744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080802, + "balance_loss_mlp": 1.0491637, + "epoch": 0.2899191996921893, + "flos": 571955033088.0, + "grad_norm": 0.058262801056698586, + "language_loss": 0.83412617, + "learning_rate": 0.0008330654218907325, + "loss": 0.84493423, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.31616211, + "step": 1507, + "time_per_iteration": 2.67161226272583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082791, + "balance_loss_mlp": 1.05151033, + "epoch": 0.29011158137745285, + "flos": 661037773824.0, + "grad_norm": 0.053562219876337476, + "language_loss": 0.8135345, + "learning_rate": 0.0008328329981242548, + "loss": 0.8243624, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.3125, + "step": 1508, + "time_per_iteration": 2.8886146545410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082272, + "balance_loss_mlp": 1.05006218, + "epoch": 0.29030396306271644, + "flos": 535933974528.0, + "grad_norm": 0.059525688681207785, + "language_loss": 0.87796283, + "learning_rate": 0.0008326004451405475, + "loss": 0.88878554, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.32202148, + "step": 1509, + "time_per_iteration": 2.7613890171051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081166, + "balance_loss_mlp": 1.04919386, + "epoch": 0.29049634474798, + "flos": 511707110400.0, + "grad_norm": 0.06566805569484924, + "language_loss": 0.82636976, + "learning_rate": 0.0008323677630298957, + "loss": 0.83718145, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.31958008, + "step": 1510, + "time_per_iteration": 2.5723018646240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082627, + "balance_loss_mlp": 1.0500108, + "epoch": 0.29068872643324356, + "flos": 613454017536.0, + "grad_norm": 0.0587639353811087, + "language_loss": 0.84588593, + "learning_rate": 0.0008321349518826345, + "loss": 0.85671222, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.32617188, + "step": 1511, + "time_per_iteration": 2.7943453788757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085904, + "balance_loss_mlp": 1.05417013, + "epoch": 0.2908811081185071, + "flos": 546164011008.0, + "grad_norm": 0.07149106056529789, + "language_loss": 0.94572604, + "learning_rate": 0.0008319020117891491, + "loss": 0.95658505, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.31713867, + "step": 1512, + "time_per_iteration": 2.6216046810150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083214, + "balance_loss_mlp": 1.05095613, + "epoch": 0.2910734898037707, + "flos": 604516096512.0, + "grad_norm": 0.062137158428294176, + "language_loss": 0.87139338, + "learning_rate": 0.0008316689428398751, + "loss": 0.88222551, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.32250977, + "step": 1513, + "time_per_iteration": 2.7016332149505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082979, + "balance_loss_mlp": 1.05217493, + "epoch": 0.29126587148903427, + "flos": 574383370752.0, + "grad_norm": 0.048438835392173675, + "language_loss": 0.88380623, + "learning_rate": 0.0008314357451252979, + "loss": 0.89463598, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.30761719, + "step": 1514, + "time_per_iteration": 2.7707033157348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108486, + "balance_loss_mlp": 1.05329311, + "epoch": 0.2914582531742978, + "flos": 570802692096.0, + "grad_norm": 0.17247024929444854, + "language_loss": 0.87881547, + "learning_rate": 0.0008312024187359527, + "loss": 0.88966405, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.31542969, + "step": 1515, + "time_per_iteration": 2.6432881355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071467, + "balance_loss_mlp": 1.04083025, + "epoch": 0.2916506348595614, + "flos": 730523363328.0, + "grad_norm": 0.05532389066983382, + "language_loss": 0.86925149, + "learning_rate": 0.000830968963762425, + "loss": 0.8799662, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.3059082, + "step": 1516, + "time_per_iteration": 3.024911403656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070719, + "balance_loss_mlp": 1.03955793, + "epoch": 0.2918430165448249, + "flos": 510220118016.0, + "grad_norm": 0.06371457252332635, + "language_loss": 0.83926201, + "learning_rate": 0.0008307353802953497, + "loss": 0.84996927, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.3112793, + "step": 1517, + "time_per_iteration": 2.6853716373443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072896, + "balance_loss_mlp": 1.04202044, + "epoch": 0.2920353982300885, + "flos": 630096122880.0, + "grad_norm": 0.04882989118503786, + "language_loss": 0.86122108, + "learning_rate": 0.0008305016684254125, + "loss": 0.87195003, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.30859375, + "step": 1518, + "time_per_iteration": 2.799062728881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077466, + "balance_loss_mlp": 1.04589891, + "epoch": 0.29222777991535204, + "flos": 501411644928.0, + "grad_norm": 0.06769299348115199, + "language_loss": 0.86794329, + "learning_rate": 0.0008302678282433479, + "loss": 0.87871796, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.31542969, + "step": 1519, + "time_per_iteration": 2.607813835144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079374, + "balance_loss_mlp": 1.0473547, + "epoch": 0.2924201616006156, + "flos": 486522782208.0, + "grad_norm": 0.06836141022194388, + "language_loss": 0.84857148, + "learning_rate": 0.0008300338598399411, + "loss": 0.85936522, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.32006836, + "step": 1520, + "time_per_iteration": 2.6339783668518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079776, + "balance_loss_mlp": 1.04677844, + "epoch": 0.2926125432858792, + "flos": 476211350016.0, + "grad_norm": 0.07756319993269217, + "language_loss": 0.94405806, + "learning_rate": 0.0008297997633060263, + "loss": 0.9548558, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.33007812, + "step": 1521, + "time_per_iteration": 2.534118175506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072717, + "balance_loss_mlp": 1.03991103, + "epoch": 0.29280492497114274, + "flos": 676379151360.0, + "grad_norm": 0.05829817081366362, + "language_loss": 0.85078239, + "learning_rate": 0.0008295655387324883, + "loss": 0.86150956, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.328125, + "step": 1522, + "time_per_iteration": 2.8296775817871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072427, + "balance_loss_mlp": 1.04031241, + "epoch": 0.29299730665640633, + "flos": 458175384576.0, + "grad_norm": 0.07682732219120929, + "language_loss": 0.8501184, + "learning_rate": 0.0008293311862102609, + "loss": 0.8608427, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.32104492, + "step": 1523, + "time_per_iteration": 2.5440309047698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077196, + "balance_loss_mlp": 1.044366, + "epoch": 0.29318968834166986, + "flos": 446343464448.0, + "grad_norm": 0.0685602534850527, + "language_loss": 0.88674849, + "learning_rate": 0.0008290967058303275, + "loss": 0.89752042, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.32836914, + "step": 1524, + "time_per_iteration": 2.47611403465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107333, + "balance_loss_mlp": 1.04138136, + "epoch": 0.29338207002693345, + "flos": 450085676544.0, + "grad_norm": 0.06274350285183052, + "language_loss": 0.86149156, + "learning_rate": 0.0008288620976837219, + "loss": 0.87222481, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.31933594, + "step": 1525, + "time_per_iteration": 2.497141122817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076595, + "balance_loss_mlp": 1.04409802, + "epoch": 0.293574451712197, + "flos": 502027103232.0, + "grad_norm": 0.056882926132582716, + "language_loss": 0.82547259, + "learning_rate": 0.000828627361861527, + "loss": 0.8362385, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.32495117, + "step": 1526, + "time_per_iteration": 2.567631959915161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074146, + "balance_loss_mlp": 1.04157782, + "epoch": 0.29376683339746057, + "flos": 696158184960.0, + "grad_norm": 0.06286177552115993, + "language_loss": 0.84273493, + "learning_rate": 0.0008283924984548752, + "loss": 0.85347635, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.32568359, + "step": 1527, + "time_per_iteration": 2.8300318717956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075655, + "balance_loss_mlp": 1.04270601, + "epoch": 0.2939592150827241, + "flos": 478353088512.0, + "grad_norm": 0.05246647038375997, + "language_loss": 0.84726572, + "learning_rate": 0.0008281575075549485, + "loss": 0.85802233, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.32958984, + "step": 1528, + "time_per_iteration": 2.574363946914673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144512, + "balance_loss_mlp": 1.12400758, + "epoch": 0.2941515967679877, + "flos": 1484482042368.0, + "grad_norm": 0.05743835109314035, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78497207, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.20507812, + "step": 1529, + "time_per_iteration": 4.712693452835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085379, + "balance_loss_mlp": 1.05316901, + "epoch": 0.2943439784532513, + "flos": 673848916992.0, + "grad_norm": 0.06778682509264199, + "language_loss": 0.90275097, + "learning_rate": 0.0008276871436402469, + "loss": 0.9136048, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.32202148, + "step": 1530, + "time_per_iteration": 2.839327812194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098938, + "balance_loss_mlp": 1.06801534, + "epoch": 0.2945363601385148, + "flos": 576031896576.0, + "grad_norm": 0.05712547612295055, + "language_loss": 0.87684029, + "learning_rate": 0.000827451770808083, + "loss": 0.88782966, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.30908203, + "step": 1531, + "time_per_iteration": 2.6601221561431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101215, + "balance_loss_mlp": 1.06921971, + "epoch": 0.2947287418237784, + "flos": 480416251392.0, + "grad_norm": 0.06660356736231628, + "language_loss": 0.82939392, + "learning_rate": 0.0008272162708478674, + "loss": 0.84040606, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.31982422, + "step": 1532, + "time_per_iteration": 2.5689916610717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093792, + "balance_loss_mlp": 1.06234503, + "epoch": 0.2949211235090419, + "flos": 557917355520.0, + "grad_norm": 0.09954158315547566, + "language_loss": 0.86026615, + "learning_rate": 0.000826980643851029, + "loss": 0.87120402, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.31420898, + "step": 1533, + "time_per_iteration": 2.668490409851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096886, + "balance_loss_mlp": 1.06560588, + "epoch": 0.2951135051943055, + "flos": 483646311936.0, + "grad_norm": 0.06068587162994625, + "language_loss": 0.84473491, + "learning_rate": 0.0008267448899090464, + "loss": 0.85570371, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.3125, + "step": 1534, + "time_per_iteration": 2.5667166709899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111174, + "balance_loss_mlp": 1.08053756, + "epoch": 0.29530588687956905, + "flos": 550015322112.0, + "grad_norm": 0.07629507960375684, + "language_loss": 0.80660546, + "learning_rate": 0.0008265090091134473, + "loss": 0.81771713, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.3059082, + "step": 1535, + "time_per_iteration": 2.8708250522613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108767, + "balance_loss_mlp": 1.07793915, + "epoch": 0.29549826856483263, + "flos": 672731481600.0, + "grad_norm": 0.06117244877185189, + "language_loss": 0.80140841, + "learning_rate": 0.0008262730015558088, + "loss": 0.81249607, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.30786133, + "step": 1536, + "time_per_iteration": 2.872954845428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100004, + "balance_loss_mlp": 1.06960511, + "epoch": 0.29569065025009617, + "flos": 764300786688.0, + "grad_norm": 0.058742702923310866, + "language_loss": 0.82196116, + "learning_rate": 0.0008260368673277574, + "loss": 0.8329612, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.3034668, + "step": 1537, + "time_per_iteration": 3.1321218013763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099555, + "balance_loss_mlp": 1.06963336, + "epoch": 0.29588303193535975, + "flos": 543398049792.0, + "grad_norm": 0.0781542924594719, + "language_loss": 0.83699298, + "learning_rate": 0.0008258006065209682, + "loss": 0.84798855, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.29882812, + "step": 1538, + "time_per_iteration": 2.7713711261749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108634, + "balance_loss_mlp": 1.0791415, + "epoch": 0.29607541362062334, + "flos": 596648968704.0, + "grad_norm": 0.060396297474130736, + "language_loss": 0.80198979, + "learning_rate": 0.0008255642192271657, + "loss": 0.81307614, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.29443359, + "step": 1539, + "time_per_iteration": 2.770426034927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104575, + "balance_loss_mlp": 1.07525003, + "epoch": 0.29626779530588687, + "flos": 609588149760.0, + "grad_norm": 0.061957869610313854, + "language_loss": 0.8370012, + "learning_rate": 0.0008253277055381241, + "loss": 0.8480469, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.29296875, + "step": 1540, + "time_per_iteration": 2.818236827850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101049, + "balance_loss_mlp": 1.07196212, + "epoch": 0.29646017699115046, + "flos": 867050237952.0, + "grad_norm": 0.0808235318545815, + "language_loss": 0.85973728, + "learning_rate": 0.0008250910655456658, + "loss": 0.8707478, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.29052734, + "step": 1541, + "time_per_iteration": 3.122596502304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097236, + "balance_loss_mlp": 1.06888783, + "epoch": 0.296652558676414, + "flos": 495616444416.0, + "grad_norm": 0.06915250684599016, + "language_loss": 0.83763367, + "learning_rate": 0.0008248542993416625, + "loss": 0.84860599, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.28369141, + "step": 1542, + "time_per_iteration": 2.5910961627960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093651, + "balance_loss_mlp": 1.06408739, + "epoch": 0.2968449403616776, + "flos": 571275555840.0, + "grad_norm": 0.05605218699384054, + "language_loss": 0.8378318, + "learning_rate": 0.0008246174070180352, + "loss": 0.84876835, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.29516602, + "step": 1543, + "time_per_iteration": 2.6633899211883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010919, + "balance_loss_mlp": 1.06312323, + "epoch": 0.2970373220469411, + "flos": 793799115264.0, + "grad_norm": 0.07006000939384768, + "language_loss": 0.83787405, + "learning_rate": 0.0008243803886667537, + "loss": 0.84879309, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.28759766, + "step": 1544, + "time_per_iteration": 3.114450216293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092222, + "balance_loss_mlp": 1.0623486, + "epoch": 0.2972297037322047, + "flos": 660736617984.0, + "grad_norm": 0.06063612617340172, + "language_loss": 0.78866625, + "learning_rate": 0.0008241432443798364, + "loss": 0.79958844, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.2980957, + "step": 1545, + "time_per_iteration": 2.830487012863159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095453, + "balance_loss_mlp": 1.06491208, + "epoch": 0.29742208541746823, + "flos": 596849789952.0, + "grad_norm": 0.05072672460675934, + "language_loss": 0.85210156, + "learning_rate": 0.0008239059742493512, + "loss": 0.86305606, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.30493164, + "step": 1546, + "time_per_iteration": 2.7311577796936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096869, + "balance_loss_mlp": 1.06654167, + "epoch": 0.2976144671027318, + "flos": 769519816704.0, + "grad_norm": 0.06216195389248957, + "language_loss": 0.87149853, + "learning_rate": 0.0008236685783674142, + "loss": 0.88246721, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.30273438, + "step": 1547, + "time_per_iteration": 3.122184991836548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195158, + "balance_loss_mlp": 1.17408168, + "epoch": 0.2978068487879954, + "flos": 1483980065280.0, + "grad_norm": 0.0711099730375168, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77416348, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.2109375, + "step": 1548, + "time_per_iteration": 4.884527683258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112807, + "balance_loss_mlp": 1.08190823, + "epoch": 0.29799923047325894, + "flos": 475079357952.0, + "grad_norm": 0.0721948840315393, + "language_loss": 0.82155961, + "learning_rate": 0.0008231934097178955, + "loss": 0.83268768, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.30859375, + "step": 1549, + "time_per_iteration": 2.6486034393310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099898, + "balance_loss_mlp": 1.06845081, + "epoch": 0.2981916121585225, + "flos": 759464460288.0, + "grad_norm": 0.06744191732210313, + "language_loss": 0.85654205, + "learning_rate": 0.0008229556371347903, + "loss": 0.86754102, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.31420898, + "step": 1550, + "time_per_iteration": 2.973072052001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096299, + "balance_loss_mlp": 1.06530416, + "epoch": 0.29838399384378606, + "flos": 874642351104.0, + "grad_norm": 0.063776129703287, + "language_loss": 0.79039407, + "learning_rate": 0.0008227177391691874, + "loss": 0.80135703, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.30957031, + "step": 1551, + "time_per_iteration": 3.121493339538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091, + "balance_loss_mlp": 1.05948138, + "epoch": 0.29857637552904964, + "flos": 579389995008.0, + "grad_norm": 0.06994546641795159, + "language_loss": 0.89363164, + "learning_rate": 0.0008224797159134463, + "loss": 0.90454161, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.31494141, + "step": 1552, + "time_per_iteration": 2.714345932006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085005, + "balance_loss_mlp": 1.05272293, + "epoch": 0.2987687572143132, + "flos": 836048950272.0, + "grad_norm": 0.0687696840960861, + "language_loss": 0.83498526, + "learning_rate": 0.0008222415674599765, + "loss": 0.84583527, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.32275391, + "step": 1553, + "time_per_iteration": 3.0709471702575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108737, + "balance_loss_mlp": 1.05482578, + "epoch": 0.29896113889957676, + "flos": 566800022016.0, + "grad_norm": 0.05942841135237563, + "language_loss": 0.83069479, + "learning_rate": 0.0008220032939012349, + "loss": 0.84156853, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.32543945, + "step": 1554, + "time_per_iteration": 2.6579041481018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084574, + "balance_loss_mlp": 1.05069458, + "epoch": 0.29915352058484035, + "flos": 498370669056.0, + "grad_norm": 0.05066559322117623, + "language_loss": 0.87862611, + "learning_rate": 0.0008217648953297277, + "loss": 0.88947189, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.33886719, + "step": 1555, + "time_per_iteration": 2.854501962661743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080889, + "balance_loss_mlp": 1.04836845, + "epoch": 0.2993459022701039, + "flos": 591837373440.0, + "grad_norm": 0.06306800858294438, + "language_loss": 0.78177649, + "learning_rate": 0.0008215263718380095, + "loss": 0.79258537, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.32519531, + "step": 1556, + "time_per_iteration": 2.679813861846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072521, + "balance_loss_mlp": 1.03988135, + "epoch": 0.29953828395536747, + "flos": 572107802112.0, + "grad_norm": 0.05857921257987888, + "language_loss": 0.84453404, + "learning_rate": 0.0008212877235186833, + "loss": 0.8552593, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.32641602, + "step": 1557, + "time_per_iteration": 2.674333333969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074496, + "balance_loss_mlp": 1.0575211, + "epoch": 0.299730665640631, + "flos": 1503855051264.0, + "grad_norm": 0.03849586533955073, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78812063, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.16992188, + "step": 1558, + "time_per_iteration": 4.915595531463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073624, + "balance_loss_mlp": 1.04193807, + "epoch": 0.2999230473258946, + "flos": 513538928640.0, + "grad_norm": 0.06731849387550101, + "language_loss": 0.80882478, + "learning_rate": 0.0008208100527678611, + "loss": 0.81956106, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.31665039, + "step": 1559, + "time_per_iteration": 2.584726572036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107407, + "balance_loss_mlp": 1.04162097, + "epoch": 0.3001154290111581, + "flos": 834128381952.0, + "grad_norm": 0.07382200765663921, + "language_loss": 0.78279877, + "learning_rate": 0.0008205710305218135, + "loss": 0.79353946, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.32446289, + "step": 1560, + "time_per_iteration": 3.0383710861206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074163, + "balance_loss_mlp": 1.04302561, + "epoch": 0.3003078106964217, + "flos": 556485617664.0, + "grad_norm": 0.058207727477831525, + "language_loss": 0.89512408, + "learning_rate": 0.0008203318838190541, + "loss": 0.90586567, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.31103516, + "step": 1561, + "time_per_iteration": 2.76627516746521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077695, + "balance_loss_mlp": 1.04662895, + "epoch": 0.30050019238168524, + "flos": 525897556992.0, + "grad_norm": 0.06168132254821995, + "language_loss": 0.85111785, + "learning_rate": 0.0008200926127524281, + "loss": 0.86189479, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.31030273, + "step": 1562, + "time_per_iteration": 2.6629600524902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077501, + "balance_loss_mlp": 1.04641104, + "epoch": 0.3006925740669488, + "flos": 577582907904.0, + "grad_norm": 0.05613480590592382, + "language_loss": 0.82944739, + "learning_rate": 0.0008198532174148289, + "loss": 0.84022236, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.31054688, + "step": 1563, + "time_per_iteration": 2.7358763217926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059198, + "balance_loss_mlp": 1.042413, + "epoch": 0.3008849557522124, + "flos": 1489408528896.0, + "grad_norm": 0.031593282863211954, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81745368, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.16796875, + "step": 1564, + "time_per_iteration": 4.9148335456848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082495, + "balance_loss_mlp": 1.05264509, + "epoch": 0.30107733743747594, + "flos": 509565371904.0, + "grad_norm": 0.06408713771925002, + "language_loss": 0.88499033, + "learning_rate": 0.0008193740542985244, + "loss": 0.89581525, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.2980957, + "step": 1565, + "time_per_iteration": 2.6895992755889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010784, + "balance_loss_mlp": 1.04955089, + "epoch": 0.30126971912273953, + "flos": 587425858560.0, + "grad_norm": 0.05458149708053591, + "language_loss": 0.86310005, + "learning_rate": 0.0008191342867058467, + "loss": 0.87388408, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.28833008, + "step": 1566, + "time_per_iteration": 2.7972991466522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087074, + "balance_loss_mlp": 1.05708098, + "epoch": 0.30146210080800306, + "flos": 601822918656.0, + "grad_norm": 0.07332398387540356, + "language_loss": 0.8337127, + "learning_rate": 0.0008188943952142509, + "loss": 0.84458339, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.29931641, + "step": 1567, + "time_per_iteration": 2.7908260822296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090521, + "balance_loss_mlp": 1.06203008, + "epoch": 0.30165448249326665, + "flos": 917424686592.0, + "grad_norm": 0.06528974392408285, + "language_loss": 0.82496703, + "learning_rate": 0.0008186543799168711, + "loss": 0.83587217, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.28491211, + "step": 1568, + "time_per_iteration": 3.1478142738342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090151, + "balance_loss_mlp": 1.06170726, + "epoch": 0.3018468641785302, + "flos": 776953368576.0, + "grad_norm": 0.05489125757590388, + "language_loss": 0.87973905, + "learning_rate": 0.0008184142409068892, + "loss": 0.89064056, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.28466797, + "step": 1569, + "time_per_iteration": 3.0216779708862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085926, + "balance_loss_mlp": 1.05767381, + "epoch": 0.30203924586379377, + "flos": 522101500416.0, + "grad_norm": 0.055531787765466835, + "language_loss": 0.86334872, + "learning_rate": 0.000818173978277536, + "loss": 0.87420803, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.2824707, + "step": 1570, + "time_per_iteration": 2.679858922958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092107, + "balance_loss_mlp": 1.06378245, + "epoch": 0.3022316275490573, + "flos": 524288318976.0, + "grad_norm": 0.07890485552513911, + "language_loss": 0.83764422, + "learning_rate": 0.000817933592122089, + "loss": 0.84856522, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.28344727, + "step": 1571, + "time_per_iteration": 2.7156453132629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097909, + "balance_loss_mlp": 1.06936991, + "epoch": 0.3024240092343209, + "flos": 479672755200.0, + "grad_norm": 0.06172775968750255, + "language_loss": 0.83209121, + "learning_rate": 0.0008176930825338749, + "loss": 0.84307027, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.28564453, + "step": 1572, + "time_per_iteration": 2.6125760078430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092858, + "balance_loss_mlp": 1.06474876, + "epoch": 0.3026163909195845, + "flos": 686901579264.0, + "grad_norm": 0.07609523017386281, + "language_loss": 0.88406599, + "learning_rate": 0.0008174524496062679, + "loss": 0.8949945, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.28100586, + "step": 1573, + "time_per_iteration": 2.9266738891601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093192, + "balance_loss_mlp": 1.06472516, + "epoch": 0.302808772604848, + "flos": 542654553600.0, + "grad_norm": 0.061281594343297996, + "language_loss": 0.85176635, + "learning_rate": 0.0008172116934326894, + "loss": 0.86269826, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.28466797, + "step": 1574, + "time_per_iteration": 2.78182315826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093702, + "balance_loss_mlp": 1.06499696, + "epoch": 0.3030011542901116, + "flos": 474852395520.0, + "grad_norm": 0.061003462460527645, + "language_loss": 0.87581599, + "learning_rate": 0.0008169708141066097, + "loss": 0.88675308, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.28686523, + "step": 1575, + "time_per_iteration": 2.579521894454956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095615, + "balance_loss_mlp": 1.06631374, + "epoch": 0.30319353597537513, + "flos": 481233940992.0, + "grad_norm": 0.06494361929352876, + "language_loss": 0.90285015, + "learning_rate": 0.0008167298117215465, + "loss": 0.91380632, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.29272461, + "step": 1576, + "time_per_iteration": 2.576373815536499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109664, + "balance_loss_mlp": 1.06729078, + "epoch": 0.3033859176606387, + "flos": 704455916544.0, + "grad_norm": 0.06029453435911351, + "language_loss": 0.87511861, + "learning_rate": 0.0008164886863710649, + "loss": 0.88608503, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.29296875, + "step": 1577, + "time_per_iteration": 2.913679599761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097606, + "balance_loss_mlp": 1.06847095, + "epoch": 0.30357829934590225, + "flos": 764344456704.0, + "grad_norm": 0.06219192746352704, + "language_loss": 0.86087388, + "learning_rate": 0.0008162474381487783, + "loss": 0.87184995, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.29101562, + "step": 1578, + "time_per_iteration": 3.0120038986206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089575, + "balance_loss_mlp": 1.05979693, + "epoch": 0.30377068103116583, + "flos": 532082663424.0, + "grad_norm": 0.07133259007734825, + "language_loss": 0.84352636, + "learning_rate": 0.0008160060671483475, + "loss": 0.85442215, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.29711914, + "step": 1579, + "time_per_iteration": 2.6448450088500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087505, + "balance_loss_mlp": 1.05729711, + "epoch": 0.3039630627164294, + "flos": 509934928896.0, + "grad_norm": 0.06969729270721756, + "language_loss": 0.83291966, + "learning_rate": 0.0008157645734634809, + "loss": 0.8437947, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.30200195, + "step": 1580, + "time_per_iteration": 2.623994827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219684, + "balance_loss_mlp": 1.20118308, + "epoch": 0.30415544440169295, + "flos": 1505206803456.0, + "grad_norm": 0.06785469110901753, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78116179, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.18457031, + "step": 1581, + "time_per_iteration": 4.945984840393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134498, + "balance_loss_mlp": 1.11723626, + "epoch": 0.30434782608695654, + "flos": 1457976637440.0, + "grad_norm": 0.04727039603147748, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74348998, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.17285156, + "step": 1582, + "time_per_iteration": 4.907581567764282 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094198, + "balance_loss_mlp": 1.06482506, + "epoch": 0.3045402077722201, + "flos": 482312088576.0, + "grad_norm": 0.06103997784231323, + "language_loss": 0.83613545, + "learning_rate": 0.000815039357240067, + "loss": 0.84707743, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.29345703, + "step": 1583, + "time_per_iteration": 2.6569504737854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098856, + "balance_loss_mlp": 1.07053173, + "epoch": 0.30473258945748366, + "flos": 543220549632.0, + "grad_norm": 0.05926881191118497, + "language_loss": 0.85445809, + "learning_rate": 0.0008147973737554952, + "loss": 0.86544669, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.28344727, + "step": 1584, + "time_per_iteration": 2.8048319816589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105359, + "balance_loss_mlp": 1.07682085, + "epoch": 0.3049249711427472, + "flos": 566789847552.0, + "grad_norm": 0.06192456547731419, + "language_loss": 0.85451925, + "learning_rate": 0.000814555268055744, + "loss": 0.86557281, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.28540039, + "step": 1585, + "time_per_iteration": 2.6496644020080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111676, + "balance_loss_mlp": 1.08265996, + "epoch": 0.3051173528280108, + "flos": 527970894336.0, + "grad_norm": 0.06812003210241727, + "language_loss": 0.87046736, + "learning_rate": 0.0008143130402348073, + "loss": 0.88158417, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.28979492, + "step": 1586, + "time_per_iteration": 2.6643214225769043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105818, + "balance_loss_mlp": 1.07644498, + "epoch": 0.3053097345132743, + "flos": 586097427456.0, + "grad_norm": 0.055468457342214825, + "language_loss": 0.79345113, + "learning_rate": 0.0008140706903867265, + "loss": 0.80450928, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.29345703, + "step": 1587, + "time_per_iteration": 2.793938159942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095768, + "balance_loss_mlp": 1.06610858, + "epoch": 0.3055021161985379, + "flos": 606810604032.0, + "grad_norm": 0.06572122415162869, + "language_loss": 0.90151691, + "learning_rate": 0.0008138282186055897, + "loss": 0.91247463, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.29614258, + "step": 1588, + "time_per_iteration": 2.7083215713500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093617, + "balance_loss_mlp": 1.06414866, + "epoch": 0.3056944978838015, + "flos": 573594794496.0, + "grad_norm": 0.07456080522357873, + "language_loss": 0.82026887, + "learning_rate": 0.0008135856249855331, + "loss": 0.83120513, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.29467773, + "step": 1589, + "time_per_iteration": 2.6640753746032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108636, + "balance_loss_mlp": 1.05720115, + "epoch": 0.305886879569065, + "flos": 633640485888.0, + "grad_norm": 0.06169186885540492, + "language_loss": 0.89804673, + "learning_rate": 0.0008133429096207398, + "loss": 0.90891039, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.29125977, + "step": 1590, + "time_per_iteration": 2.7599587440490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180768, + "balance_loss_mlp": 1.16407835, + "epoch": 0.3060792612543286, + "flos": 1368227414016.0, + "grad_norm": 0.058161185258212886, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76493025, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.16699219, + "step": 1591, + "time_per_iteration": 4.928807973861694 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092058, + "balance_loss_mlp": 1.06149244, + "epoch": 0.30627164293959214, + "flos": 518290887168.0, + "grad_norm": 0.05378358074526122, + "language_loss": 0.86363673, + "learning_rate": 0.0008128571140339123, + "loss": 0.87455726, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.30517578, + "step": 1592, + "time_per_iteration": 2.6374073028564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093229, + "balance_loss_mlp": 1.06182945, + "epoch": 0.3064640246248557, + "flos": 455354168832.0, + "grad_norm": 0.059608258439458016, + "language_loss": 0.87261879, + "learning_rate": 0.0008126140340004805, + "loss": 0.88355112, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.3137207, + "step": 1593, + "time_per_iteration": 2.5177900791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106947, + "balance_loss_mlp": 1.07528496, + "epoch": 0.30665640631011926, + "flos": 849718480896.0, + "grad_norm": 0.05384575425533411, + "language_loss": 0.82083076, + "learning_rate": 0.0008123708325995172, + "loss": 0.83190024, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.31640625, + "step": 1594, + "time_per_iteration": 3.230646848678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106567, + "balance_loss_mlp": 1.07466626, + "epoch": 0.30684878799538284, + "flos": 757996406784.0, + "grad_norm": 0.05828956025392548, + "language_loss": 0.79435146, + "learning_rate": 0.0008121275099254414, + "loss": 0.80541706, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.31884766, + "step": 1595, + "time_per_iteration": 2.902198553085327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100784, + "balance_loss_mlp": 1.07000458, + "epoch": 0.3070411696806464, + "flos": 517320428544.0, + "grad_norm": 0.0810481792888773, + "language_loss": 0.87996, + "learning_rate": 0.0008118840660727194, + "loss": 0.89096785, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.30761719, + "step": 1596, + "time_per_iteration": 2.6448442935943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086051, + "balance_loss_mlp": 1.05465174, + "epoch": 0.30723355136590996, + "flos": 843883992576.0, + "grad_norm": 0.06221817840069264, + "language_loss": 0.87278962, + "learning_rate": 0.0008116405011358644, + "loss": 0.88365012, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.3137207, + "step": 1597, + "time_per_iteration": 3.1513490676879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084783, + "balance_loss_mlp": 1.05455184, + "epoch": 0.30742593305117355, + "flos": 465905710080.0, + "grad_norm": 0.05780846158028219, + "language_loss": 0.79670262, + "learning_rate": 0.0008113968152094369, + "loss": 0.80755049, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.30175781, + "step": 1598, + "time_per_iteration": 2.5093207359313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081718, + "balance_loss_mlp": 1.05160582, + "epoch": 0.3076183147364371, + "flos": 686286120960.0, + "grad_norm": 0.05742950260468591, + "language_loss": 0.822034, + "learning_rate": 0.0008111530083880438, + "loss": 0.83285123, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.30078125, + "step": 1599, + "time_per_iteration": 2.9002020359039307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083164, + "balance_loss_mlp": 1.05333805, + "epoch": 0.30781069642170067, + "flos": 613729032192.0, + "grad_norm": 0.066825138462863, + "language_loss": 0.86253393, + "learning_rate": 0.0008109090807663399, + "loss": 0.87336552, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.29760742, + "step": 1600, + "time_per_iteration": 2.8091297149658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078593, + "balance_loss_mlp": 1.04921985, + "epoch": 0.3080030781069642, + "flos": 590021521920.0, + "grad_norm": 0.05248494232095894, + "language_loss": 0.88362008, + "learning_rate": 0.0008106650324390257, + "loss": 0.89440602, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.29370117, + "step": 1601, + "time_per_iteration": 2.8476614952087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080904, + "balance_loss_mlp": 1.05072021, + "epoch": 0.3081954597922278, + "flos": 562353601536.0, + "grad_norm": 0.06836714374526962, + "language_loss": 0.81128752, + "learning_rate": 0.0008104208635008493, + "loss": 0.82209659, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.30151367, + "step": 1602, + "time_per_iteration": 2.6952836513519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108624, + "balance_loss_mlp": 1.05665243, + "epoch": 0.3083878414774913, + "flos": 447599112192.0, + "grad_norm": 0.06376665529861299, + "language_loss": 0.81538713, + "learning_rate": 0.0008101765740466058, + "loss": 0.82624954, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.29541016, + "step": 1603, + "time_per_iteration": 2.4948389530181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080977, + "balance_loss_mlp": 1.05098414, + "epoch": 0.3085802231627549, + "flos": 493297205760.0, + "grad_norm": 0.06931980864978393, + "language_loss": 0.84338289, + "learning_rate": 0.0008099321641711364, + "loss": 0.85419261, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.29931641, + "step": 1604, + "time_per_iteration": 2.707308769226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093892, + "balance_loss_mlp": 1.06249225, + "epoch": 0.3087726048480185, + "flos": 487437986304.0, + "grad_norm": 0.060864651717696075, + "language_loss": 0.83160985, + "learning_rate": 0.0008096876339693295, + "loss": 0.84254879, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.3137207, + "step": 1605, + "time_per_iteration": 2.731968402862549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094701, + "balance_loss_mlp": 1.06353974, + "epoch": 0.308964986533282, + "flos": 730265877504.0, + "grad_norm": 0.06509347225319946, + "language_loss": 0.8101337, + "learning_rate": 0.0008094429835361206, + "loss": 0.8210808, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.3112793, + "step": 1606, + "time_per_iteration": 2.9290759563446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089997, + "balance_loss_mlp": 1.05914617, + "epoch": 0.3091573682185456, + "flos": 605131554816.0, + "grad_norm": 0.057098253953708926, + "language_loss": 0.8565855, + "learning_rate": 0.0008091982129664908, + "loss": 0.86748546, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.30810547, + "step": 1607, + "time_per_iteration": 2.698822021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087412, + "balance_loss_mlp": 1.05558348, + "epoch": 0.30934974990380915, + "flos": 460081396224.0, + "grad_norm": 0.06809183454795278, + "language_loss": 0.82921505, + "learning_rate": 0.0008089533223554687, + "loss": 0.8400892, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.31811523, + "step": 1608, + "time_per_iteration": 2.7226502895355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108171, + "balance_loss_mlp": 1.05116844, + "epoch": 0.30954213158907273, + "flos": 553142075904.0, + "grad_norm": 0.05457453553086006, + "language_loss": 0.85192972, + "learning_rate": 0.0008087083117981294, + "loss": 0.86274683, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.30493164, + "step": 1609, + "time_per_iteration": 2.8990776538848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079, + "balance_loss_mlp": 1.04733825, + "epoch": 0.30973451327433627, + "flos": 552776901120.0, + "grad_norm": 0.05682891267097286, + "language_loss": 0.87723553, + "learning_rate": 0.0008084631813895943, + "loss": 0.88802552, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.31665039, + "step": 1610, + "time_per_iteration": 2.8217973709106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077424, + "balance_loss_mlp": 1.04538095, + "epoch": 0.30992689495959985, + "flos": 565430893056.0, + "grad_norm": 0.06653230383850259, + "language_loss": 0.83695799, + "learning_rate": 0.0008082179312250315, + "loss": 0.84773219, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.3203125, + "step": 1611, + "time_per_iteration": 2.6502630710601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157933, + "balance_loss_mlp": 1.13905036, + "epoch": 0.3101192766448634, + "flos": 1441621131264.0, + "grad_norm": 0.03907624866068961, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.81013775, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.18847656, + "step": 1612, + "time_per_iteration": 4.846347332000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142611, + "balance_loss_mlp": 1.12401426, + "epoch": 0.31031165833012697, + "flos": 1531086575616.0, + "grad_norm": 0.03590336133433786, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77771938, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.18554688, + "step": 1613, + "time_per_iteration": 5.076608896255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085959, + "balance_loss_mlp": 1.05432057, + "epoch": 0.31050404001539056, + "flos": 991534196736.0, + "grad_norm": 0.06574200684353006, + "language_loss": 0.81847739, + "learning_rate": 0.0008074814631475545, + "loss": 0.829337, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.31616211, + "step": 1614, + "time_per_iteration": 3.354888916015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086834, + "balance_loss_mlp": 1.05552983, + "epoch": 0.3106964217006541, + "flos": 445748355072.0, + "grad_norm": 0.058665683967318874, + "language_loss": 0.79078931, + "learning_rate": 0.0008072357349114907, + "loss": 0.80165768, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.31274414, + "step": 1615, + "time_per_iteration": 2.66959810256958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085653, + "balance_loss_mlp": 1.05427742, + "epoch": 0.3108888033859177, + "flos": 510259405824.0, + "grad_norm": 0.07028059658598983, + "language_loss": 0.88604105, + "learning_rate": 0.0008069898873959363, + "loss": 0.89689755, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.31347656, + "step": 1616, + "time_per_iteration": 2.652873992919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081821, + "balance_loss_mlp": 1.04932451, + "epoch": 0.3110811850711812, + "flos": 520471913472.0, + "grad_norm": 0.0549356144381418, + "language_loss": 0.85724425, + "learning_rate": 0.0008067439206963375, + "loss": 0.86806244, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.32495117, + "step": 1617, + "time_per_iteration": 2.651966094970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078025, + "balance_loss_mlp": 1.04707837, + "epoch": 0.3112735667564448, + "flos": 686085299712.0, + "grad_norm": 0.06196009796144799, + "language_loss": 0.86023569, + "learning_rate": 0.0008064978349081873, + "loss": 0.87101597, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.30908203, + "step": 1618, + "time_per_iteration": 2.9655556678771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076741, + "balance_loss_mlp": 1.04403007, + "epoch": 0.31146594844170833, + "flos": 532786871808.0, + "grad_norm": 0.05286958899784421, + "language_loss": 0.86531937, + "learning_rate": 0.0008062516301270245, + "loss": 0.87608671, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.32714844, + "step": 1619, + "time_per_iteration": 2.6688730716705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077635, + "balance_loss_mlp": 1.04668832, + "epoch": 0.3116583301269719, + "flos": 679187220480.0, + "grad_norm": 0.04767982292239376, + "language_loss": 0.88103712, + "learning_rate": 0.0008060053064484343, + "loss": 0.89181346, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.30908203, + "step": 1620, + "time_per_iteration": 2.9296655654907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078439, + "balance_loss_mlp": 1.04794526, + "epoch": 0.31185071181223545, + "flos": 585855908352.0, + "grad_norm": 0.062218975842766755, + "language_loss": 0.85253787, + "learning_rate": 0.0008057588639680482, + "loss": 0.86332226, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.3046875, + "step": 1621, + "time_per_iteration": 2.7567451000213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077048, + "balance_loss_mlp": 1.04686427, + "epoch": 0.31204309349749904, + "flos": 725090517504.0, + "grad_norm": 0.06694670244497776, + "language_loss": 0.82797694, + "learning_rate": 0.0008055123027815434, + "loss": 0.83874738, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.30151367, + "step": 1622, + "time_per_iteration": 2.9208602905273438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077079, + "balance_loss_mlp": 1.04610825, + "epoch": 0.3122354751827626, + "flos": 576558604800.0, + "grad_norm": 0.1782498685509151, + "language_loss": 0.84590065, + "learning_rate": 0.0008052656229846436, + "loss": 0.85667145, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.30932617, + "step": 1623, + "time_per_iteration": 2.7155866622924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073968, + "balance_loss_mlp": 1.04328322, + "epoch": 0.31242785686802615, + "flos": 575672514048.0, + "grad_norm": 0.060959339396114136, + "language_loss": 0.90353578, + "learning_rate": 0.0008050188246731182, + "loss": 0.91427553, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.30664062, + "step": 1624, + "time_per_iteration": 2.6797330379486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076074, + "balance_loss_mlp": 1.04412627, + "epoch": 0.31262023855328974, + "flos": 736490271744.0, + "grad_norm": 0.055606567643031936, + "language_loss": 0.81689882, + "learning_rate": 0.0008047719079427834, + "loss": 0.82765961, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.31933594, + "step": 1625, + "time_per_iteration": 3.0065042972564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130441, + "balance_loss_mlp": 1.11031902, + "epoch": 0.3128126202385533, + "flos": 1558395113472.0, + "grad_norm": 0.04475298972307083, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75482148, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.20117188, + "step": 1626, + "time_per_iteration": 4.811471462249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079382, + "balance_loss_mlp": 1.04688525, + "epoch": 0.31300500192381686, + "flos": 514666538496.0, + "grad_norm": 0.07327685166102689, + "language_loss": 0.86126161, + "learning_rate": 0.0008042777196091757, + "loss": 0.87205535, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.32495117, + "step": 1627, + "time_per_iteration": 2.673499584197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108474, + "balance_loss_mlp": 1.05241048, + "epoch": 0.3131973836090804, + "flos": 526370420736.0, + "grad_norm": 0.055253724304277024, + "language_loss": 0.81718934, + "learning_rate": 0.0008040304481977643, + "loss": 0.82803679, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.32324219, + "step": 1628, + "time_per_iteration": 2.655608654022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088465, + "balance_loss_mlp": 1.0556109, + "epoch": 0.313389765294344, + "flos": 822473961984.0, + "grad_norm": 0.07469207399290811, + "language_loss": 0.86699098, + "learning_rate": 0.0008037830587512649, + "loss": 0.87787557, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.32861328, + "step": 1629, + "time_per_iteration": 3.092052459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108569, + "balance_loss_mlp": 1.0538609, + "epoch": 0.31358214697960757, + "flos": 393604697088.0, + "grad_norm": 0.05491200172004239, + "language_loss": 0.78946573, + "learning_rate": 0.0008035355513657224, + "loss": 0.80032265, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.31811523, + "step": 1630, + "time_per_iteration": 2.539320468902588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082653, + "balance_loss_mlp": 1.05111051, + "epoch": 0.3137745286648711, + "flos": 571611617280.0, + "grad_norm": 0.05139869194515267, + "language_loss": 0.92925692, + "learning_rate": 0.0008032879261372279, + "loss": 0.94008344, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.31518555, + "step": 1631, + "time_per_iteration": 2.779520034790039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107671, + "balance_loss_mlp": 1.05868566, + "epoch": 0.3139669103501347, + "flos": 1497614690304.0, + "grad_norm": 0.031013784922197977, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80712551, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.18066406, + "step": 1632, + "time_per_iteration": 5.371822357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078996, + "balance_loss_mlp": 1.04828787, + "epoch": 0.3141592920353982, + "flos": 525090041856.0, + "grad_norm": 0.055553714952817974, + "language_loss": 0.87074977, + "learning_rate": 0.0008027923225359748, + "loss": 0.8815397, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.30688477, + "step": 1633, + "time_per_iteration": 2.6381123065948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078973, + "balance_loss_mlp": 1.04797852, + "epoch": 0.3143516737206618, + "flos": 592989714432.0, + "grad_norm": 0.05859649155609266, + "language_loss": 0.88228178, + "learning_rate": 0.0008025443443556267, + "loss": 0.89307147, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.30957031, + "step": 1634, + "time_per_iteration": 2.7031404972076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078155, + "balance_loss_mlp": 1.04785156, + "epoch": 0.31454405540592534, + "flos": 648034573824.0, + "grad_norm": 0.052081770011180493, + "language_loss": 0.88152099, + "learning_rate": 0.000802296248717147, + "loss": 0.89230251, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.30273438, + "step": 1635, + "time_per_iteration": 2.9598543643951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080703, + "balance_loss_mlp": 1.05080533, + "epoch": 0.3147364370911889, + "flos": 642543501312.0, + "grad_norm": 0.066530556652877, + "language_loss": 0.78616363, + "learning_rate": 0.0008020480357168554, + "loss": 0.79697067, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.29833984, + "step": 1636, + "time_per_iteration": 2.797565221786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010823, + "balance_loss_mlp": 1.05261683, + "epoch": 0.31492881877645246, + "flos": 471607778304.0, + "grad_norm": 0.1046412191682548, + "language_loss": 0.87883365, + "learning_rate": 0.0008017997054511165, + "loss": 0.88965666, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.29638672, + "step": 1637, + "time_per_iteration": 2.559032440185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078208, + "balance_loss_mlp": 1.04733276, + "epoch": 0.31512120046171604, + "flos": 629135838720.0, + "grad_norm": 0.05513941849331592, + "language_loss": 0.85624552, + "learning_rate": 0.0008015512580163407, + "loss": 0.86702752, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.30834961, + "step": 1638, + "time_per_iteration": 2.779050827026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074987, + "balance_loss_mlp": 1.04363525, + "epoch": 0.31531358214697963, + "flos": 703460726784.0, + "grad_norm": 0.05557291013478606, + "language_loss": 0.81019449, + "learning_rate": 0.0008013026935089838, + "loss": 0.82094443, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.31323242, + "step": 1639, + "time_per_iteration": 2.89715838432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076962, + "balance_loss_mlp": 1.04701638, + "epoch": 0.31550596383224316, + "flos": 572275127808.0, + "grad_norm": 0.06613944709877946, + "language_loss": 0.8358075, + "learning_rate": 0.0008010540120255472, + "loss": 0.84657711, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.29882812, + "step": 1640, + "time_per_iteration": 2.651386260986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077047, + "balance_loss_mlp": 1.0463388, + "epoch": 0.31569834551750675, + "flos": 658047822336.0, + "grad_norm": 0.07317243700129339, + "language_loss": 0.86339968, + "learning_rate": 0.0008008052136625774, + "loss": 0.87417012, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.30688477, + "step": 1641, + "time_per_iteration": 2.7859702110290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077184, + "balance_loss_mlp": 1.04642797, + "epoch": 0.3158907272027703, + "flos": 566002681344.0, + "grad_norm": 0.05078324108170858, + "language_loss": 0.86915755, + "learning_rate": 0.0008005562985166666, + "loss": 0.87992936, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.30712891, + "step": 1642, + "time_per_iteration": 2.770359516143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078554, + "balance_loss_mlp": 1.04775047, + "epoch": 0.31608310888803387, + "flos": 536622216192.0, + "grad_norm": 0.048579646337906, + "language_loss": 0.85256124, + "learning_rate": 0.0008003072666844524, + "loss": 0.86334682, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.30761719, + "step": 1643, + "time_per_iteration": 2.6892380714416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081754, + "balance_loss_mlp": 1.05076003, + "epoch": 0.3162754905732974, + "flos": 486428239872.0, + "grad_norm": 0.06943709441331726, + "language_loss": 0.82542813, + "learning_rate": 0.0008000581182626173, + "loss": 0.83624566, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.30981445, + "step": 1644, + "time_per_iteration": 2.550408124923706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085079, + "balance_loss_mlp": 1.05496669, + "epoch": 0.316467872258561, + "flos": 529792538112.0, + "grad_norm": 0.05777646040930187, + "language_loss": 0.86256635, + "learning_rate": 0.0007998088533478894, + "loss": 0.87341708, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.30053711, + "step": 1645, + "time_per_iteration": 2.646522283554077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081027, + "balance_loss_mlp": 1.05019915, + "epoch": 0.3166602539438245, + "flos": 443197771776.0, + "grad_norm": 0.07748310873558778, + "language_loss": 0.84388101, + "learning_rate": 0.000799559472037042, + "loss": 0.85469127, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.30786133, + "step": 1646, + "time_per_iteration": 2.562495708465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081594, + "balance_loss_mlp": 1.05112433, + "epoch": 0.3168526356290881, + "flos": 645513103872.0, + "grad_norm": 0.0644603274178606, + "language_loss": 0.87469906, + "learning_rate": 0.0007993099744268932, + "loss": 0.88551497, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.30419922, + "step": 1647, + "time_per_iteration": 2.905468225479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074972, + "balance_loss_mlp": 1.04414475, + "epoch": 0.3170450173143517, + "flos": 585889403904.0, + "grad_norm": 0.06139744482341488, + "language_loss": 0.87846816, + "learning_rate": 0.000799060360614307, + "loss": 0.88921791, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.30786133, + "step": 1648, + "time_per_iteration": 2.6811182498931885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083311, + "balance_loss_mlp": 1.05250716, + "epoch": 0.3172373989996152, + "flos": 826763231232.0, + "grad_norm": 0.05150264807756507, + "language_loss": 0.83281147, + "learning_rate": 0.0007988106306961917, + "loss": 0.84364462, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.30761719, + "step": 1649, + "time_per_iteration": 3.132918119430542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078108, + "balance_loss_mlp": 1.04840076, + "epoch": 0.3174297806848788, + "flos": 527153204736.0, + "grad_norm": 0.0787550229152594, + "language_loss": 0.84213352, + "learning_rate": 0.0007985607847695014, + "loss": 0.85291457, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.29663086, + "step": 1650, + "time_per_iteration": 2.690056085586548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078554, + "balance_loss_mlp": 1.04784608, + "epoch": 0.31762216237014235, + "flos": 712855544832.0, + "grad_norm": 0.0566788479410698, + "language_loss": 0.82883936, + "learning_rate": 0.0007983108229312345, + "loss": 0.83962488, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.30664062, + "step": 1651, + "time_per_iteration": 2.918217182159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077737, + "balance_loss_mlp": 1.04679036, + "epoch": 0.31781454405540593, + "flos": 483567736320.0, + "grad_norm": 0.0674507609019882, + "language_loss": 0.86496019, + "learning_rate": 0.0007980607452784351, + "loss": 0.87573761, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.30908203, + "step": 1652, + "time_per_iteration": 2.5508391857147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081941, + "balance_loss_mlp": 1.052019, + "epoch": 0.31800692574066947, + "flos": 548483249664.0, + "grad_norm": 0.06063063486045483, + "language_loss": 0.90349394, + "learning_rate": 0.0007978105519081919, + "loss": 0.91431332, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.29858398, + "step": 1653, + "time_per_iteration": 2.6819515228271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079168, + "balance_loss_mlp": 1.04910302, + "epoch": 0.31819930742593305, + "flos": 516640951296.0, + "grad_norm": 0.0738675373878511, + "language_loss": 0.87538201, + "learning_rate": 0.0007975602429176385, + "loss": 0.88617373, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.30004883, + "step": 1654, + "time_per_iteration": 2.586261034011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084486, + "balance_loss_mlp": 1.05356312, + "epoch": 0.31839168911119664, + "flos": 455748456960.0, + "grad_norm": 0.051475836139836105, + "language_loss": 0.81585073, + "learning_rate": 0.0007973098184039536, + "loss": 0.82669556, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.30883789, + "step": 1655, + "time_per_iteration": 2.66395902633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083198, + "balance_loss_mlp": 1.05291927, + "epoch": 0.3185840707964602, + "flos": 625719513600.0, + "grad_norm": 0.059751712008043044, + "language_loss": 0.86801946, + "learning_rate": 0.0007970592784643602, + "loss": 0.87885141, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.30224609, + "step": 1656, + "time_per_iteration": 2.9186086654663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087926, + "balance_loss_mlp": 1.05855238, + "epoch": 0.31877645248172376, + "flos": 567213249024.0, + "grad_norm": 0.07875703275612048, + "language_loss": 0.85285407, + "learning_rate": 0.0007968086231961272, + "loss": 0.86373335, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.29321289, + "step": 1657, + "time_per_iteration": 2.6505343914031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089245, + "balance_loss_mlp": 1.05941832, + "epoch": 0.3189688341669873, + "flos": 489338205696.0, + "grad_norm": 0.08653253817480935, + "language_loss": 0.8381049, + "learning_rate": 0.0007965578526965671, + "loss": 0.84899735, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.29785156, + "step": 1658, + "time_per_iteration": 2.5884180068969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089397, + "balance_loss_mlp": 1.05995274, + "epoch": 0.3191612158522509, + "flos": 575948938752.0, + "grad_norm": 0.05523051502884026, + "language_loss": 0.86312473, + "learning_rate": 0.0007963069670630377, + "loss": 0.87401861, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.29394531, + "step": 1659, + "time_per_iteration": 2.750601291656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089678, + "balance_loss_mlp": 1.05997133, + "epoch": 0.3193535975375144, + "flos": 537867689472.0, + "grad_norm": 0.06732717892338919, + "language_loss": 0.8810066, + "learning_rate": 0.0007960559663929416, + "loss": 0.89190334, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.29663086, + "step": 1660, + "time_per_iteration": 2.6370737552642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096524, + "balance_loss_mlp": 1.06633985, + "epoch": 0.319545979222778, + "flos": 733954245120.0, + "grad_norm": 0.0532651376254825, + "language_loss": 0.87495023, + "learning_rate": 0.0007958048507837259, + "loss": 0.88591546, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.30151367, + "step": 1661, + "time_per_iteration": 2.942779779434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093392, + "balance_loss_mlp": 1.06316066, + "epoch": 0.31973836090804153, + "flos": 764136433152.0, + "grad_norm": 0.07710421129836972, + "language_loss": 0.87092876, + "learning_rate": 0.0007955536203328822, + "loss": 0.8818627, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.30175781, + "step": 1662, + "time_per_iteration": 2.8991520404815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100595, + "balance_loss_mlp": 1.07072091, + "epoch": 0.3199307425933051, + "flos": 560252560896.0, + "grad_norm": 0.05380031942726595, + "language_loss": 0.8344577, + "learning_rate": 0.0007953022751379469, + "loss": 0.84546363, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.2980957, + "step": 1663, + "time_per_iteration": 2.795117139816284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102553, + "balance_loss_mlp": 1.07239294, + "epoch": 0.3201231242785687, + "flos": 751019751936.0, + "grad_norm": 0.0657811186180598, + "language_loss": 0.81884921, + "learning_rate": 0.000795050815296501, + "loss": 0.82987475, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.30151367, + "step": 1664, + "time_per_iteration": 2.969935894012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099283, + "balance_loss_mlp": 1.06890798, + "epoch": 0.32031550596383224, + "flos": 496157709312.0, + "grad_norm": 0.058736361347452894, + "language_loss": 0.93026185, + "learning_rate": 0.0007947992409061695, + "loss": 0.94125462, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.30322266, + "step": 1665, + "time_per_iteration": 2.585144281387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092624, + "balance_loss_mlp": 1.06182027, + "epoch": 0.3205078876490958, + "flos": 731294562816.0, + "grad_norm": 0.05523611327933496, + "language_loss": 0.8654207, + "learning_rate": 0.0007945475520646226, + "loss": 0.87634689, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.30761719, + "step": 1666, + "time_per_iteration": 2.9349849224090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092223, + "balance_loss_mlp": 1.06249237, + "epoch": 0.32070026933435936, + "flos": 549177283584.0, + "grad_norm": 0.05521997897435197, + "language_loss": 0.84546125, + "learning_rate": 0.0007942957488695743, + "loss": 0.85638344, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.296875, + "step": 1667, + "time_per_iteration": 2.6538572311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083745, + "balance_loss_mlp": 1.0539664, + "epoch": 0.32089265101962294, + "flos": 744949536768.0, + "grad_norm": 0.05331163349230756, + "language_loss": 0.81038171, + "learning_rate": 0.0007940438314187833, + "loss": 0.82121915, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.29760742, + "step": 1668, + "time_per_iteration": 3.009927988052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108075, + "balance_loss_mlp": 1.05016077, + "epoch": 0.3210850327048865, + "flos": 493937395200.0, + "grad_norm": 0.06087879277496283, + "language_loss": 0.80221838, + "learning_rate": 0.0007937917998100529, + "loss": 0.81302583, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.30541992, + "step": 1669, + "time_per_iteration": 2.5703017711639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072786, + "balance_loss_mlp": 1.0426501, + "epoch": 0.32127741439015006, + "flos": 530383265280.0, + "grad_norm": 0.07064769089672658, + "language_loss": 0.78527176, + "learning_rate": 0.0007935396541412302, + "loss": 0.79599965, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.30102539, + "step": 1670, + "time_per_iteration": 2.625499725341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081422, + "balance_loss_mlp": 1.05099988, + "epoch": 0.3214697960754136, + "flos": 500948955648.0, + "grad_norm": 0.0720065018777928, + "language_loss": 0.8546167, + "learning_rate": 0.0007932873945102068, + "loss": 0.86543095, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.30395508, + "step": 1671, + "time_per_iteration": 2.6188762187957764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074685, + "balance_loss_mlp": 1.05713737, + "epoch": 0.3216621777606772, + "flos": 1382579394048.0, + "grad_norm": 0.027722134190714592, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76836461, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.17578125, + "step": 1672, + "time_per_iteration": 4.9278037548065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081072, + "balance_loss_mlp": 1.05057812, + "epoch": 0.32185455944594077, + "flos": 571260999168.0, + "grad_norm": 0.053011814820585035, + "language_loss": 0.86121267, + "learning_rate": 0.0007927825337533461, + "loss": 0.87202334, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.3046875, + "step": 1673, + "time_per_iteration": 2.6787123680114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075926, + "balance_loss_mlp": 1.0452652, + "epoch": 0.3220469411312043, + "flos": 543652715520.0, + "grad_norm": 0.06681709765508774, + "language_loss": 0.84770656, + "learning_rate": 0.0007925299328235131, + "loss": 0.85846579, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.30615234, + "step": 1674, + "time_per_iteration": 2.638434410095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080022, + "balance_loss_mlp": 1.04890847, + "epoch": 0.3222393228164679, + "flos": 490884834816.0, + "grad_norm": 0.06949369164102485, + "language_loss": 0.84795958, + "learning_rate": 0.000792277218323488, + "loss": 0.85875976, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.31103516, + "step": 1675, + "time_per_iteration": 2.5852880477905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077792, + "balance_loss_mlp": 1.04653537, + "epoch": 0.3224317045017314, + "flos": 490145720832.0, + "grad_norm": 0.06490362841252771, + "language_loss": 0.84737194, + "learning_rate": 0.0007920243903513833, + "loss": 0.85814989, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.31225586, + "step": 1676, + "time_per_iteration": 2.558058261871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083649, + "balance_loss_mlp": 1.0523684, + "epoch": 0.322624086186995, + "flos": 575505188352.0, + "grad_norm": 0.0667244817356676, + "language_loss": 0.83645618, + "learning_rate": 0.0007917714490053556, + "loss": 0.84729266, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.3125, + "step": 1677, + "time_per_iteration": 2.6619315147399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082413, + "balance_loss_mlp": 1.05046487, + "epoch": 0.32281646787225854, + "flos": 628974305280.0, + "grad_norm": 0.05833648566333407, + "language_loss": 0.85744321, + "learning_rate": 0.0007915183943836055, + "loss": 0.8682673, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.31933594, + "step": 1678, + "time_per_iteration": 2.8658525943756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079268, + "balance_loss_mlp": 1.04729617, + "epoch": 0.3230088495575221, + "flos": 781036024320.0, + "grad_norm": 0.06725353636254193, + "language_loss": 0.84315777, + "learning_rate": 0.0007912652265843773, + "loss": 0.8539505, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.31958008, + "step": 1679, + "time_per_iteration": 3.0343000888824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108303, + "balance_loss_mlp": 1.05019951, + "epoch": 0.3232012312427857, + "flos": 535839432192.0, + "grad_norm": 0.062193961969532426, + "language_loss": 0.81564045, + "learning_rate": 0.0007910119457059597, + "loss": 0.82647079, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.32836914, + "step": 1680, + "time_per_iteration": 2.6963257789611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085423, + "balance_loss_mlp": 1.05333161, + "epoch": 0.32339361292804925, + "flos": 704515553280.0, + "grad_norm": 0.0682304205879652, + "language_loss": 0.80304003, + "learning_rate": 0.0007907585518466849, + "loss": 0.81389421, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.32080078, + "step": 1681, + "time_per_iteration": 2.969540596008301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081665, + "balance_loss_mlp": 1.05026531, + "epoch": 0.32358599461331283, + "flos": 452099377152.0, + "grad_norm": 0.06175447283803796, + "language_loss": 0.89361274, + "learning_rate": 0.000790505045104929, + "loss": 0.90442938, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.3137207, + "step": 1682, + "time_per_iteration": 2.5148813724517822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082889, + "balance_loss_mlp": 1.05108356, + "epoch": 0.32377837629857636, + "flos": 600597794304.0, + "grad_norm": 0.061424377243362256, + "language_loss": 0.87097234, + "learning_rate": 0.0007902514255791125, + "loss": 0.88180125, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.31787109, + "step": 1683, + "time_per_iteration": 2.7773754596710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078151, + "balance_loss_mlp": 1.04696608, + "epoch": 0.32397075798383995, + "flos": 807180636672.0, + "grad_norm": 0.06766194852988328, + "language_loss": 0.87911332, + "learning_rate": 0.0007899976933676986, + "loss": 0.88989484, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.31176758, + "step": 1684, + "time_per_iteration": 2.9700520038604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078008, + "balance_loss_mlp": 1.04589295, + "epoch": 0.3241631396691035, + "flos": 601414073856.0, + "grad_norm": 0.061649412189834635, + "language_loss": 0.87300712, + "learning_rate": 0.0007897438485691955, + "loss": 0.88378721, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.32104492, + "step": 1685, + "time_per_iteration": 2.6798696517944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077233, + "balance_loss_mlp": 1.04483223, + "epoch": 0.32435552135436707, + "flos": 473980861440.0, + "grad_norm": 0.06379930216662907, + "language_loss": 0.823452, + "learning_rate": 0.0007894898912821542, + "loss": 0.83422434, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.32397461, + "step": 1686, + "time_per_iteration": 2.5478906631469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071757, + "balance_loss_mlp": 1.03978539, + "epoch": 0.3245479030396306, + "flos": 537824019456.0, + "grad_norm": 0.05321818652056826, + "language_loss": 0.86522776, + "learning_rate": 0.0007892358216051695, + "loss": 0.87594533, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.31958008, + "step": 1687, + "time_per_iteration": 2.735633134841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075777, + "balance_loss_mlp": 1.04251742, + "epoch": 0.3247402847248942, + "flos": 547394927616.0, + "grad_norm": 0.0608133700269358, + "language_loss": 0.91922832, + "learning_rate": 0.0007889816396368803, + "loss": 0.92998612, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.33276367, + "step": 1688, + "time_per_iteration": 2.6234939098358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077878, + "balance_loss_mlp": 1.04497576, + "epoch": 0.3249326664101578, + "flos": 377941814784.0, + "grad_norm": 0.0630363811740232, + "language_loss": 0.85370868, + "learning_rate": 0.0007887273454759687, + "loss": 0.86448747, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.32910156, + "step": 1689, + "time_per_iteration": 2.4698379039764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074583, + "balance_loss_mlp": 1.04184794, + "epoch": 0.3251250480954213, + "flos": 527818125312.0, + "grad_norm": 0.06604183912716106, + "language_loss": 0.82445431, + "learning_rate": 0.0007884729392211603, + "loss": 0.83520007, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.32739258, + "step": 1690, + "time_per_iteration": 2.6488864421844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081602, + "balance_loss_mlp": 1.04920113, + "epoch": 0.3253174297806849, + "flos": 449435312640.0, + "grad_norm": 0.06849578130600678, + "language_loss": 0.85280114, + "learning_rate": 0.0007882184209712245, + "loss": 0.86361718, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.32397461, + "step": 1691, + "time_per_iteration": 2.5209100246429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080531, + "balance_loss_mlp": 1.04874992, + "epoch": 0.32550981146594843, + "flos": 703855014912.0, + "grad_norm": 0.06225581397596747, + "language_loss": 0.8573736, + "learning_rate": 0.000787963790824974, + "loss": 0.8681789, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.31762695, + "step": 1692, + "time_per_iteration": 2.9696617126464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092322, + "balance_loss_mlp": 1.06054115, + "epoch": 0.325702193151212, + "flos": 392491643904.0, + "grad_norm": 0.0857009989212748, + "language_loss": 0.89660913, + "learning_rate": 0.0007877090488812651, + "loss": 0.90753233, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.31762695, + "step": 1693, + "time_per_iteration": 2.431861639022827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086739, + "balance_loss_mlp": 1.05553031, + "epoch": 0.32589457483647555, + "flos": 577223525376.0, + "grad_norm": 0.07076453254267401, + "language_loss": 0.8368417, + "learning_rate": 0.0007874541952389973, + "loss": 0.84770912, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.31176758, + "step": 1694, + "time_per_iteration": 2.647468328475952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084594, + "balance_loss_mlp": 1.05293202, + "epoch": 0.32608695652173914, + "flos": 498092834304.0, + "grad_norm": 0.060562687008333366, + "language_loss": 0.86582285, + "learning_rate": 0.0007871992299971136, + "loss": 0.87666881, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.31640625, + "step": 1695, + "time_per_iteration": 2.553171396255493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092239, + "balance_loss_mlp": 1.0608871, + "epoch": 0.32627933820700267, + "flos": 590858150400.0, + "grad_norm": 0.05969457295977618, + "language_loss": 0.84301764, + "learning_rate": 0.0007869441532546001, + "loss": 0.85394001, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.31323242, + "step": 1696, + "time_per_iteration": 2.752049446105957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093733, + "balance_loss_mlp": 1.06247652, + "epoch": 0.32647171989226625, + "flos": 608790809088.0, + "grad_norm": 0.05927141137383595, + "language_loss": 0.79686946, + "learning_rate": 0.0007866889651104867, + "loss": 0.80780673, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.31225586, + "step": 1697, + "time_per_iteration": 2.7691686153411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109533, + "balance_loss_mlp": 1.06388259, + "epoch": 0.32666410157752984, + "flos": 476896619520.0, + "grad_norm": 0.0715366482234757, + "language_loss": 0.83218181, + "learning_rate": 0.000786433665663846, + "loss": 0.84313512, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.31420898, + "step": 1698, + "time_per_iteration": 2.717372179031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098821, + "balance_loss_mlp": 1.06816053, + "epoch": 0.3268564832627934, + "flos": 718060018176.0, + "grad_norm": 0.05645489658390659, + "language_loss": 0.86431837, + "learning_rate": 0.0007861782550137942, + "loss": 0.87530661, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.30615234, + "step": 1699, + "time_per_iteration": 2.9035465717315674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103768, + "balance_loss_mlp": 1.07394195, + "epoch": 0.32704886494805696, + "flos": 768469372416.0, + "grad_norm": 0.11170286971508382, + "language_loss": 0.85853553, + "learning_rate": 0.0007859227332594901, + "loss": 0.86957312, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.29785156, + "step": 1700, + "time_per_iteration": 2.9302797317504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093978, + "balance_loss_mlp": 1.06508183, + "epoch": 0.3272412466333205, + "flos": 849540980736.0, + "grad_norm": 0.07200471053268022, + "language_loss": 0.84801477, + "learning_rate": 0.0007856671005001365, + "loss": 0.85895455, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.28881836, + "step": 1701, + "time_per_iteration": 3.1760013103485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090985, + "balance_loss_mlp": 1.06225514, + "epoch": 0.3274336283185841, + "flos": 831224208384.0, + "grad_norm": 0.07453437515979243, + "language_loss": 0.81870627, + "learning_rate": 0.0007854113568349787, + "loss": 0.82961613, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.28686523, + "step": 1702, + "time_per_iteration": 3.1038365364074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087868, + "balance_loss_mlp": 1.05770779, + "epoch": 0.3276260100038476, + "flos": 691721938944.0, + "grad_norm": 0.07528598974040544, + "language_loss": 0.80317354, + "learning_rate": 0.0007851555023633052, + "loss": 0.81405228, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.30102539, + "step": 1703, + "time_per_iteration": 2.847515106201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085228, + "balance_loss_mlp": 1.0558784, + "epoch": 0.3278183916891112, + "flos": 435831211008.0, + "grad_norm": 0.08040178147570827, + "language_loss": 0.82301831, + "learning_rate": 0.0007848995371844474, + "loss": 0.83387053, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.29296875, + "step": 1704, + "time_per_iteration": 2.5442426204681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098029, + "balance_loss_mlp": 1.06872725, + "epoch": 0.3280107733743748, + "flos": 460883119104.0, + "grad_norm": 0.06101842979524802, + "language_loss": 0.80441558, + "learning_rate": 0.0007846434613977801, + "loss": 0.81539583, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.29296875, + "step": 1705, + "time_per_iteration": 2.5023465156555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091561, + "balance_loss_mlp": 1.06242633, + "epoch": 0.3282031550596383, + "flos": 679018484736.0, + "grad_norm": 0.07007502801083235, + "language_loss": 0.78621399, + "learning_rate": 0.0007843872751027203, + "loss": 0.79712963, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.29125977, + "step": 1706, + "time_per_iteration": 2.790001392364502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094895, + "balance_loss_mlp": 1.06549811, + "epoch": 0.3283955367449019, + "flos": 544821023232.0, + "grad_norm": 0.05836443006497643, + "language_loss": 0.87259293, + "learning_rate": 0.0007841309783987287, + "loss": 0.88354194, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.29345703, + "step": 1707, + "time_per_iteration": 2.7478153705596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097713, + "balance_loss_mlp": 1.0684588, + "epoch": 0.32858791843016544, + "flos": 481017153024.0, + "grad_norm": 0.05888352709782848, + "language_loss": 0.89055538, + "learning_rate": 0.0007838745713853084, + "loss": 0.90153247, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.29199219, + "step": 1708, + "time_per_iteration": 2.588653802871704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088275, + "balance_loss_mlp": 1.05925906, + "epoch": 0.328780300115429, + "flos": 566529389568.0, + "grad_norm": 0.06397878577513526, + "language_loss": 0.8386358, + "learning_rate": 0.0007836180541620053, + "loss": 0.8495186, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.29003906, + "step": 1709, + "time_per_iteration": 2.7023067474365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091144, + "balance_loss_mlp": 1.06191421, + "epoch": 0.32897268180069256, + "flos": 475787948544.0, + "grad_norm": 0.05521592697878337, + "language_loss": 0.86435962, + "learning_rate": 0.0007833614268284082, + "loss": 0.87527102, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.29199219, + "step": 1710, + "time_per_iteration": 2.538080930709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090653, + "balance_loss_mlp": 1.0721513, + "epoch": 0.32916506348595614, + "flos": 1576517008896.0, + "grad_norm": 0.029520146980468998, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75200427, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.18457031, + "step": 1711, + "time_per_iteration": 4.909448862075806 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089192, + "balance_loss_mlp": 1.05965161, + "epoch": 0.3293574451712197, + "flos": 482646739968.0, + "grad_norm": 0.07803051984240059, + "language_loss": 0.78501904, + "learning_rate": 0.0007828478422289016, + "loss": 0.79591095, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.29492188, + "step": 1712, + "time_per_iteration": 2.5883195400238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092173, + "balance_loss_mlp": 1.06210816, + "epoch": 0.32954982685648326, + "flos": 622266872832.0, + "grad_norm": 0.05953292046858541, + "language_loss": 0.88987601, + "learning_rate": 0.0007825908851623833, + "loss": 0.90079772, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.30004883, + "step": 1713, + "time_per_iteration": 2.7441718578338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089127, + "balance_loss_mlp": 1.05973005, + "epoch": 0.32974220854174685, + "flos": 544697367552.0, + "grad_norm": 0.06609176393308323, + "language_loss": 0.8478905, + "learning_rate": 0.0007823338183843533, + "loss": 0.85878181, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.29394531, + "step": 1714, + "time_per_iteration": 2.6771602630615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092243, + "balance_loss_mlp": 1.06291747, + "epoch": 0.3299345902270104, + "flos": 981740708352.0, + "grad_norm": 0.10875146541446083, + "language_loss": 0.80569458, + "learning_rate": 0.0007820766419946141, + "loss": 0.81661701, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.29321289, + "step": 1715, + "time_per_iteration": 3.3068225383758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108798, + "balance_loss_mlp": 1.07052732, + "epoch": 0.33012697191227397, + "flos": 1402857432576.0, + "grad_norm": 0.03503617860008252, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80760461, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.17480469, + "step": 1716, + "time_per_iteration": 5.048320293426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091106, + "balance_loss_mlp": 1.06201911, + "epoch": 0.3303193535975375, + "flos": 504897781248.0, + "grad_norm": 0.06576145610663801, + "language_loss": 0.76379126, + "learning_rate": 0.0007815619607794288, + "loss": 0.77470231, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.29052734, + "step": 1717, + "time_per_iteration": 2.6151187419891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094733, + "balance_loss_mlp": 1.06440604, + "epoch": 0.3305117352828011, + "flos": 937602390528.0, + "grad_norm": 0.08930544150493325, + "language_loss": 0.82491159, + "learning_rate": 0.0007813044561538001, + "loss": 0.835859, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.30273438, + "step": 1718, + "time_per_iteration": 3.1329195499420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089209, + "balance_loss_mlp": 1.05928707, + "epoch": 0.3307041169680646, + "flos": 721176597504.0, + "grad_norm": 0.06440748712139703, + "language_loss": 0.88832355, + "learning_rate": 0.0007810468423160958, + "loss": 0.8992157, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.29882812, + "step": 1719, + "time_per_iteration": 2.8785343170166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091515, + "balance_loss_mlp": 1.06195092, + "epoch": 0.3308964986533282, + "flos": 583315499520.0, + "grad_norm": 0.05842798757545397, + "language_loss": 0.81825691, + "learning_rate": 0.0007807891193663306, + "loss": 0.82917207, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.29492188, + "step": 1720, + "time_per_iteration": 2.775949478149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108911, + "balance_loss_mlp": 1.05956948, + "epoch": 0.33108888033859174, + "flos": 473340672000.0, + "grad_norm": 0.1056737351826848, + "language_loss": 0.82154363, + "learning_rate": 0.0007805312874045614, + "loss": 0.83243477, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.29516602, + "step": 1721, + "time_per_iteration": 2.528573513031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089843, + "balance_loss_mlp": 1.06054103, + "epoch": 0.3312812620238553, + "flos": 385913659392.0, + "grad_norm": 0.06879892565652022, + "language_loss": 0.86894739, + "learning_rate": 0.0007802733465308874, + "loss": 0.87984586, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.29272461, + "step": 1722, + "time_per_iteration": 2.4575133323669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087083, + "balance_loss_mlp": 1.05811512, + "epoch": 0.3314736437091189, + "flos": 494292395520.0, + "grad_norm": 0.06801648197756033, + "language_loss": 0.84311831, + "learning_rate": 0.0007800152968454501, + "loss": 0.85398912, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.28930664, + "step": 1723, + "time_per_iteration": 2.729114294052124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091782, + "balance_loss_mlp": 1.06300533, + "epoch": 0.33166602539438245, + "flos": 653346736128.0, + "grad_norm": 0.049597969001903774, + "language_loss": 0.90648681, + "learning_rate": 0.0007797571384484334, + "loss": 0.91740465, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.28759766, + "step": 1724, + "time_per_iteration": 2.8813512325286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084172, + "balance_loss_mlp": 1.05463219, + "epoch": 0.33185840707964603, + "flos": 520550489088.0, + "grad_norm": 0.060917196813517045, + "language_loss": 0.91917408, + "learning_rate": 0.0007794988714400633, + "loss": 0.9300158, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.29516602, + "step": 1725, + "time_per_iteration": 2.6094837188720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088265, + "balance_loss_mlp": 1.05896294, + "epoch": 0.33205078876490957, + "flos": 436712919552.0, + "grad_norm": 0.06883363868640566, + "language_loss": 0.85331756, + "learning_rate": 0.0007792404959206079, + "loss": 0.86420023, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.29272461, + "step": 1726, + "time_per_iteration": 2.4982993602752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083027, + "balance_loss_mlp": 1.05396366, + "epoch": 0.33224317045017315, + "flos": 768400971264.0, + "grad_norm": 0.0595205364190525, + "language_loss": 0.81498575, + "learning_rate": 0.0007789820119903774, + "loss": 0.82581604, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.29052734, + "step": 1727, + "time_per_iteration": 2.9797775745391846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059791, + "balance_loss_mlp": 1.04043114, + "epoch": 0.3324355521354367, + "flos": 1465656090624.0, + "grad_norm": 0.028746370774938412, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79552454, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.19335938, + "step": 1728, + "time_per_iteration": 4.892562627792358 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090227, + "balance_loss_mlp": 1.05982828, + "epoch": 0.3326279338207003, + "flos": 496415195136.0, + "grad_norm": 0.10868743625457102, + "language_loss": 0.83712173, + "learning_rate": 0.0007784647192990428, + "loss": 0.84802401, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.3034668, + "step": 1729, + "time_per_iteration": 2.721163749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093021, + "balance_loss_mlp": 1.06283677, + "epoch": 0.33282031550596386, + "flos": 635600342016.0, + "grad_norm": 0.06834187729314575, + "language_loss": 0.80591226, + "learning_rate": 0.0007782059107387696, + "loss": 0.81684244, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.30151367, + "step": 1730, + "time_per_iteration": 2.8358583450317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097893, + "balance_loss_mlp": 1.06768548, + "epoch": 0.3330126971912274, + "flos": 689210643456.0, + "grad_norm": 0.06518025115488765, + "language_loss": 0.88646144, + "learning_rate": 0.0007779469941693826, + "loss": 0.89744031, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.30175781, + "step": 1731, + "time_per_iteration": 2.8069489002227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105874, + "balance_loss_mlp": 1.0744741, + "epoch": 0.333205078876491, + "flos": 566184563712.0, + "grad_norm": 0.0738487456517703, + "language_loss": 0.76712036, + "learning_rate": 0.0007776879696914029, + "loss": 0.77817911, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.3137207, + "step": 1732, + "time_per_iteration": 2.8068690299987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116621, + "balance_loss_mlp": 1.08479202, + "epoch": 0.3333974605617545, + "flos": 640618550784.0, + "grad_norm": 0.06155067702851775, + "language_loss": 0.88390094, + "learning_rate": 0.000777428837405392, + "loss": 0.89506716, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.31811523, + "step": 1733, + "time_per_iteration": 2.8412673473358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107208, + "balance_loss_mlp": 1.07530773, + "epoch": 0.3335898422470181, + "flos": 461597501952.0, + "grad_norm": 0.0682339524169846, + "language_loss": 0.86804128, + "learning_rate": 0.0007771695974119544, + "loss": 0.87911332, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.31884766, + "step": 1734, + "time_per_iteration": 2.512354612350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103901, + "balance_loss_mlp": 1.07159579, + "epoch": 0.33378222393228163, + "flos": 852504791040.0, + "grad_norm": 0.0845052703087739, + "language_loss": 0.75201118, + "learning_rate": 0.0007769102498117359, + "loss": 0.7630502, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.32299805, + "step": 1735, + "time_per_iteration": 3.107100248336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090989, + "balance_loss_mlp": 1.05777764, + "epoch": 0.3339746056175452, + "flos": 954256080384.0, + "grad_norm": 0.061332510780765306, + "language_loss": 0.79977, + "learning_rate": 0.000776650794705424, + "loss": 0.81067985, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.33227539, + "step": 1736, + "time_per_iteration": 3.259875535964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092848, + "balance_loss_mlp": 1.06116199, + "epoch": 0.33416698730280875, + "flos": 544559155200.0, + "grad_norm": 0.05236613872795896, + "language_loss": 0.82229674, + "learning_rate": 0.0007763912321937483, + "loss": 0.83322519, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.31665039, + "step": 1737, + "time_per_iteration": 2.704059600830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088373, + "balance_loss_mlp": 1.05506587, + "epoch": 0.33435936898807234, + "flos": 1013652817920.0, + "grad_norm": 0.07890071498287932, + "language_loss": 0.82297349, + "learning_rate": 0.0007761315623774799, + "loss": 0.83385718, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.33325195, + "step": 1738, + "time_per_iteration": 3.399148464202881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089424, + "balance_loss_mlp": 1.0574522, + "epoch": 0.3345517506733359, + "flos": 614935217664.0, + "grad_norm": 0.09967891290955513, + "language_loss": 0.87632757, + "learning_rate": 0.0007758717853574313, + "loss": 0.88722181, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.31958008, + "step": 1739, + "time_per_iteration": 2.772089958190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103829, + "balance_loss_mlp": 1.0729773, + "epoch": 0.33474413235859946, + "flos": 494350622208.0, + "grad_norm": 0.06672668023604937, + "language_loss": 0.90074134, + "learning_rate": 0.0007756119012344571, + "loss": 0.91177964, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.30810547, + "step": 1740, + "time_per_iteration": 2.5482232570648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108279, + "balance_loss_mlp": 1.07707, + "epoch": 0.33493651404386304, + "flos": 628105743360.0, + "grad_norm": 0.07840140242610649, + "language_loss": 0.84438574, + "learning_rate": 0.0007753519101094535, + "loss": 0.85546857, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.31176758, + "step": 1741, + "time_per_iteration": 2.749004602432251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102131, + "balance_loss_mlp": 1.07173228, + "epoch": 0.3351288957291266, + "flos": 513474909696.0, + "grad_norm": 0.07002932741488781, + "language_loss": 0.86241812, + "learning_rate": 0.0007750918120833575, + "loss": 0.87343943, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.3034668, + "step": 1742, + "time_per_iteration": 2.600731611251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110577, + "balance_loss_mlp": 1.0753479, + "epoch": 0.33532127741439016, + "flos": 647008860672.0, + "grad_norm": 0.07258867640739639, + "language_loss": 0.87368989, + "learning_rate": 0.0007748316072571485, + "loss": 0.88474762, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.30395508, + "step": 1743, + "time_per_iteration": 2.7698371410369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109732, + "balance_loss_mlp": 1.07902408, + "epoch": 0.3355136590996537, + "flos": 768134721024.0, + "grad_norm": 0.05763877458348602, + "language_loss": 0.79041934, + "learning_rate": 0.0007745712957318467, + "loss": 0.80151671, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.30664062, + "step": 1744, + "time_per_iteration": 2.967310667037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104284, + "balance_loss_mlp": 1.07412386, + "epoch": 0.3357060407849173, + "flos": 595259490816.0, + "grad_norm": 0.052786515694630796, + "language_loss": 0.86410165, + "learning_rate": 0.0007743108776085141, + "loss": 0.87514448, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.30102539, + "step": 1745, + "time_per_iteration": 2.771803855895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102511, + "balance_loss_mlp": 1.07049131, + "epoch": 0.3358984224701808, + "flos": 598288730112.0, + "grad_norm": 0.06089020802257528, + "language_loss": 0.82798052, + "learning_rate": 0.0007740503529882543, + "loss": 0.83900565, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.32006836, + "step": 1746, + "time_per_iteration": 2.805392026901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095402, + "balance_loss_mlp": 1.064551, + "epoch": 0.3360908041554444, + "flos": 578055771648.0, + "grad_norm": 0.0569869068698716, + "language_loss": 0.90718448, + "learning_rate": 0.0007737897219722114, + "loss": 0.9181385, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.30810547, + "step": 1747, + "time_per_iteration": 2.699065923690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090863, + "balance_loss_mlp": 1.05970204, + "epoch": 0.336283185840708, + "flos": 513332315136.0, + "grad_norm": 0.07943976371979472, + "language_loss": 0.80688596, + "learning_rate": 0.0007735289846615716, + "loss": 0.81779456, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.31152344, + "step": 1748, + "time_per_iteration": 2.6637260913848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094297, + "balance_loss_mlp": 1.06356478, + "epoch": 0.3364755675259715, + "flos": 524716102656.0, + "grad_norm": 0.06884386609789231, + "language_loss": 0.81979561, + "learning_rate": 0.0007732681411575621, + "loss": 0.83073854, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.30712891, + "step": 1749, + "time_per_iteration": 2.673060417175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087083, + "balance_loss_mlp": 1.0555166, + "epoch": 0.3366679492112351, + "flos": 554594162688.0, + "grad_norm": 0.052237930998467595, + "language_loss": 0.87234819, + "learning_rate": 0.0007730071915614514, + "loss": 0.88321906, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.31542969, + "step": 1750, + "time_per_iteration": 2.707857370376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089836, + "balance_loss_mlp": 1.05896115, + "epoch": 0.33686033089649864, + "flos": 427051851264.0, + "grad_norm": 0.08336153438972979, + "language_loss": 0.88963622, + "learning_rate": 0.0007727461359745489, + "loss": 0.90053463, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.30859375, + "step": 1751, + "time_per_iteration": 2.482837438583374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093668, + "balance_loss_mlp": 1.06307864, + "epoch": 0.3370527125817622, + "flos": 541452750336.0, + "grad_norm": 0.05330176149069141, + "language_loss": 0.86016554, + "learning_rate": 0.0007724849744982056, + "loss": 0.87110221, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.30541992, + "step": 1752, + "time_per_iteration": 2.690420389175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097033, + "balance_loss_mlp": 1.06668198, + "epoch": 0.33724509426702576, + "flos": 541836864000.0, + "grad_norm": 0.0643678921459399, + "language_loss": 0.81981385, + "learning_rate": 0.0007722237072338131, + "loss": 0.8307842, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.30322266, + "step": 1753, + "time_per_iteration": 2.7154347896575928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097395, + "balance_loss_mlp": 1.06694901, + "epoch": 0.33743747595228935, + "flos": 472557888000.0, + "grad_norm": 0.07107791288081117, + "language_loss": 0.85213387, + "learning_rate": 0.0007719623342828046, + "loss": 0.8631078, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.30419922, + "step": 1754, + "time_per_iteration": 2.5009355545043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109586, + "balance_loss_mlp": 1.06426978, + "epoch": 0.33762985763755293, + "flos": 469564964352.0, + "grad_norm": 0.06326183968549627, + "language_loss": 0.84134084, + "learning_rate": 0.000771700855746654, + "loss": 0.85229945, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.31567383, + "step": 1755, + "time_per_iteration": 2.63323712348938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082281, + "balance_loss_mlp": 1.05071473, + "epoch": 0.33782223932281646, + "flos": 492002270208.0, + "grad_norm": 0.06130822269954804, + "language_loss": 0.88395244, + "learning_rate": 0.0007714392717268763, + "loss": 0.89477527, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.31542969, + "step": 1756, + "time_per_iteration": 2.6147336959838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083496, + "balance_loss_mlp": 1.05219221, + "epoch": 0.33801462100808005, + "flos": 464827562496.0, + "grad_norm": 0.05731341996908033, + "language_loss": 0.86388242, + "learning_rate": 0.0007711775823250273, + "loss": 0.87471741, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.31298828, + "step": 1757, + "time_per_iteration": 2.5304934978485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085861, + "balance_loss_mlp": 1.05455685, + "epoch": 0.3382070026933436, + "flos": 795319603200.0, + "grad_norm": 0.061357664780502266, + "language_loss": 0.83481395, + "learning_rate": 0.0007709157876427039, + "loss": 0.84567261, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.31274414, + "step": 1758, + "time_per_iteration": 3.1116981506347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074204, + "balance_loss_mlp": 1.04189849, + "epoch": 0.33839938437860717, + "flos": 508181686272.0, + "grad_norm": 0.0592835704233285, + "language_loss": 0.85574573, + "learning_rate": 0.0007706538877815439, + "loss": 0.86648774, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.32299805, + "step": 1759, + "time_per_iteration": 2.635298728942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077747, + "balance_loss_mlp": 1.04730105, + "epoch": 0.3385917660638707, + "flos": 483986755584.0, + "grad_norm": 0.04672826561746397, + "language_loss": 0.83449262, + "learning_rate": 0.0007703918828432259, + "loss": 0.84527004, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.30419922, + "step": 1760, + "time_per_iteration": 2.664783477783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071091, + "balance_loss_mlp": 1.04023945, + "epoch": 0.3387841477491343, + "flos": 545071306752.0, + "grad_norm": 0.061026274734732225, + "language_loss": 0.88914752, + "learning_rate": 0.000770129772929469, + "loss": 0.89985847, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.30810547, + "step": 1761, + "time_per_iteration": 2.7082738876342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070304, + "balance_loss_mlp": 1.03914273, + "epoch": 0.3389765294343978, + "flos": 719487373824.0, + "grad_norm": 0.058866792995701266, + "language_loss": 0.88234216, + "learning_rate": 0.0007698675581420334, + "loss": 0.89304519, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.3112793, + "step": 1762, + "time_per_iteration": 2.9119746685028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107099, + "balance_loss_mlp": 1.03966177, + "epoch": 0.3391689111196614, + "flos": 699596269056.0, + "grad_norm": 0.06738514708484569, + "language_loss": 0.78819811, + "learning_rate": 0.0007696052385827199, + "loss": 0.79890805, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.31298828, + "step": 1763, + "time_per_iteration": 2.9451980590820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107403, + "balance_loss_mlp": 1.04172421, + "epoch": 0.339361292804925, + "flos": 626806425600.0, + "grad_norm": 0.0719800357998311, + "language_loss": 0.78192145, + "learning_rate": 0.00076934281435337, + "loss": 0.79266179, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.32299805, + "step": 1764, + "time_per_iteration": 2.8267600536346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071904, + "balance_loss_mlp": 1.03931201, + "epoch": 0.33955367449018853, + "flos": 609302960640.0, + "grad_norm": 0.06414673033674093, + "language_loss": 0.85701221, + "learning_rate": 0.0007690802855558658, + "loss": 0.86773127, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.32592773, + "step": 1765, + "time_per_iteration": 2.8825321197509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060485, + "balance_loss_mlp": 1.04322386, + "epoch": 0.3397460561754521, + "flos": 1452494177280.0, + "grad_norm": 0.027152559638010845, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.7743544, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.17285156, + "step": 1766, + "time_per_iteration": 4.890359401702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078911, + "balance_loss_mlp": 1.04684353, + "epoch": 0.33993843786071565, + "flos": 487068429312.0, + "grad_norm": 0.06170687350837257, + "language_loss": 0.89089799, + "learning_rate": 0.0007685549146641262, + "loss": 0.90168703, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.32055664, + "step": 1767, + "time_per_iteration": 2.539238691329956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077091, + "balance_loss_mlp": 1.04557216, + "epoch": 0.34013081954597923, + "flos": 417115768320.0, + "grad_norm": 0.05571629344022593, + "language_loss": 0.8822673, + "learning_rate": 0.0007682920727738579, + "loss": 0.89303821, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.31494141, + "step": 1768, + "time_per_iteration": 2.512801170349121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080619, + "balance_loss_mlp": 1.04931498, + "epoch": 0.34032320123124277, + "flos": 437293472256.0, + "grad_norm": 0.06175400371418068, + "language_loss": 0.8474735, + "learning_rate": 0.000768029126723369, + "loss": 0.85827971, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.31274414, + "step": 1769, + "time_per_iteration": 2.5238869190216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075433, + "balance_loss_mlp": 1.04515338, + "epoch": 0.34051558291650635, + "flos": 457353312768.0, + "grad_norm": 0.06596681609056877, + "language_loss": 0.81544566, + "learning_rate": 0.0007677660766147447, + "loss": 0.82620001, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.30224609, + "step": 1770, + "time_per_iteration": 2.5212690830230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037514, + "balance_loss_mlp": 1.02063394, + "epoch": 0.3407079646017699, + "flos": 1558029938688.0, + "grad_norm": 0.014856007486746849, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73508459, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.16894531, + "step": 1771, + "time_per_iteration": 4.967731475830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081508, + "balance_loss_mlp": 1.05113387, + "epoch": 0.3409003462870335, + "flos": 492312190464.0, + "grad_norm": 0.075322249241395, + "language_loss": 0.79792535, + "learning_rate": 0.0007672396646316306, + "loss": 0.8087405, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.30322266, + "step": 1772, + "time_per_iteration": 2.524365186691284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084918, + "balance_loss_mlp": 1.05451918, + "epoch": 0.34109272797229706, + "flos": 808145303040.0, + "grad_norm": 0.05910937608565349, + "language_loss": 0.80291271, + "learning_rate": 0.000766976302961512, + "loss": 0.81376183, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.30371094, + "step": 1773, + "time_per_iteration": 3.002929925918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086798, + "balance_loss_mlp": 1.0563519, + "epoch": 0.3412851096575606, + "flos": 469903997952.0, + "grad_norm": 0.0625889066862488, + "language_loss": 0.81081951, + "learning_rate": 0.0007667128376420003, + "loss": 0.82168746, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.30395508, + "step": 1774, + "time_per_iteration": 2.5821964740753174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083599, + "balance_loss_mlp": 1.05336761, + "epoch": 0.3414774913428242, + "flos": 595402085376.0, + "grad_norm": 0.06267075227744807, + "language_loss": 0.84329379, + "learning_rate": 0.0007664492687753817, + "loss": 0.85412979, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.30175781, + "step": 1775, + "time_per_iteration": 2.7457377910614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077211, + "balance_loss_mlp": 1.04769528, + "epoch": 0.3416698730280877, + "flos": 527202667008.0, + "grad_norm": 0.054581176728495925, + "language_loss": 0.81518859, + "learning_rate": 0.000766185596463983, + "loss": 0.8259607, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.29516602, + "step": 1776, + "time_per_iteration": 2.655543804168701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078997, + "balance_loss_mlp": 1.04993343, + "epoch": 0.3418622547133513, + "flos": 874272794112.0, + "grad_norm": 0.06969464274274284, + "language_loss": 0.76725864, + "learning_rate": 0.0007659218208101706, + "loss": 0.77804863, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.29003906, + "step": 1777, + "time_per_iteration": 3.1378567218780518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093158, + "balance_loss_mlp": 1.06411862, + "epoch": 0.34205463639861483, + "flos": 603462680064.0, + "grad_norm": 0.0529989301900612, + "language_loss": 0.84699291, + "learning_rate": 0.0007656579419163515, + "loss": 0.85792446, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.29052734, + "step": 1778, + "time_per_iteration": 2.8120994567871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091459, + "balance_loss_mlp": 1.06239629, + "epoch": 0.3422470180838784, + "flos": 463547183616.0, + "grad_norm": 0.06282493199141514, + "language_loss": 0.76994503, + "learning_rate": 0.0007653939598849724, + "loss": 0.78085959, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.2902832, + "step": 1779, + "time_per_iteration": 2.5995492935180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087203, + "balance_loss_mlp": 1.07051396, + "epoch": 0.34243939976914195, + "flos": 1585584377856.0, + "grad_norm": 0.04507156484415478, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83967406, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.16699219, + "step": 1780, + "time_per_iteration": 4.9175097942352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102047, + "balance_loss_mlp": 1.07186341, + "epoch": 0.34263178145440554, + "flos": 872662146048.0, + "grad_norm": 0.05745476314946865, + "language_loss": 0.79740059, + "learning_rate": 0.000764865686819522, + "loss": 0.80842102, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.30151367, + "step": 1781, + "time_per_iteration": 3.1022064685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097852, + "balance_loss_mlp": 1.06907511, + "epoch": 0.3428241631396691, + "flos": 506630674944.0, + "grad_norm": 0.061017866945560745, + "language_loss": 0.85627258, + "learning_rate": 0.0007646013959905449, + "loss": 0.8672511, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.28759766, + "step": 1782, + "time_per_iteration": 2.625312566757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090603, + "balance_loss_mlp": 1.06030035, + "epoch": 0.34301654482493266, + "flos": 879669324288.0, + "grad_norm": 0.05493462983431466, + "language_loss": 0.80768538, + "learning_rate": 0.0007643370024341949, + "loss": 0.81859136, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.30249023, + "step": 1783, + "time_per_iteration": 3.1206953525543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092054, + "balance_loss_mlp": 1.06284761, + "epoch": 0.34320892651019624, + "flos": 431537559552.0, + "grad_norm": 0.04934338548004703, + "language_loss": 0.8289808, + "learning_rate": 0.0007640725062531195, + "loss": 0.83990133, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.29174805, + "step": 1784, + "time_per_iteration": 2.518277645111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092006, + "balance_loss_mlp": 1.06165504, + "epoch": 0.3434013081954598, + "flos": 463404589056.0, + "grad_norm": 0.061838155255473454, + "language_loss": 0.8616311, + "learning_rate": 0.0007638079075500047, + "loss": 0.8725512, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.30297852, + "step": 1785, + "time_per_iteration": 2.566340684890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056366, + "balance_loss_mlp": 1.04101145, + "epoch": 0.34359368988072336, + "flos": 1556499276288.0, + "grad_norm": 0.03141321768780463, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76237035, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.15332031, + "step": 1786, + "time_per_iteration": 4.984891891479492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081906, + "balance_loss_mlp": 1.05088782, + "epoch": 0.3437860715659869, + "flos": 495267236352.0, + "grad_norm": 0.0502662811310507, + "language_loss": 0.83153242, + "learning_rate": 0.0007632784029886026, + "loss": 0.84235144, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.30981445, + "step": 1787, + "time_per_iteration": 2.6574935913085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078963, + "balance_loss_mlp": 1.04832625, + "epoch": 0.3439784532512505, + "flos": 717942154752.0, + "grad_norm": 0.058652751735253, + "language_loss": 0.85391539, + "learning_rate": 0.0007630134973358873, + "loss": 0.86470503, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.3059082, + "step": 1788, + "time_per_iteration": 2.920311450958252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088088, + "balance_loss_mlp": 1.05702209, + "epoch": 0.34417083493651407, + "flos": 565598218752.0, + "grad_norm": 0.05633660644162356, + "language_loss": 0.86888337, + "learning_rate": 0.0007627484895722763, + "loss": 0.87976426, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.31030273, + "step": 1789, + "time_per_iteration": 2.648061513900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082316, + "balance_loss_mlp": 1.05268025, + "epoch": 0.3443632166217776, + "flos": 795988905984.0, + "grad_norm": 0.08125120447961011, + "language_loss": 0.79987907, + "learning_rate": 0.0007624833798006552, + "loss": 0.8107022, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.29614258, + "step": 1790, + "time_per_iteration": 3.083303689956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082489, + "balance_loss_mlp": 1.05249596, + "epoch": 0.3445555983070412, + "flos": 569045067264.0, + "grad_norm": 0.06337905919609309, + "language_loss": 0.83924425, + "learning_rate": 0.0007622181681239483, + "loss": 0.85006905, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.29931641, + "step": 1791, + "time_per_iteration": 2.6613235473632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078031, + "balance_loss_mlp": 1.04677427, + "epoch": 0.3447479799923047, + "flos": 568524151296.0, + "grad_norm": 0.05139164694864183, + "language_loss": 0.84563744, + "learning_rate": 0.0007619528546451202, + "loss": 0.85641772, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.31225586, + "step": 1792, + "time_per_iteration": 2.7847092151641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082474, + "balance_loss_mlp": 1.05183685, + "epoch": 0.3449403616775683, + "flos": 967323299328.0, + "grad_norm": 0.060391852587241154, + "language_loss": 0.8357141, + "learning_rate": 0.0007616874394671745, + "loss": 0.84653878, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.3059082, + "step": 1793, + "time_per_iteration": 3.3427343368530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087726, + "balance_loss_mlp": 1.05632687, + "epoch": 0.34513274336283184, + "flos": 568340858880.0, + "grad_norm": 0.07229882199780847, + "language_loss": 0.85033429, + "learning_rate": 0.0007614219226931547, + "loss": 0.86121154, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.3137207, + "step": 1794, + "time_per_iteration": 2.6797611713409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090025, + "balance_loss_mlp": 1.05931664, + "epoch": 0.3453251250480954, + "flos": 460715793408.0, + "grad_norm": 0.057715322830613675, + "language_loss": 0.84206641, + "learning_rate": 0.0007611563044261435, + "loss": 0.85296667, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.30664062, + "step": 1795, + "time_per_iteration": 2.5259828567504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086711, + "balance_loss_mlp": 1.05543017, + "epoch": 0.34551750673335896, + "flos": 415397431296.0, + "grad_norm": 0.06328741897936851, + "language_loss": 0.86560625, + "learning_rate": 0.0007608905847692631, + "loss": 0.87647337, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.3125, + "step": 1796, + "time_per_iteration": 2.472182035446167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081946, + "balance_loss_mlp": 1.05014098, + "epoch": 0.34570988841862255, + "flos": 587540749824.0, + "grad_norm": 0.053847624873276365, + "language_loss": 0.86582637, + "learning_rate": 0.0007606247638256749, + "loss": 0.8766458, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.31787109, + "step": 1797, + "time_per_iteration": 2.842547655105591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147955, + "balance_loss_mlp": 1.13145602, + "epoch": 0.34590227010388613, + "flos": 1566835439616.0, + "grad_norm": 0.06482996241123744, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79318249, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.16503906, + "step": 1798, + "time_per_iteration": 4.918993949890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075567, + "balance_loss_mlp": 1.06011796, + "epoch": 0.34609465178914967, + "flos": 1536950177280.0, + "grad_norm": 0.04230684388330953, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80402768, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.15429688, + "step": 1799, + "time_per_iteration": 4.791706323623657 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077617, + "balance_loss_mlp": 1.04724216, + "epoch": 0.34628703347441325, + "flos": 609075998208.0, + "grad_norm": 0.06124115711212235, + "language_loss": 0.85762143, + "learning_rate": 0.0007598266943068686, + "loss": 0.86839759, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.30322266, + "step": 1800, + "time_per_iteration": 2.743213415145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083873, + "balance_loss_mlp": 1.05266404, + "epoch": 0.3464794151596768, + "flos": 473084596224.0, + "grad_norm": 0.13184352245004016, + "language_loss": 0.83900499, + "learning_rate": 0.0007595604692488507, + "loss": 0.84984374, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.31176758, + "step": 1801, + "time_per_iteration": 2.542065382003784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082571, + "balance_loss_mlp": 1.05105186, + "epoch": 0.34667179684494037, + "flos": 605397805056.0, + "grad_norm": 0.0617697315453188, + "language_loss": 0.82875979, + "learning_rate": 0.0007592941434205215, + "loss": 0.83958554, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.31494141, + "step": 1802, + "time_per_iteration": 2.803941488265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077417, + "balance_loss_mlp": 1.06292093, + "epoch": 0.3468641785302039, + "flos": 1564053511680.0, + "grad_norm": 0.03209988868756776, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74648476, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.14453125, + "step": 1803, + "time_per_iteration": 5.115894794464111 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073735, + "balance_loss_mlp": 1.04176331, + "epoch": 0.3470565602154675, + "flos": 906902258688.0, + "grad_norm": 0.057797440709038125, + "language_loss": 0.7980904, + "learning_rate": 0.0007587611898665566, + "loss": 0.80882776, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.31958008, + "step": 1804, + "time_per_iteration": 3.0783464908599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080437, + "balance_loss_mlp": 1.04958522, + "epoch": 0.347248941900731, + "flos": 638613614592.0, + "grad_norm": 0.052922401600576395, + "language_loss": 0.8228178, + "learning_rate": 0.0007584945623478315, + "loss": 0.83362216, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.30810547, + "step": 1805, + "time_per_iteration": 2.8341996669769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107388, + "balance_loss_mlp": 1.04178858, + "epoch": 0.3474413235859946, + "flos": 847009336320.0, + "grad_norm": 0.05986711270473425, + "language_loss": 0.81165981, + "learning_rate": 0.000758227834472617, + "loss": 0.82239866, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.32080078, + "step": 1806, + "time_per_iteration": 3.0486085414886475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082567, + "balance_loss_mlp": 1.04971278, + "epoch": 0.3476337052712582, + "flos": 515395478016.0, + "grad_norm": 0.06433807190471491, + "language_loss": 0.77163357, + "learning_rate": 0.0007579610063444664, + "loss": 0.78245926, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.32861328, + "step": 1807, + "time_per_iteration": 2.7597365379333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073013, + "balance_loss_mlp": 1.04068375, + "epoch": 0.34782608695652173, + "flos": 913161558528.0, + "grad_norm": 0.06573509148212295, + "language_loss": 0.8740322, + "learning_rate": 0.0007576940780669712, + "loss": 0.88476229, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.32324219, + "step": 1808, + "time_per_iteration": 3.2193737030029297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073839, + "balance_loss_mlp": 1.04060304, + "epoch": 0.3480184686417853, + "flos": 773374099968.0, + "grad_norm": 0.07068655640298144, + "language_loss": 0.84018815, + "learning_rate": 0.0007574270497437624, + "loss": 0.85092652, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.33251953, + "step": 1809, + "time_per_iteration": 2.958071708679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074122, + "balance_loss_mlp": 1.04255509, + "epoch": 0.34821085032704885, + "flos": 576549840384.0, + "grad_norm": 0.05267537563651592, + "language_loss": 0.88190216, + "learning_rate": 0.000757159921478509, + "loss": 0.89264333, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.31542969, + "step": 1810, + "time_per_iteration": 2.743820905685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011251, + "balance_loss_mlp": 1.10993648, + "epoch": 0.34840323201231244, + "flos": 1524176911872.0, + "grad_norm": 0.032772528197798495, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75575733, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.15136719, + "step": 1811, + "time_per_iteration": 4.734825372695923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077463, + "balance_loss_mlp": 1.04713607, + "epoch": 0.34859561369757597, + "flos": 508910625792.0, + "grad_norm": 0.06138203683055377, + "language_loss": 0.87334222, + "learning_rate": 0.0007566253655367423, + "loss": 0.88411689, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.30273438, + "step": 1812, + "time_per_iteration": 2.5963358879089355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081319, + "balance_loss_mlp": 1.04946637, + "epoch": 0.34878799538283956, + "flos": 548390117376.0, + "grad_norm": 0.05073723218815133, + "language_loss": 0.89626348, + "learning_rate": 0.000756357938067762, + "loss": 0.90707672, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.31835938, + "step": 1813, + "time_per_iteration": 2.6791560649871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088512, + "balance_loss_mlp": 1.05615854, + "epoch": 0.34898037706810314, + "flos": 983251021824.0, + "grad_norm": 0.07107132576327291, + "language_loss": 0.82739902, + "learning_rate": 0.0007560904110718033, + "loss": 0.83828408, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.32324219, + "step": 1814, + "time_per_iteration": 3.251187801361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084533, + "balance_loss_mlp": 1.05244136, + "epoch": 0.3491727587533667, + "flos": 681298435584.0, + "grad_norm": 0.056660731031110724, + "language_loss": 0.83390886, + "learning_rate": 0.0007558227846527297, + "loss": 0.84475422, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.32080078, + "step": 1815, + "time_per_iteration": 2.852786064147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086181, + "balance_loss_mlp": 1.05358887, + "epoch": 0.34936514043863026, + "flos": 393811310592.0, + "grad_norm": 0.06752757018776132, + "language_loss": 0.83192128, + "learning_rate": 0.0007555550589144429, + "loss": 0.84278309, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.32592773, + "step": 1816, + "time_per_iteration": 2.4226694107055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108673, + "balance_loss_mlp": 1.05568814, + "epoch": 0.3495575221238938, + "flos": 461120256000.0, + "grad_norm": 0.05637535729014081, + "language_loss": 0.84440207, + "learning_rate": 0.000755287233960883, + "loss": 0.85526937, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.31005859, + "step": 1817, + "time_per_iteration": 2.556528329849243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081988, + "balance_loss_mlp": 1.04963493, + "epoch": 0.3497499038091574, + "flos": 723859600896.0, + "grad_norm": 0.06861190177202381, + "language_loss": 0.77555025, + "learning_rate": 0.0007550193098960292, + "loss": 0.7863701, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.32348633, + "step": 1818, + "time_per_iteration": 2.9168636798858643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081065, + "balance_loss_mlp": 1.04902124, + "epoch": 0.3499422854944209, + "flos": 827364132864.0, + "grad_norm": 0.04890635253674866, + "language_loss": 0.85897982, + "learning_rate": 0.0007547512868238988, + "loss": 0.86979043, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.3203125, + "step": 1819, + "time_per_iteration": 3.147949695587158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086999, + "balance_loss_mlp": 1.05583739, + "epoch": 0.3501346671796845, + "flos": 493214247936.0, + "grad_norm": 0.07359678742691168, + "language_loss": 0.83527619, + "learning_rate": 0.0007544831648485473, + "loss": 0.84614623, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.3112793, + "step": 1820, + "time_per_iteration": 2.683906078338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084002, + "balance_loss_mlp": 1.05272126, + "epoch": 0.35032704886494803, + "flos": 578479173120.0, + "grad_norm": 0.07119738396785501, + "language_loss": 0.81087327, + "learning_rate": 0.0007542149440740694, + "loss": 0.82171333, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.3125, + "step": 1821, + "time_per_iteration": 2.738029718399048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107983, + "balance_loss_mlp": 1.04850197, + "epoch": 0.3505194305502116, + "flos": 584383472640.0, + "grad_norm": 0.07229829340096756, + "language_loss": 0.8569001, + "learning_rate": 0.000753946624604597, + "loss": 0.86769843, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.31298828, + "step": 1822, + "time_per_iteration": 2.7263731956481934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079169, + "balance_loss_mlp": 1.04795969, + "epoch": 0.3507118122354752, + "flos": 526705072128.0, + "grad_norm": 0.05660966900473529, + "language_loss": 0.87968546, + "learning_rate": 0.0007536782065443015, + "loss": 0.89047718, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.31176758, + "step": 1823, + "time_per_iteration": 2.64158034324646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108426, + "balance_loss_mlp": 1.05386138, + "epoch": 0.35090419392073874, + "flos": 511269152256.0, + "grad_norm": 0.06227259781784348, + "language_loss": 0.74483079, + "learning_rate": 0.0007534096899973919, + "loss": 0.75567335, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.3034668, + "step": 1824, + "time_per_iteration": 2.609548807144165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079276, + "balance_loss_mlp": 1.04804349, + "epoch": 0.3510965756060023, + "flos": 563728522752.0, + "grad_norm": 0.05520550621954613, + "language_loss": 0.82636261, + "learning_rate": 0.0007531410750681154, + "loss": 0.83715534, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.31201172, + "step": 1825, + "time_per_iteration": 2.7306325435638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094474, + "balance_loss_mlp": 1.06352782, + "epoch": 0.35128895729126586, + "flos": 1020107146752.0, + "grad_norm": 0.04890512262044313, + "language_loss": 0.86351258, + "learning_rate": 0.0007528723618607575, + "loss": 0.8744573, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.30908203, + "step": 1826, + "time_per_iteration": 3.4343338012695312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088582, + "balance_loss_mlp": 1.05782557, + "epoch": 0.35148133897652944, + "flos": 587972915712.0, + "grad_norm": 0.05382597898667073, + "language_loss": 0.82364488, + "learning_rate": 0.0007526035504796422, + "loss": 0.83453071, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.30737305, + "step": 1827, + "time_per_iteration": 2.7783889770507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088781, + "balance_loss_mlp": 1.05721426, + "epoch": 0.351673720661793, + "flos": 495054830592.0, + "grad_norm": 0.07196751046410012, + "language_loss": 0.86701363, + "learning_rate": 0.0007523346410291312, + "loss": 0.87790149, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.31542969, + "step": 1828, + "time_per_iteration": 2.7925262451171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096578, + "balance_loss_mlp": 1.06434393, + "epoch": 0.35186610234705656, + "flos": 762339520512.0, + "grad_norm": 0.05953464089235074, + "language_loss": 0.84491026, + "learning_rate": 0.0007520656336136245, + "loss": 0.85587609, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.32226562, + "step": 1829, + "time_per_iteration": 2.9498770236968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095972, + "balance_loss_mlp": 1.0648104, + "epoch": 0.3520584840323201, + "flos": 625822820352.0, + "grad_norm": 0.05500553487662277, + "language_loss": 0.87983966, + "learning_rate": 0.0007517965283375599, + "loss": 0.89079928, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.3112793, + "step": 1830, + "time_per_iteration": 2.838120698928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097926, + "balance_loss_mlp": 1.06566763, + "epoch": 0.3522508657175837, + "flos": 537124193280.0, + "grad_norm": 0.053691241766720514, + "language_loss": 0.89336729, + "learning_rate": 0.0007515273253054132, + "loss": 0.90434659, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.32250977, + "step": 1831, + "time_per_iteration": 2.6600866317749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092956, + "balance_loss_mlp": 1.06191444, + "epoch": 0.35244324740284727, + "flos": 567105560064.0, + "grad_norm": 0.05928754583625919, + "language_loss": 0.82674569, + "learning_rate": 0.0007512580246216988, + "loss": 0.83767527, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.31005859, + "step": 1832, + "time_per_iteration": 2.7806639671325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089641, + "balance_loss_mlp": 1.05752611, + "epoch": 0.3526356290881108, + "flos": 512809989120.0, + "grad_norm": 0.0631616677310412, + "language_loss": 0.84810489, + "learning_rate": 0.000750988626390968, + "loss": 0.85900134, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.32104492, + "step": 1833, + "time_per_iteration": 2.5985615253448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087885, + "balance_loss_mlp": 1.0560801, + "epoch": 0.3528280107733744, + "flos": 595496627712.0, + "grad_norm": 0.053730319302775706, + "language_loss": 0.84857321, + "learning_rate": 0.0007507191307178108, + "loss": 0.85945207, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.31787109, + "step": 1834, + "time_per_iteration": 2.822472095489502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088823, + "balance_loss_mlp": 1.05785227, + "epoch": 0.3530203924586379, + "flos": 550969814016.0, + "grad_norm": 0.07238185360826516, + "language_loss": 0.74172056, + "learning_rate": 0.0007504495377068543, + "loss": 0.75260878, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.30932617, + "step": 1835, + "time_per_iteration": 2.758622884750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094119, + "balance_loss_mlp": 1.06250441, + "epoch": 0.3532127741439015, + "flos": 652662876672.0, + "grad_norm": 0.06860617015764896, + "language_loss": 0.81217551, + "learning_rate": 0.0007501798474627642, + "loss": 0.82311678, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.31591797, + "step": 1836, + "time_per_iteration": 2.932610034942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095464, + "balance_loss_mlp": 1.06568563, + "epoch": 0.35340515582916504, + "flos": 722452594176.0, + "grad_norm": 0.06442397939494823, + "language_loss": 0.83527768, + "learning_rate": 0.0007499100600902433, + "loss": 0.8462323, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.29736328, + "step": 1837, + "time_per_iteration": 3.0089991092681885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089306, + "balance_loss_mlp": 1.05845428, + "epoch": 0.35359753751442863, + "flos": 594619301376.0, + "grad_norm": 0.06893251529793973, + "language_loss": 0.83798671, + "learning_rate": 0.0007496401756940324, + "loss": 0.84887969, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.30810547, + "step": 1838, + "time_per_iteration": 2.6746418476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090814, + "balance_loss_mlp": 1.06029606, + "epoch": 0.3537899191996922, + "flos": 632384838144.0, + "grad_norm": 0.06403380726847299, + "language_loss": 0.82561135, + "learning_rate": 0.0007493701943789098, + "loss": 0.83651948, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.3046875, + "step": 1839, + "time_per_iteration": 2.7678062915802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092399, + "balance_loss_mlp": 1.06307316, + "epoch": 0.35398230088495575, + "flos": 506118523392.0, + "grad_norm": 0.057234368489623245, + "language_loss": 0.82641804, + "learning_rate": 0.000749100116249692, + "loss": 0.83734202, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.29272461, + "step": 1840, + "time_per_iteration": 2.6124982833862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091953, + "balance_loss_mlp": 1.0616498, + "epoch": 0.35417468257021933, + "flos": 507783015936.0, + "grad_norm": 0.09225915028059628, + "language_loss": 0.86273944, + "learning_rate": 0.0007488299414112321, + "loss": 0.87365901, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.30249023, + "step": 1841, + "time_per_iteration": 2.615434169769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087223, + "balance_loss_mlp": 1.05737281, + "epoch": 0.35436706425548287, + "flos": 656133046272.0, + "grad_norm": 0.0557731038759208, + "language_loss": 0.77796137, + "learning_rate": 0.0007485596699684215, + "loss": 0.78883362, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.2980957, + "step": 1842, + "time_per_iteration": 2.83414626121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087281, + "balance_loss_mlp": 1.05561948, + "epoch": 0.35455944594074645, + "flos": 652322433024.0, + "grad_norm": 0.04938820360777142, + "language_loss": 0.85113978, + "learning_rate": 0.000748289302026189, + "loss": 0.86201257, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.31640625, + "step": 1843, + "time_per_iteration": 2.8805251121520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084484, + "balance_loss_mlp": 1.05403841, + "epoch": 0.35475182762601, + "flos": 848240252928.0, + "grad_norm": 0.06499404847276229, + "language_loss": 0.85830677, + "learning_rate": 0.0007480188376895004, + "loss": 0.86915159, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.30395508, + "step": 1844, + "time_per_iteration": 3.0965142250061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062747, + "balance_loss_mlp": 1.04624832, + "epoch": 0.3549442093112736, + "flos": 1520644133376.0, + "grad_norm": 0.026974392702602535, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74874085, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.16503906, + "step": 1845, + "time_per_iteration": 5.003226280212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088616, + "balance_loss_mlp": 1.05738342, + "epoch": 0.3551365909965371, + "flos": 651087134208.0, + "grad_norm": 0.11496133406812095, + "language_loss": 0.78570682, + "learning_rate": 0.0007474776202528074, + "loss": 0.79659295, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.31201172, + "step": 1846, + "time_per_iteration": 2.9579098224639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089072, + "balance_loss_mlp": 1.05736208, + "epoch": 0.3553289726818007, + "flos": 897094213632.0, + "grad_norm": 0.06294098896241457, + "language_loss": 0.81369591, + "learning_rate": 0.000747206867362922, + "loss": 0.82458663, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.31689453, + "step": 1847, + "time_per_iteration": 3.0886905193328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109789, + "balance_loss_mlp": 1.06656218, + "epoch": 0.3555213543670643, + "flos": 688181958144.0, + "grad_norm": 0.060378794046525276, + "language_loss": 0.83593512, + "learning_rate": 0.0007469360184988194, + "loss": 0.84691405, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.31298828, + "step": 1848, + "time_per_iteration": 2.861438512802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109845, + "balance_loss_mlp": 1.06724131, + "epoch": 0.3557137360523278, + "flos": 538305647616.0, + "grad_norm": 0.06250375704468988, + "language_loss": 0.86663848, + "learning_rate": 0.0007466650737656518, + "loss": 0.87762296, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.31176758, + "step": 1849, + "time_per_iteration": 2.620384454727173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098996, + "balance_loss_mlp": 1.06754851, + "epoch": 0.3559061177375914, + "flos": 402039230976.0, + "grad_norm": 0.05619364173691644, + "language_loss": 0.90150386, + "learning_rate": 0.0007463940332686098, + "loss": 0.91249382, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.31420898, + "step": 1850, + "time_per_iteration": 2.499337911605835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097613, + "balance_loss_mlp": 1.06711888, + "epoch": 0.35609849942285493, + "flos": 696238170624.0, + "grad_norm": 0.05220134930851383, + "language_loss": 0.8454684, + "learning_rate": 0.0007461228971129205, + "loss": 0.85644454, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.30444336, + "step": 1851, + "time_per_iteration": 2.91583251953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090798, + "balance_loss_mlp": 1.06049538, + "epoch": 0.3562908811081185, + "flos": 568660953600.0, + "grad_norm": 0.06507053577711389, + "language_loss": 0.85374135, + "learning_rate": 0.0007458516654038483, + "loss": 0.8646493, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.30297852, + "step": 1852, + "time_per_iteration": 2.710845947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093158, + "balance_loss_mlp": 1.06221175, + "epoch": 0.35648326279338205, + "flos": 682081219584.0, + "grad_norm": 0.055267605083424515, + "language_loss": 0.86826843, + "learning_rate": 0.0007455803382466946, + "loss": 0.87919998, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.30908203, + "step": 1853, + "time_per_iteration": 2.8157601356506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089896, + "balance_loss_mlp": 1.05894923, + "epoch": 0.35667564447864564, + "flos": 628840475136.0, + "grad_norm": 0.06143674576014299, + "language_loss": 0.87150055, + "learning_rate": 0.0007453089157467979, + "loss": 0.8823995, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.30908203, + "step": 1854, + "time_per_iteration": 2.7985024452209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101035, + "balance_loss_mlp": 1.06946826, + "epoch": 0.35686802616390917, + "flos": 813685837824.0, + "grad_norm": 0.06203911404438901, + "language_loss": 0.82222199, + "learning_rate": 0.0007450373980095341, + "loss": 0.83323234, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.31542969, + "step": 1855, + "time_per_iteration": 3.0960283279418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101415, + "balance_loss_mlp": 1.07108843, + "epoch": 0.35706040784917276, + "flos": 525922288128.0, + "grad_norm": 0.05169641299516589, + "language_loss": 0.86845142, + "learning_rate": 0.0007447657851403155, + "loss": 0.87946558, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.30322266, + "step": 1856, + "time_per_iteration": 2.6420810222625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106839, + "balance_loss_mlp": 1.07689333, + "epoch": 0.35725278953443634, + "flos": 511698345984.0, + "grad_norm": 0.07027910399075639, + "language_loss": 0.78771162, + "learning_rate": 0.0007444940772445915, + "loss": 0.79878008, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.29907227, + "step": 1857, + "time_per_iteration": 2.748770236968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109389, + "balance_loss_mlp": 1.06420684, + "epoch": 0.3574451712196999, + "flos": 487162971648.0, + "grad_norm": 0.057407361829253975, + "language_loss": 0.80228555, + "learning_rate": 0.0007442222744278484, + "loss": 0.81322443, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.29663086, + "step": 1858, + "time_per_iteration": 2.652111530303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094475, + "balance_loss_mlp": 1.06410074, + "epoch": 0.35763755290496346, + "flos": 550384879104.0, + "grad_norm": 0.045384089682170406, + "language_loss": 0.8399753, + "learning_rate": 0.0007439503767956099, + "loss": 0.85092002, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.30371094, + "step": 1859, + "time_per_iteration": 2.703261375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044367, + "balance_loss_mlp": 1.03111064, + "epoch": 0.357829934590227, + "flos": 1503300791808.0, + "grad_norm": 0.02493030642290896, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80715972, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.1328125, + "step": 1860, + "time_per_iteration": 4.983760833740234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092897, + "balance_loss_mlp": 1.06242704, + "epoch": 0.3580223162754906, + "flos": 568410670080.0, + "grad_norm": 0.05045998946960442, + "language_loss": 0.85959804, + "learning_rate": 0.000743406297506922, + "loss": 0.87052703, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.30419922, + "step": 1861, + "time_per_iteration": 2.740078926086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090008, + "balance_loss_mlp": 1.05956221, + "epoch": 0.3582146979607541, + "flos": 626153089536.0, + "grad_norm": 0.05968554082553822, + "language_loss": 0.8392486, + "learning_rate": 0.0007431341160617031, + "loss": 0.85014868, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.30395508, + "step": 1862, + "time_per_iteration": 2.8886373043060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076671, + "balance_loss_mlp": 1.04631984, + "epoch": 0.3584070796460177, + "flos": 507010406400.0, + "grad_norm": 0.053643840261235066, + "language_loss": 0.88015211, + "learning_rate": 0.0007428618402234491, + "loss": 0.89091879, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.30297852, + "step": 1863, + "time_per_iteration": 2.687030553817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074194, + "balance_loss_mlp": 1.04334283, + "epoch": 0.3585994613312813, + "flos": 606190763520.0, + "grad_norm": 0.062332671108041963, + "language_loss": 0.80358481, + "learning_rate": 0.0007425894700978668, + "loss": 0.81432676, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.30810547, + "step": 1864, + "time_per_iteration": 2.7334656715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072556, + "balance_loss_mlp": 1.04101336, + "epoch": 0.3587918430165448, + "flos": 1412338484736.0, + "grad_norm": 0.050645747658019255, + "language_loss": 0.79510379, + "learning_rate": 0.0007423170057906996, + "loss": 0.80582935, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.31542969, + "step": 1865, + "time_per_iteration": 3.8669073581695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076041, + "balance_loss_mlp": 1.04452205, + "epoch": 0.3589842247018084, + "flos": 478313800704.0, + "grad_norm": 0.06345597879427126, + "language_loss": 0.86289865, + "learning_rate": 0.0007420444474077275, + "loss": 0.87365907, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.31518555, + "step": 1866, + "time_per_iteration": 2.5648367404937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080689, + "balance_loss_mlp": 1.04878831, + "epoch": 0.35917660638707194, + "flos": 504464205312.0, + "grad_norm": 0.058480526362169126, + "language_loss": 0.89744091, + "learning_rate": 0.0007417717950547671, + "loss": 0.90824777, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.31884766, + "step": 1867, + "time_per_iteration": 2.5665245056152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074714, + "balance_loss_mlp": 1.0600276, + "epoch": 0.3593689880723355, + "flos": 1491294191616.0, + "grad_norm": 0.04131149216661822, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77071321, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.14648438, + "step": 1868, + "time_per_iteration": 4.900072813034058 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091662, + "balance_loss_mlp": 1.06035757, + "epoch": 0.35956136975759906, + "flos": 528369564672.0, + "grad_norm": 0.04948067344873762, + "language_loss": 0.84714514, + "learning_rate": 0.0007412262088623299, + "loss": 0.85806173, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.31274414, + "step": 1869, + "time_per_iteration": 2.72872257232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109305, + "balance_loss_mlp": 1.06255615, + "epoch": 0.35975375144286265, + "flos": 534647803392.0, + "grad_norm": 0.0631690153505957, + "language_loss": 0.79514921, + "learning_rate": 0.0007409532752346684, + "loss": 0.80607969, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.30444336, + "step": 1870, + "time_per_iteration": 2.646813154220581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084384, + "balance_loss_mlp": 1.05436683, + "epoch": 0.3599461331281262, + "flos": 504695549952.0, + "grad_norm": 0.05200384527654752, + "language_loss": 0.88430232, + "learning_rate": 0.0007406802480606491, + "loss": 0.89514613, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.29956055, + "step": 1871, + "time_per_iteration": 2.6335039138793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088571, + "balance_loss_mlp": 1.05819631, + "epoch": 0.36013851481338977, + "flos": 511283708928.0, + "grad_norm": 0.058340376963862656, + "language_loss": 0.90469301, + "learning_rate": 0.0007404071274462707, + "loss": 0.91557872, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.3034668, + "step": 1872, + "time_per_iteration": 2.579155206680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088392, + "balance_loss_mlp": 1.05911398, + "epoch": 0.36033089649865335, + "flos": 547330908672.0, + "grad_norm": 0.06288764850432389, + "language_loss": 0.83945811, + "learning_rate": 0.0007401339134975682, + "loss": 0.85034204, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.29272461, + "step": 1873, + "time_per_iteration": 2.6590254306793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089736, + "balance_loss_mlp": 1.06024313, + "epoch": 0.3605232781839169, + "flos": 458416903680.0, + "grad_norm": 0.07025897777145818, + "language_loss": 0.84501064, + "learning_rate": 0.0007398606063206122, + "loss": 0.85590804, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.29467773, + "step": 1874, + "time_per_iteration": 2.6330654621124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084755, + "balance_loss_mlp": 1.05545354, + "epoch": 0.36071565986918047, + "flos": 509309296128.0, + "grad_norm": 0.05525815693458704, + "language_loss": 0.78668261, + "learning_rate": 0.0007395872060215101, + "loss": 0.79753017, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.29296875, + "step": 1875, + "time_per_iteration": 2.665205955505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087119, + "balance_loss_mlp": 1.05853248, + "epoch": 0.360908041554444, + "flos": 558931484160.0, + "grad_norm": 0.05566722247490556, + "language_loss": 0.88191175, + "learning_rate": 0.0007393137127064056, + "loss": 0.89278299, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.28588867, + "step": 1876, + "time_per_iteration": 2.67520809173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083262, + "balance_loss_mlp": 1.05479455, + "epoch": 0.3611004232397076, + "flos": 523588492800.0, + "grad_norm": 0.05183280051917729, + "language_loss": 0.84175742, + "learning_rate": 0.0007390401264814779, + "loss": 0.85258996, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.28491211, + "step": 1877, + "time_per_iteration": 2.621708631515503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083828, + "balance_loss_mlp": 1.05559897, + "epoch": 0.3612928049249711, + "flos": 540728193024.0, + "grad_norm": 0.059598774698536174, + "language_loss": 0.84762645, + "learning_rate": 0.0007387664474529427, + "loss": 0.85846466, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.28222656, + "step": 1878, + "time_per_iteration": 2.64604115486145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085745, + "balance_loss_mlp": 1.0567776, + "epoch": 0.3614851866102347, + "flos": 552289480704.0, + "grad_norm": 0.05278661870548292, + "language_loss": 0.90893793, + "learning_rate": 0.0007384926757270518, + "loss": 0.91979533, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.28955078, + "step": 1879, + "time_per_iteration": 2.63849139213562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094605, + "balance_loss_mlp": 1.0652554, + "epoch": 0.36167756829549824, + "flos": 771734338560.0, + "grad_norm": 0.05095981973878578, + "language_loss": 0.79965544, + "learning_rate": 0.0007382188114100924, + "loss": 0.81060153, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.29296875, + "step": 1880, + "time_per_iteration": 2.967137098312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096998, + "balance_loss_mlp": 1.06731534, + "epoch": 0.36186994998076183, + "flos": 711560609280.0, + "grad_norm": 0.0523610100033388, + "language_loss": 0.81541228, + "learning_rate": 0.0007379448546083884, + "loss": 0.82638228, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.29663086, + "step": 1881, + "time_per_iteration": 2.935075283050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089574, + "balance_loss_mlp": 1.06036723, + "epoch": 0.3620623316660254, + "flos": 747209138688.0, + "grad_norm": 0.056326792126263736, + "language_loss": 0.88131809, + "learning_rate": 0.0007376708054282992, + "loss": 0.89221382, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.29174805, + "step": 1882, + "time_per_iteration": 2.9548256397247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080549, + "balance_loss_mlp": 1.05074644, + "epoch": 0.36225471335128895, + "flos": 482312088576.0, + "grad_norm": 0.053377968629185854, + "language_loss": 0.8395232, + "learning_rate": 0.0007373966639762201, + "loss": 0.85032874, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.29785156, + "step": 1883, + "time_per_iteration": 2.5978147983551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079871, + "balance_loss_mlp": 1.05085516, + "epoch": 0.36244709503655254, + "flos": 506655406080.0, + "grad_norm": 0.055969169447774005, + "language_loss": 0.88542271, + "learning_rate": 0.0007371224303585822, + "loss": 0.8962214, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.29003906, + "step": 1884, + "time_per_iteration": 2.573521614074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122192, + "balance_loss_mlp": 1.10817313, + "epoch": 0.36263947672181607, + "flos": 1393302643200.0, + "grad_norm": 0.05390094690370155, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81479263, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.140625, + "step": 1885, + "time_per_iteration": 4.762617826461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077599, + "balance_loss_mlp": 1.04722452, + "epoch": 0.36283185840707965, + "flos": 652991735808.0, + "grad_norm": 0.05279204841925659, + "language_loss": 0.8277564, + "learning_rate": 0.0007365736870525335, + "loss": 0.83853239, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.30322266, + "step": 1886, + "time_per_iteration": 2.8206799030303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071958, + "balance_loss_mlp": 1.04182231, + "epoch": 0.3630242400923432, + "flos": 488619440640.0, + "grad_norm": 0.0631822735743998, + "language_loss": 0.82252121, + "learning_rate": 0.000736299177577164, + "loss": 0.83324087, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.30102539, + "step": 1887, + "time_per_iteration": 2.5644423961639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075611, + "balance_loss_mlp": 1.04516482, + "epoch": 0.3632166217776068, + "flos": 516892644864.0, + "grad_norm": 0.06952119877485304, + "language_loss": 0.83928037, + "learning_rate": 0.0007360245763623174, + "loss": 0.8500365, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.30395508, + "step": 1888, + "time_per_iteration": 2.68868088722229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076643, + "balance_loss_mlp": 1.04614949, + "epoch": 0.36340900346287036, + "flos": 645881250816.0, + "grad_norm": 0.05500458280543127, + "language_loss": 0.89759338, + "learning_rate": 0.0007357498835146039, + "loss": 0.90835977, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.30444336, + "step": 1889, + "time_per_iteration": 2.841135263442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078037, + "balance_loss_mlp": 1.04716182, + "epoch": 0.3636013851481339, + "flos": 553057708032.0, + "grad_norm": 0.05518095134274227, + "language_loss": 0.86945391, + "learning_rate": 0.0007354750991406684, + "loss": 0.8802343, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.30834961, + "step": 1890, + "time_per_iteration": 2.6954762935638428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079646, + "balance_loss_mlp": 1.04810333, + "epoch": 0.3637937668333975, + "flos": 546395355648.0, + "grad_norm": 0.060964398763012274, + "language_loss": 0.80524838, + "learning_rate": 0.0007352002233471919, + "loss": 0.81604487, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.31518555, + "step": 1891, + "time_per_iteration": 2.6167404651641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079464, + "balance_loss_mlp": 1.04973292, + "epoch": 0.363986148518661, + "flos": 537838576128.0, + "grad_norm": 0.06807309201777603, + "language_loss": 0.79092562, + "learning_rate": 0.0007349252562408906, + "loss": 0.80172026, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.296875, + "step": 1892, + "time_per_iteration": 2.6944479942321777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091379, + "balance_loss_mlp": 1.06071806, + "epoch": 0.3641785302039246, + "flos": 659895607296.0, + "grad_norm": 0.05563142804906438, + "language_loss": 0.81399196, + "learning_rate": 0.0007346501979285158, + "loss": 0.82490575, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.30615234, + "step": 1893, + "time_per_iteration": 2.8852903842926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074867, + "balance_loss_mlp": 1.06208813, + "epoch": 0.36437091188918813, + "flos": 1467911158272.0, + "grad_norm": 0.02944776437417564, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.8161397, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.12792969, + "step": 1894, + "time_per_iteration": 4.784174680709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114227, + "balance_loss_mlp": 1.0819447, + "epoch": 0.3645632935744517, + "flos": 597012733440.0, + "grad_norm": 0.051755500006301046, + "language_loss": 0.8558799, + "learning_rate": 0.0007340998081127308, + "loss": 0.86702216, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.32275391, + "step": 1895, + "time_per_iteration": 2.807494878768921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121943, + "balance_loss_mlp": 1.09023345, + "epoch": 0.36475567525971525, + "flos": 599214108672.0, + "grad_norm": 0.06567695066031824, + "language_loss": 0.90748346, + "learning_rate": 0.0007338244768230007, + "loss": 0.9187029, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.31689453, + "step": 1896, + "time_per_iteration": 2.7678794860839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118221, + "balance_loss_mlp": 1.08694077, + "epoch": 0.36494805694497884, + "flos": 798047686656.0, + "grad_norm": 0.07782470610585689, + "language_loss": 0.8913762, + "learning_rate": 0.0007335490547545578, + "loss": 0.90255845, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.3125, + "step": 1897, + "time_per_iteration": 3.0801138877868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112607, + "balance_loss_mlp": 1.0822562, + "epoch": 0.3651404386302424, + "flos": 637023315456.0, + "grad_norm": 0.05264242736204855, + "language_loss": 0.82653165, + "learning_rate": 0.0007332735420143308, + "loss": 0.83765769, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.30297852, + "step": 1898, + "time_per_iteration": 2.7581489086151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094572, + "balance_loss_mlp": 1.06338716, + "epoch": 0.36533282031550596, + "flos": 491337349632.0, + "grad_norm": 0.06387883695900265, + "language_loss": 0.8681283, + "learning_rate": 0.0007329979387092826, + "loss": 0.87907398, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.31152344, + "step": 1899, + "time_per_iteration": 2.586489677429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090283, + "balance_loss_mlp": 1.05964673, + "epoch": 0.36552520200076954, + "flos": 855587874816.0, + "grad_norm": 0.054083416077733606, + "language_loss": 0.83626556, + "learning_rate": 0.0007327222449464124, + "loss": 0.84716845, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.3059082, + "step": 1900, + "time_per_iteration": 3.2495076656341553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010843, + "balance_loss_mlp": 1.0518986, + "epoch": 0.3657175836860331, + "flos": 483449872896.0, + "grad_norm": 0.05500564094416643, + "language_loss": 0.88598847, + "learning_rate": 0.0007324464608327538, + "loss": 0.89683151, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.32397461, + "step": 1901, + "time_per_iteration": 2.617971897125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079363, + "balance_loss_mlp": 1.04786777, + "epoch": 0.36590996537129666, + "flos": 434561006592.0, + "grad_norm": 0.0538418205513684, + "language_loss": 0.88291639, + "learning_rate": 0.0007321705864753758, + "loss": 0.89371002, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.31469727, + "step": 1902, + "time_per_iteration": 2.69343638420105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074418, + "balance_loss_mlp": 1.04294717, + "epoch": 0.3661023470565602, + "flos": 711880704000.0, + "grad_norm": 0.056477009868628435, + "language_loss": 0.84098166, + "learning_rate": 0.0007318946219813823, + "loss": 0.85172582, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.31469727, + "step": 1903, + "time_per_iteration": 3.010847568511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074916, + "balance_loss_mlp": 1.04232407, + "epoch": 0.3662947287418238, + "flos": 564495340032.0, + "grad_norm": 0.05768945263904951, + "language_loss": 0.89714533, + "learning_rate": 0.000731618567457912, + "loss": 0.90789449, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.32592773, + "step": 1904, + "time_per_iteration": 2.6410703659057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076588, + "balance_loss_mlp": 1.0440681, + "epoch": 0.3664871104270873, + "flos": 789391982592.0, + "grad_norm": 0.05570087619571841, + "language_loss": 0.86445332, + "learning_rate": 0.000731342423012139, + "loss": 0.87521917, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.32519531, + "step": 1905, + "time_per_iteration": 3.054703712463379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074863, + "balance_loss_mlp": 1.04312992, + "epoch": 0.3666794921123509, + "flos": 752202616320.0, + "grad_norm": 0.05663901457074664, + "language_loss": 0.82393479, + "learning_rate": 0.0007310661887512722, + "loss": 0.83468342, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.31713867, + "step": 1906, + "time_per_iteration": 3.0096654891967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076944, + "balance_loss_mlp": 1.04532969, + "epoch": 0.3668718737976145, + "flos": 523264015872.0, + "grad_norm": 0.07427377535541638, + "language_loss": 0.8207258, + "learning_rate": 0.0007307898647825549, + "loss": 0.83149529, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.31591797, + "step": 1907, + "time_per_iteration": 2.67525315284729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075733, + "balance_loss_mlp": 1.04347432, + "epoch": 0.367064255482878, + "flos": 571698957312.0, + "grad_norm": 0.07021562329929035, + "language_loss": 0.89152002, + "learning_rate": 0.0007305134512132659, + "loss": 0.90227735, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.32250977, + "step": 1908, + "time_per_iteration": 2.6483352184295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079044, + "balance_loss_mlp": 1.0476923, + "epoch": 0.3672566371681416, + "flos": 446880347136.0, + "grad_norm": 0.07878350898766671, + "language_loss": 0.83255082, + "learning_rate": 0.0007302369481507183, + "loss": 0.84334129, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.31323242, + "step": 1909, + "time_per_iteration": 2.5106606483459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108859, + "balance_loss_mlp": 1.09207463, + "epoch": 0.36744901885340514, + "flos": 1539275208192.0, + "grad_norm": 0.039316944601114644, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.8107062, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.16796875, + "step": 1910, + "time_per_iteration": 4.845642566680908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073802, + "balance_loss_mlp": 1.04287899, + "epoch": 0.36764140053866873, + "flos": 563417192448.0, + "grad_norm": 0.05282525969479425, + "language_loss": 0.8551507, + "learning_rate": 0.000729683673975274, + "loss": 0.86588871, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.30883789, + "step": 1911, + "time_per_iteration": 2.643991470336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077837, + "balance_loss_mlp": 1.04648542, + "epoch": 0.36783378222393226, + "flos": 1216168971264.0, + "grad_norm": 0.06579029503933971, + "language_loss": 0.83071077, + "learning_rate": 0.0007294069030771774, + "loss": 0.84148908, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.31323242, + "step": 1912, + "time_per_iteration": 3.6745028495788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081127, + "balance_loss_mlp": 1.05053759, + "epoch": 0.36802616390919585, + "flos": 498476947968.0, + "grad_norm": 0.055639286508135585, + "language_loss": 0.90529931, + "learning_rate": 0.0007291300431154224, + "loss": 0.91611063, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.30541992, + "step": 1913, + "time_per_iteration": 2.6364145278930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020102, + "balance_loss_mlp": 1.00503433, + "epoch": 0.36821854559445943, + "flos": 1581281961984.0, + "grad_norm": 0.014819520409209537, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71409839, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.15039062, + "step": 1914, + "time_per_iteration": 4.986552000045776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089166, + "balance_loss_mlp": 1.05895889, + "epoch": 0.36841092727972297, + "flos": 835261784064.0, + "grad_norm": 0.07166131614104637, + "language_loss": 0.80129957, + "learning_rate": 0.0007285760564309179, + "loss": 0.81219125, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.30151367, + "step": 1915, + "time_per_iteration": 3.105180025100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082922, + "balance_loss_mlp": 1.05362058, + "epoch": 0.36860330896498655, + "flos": 689517591552.0, + "grad_norm": 0.07315246202889085, + "language_loss": 0.85023272, + "learning_rate": 0.0007282989299232448, + "loss": 0.86106199, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.29272461, + "step": 1916, + "time_per_iteration": 3.0501549243927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085882, + "balance_loss_mlp": 1.05710506, + "epoch": 0.3687956906502501, + "flos": 553919067648.0, + "grad_norm": 0.0682472178493412, + "language_loss": 0.83468378, + "learning_rate": 0.0007280217147820668, + "loss": 0.84554267, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.28735352, + "step": 1917, + "time_per_iteration": 2.61570143699646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096949, + "balance_loss_mlp": 1.06836295, + "epoch": 0.3689880723355137, + "flos": 576426184704.0, + "grad_norm": 0.06368361877082852, + "language_loss": 0.79183483, + "learning_rate": 0.0007277444111150079, + "loss": 0.80280429, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.28613281, + "step": 1918, + "time_per_iteration": 2.7004950046539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108995, + "balance_loss_mlp": 1.06124449, + "epoch": 0.3691804540207772, + "flos": 528615465984.0, + "grad_norm": 0.07280537378335762, + "language_loss": 0.84052753, + "learning_rate": 0.0007274670190297272, + "loss": 0.85142708, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.28710938, + "step": 1919, + "time_per_iteration": 2.598128080368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098824, + "balance_loss_mlp": 1.06902122, + "epoch": 0.3693728357060408, + "flos": 560729806848.0, + "grad_norm": 0.05243134255501039, + "language_loss": 0.82081646, + "learning_rate": 0.0007271895386339179, + "loss": 0.83180475, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.29736328, + "step": 1920, + "time_per_iteration": 2.7843308448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093148, + "balance_loss_mlp": 1.06360769, + "epoch": 0.3695652173913043, + "flos": 579488919552.0, + "grad_norm": 0.058714378397154585, + "language_loss": 0.83102447, + "learning_rate": 0.0007269119700353073, + "loss": 0.8419559, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.29492188, + "step": 1921, + "time_per_iteration": 2.7665817737579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089369, + "balance_loss_mlp": 1.06052053, + "epoch": 0.3697575990765679, + "flos": 512629516800.0, + "grad_norm": 0.04695414461356542, + "language_loss": 0.84780574, + "learning_rate": 0.0007266343133416571, + "loss": 0.85869944, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.28833008, + "step": 1922, + "time_per_iteration": 2.779585361480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065569, + "balance_loss_mlp": 1.05011928, + "epoch": 0.3699499807618315, + "flos": 1569826953216.0, + "grad_norm": 0.04139595668748732, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78182483, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.15429688, + "step": 1923, + "time_per_iteration": 4.841213703155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085527, + "balance_loss_mlp": 1.05591547, + "epoch": 0.37014236244709503, + "flos": 497093262336.0, + "grad_norm": 0.07673769099321799, + "language_loss": 0.84293365, + "learning_rate": 0.0007260787361004556, + "loss": 0.85378897, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.2956543, + "step": 1924, + "time_per_iteration": 2.5501017570495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023058, + "balance_loss_mlp": 1.00875258, + "epoch": 0.3703347441323586, + "flos": 1443562048512.0, + "grad_norm": 0.01226438472350035, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74784565, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.14257812, + "step": 1925, + "time_per_iteration": 4.9058191776275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079083, + "balance_loss_mlp": 1.05040073, + "epoch": 0.37052712581762215, + "flos": 563324060160.0, + "grad_norm": 0.0733591012555623, + "language_loss": 0.87266588, + "learning_rate": 0.0007255228077730903, + "loss": 0.88345671, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.28686523, + "step": 1926, + "time_per_iteration": 2.6776785850524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080805, + "balance_loss_mlp": 1.05281413, + "epoch": 0.37071950750288574, + "flos": 925706451456.0, + "grad_norm": 0.05143591599053885, + "language_loss": 0.81313562, + "learning_rate": 0.0007252447122218632, + "loss": 0.82394373, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.2800293, + "step": 1927, + "time_per_iteration": 3.1710472106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077781, + "balance_loss_mlp": 1.04907489, + "epoch": 0.37091188918814927, + "flos": 418090609152.0, + "grad_norm": 0.07597924069729044, + "language_loss": 0.88653511, + "learning_rate": 0.0007249665292228834, + "loss": 0.89731288, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.28686523, + "step": 1928, + "time_per_iteration": 2.580092191696167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108352, + "balance_loss_mlp": 1.0547905, + "epoch": 0.37110427087341286, + "flos": 462941899776.0, + "grad_norm": 0.05796370091963761, + "language_loss": 0.8379482, + "learning_rate": 0.000724688258884151, + "loss": 0.84878337, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.28710938, + "step": 1929, + "time_per_iteration": 2.6322267055511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086107, + "balance_loss_mlp": 1.05740142, + "epoch": 0.3712966525586764, + "flos": 849303843840.0, + "grad_norm": 0.049384577339976525, + "language_loss": 0.86327779, + "learning_rate": 0.0007244099013137002, + "loss": 0.87413883, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.28710938, + "step": 1930, + "time_per_iteration": 3.09224009513855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087908, + "balance_loss_mlp": 1.05951214, + "epoch": 0.37148903424394, + "flos": 925555092480.0, + "grad_norm": 0.06129670734370297, + "language_loss": 0.88767004, + "learning_rate": 0.0007241314566195993, + "loss": 0.89854914, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.28393555, + "step": 1931, + "time_per_iteration": 3.238381862640381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094186, + "balance_loss_mlp": 1.06531322, + "epoch": 0.37168141592920356, + "flos": 519565473792.0, + "grad_norm": 0.05545779345638414, + "language_loss": 0.85434037, + "learning_rate": 0.0007238529249099496, + "loss": 0.86528224, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.28833008, + "step": 1932, + "time_per_iteration": 2.632279872894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159138, + "balance_loss_mlp": 1.1475507, + "epoch": 0.3718737976144671, + "flos": 1445107267584.0, + "grad_norm": 0.054961579821259376, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.79016018, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.11572266, + "step": 1933, + "time_per_iteration": 4.920037746429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098131, + "balance_loss_mlp": 1.06902027, + "epoch": 0.3720661792997307, + "flos": 759218558976.0, + "grad_norm": 0.06411393233522368, + "language_loss": 0.80432916, + "learning_rate": 0.000723295600876581, + "loss": 0.81531054, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.29101562, + "step": 1934, + "time_per_iteration": 3.060438632965088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093478, + "balance_loss_mlp": 1.06510615, + "epoch": 0.3722585609849942, + "flos": 516686031360.0, + "grad_norm": 0.054125512250282885, + "language_loss": 0.87856102, + "learning_rate": 0.0007230168087692344, + "loss": 0.88949579, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.28393555, + "step": 1935, + "time_per_iteration": 2.655176877975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095042, + "balance_loss_mlp": 1.06607461, + "epoch": 0.3724509426702578, + "flos": 782114171904.0, + "grad_norm": 0.053712544631880174, + "language_loss": 0.82501912, + "learning_rate": 0.0007227379300790839, + "loss": 0.83596957, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.28955078, + "step": 1936, + "time_per_iteration": 3.05722713470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086223, + "balance_loss_mlp": 1.05668318, + "epoch": 0.37264332435552133, + "flos": 391502246400.0, + "grad_norm": 0.05452705072121448, + "language_loss": 0.85148442, + "learning_rate": 0.0007224589649143997, + "loss": 0.86234665, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.29492188, + "step": 1937, + "time_per_iteration": 2.593818187713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089067, + "balance_loss_mlp": 1.06021869, + "epoch": 0.3728357060407849, + "flos": 542599299072.0, + "grad_norm": 0.08689315573767935, + "language_loss": 0.80660325, + "learning_rate": 0.0007221799133834861, + "loss": 0.81749392, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.28833008, + "step": 1938, + "time_per_iteration": 2.6238772869110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087089, + "balance_loss_mlp": 1.05869377, + "epoch": 0.3730280877260485, + "flos": 433344646656.0, + "grad_norm": 0.06550449761554421, + "language_loss": 0.81904262, + "learning_rate": 0.00072190077559468, + "loss": 0.8299135, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.28417969, + "step": 1939, + "time_per_iteration": 2.5338878631591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086155, + "balance_loss_mlp": 1.05649543, + "epoch": 0.37322046941131204, + "flos": 531230068224.0, + "grad_norm": 0.05171807924061888, + "language_loss": 0.89000612, + "learning_rate": 0.0007216215516563527, + "loss": 0.90086764, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.29589844, + "step": 1940, + "time_per_iteration": 2.717912435531616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083323, + "balance_loss_mlp": 1.05449796, + "epoch": 0.3734128510965756, + "flos": 531294087168.0, + "grad_norm": 0.06398735943962416, + "language_loss": 0.83462608, + "learning_rate": 0.0007213422416769083, + "loss": 0.84545934, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.28808594, + "step": 1941, + "time_per_iteration": 2.6354072093963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107949, + "balance_loss_mlp": 1.0511179, + "epoch": 0.37360523278183916, + "flos": 500195284992.0, + "grad_norm": 0.05310409823342424, + "language_loss": 0.75118601, + "learning_rate": 0.0007210628457647849, + "loss": 0.76198089, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.28369141, + "step": 1942, + "time_per_iteration": 2.573251724243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080746, + "balance_loss_mlp": 1.05118251, + "epoch": 0.37379761446710275, + "flos": 547652413440.0, + "grad_norm": 0.05561530112530558, + "language_loss": 0.78689432, + "learning_rate": 0.000720783364028453, + "loss": 0.79770184, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.29516602, + "step": 1943, + "time_per_iteration": 2.782897472381592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078848, + "balance_loss_mlp": 1.04935515, + "epoch": 0.3739899961523663, + "flos": 475517316096.0, + "grad_norm": 0.05583674557333592, + "language_loss": 0.87426305, + "learning_rate": 0.0007205037965764177, + "loss": 0.88505149, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.29467773, + "step": 1944, + "time_per_iteration": 2.577195167541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076729, + "balance_loss_mlp": 1.04740369, + "epoch": 0.37418237783762986, + "flos": 611626581504.0, + "grad_norm": 0.05970518460248593, + "language_loss": 0.8568424, + "learning_rate": 0.0007202241435172161, + "loss": 0.86760962, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.29296875, + "step": 1945, + "time_per_iteration": 2.7356040477752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077347, + "balance_loss_mlp": 1.04849827, + "epoch": 0.3743747595228934, + "flos": 765953694720.0, + "grad_norm": 0.057784843601785166, + "language_loss": 0.88219595, + "learning_rate": 0.0007199444049594198, + "loss": 0.89296943, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.28833008, + "step": 1946, + "time_per_iteration": 2.997744560241699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075997, + "balance_loss_mlp": 1.04681468, + "epoch": 0.374567141208157, + "flos": 524120993280.0, + "grad_norm": 0.05996621635377081, + "language_loss": 0.83343232, + "learning_rate": 0.0007196645810116322, + "loss": 0.84419227, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.29150391, + "step": 1947, + "time_per_iteration": 2.6596434116363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071198, + "balance_loss_mlp": 1.04308891, + "epoch": 0.37475952289342057, + "flos": 681067090944.0, + "grad_norm": 0.07792528533349045, + "language_loss": 0.8387686, + "learning_rate": 0.0007193846717824912, + "loss": 0.84948057, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.28149414, + "step": 1948, + "time_per_iteration": 2.87357759475708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067752, + "balance_loss_mlp": 1.04031014, + "epoch": 0.3749519045786841, + "flos": 460061047296.0, + "grad_norm": 0.06284621907245236, + "language_loss": 0.88014293, + "learning_rate": 0.0007191046773806669, + "loss": 0.89082038, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.27514648, + "step": 1949, + "time_per_iteration": 2.616118907928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073776, + "balance_loss_mlp": 1.04473686, + "epoch": 0.3751442862639477, + "flos": 954471458304.0, + "grad_norm": 0.06080214721481266, + "language_loss": 0.83072305, + "learning_rate": 0.0007188245979148631, + "loss": 0.84146082, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.29003906, + "step": 1950, + "time_per_iteration": 3.212918281555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080402, + "balance_loss_mlp": 1.05164886, + "epoch": 0.3753366679492112, + "flos": 527483473920.0, + "grad_norm": 0.06034460157863772, + "language_loss": 0.87560785, + "learning_rate": 0.0007185444334938157, + "loss": 0.88641185, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.28735352, + "step": 1951, + "time_per_iteration": 2.6847927570343018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074706, + "balance_loss_mlp": 1.04635811, + "epoch": 0.3755290496344748, + "flos": 521535504384.0, + "grad_norm": 0.07362347851216991, + "language_loss": 0.85023165, + "learning_rate": 0.0007182641842262947, + "loss": 0.86097872, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.28320312, + "step": 1952, + "time_per_iteration": 2.6011481285095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080682, + "balance_loss_mlp": 1.05252457, + "epoch": 0.37572143131973834, + "flos": 620810403840.0, + "grad_norm": 0.05143100601063952, + "language_loss": 0.77525514, + "learning_rate": 0.0007179838502211022, + "loss": 0.78606194, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.28198242, + "step": 1953, + "time_per_iteration": 2.8322203159332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082839, + "balance_loss_mlp": 1.05487227, + "epoch": 0.37591381300500193, + "flos": 770635842048.0, + "grad_norm": 0.06528688845841664, + "language_loss": 0.86487108, + "learning_rate": 0.0007177034315870738, + "loss": 0.87569952, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.27978516, + "step": 1954, + "time_per_iteration": 2.9551377296447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077022, + "balance_loss_mlp": 1.04896057, + "epoch": 0.37610619469026546, + "flos": 520191106560.0, + "grad_norm": 0.059767476828271, + "language_loss": 0.90968794, + "learning_rate": 0.0007174229284330773, + "loss": 0.9204582, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.28076172, + "step": 1955, + "time_per_iteration": 2.5916919708251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076033, + "balance_loss_mlp": 1.0481143, + "epoch": 0.37629857637552905, + "flos": 598524456960.0, + "grad_norm": 0.06317358450106399, + "language_loss": 0.87043428, + "learning_rate": 0.0007171423408680141, + "loss": 0.88119459, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.27954102, + "step": 1956, + "time_per_iteration": 2.8243377208709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072634, + "balance_loss_mlp": 1.04352272, + "epoch": 0.37649095806079264, + "flos": 564687396864.0, + "grad_norm": 0.057758823731725896, + "language_loss": 0.89565909, + "learning_rate": 0.0007168616690008176, + "loss": 0.90638542, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.29125977, + "step": 1957, + "time_per_iteration": 2.6314306259155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074053, + "balance_loss_mlp": 1.04572916, + "epoch": 0.37668333974605617, + "flos": 592196755968.0, + "grad_norm": 0.055146864479517985, + "language_loss": 0.86279052, + "learning_rate": 0.0007165809129404545, + "loss": 0.87353098, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.28320312, + "step": 1958, + "time_per_iteration": 2.7625439167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074993, + "balance_loss_mlp": 1.044595, + "epoch": 0.37687572143131975, + "flos": 419257506816.0, + "grad_norm": 0.06141204693847206, + "language_loss": 0.85977095, + "learning_rate": 0.0007163000727959239, + "loss": 0.87052089, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.30371094, + "step": 1959, + "time_per_iteration": 2.473407506942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061387, + "balance_loss_mlp": 1.04622388, + "epoch": 0.3770681031165833, + "flos": 1356484243968.0, + "grad_norm": 0.02935416999593297, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79020452, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.15136719, + "step": 1960, + "time_per_iteration": 4.8784215450286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079277, + "balance_loss_mlp": 1.04973722, + "epoch": 0.3772604848018469, + "flos": 644592107520.0, + "grad_norm": 0.05722982355969982, + "language_loss": 0.84446192, + "learning_rate": 0.00071573814069052, + "loss": 0.85525477, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.29541016, + "step": 1961, + "time_per_iteration": 2.929955244064331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078902, + "balance_loss_mlp": 1.05031538, + "epoch": 0.3774528664871104, + "flos": 901265619456.0, + "grad_norm": 0.053564242831421076, + "language_loss": 0.88053226, + "learning_rate": 0.0007154570489478081, + "loss": 0.8913213, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.28540039, + "step": 1962, + "time_per_iteration": 3.1691505908966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079242, + "balance_loss_mlp": 1.05001187, + "epoch": 0.377645248172374, + "flos": 787717315584.0, + "grad_norm": 0.05213464978332433, + "language_loss": 0.86570239, + "learning_rate": 0.0007151758735572514, + "loss": 0.87649477, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.29174805, + "step": 1963, + "time_per_iteration": 2.9893381595611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080371, + "balance_loss_mlp": 1.05190408, + "epoch": 0.3778376298576376, + "flos": 586417522176.0, + "grad_norm": 0.06256473208381459, + "language_loss": 0.80730724, + "learning_rate": 0.0007148946146280119, + "loss": 0.81811094, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.28442383, + "step": 1964, + "time_per_iteration": 2.8270015716552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015118, + "balance_loss_mlp": 1.00214851, + "epoch": 0.3780300115429011, + "flos": 1396014759936.0, + "grad_norm": 0.01808471901321765, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73207271, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.12988281, + "step": 1965, + "time_per_iteration": 4.895836353302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018206, + "balance_loss_mlp": 1.00561714, + "epoch": 0.3782223932281647, + "flos": 1356935348736.0, + "grad_norm": 0.021930840707602553, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76360154, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.12597656, + "step": 1966, + "time_per_iteration": 5.0023956298828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091314, + "balance_loss_mlp": 1.06358576, + "epoch": 0.37841477491342823, + "flos": 703811344896.0, + "grad_norm": 0.04479252262380658, + "language_loss": 0.83477217, + "learning_rate": 0.0007140503377003022, + "loss": 0.84568524, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.27734375, + "step": 1967, + "time_per_iteration": 3.0142691135406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097939, + "balance_loss_mlp": 1.07011509, + "epoch": 0.3786071565986918, + "flos": 528856985088.0, + "grad_norm": 0.049620821678558774, + "language_loss": 0.8500334, + "learning_rate": 0.000713768745708599, + "loss": 0.86101276, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.27856445, + "step": 1968, + "time_per_iteration": 2.6556408405303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109518, + "balance_loss_mlp": 1.06807137, + "epoch": 0.37879953828395535, + "flos": 992872802304.0, + "grad_norm": 0.05249502952466034, + "language_loss": 0.7739228, + "learning_rate": 0.0007134870707245085, + "loss": 0.78487462, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.27148438, + "step": 1969, + "time_per_iteration": 3.2944319248199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097317, + "balance_loss_mlp": 1.0706377, + "epoch": 0.37899191996921894, + "flos": 626358292992.0, + "grad_norm": 0.06611086672726225, + "language_loss": 0.84358507, + "learning_rate": 0.0007132053128573864, + "loss": 0.85455823, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.26733398, + "step": 1970, + "time_per_iteration": 2.745910167694092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101147, + "balance_loss_mlp": 1.07422984, + "epoch": 0.37918430165448247, + "flos": 686005314048.0, + "grad_norm": 0.07389156257299019, + "language_loss": 0.83986598, + "learning_rate": 0.0007129234722166211, + "loss": 0.8508774, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.26977539, + "step": 1971, + "time_per_iteration": 2.8552701473236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095612, + "balance_loss_mlp": 1.06881404, + "epoch": 0.37937668333974606, + "flos": 475374721536.0, + "grad_norm": 0.0464186232668544, + "language_loss": 0.90731955, + "learning_rate": 0.0007126415489116328, + "loss": 0.91827571, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.26818848, + "step": 1972, + "time_per_iteration": 2.6738507747650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089531, + "balance_loss_mlp": 1.06185079, + "epoch": 0.37956906502500964, + "flos": 707271340032.0, + "grad_norm": 0.05397666452651625, + "language_loss": 0.81034803, + "learning_rate": 0.0007123595430518736, + "loss": 0.82124341, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.27685547, + "step": 1973, + "time_per_iteration": 2.8551318645477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089653, + "balance_loss_mlp": 1.06225908, + "epoch": 0.3797614467102732, + "flos": 426421836288.0, + "grad_norm": 0.07183677804285386, + "language_loss": 0.86159599, + "learning_rate": 0.0007120774547468282, + "loss": 0.87249249, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.27416992, + "step": 1974, + "time_per_iteration": 2.5466248989105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091836, + "balance_loss_mlp": 1.06477594, + "epoch": 0.37995382839553676, + "flos": 481588941312.0, + "grad_norm": 0.057862181788604236, + "language_loss": 0.81643212, + "learning_rate": 0.0007117952841060128, + "loss": 0.82735044, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.27099609, + "step": 1975, + "time_per_iteration": 2.6863863468170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010857, + "balance_loss_mlp": 1.05813885, + "epoch": 0.3801462100808003, + "flos": 560286056448.0, + "grad_norm": 0.06251241790432795, + "language_loss": 0.83861643, + "learning_rate": 0.0007115130312389756, + "loss": 0.84947342, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.27587891, + "step": 1976, + "time_per_iteration": 2.6821115016937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088536, + "balance_loss_mlp": 1.0602119, + "epoch": 0.3803385917660639, + "flos": 464699524608.0, + "grad_norm": 0.063889045898505, + "language_loss": 0.79037011, + "learning_rate": 0.0007112306962552973, + "loss": 0.80125546, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.28320312, + "step": 1977, + "time_per_iteration": 2.5958874225616455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086598, + "balance_loss_mlp": 1.05877423, + "epoch": 0.3805309734513274, + "flos": 521614080000.0, + "grad_norm": 0.055122671956433805, + "language_loss": 0.85178941, + "learning_rate": 0.0007109482792645896, + "loss": 0.8626554, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.27832031, + "step": 1978, + "time_per_iteration": 2.706073760986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081892, + "balance_loss_mlp": 1.05363917, + "epoch": 0.380723355136591, + "flos": 591128782848.0, + "grad_norm": 0.06407360303991923, + "language_loss": 0.83617824, + "learning_rate": 0.0007106657803764969, + "loss": 0.84699714, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.2824707, + "step": 1979, + "time_per_iteration": 2.7429239749908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078619, + "balance_loss_mlp": 1.05022287, + "epoch": 0.38091573682185453, + "flos": 622394910720.0, + "grad_norm": 0.07177583644367627, + "language_loss": 0.8165133, + "learning_rate": 0.0007103831997006948, + "loss": 0.82729954, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.28393555, + "step": 1980, + "time_per_iteration": 2.7360527515411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072489, + "balance_loss_mlp": 1.04361689, + "epoch": 0.3811081185071181, + "flos": 568716208128.0, + "grad_norm": 0.06360208542685557, + "language_loss": 0.85186386, + "learning_rate": 0.0007101005373468908, + "loss": 0.86258882, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.28833008, + "step": 1981, + "time_per_iteration": 2.925529718399048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066146, + "balance_loss_mlp": 1.03775024, + "epoch": 0.3813005001923817, + "flos": 584550798336.0, + "grad_norm": 0.051682910059599525, + "language_loss": 0.86574209, + "learning_rate": 0.0007098177934248242, + "loss": 0.87640351, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.28369141, + "step": 1982, + "time_per_iteration": 2.7813186645507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066502, + "balance_loss_mlp": 1.03770101, + "epoch": 0.38149288187764524, + "flos": 621287649792.0, + "grad_norm": 0.06153978169673806, + "language_loss": 0.85434651, + "learning_rate": 0.0007095349680442661, + "loss": 0.86501151, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.2878418, + "step": 1983, + "time_per_iteration": 2.878678321838379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069354, + "balance_loss_mlp": 1.04062414, + "epoch": 0.3816852635629088, + "flos": 570414196224.0, + "grad_norm": 0.05550499316869274, + "language_loss": 0.78828371, + "learning_rate": 0.0007092520613150188, + "loss": 0.79897726, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.28710938, + "step": 1984, + "time_per_iteration": 2.667602300643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069998, + "balance_loss_mlp": 1.04057729, + "epoch": 0.38187764524817236, + "flos": 565313029632.0, + "grad_norm": 0.04940974411679134, + "language_loss": 0.81105816, + "learning_rate": 0.0007089690733469165, + "loss": 0.82175809, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.29394531, + "step": 1985, + "time_per_iteration": 2.7445921897888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077693, + "balance_loss_mlp": 1.04924965, + "epoch": 0.38207002693343595, + "flos": 630932751360.0, + "grad_norm": 0.0710841944315155, + "language_loss": 0.82154202, + "learning_rate": 0.000708686004249825, + "loss": 0.8323189, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.28442383, + "step": 1986, + "time_per_iteration": 2.803262948989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075438, + "balance_loss_mlp": 1.0459218, + "epoch": 0.3822624086186995, + "flos": 548507980800.0, + "grad_norm": 0.053095768122865476, + "language_loss": 0.91283715, + "learning_rate": 0.0007084028541336413, + "loss": 0.92359161, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.29467773, + "step": 1987, + "time_per_iteration": 2.693894147872925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075901, + "balance_loss_mlp": 1.04807711, + "epoch": 0.38245479030396307, + "flos": 613571880960.0, + "grad_norm": 0.04978295407195845, + "language_loss": 0.86100876, + "learning_rate": 0.0007081196231082942, + "loss": 0.87176782, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.27807617, + "step": 1988, + "time_per_iteration": 2.8127198219299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079083, + "balance_loss_mlp": 1.05097318, + "epoch": 0.38264717198922665, + "flos": 667787466240.0, + "grad_norm": 0.05417702481979702, + "language_loss": 0.80060172, + "learning_rate": 0.0007078363112837436, + "loss": 0.81139255, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.28125, + "step": 1989, + "time_per_iteration": 2.8839027881622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077016, + "balance_loss_mlp": 1.04866838, + "epoch": 0.3828395536744902, + "flos": 454521922560.0, + "grad_norm": 0.05590772319077314, + "language_loss": 0.84895635, + "learning_rate": 0.000707552918769981, + "loss": 0.85972643, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.28344727, + "step": 1990, + "time_per_iteration": 2.4921815395355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075886, + "balance_loss_mlp": 1.04837239, + "epoch": 0.3830319353597538, + "flos": 499191330816.0, + "grad_norm": 0.05219115858491499, + "language_loss": 0.8389315, + "learning_rate": 0.000707269445677029, + "loss": 0.84969032, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.27563477, + "step": 1991, + "time_per_iteration": 2.716742753982544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080405, + "balance_loss_mlp": 1.05205727, + "epoch": 0.3832243170450173, + "flos": 743787021312.0, + "grad_norm": 0.061454112768806295, + "language_loss": 0.85369635, + "learning_rate": 0.0007069858921149416, + "loss": 0.8645004, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.28344727, + "step": 1992, + "time_per_iteration": 2.953749418258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077015, + "balance_loss_mlp": 1.04919195, + "epoch": 0.3834166987302809, + "flos": 577937908224.0, + "grad_norm": 0.04324001999537677, + "language_loss": 0.86024761, + "learning_rate": 0.0007067022581938043, + "loss": 0.87101781, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.27880859, + "step": 1993, + "time_per_iteration": 2.818094491958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072064, + "balance_loss_mlp": 1.04502726, + "epoch": 0.3836090804155444, + "flos": 536194432512.0, + "grad_norm": 0.06003802076808944, + "language_loss": 0.83055973, + "learning_rate": 0.0007064185440237334, + "loss": 0.84128034, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.27075195, + "step": 1994, + "time_per_iteration": 2.7304775714874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078237, + "balance_loss_mlp": 1.05043745, + "epoch": 0.383801462100808, + "flos": 601587191808.0, + "grad_norm": 0.054248337050939024, + "language_loss": 0.84367561, + "learning_rate": 0.0007061347497148764, + "loss": 0.85445797, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.27807617, + "step": 1995, + "time_per_iteration": 2.747483015060425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074409, + "balance_loss_mlp": 1.04706264, + "epoch": 0.38399384378607154, + "flos": 572427896832.0, + "grad_norm": 0.06054830939074019, + "language_loss": 0.86660719, + "learning_rate": 0.0007058508753774122, + "loss": 0.87735128, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.27392578, + "step": 1996, + "time_per_iteration": 2.6960108280181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078362, + "balance_loss_mlp": 1.05165958, + "epoch": 0.38418622547133513, + "flos": 536513117184.0, + "grad_norm": 0.05196412840141252, + "language_loss": 0.86974967, + "learning_rate": 0.0007055669211215505, + "loss": 0.88053334, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.26733398, + "step": 1997, + "time_per_iteration": 2.6327381134033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076337, + "balance_loss_mlp": 1.04775071, + "epoch": 0.3843786071565987, + "flos": 572673798144.0, + "grad_norm": 0.06669720231739994, + "language_loss": 0.77213579, + "learning_rate": 0.0007052828870575322, + "loss": 0.78289914, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.28588867, + "step": 1998, + "time_per_iteration": 2.6813313961029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085238, + "balance_loss_mlp": 1.05808222, + "epoch": 0.38457098884186225, + "flos": 728361275904.0, + "grad_norm": 0.053007093293579055, + "language_loss": 0.8636111, + "learning_rate": 0.0007049987732956291, + "loss": 0.87446344, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.27197266, + "step": 1999, + "time_per_iteration": 2.9743165969848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071346, + "balance_loss_mlp": 1.04323626, + "epoch": 0.38476337052712584, + "flos": 583123442688.0, + "grad_norm": 0.046114011394728885, + "language_loss": 0.82846403, + "learning_rate": 0.0007047145799461439, + "loss": 0.83917749, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.28149414, + "step": 2000, + "time_per_iteration": 2.85295033454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077125, + "balance_loss_mlp": 1.0488013, + "epoch": 0.38495575221238937, + "flos": 552787075584.0, + "grad_norm": 0.06118237782788499, + "language_loss": 0.8185212, + "learning_rate": 0.00070443030711941, + "loss": 0.82929248, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.28295898, + "step": 2001, + "time_per_iteration": 2.7602195739746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078319, + "balance_loss_mlp": 1.04918385, + "epoch": 0.38514813389765296, + "flos": 654173190144.0, + "grad_norm": 0.06801983854699947, + "language_loss": 0.82348108, + "learning_rate": 0.0007041459549257924, + "loss": 0.83426422, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.29101562, + "step": 2002, + "time_per_iteration": 2.8562166690826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074316, + "balance_loss_mlp": 1.04565787, + "epoch": 0.3853405155829165, + "flos": 867715158528.0, + "grad_norm": 0.07124544558687326, + "language_loss": 0.7826004, + "learning_rate": 0.0007038615234756859, + "loss": 0.79334354, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.28662109, + "step": 2003, + "time_per_iteration": 3.1888484954833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071637, + "balance_loss_mlp": 1.0429796, + "epoch": 0.3855328972681801, + "flos": 546164011008.0, + "grad_norm": 0.060193135665447615, + "language_loss": 0.83578098, + "learning_rate": 0.000703577012879517, + "loss": 0.8464973, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.28662109, + "step": 2004, + "time_per_iteration": 2.6438684463500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069967, + "balance_loss_mlp": 1.04185688, + "epoch": 0.3857252789534436, + "flos": 533819939328.0, + "grad_norm": 0.05830751128665357, + "language_loss": 0.8852784, + "learning_rate": 0.0007032924232477423, + "loss": 0.89597809, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.28149414, + "step": 2005, + "time_per_iteration": 2.6632285118103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071337, + "balance_loss_mlp": 1.04253602, + "epoch": 0.3859176606387072, + "flos": 491514849792.0, + "grad_norm": 0.05522600702951118, + "language_loss": 0.8025552, + "learning_rate": 0.0007030077546908493, + "loss": 0.81326854, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.28808594, + "step": 2006, + "time_per_iteration": 2.6748647689819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107831, + "balance_loss_mlp": 1.06600749, + "epoch": 0.3861100423239708, + "flos": 1486278955008.0, + "grad_norm": 0.04192005891791234, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84142971, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.12255859, + "step": 2007, + "time_per_iteration": 4.758062124252319 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084632, + "balance_loss_mlp": 1.05614078, + "epoch": 0.3863024240092343, + "flos": 473493441024.0, + "grad_norm": 0.06495221526254255, + "language_loss": 0.79320729, + "learning_rate": 0.0007024381812438117, + "loss": 0.80405354, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.28515625, + "step": 2008, + "time_per_iteration": 2.557239532470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095356, + "balance_loss_mlp": 1.06607771, + "epoch": 0.3864948056944979, + "flos": 716258723328.0, + "grad_norm": 0.09570560546772983, + "language_loss": 0.83017313, + "learning_rate": 0.0007021532765747951, + "loss": 0.84112668, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.29248047, + "step": 2009, + "time_per_iteration": 2.984100580215454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088616, + "balance_loss_mlp": 1.06031561, + "epoch": 0.38668718737976143, + "flos": 727302067200.0, + "grad_norm": 0.05400711762269546, + "language_loss": 0.78963518, + "learning_rate": 0.0007018682934229162, + "loss": 0.80052131, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.28295898, + "step": 2010, + "time_per_iteration": 2.9302892684936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080883, + "balance_loss_mlp": 1.05220175, + "epoch": 0.386879569065025, + "flos": 525218079744.0, + "grad_norm": 0.05212566321061033, + "language_loss": 0.82523775, + "learning_rate": 0.0007015832318988152, + "loss": 0.83604658, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.28662109, + "step": 2011, + "time_per_iteration": 2.65934157371521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102736, + "balance_loss_mlp": 1.0158205, + "epoch": 0.38707195075028855, + "flos": 1527036005376.0, + "grad_norm": 0.016832038405886617, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74917436, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.11523438, + "step": 2012, + "time_per_iteration": 4.964378595352173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076687, + "balance_loss_mlp": 1.04776716, + "epoch": 0.38726433243555214, + "flos": 557045821440.0, + "grad_norm": 0.05730560331399072, + "language_loss": 0.83868068, + "learning_rate": 0.0007010128741766604, + "loss": 0.84944755, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.28857422, + "step": 2013, + "time_per_iteration": 2.7196977138519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069593, + "balance_loss_mlp": 1.04005277, + "epoch": 0.38745671412081567, + "flos": 553431647232.0, + "grad_norm": 0.0608937159393576, + "language_loss": 0.843593, + "learning_rate": 0.0007007275782000391, + "loss": 0.85428894, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.29492188, + "step": 2014, + "time_per_iteration": 2.635704517364502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072342, + "balance_loss_mlp": 1.04351759, + "epoch": 0.38764909580607926, + "flos": 458175384576.0, + "grad_norm": 0.061731808628827385, + "language_loss": 0.84906852, + "learning_rate": 0.0007004422042940605, + "loss": 0.85979199, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.2878418, + "step": 2015, + "time_per_iteration": 2.500502824783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072405, + "balance_loss_mlp": 1.04246008, + "epoch": 0.38784147749134285, + "flos": 521973462528.0, + "grad_norm": 0.06410146749924231, + "language_loss": 0.89413089, + "learning_rate": 0.0007001567525695169, + "loss": 0.90485489, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.29931641, + "step": 2016, + "time_per_iteration": 2.6305129528045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072622, + "balance_loss_mlp": 1.04410672, + "epoch": 0.3880338591766064, + "flos": 665696600064.0, + "grad_norm": 0.057933083917186774, + "language_loss": 0.83612067, + "learning_rate": 0.0006998712231372303, + "loss": 0.84684694, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.28491211, + "step": 2017, + "time_per_iteration": 3.0175724029541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070577, + "balance_loss_mlp": 1.04141831, + "epoch": 0.38822624086186996, + "flos": 593660427264.0, + "grad_norm": 0.04866320553491467, + "language_loss": 0.86211008, + "learning_rate": 0.0006995856161080532, + "loss": 0.87281585, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.29101562, + "step": 2018, + "time_per_iteration": 2.879014015197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071313, + "balance_loss_mlp": 1.04193974, + "epoch": 0.3884186225471335, + "flos": 612256596480.0, + "grad_norm": 0.05910223086818918, + "language_loss": 0.81994784, + "learning_rate": 0.0006992999315928679, + "loss": 0.83066106, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.29345703, + "step": 2019, + "time_per_iteration": 2.794605255126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078638, + "balance_loss_mlp": 1.04826391, + "epoch": 0.3886110042323971, + "flos": 606737820672.0, + "grad_norm": 0.0551019421553566, + "language_loss": 0.86098075, + "learning_rate": 0.0006990141697025871, + "loss": 0.8717671, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.3034668, + "step": 2020, + "time_per_iteration": 2.808492422103882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056045, + "balance_loss_mlp": 1.04388523, + "epoch": 0.3888033859176606, + "flos": 1527289108992.0, + "grad_norm": 0.03291843471702338, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77415681, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.12158203, + "step": 2021, + "time_per_iteration": 4.747381687164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070034, + "balance_loss_mlp": 1.04109025, + "epoch": 0.3889957676029242, + "flos": 692145340416.0, + "grad_norm": 0.0700535467402408, + "language_loss": 0.82436341, + "learning_rate": 0.0006984424142405392, + "loss": 0.83506376, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.28930664, + "step": 2022, + "time_per_iteration": 2.8081154823303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070367, + "balance_loss_mlp": 1.04144704, + "epoch": 0.3891881492881878, + "flos": 514937170944.0, + "grad_norm": 0.06604387927811756, + "language_loss": 0.81889653, + "learning_rate": 0.0006981564208907474, + "loss": 0.82960021, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.2890625, + "step": 2023, + "time_per_iteration": 2.615868091583252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067731, + "balance_loss_mlp": 1.03947854, + "epoch": 0.3893805309734513, + "flos": 628770663936.0, + "grad_norm": 0.05337785231387105, + "language_loss": 0.90169919, + "learning_rate": 0.0006978703506098102, + "loss": 0.91237652, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.2824707, + "step": 2024, + "time_per_iteration": 2.7487242221832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070747, + "balance_loss_mlp": 1.04292357, + "epoch": 0.3895729126587149, + "flos": 543894234624.0, + "grad_norm": 0.05102180718564601, + "language_loss": 0.87631416, + "learning_rate": 0.00069758420350879, + "loss": 0.88702166, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.27832031, + "step": 2025, + "time_per_iteration": 2.6278607845306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066065, + "balance_loss_mlp": 1.03802657, + "epoch": 0.38976529434397844, + "flos": 617987778048.0, + "grad_norm": 0.05496821729843788, + "language_loss": 0.85941356, + "learning_rate": 0.000697297979698779, + "loss": 0.87007421, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.28051758, + "step": 2026, + "time_per_iteration": 2.773711919784546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072256, + "balance_loss_mlp": 1.0449574, + "epoch": 0.38995767602924203, + "flos": 834518287872.0, + "grad_norm": 0.054849440695872026, + "language_loss": 0.83735013, + "learning_rate": 0.0006970116792908992, + "loss": 0.84807271, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.27368164, + "step": 2027, + "time_per_iteration": 3.1274263858795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071715, + "balance_loss_mlp": 1.04348612, + "epoch": 0.39015005771450556, + "flos": 541343651328.0, + "grad_norm": 0.0501662810644282, + "language_loss": 0.80959415, + "learning_rate": 0.000696725302396302, + "loss": 0.82031131, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.28222656, + "step": 2028, + "time_per_iteration": 2.653289318084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078388, + "balance_loss_mlp": 1.050946, + "epoch": 0.39034243939976915, + "flos": 1007102536704.0, + "grad_norm": 0.053195529027894116, + "language_loss": 0.85790342, + "learning_rate": 0.0006964388491261692, + "loss": 0.86868727, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.2746582, + "step": 2029, + "time_per_iteration": 3.2972047328948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082882, + "balance_loss_mlp": 1.0550828, + "epoch": 0.3905348210850327, + "flos": 678723121152.0, + "grad_norm": 0.06114884672927749, + "language_loss": 0.87352717, + "learning_rate": 0.0006961523195917114, + "loss": 0.88435602, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.27832031, + "step": 2030, + "time_per_iteration": 2.8415944576263428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083514, + "balance_loss_mlp": 1.0548079, + "epoch": 0.39072720277029627, + "flos": 548606905344.0, + "grad_norm": 0.056999957489140544, + "language_loss": 0.78065526, + "learning_rate": 0.0006958657139041696, + "loss": 0.79149044, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.28686523, + "step": 2031, + "time_per_iteration": 2.750596761703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027529, + "balance_loss_mlp": 1.01660919, + "epoch": 0.39091958445555985, + "flos": 1546912401408.0, + "grad_norm": 0.015090316928766313, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77740502, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.109375, + "step": 2032, + "time_per_iteration": 4.916932106018066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080774, + "balance_loss_mlp": 1.05371356, + "epoch": 0.3911119661408234, + "flos": 503741058048.0, + "grad_norm": 0.058882626995900515, + "language_loss": 0.77978921, + "learning_rate": 0.0006952922745149434, + "loss": 0.7905969, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.27099609, + "step": 2033, + "time_per_iteration": 2.6288254261016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076329, + "balance_loss_mlp": 1.04802871, + "epoch": 0.391304347826087, + "flos": 556967245824.0, + "grad_norm": 0.059683993490508125, + "language_loss": 0.8774389, + "learning_rate": 0.000695005441035888, + "loss": 0.88820225, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.28295898, + "step": 2034, + "time_per_iteration": 2.6451032161712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021075, + "balance_loss_mlp": 1.01001287, + "epoch": 0.3914967295113505, + "flos": 1499309858304.0, + "grad_norm": 0.012767183735830537, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74744511, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.11083984, + "step": 2035, + "time_per_iteration": 4.875540018081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081101, + "balance_loss_mlp": 1.05346835, + "epoch": 0.3916891111966141, + "flos": 706715518464.0, + "grad_norm": 0.05871453648610719, + "language_loss": 0.8120997, + "learning_rate": 0.0006944315470656863, + "loss": 0.82291067, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.27685547, + "step": 2036, + "time_per_iteration": 2.9991486072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079422, + "balance_loss_mlp": 1.05193281, + "epoch": 0.3918814928818776, + "flos": 556085537280.0, + "grad_norm": 0.05954449002694624, + "language_loss": 0.90806162, + "learning_rate": 0.000694144486797345, + "loss": 0.91885585, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.27539062, + "step": 2037, + "time_per_iteration": 2.652540445327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016452, + "balance_loss_mlp": 1.00543678, + "epoch": 0.3920738745671412, + "flos": 1537845032448.0, + "grad_norm": 0.010331538207496795, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80536884, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.11035156, + "step": 2038, + "time_per_iteration": 4.696615695953369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077334, + "balance_loss_mlp": 1.04920101, + "epoch": 0.39226625625240474, + "flos": 498594811392.0, + "grad_norm": 0.05886678367995608, + "language_loss": 0.89078939, + "learning_rate": 0.0006935701402514156, + "loss": 0.90156269, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.28149414, + "step": 2039, + "time_per_iteration": 2.555340051651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013561, + "balance_loss_mlp": 1.00254571, + "epoch": 0.39245863793766833, + "flos": 1346465203200.0, + "grad_norm": 0.009976601144167605, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74048454, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.11035156, + "step": 2040, + "time_per_iteration": 4.91499400138855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.04941869, + "epoch": 0.3926510196229319, + "flos": 1345614474240.0, + "grad_norm": 0.0656092448350418, + "language_loss": 0.84421289, + "learning_rate": 0.0006929954931031422, + "loss": 0.8549906, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.28344727, + "step": 2041, + "time_per_iteration": 3.729060649871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079221, + "balance_loss_mlp": 1.0521127, + "epoch": 0.39284340130819545, + "flos": 499333925376.0, + "grad_norm": 0.05672023255092622, + "language_loss": 0.88579351, + "learning_rate": 0.0006927080570819805, + "loss": 0.8965857, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.27148438, + "step": 2042, + "time_per_iteration": 2.5964105129241943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082321, + "balance_loss_mlp": 1.05557048, + "epoch": 0.39303578299345904, + "flos": 520077625344.0, + "grad_norm": 0.07129276434353096, + "language_loss": 0.81115568, + "learning_rate": 0.0006924205462449161, + "loss": 0.82197881, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.26806641, + "step": 2043, + "time_per_iteration": 2.585873603820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080679, + "balance_loss_mlp": 1.0537734, + "epoch": 0.39322816467872257, + "flos": 907529301504.0, + "grad_norm": 0.07610386660927036, + "language_loss": 0.8177464, + "learning_rate": 0.0006921329607035702, + "loss": 0.8285532, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.26940918, + "step": 2044, + "time_per_iteration": 3.238981246948242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087504, + "balance_loss_mlp": 1.0611347, + "epoch": 0.39342054636398616, + "flos": 517330603008.0, + "grad_norm": 0.0570655681013956, + "language_loss": 0.87757248, + "learning_rate": 0.0006918453005695938, + "loss": 0.88844752, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.26416016, + "step": 2045, + "time_per_iteration": 2.6602108478546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091027, + "balance_loss_mlp": 1.06491971, + "epoch": 0.3936129280492497, + "flos": 547646621184.0, + "grad_norm": 0.055879562404771856, + "language_loss": 0.84307766, + "learning_rate": 0.0006915575659546662, + "loss": 0.85398793, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.26147461, + "step": 2046, + "time_per_iteration": 2.6592600345611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091389, + "balance_loss_mlp": 1.06476951, + "epoch": 0.3938053097345133, + "flos": 525858269184.0, + "grad_norm": 0.06494345942268129, + "language_loss": 0.80426449, + "learning_rate": 0.0006912697569704959, + "loss": 0.81517833, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.26623535, + "step": 2047, + "time_per_iteration": 2.613070011138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080678, + "balance_loss_mlp": 1.0539515, + "epoch": 0.39399769141977686, + "flos": 471390990336.0, + "grad_norm": 0.06871552578761372, + "language_loss": 0.86815077, + "learning_rate": 0.0006909818737288205, + "loss": 0.87895757, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.26745605, + "step": 2048, + "time_per_iteration": 2.5862643718719482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086213, + "balance_loss_mlp": 1.05919969, + "epoch": 0.3941900731050404, + "flos": 501490220544.0, + "grad_norm": 0.055462609864315775, + "language_loss": 0.80754077, + "learning_rate": 0.000690693916341406, + "loss": 0.81840289, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.27075195, + "step": 2049, + "time_per_iteration": 2.668114185333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010802, + "balance_loss_mlp": 1.0532347, + "epoch": 0.394382454790304, + "flos": 580577241600.0, + "grad_norm": 0.05123788091691057, + "language_loss": 0.8241666, + "learning_rate": 0.0006904058849200475, + "loss": 0.83496863, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.27001953, + "step": 2050, + "time_per_iteration": 2.7161009311676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084281, + "balance_loss_mlp": 1.05679107, + "epoch": 0.3945748364755675, + "flos": 513563659776.0, + "grad_norm": 0.06391064418382593, + "language_loss": 0.84741384, + "learning_rate": 0.0006901177795765683, + "loss": 0.8582567, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.27514648, + "step": 2051, + "time_per_iteration": 2.6012356281280518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082278, + "balance_loss_mlp": 1.05540872, + "epoch": 0.3947672181608311, + "flos": 593683748352.0, + "grad_norm": 0.059538956745971455, + "language_loss": 0.8114661, + "learning_rate": 0.0006898296004228213, + "loss": 0.82228893, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.26879883, + "step": 2052, + "time_per_iteration": 2.739016056060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091682, + "balance_loss_mlp": 1.07909358, + "epoch": 0.39495959984609463, + "flos": 1546829443584.0, + "grad_norm": 0.0435951911950544, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79218423, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.12597656, + "step": 2053, + "time_per_iteration": 4.853093385696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107698, + "balance_loss_mlp": 1.0498004, + "epoch": 0.3951519815313582, + "flos": 496271190528.0, + "grad_norm": 0.061585922129253, + "language_loss": 0.79790258, + "learning_rate": 0.0006892530211320763, + "loss": 0.80867237, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.2722168, + "step": 2054, + "time_per_iteration": 2.695810317993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077935, + "balance_loss_mlp": 1.05135143, + "epoch": 0.39534436321662175, + "flos": 530934704640.0, + "grad_norm": 0.06739666157176663, + "language_loss": 0.83483803, + "learning_rate": 0.000688964621218926, + "loss": 0.84561741, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.26611328, + "step": 2055, + "time_per_iteration": 2.5957767963409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070974, + "balance_loss_mlp": 1.04496288, + "epoch": 0.39553674490188534, + "flos": 702224017920.0, + "grad_norm": 0.05900978816729325, + "language_loss": 0.79760778, + "learning_rate": 0.0006886761479432037, + "loss": 0.80831754, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.26037598, + "step": 2056, + "time_per_iteration": 2.823195457458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075075, + "balance_loss_mlp": 1.0479672, + "epoch": 0.3957291265871489, + "flos": 409552768512.0, + "grad_norm": 0.06325658180551426, + "language_loss": 0.84495139, + "learning_rate": 0.0006883876014169045, + "loss": 0.85570216, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.27148438, + "step": 2057, + "time_per_iteration": 2.504899263381958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_mlp": 1.05080771, + "epoch": 0.39592150827241246, + "flos": 618204566016.0, + "grad_norm": 0.05952155235087993, + "language_loss": 0.90666497, + "learning_rate": 0.000688098981752052, + "loss": 0.91744673, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.27441406, + "step": 2058, + "time_per_iteration": 2.705845832824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079753, + "balance_loss_mlp": 1.05207229, + "epoch": 0.39611388995767605, + "flos": 820986969600.0, + "grad_norm": 0.057037005783434964, + "language_loss": 0.80068249, + "learning_rate": 0.0006878102890606982, + "loss": 0.81147999, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.27709961, + "step": 2059, + "time_per_iteration": 3.086745500564575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108134, + "balance_loss_mlp": 1.0542556, + "epoch": 0.3963062716429396, + "flos": 491977539072.0, + "grad_norm": 0.07822530462482143, + "language_loss": 0.80866635, + "learning_rate": 0.0006875215234549239, + "loss": 0.8194797, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.27124023, + "step": 2060, + "time_per_iteration": 2.5814599990844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080083, + "balance_loss_mlp": 1.05221188, + "epoch": 0.39649865332820317, + "flos": 584466430464.0, + "grad_norm": 0.06673254145899743, + "language_loss": 0.85142004, + "learning_rate": 0.0006872326850468376, + "loss": 0.86222088, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.27880859, + "step": 2061, + "time_per_iteration": 2.6693742275238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081472, + "balance_loss_mlp": 1.05343366, + "epoch": 0.3966910350134667, + "flos": 458328153600.0, + "grad_norm": 0.06184749895138045, + "language_loss": 0.78875667, + "learning_rate": 0.0006869437739485762, + "loss": 0.79957139, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.28051758, + "step": 2062, + "time_per_iteration": 2.612020969390869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108316, + "balance_loss_mlp": 1.05493176, + "epoch": 0.3968834166987303, + "flos": 508388299776.0, + "grad_norm": 0.07174128592683177, + "language_loss": 0.92295337, + "learning_rate": 0.0006866547902723053, + "loss": 0.93378496, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.2824707, + "step": 2063, + "time_per_iteration": 2.676013469696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108135, + "balance_loss_mlp": 1.05300224, + "epoch": 0.3970757983839938, + "flos": 572349321216.0, + "grad_norm": 0.05898261192449876, + "language_loss": 0.80094039, + "learning_rate": 0.000686365734130218, + "loss": 0.81175387, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.28369141, + "step": 2064, + "time_per_iteration": 2.7021024227142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071448, + "balance_loss_mlp": 1.0426228, + "epoch": 0.3972681800692574, + "flos": 481391092224.0, + "grad_norm": 0.09101918864834832, + "language_loss": 0.83948302, + "learning_rate": 0.000686076605634536, + "loss": 0.85019755, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.28808594, + "step": 2065, + "time_per_iteration": 2.6558356285095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068247, + "balance_loss_mlp": 1.03963661, + "epoch": 0.397460561754521, + "flos": 487683887616.0, + "grad_norm": 0.05840936356543045, + "language_loss": 0.83999312, + "learning_rate": 0.0006857874048975088, + "loss": 0.85067558, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.28613281, + "step": 2066, + "time_per_iteration": 2.556900978088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068316, + "balance_loss_mlp": 1.04027796, + "epoch": 0.3976529434397845, + "flos": 421768802304.0, + "grad_norm": 0.07585091480167282, + "language_loss": 0.87176585, + "learning_rate": 0.0006854981320314142, + "loss": 0.88244903, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.28027344, + "step": 2067, + "time_per_iteration": 2.445798635482788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107237, + "balance_loss_mlp": 1.04426003, + "epoch": 0.3978453251250481, + "flos": 545331764736.0, + "grad_norm": 0.08763476788371415, + "language_loss": 0.86982906, + "learning_rate": 0.0006852087871485579, + "loss": 0.88055265, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.28125, + "step": 2068, + "time_per_iteration": 2.6390161514282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076434, + "balance_loss_mlp": 1.04861069, + "epoch": 0.39803770681031164, + "flos": 650548841472.0, + "grad_norm": 0.065510260101048, + "language_loss": 0.82088625, + "learning_rate": 0.0006849193703612735, + "loss": 0.83165061, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.27856445, + "step": 2069, + "time_per_iteration": 2.763023614883423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071, + "balance_loss_mlp": 1.04346275, + "epoch": 0.39823008849557523, + "flos": 739734888960.0, + "grad_norm": 0.058439166966186944, + "language_loss": 0.77565378, + "learning_rate": 0.0006846298817819225, + "loss": 0.78636372, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.27563477, + "step": 2070, + "time_per_iteration": 2.948054790496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070331, + "balance_loss_mlp": 1.04296088, + "epoch": 0.39842247018083876, + "flos": 384825337344.0, + "grad_norm": 0.06370866866163034, + "language_loss": 0.80921137, + "learning_rate": 0.0006843403215228945, + "loss": 0.8199147, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.27392578, + "step": 2071, + "time_per_iteration": 2.440274953842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075017, + "balance_loss_mlp": 1.04771829, + "epoch": 0.39861485186610235, + "flos": 533431443456.0, + "grad_norm": 0.05754797735781241, + "language_loss": 0.80491692, + "learning_rate": 0.0006840506896966065, + "loss": 0.81566709, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.2734375, + "step": 2072, + "time_per_iteration": 2.7141849994659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076402, + "balance_loss_mlp": 1.04874492, + "epoch": 0.39880723355136594, + "flos": 642834482688.0, + "grad_norm": 0.06436648215160112, + "language_loss": 0.82351565, + "learning_rate": 0.0006837609864155038, + "loss": 0.83427966, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.27685547, + "step": 2073, + "time_per_iteration": 2.8728160858154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107952, + "balance_loss_mlp": 1.05267441, + "epoch": 0.39899961523662947, + "flos": 515587534848.0, + "grad_norm": 0.06075069456973031, + "language_loss": 0.83255166, + "learning_rate": 0.0006834712117920592, + "loss": 0.84334683, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.26855469, + "step": 2074, + "time_per_iteration": 2.6078460216522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081959, + "balance_loss_mlp": 1.05458879, + "epoch": 0.39919199692189306, + "flos": 464148085248.0, + "grad_norm": 0.08105254072349301, + "language_loss": 0.85028476, + "learning_rate": 0.0006831813659387729, + "loss": 0.86110437, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.27416992, + "step": 2075, + "time_per_iteration": 2.5435502529144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080066, + "balance_loss_mlp": 1.05236197, + "epoch": 0.3993843786071566, + "flos": 531382837248.0, + "grad_norm": 0.05543733258884828, + "language_loss": 0.84105802, + "learning_rate": 0.0006828914489681733, + "loss": 0.85185862, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.27758789, + "step": 2076, + "time_per_iteration": 2.716728687286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079901, + "balance_loss_mlp": 1.05186319, + "epoch": 0.3995767602924202, + "flos": 503701770240.0, + "grad_norm": 0.05894989539880716, + "language_loss": 0.8515023, + "learning_rate": 0.0006826014609928162, + "loss": 0.86230129, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.28027344, + "step": 2077, + "time_per_iteration": 2.740797996520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036252, + "balance_loss_mlp": 1.02490366, + "epoch": 0.3997691419776837, + "flos": 1453780500480.0, + "grad_norm": 0.025465037646940157, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84235638, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.11328125, + "step": 2078, + "time_per_iteration": 4.832703590393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080719, + "balance_loss_mlp": 1.05287147, + "epoch": 0.3999615236629473, + "flos": 530418170880.0, + "grad_norm": 0.11662193334808049, + "language_loss": 0.8017869, + "learning_rate": 0.0006820212724781896, + "loss": 0.81259406, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.27880859, + "step": 2079, + "time_per_iteration": 2.6742663383483887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076717, + "balance_loss_mlp": 1.0488224, + "epoch": 0.4001539053482108, + "flos": 694823961600.0, + "grad_norm": 0.08177152300224107, + "language_loss": 0.83806193, + "learning_rate": 0.0006817310721641694, + "loss": 0.84882903, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.27905273, + "step": 2080, + "time_per_iteration": 2.8349008560180664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076923, + "balance_loss_mlp": 1.04929078, + "epoch": 0.4003462870334744, + "flos": 520102356480.0, + "grad_norm": 0.06565277329590896, + "language_loss": 0.84214735, + "learning_rate": 0.00068144080129589, + "loss": 0.8529166, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.27685547, + "step": 2081, + "time_per_iteration": 2.6278159618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084354, + "balance_loss_mlp": 1.05710232, + "epoch": 0.400538668718738, + "flos": 492272902656.0, + "grad_norm": 0.05776018351639151, + "language_loss": 0.82856774, + "learning_rate": 0.0006811504599860441, + "loss": 0.83941126, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.27294922, + "step": 2082, + "time_per_iteration": 2.569265365600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088899, + "balance_loss_mlp": 1.06140924, + "epoch": 0.40073105040400153, + "flos": 490083111936.0, + "grad_norm": 0.07401045054208001, + "language_loss": 0.85797036, + "learning_rate": 0.0006808600483473526, + "loss": 0.86885935, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.27490234, + "step": 2083, + "time_per_iteration": 2.8923354148864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079743, + "balance_loss_mlp": 1.05170512, + "epoch": 0.4009234320892651, + "flos": 562088761344.0, + "grad_norm": 0.06499053200862517, + "language_loss": 0.86023808, + "learning_rate": 0.0006805695664925629, + "loss": 0.87103558, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.28027344, + "step": 2084, + "time_per_iteration": 2.8025314807891846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082967, + "balance_loss_mlp": 1.05461943, + "epoch": 0.40111581377452865, + "flos": 425786029056.0, + "grad_norm": 0.06817943175075042, + "language_loss": 0.8386181, + "learning_rate": 0.0006802790145344506, + "loss": 0.84944773, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.28344727, + "step": 2085, + "time_per_iteration": 2.5035839080810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075393, + "balance_loss_mlp": 1.04725957, + "epoch": 0.40130819545979224, + "flos": 612148907520.0, + "grad_norm": 0.06401081868364573, + "language_loss": 0.87169802, + "learning_rate": 0.0006799883925858176, + "loss": 0.88245201, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.28125, + "step": 2086, + "time_per_iteration": 2.8827152252197266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088527, + "balance_loss_mlp": 1.05989313, + "epoch": 0.40150057714505577, + "flos": 523179648000.0, + "grad_norm": 0.06559731004413262, + "language_loss": 0.85316324, + "learning_rate": 0.0006796977007594933, + "loss": 0.86404848, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.28637695, + "step": 2087, + "time_per_iteration": 2.5959601402282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094266, + "balance_loss_mlp": 1.06553721, + "epoch": 0.40169295883031936, + "flos": 561143033856.0, + "grad_norm": 0.12268552055269868, + "language_loss": 0.86342102, + "learning_rate": 0.0006794069391683345, + "loss": 0.87436372, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.28710938, + "step": 2088, + "time_per_iteration": 2.7393155097961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089464, + "balance_loss_mlp": 1.06087732, + "epoch": 0.4018853405155829, + "flos": 518743401984.0, + "grad_norm": 0.0717880154934153, + "language_loss": 0.80560589, + "learning_rate": 0.0006791161079252248, + "loss": 0.81650054, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.28588867, + "step": 2089, + "time_per_iteration": 2.608919858932495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098549, + "balance_loss_mlp": 1.06879497, + "epoch": 0.4020777222008465, + "flos": 525957193728.0, + "grad_norm": 0.06954460778471602, + "language_loss": 0.8248291, + "learning_rate": 0.0006788252071430747, + "loss": 0.83581454, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.29711914, + "step": 2090, + "time_per_iteration": 2.682352304458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103196, + "balance_loss_mlp": 1.07429934, + "epoch": 0.40227010388611006, + "flos": 525494504448.0, + "grad_norm": 0.07587120880411238, + "language_loss": 0.8680824, + "learning_rate": 0.0006785342369348222, + "loss": 0.87911433, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.28857422, + "step": 2091, + "time_per_iteration": 2.7333736419677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104599, + "balance_loss_mlp": 1.07579792, + "epoch": 0.4024624855713736, + "flos": 432074442240.0, + "grad_norm": 0.07069251800195664, + "language_loss": 0.7977879, + "learning_rate": 0.0006782431974134316, + "loss": 0.8088339, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.2878418, + "step": 2092, + "time_per_iteration": 2.541607141494751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105121, + "balance_loss_mlp": 1.0768441, + "epoch": 0.4026548672566372, + "flos": 766304312832.0, + "grad_norm": 0.05426777537327344, + "language_loss": 0.89421535, + "learning_rate": 0.0006779520886918949, + "loss": 0.90526658, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.2824707, + "step": 2093, + "time_per_iteration": 3.035090684890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102418, + "balance_loss_mlp": 1.07378376, + "epoch": 0.4028472489419007, + "flos": 642636633600.0, + "grad_norm": 0.07593649947233896, + "language_loss": 0.81461406, + "learning_rate": 0.0006776609108832301, + "loss": 0.82563823, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.28637695, + "step": 2094, + "time_per_iteration": 2.8035519123077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102, + "balance_loss_mlp": 1.07398582, + "epoch": 0.4030396306271643, + "flos": 491593425408.0, + "grad_norm": 0.07164022458424311, + "language_loss": 0.85034972, + "learning_rate": 0.0006773696641004828, + "loss": 0.86136973, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.28027344, + "step": 2095, + "time_per_iteration": 2.6036603450775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100016, + "balance_loss_mlp": 1.07147717, + "epoch": 0.40323201231242783, + "flos": 901363133952.0, + "grad_norm": 0.07309254376996902, + "language_loss": 0.77576917, + "learning_rate": 0.0006770783484567247, + "loss": 0.78676933, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.28515625, + "step": 2096, + "time_per_iteration": 3.1005897521972656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093493, + "balance_loss_mlp": 1.06557441, + "epoch": 0.4034243939976914, + "flos": 570267219456.0, + "grad_norm": 0.04872529153034484, + "language_loss": 0.86118937, + "learning_rate": 0.000676786964065055, + "loss": 0.87212431, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.27978516, + "step": 2097, + "time_per_iteration": 2.78965163230896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093986, + "balance_loss_mlp": 1.06680584, + "epoch": 0.403616775682955, + "flos": 507206845440.0, + "grad_norm": 0.06867709967223685, + "language_loss": 0.78839391, + "learning_rate": 0.0006764955110385986, + "loss": 0.79933375, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.2722168, + "step": 2098, + "time_per_iteration": 2.7579219341278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090863, + "balance_loss_mlp": 1.06361151, + "epoch": 0.40380915736821854, + "flos": 519127515648.0, + "grad_norm": 0.0577520756279271, + "language_loss": 0.80600876, + "learning_rate": 0.0006762039894905083, + "loss": 0.81691736, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.27294922, + "step": 2099, + "time_per_iteration": 2.632434129714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083488, + "balance_loss_mlp": 1.05595064, + "epoch": 0.40400153905348213, + "flos": 441686048256.0, + "grad_norm": 0.06925599284799831, + "language_loss": 0.80233157, + "learning_rate": 0.000675912399533962, + "loss": 0.8131665, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.27563477, + "step": 2100, + "time_per_iteration": 2.521758556365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086411, + "balance_loss_mlp": 1.05947018, + "epoch": 0.40419392073874566, + "flos": 771961300992.0, + "grad_norm": 0.05734073179456058, + "language_loss": 0.84850854, + "learning_rate": 0.0006756207412821656, + "loss": 0.85937262, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.26977539, + "step": 2101, + "time_per_iteration": 3.043041944503784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079398, + "balance_loss_mlp": 1.05245721, + "epoch": 0.40438630242400925, + "flos": 766215562752.0, + "grad_norm": 0.07220576126006613, + "language_loss": 0.80240154, + "learning_rate": 0.0006753290148483505, + "loss": 0.81319559, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.27001953, + "step": 2102, + "time_per_iteration": 3.0245606899261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085045, + "balance_loss_mlp": 1.05726886, + "epoch": 0.4045786841092728, + "flos": 415013317632.0, + "grad_norm": 0.06170005058098184, + "language_loss": 0.78875476, + "learning_rate": 0.0006750372203457752, + "loss": 0.79960519, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.27832031, + "step": 2103, + "time_per_iteration": 2.484698534011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078758, + "balance_loss_mlp": 1.05131626, + "epoch": 0.40477106579453637, + "flos": 538941454848.0, + "grad_norm": 0.05090920908511917, + "language_loss": 0.86534655, + "learning_rate": 0.0006747453578877242, + "loss": 0.87613416, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.27490234, + "step": 2104, + "time_per_iteration": 2.69670033454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081019, + "balance_loss_mlp": 1.05281401, + "epoch": 0.4049634474797999, + "flos": 826358768640.0, + "grad_norm": 0.06546748387286302, + "language_loss": 0.8289392, + "learning_rate": 0.0006744534275875085, + "loss": 0.83974934, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.28222656, + "step": 2105, + "time_per_iteration": 2.9919168949127197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083118, + "balance_loss_mlp": 1.05620074, + "epoch": 0.4051558291650635, + "flos": 572417722368.0, + "grad_norm": 0.0635527467859112, + "language_loss": 0.8582921, + "learning_rate": 0.0006741614295584657, + "loss": 0.86912322, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.26977539, + "step": 2106, + "time_per_iteration": 2.6488401889801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107849, + "balance_loss_mlp": 1.05073833, + "epoch": 0.4053482108503271, + "flos": 731541874176.0, + "grad_norm": 0.057690605181557136, + "language_loss": 0.78413224, + "learning_rate": 0.0006738693639139595, + "loss": 0.79491717, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.27807617, + "step": 2107, + "time_per_iteration": 2.9652647972106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078123, + "balance_loss_mlp": 1.05015635, + "epoch": 0.4055405925355906, + "flos": 1212588292608.0, + "grad_norm": 0.05945372540383898, + "language_loss": 0.77655667, + "learning_rate": 0.0006735772307673796, + "loss": 0.78733784, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.27978516, + "step": 2108, + "time_per_iteration": 3.5789337158203125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079955, + "balance_loss_mlp": 1.05222702, + "epoch": 0.4057329742208542, + "flos": 715553104896.0, + "grad_norm": 0.05752735064114104, + "language_loss": 0.83347392, + "learning_rate": 0.0006732850302321421, + "loss": 0.84427351, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.27783203, + "step": 2109, + "time_per_iteration": 2.869591236114502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078846, + "balance_loss_mlp": 1.051476, + "epoch": 0.4059253559061177, + "flos": 564623377920.0, + "grad_norm": 0.06455621073123653, + "language_loss": 0.84327263, + "learning_rate": 0.00067299276242169, + "loss": 0.85406113, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.27441406, + "step": 2110, + "time_per_iteration": 2.673659563064575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082258, + "balance_loss_mlp": 1.07071877, + "epoch": 0.4061177375913813, + "flos": 1592886919680.0, + "grad_norm": 0.036236061846660186, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75464427, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.11523438, + "step": 2111, + "time_per_iteration": 4.886230230331421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082274, + "balance_loss_mlp": 1.05490351, + "epoch": 0.40631011927664484, + "flos": 615122892288.0, + "grad_norm": 0.05646906793429633, + "language_loss": 0.77664089, + "learning_rate": 0.0006724080254290395, + "loss": 0.78746361, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.27416992, + "step": 2112, + "time_per_iteration": 2.8506221771240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076539, + "balance_loss_mlp": 1.04847741, + "epoch": 0.40650250096190843, + "flos": 557390647296.0, + "grad_norm": 0.06356712121797842, + "language_loss": 0.89422435, + "learning_rate": 0.0006721155564738566, + "loss": 0.90498972, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.28100586, + "step": 2113, + "time_per_iteration": 2.673015832901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037996, + "balance_loss_mlp": 1.02626586, + "epoch": 0.40669488264717196, + "flos": 1579301756928.0, + "grad_norm": 0.019828324636468348, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79660642, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.1171875, + "step": 2114, + "time_per_iteration": 5.003857851028442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080097, + "balance_loss_mlp": 1.0521065, + "epoch": 0.40688726433243555, + "flos": 507398902272.0, + "grad_norm": 0.07124796283110259, + "language_loss": 0.85397822, + "learning_rate": 0.0006715304182135078, + "loss": 0.86477917, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.2800293, + "step": 2115, + "time_per_iteration": 2.641721248626709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108294, + "balance_loss_mlp": 1.05418694, + "epoch": 0.40707964601769914, + "flos": 588757109760.0, + "grad_norm": 0.08996962933736626, + "language_loss": 0.88862896, + "learning_rate": 0.0006712377491355127, + "loss": 0.89945835, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.28735352, + "step": 2116, + "time_per_iteration": 2.880159616470337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077208, + "balance_loss_mlp": 1.04857373, + "epoch": 0.40727202770296267, + "flos": 580134901248.0, + "grad_norm": 0.046629180459365246, + "language_loss": 0.81631374, + "learning_rate": 0.0006709450135771274, + "loss": 0.82708585, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.28637695, + "step": 2117, + "time_per_iteration": 2.9391822814941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107698, + "balance_loss_mlp": 1.04953849, + "epoch": 0.40746440938822626, + "flos": 503819633664.0, + "grad_norm": 0.05926883506924263, + "language_loss": 0.86382973, + "learning_rate": 0.0006706522116520023, + "loss": 0.87459958, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.27490234, + "step": 2118, + "time_per_iteration": 2.6404170989990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078957, + "balance_loss_mlp": 1.05072808, + "epoch": 0.4076567910734898, + "flos": 605323611648.0, + "grad_norm": 0.06371775766221305, + "language_loss": 0.82902479, + "learning_rate": 0.0006703593434738127, + "loss": 0.83981442, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.28222656, + "step": 2119, + "time_per_iteration": 2.6982903480529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080441, + "balance_loss_mlp": 1.05216455, + "epoch": 0.4078491727587534, + "flos": 479313372672.0, + "grad_norm": 0.05030428863920766, + "language_loss": 0.78137958, + "learning_rate": 0.0006700664091562604, + "loss": 0.792184, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.28271484, + "step": 2120, + "time_per_iteration": 2.5976343154907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081224, + "balance_loss_mlp": 1.05259037, + "epoch": 0.4080415544440169, + "flos": 510126985728.0, + "grad_norm": 0.05481620044617693, + "language_loss": 0.85151196, + "learning_rate": 0.0006697734088130725, + "loss": 0.86232412, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.28637695, + "step": 2121, + "time_per_iteration": 2.613192558288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085667, + "balance_loss_mlp": 1.05665159, + "epoch": 0.4082339361292805, + "flos": 734318009856.0, + "grad_norm": 0.0674188074849357, + "language_loss": 0.85445356, + "learning_rate": 0.0006694803425580018, + "loss": 0.86531019, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.28955078, + "step": 2122, + "time_per_iteration": 2.9808695316314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085803, + "balance_loss_mlp": 1.05585766, + "epoch": 0.4084263178145441, + "flos": 457239831552.0, + "grad_norm": 0.06189748292204317, + "language_loss": 0.8466748, + "learning_rate": 0.0006691872105048268, + "loss": 0.85753286, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.29907227, + "step": 2123, + "time_per_iteration": 2.5712099075317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089254, + "balance_loss_mlp": 1.05992901, + "epoch": 0.4086186994998076, + "flos": 562659139584.0, + "grad_norm": 0.06907127419859461, + "language_loss": 0.84616292, + "learning_rate": 0.0006688940127673513, + "loss": 0.85705543, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.29296875, + "step": 2124, + "time_per_iteration": 2.6865010261535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091737, + "balance_loss_mlp": 1.06181526, + "epoch": 0.4088110811850712, + "flos": 573364859904.0, + "grad_norm": 0.048409192362904495, + "language_loss": 0.85410631, + "learning_rate": 0.0006686007494594049, + "loss": 0.86502367, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.29882812, + "step": 2125, + "time_per_iteration": 2.8982856273651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090612, + "balance_loss_mlp": 1.06085694, + "epoch": 0.40900346287033473, + "flos": 456702948864.0, + "grad_norm": 0.07961338986962259, + "language_loss": 0.80014485, + "learning_rate": 0.0006683074206948425, + "loss": 0.81105095, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.29736328, + "step": 2126, + "time_per_iteration": 2.489884853363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086751, + "balance_loss_mlp": 1.05649602, + "epoch": 0.4091958445555983, + "flos": 617097305088.0, + "grad_norm": 0.06572114620312723, + "language_loss": 0.81335235, + "learning_rate": 0.0006680140265875443, + "loss": 0.82421982, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.30200195, + "step": 2127, + "time_per_iteration": 2.8000454902648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084283, + "balance_loss_mlp": 1.05512488, + "epoch": 0.40938822624086185, + "flos": 472159217664.0, + "grad_norm": 0.054748250322007024, + "language_loss": 0.95437354, + "learning_rate": 0.0006677205672514162, + "loss": 0.9652164, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.29125977, + "step": 2128, + "time_per_iteration": 2.6153228282928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086337, + "balance_loss_mlp": 1.05600977, + "epoch": 0.40958060792612544, + "flos": 569734718976.0, + "grad_norm": 0.05206451104952603, + "language_loss": 0.88892365, + "learning_rate": 0.000667427042800389, + "loss": 0.89978707, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.30273438, + "step": 2129, + "time_per_iteration": 2.772545337677002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080649, + "balance_loss_mlp": 1.0521338, + "epoch": 0.40977298961138897, + "flos": 609065823744.0, + "grad_norm": 0.06928662998118869, + "language_loss": 0.82843542, + "learning_rate": 0.0006671334533484192, + "loss": 0.83924192, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.28515625, + "step": 2130, + "time_per_iteration": 2.7501790523529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077969, + "balance_loss_mlp": 1.04938281, + "epoch": 0.40996537129665256, + "flos": 581463332352.0, + "grad_norm": 0.051614263088568736, + "language_loss": 0.83230782, + "learning_rate": 0.0006668397990094881, + "loss": 0.84308755, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.28613281, + "step": 2131, + "time_per_iteration": 2.7121975421905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083028, + "balance_loss_mlp": 1.05370235, + "epoch": 0.41015775298191615, + "flos": 516296125440.0, + "grad_norm": 0.05828514658280376, + "language_loss": 0.84553468, + "learning_rate": 0.0006665460798976027, + "loss": 0.85636497, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.29296875, + "step": 2132, + "time_per_iteration": 2.7074639797210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082859, + "balance_loss_mlp": 1.05532122, + "epoch": 0.4103501346671797, + "flos": 510083315712.0, + "grad_norm": 0.06450815869750301, + "language_loss": 0.81324267, + "learning_rate": 0.0006662522961267947, + "loss": 0.82407123, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.27563477, + "step": 2133, + "time_per_iteration": 2.676886796951294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084376, + "balance_loss_mlp": 1.05555081, + "epoch": 0.41054251635244327, + "flos": 549459500544.0, + "grad_norm": 0.04843791936563358, + "language_loss": 0.87077558, + "learning_rate": 0.0006659584478111211, + "loss": 0.88161933, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.28833008, + "step": 2134, + "time_per_iteration": 2.8004117012023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096427, + "balance_loss_mlp": 1.06910408, + "epoch": 0.4107348980377068, + "flos": 839549643264.0, + "grad_norm": 0.07835760686868988, + "language_loss": 0.82880664, + "learning_rate": 0.000665664535064664, + "loss": 0.83977091, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.2734375, + "step": 2135, + "time_per_iteration": 3.034134864807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100622, + "balance_loss_mlp": 1.07278681, + "epoch": 0.4109272797229704, + "flos": 503445694464.0, + "grad_norm": 0.05799734322971953, + "language_loss": 0.82382762, + "learning_rate": 0.0006653705580015303, + "loss": 0.8348338, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.27819824, + "step": 2136, + "time_per_iteration": 2.719423770904541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105373, + "balance_loss_mlp": 1.07747769, + "epoch": 0.4111196614082339, + "flos": 610533877248.0, + "grad_norm": 0.05212184008762054, + "language_loss": 0.863967, + "learning_rate": 0.0006650765167358523, + "loss": 0.87502074, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.27905273, + "step": 2137, + "time_per_iteration": 2.7973241806030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110879, + "balance_loss_mlp": 1.08089471, + "epoch": 0.4113120430934975, + "flos": 452931623424.0, + "grad_norm": 0.07588683613844963, + "language_loss": 0.89871359, + "learning_rate": 0.0006647824113817864, + "loss": 0.90980148, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.27929688, + "step": 2138, + "time_per_iteration": 2.520531177520752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114294, + "balance_loss_mlp": 1.08768606, + "epoch": 0.41150442477876104, + "flos": 541324712448.0, + "grad_norm": 0.055552110514209885, + "language_loss": 0.81525648, + "learning_rate": 0.000664488242053515, + "loss": 0.82639945, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.26660156, + "step": 2139, + "time_per_iteration": 2.7204349040985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099437, + "balance_loss_mlp": 1.0722574, + "epoch": 0.4116968064640246, + "flos": 576017339904.0, + "grad_norm": 0.05646005524415558, + "language_loss": 0.83858913, + "learning_rate": 0.0006641940088652445, + "loss": 0.84958351, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.27246094, + "step": 2140, + "time_per_iteration": 2.748011827468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101916, + "balance_loss_mlp": 1.07521284, + "epoch": 0.4118891881492882, + "flos": 495857963520.0, + "grad_norm": 0.05970845599818087, + "language_loss": 0.81979877, + "learning_rate": 0.0006638997119312065, + "loss": 0.83081794, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.26757812, + "step": 2141, + "time_per_iteration": 2.723269462585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091619, + "balance_loss_mlp": 1.07826746, + "epoch": 0.41208156983455174, + "flos": 1537604923392.0, + "grad_norm": 0.04300629071925061, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.76154923, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.13378906, + "step": 2142, + "time_per_iteration": 4.922248363494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089912, + "balance_loss_mlp": 1.06239891, + "epoch": 0.41227395151981533, + "flos": 584697775104.0, + "grad_norm": 0.06629114096949819, + "language_loss": 0.8462221, + "learning_rate": 0.000663310927282877, + "loss": 0.85712123, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.27563477, + "step": 2143, + "time_per_iteration": 2.8463313579559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109146, + "balance_loss_mlp": 1.06413746, + "epoch": 0.41246633320507886, + "flos": 442685620224.0, + "grad_norm": 0.05519054049820913, + "language_loss": 0.86099815, + "learning_rate": 0.000663016439797172, + "loss": 0.87191272, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.2734375, + "step": 2144, + "time_per_iteration": 2.611057996749878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086947, + "balance_loss_mlp": 1.05917096, + "epoch": 0.41265871489034245, + "flos": 579680976384.0, + "grad_norm": 0.07082455066013048, + "language_loss": 0.80582112, + "learning_rate": 0.0006627218890228724, + "loss": 0.81669062, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.27783203, + "step": 2145, + "time_per_iteration": 2.8047831058502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087273, + "balance_loss_mlp": 1.05859172, + "epoch": 0.412851096575606, + "flos": 760906372608.0, + "grad_norm": 0.08398112437337095, + "language_loss": 0.83330071, + "learning_rate": 0.0006624272750743326, + "loss": 0.84417343, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.28637695, + "step": 2146, + "time_per_iteration": 2.9890313148498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081748, + "balance_loss_mlp": 1.05299461, + "epoch": 0.41304347826086957, + "flos": 555062644224.0, + "grad_norm": 0.12117217429962603, + "language_loss": 0.82466137, + "learning_rate": 0.0006621325980659322, + "loss": 0.83547878, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.2878418, + "step": 2147, + "time_per_iteration": 2.7945189476013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108308, + "balance_loss_mlp": 1.05475557, + "epoch": 0.41323585994613315, + "flos": 665418765312.0, + "grad_norm": 0.05729870278054163, + "language_loss": 0.81810451, + "learning_rate": 0.000661837858112075, + "loss": 0.82893538, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.28320312, + "step": 2148, + "time_per_iteration": 2.833590030670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079327, + "balance_loss_mlp": 1.05102634, + "epoch": 0.4134282416313967, + "flos": 548429405184.0, + "grad_norm": 0.05837233957282785, + "language_loss": 0.88857764, + "learning_rate": 0.0006615430553271888, + "loss": 0.89937091, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.28344727, + "step": 2149, + "time_per_iteration": 2.75384521484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074308, + "balance_loss_mlp": 1.04603195, + "epoch": 0.4136206233166603, + "flos": 645951062016.0, + "grad_norm": 0.06498878822354702, + "language_loss": 0.85069597, + "learning_rate": 0.0006612481898257264, + "loss": 0.86143911, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.28295898, + "step": 2150, + "time_per_iteration": 2.8471391201019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077454, + "balance_loss_mlp": 1.04901028, + "epoch": 0.4138130050019238, + "flos": 517103640576.0, + "grad_norm": 0.06146250241107021, + "language_loss": 0.85024071, + "learning_rate": 0.000660953261722165, + "loss": 0.8610152, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.28442383, + "step": 2151, + "time_per_iteration": 2.650024175643921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107559, + "balance_loss_mlp": 1.04643118, + "epoch": 0.4140053866871874, + "flos": 608977073664.0, + "grad_norm": 0.07635609550069686, + "language_loss": 0.82408941, + "learning_rate": 0.0006606582711310055, + "loss": 0.8348453, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.29150391, + "step": 2152, + "time_per_iteration": 2.707353353500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079486, + "balance_loss_mlp": 1.05068457, + "epoch": 0.4141977683724509, + "flos": 579493301760.0, + "grad_norm": 0.05643811624839042, + "language_loss": 0.83234471, + "learning_rate": 0.0006603632181667736, + "loss": 0.84313959, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.2878418, + "step": 2153, + "time_per_iteration": 2.6824803352355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034971, + "balance_loss_mlp": 1.02085698, + "epoch": 0.4143901500577145, + "flos": 1306598777856.0, + "grad_norm": 0.02554992861291058, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.79978293, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.14160156, + "step": 2154, + "time_per_iteration": 4.893488645553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075294, + "balance_loss_mlp": 1.04625416, + "epoch": 0.41458253174297804, + "flos": 459957740544.0, + "grad_norm": 0.06235301652291857, + "language_loss": 0.81530857, + "learning_rate": 0.0006597729255773153, + "loss": 0.82606155, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.2902832, + "step": 2155, + "time_per_iteration": 2.526531934738159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084546, + "balance_loss_mlp": 1.05519629, + "epoch": 0.41477491342824163, + "flos": 553096995840.0, + "grad_norm": 0.06680223734216864, + "language_loss": 0.82554018, + "learning_rate": 0.0006594776861812608, + "loss": 0.83638561, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.29321289, + "step": 2156, + "time_per_iteration": 2.669290065765381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083913, + "balance_loss_mlp": 1.05525446, + "epoch": 0.4149672951135052, + "flos": 697444356096.0, + "grad_norm": 0.05896575190253656, + "language_loss": 0.8669672, + "learning_rate": 0.0006591823848704776, + "loss": 0.87780631, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.28613281, + "step": 2157, + "time_per_iteration": 2.9277596473693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081796, + "balance_loss_mlp": 1.05273294, + "epoch": 0.41515967679876875, + "flos": 565480355328.0, + "grad_norm": 0.07853922010281017, + "language_loss": 0.81488264, + "learning_rate": 0.0006588870217596117, + "loss": 0.82570058, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.29003906, + "step": 2158, + "time_per_iteration": 2.72590970993042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107553, + "balance_loss_mlp": 1.04572749, + "epoch": 0.41535205848403234, + "flos": 500938781184.0, + "grad_norm": 0.06749140584983894, + "language_loss": 0.86219651, + "learning_rate": 0.0006585915969633334, + "loss": 0.87295187, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.29760742, + "step": 2159, + "time_per_iteration": 2.609668731689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068571, + "balance_loss_mlp": 1.03838706, + "epoch": 0.41554444016929587, + "flos": 607268911104.0, + "grad_norm": 0.0643598430263329, + "language_loss": 0.89336061, + "learning_rate": 0.0006582961105963366, + "loss": 0.90404636, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.30151367, + "step": 2160, + "time_per_iteration": 2.814122200012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074283, + "balance_loss_mlp": 1.04409909, + "epoch": 0.41573682185455946, + "flos": 528856985088.0, + "grad_norm": 0.0615363131016327, + "language_loss": 0.77864838, + "learning_rate": 0.0006580005627733395, + "loss": 0.78939116, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.30126953, + "step": 2161, + "time_per_iteration": 2.693002700805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067064, + "balance_loss_mlp": 1.03790569, + "epoch": 0.415929203539823, + "flos": 504686785536.0, + "grad_norm": 0.07091162327263066, + "language_loss": 0.81523043, + "learning_rate": 0.0006577049536090838, + "loss": 0.82590109, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.29125977, + "step": 2162, + "time_per_iteration": 2.6924569606781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010702, + "balance_loss_mlp": 1.04039741, + "epoch": 0.4161215852250866, + "flos": 582467286528.0, + "grad_norm": 0.07952336976051765, + "language_loss": 0.85617888, + "learning_rate": 0.000657409283218335, + "loss": 0.86688089, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.29760742, + "step": 2163, + "time_per_iteration": 2.663069486618042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070228, + "balance_loss_mlp": 1.04075933, + "epoch": 0.4163139669103501, + "flos": 490432320000.0, + "grad_norm": 0.06199265882265987, + "language_loss": 0.81197548, + "learning_rate": 0.0006571135517158829, + "loss": 0.82267773, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.29394531, + "step": 2164, + "time_per_iteration": 2.6750965118408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043486, + "balance_loss_mlp": 1.03042102, + "epoch": 0.4165063485956137, + "flos": 1287445377024.0, + "grad_norm": 0.030179808177232596, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77807546, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.13085938, + "step": 2165, + "time_per_iteration": 4.7519471645355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071906, + "balance_loss_mlp": 1.0417223, + "epoch": 0.4166987302808773, + "flos": 495015542784.0, + "grad_norm": 0.06526247046532782, + "language_loss": 0.83270538, + "learning_rate": 0.0006565219058351444, + "loss": 0.84342444, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.30151367, + "step": 2166, + "time_per_iteration": 2.5784192085266113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070965, + "balance_loss_mlp": 1.04080534, + "epoch": 0.4168911119661408, + "flos": 463823608320.0, + "grad_norm": 0.06219532105294632, + "language_loss": 0.82938039, + "learning_rate": 0.0006562259916865553, + "loss": 0.84009004, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.30102539, + "step": 2167, + "time_per_iteration": 2.59431791305542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073926, + "balance_loss_mlp": 1.04369497, + "epoch": 0.4170834936514044, + "flos": 536499970560.0, + "grad_norm": 0.06573475594481314, + "language_loss": 0.7943427, + "learning_rate": 0.0006559300168856573, + "loss": 0.80508196, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.30175781, + "step": 2168, + "time_per_iteration": 2.727644443511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070483, + "balance_loss_mlp": 1.04046655, + "epoch": 0.41727587533666793, + "flos": 550418374656.0, + "grad_norm": 0.17889612534981147, + "language_loss": 0.85705924, + "learning_rate": 0.0006556339815473577, + "loss": 0.86776412, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.29980469, + "step": 2169, + "time_per_iteration": 2.6300487518310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072561, + "balance_loss_mlp": 1.04366493, + "epoch": 0.4174682570219315, + "flos": 630795949056.0, + "grad_norm": 0.053042429294564375, + "language_loss": 0.86056256, + "learning_rate": 0.000655337885786588, + "loss": 0.87128818, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.2890625, + "step": 2170, + "time_per_iteration": 2.8887124061584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081102, + "balance_loss_mlp": 1.05139482, + "epoch": 0.41766063870719505, + "flos": 519501454848.0, + "grad_norm": 0.08227745310603136, + "language_loss": 0.84896123, + "learning_rate": 0.0006550417297183025, + "loss": 0.85977226, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.29663086, + "step": 2171, + "time_per_iteration": 2.6285011768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088317, + "balance_loss_mlp": 1.05894339, + "epoch": 0.41785302039245864, + "flos": 557656897536.0, + "grad_norm": 0.05761128029173598, + "language_loss": 0.81863701, + "learning_rate": 0.0006547455134574793, + "loss": 0.82952011, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.29321289, + "step": 2172, + "time_per_iteration": 2.7729623317718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089062, + "balance_loss_mlp": 1.06040442, + "epoch": 0.41804540207772223, + "flos": 788156683776.0, + "grad_norm": 0.06792239619892874, + "language_loss": 0.83893955, + "learning_rate": 0.0006544492371191198, + "loss": 0.84983015, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.28613281, + "step": 2173, + "time_per_iteration": 3.1256158351898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094435, + "balance_loss_mlp": 1.06477547, + "epoch": 0.41823778376298576, + "flos": 903944240640.0, + "grad_norm": 0.05504184984792058, + "language_loss": 0.83198339, + "learning_rate": 0.0006541529008182485, + "loss": 0.84292769, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.29638672, + "step": 2174, + "time_per_iteration": 3.207711696624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093664, + "balance_loss_mlp": 1.0648396, + "epoch": 0.41843016544824935, + "flos": 511308440064.0, + "grad_norm": 0.07199426026259947, + "language_loss": 0.87529659, + "learning_rate": 0.0006538565046699136, + "loss": 0.88623327, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.28808594, + "step": 2175, + "time_per_iteration": 2.5804800987243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090181, + "balance_loss_mlp": 1.06207108, + "epoch": 0.4186225471335129, + "flos": 652774947840.0, + "grad_norm": 0.06367136059390696, + "language_loss": 0.80982441, + "learning_rate": 0.0006535600487891862, + "loss": 0.82072628, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.28149414, + "step": 2176, + "time_per_iteration": 2.7804555892944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087535, + "balance_loss_mlp": 1.05870986, + "epoch": 0.41881492881877647, + "flos": 568892298240.0, + "grad_norm": 0.05631892460787088, + "language_loss": 0.89099276, + "learning_rate": 0.0006532635332911603, + "loss": 0.9018681, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.28808594, + "step": 2177, + "time_per_iteration": 2.641392707824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083587, + "balance_loss_mlp": 1.05428553, + "epoch": 0.41900731050404, + "flos": 911478127104.0, + "grad_norm": 0.06086903625614387, + "language_loss": 0.80636132, + "learning_rate": 0.0006529669582909541, + "loss": 0.8171972, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.29296875, + "step": 2178, + "time_per_iteration": 3.2258243560791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079831, + "balance_loss_mlp": 1.0508393, + "epoch": 0.4191996921893036, + "flos": 535498988544.0, + "grad_norm": 0.06798611784395944, + "language_loss": 0.85681045, + "learning_rate": 0.0006526703239037077, + "loss": 0.86760873, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.28955078, + "step": 2179, + "time_per_iteration": 2.66808819770813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077242, + "balance_loss_mlp": 1.0480361, + "epoch": 0.4193920738745671, + "flos": 582363979776.0, + "grad_norm": 0.06231650691948033, + "language_loss": 0.86236274, + "learning_rate": 0.0006523736302445851, + "loss": 0.87313515, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.29174805, + "step": 2180, + "time_per_iteration": 2.768888473510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074827, + "balance_loss_mlp": 1.04490554, + "epoch": 0.4195844555598307, + "flos": 1335279720960.0, + "grad_norm": 0.05646655403971755, + "language_loss": 0.77122605, + "learning_rate": 0.0006520768774287728, + "loss": 0.78197432, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.29882812, + "step": 2181, + "time_per_iteration": 3.7851996421813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077657, + "balance_loss_mlp": 1.04899919, + "epoch": 0.4197768372450943, + "flos": 598480786944.0, + "grad_norm": 0.05195874321999793, + "language_loss": 0.85622293, + "learning_rate": 0.0006517800655714806, + "loss": 0.86699945, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.28686523, + "step": 2182, + "time_per_iteration": 2.8000948429107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083179, + "balance_loss_mlp": 1.05359161, + "epoch": 0.4199692189303578, + "flos": 734929085952.0, + "grad_norm": 0.06393427474455515, + "language_loss": 0.85246432, + "learning_rate": 0.0006514831947879407, + "loss": 0.86329615, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.2956543, + "step": 2183, + "time_per_iteration": 2.946345329284668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090824, + "balance_loss_mlp": 1.06164193, + "epoch": 0.4201616006156214, + "flos": 749854264320.0, + "grad_norm": 0.05990675678964555, + "language_loss": 0.78013611, + "learning_rate": 0.0006511862651934091, + "loss": 0.79104435, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.29174805, + "step": 2184, + "time_per_iteration": 3.043314218521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087348, + "balance_loss_mlp": 1.05797458, + "epoch": 0.42035398230088494, + "flos": 546764912640.0, + "grad_norm": 0.05608517861748944, + "language_loss": 0.82263517, + "learning_rate": 0.0006508892769031638, + "loss": 0.83350861, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.29345703, + "step": 2185, + "time_per_iteration": 2.662071704864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090134, + "balance_loss_mlp": 1.06052232, + "epoch": 0.42054636398614853, + "flos": 616628823552.0, + "grad_norm": 0.07931700187887496, + "language_loss": 0.86476076, + "learning_rate": 0.000650592230032506, + "loss": 0.87566209, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.2956543, + "step": 2186, + "time_per_iteration": 2.758989095687866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094562, + "balance_loss_mlp": 1.06464052, + "epoch": 0.42073874567141206, + "flos": 640077285888.0, + "grad_norm": 0.06900651751722174, + "language_loss": 0.84912258, + "learning_rate": 0.0006502951246967595, + "loss": 0.8600682, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.29882812, + "step": 2187, + "time_per_iteration": 2.9305953979492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091948, + "balance_loss_mlp": 1.06274199, + "epoch": 0.42093112735667565, + "flos": 493524168192.0, + "grad_norm": 0.061550495040686125, + "language_loss": 0.86992055, + "learning_rate": 0.0006499979610112706, + "loss": 0.88084006, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.29150391, + "step": 2188, + "time_per_iteration": 2.6826889514923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091259, + "balance_loss_mlp": 1.06205249, + "epoch": 0.4211235090419392, + "flos": 542097321984.0, + "grad_norm": 0.05090003048385584, + "language_loss": 0.84021527, + "learning_rate": 0.000649700739091409, + "loss": 0.85112786, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.29125977, + "step": 2189, + "time_per_iteration": 2.7169277667999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058015, + "balance_loss_mlp": 1.04628468, + "epoch": 0.42131589072720277, + "flos": 1531342651392.0, + "grad_norm": 0.03212522571547254, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.74894285, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.1171875, + "step": 2190, + "time_per_iteration": 4.8044211864471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094227, + "balance_loss_mlp": 1.06645083, + "epoch": 0.42150827241246636, + "flos": 566583234048.0, + "grad_norm": 0.05853660814181512, + "language_loss": 0.85258055, + "learning_rate": 0.0006491061210101557, + "loss": 0.86352277, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.27832031, + "step": 2191, + "time_per_iteration": 2.6850759983062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093463, + "balance_loss_mlp": 1.06554449, + "epoch": 0.4217006540977299, + "flos": 707242226688.0, + "grad_norm": 0.05791259848064641, + "language_loss": 0.84111977, + "learning_rate": 0.0006488087250796157, + "loss": 0.85205436, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.27905273, + "step": 2192, + "time_per_iteration": 2.8877995014190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099215, + "balance_loss_mlp": 1.07148743, + "epoch": 0.4218930357829935, + "flos": 626975161344.0, + "grad_norm": 0.0649444731235166, + "language_loss": 0.81518376, + "learning_rate": 0.0006485112713764049, + "loss": 0.82617593, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.27734375, + "step": 2193, + "time_per_iteration": 2.910949468612671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102268, + "balance_loss_mlp": 1.07523096, + "epoch": 0.422085417468257, + "flos": 460110509568.0, + "grad_norm": 0.07813881123096035, + "language_loss": 0.83433115, + "learning_rate": 0.0006482137600160051, + "loss": 0.84535384, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.27075195, + "step": 2194, + "time_per_iteration": 2.5086262226104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096994, + "balance_loss_mlp": 1.06900394, + "epoch": 0.4222777991535206, + "flos": 473788804608.0, + "grad_norm": 0.07794223585413998, + "language_loss": 0.84987926, + "learning_rate": 0.0006479161911139206, + "loss": 0.86084926, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.2800293, + "step": 2195, + "time_per_iteration": 2.5875346660614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109264, + "balance_loss_mlp": 1.06493604, + "epoch": 0.4224701808387841, + "flos": 470647494144.0, + "grad_norm": 0.07304716613473786, + "language_loss": 0.85472345, + "learning_rate": 0.0006476185647856778, + "loss": 0.86564982, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.27734375, + "step": 2196, + "time_per_iteration": 2.5596694946289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083263, + "balance_loss_mlp": 1.05589223, + "epoch": 0.4226625625240477, + "flos": 677202633216.0, + "grad_norm": 0.0787732151202365, + "language_loss": 0.81599677, + "learning_rate": 0.0006473208811468255, + "loss": 0.82682943, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.27416992, + "step": 2197, + "time_per_iteration": 2.8756632804870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082675, + "balance_loss_mlp": 1.05518579, + "epoch": 0.4228549442093113, + "flos": 503268194304.0, + "grad_norm": 0.05582038208417147, + "language_loss": 0.84304923, + "learning_rate": 0.0006470231403129347, + "loss": 0.85387599, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.27490234, + "step": 2198, + "time_per_iteration": 2.6008548736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082097, + "balance_loss_mlp": 1.05444098, + "epoch": 0.42304732589457483, + "flos": 611543623680.0, + "grad_norm": 0.05486589756973033, + "language_loss": 0.81627637, + "learning_rate": 0.0006467253423995988, + "loss": 0.8270973, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.27685547, + "step": 2199, + "time_per_iteration": 2.8359298706054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085734, + "balance_loss_mlp": 1.05788624, + "epoch": 0.4232397075798384, + "flos": 515302345728.0, + "grad_norm": 0.06443704109820439, + "language_loss": 0.79415488, + "learning_rate": 0.000646427487522433, + "loss": 0.80501223, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.27880859, + "step": 2200, + "time_per_iteration": 2.6884772777557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089933, + "balance_loss_mlp": 1.06251502, + "epoch": 0.42343208926510195, + "flos": 589513752576.0, + "grad_norm": 0.06462007516901433, + "language_loss": 0.83460814, + "learning_rate": 0.0006461295757970749, + "loss": 0.8455075, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.27441406, + "step": 2201, + "time_per_iteration": 2.7960758209228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110052, + "balance_loss_mlp": 1.07140875, + "epoch": 0.42362447095036554, + "flos": 640342126080.0, + "grad_norm": 0.08363319364773283, + "language_loss": 0.81312859, + "learning_rate": 0.0006458316073391839, + "loss": 0.82413375, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.29101562, + "step": 2202, + "time_per_iteration": 2.853297472000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096557, + "balance_loss_mlp": 1.06830478, + "epoch": 0.42381685263562907, + "flos": 512421493248.0, + "grad_norm": 0.0711769658628502, + "language_loss": 0.87750852, + "learning_rate": 0.0006455335822644422, + "loss": 0.88847411, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.28271484, + "step": 2203, + "time_per_iteration": 2.6077048778533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110502, + "balance_loss_mlp": 1.07607579, + "epoch": 0.42400923432089266, + "flos": 546523393536.0, + "grad_norm": 0.061615225293076246, + "language_loss": 0.77729923, + "learning_rate": 0.0006452355006885527, + "loss": 0.78834939, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.28930664, + "step": 2204, + "time_per_iteration": 2.6517252922058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103628, + "balance_loss_mlp": 1.07442212, + "epoch": 0.4242016160061562, + "flos": 621872584704.0, + "grad_norm": 0.1220032897030914, + "language_loss": 0.86957574, + "learning_rate": 0.0006449373627272412, + "loss": 0.88061202, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.29199219, + "step": 2205, + "time_per_iteration": 2.7004148960113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093739, + "balance_loss_mlp": 1.06515288, + "epoch": 0.4243939976914198, + "flos": 571649495040.0, + "grad_norm": 0.07705045910796138, + "language_loss": 0.82556224, + "learning_rate": 0.0006446391684962553, + "loss": 0.83649963, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.28588867, + "step": 2206, + "time_per_iteration": 2.6505441665649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083745, + "balance_loss_mlp": 1.05558801, + "epoch": 0.42458637937668336, + "flos": 448509934080.0, + "grad_norm": 0.0589868983385633, + "language_loss": 0.82958955, + "learning_rate": 0.000644340918111364, + "loss": 0.84042698, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.28149414, + "step": 2207, + "time_per_iteration": 2.6410183906555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079008, + "balance_loss_mlp": 1.05011129, + "epoch": 0.4247787610619469, + "flos": 435176464896.0, + "grad_norm": 0.05680611388250626, + "language_loss": 0.84805965, + "learning_rate": 0.0006440426116883585, + "loss": 0.8588497, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.28857422, + "step": 2208, + "time_per_iteration": 2.5708625316619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074083, + "balance_loss_mlp": 1.04478097, + "epoch": 0.4249711427472105, + "flos": 495818675712.0, + "grad_norm": 0.06224422813064936, + "language_loss": 0.86093891, + "learning_rate": 0.0006437442493430519, + "loss": 0.87167978, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.29248047, + "step": 2209, + "time_per_iteration": 2.70894718170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074378, + "balance_loss_mlp": 1.04481411, + "epoch": 0.425163524432474, + "flos": 655498649088.0, + "grad_norm": 0.07482969618411565, + "language_loss": 0.86115217, + "learning_rate": 0.000643445831191278, + "loss": 0.87189603, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.29492188, + "step": 2210, + "time_per_iteration": 2.924381971359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076507, + "balance_loss_mlp": 1.0465858, + "epoch": 0.4253559061177376, + "flos": 650317496832.0, + "grad_norm": 0.07331466132736943, + "language_loss": 0.81421846, + "learning_rate": 0.0006431473573488937, + "loss": 0.82498354, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.29882812, + "step": 2211, + "time_per_iteration": 2.7787976264953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073299, + "balance_loss_mlp": 1.04380631, + "epoch": 0.42554828780300114, + "flos": 553894336512.0, + "grad_norm": 0.07883329281510759, + "language_loss": 0.84917492, + "learning_rate": 0.0006428488279317765, + "loss": 0.85990787, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.29443359, + "step": 2212, + "time_per_iteration": 2.6664369106292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070733, + "balance_loss_mlp": 1.04052496, + "epoch": 0.4257406694882647, + "flos": 514154386944.0, + "grad_norm": 0.06306745469338368, + "language_loss": 0.87706983, + "learning_rate": 0.0006425502430558259, + "loss": 0.88777709, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.30151367, + "step": 2213, + "time_per_iteration": 2.6229989528656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071107, + "balance_loss_mlp": 1.04106641, + "epoch": 0.42593305117352825, + "flos": 515380921344.0, + "grad_norm": 0.0655798606724697, + "language_loss": 0.84705913, + "learning_rate": 0.0006422516028369628, + "loss": 0.8577702, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.30004883, + "step": 2214, + "time_per_iteration": 2.69012451171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072564, + "balance_loss_mlp": 1.04197454, + "epoch": 0.42612543285879184, + "flos": 587766302208.0, + "grad_norm": 0.08051577462794157, + "language_loss": 0.83543354, + "learning_rate": 0.0006419529073911296, + "loss": 0.84615922, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.30541992, + "step": 2215, + "time_per_iteration": 2.873396873474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070818, + "balance_loss_mlp": 1.03987157, + "epoch": 0.42631781454405543, + "flos": 635153619456.0, + "grad_norm": 0.05918367623789858, + "language_loss": 0.85362011, + "learning_rate": 0.0006416541568342901, + "loss": 0.86432827, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.30908203, + "step": 2216, + "time_per_iteration": 2.870213508605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071511, + "balance_loss_mlp": 1.04161358, + "epoch": 0.42651019622931896, + "flos": 540891136512.0, + "grad_norm": 0.06028802274016953, + "language_loss": 0.8413707, + "learning_rate": 0.0006413553512824297, + "loss": 0.85208583, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.29858398, + "step": 2217, + "time_per_iteration": 2.7570102214813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066011, + "balance_loss_mlp": 1.03599358, + "epoch": 0.42670257791458255, + "flos": 557892624384.0, + "grad_norm": 0.06136950817587928, + "language_loss": 0.8441695, + "learning_rate": 0.0006410564908515549, + "loss": 0.85482961, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.29980469, + "step": 2218, + "time_per_iteration": 2.634636878967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072201, + "balance_loss_mlp": 1.04122996, + "epoch": 0.4268949595998461, + "flos": 621025781760.0, + "grad_norm": 0.05945328981992575, + "language_loss": 0.85267186, + "learning_rate": 0.0006407575756576935, + "loss": 0.8633939, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.30957031, + "step": 2219, + "time_per_iteration": 2.7264437675476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076309, + "balance_loss_mlp": 1.04512346, + "epoch": 0.42708734128510967, + "flos": 537646519296.0, + "grad_norm": 0.08352776642532155, + "language_loss": 0.87413085, + "learning_rate": 0.0006404586058168951, + "loss": 0.88489389, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.31152344, + "step": 2220, + "time_per_iteration": 2.740231513977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070252, + "balance_loss_mlp": 1.03906727, + "epoch": 0.4272797229703732, + "flos": 502617830400.0, + "grad_norm": 0.06337599132559579, + "language_loss": 0.86675316, + "learning_rate": 0.0006401595814452296, + "loss": 0.87745565, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.31152344, + "step": 2221, + "time_per_iteration": 2.595133066177368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010734, + "balance_loss_mlp": 1.04316878, + "epoch": 0.4274721046556368, + "flos": 492208883712.0, + "grad_norm": 0.05998559409639075, + "language_loss": 0.80837309, + "learning_rate": 0.000639860502658789, + "loss": 0.81910712, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.30224609, + "step": 2222, + "time_per_iteration": 2.6363143920898438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078431, + "balance_loss_mlp": 1.04805684, + "epoch": 0.4276644863409004, + "flos": 568094957568.0, + "grad_norm": 0.051235249414951084, + "language_loss": 0.85047621, + "learning_rate": 0.0006395613695736853, + "loss": 0.86126053, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.3034668, + "step": 2223, + "time_per_iteration": 2.719651222229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088807, + "balance_loss_mlp": 1.0574553, + "epoch": 0.4278568680261639, + "flos": 607155429888.0, + "grad_norm": 0.14370485886555942, + "language_loss": 0.82013905, + "learning_rate": 0.0006392621823060529, + "loss": 0.83102709, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.31347656, + "step": 2224, + "time_per_iteration": 2.707019805908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108025, + "balance_loss_mlp": 1.04968464, + "epoch": 0.4280492497114275, + "flos": 560265707520.0, + "grad_norm": 0.06727581417341866, + "language_loss": 0.84405053, + "learning_rate": 0.0006389629409720465, + "loss": 0.85485303, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.30541992, + "step": 2225, + "time_per_iteration": 2.6877145767211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074744, + "balance_loss_mlp": 1.04415512, + "epoch": 0.428241631396691, + "flos": 720334176768.0, + "grad_norm": 0.06967859590672425, + "language_loss": 0.88595277, + "learning_rate": 0.0006386636456878417, + "loss": 0.89670026, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.30566406, + "step": 2226, + "time_per_iteration": 2.87302827835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073672, + "balance_loss_mlp": 1.04344106, + "epoch": 0.4284340130819546, + "flos": 429243052032.0, + "grad_norm": 0.07126154474787791, + "language_loss": 0.92022073, + "learning_rate": 0.0006383642965696353, + "loss": 0.93095744, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.30175781, + "step": 2227, + "time_per_iteration": 2.4469897747039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075351, + "balance_loss_mlp": 1.04492915, + "epoch": 0.42862639476721814, + "flos": 524732069376.0, + "grad_norm": 0.06843530557124561, + "language_loss": 0.82703793, + "learning_rate": 0.000638064893733645, + "loss": 0.83779144, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.30371094, + "step": 2228, + "time_per_iteration": 2.7728607654571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071747, + "balance_loss_mlp": 1.04256451, + "epoch": 0.42881877645248173, + "flos": 465089430528.0, + "grad_norm": 0.058089035035371744, + "language_loss": 0.89580554, + "learning_rate": 0.000637765437296109, + "loss": 0.90652299, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.29199219, + "step": 2229, + "time_per_iteration": 2.634521484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072661, + "balance_loss_mlp": 1.04252505, + "epoch": 0.42901115813774526, + "flos": 560034362880.0, + "grad_norm": 0.07373798457938027, + "language_loss": 0.85480672, + "learning_rate": 0.000637465927373287, + "loss": 0.86553335, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.30126953, + "step": 2230, + "time_per_iteration": 2.6294057369232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082832, + "balance_loss_mlp": 1.05276728, + "epoch": 0.42920353982300885, + "flos": 561186703872.0, + "grad_norm": 0.08134114280474665, + "language_loss": 0.79152465, + "learning_rate": 0.000637166364081459, + "loss": 0.80235291, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.30004883, + "step": 2231, + "time_per_iteration": 2.651043176651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107691, + "balance_loss_mlp": 1.04837155, + "epoch": 0.42939592150827244, + "flos": 555982230528.0, + "grad_norm": 0.0656552791827552, + "language_loss": 0.83965945, + "learning_rate": 0.0006368667475369256, + "loss": 0.85042852, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.28515625, + "step": 2232, + "time_per_iteration": 2.749769687652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072336, + "balance_loss_mlp": 1.05898428, + "epoch": 0.42958830319353597, + "flos": 1520796902400.0, + "grad_norm": 0.038311067760931045, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79600114, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.13378906, + "step": 2233, + "time_per_iteration": 4.919846773147583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010537, + "balance_loss_mlp": 1.04044378, + "epoch": 0.42978068487879956, + "flos": 1495052522496.0, + "grad_norm": 0.026216416348918452, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.79949123, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.1328125, + "step": 2234, + "time_per_iteration": 4.814115285873413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109183, + "balance_loss_mlp": 1.06281483, + "epoch": 0.4299730665640631, + "flos": 546725624832.0, + "grad_norm": 0.052673535005773216, + "language_loss": 0.85474288, + "learning_rate": 0.0006359675795504112, + "loss": 0.86566114, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.29003906, + "step": 2235, + "time_per_iteration": 2.7002832889556885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097467, + "balance_loss_mlp": 1.07021558, + "epoch": 0.4301654482493267, + "flos": 1128839473152.0, + "grad_norm": 0.08125384058814748, + "language_loss": 0.74334383, + "learning_rate": 0.0006356677511584775, + "loss": 0.75431848, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.27294922, + "step": 2236, + "time_per_iteration": 3.472095012664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096497, + "balance_loss_mlp": 1.06938839, + "epoch": 0.4303578299345902, + "flos": 495502963200.0, + "grad_norm": 0.06719636161557083, + "language_loss": 0.85933757, + "learning_rate": 0.0006353678700956511, + "loss": 0.8703025, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.27148438, + "step": 2237, + "time_per_iteration": 2.6188535690307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089994, + "balance_loss_mlp": 1.06288612, + "epoch": 0.4305502116198538, + "flos": 615472100352.0, + "grad_norm": 0.09054713742221257, + "language_loss": 0.83597302, + "learning_rate": 0.0006350679364783569, + "loss": 0.84687304, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.27172852, + "step": 2238, + "time_per_iteration": 2.7403035163879395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093799, + "balance_loss_mlp": 1.0661664, + "epoch": 0.4307425933051173, + "flos": 558995503104.0, + "grad_norm": 0.06694912929746479, + "language_loss": 0.85728157, + "learning_rate": 0.0006347679504230393, + "loss": 0.86821961, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.27661133, + "step": 2239, + "time_per_iteration": 2.652348041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087161, + "balance_loss_mlp": 1.05974269, + "epoch": 0.4309349749903809, + "flos": 971755163136.0, + "grad_norm": 0.056527008755361936, + "language_loss": 0.75895661, + "learning_rate": 0.0006344679120461632, + "loss": 0.7698282, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.27416992, + "step": 2240, + "time_per_iteration": 3.334127187728882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091078, + "balance_loss_mlp": 1.06435084, + "epoch": 0.4311273566756445, + "flos": 541663746048.0, + "grad_norm": 0.1917370324350853, + "language_loss": 0.80061769, + "learning_rate": 0.0006341678214642134, + "loss": 0.81152856, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.26782227, + "step": 2241, + "time_per_iteration": 2.6100823879241943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087616, + "balance_loss_mlp": 1.06103277, + "epoch": 0.43131973836090803, + "flos": 761316627456.0, + "grad_norm": 0.06088249389193946, + "language_loss": 0.82893783, + "learning_rate": 0.0006338676787936963, + "loss": 0.83981395, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.26635742, + "step": 2242, + "time_per_iteration": 3.077916383743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109796, + "balance_loss_mlp": 1.07142353, + "epoch": 0.4315121200461716, + "flos": 554263893504.0, + "grad_norm": 0.060062439107852666, + "language_loss": 0.8377043, + "learning_rate": 0.0006335674841511367, + "loss": 0.84868383, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.26586914, + "step": 2243, + "time_per_iteration": 2.665205955505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066457, + "balance_loss_mlp": 1.05415499, + "epoch": 0.43170450173143515, + "flos": 1484499419136.0, + "grad_norm": 0.03077915513708162, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80247629, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.12255859, + "step": 2244, + "time_per_iteration": 5.000265121459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060995, + "balance_loss_mlp": 1.04878819, + "epoch": 0.43189688341669874, + "flos": 1472897433600.0, + "grad_norm": 0.03064763148494063, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.7842654, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.12207031, + "step": 2245, + "time_per_iteration": 4.9160850048065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093506, + "balance_loss_mlp": 1.06594431, + "epoch": 0.43208926510196227, + "flos": 492677365248.0, + "grad_norm": 0.06803490831657065, + "language_loss": 0.82597309, + "learning_rate": 0.0006326665895567652, + "loss": 0.83690816, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.2755127, + "step": 2246, + "time_per_iteration": 2.6449503898620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084672, + "balance_loss_mlp": 1.05649078, + "epoch": 0.43228164678722586, + "flos": 519969936384.0, + "grad_norm": 0.07553831830843152, + "language_loss": 0.87537026, + "learning_rate": 0.0006323661881916976, + "loss": 0.88621694, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.28173828, + "step": 2247, + "time_per_iteration": 2.699899911880493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088894, + "balance_loss_mlp": 1.05983043, + "epoch": 0.4324740284724894, + "flos": 795722655744.0, + "grad_norm": 0.05605692822142187, + "language_loss": 0.80999863, + "learning_rate": 0.0006320657354375179, + "loss": 0.82088757, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.2902832, + "step": 2248, + "time_per_iteration": 2.9737963676452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082946, + "balance_loss_mlp": 1.05374026, + "epoch": 0.432666410157753, + "flos": 481917800448.0, + "grad_norm": 0.1777496827938913, + "language_loss": 0.87151104, + "learning_rate": 0.0006317652314108726, + "loss": 0.88234049, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.29150391, + "step": 2249, + "time_per_iteration": 2.5640759468078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076296, + "balance_loss_mlp": 1.04782867, + "epoch": 0.43285879184301657, + "flos": 499963940352.0, + "grad_norm": 0.059764616303547735, + "language_loss": 0.91275859, + "learning_rate": 0.0006314646762284277, + "loss": 0.92352152, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.28442383, + "step": 2250, + "time_per_iteration": 2.6878976821899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056511, + "balance_loss_mlp": 1.04401791, + "epoch": 0.4330511735282801, + "flos": 1509615346176.0, + "grad_norm": 0.026928771485436313, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76482344, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.125, + "step": 2251, + "time_per_iteration": 4.839360475540161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079121, + "balance_loss_mlp": 1.04931927, + "epoch": 0.4332435552135437, + "flos": 699270382080.0, + "grad_norm": 0.05685438588579276, + "language_loss": 0.77368456, + "learning_rate": 0.0006308634128629022, + "loss": 0.78447574, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.29785156, + "step": 2252, + "time_per_iteration": 2.895348072052002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083422, + "balance_loss_mlp": 1.05426395, + "epoch": 0.4334359368988072, + "flos": 591995934720.0, + "grad_norm": 0.07214959985253801, + "language_loss": 0.87411779, + "learning_rate": 0.0006305627049132531, + "loss": 0.88495201, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.29125977, + "step": 2253, + "time_per_iteration": 2.8069100379943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083541, + "balance_loss_mlp": 1.05440617, + "epoch": 0.4336283185840708, + "flos": 842440670208.0, + "grad_norm": 0.059293193490882155, + "language_loss": 0.85926008, + "learning_rate": 0.0006302619462746662, + "loss": 0.87009549, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.29101562, + "step": 2254, + "time_per_iteration": 3.1606533527374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080071, + "balance_loss_mlp": 1.05193734, + "epoch": 0.43382070026933434, + "flos": 625974179328.0, + "grad_norm": 0.05505451724174187, + "language_loss": 0.89697909, + "learning_rate": 0.0006299611370639069, + "loss": 0.90777981, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.28149414, + "step": 2255, + "time_per_iteration": 2.734578847885132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082321, + "balance_loss_mlp": 1.05368638, + "epoch": 0.4340130819545979, + "flos": 590837801472.0, + "grad_norm": 0.06498253441528982, + "language_loss": 0.79077351, + "learning_rate": 0.0006296602773977593, + "loss": 0.80159676, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.28637695, + "step": 2256, + "time_per_iteration": 2.7210190296173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086863, + "balance_loss_mlp": 1.0577755, + "epoch": 0.4342054636398615, + "flos": 490624376832.0, + "grad_norm": 0.06552918038966793, + "language_loss": 0.87430996, + "learning_rate": 0.0006293593673930277, + "loss": 0.88517857, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.2902832, + "step": 2257, + "time_per_iteration": 2.6526098251342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087005, + "balance_loss_mlp": 1.05851448, + "epoch": 0.43439784532512504, + "flos": 698679654912.0, + "grad_norm": 0.06677812911461618, + "language_loss": 0.78416431, + "learning_rate": 0.0006290584071665358, + "loss": 0.79503441, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.28491211, + "step": 2258, + "time_per_iteration": 2.915259838104248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086079, + "balance_loss_mlp": 1.0575645, + "epoch": 0.43459022701038863, + "flos": 485581436928.0, + "grad_norm": 0.06990053073214272, + "language_loss": 0.81982124, + "learning_rate": 0.0006287573968351266, + "loss": 0.83068204, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.28515625, + "step": 2259, + "time_per_iteration": 2.5836570262908936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082362, + "balance_loss_mlp": 1.05432403, + "epoch": 0.43478260869565216, + "flos": 642818515968.0, + "grad_norm": 0.06494033905479386, + "language_loss": 0.82220829, + "learning_rate": 0.0006284563365156626, + "loss": 0.83303189, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.28076172, + "step": 2260, + "time_per_iteration": 2.815223217010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084558, + "balance_loss_mlp": 1.05620956, + "epoch": 0.43497499038091575, + "flos": 425870396928.0, + "grad_norm": 0.07047722124208498, + "language_loss": 0.87564874, + "learning_rate": 0.0006281552263250261, + "loss": 0.88649434, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.28344727, + "step": 2261, + "time_per_iteration": 2.4715116024017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106204, + "balance_loss_mlp": 1.04964256, + "epoch": 0.4351673720661793, + "flos": 1537594748928.0, + "grad_norm": 0.023387556142435376, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81753576, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.12402344, + "step": 2262, + "time_per_iteration": 4.811767101287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084425, + "balance_loss_mlp": 1.05641103, + "epoch": 0.43535975375144287, + "flos": 748828551168.0, + "grad_norm": 0.062970719214795, + "language_loss": 0.81474411, + "learning_rate": 0.0006275528567978593, + "loss": 0.82558835, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.28051758, + "step": 2263, + "time_per_iteration": 2.9182233810424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096573, + "balance_loss_mlp": 1.06877375, + "epoch": 0.4355521354367064, + "flos": 860914593792.0, + "grad_norm": 0.06472545743832298, + "language_loss": 0.82352197, + "learning_rate": 0.0006272515976951898, + "loss": 0.83448768, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.27832031, + "step": 2264, + "time_per_iteration": 3.137770175933838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097325, + "balance_loss_mlp": 1.06852436, + "epoch": 0.43574451712197, + "flos": 734200146432.0, + "grad_norm": 0.055887733519337984, + "language_loss": 0.79332447, + "learning_rate": 0.0006269502891890687, + "loss": 0.8042978, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.28759766, + "step": 2265, + "time_per_iteration": 2.9932398796081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093111, + "balance_loss_mlp": 1.06526363, + "epoch": 0.4359368988072336, + "flos": 570296332800.0, + "grad_norm": 0.06217907852457908, + "language_loss": 0.87852293, + "learning_rate": 0.0006266489313964743, + "loss": 0.88945401, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.27880859, + "step": 2266, + "time_per_iteration": 2.720874547958374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090878, + "balance_loss_mlp": 1.06338787, + "epoch": 0.4361292804924971, + "flos": 555244526592.0, + "grad_norm": 0.05517220152754215, + "language_loss": 0.85363281, + "learning_rate": 0.0006263475244344041, + "loss": 0.86454159, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.27514648, + "step": 2267, + "time_per_iteration": 2.8508987426757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089804, + "balance_loss_mlp": 1.06178975, + "epoch": 0.4363216621777607, + "flos": 557021090304.0, + "grad_norm": 0.061658084399303315, + "language_loss": 0.84817886, + "learning_rate": 0.0006260460684198746, + "loss": 0.85907692, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.28027344, + "step": 2268, + "time_per_iteration": 2.6972851753234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091639, + "balance_loss_mlp": 1.06395864, + "epoch": 0.4365140438630242, + "flos": 477979149312.0, + "grad_norm": 0.07163404822705746, + "language_loss": 0.84593827, + "learning_rate": 0.0006257445634699213, + "loss": 0.85685468, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.27734375, + "step": 2269, + "time_per_iteration": 2.562509298324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083836, + "balance_loss_mlp": 1.05565524, + "epoch": 0.4367064255482878, + "flos": 578646498816.0, + "grad_norm": 0.07106993063326117, + "language_loss": 0.82829607, + "learning_rate": 0.0006254430097015993, + "loss": 0.8391344, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.28222656, + "step": 2270, + "time_per_iteration": 2.6713523864746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054528, + "balance_loss_mlp": 1.04203498, + "epoch": 0.43689880723355135, + "flos": 1458117669888.0, + "grad_norm": 0.029151500829202304, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.77533615, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.125, + "step": 2271, + "time_per_iteration": 4.761755466461182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086484, + "balance_loss_mlp": 1.05801725, + "epoch": 0.43709118891881493, + "flos": 667295663616.0, + "grad_norm": 0.05590316940209524, + "language_loss": 0.85155964, + "learning_rate": 0.0006248397561781609, + "loss": 0.86242455, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.28491211, + "step": 2272, + "time_per_iteration": 2.8541359901428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091334, + "balance_loss_mlp": 1.06246173, + "epoch": 0.43728357060407846, + "flos": 544612999680.0, + "grad_norm": 0.07335127222093174, + "language_loss": 0.8601104, + "learning_rate": 0.0006245380566572482, + "loss": 0.87102377, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.28857422, + "step": 2273, + "time_per_iteration": 2.6526312828063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090326, + "balance_loss_mlp": 1.06200182, + "epoch": 0.43747595228934205, + "flos": 746504930304.0, + "grad_norm": 0.06592567136619501, + "language_loss": 0.76039565, + "learning_rate": 0.0006242363087863744, + "loss": 0.77129889, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.28344727, + "step": 2274, + "time_per_iteration": 2.9512767791748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089474, + "balance_loss_mlp": 1.06129336, + "epoch": 0.43766833397460564, + "flos": 631060789248.0, + "grad_norm": 0.07045204489750885, + "language_loss": 0.86392975, + "learning_rate": 0.0006239345126826878, + "loss": 0.87482452, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.28198242, + "step": 2275, + "time_per_iteration": 2.818574905395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081719, + "balance_loss_mlp": 1.05236995, + "epoch": 0.43786071565986917, + "flos": 530709152256.0, + "grad_norm": 0.06271142699552738, + "language_loss": 0.8405596, + "learning_rate": 0.0006236326684633561, + "loss": 0.85137677, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.29296875, + "step": 2276, + "time_per_iteration": 2.8501060009002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088499, + "balance_loss_mlp": 1.05972195, + "epoch": 0.43805309734513276, + "flos": 538295473152.0, + "grad_norm": 0.08224081940065299, + "language_loss": 0.75057948, + "learning_rate": 0.0006233307762455658, + "loss": 0.76146448, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.28735352, + "step": 2277, + "time_per_iteration": 2.6692187786102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079787, + "balance_loss_mlp": 1.05098617, + "epoch": 0.4382454790303963, + "flos": 864188324352.0, + "grad_norm": 0.1351794781054828, + "language_loss": 0.83103114, + "learning_rate": 0.0006230288361465216, + "loss": 0.84182906, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.2878418, + "step": 2278, + "time_per_iteration": 3.0566518306732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081672, + "balance_loss_mlp": 1.05389631, + "epoch": 0.4384378607156599, + "flos": 765175292928.0, + "grad_norm": 0.0635725084076576, + "language_loss": 0.85047072, + "learning_rate": 0.0006227268482834473, + "loss": 0.86128747, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.27783203, + "step": 2279, + "time_per_iteration": 2.890195608139038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086149, + "balance_loss_mlp": 1.05811095, + "epoch": 0.4386302424009234, + "flos": 668260329984.0, + "grad_norm": 0.06574285370830908, + "language_loss": 0.87371957, + "learning_rate": 0.000622424812773585, + "loss": 0.88458109, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.28076172, + "step": 2280, + "time_per_iteration": 2.820857524871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108464, + "balance_loss_mlp": 1.05698299, + "epoch": 0.438822624086187, + "flos": 484941247488.0, + "grad_norm": 0.08150674529849485, + "language_loss": 0.80050623, + "learning_rate": 0.000622122729734195, + "loss": 0.81135261, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.27685547, + "step": 2281, + "time_per_iteration": 2.5578882694244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090722, + "balance_loss_mlp": 1.06320858, + "epoch": 0.4390150057714506, + "flos": 498959986176.0, + "grad_norm": 0.05652917217777931, + "language_loss": 0.87423271, + "learning_rate": 0.0006218205992825566, + "loss": 0.88513994, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.27539062, + "step": 2282, + "time_per_iteration": 2.6367194652557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087621, + "balance_loss_mlp": 1.05989254, + "epoch": 0.4392073874567141, + "flos": 557937704448.0, + "grad_norm": 0.06387466426791162, + "language_loss": 0.81580615, + "learning_rate": 0.0006215184215359671, + "loss": 0.82668239, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.27758789, + "step": 2283, + "time_per_iteration": 2.7550642490386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109022, + "balance_loss_mlp": 1.06254005, + "epoch": 0.4393997691419777, + "flos": 605028248064.0, + "grad_norm": 0.06853375358246538, + "language_loss": 0.86762869, + "learning_rate": 0.0006212161966117425, + "loss": 0.87853086, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.27709961, + "step": 2284, + "time_per_iteration": 2.7315139770507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093132, + "balance_loss_mlp": 1.06492722, + "epoch": 0.43959215082724123, + "flos": 803812363776.0, + "grad_norm": 0.06833018750237568, + "language_loss": 0.81347001, + "learning_rate": 0.0006209139246272164, + "loss": 0.82440132, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.28222656, + "step": 2285, + "time_per_iteration": 2.997727394104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085597, + "balance_loss_mlp": 1.0573678, + "epoch": 0.4397845325125048, + "flos": 487403080704.0, + "grad_norm": 0.0627571888999813, + "language_loss": 0.81454128, + "learning_rate": 0.0006206116056997421, + "loss": 0.82539719, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.28271484, + "step": 2286, + "time_per_iteration": 2.5523786544799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092851, + "balance_loss_mlp": 1.06512272, + "epoch": 0.43997691419776835, + "flos": 480569020416.0, + "grad_norm": 0.0569936252584843, + "language_loss": 0.82580131, + "learning_rate": 0.0006203092399466892, + "loss": 0.83672982, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.27783203, + "step": 2287, + "time_per_iteration": 2.5256903171539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080971, + "balance_loss_mlp": 1.05317175, + "epoch": 0.44016929588303194, + "flos": 482873702400.0, + "grad_norm": 0.052620788715243595, + "language_loss": 0.85130596, + "learning_rate": 0.0006200068274854473, + "loss": 0.86211562, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.27832031, + "step": 2288, + "time_per_iteration": 2.6666431427001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089786, + "balance_loss_mlp": 1.06108057, + "epoch": 0.4403616775682955, + "flos": 571562155008.0, + "grad_norm": 0.05493211856459023, + "language_loss": 0.85969126, + "learning_rate": 0.0006197043684334229, + "loss": 0.87058908, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.28686523, + "step": 2289, + "time_per_iteration": 2.7558815479278564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093604, + "balance_loss_mlp": 1.0652802, + "epoch": 0.44055405925355906, + "flos": 630563194368.0, + "grad_norm": 0.06713172204070075, + "language_loss": 0.7966578, + "learning_rate": 0.0006194018629080411, + "loss": 0.80759388, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.28344727, + "step": 2290, + "time_per_iteration": 2.7641310691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095567, + "balance_loss_mlp": 1.06721866, + "epoch": 0.44074644093882265, + "flos": 536523291648.0, + "grad_norm": 0.06308142018549157, + "language_loss": 0.81759441, + "learning_rate": 0.0006190993110267451, + "loss": 0.8285501, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.28393555, + "step": 2291, + "time_per_iteration": 2.759451389312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087327, + "balance_loss_mlp": 1.05959892, + "epoch": 0.4409388226240862, + "flos": 462995744256.0, + "grad_norm": 0.0663089643389441, + "language_loss": 0.84395695, + "learning_rate": 0.0006187967129069958, + "loss": 0.85483021, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.27758789, + "step": 2292, + "time_per_iteration": 2.5458216667175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108768, + "balance_loss_mlp": 1.06011844, + "epoch": 0.44113120430934977, + "flos": 565717492224.0, + "grad_norm": 0.05260179709926624, + "language_loss": 0.8707509, + "learning_rate": 0.0006184940686662722, + "loss": 0.88162768, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.27612305, + "step": 2293, + "time_per_iteration": 2.7694880962371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082669, + "balance_loss_mlp": 1.05494058, + "epoch": 0.4413235859946133, + "flos": 543313681920.0, + "grad_norm": 0.055518519655343164, + "language_loss": 0.90020764, + "learning_rate": 0.0006181913784220714, + "loss": 0.91103435, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.27758789, + "step": 2294, + "time_per_iteration": 2.6642205715179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047446, + "balance_loss_mlp": 1.03542924, + "epoch": 0.4415159676798769, + "flos": 1569016465920.0, + "grad_norm": 0.024577707308588242, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81601226, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.12011719, + "step": 2295, + "time_per_iteration": 4.874637842178345 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084239, + "balance_loss_mlp": 1.05665421, + "epoch": 0.4417083493651404, + "flos": 658423171584.0, + "grad_norm": 0.06513424306559527, + "language_loss": 0.79833972, + "learning_rate": 0.0006175858603933146, + "loss": 0.80918217, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.27612305, + "step": 2296, + "time_per_iteration": 2.9130241870880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084408, + "balance_loss_mlp": 1.05665636, + "epoch": 0.441900731050404, + "flos": 740119002624.0, + "grad_norm": 0.06251545633736988, + "language_loss": 0.80774343, + "learning_rate": 0.0006172830328438416, + "loss": 0.81858754, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.27783203, + "step": 2297, + "time_per_iteration": 2.953983783721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082645, + "balance_loss_mlp": 1.05460715, + "epoch": 0.44209311273566754, + "flos": 539153860608.0, + "grad_norm": 0.057534365085963636, + "language_loss": 0.86889625, + "learning_rate": 0.0006169801597610572, + "loss": 0.87972271, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.28051758, + "step": 2298, + "time_per_iteration": 2.7841529846191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087234, + "balance_loss_mlp": 1.05986333, + "epoch": 0.4422854944209311, + "flos": 621335702016.0, + "grad_norm": 0.0717755554401909, + "language_loss": 0.89631718, + "learning_rate": 0.0006166772412625469, + "loss": 0.90718955, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.27416992, + "step": 2299, + "time_per_iteration": 2.7750232219696045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087463, + "balance_loss_mlp": 1.05983019, + "epoch": 0.4424778761061947, + "flos": 658516303872.0, + "grad_norm": 0.06473860012868299, + "language_loss": 0.81551421, + "learning_rate": 0.0006163742774659141, + "loss": 0.82638884, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.27661133, + "step": 2300, + "time_per_iteration": 2.8384482860565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092146, + "balance_loss_mlp": 1.06446528, + "epoch": 0.44267025779145824, + "flos": 568297188864.0, + "grad_norm": 0.0850959758091444, + "language_loss": 0.85627389, + "learning_rate": 0.0006160712684887801, + "loss": 0.86719531, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.27709961, + "step": 2301, + "time_per_iteration": 2.7603278160095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084767, + "balance_loss_mlp": 1.05813527, + "epoch": 0.44286263947672183, + "flos": 496469039616.0, + "grad_norm": 0.053898588417471735, + "language_loss": 0.81867981, + "learning_rate": 0.0006157682144487832, + "loss": 0.82952744, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.2668457, + "step": 2302, + "time_per_iteration": 2.7585275173187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090771, + "balance_loss_mlp": 1.06347191, + "epoch": 0.44305502116198536, + "flos": 609096347136.0, + "grad_norm": 0.05970343490953875, + "language_loss": 0.82821, + "learning_rate": 0.0006154651154635793, + "loss": 0.83911771, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.2734375, + "step": 2303, + "time_per_iteration": 4.252831697463989 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097367, + "balance_loss_mlp": 1.07040215, + "epoch": 0.44324740284724895, + "flos": 470558744064.0, + "grad_norm": 0.05697892496442649, + "language_loss": 0.8468399, + "learning_rate": 0.0006151619716508421, + "loss": 0.85781354, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.27026367, + "step": 2304, + "time_per_iteration": 2.5882937908172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102202, + "balance_loss_mlp": 1.07442617, + "epoch": 0.4434397845325125, + "flos": 578454441984.0, + "grad_norm": 0.06572201075979017, + "language_loss": 0.86751652, + "learning_rate": 0.0006148587831282625, + "loss": 0.87853855, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.27807617, + "step": 2305, + "time_per_iteration": 2.6605563163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052734, + "balance_loss_mlp": 1.04066956, + "epoch": 0.44363216621777607, + "flos": 1495765343232.0, + "grad_norm": 0.01894914693526954, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.802288, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.12060547, + "step": 2306, + "time_per_iteration": 4.910472631454468 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102108, + "balance_loss_mlp": 1.07342601, + "epoch": 0.44382454790303966, + "flos": 477082884096.0, + "grad_norm": 0.06457533715620843, + "language_loss": 0.87372738, + "learning_rate": 0.0006142522724244255, + "loss": 0.88474846, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.28686523, + "step": 2307, + "time_per_iteration": 2.5184578895568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047197, + "balance_loss_mlp": 1.03508484, + "epoch": 0.4440169295883032, + "flos": 1543321548288.0, + "grad_norm": 0.015440750347127817, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.7753191, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.12109375, + "step": 2308, + "time_per_iteration": 4.880531549453735 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104605, + "balance_loss_mlp": 1.07668638, + "epoch": 0.4442093112735668, + "flos": 590789749248.0, + "grad_norm": 0.0625118895390298, + "language_loss": 0.77304882, + "learning_rate": 0.000613645584293942, + "loss": 0.78409487, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.27954102, + "step": 2309, + "time_per_iteration": 2.888929605484009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102667, + "balance_loss_mlp": 1.07522511, + "epoch": 0.4444016929588303, + "flos": 530009326080.0, + "grad_norm": 0.05626484670913178, + "language_loss": 0.82863319, + "learning_rate": 0.0006133421739881185, + "loss": 0.83965981, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.27441406, + "step": 2310, + "time_per_iteration": 2.6770823001861572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098373, + "balance_loss_mlp": 1.06966734, + "epoch": 0.4445940746440939, + "flos": 619947634176.0, + "grad_norm": 0.09114290921538859, + "language_loss": 0.82713985, + "learning_rate": 0.0006130387196789605, + "loss": 0.83812356, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.28686523, + "step": 2311, + "time_per_iteration": 2.7363758087158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110237, + "balance_loss_mlp": 1.07309198, + "epoch": 0.4447864563293574, + "flos": 628782248448.0, + "grad_norm": 0.05056880651601303, + "language_loss": 0.84359384, + "learning_rate": 0.0006127352214842795, + "loss": 0.85461748, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.29272461, + "step": 2312, + "time_per_iteration": 3.0277068614959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095618, + "balance_loss_mlp": 1.06688845, + "epoch": 0.444978838014621, + "flos": 650548841472.0, + "grad_norm": 0.06767648502511064, + "language_loss": 0.85424733, + "learning_rate": 0.0006124316795219041, + "loss": 0.8652035, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.28710938, + "step": 2313, + "time_per_iteration": 2.7824032306671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087884, + "balance_loss_mlp": 1.05996561, + "epoch": 0.44517121969988455, + "flos": 612153289728.0, + "grad_norm": 0.06031488841862457, + "language_loss": 0.8232829, + "learning_rate": 0.0006121280939096794, + "loss": 0.83416176, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.27905273, + "step": 2314, + "time_per_iteration": 2.7414164543151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087621, + "balance_loss_mlp": 1.05901051, + "epoch": 0.44536360138514813, + "flos": 488491402752.0, + "grad_norm": 0.056993316738708576, + "language_loss": 0.8765316, + "learning_rate": 0.000611824464765468, + "loss": 0.88740778, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.28613281, + "step": 2315, + "time_per_iteration": 2.5894503593444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020326, + "balance_loss_mlp": 1.00830936, + "epoch": 0.4455559830704117, + "flos": 1515425255424.0, + "grad_norm": 0.018109298143921163, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79615265, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.12011719, + "step": 2316, + "time_per_iteration": 4.654959201812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081165, + "balance_loss_mlp": 1.05322254, + "epoch": 0.44574836475567525, + "flos": 615314949120.0, + "grad_norm": 0.05658516719934989, + "language_loss": 0.85440743, + "learning_rate": 0.000611217076352619, + "loss": 0.86521906, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.27978516, + "step": 2317, + "time_per_iteration": 2.8710198402404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086137, + "balance_loss_mlp": 1.05862343, + "epoch": 0.44594074644093884, + "flos": 506070471168.0, + "grad_norm": 0.062250172980488426, + "language_loss": 0.82876933, + "learning_rate": 0.0006109133173197905, + "loss": 0.8396306, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.27539062, + "step": 2318, + "time_per_iteration": 2.7298824787139893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088304, + "balance_loss_mlp": 1.05986071, + "epoch": 0.44613312812620237, + "flos": 726647321088.0, + "grad_norm": 0.0706297628000491, + "language_loss": 0.85633492, + "learning_rate": 0.0006106095152265935, + "loss": 0.8672179, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.28466797, + "step": 2319, + "time_per_iteration": 2.8895695209503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108895, + "balance_loss_mlp": 1.06086433, + "epoch": 0.44632550981146596, + "flos": 635419869696.0, + "grad_norm": 0.04876785494191262, + "language_loss": 0.84747481, + "learning_rate": 0.0006103056701909739, + "loss": 0.85836434, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.28125, + "step": 2320, + "time_per_iteration": 2.9117228984832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108858, + "balance_loss_mlp": 1.05935025, + "epoch": 0.4465178914967295, + "flos": 826690447872.0, + "grad_norm": 0.06765559983355682, + "language_loss": 0.82841372, + "learning_rate": 0.0006100017823308956, + "loss": 0.8392995, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.29199219, + "step": 2321, + "time_per_iteration": 3.19189453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095794, + "balance_loss_mlp": 1.06618226, + "epoch": 0.4467102731819931, + "flos": 665532246528.0, + "grad_norm": 0.07493928757304909, + "language_loss": 0.796121, + "learning_rate": 0.0006096978517643377, + "loss": 0.80707896, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.29589844, + "step": 2322, + "time_per_iteration": 2.7803642749786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088319, + "balance_loss_mlp": 1.05825448, + "epoch": 0.4469026548672566, + "flos": 512692125696.0, + "grad_norm": 0.05979787162997368, + "language_loss": 0.83128643, + "learning_rate": 0.0006093938786092968, + "loss": 0.84216964, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.30029297, + "step": 2323, + "time_per_iteration": 2.6324985027313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084456, + "balance_loss_mlp": 1.05403399, + "epoch": 0.4470950365525202, + "flos": 683774825472.0, + "grad_norm": 0.0696967897289199, + "language_loss": 0.89752465, + "learning_rate": 0.0006090898629837857, + "loss": 0.90836924, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.30395508, + "step": 2324, + "time_per_iteration": 2.833986282348633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081367, + "balance_loss_mlp": 1.05073011, + "epoch": 0.4472874182377838, + "flos": 627018831360.0, + "grad_norm": 0.05715713314103227, + "language_loss": 0.87296605, + "learning_rate": 0.0006087858050058337, + "loss": 0.88377976, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.3059082, + "step": 2325, + "time_per_iteration": 2.8220982551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082075, + "balance_loss_mlp": 1.05084252, + "epoch": 0.4474797999230473, + "flos": 546946795008.0, + "grad_norm": 0.06405768205874736, + "language_loss": 0.82704103, + "learning_rate": 0.0006084817047934866, + "loss": 0.83786178, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.31225586, + "step": 2326, + "time_per_iteration": 2.6844918727874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077775, + "balance_loss_mlp": 1.04635119, + "epoch": 0.4476721816083109, + "flos": 455585513472.0, + "grad_norm": 0.06718825176833507, + "language_loss": 0.89515507, + "learning_rate": 0.0006081775624648066, + "loss": 0.90593284, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.31396484, + "step": 2327, + "time_per_iteration": 2.5115904808044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080341, + "balance_loss_mlp": 1.04937041, + "epoch": 0.44786456329357444, + "flos": 481273228800.0, + "grad_norm": 0.06388622036462539, + "language_loss": 0.82659936, + "learning_rate": 0.0006078733781378721, + "loss": 0.83740276, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.30957031, + "step": 2328, + "time_per_iteration": 2.5578174591064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107055, + "balance_loss_mlp": 1.04003251, + "epoch": 0.448056944978838, + "flos": 551822409216.0, + "grad_norm": 0.05909371510774122, + "language_loss": 0.82426572, + "learning_rate": 0.0006075691519307781, + "loss": 0.83497119, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.3046875, + "step": 2329, + "time_per_iteration": 2.9271137714385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071371, + "balance_loss_mlp": 1.04025745, + "epoch": 0.44824932666410156, + "flos": 550571143680.0, + "grad_norm": 0.0899878860138525, + "language_loss": 0.81604564, + "learning_rate": 0.0006072648839616356, + "loss": 0.8267594, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.31103516, + "step": 2330, + "time_per_iteration": 2.642164945602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069213, + "balance_loss_mlp": 1.03805184, + "epoch": 0.44844170834936514, + "flos": 988161541632.0, + "grad_norm": 0.05660389796161562, + "language_loss": 0.82544589, + "learning_rate": 0.0006069605743485718, + "loss": 0.83613807, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.3112793, + "step": 2331, + "time_per_iteration": 3.3559155464172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107864, + "balance_loss_mlp": 1.04945791, + "epoch": 0.44863409003462873, + "flos": 591040032768.0, + "grad_norm": 0.06166347857347268, + "language_loss": 0.83528912, + "learning_rate": 0.0006066562232097303, + "loss": 0.84607553, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.29125977, + "step": 2332, + "time_per_iteration": 2.7531135082244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107678, + "balance_loss_mlp": 1.0468111, + "epoch": 0.44882647171989226, + "flos": 724313525760.0, + "grad_norm": 0.0526351904833897, + "language_loss": 0.86127633, + "learning_rate": 0.0006063518306632708, + "loss": 0.87204421, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.29907227, + "step": 2333, + "time_per_iteration": 2.957057476043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080344, + "balance_loss_mlp": 1.05044627, + "epoch": 0.44901885340515585, + "flos": 534662360064.0, + "grad_norm": 0.07121293699241546, + "language_loss": 0.82098341, + "learning_rate": 0.0006060473968273688, + "loss": 0.83178687, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.29882812, + "step": 2334, + "time_per_iteration": 2.687427043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050724, + "balance_loss_mlp": 1.03756309, + "epoch": 0.4492112350904194, + "flos": 1554456462336.0, + "grad_norm": 0.03308553204338399, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.78930265, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.13183594, + "step": 2335, + "time_per_iteration": 4.873494625091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027356, + "balance_loss_mlp": 1.01476717, + "epoch": 0.44940361677568297, + "flos": 1522525413888.0, + "grad_norm": 0.020404135430742085, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82032573, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.12597656, + "step": 2336, + "time_per_iteration": 4.8493242263793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091959, + "balance_loss_mlp": 1.06327689, + "epoch": 0.4495959984609465, + "flos": 382289310720.0, + "grad_norm": 0.08823378464345366, + "language_loss": 0.8815735, + "learning_rate": 0.0006051338487650047, + "loss": 0.89249313, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.28686523, + "step": 2337, + "time_per_iteration": 2.4994585514068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094323, + "balance_loss_mlp": 1.06595135, + "epoch": 0.4497883801462101, + "flos": 497630145024.0, + "grad_norm": 0.058014135330130424, + "language_loss": 0.82146972, + "learning_rate": 0.0006048292509534095, + "loss": 0.83241296, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.28344727, + "step": 2338, + "time_per_iteration": 2.6184592247009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099625, + "balance_loss_mlp": 1.07211113, + "epoch": 0.4499807618314736, + "flos": 614166990336.0, + "grad_norm": 0.056454767026620875, + "language_loss": 0.77617335, + "learning_rate": 0.0006045246124434895, + "loss": 0.78716958, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.27539062, + "step": 2339, + "time_per_iteration": 2.7225115299224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100795, + "balance_loss_mlp": 1.07309031, + "epoch": 0.4501731435167372, + "flos": 1005122331648.0, + "grad_norm": 0.09896135571333878, + "language_loss": 0.86173731, + "learning_rate": 0.0006042199333535162, + "loss": 0.87274528, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.27709961, + "step": 2340, + "time_per_iteration": 3.274585008621216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104864, + "balance_loss_mlp": 1.07768369, + "epoch": 0.4503655252020008, + "flos": 820519898112.0, + "grad_norm": 0.05749680267159243, + "language_loss": 0.84251344, + "learning_rate": 0.0006039152138017763, + "loss": 0.85356206, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.27246094, + "step": 2341, + "time_per_iteration": 3.060763359069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105096, + "balance_loss_mlp": 1.07796395, + "epoch": 0.4505579068872643, + "flos": 486113937408.0, + "grad_norm": 0.056134576893582644, + "language_loss": 0.83558077, + "learning_rate": 0.0006036104539065726, + "loss": 0.84663171, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.27172852, + "step": 2342, + "time_per_iteration": 2.7406816482543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108201, + "balance_loss_mlp": 1.08054459, + "epoch": 0.4507502885725279, + "flos": 884421282816.0, + "grad_norm": 0.061859527889038764, + "language_loss": 0.84472108, + "learning_rate": 0.000603305653786223, + "loss": 0.85580313, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.27685547, + "step": 2343, + "time_per_iteration": 3.197312355041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100913, + "balance_loss_mlp": 1.07354283, + "epoch": 0.45094267025779144, + "flos": 578070328320.0, + "grad_norm": 0.054371913691722666, + "language_loss": 0.83979696, + "learning_rate": 0.0006030008135590622, + "loss": 0.85080612, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.27416992, + "step": 2344, + "time_per_iteration": 2.7234296798706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097762, + "balance_loss_mlp": 1.07062995, + "epoch": 0.45113505194305503, + "flos": 525124947456.0, + "grad_norm": 0.05301123134364682, + "language_loss": 0.8020395, + "learning_rate": 0.0006026959333434387, + "loss": 0.81301707, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.27172852, + "step": 2345, + "time_per_iteration": 2.7582781314849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109947, + "balance_loss_mlp": 1.0720278, + "epoch": 0.45132743362831856, + "flos": 501791376384.0, + "grad_norm": 0.056237590740745906, + "language_loss": 0.77273649, + "learning_rate": 0.0006023910132577181, + "loss": 0.78373116, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.2746582, + "step": 2346, + "time_per_iteration": 2.658339262008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086046, + "balance_loss_mlp": 1.05915189, + "epoch": 0.45151981531358215, + "flos": 431690328576.0, + "grad_norm": 0.061957652789735564, + "language_loss": 0.84835315, + "learning_rate": 0.0006020860534202806, + "loss": 0.85921359, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.26953125, + "step": 2347, + "time_per_iteration": 2.5046098232269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010926, + "balance_loss_mlp": 1.06475294, + "epoch": 0.4517121969988457, + "flos": 711826859520.0, + "grad_norm": 0.05205934628014934, + "language_loss": 0.80817962, + "learning_rate": 0.0006017810539495224, + "loss": 0.81910563, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.27905273, + "step": 2348, + "time_per_iteration": 2.9269816875457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094292, + "balance_loss_mlp": 1.06642056, + "epoch": 0.45190457868410927, + "flos": 579197938176.0, + "grad_norm": 0.0701488599790333, + "language_loss": 0.82789373, + "learning_rate": 0.0006014760149638547, + "loss": 0.83883661, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.27880859, + "step": 2349, + "time_per_iteration": 2.725395441055298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085621, + "balance_loss_mlp": 1.05837011, + "epoch": 0.45209696036937286, + "flos": 482415395328.0, + "grad_norm": 0.05676126010630497, + "language_loss": 0.88258755, + "learning_rate": 0.000601170936581704, + "loss": 0.89344376, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.27270508, + "step": 2350, + "time_per_iteration": 2.5604915618896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088839, + "balance_loss_mlp": 1.06101537, + "epoch": 0.4522893420546364, + "flos": 539945409024.0, + "grad_norm": 0.07551987134141444, + "language_loss": 0.84626472, + "learning_rate": 0.0006008658189215121, + "loss": 0.85715318, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.27832031, + "step": 2351, + "time_per_iteration": 2.6299045085906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100125, + "balance_loss_mlp": 1.07158601, + "epoch": 0.4524817237399, + "flos": 496423959552.0, + "grad_norm": 0.07553479525673996, + "language_loss": 0.79898262, + "learning_rate": 0.0006005606621017366, + "loss": 0.80998385, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.28540039, + "step": 2352, + "time_per_iteration": 2.58725905418396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095957, + "balance_loss_mlp": 1.06732249, + "epoch": 0.4526741054251635, + "flos": 652229300736.0, + "grad_norm": 0.05769795994016392, + "language_loss": 0.8022939, + "learning_rate": 0.0006002554662408496, + "loss": 0.81325346, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.28637695, + "step": 2353, + "time_per_iteration": 2.9054527282714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089231, + "balance_loss_mlp": 1.06078792, + "epoch": 0.4528664871104271, + "flos": 570674654208.0, + "grad_norm": 0.07238968138349489, + "language_loss": 0.91292691, + "learning_rate": 0.0005999502314573388, + "loss": 0.92381918, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.28393555, + "step": 2354, + "time_per_iteration": 2.6389734745025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085728, + "balance_loss_mlp": 1.05656958, + "epoch": 0.45305886879569063, + "flos": 458480922624.0, + "grad_norm": 0.0719451372015111, + "language_loss": 0.86045247, + "learning_rate": 0.0005996449578697066, + "loss": 0.87130976, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.29174805, + "step": 2355, + "time_per_iteration": 2.6851072311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094315, + "balance_loss_mlp": 1.06634867, + "epoch": 0.4532512504809542, + "flos": 504922512384.0, + "grad_norm": 0.05612545408526447, + "language_loss": 0.81111002, + "learning_rate": 0.0005993396455964709, + "loss": 0.82205319, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.2800293, + "step": 2356, + "time_per_iteration": 2.6760780811309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095343, + "balance_loss_mlp": 1.06754375, + "epoch": 0.4534436321662178, + "flos": 581940578304.0, + "grad_norm": 0.05702970789361519, + "language_loss": 0.81782162, + "learning_rate": 0.0005990342947561647, + "loss": 0.82877505, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.27856445, + "step": 2357, + "time_per_iteration": 2.6935572624206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108513, + "balance_loss_mlp": 1.07949746, + "epoch": 0.45363601385148133, + "flos": 549458090496.0, + "grad_norm": 0.06168719534303639, + "language_loss": 0.77822679, + "learning_rate": 0.0005987289054673351, + "loss": 0.78931195, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.28979492, + "step": 2358, + "time_per_iteration": 2.6254196166992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191784, + "balance_loss_mlp": 1.18038785, + "epoch": 0.4538283955367449, + "flos": 1473754411008.0, + "grad_norm": 0.06020491976481073, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77767521, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.11376953, + "step": 2359, + "time_per_iteration": 4.803730010986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112502, + "balance_loss_mlp": 1.08300948, + "epoch": 0.45402077722200845, + "flos": 584441699328.0, + "grad_norm": 0.06904936924963041, + "language_loss": 0.90802431, + "learning_rate": 0.0005981180120183722, + "loss": 0.91914928, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.29443359, + "step": 2360, + "time_per_iteration": 2.672501564025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115124, + "balance_loss_mlp": 1.08560812, + "epoch": 0.45421315890727204, + "flos": 531462822912.0, + "grad_norm": 0.18994365983189826, + "language_loss": 0.85107553, + "learning_rate": 0.0005978125080954089, + "loss": 0.86222672, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.29492188, + "step": 2361, + "time_per_iteration": 2.7426631450653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111841, + "balance_loss_mlp": 1.0814904, + "epoch": 0.4544055405925356, + "flos": 784890307584.0, + "grad_norm": 0.07946717837388541, + "language_loss": 0.76933616, + "learning_rate": 0.000597506966198262, + "loss": 0.78045452, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.30297852, + "step": 2362, + "time_per_iteration": 2.9498252868652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113617, + "balance_loss_mlp": 1.08438706, + "epoch": 0.45459792227779916, + "flos": 517950443520.0, + "grad_norm": 0.08220053414262748, + "language_loss": 0.83964276, + "learning_rate": 0.0005972013864455536, + "loss": 0.85077894, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.29199219, + "step": 2363, + "time_per_iteration": 2.623084545135498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112763, + "balance_loss_mlp": 1.0844152, + "epoch": 0.4547903039630627, + "flos": 537306075648.0, + "grad_norm": 0.07689777421943021, + "language_loss": 0.84891784, + "learning_rate": 0.0005968957689559203, + "loss": 0.86004549, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.28369141, + "step": 2364, + "time_per_iteration": 4.15172266960144 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103083, + "balance_loss_mlp": 1.07492638, + "epoch": 0.4549826856483263, + "flos": 528423409152.0, + "grad_norm": 0.0791653109712497, + "language_loss": 0.88481373, + "learning_rate": 0.0005965901138480131, + "loss": 0.89584458, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.28173828, + "step": 2365, + "time_per_iteration": 2.5800631046295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097109, + "balance_loss_mlp": 1.06840384, + "epoch": 0.45517506733358987, + "flos": 520649413632.0, + "grad_norm": 0.06578783357270249, + "language_loss": 0.87197572, + "learning_rate": 0.0005962844212404982, + "loss": 0.88294685, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.28686523, + "step": 2366, + "time_per_iteration": 2.6940040588378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091654, + "balance_loss_mlp": 1.06344962, + "epoch": 0.4553674490188534, + "flos": 450814616064.0, + "grad_norm": 0.05998271622094208, + "language_loss": 0.86890531, + "learning_rate": 0.0005959786912520558, + "loss": 0.87982178, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.2824707, + "step": 2367, + "time_per_iteration": 2.65913987159729 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096727, + "balance_loss_mlp": 1.06854558, + "epoch": 0.455559830704117, + "flos": 546308015616.0, + "grad_norm": 0.04792571197867491, + "language_loss": 0.83765805, + "learning_rate": 0.0005956729240013806, + "loss": 0.8486253, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.28173828, + "step": 2368, + "time_per_iteration": 2.8546009063720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108894, + "balance_loss_mlp": 1.08035553, + "epoch": 0.4557522123893805, + "flos": 583491589632.0, + "grad_norm": 0.054790339147135006, + "language_loss": 0.91898453, + "learning_rate": 0.0005953671196071824, + "loss": 0.93007344, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.28540039, + "step": 2369, + "time_per_iteration": 2.7034096717834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115288, + "balance_loss_mlp": 1.08767939, + "epoch": 0.4559445940746441, + "flos": 526149250560.0, + "grad_norm": 0.05736115779957956, + "language_loss": 0.79610699, + "learning_rate": 0.0005950612781881846, + "loss": 0.8072598, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.27636719, + "step": 2370, + "time_per_iteration": 2.707674264907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124856, + "balance_loss_mlp": 1.09662771, + "epoch": 0.45613697575990764, + "flos": 651810281472.0, + "grad_norm": 0.08139155344435882, + "language_loss": 0.75630575, + "learning_rate": 0.0005947553998631259, + "loss": 0.76755428, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.2824707, + "step": 2371, + "time_per_iteration": 2.8811731338500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125619, + "balance_loss_mlp": 1.09770048, + "epoch": 0.4563293574451712, + "flos": 866744699904.0, + "grad_norm": 0.07117752980456016, + "language_loss": 0.79090154, + "learning_rate": 0.000594449484750758, + "loss": 0.80215776, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.27905273, + "step": 2372, + "time_per_iteration": 3.1549901962280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116997, + "balance_loss_mlp": 1.08807683, + "epoch": 0.45652173913043476, + "flos": 497817819648.0, + "grad_norm": 0.061849801440599636, + "language_loss": 0.82697588, + "learning_rate": 0.0005941435329698484, + "loss": 0.83814585, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.2890625, + "step": 2373, + "time_per_iteration": 2.6593072414398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118584, + "balance_loss_mlp": 1.09054554, + "epoch": 0.45671412081569834, + "flos": 560581420032.0, + "grad_norm": 0.06278217801879041, + "language_loss": 0.83130741, + "learning_rate": 0.0005938375446391778, + "loss": 0.8424933, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.28051758, + "step": 2374, + "time_per_iteration": 2.7434608936309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124198, + "balance_loss_mlp": 1.09563541, + "epoch": 0.45690650250096193, + "flos": 502873906176.0, + "grad_norm": 0.06820583935841042, + "language_loss": 0.89043015, + "learning_rate": 0.0005935315198775415, + "loss": 0.90167212, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.28540039, + "step": 2375, + "time_per_iteration": 2.6057205200195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113332, + "balance_loss_mlp": 1.08558059, + "epoch": 0.45709888418622546, + "flos": 430473968640.0, + "grad_norm": 0.07601718344596131, + "language_loss": 0.87262166, + "learning_rate": 0.0005932254588037486, + "loss": 0.88375497, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.27783203, + "step": 2376, + "time_per_iteration": 2.4881751537323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103499, + "balance_loss_mlp": 1.07462692, + "epoch": 0.45729126587148905, + "flos": 525395579904.0, + "grad_norm": 0.07182864232109534, + "language_loss": 0.86405516, + "learning_rate": 0.000592919361536623, + "loss": 0.87509012, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.28857422, + "step": 2377, + "time_per_iteration": 2.6453545093536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101831, + "balance_loss_mlp": 1.07376885, + "epoch": 0.4574836475567526, + "flos": 637717349376.0, + "grad_norm": 0.06032083182665244, + "language_loss": 0.88920552, + "learning_rate": 0.0005926132281950017, + "loss": 0.90022385, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.28076172, + "step": 2378, + "time_per_iteration": 2.7356886863708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096599, + "balance_loss_mlp": 1.0672735, + "epoch": 0.45767602924201617, + "flos": 649288811520.0, + "grad_norm": 0.07556174313152972, + "language_loss": 0.8485238, + "learning_rate": 0.0005923070588977367, + "loss": 0.8594898, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.29248047, + "step": 2379, + "time_per_iteration": 2.812110185623169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095202, + "balance_loss_mlp": 1.0665921, + "epoch": 0.4578684109272797, + "flos": 746356543488.0, + "grad_norm": 0.0597594421207511, + "language_loss": 0.86065739, + "learning_rate": 0.0005920008537636931, + "loss": 0.87160945, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.28613281, + "step": 2380, + "time_per_iteration": 2.8955793380737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094751, + "balance_loss_mlp": 1.06518722, + "epoch": 0.4580607926125433, + "flos": 641155433472.0, + "grad_norm": 0.08202954174104495, + "language_loss": 0.86535549, + "learning_rate": 0.0005916946129117504, + "loss": 0.87630302, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.29516602, + "step": 2381, + "time_per_iteration": 2.8850152492523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089485, + "balance_loss_mlp": 1.05958724, + "epoch": 0.4582531742978069, + "flos": 801513474048.0, + "grad_norm": 0.06022733145419036, + "language_loss": 0.80483937, + "learning_rate": 0.0005913883364608017, + "loss": 0.81573421, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.29833984, + "step": 2382, + "time_per_iteration": 3.0977792739868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092347, + "balance_loss_mlp": 1.06225872, + "epoch": 0.4584455559830704, + "flos": 683991613440.0, + "grad_norm": 0.07912283694355432, + "language_loss": 0.88849449, + "learning_rate": 0.0005910820245297542, + "loss": 0.899418, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.30053711, + "step": 2383, + "time_per_iteration": 2.905977964401245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081098, + "balance_loss_mlp": 1.05055714, + "epoch": 0.458637937668334, + "flos": 517902391296.0, + "grad_norm": 0.06971122212551431, + "language_loss": 0.810808, + "learning_rate": 0.000590775677237529, + "loss": 0.82161897, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.30517578, + "step": 2384, + "time_per_iteration": 2.7233986854553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078055, + "balance_loss_mlp": 1.04810929, + "epoch": 0.4588303193535975, + "flos": 505242607104.0, + "grad_norm": 0.10145803635005178, + "language_loss": 0.79860461, + "learning_rate": 0.0005904692947030601, + "loss": 0.80938518, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.29882812, + "step": 2385, + "time_per_iteration": 2.602890729904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076489, + "balance_loss_mlp": 1.04647207, + "epoch": 0.4590227010388611, + "flos": 495655732224.0, + "grad_norm": 0.08299143875661358, + "language_loss": 0.89372921, + "learning_rate": 0.0005901628770452963, + "loss": 0.90449417, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.29956055, + "step": 2386, + "time_per_iteration": 2.56011700630188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075782, + "balance_loss_mlp": 1.04586029, + "epoch": 0.45921508272412465, + "flos": 493375781376.0, + "grad_norm": 0.05953614440228025, + "language_loss": 0.87499726, + "learning_rate": 0.000589856424383199, + "loss": 0.88575506, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.29882812, + "step": 2387, + "time_per_iteration": 2.622857093811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078237, + "balance_loss_mlp": 1.04762435, + "epoch": 0.45940746440938823, + "flos": 691096306176.0, + "grad_norm": 0.06461384040637212, + "language_loss": 0.8283028, + "learning_rate": 0.000589549936835744, + "loss": 0.83908516, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.30566406, + "step": 2388, + "time_per_iteration": 2.9280176162719727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082083, + "balance_loss_mlp": 1.0514698, + "epoch": 0.45959984609465176, + "flos": 503489364480.0, + "grad_norm": 0.07025219360641571, + "language_loss": 0.79160953, + "learning_rate": 0.0005892434145219202, + "loss": 0.80243033, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.30566406, + "step": 2389, + "time_per_iteration": 2.632772207260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081464, + "balance_loss_mlp": 1.050946, + "epoch": 0.45979222777991535, + "flos": 676339863552.0, + "grad_norm": 0.060348492919292666, + "language_loss": 0.82535923, + "learning_rate": 0.0005889368575607303, + "loss": 0.83617389, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.3046875, + "step": 2390, + "time_per_iteration": 2.815487861633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094579, + "balance_loss_mlp": 1.06358492, + "epoch": 0.45998460946517894, + "flos": 777308368896.0, + "grad_norm": 0.05491617941274289, + "language_loss": 0.78348118, + "learning_rate": 0.00058863026607119, + "loss": 0.79442704, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.30957031, + "step": 2391, + "time_per_iteration": 3.0853166580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092193, + "balance_loss_mlp": 1.0620811, + "epoch": 0.46017699115044247, + "flos": 851073053184.0, + "grad_norm": 0.05825671270919626, + "language_loss": 0.79661655, + "learning_rate": 0.0005883236401723287, + "loss": 0.80753851, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.30078125, + "step": 2392, + "time_per_iteration": 3.1643104553222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096169, + "balance_loss_mlp": 1.06536531, + "epoch": 0.46036937283570606, + "flos": 575608495104.0, + "grad_norm": 0.06457998167472197, + "language_loss": 0.84046978, + "learning_rate": 0.0005880169799831893, + "loss": 0.85143149, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.30761719, + "step": 2393, + "time_per_iteration": 2.6935391426086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096173, + "balance_loss_mlp": 1.0654645, + "epoch": 0.4605617545209696, + "flos": 611553798144.0, + "grad_norm": 0.06354744392782355, + "language_loss": 0.81838334, + "learning_rate": 0.0005877102856228278, + "loss": 0.82934511, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.30664062, + "step": 2394, + "time_per_iteration": 2.8314805030822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097821, + "balance_loss_mlp": 1.06713629, + "epoch": 0.4607541362062332, + "flos": 532884386304.0, + "grad_norm": 0.0665210460005036, + "language_loss": 0.84696203, + "learning_rate": 0.0005874035572103133, + "loss": 0.8579402, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.30664062, + "step": 2395, + "time_per_iteration": 2.6893725395202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098408, + "balance_loss_mlp": 1.0673902, + "epoch": 0.4609465178914967, + "flos": 647023417344.0, + "grad_norm": 0.1082823786036068, + "language_loss": 0.82554322, + "learning_rate": 0.0005870967948647288, + "loss": 0.83652729, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.30981445, + "step": 2396, + "time_per_iteration": 2.7625200748443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191183, + "balance_loss_mlp": 1.1745894, + "epoch": 0.4611388995767603, + "flos": 1465487202816.0, + "grad_norm": 0.05861502253959749, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75499487, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.16601562, + "step": 2397, + "time_per_iteration": 5.363407850265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090965, + "balance_loss_mlp": 1.06028056, + "epoch": 0.46133128126202383, + "flos": 722772688896.0, + "grad_norm": 0.08876233940236913, + "language_loss": 0.85477209, + "learning_rate": 0.0005864831688507443, + "loss": 0.86568171, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.30639648, + "step": 2398, + "time_per_iteration": 2.9619805812835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081398, + "balance_loss_mlp": 1.05119061, + "epoch": 0.4615236629472874, + "flos": 547735371264.0, + "grad_norm": 0.06931834879873142, + "language_loss": 0.75342947, + "learning_rate": 0.0005861763054205754, + "loss": 0.76424348, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.30151367, + "step": 2399, + "time_per_iteration": 2.7531988620758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091818, + "balance_loss_mlp": 1.06213522, + "epoch": 0.461716044632551, + "flos": 601942192128.0, + "grad_norm": 0.05751461156756605, + "language_loss": 0.80467141, + "learning_rate": 0.0005858694085337976, + "loss": 0.81558955, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.29614258, + "step": 2400, + "time_per_iteration": 2.814182758331299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083104, + "balance_loss_mlp": 1.05246735, + "epoch": 0.46190842631781454, + "flos": 474236937216.0, + "grad_norm": 0.07664119673877032, + "language_loss": 0.8354007, + "learning_rate": 0.0005855624783095589, + "loss": 0.8462317, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.30615234, + "step": 2401, + "time_per_iteration": 2.57083797454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083538, + "balance_loss_mlp": 1.05414128, + "epoch": 0.4621008080030781, + "flos": 437254184448.0, + "grad_norm": 0.06712435829168825, + "language_loss": 0.85380065, + "learning_rate": 0.00058525551486702, + "loss": 0.864636, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.29370117, + "step": 2402, + "time_per_iteration": 2.554870843887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084683, + "balance_loss_mlp": 1.05476141, + "epoch": 0.46229318968834165, + "flos": 525203523072.0, + "grad_norm": 0.06447976336023753, + "language_loss": 0.80940902, + "learning_rate": 0.0005849485183253548, + "loss": 0.82025588, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.29882812, + "step": 2403, + "time_per_iteration": 2.6398868560791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108489, + "balance_loss_mlp": 1.05546916, + "epoch": 0.46248557137360524, + "flos": 439395922944.0, + "grad_norm": 0.07099246909711197, + "language_loss": 0.87546206, + "learning_rate": 0.0005846414888037501, + "loss": 0.88631094, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.29345703, + "step": 2404, + "time_per_iteration": 2.5056095123291016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086728, + "balance_loss_mlp": 1.05725932, + "epoch": 0.4626779530588688, + "flos": 617318475264.0, + "grad_norm": 0.052798237228442416, + "language_loss": 0.82345319, + "learning_rate": 0.0005843344264214049, + "loss": 0.83432049, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.29443359, + "step": 2405, + "time_per_iteration": 2.7549078464508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091326, + "balance_loss_mlp": 1.06176221, + "epoch": 0.46287033474413236, + "flos": 669796784640.0, + "grad_norm": 0.05337180485738099, + "language_loss": 0.84920704, + "learning_rate": 0.0005840273312975317, + "loss": 0.8601203, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.29516602, + "step": 2406, + "time_per_iteration": 2.9058027267456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085122, + "balance_loss_mlp": 1.05577278, + "epoch": 0.46306271642939595, + "flos": 479992849920.0, + "grad_norm": 0.05333458165520064, + "language_loss": 0.89626014, + "learning_rate": 0.0005837202035513555, + "loss": 0.90711135, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.29345703, + "step": 2407, + "time_per_iteration": 2.5721802711486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094311, + "balance_loss_mlp": 1.06531978, + "epoch": 0.4632550981146595, + "flos": 580395359232.0, + "grad_norm": 0.0552743160267319, + "language_loss": 0.81124538, + "learning_rate": 0.0005834130433021136, + "loss": 0.8221885, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.28930664, + "step": 2408, + "time_per_iteration": 2.7402079105377197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109161, + "balance_loss_mlp": 1.06166446, + "epoch": 0.46344747979992307, + "flos": 523701974016.0, + "grad_norm": 0.09526074365649402, + "language_loss": 0.73246038, + "learning_rate": 0.0005831058506690563, + "loss": 0.74337649, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.29931641, + "step": 2409, + "time_per_iteration": 2.6229617595672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088655, + "balance_loss_mlp": 1.05875707, + "epoch": 0.4636398614851866, + "flos": 746174661120.0, + "grad_norm": 0.061078353708003665, + "language_loss": 0.85864687, + "learning_rate": 0.0005827986257714464, + "loss": 0.86953342, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.29858398, + "step": 2410, + "time_per_iteration": 2.9352338314056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094131, + "balance_loss_mlp": 1.06404257, + "epoch": 0.4638322431704502, + "flos": 596273619456.0, + "grad_norm": 0.05695764594036898, + "language_loss": 0.88375425, + "learning_rate": 0.0005824913687285591, + "loss": 0.89469558, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.30078125, + "step": 2411, + "time_per_iteration": 2.6807737350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097526, + "balance_loss_mlp": 1.06698477, + "epoch": 0.4640246248557137, + "flos": 539172799488.0, + "grad_norm": 0.0643729084989199, + "language_loss": 0.81849819, + "learning_rate": 0.0005821840796596821, + "loss": 0.82947344, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.30493164, + "step": 2412, + "time_per_iteration": 2.663177967071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096211, + "balance_loss_mlp": 1.0657649, + "epoch": 0.4642170065409773, + "flos": 562330280448.0, + "grad_norm": 0.07601159389817994, + "language_loss": 0.80307502, + "learning_rate": 0.0005818767586841158, + "loss": 0.81403708, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.30419922, + "step": 2413, + "time_per_iteration": 2.7600111961364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092616, + "balance_loss_mlp": 1.06233692, + "epoch": 0.46440938822624084, + "flos": 530684421120.0, + "grad_norm": 0.059484167412089096, + "language_loss": 0.86110759, + "learning_rate": 0.0005815694059211726, + "loss": 0.87203372, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.30249023, + "step": 2414, + "time_per_iteration": 2.65578031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148176, + "balance_loss_mlp": 1.13263142, + "epoch": 0.4646017699115044, + "flos": 1525503780864.0, + "grad_norm": 0.0462911781552321, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.82021809, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.15527344, + "step": 2415, + "time_per_iteration": 4.8046934604644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115513, + "balance_loss_mlp": 1.10092187, + "epoch": 0.464794151596768, + "flos": 1539999765504.0, + "grad_norm": 0.038481348382240925, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.78060573, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.14550781, + "step": 2416, + "time_per_iteration": 4.977246999740601 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085994, + "balance_loss_mlp": 1.05554748, + "epoch": 0.46498653328203154, + "flos": 501200649216.0, + "grad_norm": 0.07046148078843767, + "language_loss": 0.85802382, + "learning_rate": 0.0005806471581013931, + "loss": 0.86888373, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.30395508, + "step": 2417, + "time_per_iteration": 2.7680604457855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084029, + "balance_loss_mlp": 1.05363095, + "epoch": 0.46517891496729513, + "flos": 675856825344.0, + "grad_norm": 0.061868019756872866, + "language_loss": 0.78540701, + "learning_rate": 0.0005803396793823146, + "loss": 0.7962473, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.30371094, + "step": 2418, + "time_per_iteration": 2.818821430206299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081583, + "balance_loss_mlp": 1.05213845, + "epoch": 0.46537129665255866, + "flos": 585062949888.0, + "grad_norm": 0.08069009721002836, + "language_loss": 0.8594386, + "learning_rate": 0.0005800321694726065, + "loss": 0.8702544, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.29418945, + "step": 2419, + "time_per_iteration": 2.812563896179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085011, + "balance_loss_mlp": 1.05454159, + "epoch": 0.46556367833782225, + "flos": 587425858560.0, + "grad_norm": 0.061646313113324705, + "language_loss": 0.86883628, + "learning_rate": 0.0005797246284916545, + "loss": 0.87968636, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.30444336, + "step": 2420, + "time_per_iteration": 2.6945559978485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_mlp": 1.02332675, + "epoch": 0.4657560600230858, + "flos": 1484674099200.0, + "grad_norm": 0.024509703594541715, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78539675, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.11181641, + "step": 2421, + "time_per_iteration": 5.001375436782837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089527, + "balance_loss_mlp": 1.06036878, + "epoch": 0.46594844170834937, + "flos": 579961783296.0, + "grad_norm": 0.07023208249232396, + "language_loss": 0.8781141, + "learning_rate": 0.0005791094537936233, + "loss": 0.88900936, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.29150391, + "step": 2422, + "time_per_iteration": 2.703678846359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010888, + "balance_loss_mlp": 1.06028509, + "epoch": 0.4661408233936129, + "flos": 512322568704.0, + "grad_norm": 0.06283657209164231, + "language_loss": 0.817285, + "learning_rate": 0.0005788018203153762, + "loss": 0.82817304, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.28515625, + "step": 2423, + "time_per_iteration": 2.6398653984069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081237, + "balance_loss_mlp": 1.05255485, + "epoch": 0.4663332050788765, + "flos": 490839754752.0, + "grad_norm": 0.0646507393923986, + "language_loss": 0.85720015, + "learning_rate": 0.000578494156243549, + "loss": 0.86801249, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.28686523, + "step": 2424, + "time_per_iteration": 2.6061441898345947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086736, + "balance_loss_mlp": 1.05695724, + "epoch": 0.4665255867641401, + "flos": 512353092096.0, + "grad_norm": 0.05149395612804314, + "language_loss": 0.89174867, + "learning_rate": 0.0005781864616975878, + "loss": 0.90261602, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.29736328, + "step": 2425, + "time_per_iteration": 2.7073817253112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087687, + "balance_loss_mlp": 1.05917215, + "epoch": 0.4667179684494036, + "flos": 424590018048.0, + "grad_norm": 0.0742004751674347, + "language_loss": 0.84101117, + "learning_rate": 0.0005778787367969502, + "loss": 0.85188806, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.28515625, + "step": 2426, + "time_per_iteration": 2.643342971801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082589, + "balance_loss_mlp": 1.05374038, + "epoch": 0.4669103501346672, + "flos": 707640897024.0, + "grad_norm": 0.05195761556147334, + "language_loss": 0.80815637, + "learning_rate": 0.0005775709816611053, + "loss": 0.81898224, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.28857422, + "step": 2427, + "time_per_iteration": 3.0103423595428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085111, + "balance_loss_mlp": 1.05604792, + "epoch": 0.4671027318199307, + "flos": 554554874880.0, + "grad_norm": 0.05192902090033842, + "language_loss": 0.83742678, + "learning_rate": 0.0005772631964095346, + "loss": 0.84827781, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.29003906, + "step": 2428, + "time_per_iteration": 4.2191994190216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010894, + "balance_loss_mlp": 1.06107569, + "epoch": 0.4672951135051943, + "flos": 566839309824.0, + "grad_norm": 0.05894584384100732, + "language_loss": 0.85613596, + "learning_rate": 0.000576955381161731, + "loss": 0.86702996, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.28320312, + "step": 2429, + "time_per_iteration": 2.7035927772521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082394, + "balance_loss_mlp": 1.05297327, + "epoch": 0.46748749519045785, + "flos": 424294654464.0, + "grad_norm": 0.07711305585297333, + "language_loss": 0.8606714, + "learning_rate": 0.0005766475360371985, + "loss": 0.87149525, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.29394531, + "step": 2430, + "time_per_iteration": 2.5702948570251465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092231, + "balance_loss_mlp": 1.06292963, + "epoch": 0.46767987687572143, + "flos": 538088859648.0, + "grad_norm": 0.08342834969675962, + "language_loss": 0.84959614, + "learning_rate": 0.0005763396611554536, + "loss": 0.86051846, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.29248047, + "step": 2431, + "time_per_iteration": 2.6236841678619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092277, + "balance_loss_mlp": 1.06383383, + "epoch": 0.467872258560985, + "flos": 823360052736.0, + "grad_norm": 0.06223220956170435, + "language_loss": 0.80269897, + "learning_rate": 0.0005760317566360237, + "loss": 0.81362176, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.28466797, + "step": 2432, + "time_per_iteration": 3.0205023288726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084277, + "balance_loss_mlp": 1.0559535, + "epoch": 0.46806464024624855, + "flos": 661366632960.0, + "grad_norm": 0.058294757950733474, + "language_loss": 0.85130137, + "learning_rate": 0.000575723822598448, + "loss": 0.86214417, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.28295898, + "step": 2433, + "time_per_iteration": 2.79516339302063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086726, + "balance_loss_mlp": 1.05866385, + "epoch": 0.46825702193151214, + "flos": 755362865664.0, + "grad_norm": 0.06256497191901454, + "language_loss": 0.81601393, + "learning_rate": 0.0005754158591622773, + "loss": 0.82688123, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.28076172, + "step": 2434, + "time_per_iteration": 2.963247537612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092504, + "balance_loss_mlp": 1.06365538, + "epoch": 0.4684494036167757, + "flos": 439164578304.0, + "grad_norm": 0.08333045297400817, + "language_loss": 0.8228929, + "learning_rate": 0.0005751078664470732, + "loss": 0.83381796, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.28833008, + "step": 2435, + "time_per_iteration": 2.537179470062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085014, + "balance_loss_mlp": 1.05688024, + "epoch": 0.46864178530203926, + "flos": 532446428160.0, + "grad_norm": 0.08080859282065189, + "language_loss": 0.85670036, + "learning_rate": 0.0005747998445724094, + "loss": 0.86755049, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.28125, + "step": 2436, + "time_per_iteration": 2.6276183128356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083485, + "balance_loss_mlp": 1.05466008, + "epoch": 0.4688341669873028, + "flos": 576328670208.0, + "grad_norm": 0.08810611044699188, + "language_loss": 0.89099967, + "learning_rate": 0.0005744917936578707, + "loss": 0.90183449, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.28808594, + "step": 2437, + "time_per_iteration": 2.784236431121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085755, + "balance_loss_mlp": 1.05690634, + "epoch": 0.4690265486725664, + "flos": 539296455168.0, + "grad_norm": 0.08777270325229546, + "language_loss": 0.83928555, + "learning_rate": 0.0005741837138230526, + "loss": 0.85014307, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.28808594, + "step": 2438, + "time_per_iteration": 2.7139840126037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078469, + "balance_loss_mlp": 1.05014467, + "epoch": 0.4692189303578299, + "flos": 770168770560.0, + "grad_norm": 0.053438427497709357, + "language_loss": 0.86270201, + "learning_rate": 0.0005738756051875627, + "loss": 0.87348676, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.28295898, + "step": 2439, + "time_per_iteration": 3.092337131500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074485, + "balance_loss_mlp": 1.04551697, + "epoch": 0.4694113120430935, + "flos": 571118404608.0, + "grad_norm": 0.056335724754341315, + "language_loss": 0.83459938, + "learning_rate": 0.0005735674678710192, + "loss": 0.84534419, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.28930664, + "step": 2440, + "time_per_iteration": 2.6729819774627686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107755, + "balance_loss_mlp": 1.0473665, + "epoch": 0.4696036937283571, + "flos": 748498281984.0, + "grad_norm": 0.06862136292067082, + "language_loss": 0.80992246, + "learning_rate": 0.0005732593019930517, + "loss": 0.82069802, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.30126953, + "step": 2441, + "time_per_iteration": 2.917332649230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078244, + "balance_loss_mlp": 1.04779828, + "epoch": 0.4697960754136206, + "flos": 493208455680.0, + "grad_norm": 0.06788307957029095, + "language_loss": 0.8767302, + "learning_rate": 0.0005729511076733008, + "loss": 0.88751262, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.30395508, + "step": 2442, + "time_per_iteration": 2.6602578163146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108041, + "balance_loss_mlp": 1.05003536, + "epoch": 0.4699884570988842, + "flos": 724809710592.0, + "grad_norm": 0.08414136163770505, + "language_loss": 0.84802854, + "learning_rate": 0.000572642885031418, + "loss": 0.85883266, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.30322266, + "step": 2443, + "time_per_iteration": 2.924572706222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075591, + "balance_loss_mlp": 1.04516852, + "epoch": 0.47018083878414774, + "flos": 555141219840.0, + "grad_norm": 0.055800438037163856, + "language_loss": 0.80518812, + "learning_rate": 0.0005723346341870662, + "loss": 0.81594402, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.30371094, + "step": 2444, + "time_per_iteration": 2.7203280925750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082669, + "balance_loss_mlp": 1.05217505, + "epoch": 0.4703732204694113, + "flos": 423846521856.0, + "grad_norm": 0.06929087535104682, + "language_loss": 0.86297798, + "learning_rate": 0.0005720263552599188, + "loss": 0.87380457, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.30444336, + "step": 2445, + "time_per_iteration": 2.469621419906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075882, + "balance_loss_mlp": 1.0456984, + "epoch": 0.47056560215467486, + "flos": 703179919872.0, + "grad_norm": 0.06843850090218344, + "language_loss": 0.79142129, + "learning_rate": 0.0005717180483696604, + "loss": 0.80218005, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.30151367, + "step": 2446, + "time_per_iteration": 2.9089763164520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072219, + "balance_loss_mlp": 1.04034209, + "epoch": 0.47075798383993844, + "flos": 554701851648.0, + "grad_norm": 0.07381367232784701, + "language_loss": 0.83118802, + "learning_rate": 0.0005714097136359862, + "loss": 0.84191024, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.31860352, + "step": 2447, + "time_per_iteration": 2.6346585750579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079268, + "balance_loss_mlp": 1.04817808, + "epoch": 0.470950365525202, + "flos": 564009329664.0, + "grad_norm": 0.06979677359463858, + "language_loss": 0.86324209, + "learning_rate": 0.0005711013511786027, + "loss": 0.87403476, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.31054688, + "step": 2448, + "time_per_iteration": 2.765740156173706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073046, + "balance_loss_mlp": 1.0426712, + "epoch": 0.47114274721046556, + "flos": 534189496320.0, + "grad_norm": 0.048536468835106476, + "language_loss": 0.84014428, + "learning_rate": 0.0005707929611172263, + "loss": 0.85087478, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.3034668, + "step": 2449, + "time_per_iteration": 2.6891775131225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074493, + "balance_loss_mlp": 1.04349887, + "epoch": 0.47133512889572915, + "flos": 472877982720.0, + "grad_norm": 0.05569215031080998, + "language_loss": 0.83788037, + "learning_rate": 0.000570484543571585, + "loss": 0.84862536, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.30957031, + "step": 2450, + "time_per_iteration": 2.545646905899048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076975, + "balance_loss_mlp": 1.04743469, + "epoch": 0.4715275105809927, + "flos": 458776286208.0, + "grad_norm": 0.06210999897734131, + "language_loss": 0.82771122, + "learning_rate": 0.0005701760986614171, + "loss": 0.83848095, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.29492188, + "step": 2451, + "time_per_iteration": 2.5739784240722656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080958, + "balance_loss_mlp": 1.05256283, + "epoch": 0.47171989226625627, + "flos": 421783358976.0, + "grad_norm": 0.06034093462601522, + "language_loss": 0.87343812, + "learning_rate": 0.0005698676265064714, + "loss": 0.88424772, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.28393555, + "step": 2452, + "time_per_iteration": 2.5456669330596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085544, + "balance_loss_mlp": 1.05612302, + "epoch": 0.4719122739515198, + "flos": 457200543744.0, + "grad_norm": 0.12010658803535784, + "language_loss": 0.88854802, + "learning_rate": 0.0005695591272265074, + "loss": 0.89940351, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.29370117, + "step": 2453, + "time_per_iteration": 2.53247332572937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086025, + "balance_loss_mlp": 1.05610394, + "epoch": 0.4721046556367834, + "flos": 514716000768.0, + "grad_norm": 0.06319040539886057, + "language_loss": 0.81670743, + "learning_rate": 0.0005692506009412954, + "loss": 0.8275677, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.29907227, + "step": 2454, + "time_per_iteration": 2.663959503173828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157874, + "balance_loss_mlp": 1.14423668, + "epoch": 0.4722970373220469, + "flos": 1571399723520.0, + "grad_norm": 0.046124065416459865, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78709137, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.13671875, + "step": 2455, + "time_per_iteration": 4.937524795532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085858, + "balance_loss_mlp": 1.05603182, + "epoch": 0.4724894190073105, + "flos": 585919927296.0, + "grad_norm": 0.07174058927835297, + "language_loss": 0.89622641, + "learning_rate": 0.0005686334678342593, + "loss": 0.907085, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.2980957, + "step": 2456, + "time_per_iteration": 2.9060487747192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077496, + "balance_loss_mlp": 1.04824257, + "epoch": 0.4726818006925741, + "flos": 867290347008.0, + "grad_norm": 0.07069871267474889, + "language_loss": 0.81667411, + "learning_rate": 0.0005683248612520274, + "loss": 0.82744908, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.29223633, + "step": 2457, + "time_per_iteration": 3.071544885635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084366, + "balance_loss_mlp": 1.05465865, + "epoch": 0.4728741823778376, + "flos": 752653721088.0, + "grad_norm": 0.07071545002601118, + "language_loss": 0.83683658, + "learning_rate": 0.0005680162281437321, + "loss": 0.84768021, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.296875, + "step": 2458, + "time_per_iteration": 2.931579113006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077685, + "balance_loss_mlp": 1.0476439, + "epoch": 0.4730665640631012, + "flos": 538301265408.0, + "grad_norm": 0.06018673388195985, + "language_loss": 0.84837544, + "learning_rate": 0.000567707568629195, + "loss": 0.85915226, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.30004883, + "step": 2459, + "time_per_iteration": 2.6860852241516113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079226, + "balance_loss_mlp": 1.04968619, + "epoch": 0.47325894574836475, + "flos": 491396986368.0, + "grad_norm": 0.053752412093893094, + "language_loss": 0.82513988, + "learning_rate": 0.0005673988828282486, + "loss": 0.83593214, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.29467773, + "step": 2460, + "time_per_iteration": 2.6679980754852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073438, + "balance_loss_mlp": 1.04320669, + "epoch": 0.47345132743362833, + "flos": 764117494272.0, + "grad_norm": 0.05735836881189746, + "language_loss": 0.80829632, + "learning_rate": 0.0005670901708607352, + "loss": 0.81903076, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.30175781, + "step": 2461, + "time_per_iteration": 2.962364673614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076898, + "balance_loss_mlp": 1.04635668, + "epoch": 0.47364370911889186, + "flos": 539925060096.0, + "grad_norm": 0.06660215000338995, + "language_loss": 0.84026098, + "learning_rate": 0.0005667814328465076, + "loss": 0.85102999, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.30493164, + "step": 2462, + "time_per_iteration": 2.6148030757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077856, + "balance_loss_mlp": 1.04824424, + "epoch": 0.47383609080415545, + "flos": 406002613248.0, + "grad_norm": 0.0820641824195461, + "language_loss": 0.81702316, + "learning_rate": 0.0005664726689054285, + "loss": 0.8278017, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.29541016, + "step": 2463, + "time_per_iteration": 2.46337628364563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079984, + "balance_loss_mlp": 1.04910851, + "epoch": 0.474028472489419, + "flos": 453237161472.0, + "grad_norm": 0.07270387927239072, + "language_loss": 0.81341946, + "learning_rate": 0.0005661638791573704, + "loss": 0.82421935, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.30859375, + "step": 2464, + "time_per_iteration": 2.712188720703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084787, + "balance_loss_mlp": 1.05453193, + "epoch": 0.47422085417468257, + "flos": 491923694592.0, + "grad_norm": 0.05714322793938323, + "language_loss": 0.87222457, + "learning_rate": 0.0005658550637222164, + "loss": 0.88307238, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.30224609, + "step": 2465, + "time_per_iteration": 2.63380765914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082927, + "balance_loss_mlp": 1.05298185, + "epoch": 0.47441323585994616, + "flos": 738537467904.0, + "grad_norm": 0.06339144108901118, + "language_loss": 0.82493532, + "learning_rate": 0.0005655462227198592, + "loss": 0.83576465, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.29907227, + "step": 2466, + "time_per_iteration": 2.910783290863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084848, + "balance_loss_mlp": 1.0547595, + "epoch": 0.4746056175452097, + "flos": 484439270400.0, + "grad_norm": 0.05460968765214119, + "language_loss": 0.83975738, + "learning_rate": 0.0005652373562702016, + "loss": 0.85060585, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.30053711, + "step": 2467, + "time_per_iteration": 2.6101505756378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081, + "balance_loss_mlp": 1.05072081, + "epoch": 0.4747979992304733, + "flos": 460814717952.0, + "grad_norm": 0.06618054462006194, + "language_loss": 0.88145614, + "learning_rate": 0.000564928464493156, + "loss": 0.89226621, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.30249023, + "step": 2468, + "time_per_iteration": 2.55812668800354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081635, + "balance_loss_mlp": 1.05247641, + "epoch": 0.4749903809157368, + "flos": 864070460928.0, + "grad_norm": 0.06741069565287812, + "language_loss": 0.81633413, + "learning_rate": 0.000564619547508645, + "loss": 0.82715052, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.29150391, + "step": 2469, + "time_per_iteration": 3.1341404914855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082878, + "balance_loss_mlp": 1.05252695, + "epoch": 0.4751827626010004, + "flos": 505296451584.0, + "grad_norm": 0.0651779420020333, + "language_loss": 0.83088791, + "learning_rate": 0.0005643106054366008, + "loss": 0.84171665, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.30297852, + "step": 2470, + "time_per_iteration": 2.610891342163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076033, + "balance_loss_mlp": 1.04666018, + "epoch": 0.47537514428626393, + "flos": 559123540992.0, + "grad_norm": 0.0714119485898344, + "language_loss": 0.79053152, + "learning_rate": 0.000564001638396965, + "loss": 0.80129188, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.29321289, + "step": 2471, + "time_per_iteration": 2.7754971981048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083604, + "balance_loss_mlp": 1.05430186, + "epoch": 0.4755675259715275, + "flos": 833907211776.0, + "grad_norm": 0.05565021284268994, + "language_loss": 0.8203246, + "learning_rate": 0.0005636926465096897, + "loss": 0.83116066, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.29248047, + "step": 2472, + "time_per_iteration": 3.028235912322998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079414, + "balance_loss_mlp": 1.05116105, + "epoch": 0.47575990765679105, + "flos": 507989629440.0, + "grad_norm": 0.06838176056824781, + "language_loss": 0.87627274, + "learning_rate": 0.0005633836298947363, + "loss": 0.8870669, + "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.28271484, + "step": 2473, + "time_per_iteration": 2.609142303466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.04901338, + "epoch": 0.47595228934205464, + "flos": 591566740992.0, + "grad_norm": 0.06111056533479294, + "language_loss": 0.70809621, + "learning_rate": 0.000563074588672075, + "loss": 0.71887386, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.28759766, + "step": 2474, + "time_per_iteration": 2.722593069076538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079389, + "balance_loss_mlp": 1.05080247, + "epoch": 0.4761446710273182, + "flos": 580340104704.0, + "grad_norm": 0.06296236889432077, + "language_loss": 0.85321903, + "learning_rate": 0.0005627655229616868, + "loss": 0.8640129, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.28540039, + "step": 2475, + "time_per_iteration": 2.711296558380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081174, + "balance_loss_mlp": 1.05141973, + "epoch": 0.47633705271258175, + "flos": 672597651456.0, + "grad_norm": 0.06122384611792148, + "language_loss": 0.89890903, + "learning_rate": 0.0005624564328835616, + "loss": 0.90972078, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.29736328, + "step": 2476, + "time_per_iteration": 2.796614408493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079016, + "balance_loss_mlp": 1.05069184, + "epoch": 0.47652943439784534, + "flos": 541580788224.0, + "grad_norm": 0.05962569805242902, + "language_loss": 0.84079456, + "learning_rate": 0.0005621473185576986, + "loss": 0.85158479, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.28344727, + "step": 2477, + "time_per_iteration": 2.7140815258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086205, + "balance_loss_mlp": 1.05709434, + "epoch": 0.4767218160831089, + "flos": 524563333632.0, + "grad_norm": 0.07093607725441804, + "language_loss": 0.87060082, + "learning_rate": 0.0005618381801041068, + "loss": 0.88146281, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.29077148, + "step": 2478, + "time_per_iteration": 2.596932888031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085469, + "balance_loss_mlp": 1.05638218, + "epoch": 0.47691419776837246, + "flos": 567789419520.0, + "grad_norm": 0.07057707739429774, + "language_loss": 0.83022285, + "learning_rate": 0.0005615290176428044, + "loss": 0.84107757, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.29052734, + "step": 2479, + "time_per_iteration": 2.6407430171966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108759, + "balance_loss_mlp": 1.05828834, + "epoch": 0.477106579453636, + "flos": 530659689984.0, + "grad_norm": 0.06449831218896054, + "language_loss": 0.85197705, + "learning_rate": 0.0005612198312938187, + "loss": 0.86285299, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.29296875, + "step": 2480, + "time_per_iteration": 2.7345011234283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108973, + "balance_loss_mlp": 1.06121504, + "epoch": 0.4772989611388996, + "flos": 593980521984.0, + "grad_norm": 0.060218704260060575, + "language_loss": 0.79185855, + "learning_rate": 0.0005609106211771868, + "loss": 0.80275583, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.28540039, + "step": 2481, + "time_per_iteration": 2.8754329681396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088266, + "balance_loss_mlp": 1.05908394, + "epoch": 0.4774913428241631, + "flos": 544352541696.0, + "grad_norm": 0.07327776648741448, + "language_loss": 0.89180911, + "learning_rate": 0.0005606013874129543, + "loss": 0.90269172, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.29199219, + "step": 2482, + "time_per_iteration": 2.7726404666900635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090058, + "balance_loss_mlp": 1.06049454, + "epoch": 0.4776837245094267, + "flos": 539817371136.0, + "grad_norm": 0.06456332848164101, + "language_loss": 0.79976207, + "learning_rate": 0.0005602921301211768, + "loss": 0.81066263, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.29516602, + "step": 2483, + "time_per_iteration": 2.715306043624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089436, + "balance_loss_mlp": 1.06132603, + "epoch": 0.4778761061946903, + "flos": 471543759360.0, + "grad_norm": 0.07998801300028703, + "language_loss": 0.82180744, + "learning_rate": 0.0005599828494219185, + "loss": 0.83270174, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.28100586, + "step": 2484, + "time_per_iteration": 2.5683019161224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086424, + "balance_loss_mlp": 1.05836201, + "epoch": 0.4780684878799538, + "flos": 725769994752.0, + "grad_norm": 0.06543459725570545, + "language_loss": 0.88914174, + "learning_rate": 0.0005596735454352527, + "loss": 0.90000606, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.28076172, + "step": 2485, + "time_per_iteration": 2.8615424633026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083119, + "balance_loss_mlp": 1.05531943, + "epoch": 0.4782608695652174, + "flos": 548665132032.0, + "grad_norm": 0.07228586186756063, + "language_loss": 0.85170126, + "learning_rate": 0.0005593642182812619, + "loss": 0.8625325, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.27856445, + "step": 2486, + "time_per_iteration": 2.6507115364074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084435, + "balance_loss_mlp": 1.0574224, + "epoch": 0.47845325125048094, + "flos": 829555333632.0, + "grad_norm": 0.06671866930909515, + "language_loss": 0.83972216, + "learning_rate": 0.0005590548680800378, + "loss": 0.85056645, + "num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.27050781, + "step": 2487, + "time_per_iteration": 3.0963587760925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085422, + "balance_loss_mlp": 1.05755091, + "epoch": 0.4786456329357445, + "flos": 513889546752.0, + "grad_norm": 0.0627787894989405, + "language_loss": 0.7639966, + "learning_rate": 0.0005587454949516804, + "loss": 0.77485085, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.27880859, + "step": 2488, + "time_per_iteration": 2.704761266708374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085753, + "balance_loss_mlp": 1.05719018, + "epoch": 0.47883801462100806, + "flos": 564392033280.0, + "grad_norm": 0.07191070894190046, + "language_loss": 0.87996674, + "learning_rate": 0.0005584360990162993, + "loss": 0.89082426, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.28540039, + "step": 2489, + "time_per_iteration": 2.68680477142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108742, + "balance_loss_mlp": 1.05921531, + "epoch": 0.47903039630627164, + "flos": 579296862720.0, + "grad_norm": 0.052754850289178916, + "language_loss": 0.85114515, + "learning_rate": 0.0005581266803940124, + "loss": 0.86201936, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.28222656, + "step": 2490, + "time_per_iteration": 2.7187392711639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091263, + "balance_loss_mlp": 1.06322539, + "epoch": 0.47922277799153523, + "flos": 618667255296.0, + "grad_norm": 0.061347112520969346, + "language_loss": 0.87164974, + "learning_rate": 0.0005578172392049471, + "loss": 0.8825624, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.28051758, + "step": 2491, + "time_per_iteration": 2.7291457653045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089047, + "balance_loss_mlp": 1.06048441, + "epoch": 0.47941515967679876, + "flos": 639352728576.0, + "grad_norm": 0.07263845202824909, + "language_loss": 0.84244549, + "learning_rate": 0.0005575077755692386, + "loss": 0.85333598, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.28564453, + "step": 2492, + "time_per_iteration": 2.8026599884033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080078, + "balance_loss_mlp": 1.05246925, + "epoch": 0.47960754136206235, + "flos": 519561091584.0, + "grad_norm": 0.0504022340685432, + "language_loss": 0.85800493, + "learning_rate": 0.0005571982896070316, + "loss": 0.86880577, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.27612305, + "step": 2493, + "time_per_iteration": 2.655550003051758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080752, + "balance_loss_mlp": 1.05266619, + "epoch": 0.4797999230473259, + "flos": 474798551040.0, + "grad_norm": 0.11668407926682704, + "language_loss": 0.89753431, + "learning_rate": 0.0005568887814384792, + "loss": 0.90834183, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.28100586, + "step": 2494, + "time_per_iteration": 2.5966434478759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080843, + "balance_loss_mlp": 1.05337763, + "epoch": 0.47999230473258947, + "flos": 531766950912.0, + "grad_norm": 0.058142169565221447, + "language_loss": 0.87224984, + "learning_rate": 0.000556579251183743, + "loss": 0.88305831, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.27490234, + "step": 2495, + "time_per_iteration": 2.6536028385162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080101, + "balance_loss_mlp": 1.05089474, + "epoch": 0.480184686417853, + "flos": 601207460352.0, + "grad_norm": 0.06356237967295801, + "language_loss": 0.7994827, + "learning_rate": 0.0005562696989629936, + "loss": 0.81028366, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.29174805, + "step": 2496, + "time_per_iteration": 2.691530466079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082561, + "balance_loss_mlp": 1.05328333, + "epoch": 0.4803770681031166, + "flos": 527931606528.0, + "grad_norm": 0.07544069195311896, + "language_loss": 0.82662058, + "learning_rate": 0.0005559601248964095, + "loss": 0.83744615, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.29223633, + "step": 2497, + "time_per_iteration": 2.687108278274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078067, + "balance_loss_mlp": 1.04931426, + "epoch": 0.4805694497883801, + "flos": 510934500864.0, + "grad_norm": 0.07160134617119021, + "language_loss": 0.85915172, + "learning_rate": 0.0005556505291041783, + "loss": 0.86993241, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.28735352, + "step": 2498, + "time_per_iteration": 2.7002923488616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081443, + "balance_loss_mlp": 1.05264211, + "epoch": 0.4807618314736437, + "flos": 600027416064.0, + "grad_norm": 0.21407023754506424, + "language_loss": 0.84214193, + "learning_rate": 0.0005553409117064954, + "loss": 0.85295641, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.2878418, + "step": 2499, + "time_per_iteration": 2.877713203430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096264, + "balance_loss_mlp": 1.06824946, + "epoch": 0.4809542131589073, + "flos": 568700241408.0, + "grad_norm": 0.06103635462331165, + "language_loss": 0.84855151, + "learning_rate": 0.0005550312728235654, + "loss": 0.85951412, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.28051758, + "step": 2500, + "time_per_iteration": 2.716524362564087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094238, + "balance_loss_mlp": 1.06610465, + "epoch": 0.4811465948441708, + "flos": 575703037440.0, + "grad_norm": 0.07633647670380422, + "language_loss": 0.83599609, + "learning_rate": 0.0005547216125756003, + "loss": 0.84693843, + "num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.28125, + "step": 2501, + "time_per_iteration": 2.8102688789367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097276, + "balance_loss_mlp": 1.06899917, + "epoch": 0.4813389765294344, + "flos": 823508439552.0, + "grad_norm": 0.05816521463755192, + "language_loss": 0.81801546, + "learning_rate": 0.0005544119310828211, + "loss": 0.82898819, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.28295898, + "step": 2502, + "time_per_iteration": 3.09083890914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110256, + "balance_loss_mlp": 1.08162141, + "epoch": 0.48153135821469795, + "flos": 635240959488.0, + "grad_norm": 0.07468975257849066, + "language_loss": 0.84463918, + "learning_rate": 0.0005541022284654568, + "loss": 0.85574174, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.28613281, + "step": 2503, + "time_per_iteration": 2.959812641143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105243, + "balance_loss_mlp": 1.07613182, + "epoch": 0.48172373989996153, + "flos": 503450076672.0, + "grad_norm": 0.06287004960739773, + "language_loss": 0.83878344, + "learning_rate": 0.0005537925048437446, + "loss": 0.84983587, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.29077148, + "step": 2504, + "time_per_iteration": 2.5965919494628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113897, + "balance_loss_mlp": 1.12542796, + "epoch": 0.48191612158522507, + "flos": 1531563821568.0, + "grad_norm": 0.039351692623908835, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76890433, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.13574219, + "step": 2505, + "time_per_iteration": 4.965132713317871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112409, + "balance_loss_mlp": 1.08420432, + "epoch": 0.48210850327048865, + "flos": 702078451200.0, + "grad_norm": 0.06703534425937603, + "language_loss": 0.88412756, + "learning_rate": 0.0005531729950682664, + "loss": 0.89525163, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.28198242, + "step": 2506, + "time_per_iteration": 3.032463550567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107907, + "balance_loss_mlp": 1.07936859, + "epoch": 0.4823008849557522, + "flos": 439548691968.0, + "grad_norm": 0.08139997578259908, + "language_loss": 0.84598732, + "learning_rate": 0.000552863209155015, + "loss": 0.85706639, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.28564453, + "step": 2507, + "time_per_iteration": 2.501650333404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101488, + "balance_loss_mlp": 1.07285357, + "epoch": 0.48249326664101577, + "flos": 471622334976.0, + "grad_norm": 0.06119014713123412, + "language_loss": 0.81909472, + "learning_rate": 0.0005525534027184461, + "loss": 0.83010966, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.28637695, + "step": 2508, + "time_per_iteration": 2.5787370204925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098365, + "balance_loss_mlp": 1.06942117, + "epoch": 0.48268564832627936, + "flos": 562954503168.0, + "grad_norm": 0.05313984540081721, + "language_loss": 0.82654703, + "learning_rate": 0.0005522435758788365, + "loss": 0.83753073, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.28930664, + "step": 2509, + "time_per_iteration": 2.7109761238098145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010953, + "balance_loss_mlp": 1.06730938, + "epoch": 0.4828780300115429, + "flos": 629298782208.0, + "grad_norm": 0.05877851050813853, + "language_loss": 0.80259538, + "learning_rate": 0.0005519337287564721, + "loss": 0.81354833, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.2800293, + "step": 2510, + "time_per_iteration": 2.8329310417175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109601, + "balance_loss_mlp": 1.06759048, + "epoch": 0.4830704116968065, + "flos": 631562766336.0, + "grad_norm": 0.060327319620096846, + "language_loss": 0.83688086, + "learning_rate": 0.000551623861471646, + "loss": 0.84784102, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.28417969, + "step": 2511, + "time_per_iteration": 2.7470946311950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100715, + "balance_loss_mlp": 1.08784056, + "epoch": 0.48326279338207, + "flos": 1568434503168.0, + "grad_norm": 0.03397215547055983, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79919541, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.12890625, + "step": 2512, + "time_per_iteration": 4.837340593338013 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095094, + "balance_loss_mlp": 1.06619751, + "epoch": 0.4834551750673336, + "flos": 508989201408.0, + "grad_norm": 0.059215268588021376, + "language_loss": 0.86540532, + "learning_rate": 0.0005510040668958211, + "loss": 0.87635624, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.2890625, + "step": 2513, + "time_per_iteration": 2.5706045627593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107614, + "balance_loss_mlp": 1.06364644, + "epoch": 0.48364755675259713, + "flos": 1527875453952.0, + "grad_norm": 0.0265804362292035, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78836721, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.12451172, + "step": 2514, + "time_per_iteration": 4.899883508682251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108816, + "balance_loss_mlp": 1.0589062, + "epoch": 0.4838399384378607, + "flos": 564726684672.0, + "grad_norm": 0.05909251781800444, + "language_loss": 0.83435559, + "learning_rate": 0.0005503841931138645, + "loss": 0.84523714, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.29272461, + "step": 2515, + "time_per_iteration": 2.665804386138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089973, + "balance_loss_mlp": 1.06112456, + "epoch": 0.4840323201231243, + "flos": 387479227392.0, + "grad_norm": 0.06787127022085944, + "language_loss": 0.81963372, + "learning_rate": 0.0005500742268214025, + "loss": 0.8305335, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.28833008, + "step": 2516, + "time_per_iteration": 2.5123801231384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084372, + "balance_loss_mlp": 1.05487967, + "epoch": 0.48422470180838784, + "flos": 630701406720.0, + "grad_norm": 0.05799188255481874, + "language_loss": 0.85305762, + "learning_rate": 0.0005497642410884014, + "loss": 0.86390138, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.29492188, + "step": 2517, + "time_per_iteration": 2.818969249725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107799, + "balance_loss_mlp": 1.04907012, + "epoch": 0.4844170834936514, + "flos": 498955603968.0, + "grad_norm": 0.0575391439282783, + "language_loss": 0.85093868, + "learning_rate": 0.0005494542360352085, + "loss": 0.8617186, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.28881836, + "step": 2518, + "time_per_iteration": 2.654691457748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081359, + "balance_loss_mlp": 1.05220056, + "epoch": 0.48460946517891496, + "flos": 550798106112.0, + "grad_norm": 0.06803778984218942, + "language_loss": 0.85824656, + "learning_rate": 0.0005491442117821783, + "loss": 0.86906004, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.29125977, + "step": 2519, + "time_per_iteration": 2.703547954559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081545, + "balance_loss_mlp": 1.0510273, + "epoch": 0.48480184686417854, + "flos": 529123235328.0, + "grad_norm": 0.12066852374350216, + "language_loss": 0.87487119, + "learning_rate": 0.0005488341684496732, + "loss": 0.88568664, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.3046875, + "step": 2520, + "time_per_iteration": 2.6539435386657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107692, + "balance_loss_mlp": 1.04757047, + "epoch": 0.4849942285494421, + "flos": 531630148608.0, + "grad_norm": 0.05745701253476237, + "language_loss": 0.91846752, + "learning_rate": 0.0005485241061580624, + "loss": 0.92923677, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.29296875, + "step": 2521, + "time_per_iteration": 2.775069236755371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080446, + "balance_loss_mlp": 1.04995275, + "epoch": 0.48518661023470566, + "flos": 722231424000.0, + "grad_norm": 0.05822253141450555, + "language_loss": 0.84573066, + "learning_rate": 0.0005482140250277228, + "loss": 0.8565352, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.3046875, + "step": 2522, + "time_per_iteration": 2.9740066528320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082604, + "balance_loss_mlp": 1.05306387, + "epoch": 0.4853789919199692, + "flos": 505843508736.0, + "grad_norm": 0.06368999588379491, + "language_loss": 0.87678063, + "learning_rate": 0.0005479039251790387, + "loss": 0.88760674, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.29492188, + "step": 2523, + "time_per_iteration": 2.6360013484954834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086656, + "balance_loss_mlp": 1.05666256, + "epoch": 0.4855713736052328, + "flos": 660185178624.0, + "grad_norm": 0.060153636482772124, + "language_loss": 0.84925246, + "learning_rate": 0.0005475938067324014, + "loss": 0.8601191, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.29956055, + "step": 2524, + "time_per_iteration": 2.8053042888641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084682, + "balance_loss_mlp": 1.05542803, + "epoch": 0.48576375529049637, + "flos": 436727476224.0, + "grad_norm": 0.059684937302366806, + "language_loss": 0.83693206, + "learning_rate": 0.0005472836698082098, + "loss": 0.84777892, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.29199219, + "step": 2525, + "time_per_iteration": 2.513991355895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085056, + "balance_loss_mlp": 1.05587339, + "epoch": 0.4859561369757599, + "flos": 581424044544.0, + "grad_norm": 0.059033754749834536, + "language_loss": 0.84245414, + "learning_rate": 0.0005469735145268694, + "loss": 0.85330468, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.29174805, + "step": 2526, + "time_per_iteration": 2.758964776992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085929, + "balance_loss_mlp": 1.05712819, + "epoch": 0.4861485186610235, + "flos": 487723175424.0, + "grad_norm": 0.05692033512559974, + "language_loss": 0.80668163, + "learning_rate": 0.0005466633410087933, + "loss": 0.81754094, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.28808594, + "step": 2527, + "time_per_iteration": 2.7483773231506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_mlp": 1.01712215, + "epoch": 0.486340900346287, + "flos": 1556893564416.0, + "grad_norm": 0.02025241925229164, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78289819, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.11865234, + "step": 2528, + "time_per_iteration": 4.8671183586120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084286, + "balance_loss_mlp": 1.05558062, + "epoch": 0.4865332820315506, + "flos": 482760221184.0, + "grad_norm": 0.060917910127877034, + "language_loss": 0.88050807, + "learning_rate": 0.0005460429397441214, + "loss": 0.89135092, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.28662109, + "step": 2529, + "time_per_iteration": 2.5488078594207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083512, + "balance_loss_mlp": 1.05416238, + "epoch": 0.48672566371681414, + "flos": 535548450816.0, + "grad_norm": 0.06933582049293556, + "language_loss": 0.86551011, + "learning_rate": 0.0005457327122383866, + "loss": 0.87634516, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.29321289, + "step": 2530, + "time_per_iteration": 2.6199238300323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018983, + "balance_loss_mlp": 1.00711012, + "epoch": 0.4869180454020777, + "flos": 1411876901376.0, + "grad_norm": 0.01657901033031013, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75655472, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.11865234, + "step": 2531, + "time_per_iteration": 4.810813665390015 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086562, + "balance_loss_mlp": 1.05754662, + "epoch": 0.48711042708734126, + "flos": 572836741632.0, + "grad_norm": 0.0731565805542322, + "language_loss": 0.75476754, + "learning_rate": 0.0005451122040823244, + "loss": 0.76563311, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.28979492, + "step": 2532, + "time_per_iteration": 2.7834720611572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083988, + "balance_loss_mlp": 1.0543766, + "epoch": 0.48730280877260485, + "flos": 626231665152.0, + "grad_norm": 0.05844807259880667, + "language_loss": 0.7683785, + "learning_rate": 0.0005448019236728997, + "loss": 0.77921844, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.29589844, + "step": 2533, + "time_per_iteration": 2.9007680416107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108612, + "balance_loss_mlp": 1.05789077, + "epoch": 0.48749519045786843, + "flos": 512233818624.0, + "grad_norm": 0.06352012335970622, + "language_loss": 0.84519851, + "learning_rate": 0.0005444916258698255, + "loss": 0.85605973, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.2824707, + "step": 2534, + "time_per_iteration": 2.6479434967041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083901, + "balance_loss_mlp": 1.05450428, + "epoch": 0.48768757214313196, + "flos": 525149678592.0, + "grad_norm": 0.06527387606118956, + "language_loss": 0.85987055, + "learning_rate": 0.0005441813107935704, + "loss": 0.8707096, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.29370117, + "step": 2535, + "time_per_iteration": 2.657701253890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082228, + "balance_loss_mlp": 1.05359387, + "epoch": 0.48787995382839555, + "flos": 504784300032.0, + "grad_norm": 0.05960574003717953, + "language_loss": 0.85425317, + "learning_rate": 0.0005438709785646091, + "loss": 0.86507541, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.28637695, + "step": 2536, + "time_per_iteration": 2.5686872005462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081582, + "balance_loss_mlp": 1.05197084, + "epoch": 0.4880723355136591, + "flos": 574904286720.0, + "grad_norm": 0.0674154398441342, + "language_loss": 0.86857444, + "learning_rate": 0.0005435606293034234, + "loss": 0.87939024, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.29589844, + "step": 2537, + "time_per_iteration": 2.6792654991149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108176, + "balance_loss_mlp": 1.05334091, + "epoch": 0.48826471719892267, + "flos": 561172147200.0, + "grad_norm": 0.1079718501079392, + "language_loss": 0.85096419, + "learning_rate": 0.0005432502631305016, + "loss": 0.86178184, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.28417969, + "step": 2538, + "time_per_iteration": 2.6790173053741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082462, + "balance_loss_mlp": 1.05366075, + "epoch": 0.4884570988841862, + "flos": 725849980416.0, + "grad_norm": 0.270667674808598, + "language_loss": 0.83102262, + "learning_rate": 0.0005429398801663386, + "loss": 0.84184724, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.28808594, + "step": 2539, + "time_per_iteration": 2.9468812942504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074127, + "balance_loss_mlp": 1.04453969, + "epoch": 0.4886494805694498, + "flos": 430794063360.0, + "grad_norm": 0.06499376102514318, + "language_loss": 0.82999051, + "learning_rate": 0.0005426294805314355, + "loss": 0.8407318, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.29541016, + "step": 2540, + "time_per_iteration": 4.142840623855591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076947, + "balance_loss_mlp": 1.04685867, + "epoch": 0.4888418622547134, + "flos": 672673254912.0, + "grad_norm": 0.055782244803189183, + "language_loss": 0.80130786, + "learning_rate": 0.0005423190643463003, + "loss": 0.81207728, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.30053711, + "step": 2541, + "time_per_iteration": 2.972822427749634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107237, + "balance_loss_mlp": 1.04237723, + "epoch": 0.4890342439399769, + "flos": 541639014912.0, + "grad_norm": 0.07101662394817357, + "language_loss": 0.83088171, + "learning_rate": 0.0005420086317314473, + "loss": 0.84160542, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.29956055, + "step": 2542, + "time_per_iteration": 2.651425838470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072796, + "balance_loss_mlp": 1.04180098, + "epoch": 0.4892266256252405, + "flos": 590380904448.0, + "grad_norm": 0.06479627692425034, + "language_loss": 0.81022084, + "learning_rate": 0.0005416981828073971, + "loss": 0.82094878, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.30957031, + "step": 2543, + "time_per_iteration": 2.775273323059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111363, + "balance_loss_mlp": 1.09922981, + "epoch": 0.48941900731050403, + "flos": 1515460008960.0, + "grad_norm": 0.045109342737372694, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78228641, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.14355469, + "step": 2544, + "time_per_iteration": 4.819438219070435 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069232, + "balance_loss_mlp": 1.0383091, + "epoch": 0.4896113889957676, + "flos": 470327399424.0, + "grad_norm": 0.07868028775989613, + "language_loss": 0.85065794, + "learning_rate": 0.000541077236513819, + "loss": 0.86135024, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.30883789, + "step": 2545, + "time_per_iteration": 2.5191094875335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071718, + "balance_loss_mlp": 1.03981793, + "epoch": 0.48980377068103115, + "flos": 496310478336.0, + "grad_norm": 0.07130550478628667, + "language_loss": 0.82089663, + "learning_rate": 0.0005407667393853638, + "loss": 0.83161378, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.31884766, + "step": 2546, + "time_per_iteration": 2.617934465408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107245, + "balance_loss_mlp": 1.04043055, + "epoch": 0.48999615236629473, + "flos": 692539628544.0, + "grad_norm": 0.07826700951116618, + "language_loss": 0.8301416, + "learning_rate": 0.0005404562264298569, + "loss": 0.84086609, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.32006836, + "step": 2547, + "time_per_iteration": 2.8667449951171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072033, + "balance_loss_mlp": 1.03946531, + "epoch": 0.49018853405155827, + "flos": 541432401408.0, + "grad_norm": 0.06922547112322346, + "language_loss": 0.83528513, + "learning_rate": 0.0005401456977678498, + "loss": 0.8460055, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.32568359, + "step": 2548, + "time_per_iteration": 2.6317896842956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073611, + "balance_loss_mlp": 1.04216361, + "epoch": 0.49038091573682185, + "flos": 695304027648.0, + "grad_norm": 0.06685231557649787, + "language_loss": 0.77518535, + "learning_rate": 0.0005398351535199008, + "loss": 0.78592145, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.31420898, + "step": 2549, + "time_per_iteration": 3.0532455444335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077464, + "balance_loss_mlp": 1.046422, + "epoch": 0.49057329742208544, + "flos": 596614063104.0, + "grad_norm": 0.058433753989977806, + "language_loss": 0.83942944, + "learning_rate": 0.0005395245938065735, + "loss": 0.85020411, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.31030273, + "step": 2550, + "time_per_iteration": 2.788081169128418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082711, + "balance_loss_mlp": 1.0515734, + "epoch": 0.490765679107349, + "flos": 513154814976.0, + "grad_norm": 0.08029752654472934, + "language_loss": 0.83026552, + "learning_rate": 0.0005392140187484379, + "loss": 0.84109271, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.3112793, + "step": 2551, + "time_per_iteration": 2.619982957839966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076344, + "balance_loss_mlp": 1.04577839, + "epoch": 0.49095806079261256, + "flos": 629298782208.0, + "grad_norm": 0.05951944251734202, + "language_loss": 0.89720619, + "learning_rate": 0.0005389034284660701, + "loss": 0.90796959, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.30541992, + "step": 2552, + "time_per_iteration": 2.811321258544922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084609, + "balance_loss_mlp": 1.05349529, + "epoch": 0.4911504424778761, + "flos": 914938122240.0, + "grad_norm": 0.06813620439924545, + "language_loss": 0.82330388, + "learning_rate": 0.000538592823080052, + "loss": 0.83414996, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.31079102, + "step": 2553, + "time_per_iteration": 3.121729612350464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084101, + "balance_loss_mlp": 1.05181932, + "epoch": 0.4913428241631397, + "flos": 438716445696.0, + "grad_norm": 0.10151417402847059, + "language_loss": 0.84795117, + "learning_rate": 0.000538282202710971, + "loss": 0.85879219, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.32275391, + "step": 2554, + "time_per_iteration": 2.5441434383392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089823, + "balance_loss_mlp": 1.05782735, + "epoch": 0.4915352058484032, + "flos": 635806955520.0, + "grad_norm": 0.08391436989004458, + "language_loss": 0.81955588, + "learning_rate": 0.000537971567479421, + "loss": 0.83045411, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.31982422, + "step": 2555, + "time_per_iteration": 2.742913246154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088539, + "balance_loss_mlp": 1.05578029, + "epoch": 0.4917275875336668, + "flos": 504272148480.0, + "grad_norm": 0.0678126955236607, + "language_loss": 0.87735516, + "learning_rate": 0.0005376609175060011, + "loss": 0.88824058, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.32763672, + "step": 2556, + "time_per_iteration": 2.5964388847351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088641, + "balance_loss_mlp": 1.05774164, + "epoch": 0.49191996921893033, + "flos": 654251765760.0, + "grad_norm": 0.06456480219532172, + "language_loss": 0.80659723, + "learning_rate": 0.0005373502529113162, + "loss": 0.81748366, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.30883789, + "step": 2557, + "time_per_iteration": 2.8043599128723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092017, + "balance_loss_mlp": 1.06009305, + "epoch": 0.4921123509041939, + "flos": 492101194752.0, + "grad_norm": 0.08818279105065703, + "language_loss": 0.81143486, + "learning_rate": 0.0005370395738159773, + "loss": 0.82235509, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.3190918, + "step": 2558, + "time_per_iteration": 2.6536951065063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086446, + "balance_loss_mlp": 1.05516589, + "epoch": 0.4923047325894575, + "flos": 545907935232.0, + "grad_norm": 0.0699028851556838, + "language_loss": 0.83194804, + "learning_rate": 0.0005367288803406003, + "loss": 0.84281248, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.3125, + "step": 2559, + "time_per_iteration": 2.6608238220214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092322, + "balance_loss_mlp": 1.06075501, + "epoch": 0.49249711427472104, + "flos": 596195043840.0, + "grad_norm": 0.05624800088650225, + "language_loss": 0.81485915, + "learning_rate": 0.0005364181726058073, + "loss": 0.82578236, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.31542969, + "step": 2560, + "time_per_iteration": 2.7245399951934814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108461, + "balance_loss_mlp": 1.05354452, + "epoch": 0.4926894959599846, + "flos": 497580682752.0, + "grad_norm": 0.0657433103973406, + "language_loss": 0.82255721, + "learning_rate": 0.0005361074507322261, + "loss": 0.83340329, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.31030273, + "step": 2561, + "time_per_iteration": 2.632309913635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085089, + "balance_loss_mlp": 1.05359399, + "epoch": 0.49288187764524816, + "flos": 535868545536.0, + "grad_norm": 0.06588348626271129, + "language_loss": 0.81683809, + "learning_rate": 0.000535796714840489, + "loss": 0.82768893, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.31494141, + "step": 2562, + "time_per_iteration": 2.6455063819885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107827, + "balance_loss_mlp": 1.04686987, + "epoch": 0.49307425933051174, + "flos": 641267504640.0, + "grad_norm": 0.07506734855649709, + "language_loss": 0.84067267, + "learning_rate": 0.0005354859650512348, + "loss": 0.85145533, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.3137207, + "step": 2563, + "time_per_iteration": 2.8065779209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075102, + "balance_loss_mlp": 1.04396451, + "epoch": 0.4932666410157753, + "flos": 516000761856.0, + "grad_norm": 0.06295276436461052, + "language_loss": 0.87103295, + "learning_rate": 0.0005351752014851074, + "loss": 0.88178396, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.31103516, + "step": 2564, + "time_per_iteration": 2.573575019836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078018, + "balance_loss_mlp": 1.04654717, + "epoch": 0.49345902270103886, + "flos": 601217634816.0, + "grad_norm": 0.06464744293940616, + "language_loss": 0.83104938, + "learning_rate": 0.0005348644242627553, + "loss": 0.84182954, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.31445312, + "step": 2565, + "time_per_iteration": 2.730455160140991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059118, + "balance_loss_mlp": 1.0458622, + "epoch": 0.49365140438630245, + "flos": 1492849585152.0, + "grad_norm": 0.030733727476311833, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76345742, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 0.1328125, + "step": 2566, + "time_per_iteration": 4.939255237579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083828, + "balance_loss_mlp": 1.05290508, + "epoch": 0.493843786071566, + "flos": 629303164416.0, + "grad_norm": 0.06048394989907295, + "language_loss": 0.81127739, + "learning_rate": 0.0005342428293320013, + "loss": 0.82211566, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.30908203, + "step": 2567, + "time_per_iteration": 2.7613086700439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079847, + "balance_loss_mlp": 1.04899621, + "epoch": 0.49403616775682957, + "flos": 617283569664.0, + "grad_norm": 0.0745931351859795, + "language_loss": 0.83762527, + "learning_rate": 0.0005339320118649238, + "loss": 0.84842372, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.30810547, + "step": 2568, + "time_per_iteration": 2.6934940814971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077697, + "balance_loss_mlp": 1.04763281, + "epoch": 0.4942285494420931, + "flos": 577357355520.0, + "grad_norm": 0.16404827309636982, + "language_loss": 0.86383307, + "learning_rate": 0.000533621181224271, + "loss": 0.87461007, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.30053711, + "step": 2569, + "time_per_iteration": 2.7757997512817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078612, + "balance_loss_mlp": 1.04737914, + "epoch": 0.4944209311273567, + "flos": 629899683840.0, + "grad_norm": 0.06859593656518678, + "language_loss": 0.81795698, + "learning_rate": 0.0005333103375307182, + "loss": 0.8287431, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.31201172, + "step": 2570, + "time_per_iteration": 2.8319950103759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074501, + "balance_loss_mlp": 1.043221, + "epoch": 0.4946133128126202, + "flos": 587337108480.0, + "grad_norm": 0.05293986738306163, + "language_loss": 0.86142224, + "learning_rate": 0.0005329994809049451, + "loss": 0.87216723, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.3125, + "step": 2571, + "time_per_iteration": 2.7592415809631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075993, + "balance_loss_mlp": 1.04540396, + "epoch": 0.4948056944978838, + "flos": 583437745152.0, + "grad_norm": 0.05076322771290774, + "language_loss": 0.87883997, + "learning_rate": 0.0005326886114676375, + "loss": 0.88959992, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.30541992, + "step": 2572, + "time_per_iteration": 2.9501779079437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077876, + "balance_loss_mlp": 1.0463568, + "epoch": 0.49499807618314734, + "flos": 481583149056.0, + "grad_norm": 0.06323365720535751, + "language_loss": 0.87792003, + "learning_rate": 0.0005323777293394854, + "loss": 0.8886987, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.31494141, + "step": 2573, + "time_per_iteration": 2.55361008644104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107249, + "balance_loss_mlp": 1.03975475, + "epoch": 0.4951904578684109, + "flos": 518714288640.0, + "grad_norm": 0.05535210432037286, + "language_loss": 0.81776071, + "learning_rate": 0.000532066834641184, + "loss": 0.82848555, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.32739258, + "step": 2574, + "time_per_iteration": 2.6631722450256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070737, + "balance_loss_mlp": 1.03900313, + "epoch": 0.4953828395536745, + "flos": 535238530560.0, + "grad_norm": 0.06817735062049093, + "language_loss": 0.8516283, + "learning_rate": 0.0005317559274934334, + "loss": 0.86233568, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.31713867, + "step": 2575, + "time_per_iteration": 2.7345352172851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072086, + "balance_loss_mlp": 1.03894639, + "epoch": 0.49557522123893805, + "flos": 528305545728.0, + "grad_norm": 0.05802348124776455, + "language_loss": 0.80394173, + "learning_rate": 0.0005314450080169382, + "loss": 0.81466264, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.33154297, + "step": 2576, + "time_per_iteration": 2.6343159675598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076196, + "balance_loss_mlp": 1.04391456, + "epoch": 0.49576760292420163, + "flos": 427780790784.0, + "grad_norm": 0.07974947058861337, + "language_loss": 0.80607754, + "learning_rate": 0.0005311340763324083, + "loss": 0.81683946, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.32275391, + "step": 2577, + "time_per_iteration": 2.557796001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078498, + "balance_loss_mlp": 1.04557252, + "epoch": 0.49595998460946517, + "flos": 564968203776.0, + "grad_norm": 0.05295897633494548, + "language_loss": 0.82240456, + "learning_rate": 0.0005308231325605578, + "loss": 0.83318955, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.32910156, + "step": 2578, + "time_per_iteration": 2.6799750328063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072444, + "balance_loss_mlp": 1.03992367, + "epoch": 0.49615236629472875, + "flos": 702161409024.0, + "grad_norm": 0.05054804003557779, + "language_loss": 0.7645728, + "learning_rate": 0.0005305121768221061, + "loss": 0.77529716, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.32519531, + "step": 2579, + "time_per_iteration": 3.074568748474121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025075, + "balance_loss_mlp": 1.01057923, + "epoch": 0.4963447479799923, + "flos": 1440896573952.0, + "grad_norm": 0.02258142627415349, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76063395, + "num_input_tokens_seen": 215023648, + "router_z_loss_mlp": 0.14453125, + "step": 2580, + "time_per_iteration": 4.807044267654419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079853, + "balance_loss_mlp": 1.04749966, + "epoch": 0.49653712966525587, + "flos": 537370094592.0, + "grad_norm": 0.06889886772880317, + "language_loss": 0.9145242, + "learning_rate": 0.0005298902299282984, + "loss": 0.92532271, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.32348633, + "step": 2581, + "time_per_iteration": 2.6145668029785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077544, + "balance_loss_mlp": 1.04561996, + "epoch": 0.4967295113505194, + "flos": 607002660864.0, + "grad_norm": 0.06407878407439609, + "language_loss": 0.84137404, + "learning_rate": 0.0005295792390144033, + "loss": 0.85214949, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.3190918, + "step": 2582, + "time_per_iteration": 2.71272873878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083171, + "balance_loss_mlp": 1.05103219, + "epoch": 0.496921893035783, + "flos": 474340243968.0, + "grad_norm": 0.07436197165654145, + "language_loss": 0.83241105, + "learning_rate": 0.0005292682366168294, + "loss": 0.84324276, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.32128906, + "step": 2583, + "time_per_iteration": 2.5284125804901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082483, + "balance_loss_mlp": 1.05079746, + "epoch": 0.4971142747210466, + "flos": 597180059136.0, + "grad_norm": 0.07965760723765093, + "language_loss": 0.79750967, + "learning_rate": 0.0005289572228563181, + "loss": 0.80833459, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.31665039, + "step": 2584, + "time_per_iteration": 2.802370548248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080479, + "balance_loss_mlp": 1.04862666, + "epoch": 0.4973066564063101, + "flos": 599321797632.0, + "grad_norm": 0.06536047089469768, + "language_loss": 0.83144403, + "learning_rate": 0.000528646197853616, + "loss": 0.84224886, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.31835938, + "step": 2585, + "time_per_iteration": 2.7075467109680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076886, + "balance_loss_mlp": 1.04748917, + "epoch": 0.4974990380915737, + "flos": 649152009216.0, + "grad_norm": 0.11136041462628715, + "language_loss": 0.85364115, + "learning_rate": 0.0005283351617294735, + "loss": 0.86440998, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.29370117, + "step": 2586, + "time_per_iteration": 2.940826892852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027614, + "balance_loss_mlp": 1.0143584, + "epoch": 0.49769141977683723, + "flos": 1528490912256.0, + "grad_norm": 0.01813039431029953, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.7766428, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.1328125, + "step": 2587, + "time_per_iteration": 4.996971368789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082207, + "balance_loss_mlp": 1.05278599, + "epoch": 0.4978838014621008, + "flos": 536114446848.0, + "grad_norm": 0.05663819997496981, + "language_loss": 0.86729956, + "learning_rate": 0.0005277130565998916, + "loss": 0.87812161, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.29394531, + "step": 2588, + "time_per_iteration": 2.7356040477752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083607, + "balance_loss_mlp": 1.05401921, + "epoch": 0.49807618314736435, + "flos": 539335742976.0, + "grad_norm": 0.07264241635107661, + "language_loss": 0.82111955, + "learning_rate": 0.0005274019878359748, + "loss": 0.83195567, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.29541016, + "step": 2589, + "time_per_iteration": 2.7199792861938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081352, + "balance_loss_mlp": 1.05102515, + "epoch": 0.49826856483262794, + "flos": 542215185408.0, + "grad_norm": 0.07554474334702437, + "language_loss": 0.86675328, + "learning_rate": 0.0005270909084336628, + "loss": 0.87756681, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.30297852, + "step": 2590, + "time_per_iteration": 2.6305181980133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080877, + "balance_loss_mlp": 1.05045462, + "epoch": 0.4984609465178915, + "flos": 522062212608.0, + "grad_norm": 0.06751539177219479, + "language_loss": 0.89032745, + "learning_rate": 0.0005267798185137276, + "loss": 0.90113628, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.30371094, + "step": 2591, + "time_per_iteration": 2.608088254928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088235, + "balance_loss_mlp": 1.05743146, + "epoch": 0.49865332820315506, + "flos": 574255332864.0, + "grad_norm": 0.0633807963563003, + "language_loss": 0.8924402, + "learning_rate": 0.0005264687181969444, + "loss": 0.90332258, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.30786133, + "step": 2592, + "time_per_iteration": 2.729546308517456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088496, + "balance_loss_mlp": 1.05931377, + "epoch": 0.49884570988841864, + "flos": 1013207657472.0, + "grad_norm": 0.06112732681279078, + "language_loss": 0.75084651, + "learning_rate": 0.0005261576076040937, + "loss": 0.76173151, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.29199219, + "step": 2593, + "time_per_iteration": 3.265289783477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082947, + "balance_loss_mlp": 1.05281067, + "epoch": 0.4990380915736822, + "flos": 559315597824.0, + "grad_norm": 0.0783599565062882, + "language_loss": 0.84088343, + "learning_rate": 0.0005258464868559591, + "loss": 0.85171294, + "num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.30078125, + "step": 2594, + "time_per_iteration": 2.657191514968872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080029, + "balance_loss_mlp": 1.04991674, + "epoch": 0.49923047325894576, + "flos": 498708292608.0, + "grad_norm": 0.0699675322535813, + "language_loss": 0.88836402, + "learning_rate": 0.0005255353560733284, + "loss": 0.89916426, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.30102539, + "step": 2595, + "time_per_iteration": 2.570439100265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010588, + "balance_loss_mlp": 1.04640186, + "epoch": 0.4994228549442093, + "flos": 1495851273216.0, + "grad_norm": 0.029272008197333242, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76637447, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.12353516, + "step": 2596, + "time_per_iteration": 4.808587074279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084167, + "balance_loss_mlp": 1.05476975, + "epoch": 0.4996152366294729, + "flos": 557090901504.0, + "grad_norm": 0.052965599041123274, + "language_loss": 0.83342099, + "learning_rate": 0.0005249130648877492, + "loss": 0.84426272, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.29370117, + "step": 2597, + "time_per_iteration": 2.7453384399414062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010849, + "balance_loss_mlp": 1.05524063, + "epoch": 0.4998076183147364, + "flos": 415372700160.0, + "grad_norm": 0.05960347084431116, + "language_loss": 0.84714389, + "learning_rate": 0.0005246019047263953, + "loss": 0.85799289, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.29614258, + "step": 2598, + "time_per_iteration": 2.488004684448242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091385, + "balance_loss_mlp": 1.06220269, + "epoch": 0.5, + "flos": 467107513344.0, + "grad_norm": 0.06961248878544336, + "language_loss": 0.8223601, + "learning_rate": 0.0005242907350137353, + "loss": 0.83327389, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.29174805, + "step": 2599, + "time_per_iteration": 2.550495147705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092431, + "balance_loss_mlp": 1.06422567, + "epoch": 0.5001923816852636, + "flos": 482460475392.0, + "grad_norm": 0.06813860338073652, + "language_loss": 0.78928339, + "learning_rate": 0.0005239795558705754, + "loss": 0.80020773, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.28198242, + "step": 2600, + "time_per_iteration": 2.656519889831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094846, + "balance_loss_mlp": 1.06492448, + "epoch": 0.5003847633705272, + "flos": 533534750208.0, + "grad_norm": 0.05508549334218052, + "language_loss": 0.89073658, + "learning_rate": 0.0005236683674177264, + "loss": 0.90168506, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.29907227, + "step": 2601, + "time_per_iteration": 2.63960337638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098261, + "balance_loss_mlp": 1.06886423, + "epoch": 0.5005771450557907, + "flos": 737473876992.0, + "grad_norm": 0.06683201790232274, + "language_loss": 0.82384604, + "learning_rate": 0.0005233571697760021, + "loss": 0.83482862, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.29345703, + "step": 2602, + "time_per_iteration": 2.859165668487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097705, + "balance_loss_mlp": 1.06814075, + "epoch": 0.5007695267410542, + "flos": 778646974464.0, + "grad_norm": 0.06216601268510387, + "language_loss": 0.83124363, + "learning_rate": 0.0005230459630662203, + "loss": 0.84222066, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.29541016, + "step": 2603, + "time_per_iteration": 2.9592032432556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093592, + "balance_loss_mlp": 1.06479144, + "epoch": 0.5009619084263178, + "flos": 623192251392.0, + "grad_norm": 0.0707725537041266, + "language_loss": 0.81070089, + "learning_rate": 0.0005227347474092022, + "loss": 0.8216368, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.2878418, + "step": 2604, + "time_per_iteration": 2.7389962673187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094636, + "balance_loss_mlp": 1.06545365, + "epoch": 0.5011542901115814, + "flos": 530812459008.0, + "grad_norm": 0.05232832672790962, + "language_loss": 0.83514917, + "learning_rate": 0.0005224235229257724, + "loss": 0.84609556, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.29174805, + "step": 2605, + "time_per_iteration": 2.687992811203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088826, + "balance_loss_mlp": 1.05914283, + "epoch": 0.5013466717968449, + "flos": 527262303744.0, + "grad_norm": 0.056206575952308185, + "language_loss": 0.8630116, + "learning_rate": 0.0005221122897367589, + "loss": 0.87389988, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.29614258, + "step": 2606, + "time_per_iteration": 2.787410259246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088135, + "balance_loss_mlp": 1.05861855, + "epoch": 0.5015390534821085, + "flos": 565750987776.0, + "grad_norm": 0.07695466326694751, + "language_loss": 0.81035262, + "learning_rate": 0.0005218010479629932, + "loss": 0.82123399, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.29467773, + "step": 2607, + "time_per_iteration": 2.6562912464141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091646, + "balance_loss_mlp": 1.06177175, + "epoch": 0.5017314351673721, + "flos": 566430465024.0, + "grad_norm": 0.05799380231795743, + "language_loss": 0.81869501, + "learning_rate": 0.0005214897977253102, + "loss": 0.82961148, + "num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.29833984, + "step": 2608, + "time_per_iteration": 2.6560218334198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084679, + "balance_loss_mlp": 1.05454254, + "epoch": 0.5019238168526357, + "flos": 522018542592.0, + "grad_norm": 0.06343008203006618, + "language_loss": 0.84223098, + "learning_rate": 0.0005211785391445473, + "loss": 0.85307777, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.30102539, + "step": 2609, + "time_per_iteration": 2.726686954498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081377, + "balance_loss_mlp": 1.05202734, + "epoch": 0.5021161985378992, + "flos": 641135084544.0, + "grad_norm": 0.06012661278609564, + "language_loss": 0.79186547, + "learning_rate": 0.0005208672723415467, + "loss": 0.80267924, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.29345703, + "step": 2610, + "time_per_iteration": 2.7944774627685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108238, + "balance_loss_mlp": 1.05212474, + "epoch": 0.5023085802231627, + "flos": 591000744960.0, + "grad_norm": 0.06559501481836318, + "language_loss": 0.79065204, + "learning_rate": 0.0005205559974371525, + "loss": 0.80147582, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.30224609, + "step": 2611, + "time_per_iteration": 2.7519257068634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081519, + "balance_loss_mlp": 1.05150175, + "epoch": 0.5025009619084263, + "flos": 472134486528.0, + "grad_norm": 0.05612255210767107, + "language_loss": 0.82192892, + "learning_rate": 0.0005202447145522123, + "loss": 0.83274412, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.29980469, + "step": 2612, + "time_per_iteration": 2.6770236492156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079077, + "balance_loss_mlp": 1.04965591, + "epoch": 0.5026933435936899, + "flos": 454906036224.0, + "grad_norm": 0.05250196134528315, + "language_loss": 0.79193181, + "learning_rate": 0.0005199334238075769, + "loss": 0.80272257, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.29370117, + "step": 2613, + "time_per_iteration": 2.5337562561035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107987, + "balance_loss_mlp": 1.04942441, + "epoch": 0.5028857252789535, + "flos": 491504675328.0, + "grad_norm": 0.0529792440436354, + "language_loss": 0.9204368, + "learning_rate": 0.0005196221253241, + "loss": 0.93123555, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.30419922, + "step": 2614, + "time_per_iteration": 2.564143657684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073952, + "balance_loss_mlp": 1.04276693, + "epoch": 0.503078106964217, + "flos": 625280145408.0, + "grad_norm": 0.06195019445138367, + "language_loss": 0.82918042, + "learning_rate": 0.0005193108192226383, + "loss": 0.83991992, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.31152344, + "step": 2615, + "time_per_iteration": 2.757087230682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080642, + "balance_loss_mlp": 1.04990983, + "epoch": 0.5032704886494805, + "flos": 578774536704.0, + "grad_norm": 0.05317989185447873, + "language_loss": 0.8697142, + "learning_rate": 0.000518999505624052, + "loss": 0.88052064, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.30712891, + "step": 2616, + "time_per_iteration": 2.7251224517822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078998, + "balance_loss_mlp": 1.04759884, + "epoch": 0.5034628703347441, + "flos": 471481150464.0, + "grad_norm": 0.059314577611761586, + "language_loss": 0.83379316, + "learning_rate": 0.000518688184649203, + "loss": 0.84458327, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.3137207, + "step": 2617, + "time_per_iteration": 2.809063673019409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107933, + "balance_loss_mlp": 1.04890776, + "epoch": 0.5036552520200077, + "flos": 489594281472.0, + "grad_norm": 0.08232681701976922, + "language_loss": 0.83759677, + "learning_rate": 0.0005183768564189577, + "loss": 0.8483901, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.30395508, + "step": 2618, + "time_per_iteration": 2.5442681312561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108616, + "balance_loss_mlp": 1.05502236, + "epoch": 0.5038476337052713, + "flos": 493991239680.0, + "grad_norm": 0.10233936422342303, + "language_loss": 0.81248713, + "learning_rate": 0.0005180655210541838, + "loss": 0.8233487, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.31103516, + "step": 2619, + "time_per_iteration": 2.5986533164978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107923, + "balance_loss_mlp": 1.04976153, + "epoch": 0.5040400153905348, + "flos": 600321369600.0, + "grad_norm": 0.10286286455085811, + "language_loss": 0.83096433, + "learning_rate": 0.0005177541786757527, + "loss": 0.84175664, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.29443359, + "step": 2620, + "time_per_iteration": 2.7542781829833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080594, + "balance_loss_mlp": 1.04971933, + "epoch": 0.5042323970757984, + "flos": 811178924544.0, + "grad_norm": 0.062363268760676084, + "language_loss": 0.82867718, + "learning_rate": 0.000517442829404538, + "loss": 0.83948314, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.30834961, + "step": 2621, + "time_per_iteration": 2.9758973121643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080161, + "balance_loss_mlp": 1.05000091, + "epoch": 0.504424778761062, + "flos": 626985335808.0, + "grad_norm": 0.06818258917584033, + "language_loss": 0.8721652, + "learning_rate": 0.0005171314733614166, + "loss": 0.88296676, + "num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.30102539, + "step": 2622, + "time_per_iteration": 2.8933780193328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082583, + "balance_loss_mlp": 1.05235183, + "epoch": 0.5046171604463255, + "flos": 515651553792.0, + "grad_norm": 0.06917321427090362, + "language_loss": 0.78315443, + "learning_rate": 0.0005168201106672671, + "loss": 0.79398024, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.30200195, + "step": 2623, + "time_per_iteration": 2.763855457305908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081188, + "balance_loss_mlp": 1.05093241, + "epoch": 0.504809542131589, + "flos": 527576606208.0, + "grad_norm": 0.06294733427077812, + "language_loss": 0.84776348, + "learning_rate": 0.0005165087414429717, + "loss": 0.85857534, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.30200195, + "step": 2624, + "time_per_iteration": 2.6454148292541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080505, + "balance_loss_mlp": 1.04967785, + "epoch": 0.5050019238168526, + "flos": 553855048704.0, + "grad_norm": 0.07820570667172376, + "language_loss": 0.83597136, + "learning_rate": 0.0005161973658094144, + "loss": 0.84677643, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.30810547, + "step": 2625, + "time_per_iteration": 2.630192756652832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075312, + "balance_loss_mlp": 1.04562938, + "epoch": 0.5051943055021162, + "flos": 574486677504.0, + "grad_norm": 0.10754310805258371, + "language_loss": 0.8215518, + "learning_rate": 0.000515885983887482, + "loss": 0.83230495, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.29614258, + "step": 2626, + "time_per_iteration": 2.762484312057495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082022, + "balance_loss_mlp": 1.05179107, + "epoch": 0.5053866871873798, + "flos": 496438516224.0, + "grad_norm": 0.060931372363222436, + "language_loss": 0.84606075, + "learning_rate": 0.0005155745957980636, + "loss": 0.85688096, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.30175781, + "step": 2627, + "time_per_iteration": 2.597625494003296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075866, + "balance_loss_mlp": 1.04513431, + "epoch": 0.5055790688726434, + "flos": 501963084288.0, + "grad_norm": 0.060140239439456865, + "language_loss": 0.8829447, + "learning_rate": 0.000515263201662051, + "loss": 0.89370334, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.30688477, + "step": 2628, + "time_per_iteration": 2.676429510116577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081664, + "balance_loss_mlp": 1.05162382, + "epoch": 0.5057714505579068, + "flos": 844844276736.0, + "grad_norm": 0.05201747216110034, + "language_loss": 0.82525623, + "learning_rate": 0.0005149518016003378, + "loss": 0.83607286, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.30004883, + "step": 2629, + "time_per_iteration": 3.1674108505249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078833, + "balance_loss_mlp": 1.04874492, + "epoch": 0.5059638322431704, + "flos": 497580682752.0, + "grad_norm": 0.12452297981638945, + "language_loss": 0.82290918, + "learning_rate": 0.0005146403957338206, + "loss": 0.83369756, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.30029297, + "step": 2630, + "time_per_iteration": 2.574908494949341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075266, + "balance_loss_mlp": 1.04415226, + "epoch": 0.506156213928434, + "flos": 617526498816.0, + "grad_norm": 0.054026792513587725, + "language_loss": 0.81795335, + "learning_rate": 0.0005143289841833975, + "loss": 0.82870597, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.31079102, + "step": 2631, + "time_per_iteration": 2.8753445148468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071822, + "balance_loss_mlp": 1.04044628, + "epoch": 0.5063485956136976, + "flos": 424624923648.0, + "grad_norm": 0.07665080268010696, + "language_loss": 0.82169271, + "learning_rate": 0.0005140175670699696, + "loss": 0.83241099, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.31347656, + "step": 2632, + "time_per_iteration": 2.606656551361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070677, + "balance_loss_mlp": 1.03989697, + "epoch": 0.5065409772989612, + "flos": 569641586688.0, + "grad_norm": 0.05365826465054309, + "language_loss": 0.82773447, + "learning_rate": 0.0005137061445144395, + "loss": 0.83844125, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.30737305, + "step": 2633, + "time_per_iteration": 2.908146619796753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107465, + "balance_loss_mlp": 1.0429641, + "epoch": 0.5067333589842247, + "flos": 628510205952.0, + "grad_norm": 0.06908817272508659, + "language_loss": 0.87031686, + "learning_rate": 0.000513394716637712, + "loss": 0.88106334, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.31665039, + "step": 2634, + "time_per_iteration": 2.804591417312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049781, + "balance_loss_mlp": 1.03547585, + "epoch": 0.5069257406694883, + "flos": 1447062741504.0, + "grad_norm": 0.027149993512400487, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80241489, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.14257812, + "step": 2635, + "time_per_iteration": 4.903238296508789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071488, + "balance_loss_mlp": 1.03977799, + "epoch": 0.5071181223547518, + "flos": 638530656768.0, + "grad_norm": 0.05829667092367474, + "language_loss": 0.80886006, + "learning_rate": 0.0005127718454042958, + "loss": 0.81957495, + "num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.31689453, + "step": 2636, + "time_per_iteration": 2.81962513923645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076357, + "balance_loss_mlp": 1.04467094, + "epoch": 0.5073105040400154, + "flos": 713239658496.0, + "grad_norm": 0.06782185148260642, + "language_loss": 0.84239292, + "learning_rate": 0.0005124604022894269, + "loss": 0.85315657, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.31665039, + "step": 2637, + "time_per_iteration": 2.933143377304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023059, + "balance_loss_mlp": 1.00932586, + "epoch": 0.5075028857252789, + "flos": 1435658605056.0, + "grad_norm": 0.016037159370544805, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78211284, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.13769531, + "step": 2638, + "time_per_iteration": 4.81339168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080028, + "balance_loss_mlp": 1.04786575, + "epoch": 0.5076952674105425, + "flos": 570857946624.0, + "grad_norm": 0.058900205072543066, + "language_loss": 0.83262694, + "learning_rate": 0.0005118375016679325, + "loss": 0.84342724, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.3215332, + "step": 2639, + "time_per_iteration": 2.7476773262023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076278, + "balance_loss_mlp": 1.04490256, + "epoch": 0.5078876490958061, + "flos": 516463451136.0, + "grad_norm": 0.08436499818571505, + "language_loss": 0.80410182, + "learning_rate": 0.0005115260444031382, + "loss": 0.81486464, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.31347656, + "step": 2640, + "time_per_iteration": 2.579087734222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016776, + "balance_loss_mlp": 1.00361574, + "epoch": 0.5080800307810697, + "flos": 1583378620416.0, + "grad_norm": 0.010326775178219767, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79748595, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.13183594, + "step": 2641, + "time_per_iteration": 4.939114809036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077717, + "balance_loss_mlp": 1.04665077, + "epoch": 0.5082724124663333, + "flos": 484965978624.0, + "grad_norm": 0.06392423646026814, + "language_loss": 0.86441147, + "learning_rate": 0.0005109031165700483, + "loss": 0.87518859, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.31030273, + "step": 2642, + "time_per_iteration": 2.572248935699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078927, + "balance_loss_mlp": 1.04809904, + "epoch": 0.5084647941515967, + "flos": 681928450560.0, + "grad_norm": 0.08514760687851525, + "language_loss": 0.83290648, + "learning_rate": 0.0005105916462435945, + "loss": 0.84369576, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.30786133, + "step": 2643, + "time_per_iteration": 2.832653284072876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081246, + "balance_loss_mlp": 1.05089569, + "epoch": 0.5086571758368603, + "flos": 548468692992.0, + "grad_norm": 0.05584396132467612, + "language_loss": 0.85012162, + "learning_rate": 0.0005102801718050989, + "loss": 0.86093414, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.30322266, + "step": 2644, + "time_per_iteration": 2.6693568229675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077361, + "balance_loss_mlp": 1.04755831, + "epoch": 0.5088495575221239, + "flos": 563751843840.0, + "grad_norm": 0.07396400679887168, + "language_loss": 0.89154196, + "learning_rate": 0.0005099686933754867, + "loss": 0.9023155, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.29785156, + "step": 2645, + "time_per_iteration": 2.688992977142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080157, + "balance_loss_mlp": 1.05016422, + "epoch": 0.5090419392073875, + "flos": 551132757504.0, + "grad_norm": 0.06521042739972126, + "language_loss": 0.84349567, + "learning_rate": 0.0005096572110756845, + "loss": 0.85429722, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.29956055, + "step": 2646, + "time_per_iteration": 2.694018840789795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080367, + "balance_loss_mlp": 1.05065989, + "epoch": 0.509234320892651, + "flos": 567504230400.0, + "grad_norm": 0.049776737751643374, + "language_loss": 0.85623205, + "learning_rate": 0.0005093457250266205, + "loss": 0.86703575, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.296875, + "step": 2647, + "time_per_iteration": 2.69240665435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085077, + "balance_loss_mlp": 1.05527472, + "epoch": 0.5094267025779146, + "flos": 582339248640.0, + "grad_norm": 0.0639130152108818, + "language_loss": 0.83146644, + "learning_rate": 0.000509034235349224, + "loss": 0.84231722, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.29760742, + "step": 2648, + "time_per_iteration": 2.69409441947937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084417, + "balance_loss_mlp": 1.05499578, + "epoch": 0.5096190842631781, + "flos": 591704953344.0, + "grad_norm": 0.07990516858852505, + "language_loss": 0.81340408, + "learning_rate": 0.0005087227421644266, + "loss": 0.82424831, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.29345703, + "step": 2649, + "time_per_iteration": 2.7338664531707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087876, + "balance_loss_mlp": 1.05795491, + "epoch": 0.5098114659484417, + "flos": 513307584000.0, + "grad_norm": 0.06481094949829869, + "language_loss": 0.86482179, + "learning_rate": 0.0005084112455931602, + "loss": 0.87570059, + "num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.29907227, + "step": 2650, + "time_per_iteration": 2.5772013664245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085843, + "balance_loss_mlp": 1.05561161, + "epoch": 0.5100038476337053, + "flos": 484389808128.0, + "grad_norm": 0.060404574220966636, + "language_loss": 0.84966755, + "learning_rate": 0.0005080997457563586, + "loss": 0.86052603, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.30200195, + "step": 2651, + "time_per_iteration": 2.5539023876190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089212, + "balance_loss_mlp": 1.05895662, + "epoch": 0.5101962293189688, + "flos": 461366157312.0, + "grad_norm": 0.06895787175374923, + "language_loss": 0.79026747, + "learning_rate": 0.0005077882427749569, + "loss": 0.80115962, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.30224609, + "step": 2652, + "time_per_iteration": 2.5036137104034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093546, + "balance_loss_mlp": 1.06367242, + "epoch": 0.5103886110042324, + "flos": 586760937984.0, + "grad_norm": 0.06232251007114316, + "language_loss": 0.84676695, + "learning_rate": 0.0005074767367698913, + "loss": 0.85770237, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.29833984, + "step": 2653, + "time_per_iteration": 2.6879539489746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088747, + "balance_loss_mlp": 1.05875421, + "epoch": 0.510580992689496, + "flos": 844906885632.0, + "grad_norm": 0.07002300864013745, + "language_loss": 0.83262461, + "learning_rate": 0.0005071652278620988, + "loss": 0.84351206, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.29956055, + "step": 2654, + "time_per_iteration": 3.048330307006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093234, + "balance_loss_mlp": 1.06369376, + "epoch": 0.5107733743747596, + "flos": 658328629248.0, + "grad_norm": 0.077240918193036, + "language_loss": 0.83515394, + "learning_rate": 0.0005068537161725186, + "loss": 0.84608626, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.29492188, + "step": 2655, + "time_per_iteration": 2.7864887714385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088669, + "balance_loss_mlp": 1.05941546, + "epoch": 0.510965756060023, + "flos": 701426677248.0, + "grad_norm": 0.06396168128091786, + "language_loss": 0.84455109, + "learning_rate": 0.0005065422018220893, + "loss": 0.85543782, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.29223633, + "step": 2656, + "time_per_iteration": 2.8869853019714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095041, + "balance_loss_mlp": 1.0650475, + "epoch": 0.5111581377452866, + "flos": 559430489088.0, + "grad_norm": 0.0709037558233959, + "language_loss": 0.7998327, + "learning_rate": 0.0005062306849317521, + "loss": 0.81078309, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.29956055, + "step": 2657, + "time_per_iteration": 2.7980425357818604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010852, + "balance_loss_mlp": 1.05484891, + "epoch": 0.5113505194305502, + "flos": 608745729024.0, + "grad_norm": 0.0652959904845647, + "language_loss": 0.83424717, + "learning_rate": 0.0005059191656224487, + "loss": 0.84509915, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.30297852, + "step": 2658, + "time_per_iteration": 2.735557794570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085406, + "balance_loss_mlp": 1.05488813, + "epoch": 0.5115429011158138, + "flos": 534214227456.0, + "grad_norm": 0.05645977889013881, + "language_loss": 0.89198554, + "learning_rate": 0.0005056076440151212, + "loss": 0.90283966, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.3046875, + "step": 2659, + "time_per_iteration": 2.651273012161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136875, + "balance_loss_mlp": 1.12314212, + "epoch": 0.5117352828010774, + "flos": 1361451580416.0, + "grad_norm": 0.05420368374393455, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77424991, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.13769531, + "step": 2660, + "time_per_iteration": 4.8447229862213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085456, + "balance_loss_mlp": 1.05689311, + "epoch": 0.5119276644863409, + "flos": 633444046848.0, + "grad_norm": 0.04523661755748661, + "language_loss": 0.87268543, + "learning_rate": 0.0005049845943901691, + "loss": 0.88354003, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.28515625, + "step": 2661, + "time_per_iteration": 2.855107307434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079963, + "balance_loss_mlp": 1.05092359, + "epoch": 0.5121200461716044, + "flos": 585304468992.0, + "grad_norm": 0.05522645200412479, + "language_loss": 0.86379933, + "learning_rate": 0.0005046730666144338, + "loss": 0.87459898, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.2902832, + "step": 2662, + "time_per_iteration": 2.841339349746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082682, + "balance_loss_mlp": 1.05390453, + "epoch": 0.512312427856868, + "flos": 1032081661440.0, + "grad_norm": 0.05374936854204756, + "language_loss": 0.87915027, + "learning_rate": 0.0005043615370244532, + "loss": 0.8899771, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.2878418, + "step": 2663, + "time_per_iteration": 3.364856004714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049781, + "balance_loss_mlp": 1.03728747, + "epoch": 0.5125048095421316, + "flos": 1537257277440.0, + "grad_norm": 0.022479341124125186, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79294169, + "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.125, + "step": 2664, + "time_per_iteration": 4.635313510894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080439, + "balance_loss_mlp": 1.05163848, + "epoch": 0.5126971912273951, + "flos": 590814480384.0, + "grad_norm": 0.04479435391735135, + "language_loss": 0.85200715, + "learning_rate": 0.0005037384728855425, + "loss": 0.86281157, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.28808594, + "step": 2665, + "time_per_iteration": 2.7995188236236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083297, + "balance_loss_mlp": 1.05356586, + "epoch": 0.5128895729126587, + "flos": 551393215488.0, + "grad_norm": 0.0801864670549744, + "language_loss": 0.84280151, + "learning_rate": 0.0005034269385785075, + "loss": 0.85363448, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.29711914, + "step": 2666, + "time_per_iteration": 2.673332929611206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090699, + "balance_loss_mlp": 1.0623982, + "epoch": 0.5130819545979223, + "flos": 481031709696.0, + "grad_norm": 0.06501156427369086, + "language_loss": 0.84454274, + "learning_rate": 0.0005031154029410168, + "loss": 0.85544968, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.28344727, + "step": 2667, + "time_per_iteration": 2.5442566871643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086564, + "balance_loss_mlp": 1.0577395, + "epoch": 0.5132743362831859, + "flos": 475556603904.0, + "grad_norm": 0.06480382372099369, + "language_loss": 0.86841118, + "learning_rate": 0.0005028038660940197, + "loss": 0.87927675, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.28808594, + "step": 2668, + "time_per_iteration": 2.62888765335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077032, + "balance_loss_mlp": 1.04832673, + "epoch": 0.5134667179684494, + "flos": 503559175680.0, + "grad_norm": 0.05084400085528349, + "language_loss": 0.84573722, + "learning_rate": 0.0005024923281584648, + "loss": 0.85650754, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.28662109, + "step": 2669, + "time_per_iteration": 2.6316568851470947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092041, + "balance_loss_mlp": 1.06312072, + "epoch": 0.5136590996537129, + "flos": 503647925760.0, + "grad_norm": 0.05870793453685439, + "language_loss": 0.82656723, + "learning_rate": 0.0005021807892553026, + "loss": 0.83748764, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.28881836, + "step": 2670, + "time_per_iteration": 2.707345724105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093085, + "balance_loss_mlp": 1.06457078, + "epoch": 0.5138514813389765, + "flos": 624330035712.0, + "grad_norm": 0.08829821247143162, + "language_loss": 0.84517181, + "learning_rate": 0.0005018692495054828, + "loss": 0.85610259, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.28540039, + "step": 2671, + "time_per_iteration": 2.758309841156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092768, + "balance_loss_mlp": 1.06399131, + "epoch": 0.5140438630242401, + "flos": 583274801664.0, + "grad_norm": 0.05555500929459815, + "language_loss": 0.80821186, + "learning_rate": 0.0005015577090299561, + "loss": 0.8191396, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.28735352, + "step": 2672, + "time_per_iteration": 2.6883137226104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090029, + "balance_loss_mlp": 1.06125236, + "epoch": 0.5142362447095037, + "flos": 487683887616.0, + "grad_norm": 0.06705414985084517, + "language_loss": 0.86672199, + "learning_rate": 0.0005012461679496729, + "loss": 0.87762225, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.28759766, + "step": 2673, + "time_per_iteration": 2.5949177742004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092599, + "balance_loss_mlp": 1.0630827, + "epoch": 0.5144286263947672, + "flos": 526601765376.0, + "grad_norm": 0.06054107713253035, + "language_loss": 0.87204134, + "learning_rate": 0.0005009346263855848, + "loss": 0.88296735, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.29467773, + "step": 2674, + "time_per_iteration": 2.6084070205688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093368, + "balance_loss_mlp": 1.06401849, + "epoch": 0.5146210080800308, + "flos": 486252149760.0, + "grad_norm": 0.08912792131396882, + "language_loss": 0.83928424, + "learning_rate": 0.0005006230844586422, + "loss": 0.85021788, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.29345703, + "step": 2675, + "time_per_iteration": 2.8340024948120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094006, + "balance_loss_mlp": 1.06496692, + "epoch": 0.5148133897652943, + "flos": 515622440448.0, + "grad_norm": 0.06185145068902706, + "language_loss": 0.79025733, + "learning_rate": 0.0005003115422897968, + "loss": 0.80119741, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.29052734, + "step": 2676, + "time_per_iteration": 2.7350447177886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088176, + "balance_loss_mlp": 1.05780196, + "epoch": 0.5150057714505579, + "flos": 510963614208.0, + "grad_norm": 0.06610854708750855, + "language_loss": 0.86982405, + "learning_rate": 0.0005, + "loss": 0.88070583, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.30322266, + "step": 2677, + "time_per_iteration": 2.62941837310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082976, + "balance_loss_mlp": 1.0535078, + "epoch": 0.5151981531358215, + "flos": 910541164032.0, + "grad_norm": 0.05650592481949535, + "language_loss": 0.7918483, + "learning_rate": 0.0004996884577102033, + "loss": 0.80267811, + "num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.29418945, + "step": 2678, + "time_per_iteration": 3.1128311157226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085723, + "balance_loss_mlp": 1.05577731, + "epoch": 0.515390534821085, + "flos": 471599013888.0, + "grad_norm": 0.05289591163695072, + "language_loss": 0.84550285, + "learning_rate": 0.000499376915541358, + "loss": 0.85636008, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.29907227, + "step": 2679, + "time_per_iteration": 2.709259271621704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082197, + "balance_loss_mlp": 1.0510838, + "epoch": 0.5155829165063486, + "flos": 649811137536.0, + "grad_norm": 0.05812477607611756, + "language_loss": 0.81116259, + "learning_rate": 0.0004990653736144155, + "loss": 0.82198453, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.31079102, + "step": 2680, + "time_per_iteration": 2.8433125019073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083796, + "balance_loss_mlp": 1.05318332, + "epoch": 0.5157752981916122, + "flos": 414038476800.0, + "grad_norm": 0.06443376303588658, + "language_loss": 0.8582924, + "learning_rate": 0.0004987538320503271, + "loss": 0.86913037, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.30566406, + "step": 2681, + "time_per_iteration": 2.492128372192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079646, + "balance_loss_mlp": 1.04860437, + "epoch": 0.5159676798768758, + "flos": 553569859584.0, + "grad_norm": 0.06119575969443392, + "language_loss": 0.83057904, + "learning_rate": 0.0004984422909700442, + "loss": 0.84137553, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.31005859, + "step": 2682, + "time_per_iteration": 2.6817965507507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107717, + "balance_loss_mlp": 1.04560328, + "epoch": 0.5161600615621393, + "flos": 586234229760.0, + "grad_norm": 0.06357079240733023, + "language_loss": 0.83849651, + "learning_rate": 0.0004981307504945173, + "loss": 0.84926826, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.31542969, + "step": 2683, + "time_per_iteration": 2.6884219646453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107835, + "balance_loss_mlp": 1.04764211, + "epoch": 0.5163524432474028, + "flos": 588568025088.0, + "grad_norm": 0.058627663819765745, + "language_loss": 0.89028186, + "learning_rate": 0.0004978192107446976, + "loss": 0.90106535, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.30664062, + "step": 2684, + "time_per_iteration": 2.7606394290924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074512, + "balance_loss_mlp": 1.04397011, + "epoch": 0.5165448249326664, + "flos": 503642133504.0, + "grad_norm": 0.05338243685455816, + "language_loss": 0.870161, + "learning_rate": 0.0004975076718415353, + "loss": 0.88090611, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.30493164, + "step": 2685, + "time_per_iteration": 2.594937562942505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081075, + "balance_loss_mlp": 1.04991364, + "epoch": 0.51673720661793, + "flos": 416539597824.0, + "grad_norm": 0.06078629774986462, + "language_loss": 0.90568233, + "learning_rate": 0.0004971961339059806, + "loss": 0.91649306, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.3112793, + "step": 2686, + "time_per_iteration": 2.4705729484558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075772, + "balance_loss_mlp": 1.04406273, + "epoch": 0.5169295883031936, + "flos": 598696164864.0, + "grad_norm": 0.067622669815522, + "language_loss": 0.83813852, + "learning_rate": 0.0004968845970589832, + "loss": 0.84889627, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.31689453, + "step": 2687, + "time_per_iteration": 2.6784517765045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108779, + "balance_loss_mlp": 1.05760634, + "epoch": 0.517121969988457, + "flos": 556543844352.0, + "grad_norm": 0.06982295057413529, + "language_loss": 0.84568465, + "learning_rate": 0.0004965730614214926, + "loss": 0.85656255, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.30151367, + "step": 2688, + "time_per_iteration": 2.628742218017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078435, + "balance_loss_mlp": 1.0470829, + "epoch": 0.5173143516737206, + "flos": 469214346240.0, + "grad_norm": 0.06558972316908819, + "language_loss": 0.85422957, + "learning_rate": 0.0004962615271144576, + "loss": 0.86501396, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.31323242, + "step": 2689, + "time_per_iteration": 2.5566818714141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079558, + "balance_loss_mlp": 1.04923093, + "epoch": 0.5175067333589842, + "flos": 719739067392.0, + "grad_norm": 0.32559574880762837, + "language_loss": 0.82639515, + "learning_rate": 0.0004959499942588264, + "loss": 0.83719069, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.30273438, + "step": 2690, + "time_per_iteration": 2.8994317054748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057487, + "balance_loss_mlp": 1.04442203, + "epoch": 0.5176991150442478, + "flos": 1465402834944.0, + "grad_norm": 0.028996752449645728, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79257512, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.13085938, + "step": 2691, + "time_per_iteration": 4.746784687042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109471, + "balance_loss_mlp": 1.07830977, + "epoch": 0.5178914967295114, + "flos": 612345346560.0, + "grad_norm": 0.12339515707636219, + "language_loss": 0.85558736, + "learning_rate": 0.0004953269333855661, + "loss": 0.86668211, + "num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.3112793, + "step": 2692, + "time_per_iteration": 2.8191914558410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109212, + "balance_loss_mlp": 1.07991028, + "epoch": 0.5180838784147749, + "flos": 500663766528.0, + "grad_norm": 0.07785846219337349, + "language_loss": 0.84034789, + "learning_rate": 0.0004950154056098309, + "loss": 0.85143995, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.29272461, + "step": 2693, + "time_per_iteration": 2.686821222305298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129818, + "balance_loss_mlp": 1.09963465, + "epoch": 0.5182762601000385, + "flos": 688531166208.0, + "grad_norm": 0.07144537100010277, + "language_loss": 0.83820134, + "learning_rate": 0.0004947038797692867, + "loss": 0.84949952, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.30126953, + "step": 2694, + "time_per_iteration": 2.8041090965270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128051, + "balance_loss_mlp": 1.09741426, + "epoch": 0.518468641785302, + "flos": 665315458560.0, + "grad_norm": 0.06183052783496024, + "language_loss": 0.77540803, + "learning_rate": 0.0004943923559848789, + "loss": 0.78668851, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.3059082, + "step": 2695, + "time_per_iteration": 2.797661781311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127895, + "balance_loss_mlp": 1.09756875, + "epoch": 0.5186610234705656, + "flos": 566440639488.0, + "grad_norm": 0.054443821670517534, + "language_loss": 0.90626478, + "learning_rate": 0.0004940808343775515, + "loss": 0.91754371, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.30297852, + "step": 2696, + "time_per_iteration": 2.708075523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126092, + "balance_loss_mlp": 1.09593177, + "epoch": 0.5188534051558291, + "flos": 428652324864.0, + "grad_norm": 0.08653085411735448, + "language_loss": 0.82187402, + "learning_rate": 0.0004937693150682479, + "loss": 0.83313495, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.30126953, + "step": 2697, + "time_per_iteration": 2.5607407093048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116261, + "balance_loss_mlp": 1.0861007, + "epoch": 0.5190457868410927, + "flos": 546085435392.0, + "grad_norm": 0.07683001308624603, + "language_loss": 0.76774538, + "learning_rate": 0.0004934577981779107, + "loss": 0.77890801, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.30175781, + "step": 2698, + "time_per_iteration": 2.730090618133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112238, + "balance_loss_mlp": 1.0813148, + "epoch": 0.5192381685263563, + "flos": 548321716224.0, + "grad_norm": 0.05605263998280499, + "language_loss": 0.81117129, + "learning_rate": 0.0004931462838274817, + "loss": 0.82229376, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.30883789, + "step": 2699, + "time_per_iteration": 2.847720146179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109944, + "balance_loss_mlp": 1.07957006, + "epoch": 0.5194305502116199, + "flos": 574993036800.0, + "grad_norm": 0.0574424557407856, + "language_loss": 0.84004086, + "learning_rate": 0.0004928347721379011, + "loss": 0.85114038, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.30322266, + "step": 2700, + "time_per_iteration": 2.6999762058258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102806, + "balance_loss_mlp": 1.07185948, + "epoch": 0.5196229318968835, + "flos": 434019741696.0, + "grad_norm": 0.05483286228362013, + "language_loss": 0.82044077, + "learning_rate": 0.0004925232632301089, + "loss": 0.83146882, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.30908203, + "step": 2701, + "time_per_iteration": 2.560593366622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098243, + "balance_loss_mlp": 1.06791615, + "epoch": 0.5198153135821469, + "flos": 558607007232.0, + "grad_norm": 0.06379159996009351, + "language_loss": 0.79575932, + "learning_rate": 0.0004922117572250431, + "loss": 0.80674177, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.30273438, + "step": 2702, + "time_per_iteration": 2.6621010303497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094553, + "balance_loss_mlp": 1.0648458, + "epoch": 0.5200076952674105, + "flos": 565397397504.0, + "grad_norm": 0.06234734694325623, + "language_loss": 0.80990833, + "learning_rate": 0.0004919002542436414, + "loss": 0.82085389, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.296875, + "step": 2703, + "time_per_iteration": 2.8583548069000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098109, + "balance_loss_mlp": 1.06806874, + "epoch": 0.5202000769526741, + "flos": 570916173312.0, + "grad_norm": 0.11086337696641164, + "language_loss": 0.81129456, + "learning_rate": 0.0004915887544068399, + "loss": 0.82227564, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.29980469, + "step": 2704, + "time_per_iteration": 2.6579208374023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097204, + "balance_loss_mlp": 1.06787837, + "epoch": 0.5203924586379377, + "flos": 693898583040.0, + "grad_norm": 0.06500287710368027, + "language_loss": 0.78155613, + "learning_rate": 0.0004912772578355736, + "loss": 0.79252815, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.29296875, + "step": 2705, + "time_per_iteration": 2.93152117729187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094522, + "balance_loss_mlp": 1.06395674, + "epoch": 0.5205848403232012, + "flos": 566215087104.0, + "grad_norm": 0.05937288472032104, + "language_loss": 0.82798421, + "learning_rate": 0.000490965764650776, + "loss": 0.83892947, + "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.30541992, + "step": 2706, + "time_per_iteration": 2.914069414138794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090504, + "balance_loss_mlp": 1.06048679, + "epoch": 0.5207772220084648, + "flos": 1213775539200.0, + "grad_norm": 0.08994605713309432, + "language_loss": 0.82582623, + "learning_rate": 0.0004906542749733798, + "loss": 0.83673131, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.29980469, + "step": 2707, + "time_per_iteration": 3.632612943649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086422, + "balance_loss_mlp": 1.05647707, + "epoch": 0.5209696036937284, + "flos": 592547374080.0, + "grad_norm": 0.05099864574791971, + "language_loss": 0.85112798, + "learning_rate": 0.0004903427889243156, + "loss": 0.86199224, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.29907227, + "step": 2708, + "time_per_iteration": 2.860605001449585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089192, + "balance_loss_mlp": 1.05898452, + "epoch": 0.5211619853789919, + "flos": 522623826432.0, + "grad_norm": 0.058285600596581014, + "language_loss": 0.85712206, + "learning_rate": 0.0004900313066245134, + "loss": 0.86801398, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.30151367, + "step": 2709, + "time_per_iteration": 2.6910862922668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078824, + "balance_loss_mlp": 1.04873538, + "epoch": 0.5213543670642555, + "flos": 502534872576.0, + "grad_norm": 0.06298998318770882, + "language_loss": 0.81023324, + "learning_rate": 0.0004897198281949012, + "loss": 0.8210215, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.30029297, + "step": 2710, + "time_per_iteration": 2.660783290863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085709, + "balance_loss_mlp": 1.0563364, + "epoch": 0.521546748749519, + "flos": 585682790400.0, + "grad_norm": 0.06559869836216795, + "language_loss": 0.77832824, + "learning_rate": 0.0004894083537564057, + "loss": 0.78918535, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.29345703, + "step": 2711, + "time_per_iteration": 2.7276909351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079715, + "balance_loss_mlp": 1.04965043, + "epoch": 0.5217391304347826, + "flos": 569833643520.0, + "grad_norm": 0.0684248274147048, + "language_loss": 0.80827081, + "learning_rate": 0.0004890968834299519, + "loss": 0.81906796, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.30029297, + "step": 2712, + "time_per_iteration": 2.738229751586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079667, + "balance_loss_mlp": 1.04974508, + "epoch": 0.5219315121200462, + "flos": 542501784576.0, + "grad_norm": 0.061787257592987296, + "language_loss": 0.78808606, + "learning_rate": 0.0004887854173364633, + "loss": 0.79888272, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.29882812, + "step": 2713, + "time_per_iteration": 2.734443426132202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074151, + "balance_loss_mlp": 1.04480171, + "epoch": 0.5221238938053098, + "flos": 550006557696.0, + "grad_norm": 0.05102910961180143, + "language_loss": 0.81491256, + "learning_rate": 0.0004884739555968617, + "loss": 0.82565403, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.29272461, + "step": 2714, + "time_per_iteration": 2.867036819458008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069714, + "balance_loss_mlp": 1.05559933, + "epoch": 0.5223162754905732, + "flos": 1354373028864.0, + "grad_norm": 0.021468860083039186, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80046767, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.14160156, + "step": 2715, + "time_per_iteration": 4.962530851364136 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107621, + "balance_loss_mlp": 1.04559731, + "epoch": 0.5225086571758368, + "flos": 567441621504.0, + "grad_norm": 0.06298546380073215, + "language_loss": 0.86646473, + "learning_rate": 0.0004878510456629992, + "loss": 0.87722689, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.30566406, + "step": 2716, + "time_per_iteration": 2.9603123664855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081784, + "balance_loss_mlp": 1.05110002, + "epoch": 0.5227010388611004, + "flos": 499914478080.0, + "grad_norm": 0.07025764068668285, + "language_loss": 0.85336471, + "learning_rate": 0.00048753959771057314, + "loss": 0.86418259, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.30639648, + "step": 2717, + "time_per_iteration": 2.632622480392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085269, + "balance_loss_mlp": 1.05389357, + "epoch": 0.522893420546364, + "flos": 597372115968.0, + "grad_norm": 0.05729998182106491, + "language_loss": 0.82715809, + "learning_rate": 0.0004872281545957044, + "loss": 0.83801079, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.31347656, + "step": 2718, + "time_per_iteration": 2.7305338382720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078735, + "balance_loss_mlp": 1.04726386, + "epoch": 0.5230858022316276, + "flos": 664278008832.0, + "grad_norm": 0.058019575066879846, + "language_loss": 0.86264348, + "learning_rate": 0.0004869167164393055, + "loss": 0.87343085, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.31445312, + "step": 2719, + "time_per_iteration": 2.9418067932128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075601, + "balance_loss_mlp": 1.04472566, + "epoch": 0.5232781839168911, + "flos": 603547047936.0, + "grad_norm": 0.0640312473735956, + "language_loss": 0.89536262, + "learning_rate": 0.00048660528336228793, + "loss": 0.90611863, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.30834961, + "step": 2720, + "time_per_iteration": 2.8314764499664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075602, + "balance_loss_mlp": 1.04506063, + "epoch": 0.5234705656021547, + "flos": 550438723584.0, + "grad_norm": 0.05104764752581424, + "language_loss": 0.89906192, + "learning_rate": 0.0004862938554855606, + "loss": 0.90981793, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.30517578, + "step": 2721, + "time_per_iteration": 2.7912685871124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077284, + "balance_loss_mlp": 1.04705238, + "epoch": 0.5236629472874182, + "flos": 504026247168.0, + "grad_norm": 0.09225462001304952, + "language_loss": 0.86140561, + "learning_rate": 0.0004859824329300304, + "loss": 0.87217844, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.30200195, + "step": 2722, + "time_per_iteration": 2.5850255489349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081058, + "balance_loss_mlp": 1.0504688, + "epoch": 0.5238553289726818, + "flos": 547394927616.0, + "grad_norm": 0.05217438950511115, + "language_loss": 0.83504456, + "learning_rate": 0.00048567101581660244, + "loss": 0.84585512, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.30541992, + "step": 2723, + "time_per_iteration": 2.6090264320373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077999, + "balance_loss_mlp": 1.04712343, + "epoch": 0.5240477106579453, + "flos": 531702931968.0, + "grad_norm": 0.07777816613104971, + "language_loss": 0.8713702, + "learning_rate": 0.00048535960426617956, + "loss": 0.88215029, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.30834961, + "step": 2724, + "time_per_iteration": 2.6143879890441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079989, + "balance_loss_mlp": 1.04966187, + "epoch": 0.5242400923432089, + "flos": 617653126656.0, + "grad_norm": 0.061907794652793086, + "language_loss": 0.81729943, + "learning_rate": 0.0004850481983996621, + "loss": 0.82809931, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.30273438, + "step": 2725, + "time_per_iteration": 2.7439112663269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081834, + "balance_loss_mlp": 1.05174541, + "epoch": 0.5244324740284725, + "flos": 416461022208.0, + "grad_norm": 0.06296520541747418, + "language_loss": 0.87762207, + "learning_rate": 0.0004847367983379492, + "loss": 0.88844043, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.30053711, + "step": 2726, + "time_per_iteration": 2.497286796569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080055, + "balance_loss_mlp": 1.05056226, + "epoch": 0.5246248557137361, + "flos": 626113801728.0, + "grad_norm": 0.09099502950257793, + "language_loss": 0.78826892, + "learning_rate": 0.00048442540420193643, + "loss": 0.79906946, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.29418945, + "step": 2727, + "time_per_iteration": 2.9191126823425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077698, + "balance_loss_mlp": 1.04751396, + "epoch": 0.5248172373989997, + "flos": 1247980746240.0, + "grad_norm": 0.061166777448516674, + "language_loss": 0.79150236, + "learning_rate": 0.0004841140161125182, + "loss": 0.80227935, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.30126953, + "step": 2728, + "time_per_iteration": 3.5845582485198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082892, + "balance_loss_mlp": 1.05306578, + "epoch": 0.5250096190842631, + "flos": 506616118272.0, + "grad_norm": 0.06421237850995067, + "language_loss": 0.84691751, + "learning_rate": 0.0004838026341905857, + "loss": 0.85774648, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.29785156, + "step": 2729, + "time_per_iteration": 2.75872540473938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080479, + "balance_loss_mlp": 1.05010509, + "epoch": 0.5252020007695267, + "flos": 611021297664.0, + "grad_norm": 0.051610102750965434, + "language_loss": 0.85352898, + "learning_rate": 0.00048349125855702844, + "loss": 0.86433375, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.30322266, + "step": 2730, + "time_per_iteration": 2.7679519653320312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108307, + "balance_loss_mlp": 1.05322015, + "epoch": 0.5253943824547903, + "flos": 538970568192.0, + "grad_norm": 0.05904184367240025, + "language_loss": 0.81296933, + "learning_rate": 0.00048317988933273287, + "loss": 0.82380003, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.29785156, + "step": 2731, + "time_per_iteration": 2.7559163570404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079843, + "balance_loss_mlp": 1.0495404, + "epoch": 0.5255867641400539, + "flos": 697714988544.0, + "grad_norm": 0.06321650060381495, + "language_loss": 0.8227402, + "learning_rate": 0.00048286852663858367, + "loss": 0.83353865, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.30273438, + "step": 2732, + "time_per_iteration": 2.9430267810821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077146, + "balance_loss_mlp": 1.04710531, + "epoch": 0.5257791458253175, + "flos": 666975568896.0, + "grad_norm": 0.05929618739033729, + "language_loss": 0.84009433, + "learning_rate": 0.000482557170595462, + "loss": 0.85086572, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.30004883, + "step": 2733, + "time_per_iteration": 2.914397954940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083368, + "balance_loss_mlp": 1.05194473, + "epoch": 0.525971527510581, + "flos": 483375679488.0, + "grad_norm": 0.05379595829627383, + "language_loss": 0.87649244, + "learning_rate": 0.0004822458213242475, + "loss": 0.88732612, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.31396484, + "step": 2734, + "time_per_iteration": 2.533350944519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082101, + "balance_loss_mlp": 1.05215609, + "epoch": 0.5261639091958445, + "flos": 829559715840.0, + "grad_norm": 0.15308762813128413, + "language_loss": 0.85928154, + "learning_rate": 0.00048193447894581627, + "loss": 0.87010252, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.29882812, + "step": 2735, + "time_per_iteration": 3.0971109867095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081636, + "balance_loss_mlp": 1.05190539, + "epoch": 0.5263562908811081, + "flos": 520461739008.0, + "grad_norm": 0.059512944610192846, + "language_loss": 0.88020355, + "learning_rate": 0.00048162314358104243, + "loss": 0.89101994, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.296875, + "step": 2736, + "time_per_iteration": 2.619262456893921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081772, + "balance_loss_mlp": 1.05268502, + "epoch": 0.5265486725663717, + "flos": 574722404352.0, + "grad_norm": 0.05996263826740056, + "language_loss": 0.83247852, + "learning_rate": 0.0004813118153507969, + "loss": 0.84329623, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.29052734, + "step": 2737, + "time_per_iteration": 2.724499464035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079963, + "balance_loss_mlp": 1.06603909, + "epoch": 0.5267410542516352, + "flos": 1546439537664.0, + "grad_norm": 0.02099488410784391, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83527088, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.13964844, + "step": 2738, + "time_per_iteration": 4.7655651569366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109097, + "balance_loss_mlp": 1.06135821, + "epoch": 0.5269334359368988, + "flos": 929576701440.0, + "grad_norm": 0.054521404688675106, + "language_loss": 0.83406657, + "learning_rate": 0.00048068918077736163, + "loss": 0.84497625, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.29541016, + "step": 2739, + "time_per_iteration": 3.2117719650268555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087742, + "balance_loss_mlp": 1.05820239, + "epoch": 0.5271258176221624, + "flos": 655079629824.0, + "grad_norm": 0.06027403163408104, + "language_loss": 0.81200749, + "learning_rate": 0.0004803778746759001, + "loss": 0.82288492, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.29492188, + "step": 2740, + "time_per_iteration": 2.883953809738159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085865, + "balance_loss_mlp": 1.05627775, + "epoch": 0.527318199307426, + "flos": 542781181440.0, + "grad_norm": 0.07072803117785999, + "language_loss": 0.81773007, + "learning_rate": 0.00048006657619242317, + "loss": 0.82858872, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.29541016, + "step": 2741, + "time_per_iteration": 2.6289987564086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108813, + "balance_loss_mlp": 1.05959105, + "epoch": 0.5275105809926895, + "flos": 447629635584.0, + "grad_norm": 0.07275993710061575, + "language_loss": 0.78293514, + "learning_rate": 0.00047975528544778775, + "loss": 0.79381645, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.28491211, + "step": 2742, + "time_per_iteration": 2.6370468139648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085684, + "balance_loss_mlp": 1.05685973, + "epoch": 0.527702962677953, + "flos": 578656673280.0, + "grad_norm": 0.08133754904485412, + "language_loss": 0.88532221, + "learning_rate": 0.00047944400256284754, + "loss": 0.89617908, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.28808594, + "step": 2743, + "time_per_iteration": 2.6988437175750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084011, + "balance_loss_mlp": 1.05504286, + "epoch": 0.5278953443632166, + "flos": 652465027584.0, + "grad_norm": 0.061354637447893066, + "language_loss": 0.8008759, + "learning_rate": 0.0004791327276584532, + "loss": 0.81171608, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.28930664, + "step": 2744, + "time_per_iteration": 2.843850612640381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092207, + "balance_loss_mlp": 1.0627383, + "epoch": 0.5280877260484802, + "flos": 513741159936.0, + "grad_norm": 0.06451817982099761, + "language_loss": 0.80512536, + "learning_rate": 0.00047882146085545264, + "loss": 0.81604743, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.29418945, + "step": 2745, + "time_per_iteration": 2.6313765048980713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059727, + "balance_loss_mlp": 1.04713857, + "epoch": 0.5282801077337438, + "flos": 1444650370560.0, + "grad_norm": 0.01846816151842821, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76462114, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.12597656, + "step": 2746, + "time_per_iteration": 4.961829662322998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080481, + "balance_loss_mlp": 1.05105972, + "epoch": 0.5284724894190073, + "flos": 604580115456.0, + "grad_norm": 0.06475941859576588, + "language_loss": 0.79224515, + "learning_rate": 0.00047819895203700684, + "loss": 0.80304992, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.29394531, + "step": 2747, + "time_per_iteration": 2.727640151977539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048677, + "balance_loss_mlp": 1.03618371, + "epoch": 0.5286648711042709, + "flos": 1494172224000.0, + "grad_norm": 0.01378573653182101, + "language_loss": 0.75512433, + "learning_rate": 0.0004778877102632412, + "loss": 0.76561111, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.125, + "step": 2748, + "time_per_iteration": 4.70350980758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074595, + "balance_loss_mlp": 1.04469705, + "epoch": 0.5288572527895344, + "flos": 597313889280.0, + "grad_norm": 0.06074589131451646, + "language_loss": 0.88260013, + "learning_rate": 0.0004775764770742277, + "loss": 0.89334607, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.29907227, + "step": 2749, + "time_per_iteration": 2.8722305297851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080903, + "balance_loss_mlp": 1.05064785, + "epoch": 0.529049634474798, + "flos": 557041439232.0, + "grad_norm": 0.1215004440050613, + "language_loss": 0.86453164, + "learning_rate": 0.00047726525259079777, + "loss": 0.8753407, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.30224609, + "step": 2750, + "time_per_iteration": 2.782618522644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082004, + "balance_loss_mlp": 1.05203521, + "epoch": 0.5292420161600616, + "flos": 580986086400.0, + "grad_norm": 0.07030365944612293, + "language_loss": 0.88707, + "learning_rate": 0.0004769540369337798, + "loss": 0.89789003, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.29931641, + "step": 2751, + "time_per_iteration": 2.7570507526397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078279, + "balance_loss_mlp": 1.04792809, + "epoch": 0.5294343978453251, + "flos": 607989086208.0, + "grad_norm": 0.06134745452443849, + "language_loss": 0.86018121, + "learning_rate": 0.00047664283022399794, + "loss": 0.87096399, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.3034668, + "step": 2752, + "time_per_iteration": 2.8683836460113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070772, + "balance_loss_mlp": 1.04101765, + "epoch": 0.5296267795305887, + "flos": 646226076672.0, + "grad_norm": 0.061305381303338104, + "language_loss": 0.80927074, + "learning_rate": 0.00047633163258227376, + "loss": 0.81997848, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.29711914, + "step": 2753, + "time_per_iteration": 2.889761447906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080468, + "balance_loss_mlp": 1.05040383, + "epoch": 0.5298191612158523, + "flos": 559482923520.0, + "grad_norm": 0.06040690928097006, + "language_loss": 0.85472161, + "learning_rate": 0.0004760204441294247, + "loss": 0.86552632, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.30004883, + "step": 2754, + "time_per_iteration": 2.7022712230682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078457, + "balance_loss_mlp": 1.04736757, + "epoch": 0.5300115429011159, + "flos": 513776065536.0, + "grad_norm": 0.08887078297019954, + "language_loss": 0.85966748, + "learning_rate": 0.00047570926498626486, + "loss": 0.87045205, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.31054688, + "step": 2755, + "time_per_iteration": 2.694779396057129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083154, + "balance_loss_mlp": 1.05130148, + "epoch": 0.5302039245863793, + "flos": 672475405824.0, + "grad_norm": 0.0527518505260492, + "language_loss": 0.8147307, + "learning_rate": 0.00047539809527360474, + "loss": 0.82556224, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.31835938, + "step": 2756, + "time_per_iteration": 2.8726418018341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086344, + "balance_loss_mlp": 1.05418181, + "epoch": 0.5303963062716429, + "flos": 730507396608.0, + "grad_norm": 0.05719732969355854, + "language_loss": 0.82233423, + "learning_rate": 0.0004750869351122511, + "loss": 0.83319771, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.32128906, + "step": 2757, + "time_per_iteration": 2.989522933959961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086301, + "balance_loss_mlp": 1.05397129, + "epoch": 0.5305886879569065, + "flos": 573156836352.0, + "grad_norm": 0.0731965335963944, + "language_loss": 0.81977046, + "learning_rate": 0.00047477578462300685, + "loss": 0.83063352, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.32324219, + "step": 2758, + "time_per_iteration": 2.7154197692871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108253, + "balance_loss_mlp": 1.05153537, + "epoch": 0.5307810696421701, + "flos": 694988315136.0, + "grad_norm": 0.05716072116198451, + "language_loss": 0.79401624, + "learning_rate": 0.0004744646439266718, + "loss": 0.80484152, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.30957031, + "step": 2759, + "time_per_iteration": 3.010188102722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087952, + "balance_loss_mlp": 1.05719638, + "epoch": 0.5309734513274337, + "flos": 648629683200.0, + "grad_norm": 0.06513852008932475, + "language_loss": 0.92120409, + "learning_rate": 0.000474153513144041, + "loss": 0.93208361, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.30712891, + "step": 2760, + "time_per_iteration": 2.9100866317749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090471, + "balance_loss_mlp": 1.05878544, + "epoch": 0.5311658330126972, + "flos": 604517506560.0, + "grad_norm": 0.05916855301127547, + "language_loss": 0.8678081, + "learning_rate": 0.00047384239239590633, + "loss": 0.87871277, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.31665039, + "step": 2761, + "time_per_iteration": 2.8746495246887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108692, + "balance_loss_mlp": 1.05516267, + "epoch": 0.5313582146979607, + "flos": 557995931136.0, + "grad_norm": 0.06020342742423831, + "language_loss": 0.88611233, + "learning_rate": 0.0004735312818030556, + "loss": 0.8969816, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.31738281, + "step": 2762, + "time_per_iteration": 2.670517921447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092394, + "balance_loss_mlp": 1.06101847, + "epoch": 0.5315505963832243, + "flos": 508152572928.0, + "grad_norm": 0.05825845223399112, + "language_loss": 0.82783639, + "learning_rate": 0.0004732201814862727, + "loss": 0.83876032, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.31347656, + "step": 2763, + "time_per_iteration": 2.7706046104431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108792, + "balance_loss_mlp": 1.05740237, + "epoch": 0.5317429780684879, + "flos": 626132740608.0, + "grad_norm": 0.056446972258987926, + "language_loss": 0.81703943, + "learning_rate": 0.0004729090915663373, + "loss": 0.82791865, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.3046875, + "step": 2764, + "time_per_iteration": 2.8320751190185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088458, + "balance_loss_mlp": 1.0584892, + "epoch": 0.5319353597537514, + "flos": 476506713600.0, + "grad_norm": 0.06421691072563727, + "language_loss": 0.85022444, + "learning_rate": 0.00047259801216402534, + "loss": 0.86110902, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.29931641, + "step": 2765, + "time_per_iteration": 2.5070557594299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087661, + "balance_loss_mlp": 1.05735779, + "epoch": 0.532127741439015, + "flos": 501386913792.0, + "grad_norm": 0.06743519703895742, + "language_loss": 0.86185229, + "learning_rate": 0.00047228694340010845, + "loss": 0.87272882, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.30249023, + "step": 2766, + "time_per_iteration": 2.5665066242218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089224, + "balance_loss_mlp": 1.05918312, + "epoch": 0.5323201231242786, + "flos": 1164114063360.0, + "grad_norm": 0.057283919540088275, + "language_loss": 0.85907435, + "learning_rate": 0.0004719758853953544, + "loss": 0.86996663, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.29980469, + "step": 2767, + "time_per_iteration": 3.598590850830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093331, + "balance_loss_mlp": 1.06419635, + "epoch": 0.5325125048095422, + "flos": 378493254144.0, + "grad_norm": 0.07956086058885692, + "language_loss": 0.83881301, + "learning_rate": 0.00047166483827052645, + "loss": 0.84974635, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.29125977, + "step": 2768, + "time_per_iteration": 2.4224319458007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105739, + "balance_loss_mlp": 1.04441977, + "epoch": 0.5327048864948057, + "flos": 1540507534848.0, + "grad_norm": 0.033276153146473426, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78135878, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.12988281, + "step": 2769, + "time_per_iteration": 4.992494583129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089679, + "balance_loss_mlp": 1.05961394, + "epoch": 0.5328972681800692, + "flos": 910877225472.0, + "grad_norm": 0.06372002073291465, + "language_loss": 0.8365072, + "learning_rate": 0.000471042777143682, + "loss": 0.84740394, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.30029297, + "step": 2770, + "time_per_iteration": 3.214010715484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091808, + "balance_loss_mlp": 1.06255412, + "epoch": 0.5330896498653328, + "flos": 473660766720.0, + "grad_norm": 0.05770492360265134, + "language_loss": 0.79306901, + "learning_rate": 0.0004707317633831707, + "loss": 0.80398703, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 0.29223633, + "step": 2771, + "time_per_iteration": 2.5814082622528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090013, + "balance_loss_mlp": 1.06035328, + "epoch": 0.5332820315505964, + "flos": 501386913792.0, + "grad_norm": 0.06429055642690477, + "language_loss": 0.78255731, + "learning_rate": 0.00047042076098559673, + "loss": 0.79345745, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.29614258, + "step": 2772, + "time_per_iteration": 2.626574754714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096839, + "balance_loss_mlp": 1.06763303, + "epoch": 0.53347441323586, + "flos": 924043368960.0, + "grad_norm": 0.06567346515998468, + "language_loss": 0.73814428, + "learning_rate": 0.00047010977007170174, + "loss": 0.74911261, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.29150391, + "step": 2773, + "time_per_iteration": 3.2639098167419434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089963, + "balance_loss_mlp": 1.06039929, + "epoch": 0.5336667949211235, + "flos": 574185521664.0, + "grad_norm": 0.06353427502994992, + "language_loss": 0.82705283, + "learning_rate": 0.00046979879076222334, + "loss": 0.83795249, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.29516602, + "step": 2774, + "time_per_iteration": 2.702021360397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094292, + "balance_loss_mlp": 1.0655148, + "epoch": 0.533859176606387, + "flos": 1064233880064.0, + "grad_norm": 0.051161955256212054, + "language_loss": 0.84535086, + "learning_rate": 0.0004694878231778939, + "loss": 0.8562938, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.28759766, + "step": 2775, + "time_per_iteration": 3.37555193901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094093, + "balance_loss_mlp": 1.06471944, + "epoch": 0.5340515582916506, + "flos": 746277967872.0, + "grad_norm": 0.05222814179658164, + "language_loss": 0.8401432, + "learning_rate": 0.0004691768674394423, + "loss": 0.85108411, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.29321289, + "step": 2776, + "time_per_iteration": 2.992685317993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024339, + "balance_loss_mlp": 1.01251328, + "epoch": 0.5342439399769142, + "flos": 1444905036288.0, + "grad_norm": 0.010305238226800423, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85508353, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.11816406, + "step": 2777, + "time_per_iteration": 4.753941059112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021329, + "balance_loss_mlp": 1.00950325, + "epoch": 0.5344363216621778, + "flos": 1426790495232.0, + "grad_norm": 0.008050007723784799, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77674866, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.11816406, + "step": 2778, + "time_per_iteration": 4.980912923812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091368, + "balance_loss_mlp": 1.0625428, + "epoch": 0.5346287033474413, + "flos": 527355436032.0, + "grad_norm": 0.05741424367086941, + "language_loss": 0.79571807, + "learning_rate": 0.00046824407250656676, + "loss": 0.80663168, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.28808594, + "step": 2779, + "time_per_iteration": 2.641680955886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109255, + "balance_loss_mlp": 1.06303382, + "epoch": 0.5348210850327049, + "flos": 510515481600.0, + "grad_norm": 0.05780417685778494, + "language_loss": 0.83320916, + "learning_rate": 0.0004679331653588161, + "loss": 0.84413469, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.29467773, + "step": 2780, + "time_per_iteration": 2.6292784214019775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086907, + "balance_loss_mlp": 1.05741477, + "epoch": 0.5350134667179685, + "flos": 462429748224.0, + "grad_norm": 0.07200473336731207, + "language_loss": 0.8539027, + "learning_rate": 0.0004676222706605147, + "loss": 0.86477172, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.29467773, + "step": 2781, + "time_per_iteration": 2.633302927017212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082924, + "balance_loss_mlp": 1.05355036, + "epoch": 0.535205848403232, + "flos": 708566275584.0, + "grad_norm": 0.06052388593462891, + "language_loss": 0.85071301, + "learning_rate": 0.0004673113885323626, + "loss": 0.86154234, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.29321289, + "step": 2782, + "time_per_iteration": 2.8385848999023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108118, + "balance_loss_mlp": 1.05152082, + "epoch": 0.5353982300884956, + "flos": 893855388672.0, + "grad_norm": 0.04759682065371887, + "language_loss": 0.78464407, + "learning_rate": 0.00046700051909505494, + "loss": 0.79545587, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.29638672, + "step": 2783, + "time_per_iteration": 3.17055344581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087683, + "balance_loss_mlp": 1.05730867, + "epoch": 0.5355906117737591, + "flos": 535701219840.0, + "grad_norm": 0.06917760310735488, + "language_loss": 0.83446693, + "learning_rate": 0.000466689662469282, + "loss": 0.84534377, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.3034668, + "step": 2784, + "time_per_iteration": 2.6696882247924805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080736, + "balance_loss_mlp": 1.05048084, + "epoch": 0.5357829934590227, + "flos": 868477593600.0, + "grad_norm": 0.0647182284961505, + "language_loss": 0.84010589, + "learning_rate": 0.00046637881877572917, + "loss": 0.85091329, + "num_input_tokens_seen": 232337232, + "router_z_loss_mlp": 0.30200195, + "step": 2785, + "time_per_iteration": 3.0897059440612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107764, + "balance_loss_mlp": 1.04783738, + "epoch": 0.5359753751442863, + "flos": 552999481344.0, + "grad_norm": 0.2060352755327757, + "language_loss": 0.84354532, + "learning_rate": 0.0004660679881350764, + "loss": 0.85432178, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.29736328, + "step": 2786, + "time_per_iteration": 2.763195753097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036965, + "balance_loss_mlp": 1.0236131, + "epoch": 0.5361677568295499, + "flos": 1479687823872.0, + "grad_norm": 0.018061436986608354, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76645112, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.13378906, + "step": 2787, + "time_per_iteration": 5.074235677719116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082567, + "balance_loss_mlp": 1.05223989, + "epoch": 0.5363601385148133, + "flos": 805910432256.0, + "grad_norm": 0.0731464482403051, + "language_loss": 0.77922016, + "learning_rate": 0.0004654463664951667, + "loss": 0.79004586, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.30273438, + "step": 2788, + "time_per_iteration": 2.9973762035369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086105, + "balance_loss_mlp": 1.05647016, + "epoch": 0.5365525202000769, + "flos": 507630246912.0, + "grad_norm": 0.06405642217776768, + "language_loss": 0.83215284, + "learning_rate": 0.0004651355757372447, + "loss": 0.84301388, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.2956543, + "step": 2789, + "time_per_iteration": 2.677021026611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089856, + "balance_loss_mlp": 1.05955315, + "epoch": 0.5367449018853405, + "flos": 528660546048.0, + "grad_norm": 0.05726084062519834, + "language_loss": 0.85958302, + "learning_rate": 0.00046482479851489274, + "loss": 0.87048161, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.30273438, + "step": 2790, + "time_per_iteration": 2.6652121543884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089898, + "balance_loss_mlp": 1.05933237, + "epoch": 0.5369372835706041, + "flos": 649614698496.0, + "grad_norm": 0.07271669587233448, + "language_loss": 0.77731752, + "learning_rate": 0.00046451403494876525, + "loss": 0.78821647, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.30541992, + "step": 2791, + "time_per_iteration": 2.897798776626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090037, + "balance_loss_mlp": 1.05882847, + "epoch": 0.5371296652558677, + "flos": 584205972480.0, + "grad_norm": 0.06591879115648011, + "language_loss": 0.84175646, + "learning_rate": 0.0004642032851595111, + "loss": 0.8526569, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.31176758, + "step": 2792, + "time_per_iteration": 2.758230209350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086262, + "balance_loss_mlp": 1.05543458, + "epoch": 0.5373220469411312, + "flos": 595570821120.0, + "grad_norm": 0.05973481987913333, + "language_loss": 0.84753001, + "learning_rate": 0.00046389254926777404, + "loss": 0.8583926, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.30810547, + "step": 2793, + "time_per_iteration": 2.7933902740478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086495, + "balance_loss_mlp": 1.05562031, + "epoch": 0.5375144286263948, + "flos": 1113965167104.0, + "grad_norm": 0.05136203618868989, + "language_loss": 0.7824527, + "learning_rate": 0.0004635818273941926, + "loss": 0.79331762, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.30859375, + "step": 2794, + "time_per_iteration": 3.564011335372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088501, + "balance_loss_mlp": 1.05786383, + "epoch": 0.5377068103116583, + "flos": 595319127552.0, + "grad_norm": 0.06685314707582615, + "language_loss": 0.81738025, + "learning_rate": 0.0004632711196593997, + "loss": 0.82826525, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.30639648, + "step": 2795, + "time_per_iteration": 2.7609026432037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089037, + "balance_loss_mlp": 1.05882931, + "epoch": 0.5378991919969219, + "flos": 883839320064.0, + "grad_norm": 0.06695327911218095, + "language_loss": 0.85338485, + "learning_rate": 0.00046296042618402297, + "loss": 0.86427522, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.30175781, + "step": 2796, + "time_per_iteration": 3.079580783843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083991, + "balance_loss_mlp": 1.05344939, + "epoch": 0.5380915736821854, + "flos": 710344249344.0, + "grad_norm": 0.05461778050704968, + "language_loss": 0.79521048, + "learning_rate": 0.0004626497470886839, + "loss": 0.80605042, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.30517578, + "step": 2797, + "time_per_iteration": 2.956915855407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086126, + "balance_loss_mlp": 1.0549171, + "epoch": 0.538283955367449, + "flos": 556721344512.0, + "grad_norm": 0.05348634251654363, + "language_loss": 0.81572765, + "learning_rate": 0.00046233908249399897, + "loss": 0.82658887, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.31176758, + "step": 2798, + "time_per_iteration": 2.7978243827819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087806, + "balance_loss_mlp": 1.05781281, + "epoch": 0.5384763370527126, + "flos": 513218833920.0, + "grad_norm": 0.07296004689367808, + "language_loss": 0.78106725, + "learning_rate": 0.00046202843252057905, + "loss": 0.79194534, + "num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.29956055, + "step": 2799, + "time_per_iteration": 2.615086317062378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085619, + "balance_loss_mlp": 1.05522037, + "epoch": 0.5386687187379762, + "flos": 489490974720.0, + "grad_norm": 0.056459019467486986, + "language_loss": 0.83738667, + "learning_rate": 0.00046171779728902896, + "loss": 0.84824288, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.3034668, + "step": 2800, + "time_per_iteration": 2.613084077835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081602, + "balance_loss_mlp": 1.05025029, + "epoch": 0.5388611004232398, + "flos": 482415395328.0, + "grad_norm": 0.07411133953793157, + "language_loss": 0.86239338, + "learning_rate": 0.000461407176919948, + "loss": 0.87320936, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.31323242, + "step": 2801, + "time_per_iteration": 2.5331709384918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078309, + "balance_loss_mlp": 1.04838777, + "epoch": 0.5390534821085032, + "flos": 560709457920.0, + "grad_norm": 0.07244428600451569, + "language_loss": 0.85469061, + "learning_rate": 0.00046109657153392997, + "loss": 0.86547375, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.29858398, + "step": 2802, + "time_per_iteration": 2.7376809120178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081766, + "balance_loss_mlp": 1.05007982, + "epoch": 0.5392458637937668, + "flos": 488132020224.0, + "grad_norm": 0.06487466420670769, + "language_loss": 0.82949483, + "learning_rate": 0.0004607859812515622, + "loss": 0.84031248, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.31665039, + "step": 2803, + "time_per_iteration": 2.601752996444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078317, + "balance_loss_mlp": 1.0476799, + "epoch": 0.5394382454790304, + "flos": 511810417152.0, + "grad_norm": 0.06325281802882306, + "language_loss": 0.87643886, + "learning_rate": 0.00046047540619342667, + "loss": 0.88722193, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.3059082, + "step": 2804, + "time_per_iteration": 2.6036136150360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080625, + "balance_loss_mlp": 1.05056071, + "epoch": 0.539630627164294, + "flos": 567312173568.0, + "grad_norm": 0.0581751577303043, + "language_loss": 0.80008459, + "learning_rate": 0.00046016484648009933, + "loss": 0.81089091, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.30004883, + "step": 2805, + "time_per_iteration": 2.713219165802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080402, + "balance_loss_mlp": 1.05105305, + "epoch": 0.5398230088495575, + "flos": 526203095040.0, + "grad_norm": 0.057792621829283776, + "language_loss": 0.80917501, + "learning_rate": 0.0004598543022321501, + "loss": 0.81997907, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.29296875, + "step": 2806, + "time_per_iteration": 2.631939172744751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082616, + "balance_loss_mlp": 1.05281353, + "epoch": 0.5400153905348211, + "flos": 538493322240.0, + "grad_norm": 0.07612886672081497, + "language_loss": 0.79604518, + "learning_rate": 0.0004595437735701433, + "loss": 0.80687129, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.29736328, + "step": 2807, + "time_per_iteration": 2.701808214187622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081434, + "balance_loss_mlp": 1.0507021, + "epoch": 0.5402077722200846, + "flos": 513259531776.0, + "grad_norm": 0.07694205416949251, + "language_loss": 0.83500147, + "learning_rate": 0.00045923326061463623, + "loss": 0.84581584, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.30688477, + "step": 2808, + "time_per_iteration": 2.7844398021698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078771, + "balance_loss_mlp": 1.04725254, + "epoch": 0.5404001539053482, + "flos": 675932428800.0, + "grad_norm": 0.07660553916433042, + "language_loss": 0.81710881, + "learning_rate": 0.00045892276348618113, + "loss": 0.82789654, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.31494141, + "step": 2809, + "time_per_iteration": 2.982339859008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053757, + "balance_loss_mlp": 1.04088223, + "epoch": 0.5405925355906118, + "flos": 1553998155264.0, + "grad_norm": 0.023591100709610114, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.7931459, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.12890625, + "step": 2810, + "time_per_iteration": 5.077887296676636 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086772, + "balance_loss_mlp": 1.05580163, + "epoch": 0.5407849172758753, + "flos": 647004478464.0, + "grad_norm": 0.07053414384060859, + "language_loss": 0.80792511, + "learning_rate": 0.000458301817192603, + "loss": 0.81879282, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.30957031, + "step": 2811, + "time_per_iteration": 2.8369667530059814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038738, + "balance_loss_mlp": 1.02586305, + "epoch": 0.5409772989611389, + "flos": 1406641904640.0, + "grad_norm": 0.019629272648215536, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81880522, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.12890625, + "step": 2812, + "time_per_iteration": 4.8166663646698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079133, + "balance_loss_mlp": 1.04790044, + "epoch": 0.5411696806464025, + "flos": 554102360064.0, + "grad_norm": 0.05474211885389724, + "language_loss": 0.86781704, + "learning_rate": 0.00045768093565369983, + "loss": 0.87860835, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.31201172, + "step": 2813, + "time_per_iteration": 2.7311370372772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081245, + "balance_loss_mlp": 1.05077481, + "epoch": 0.5413620623316661, + "flos": 527853030912.0, + "grad_norm": 0.05950457911446913, + "language_loss": 0.8158434, + "learning_rate": 0.0004573705194685646, + "loss": 0.82665586, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.30444336, + "step": 2814, + "time_per_iteration": 2.733198404312134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081332, + "balance_loss_mlp": 1.0498848, + "epoch": 0.5415544440169295, + "flos": 598464820224.0, + "grad_norm": 0.06917969261153488, + "language_loss": 0.84880143, + "learning_rate": 0.00045706011983366157, + "loss": 0.85961473, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.31420898, + "step": 2815, + "time_per_iteration": 2.6939895153045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077713, + "balance_loss_mlp": 1.04683733, + "epoch": 0.5417468257021931, + "flos": 470519456256.0, + "grad_norm": 0.08149095023345422, + "language_loss": 0.82716835, + "learning_rate": 0.00045674973686949847, + "loss": 0.83794552, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.30834961, + "step": 2816, + "time_per_iteration": 2.532838821411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076826, + "balance_loss_mlp": 1.045784, + "epoch": 0.5419392073874567, + "flos": 680477773824.0, + "grad_norm": 0.06493873134640445, + "language_loss": 0.85336345, + "learning_rate": 0.0004564393706965766, + "loss": 0.86413169, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.31005859, + "step": 2817, + "time_per_iteration": 3.013608455657959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077252, + "balance_loss_mlp": 1.04578137, + "epoch": 0.5421315890727203, + "flos": 462134384640.0, + "grad_norm": 0.06666383117391396, + "language_loss": 0.81068963, + "learning_rate": 0.00045612902143539116, + "loss": 0.82146215, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.31469727, + "step": 2818, + "time_per_iteration": 2.605372428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070647, + "balance_loss_mlp": 1.03998637, + "epoch": 0.5423239707579839, + "flos": 436727476224.0, + "grad_norm": 0.07813750406706815, + "language_loss": 0.81324685, + "learning_rate": 0.00045581868920642986, + "loss": 0.82395327, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.30615234, + "step": 2819, + "time_per_iteration": 2.4960100650787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077852, + "balance_loss_mlp": 1.04709649, + "epoch": 0.5425163524432474, + "flos": 458067695616.0, + "grad_norm": 0.07920473504276467, + "language_loss": 0.79243749, + "learning_rate": 0.00045550837413017457, + "loss": 0.80321598, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.30712891, + "step": 2820, + "time_per_iteration": 2.684987783432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072493, + "balance_loss_mlp": 1.04188037, + "epoch": 0.542708734128511, + "flos": 419267681280.0, + "grad_norm": 0.056801171387635116, + "language_loss": 0.85060829, + "learning_rate": 0.0004551980763271005, + "loss": 0.86133325, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.30566406, + "step": 2821, + "time_per_iteration": 2.6912834644317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075835, + "balance_loss_mlp": 1.04529333, + "epoch": 0.5429011158137745, + "flos": 678142568448.0, + "grad_norm": 0.05882616642734503, + "language_loss": 0.83789319, + "learning_rate": 0.0004548877959176756, + "loss": 0.84865159, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.30493164, + "step": 2822, + "time_per_iteration": 2.8441174030303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080776, + "balance_loss_mlp": 1.04985332, + "epoch": 0.5430934974990381, + "flos": 540664174080.0, + "grad_norm": 0.06945933761570218, + "language_loss": 0.86118329, + "learning_rate": 0.00045457753302236166, + "loss": 0.8719911, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.30908203, + "step": 2823, + "time_per_iteration": 2.6186442375183105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107393, + "balance_loss_mlp": 1.04312599, + "epoch": 0.5432858791843016, + "flos": 658175860224.0, + "grad_norm": 0.07165023342281863, + "language_loss": 0.87164384, + "learning_rate": 0.00045426728776161353, + "loss": 0.88238311, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.30761719, + "step": 2824, + "time_per_iteration": 2.7953178882598877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081111, + "balance_loss_mlp": 1.05092704, + "epoch": 0.5434782608695652, + "flos": 531678200832.0, + "grad_norm": 0.05974352124313591, + "language_loss": 0.81803101, + "learning_rate": 0.00045395706025587863, + "loss": 0.8288421, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.30151367, + "step": 2825, + "time_per_iteration": 2.612980604171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076561, + "balance_loss_mlp": 1.04599547, + "epoch": 0.5436706425548288, + "flos": 608219020800.0, + "grad_norm": 0.07443979134593931, + "language_loss": 0.8264693, + "learning_rate": 0.00045364685062559843, + "loss": 0.83723497, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.30541992, + "step": 2826, + "time_per_iteration": 2.828479051589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076945, + "balance_loss_mlp": 1.04630804, + "epoch": 0.5438630242400924, + "flos": 705081549312.0, + "grad_norm": 0.061142502150282975, + "language_loss": 0.91168308, + "learning_rate": 0.0004533366589912067, + "loss": 0.92245257, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.30615234, + "step": 2827, + "time_per_iteration": 2.970296621322632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075368, + "balance_loss_mlp": 1.04599524, + "epoch": 0.544055405925356, + "flos": 856073885184.0, + "grad_norm": 0.07414497131093437, + "language_loss": 0.77502602, + "learning_rate": 0.0004530264854731306, + "loss": 0.78577971, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.29370117, + "step": 2828, + "time_per_iteration": 3.022944450378418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085089, + "balance_loss_mlp": 1.05521488, + "epoch": 0.5442477876106194, + "flos": 571483579392.0, + "grad_norm": 0.048879345895653556, + "language_loss": 0.84054667, + "learning_rate": 0.00045271633019179034, + "loss": 0.85139751, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.29833984, + "step": 2829, + "time_per_iteration": 2.7760679721832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086373, + "balance_loss_mlp": 1.05707121, + "epoch": 0.544440169295883, + "flos": 625246649856.0, + "grad_norm": 0.06402410848819869, + "language_loss": 0.87688053, + "learning_rate": 0.0004524061932675986, + "loss": 0.88774425, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.29248047, + "step": 2830, + "time_per_iteration": 2.830350637435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086958, + "balance_loss_mlp": 1.05691731, + "epoch": 0.5446325509811466, + "flos": 835896181248.0, + "grad_norm": 0.06453180665575306, + "language_loss": 0.86766136, + "learning_rate": 0.00045209607482096125, + "loss": 0.87853098, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.30029297, + "step": 2831, + "time_per_iteration": 3.0085608959198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082113, + "balance_loss_mlp": 1.05192947, + "epoch": 0.5448249326664102, + "flos": 483129778176.0, + "grad_norm": 0.06460698711812493, + "language_loss": 0.84066617, + "learning_rate": 0.0004517859749722772, + "loss": 0.85148734, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.30126953, + "step": 2832, + "time_per_iteration": 2.6471612453460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078289, + "balance_loss_mlp": 1.04803348, + "epoch": 0.5450173143516738, + "flos": 560799618048.0, + "grad_norm": 0.09569427913676506, + "language_loss": 0.78785688, + "learning_rate": 0.0004514758938419376, + "loss": 0.79863977, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.30200195, + "step": 2833, + "time_per_iteration": 2.8068594932556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038153, + "balance_loss_mlp": 1.02627981, + "epoch": 0.5452096960369373, + "flos": 1469632467456.0, + "grad_norm": 0.016706116470577157, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77958739, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.11865234, + "step": 2834, + "time_per_iteration": 4.907236814498901 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079515, + "balance_loss_mlp": 1.04871142, + "epoch": 0.5454020777222008, + "flos": 464827562496.0, + "grad_norm": 0.06561437539450005, + "language_loss": 0.83799005, + "learning_rate": 0.00045085578821782175, + "loss": 0.84878516, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.30761719, + "step": 2835, + "time_per_iteration": 2.538837194442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032792, + "balance_loss_mlp": 1.02082336, + "epoch": 0.5455944594074644, + "flos": 1468921056768.0, + "grad_norm": 0.016611239115941395, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77167535, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.11962891, + "step": 2836, + "time_per_iteration": 4.947264671325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107855, + "balance_loss_mlp": 1.04765117, + "epoch": 0.545786841092728, + "flos": 532900353024.0, + "grad_norm": 0.05618000101860937, + "language_loss": 0.8099249, + "learning_rate": 0.00045023575891159866, + "loss": 0.82071036, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.30859375, + "step": 2837, + "time_per_iteration": 2.7390823364257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025548, + "balance_loss_mlp": 1.01348448, + "epoch": 0.5459792227779915, + "flos": 1351633360896.0, + "grad_norm": 0.010465474292049673, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75789356, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.12060547, + "step": 2838, + "time_per_iteration": 4.913767576217651 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080178, + "balance_loss_mlp": 1.05025697, + "epoch": 0.5461716044632551, + "flos": 637584929280.0, + "grad_norm": 0.053509390521789255, + "language_loss": 0.78084177, + "learning_rate": 0.0004496158068861354, + "loss": 0.7916435, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.29882812, + "step": 2839, + "time_per_iteration": 2.816080331802368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085641, + "balance_loss_mlp": 1.05548143, + "epoch": 0.5463639861485187, + "flos": 602458725888.0, + "grad_norm": 0.05135655646470402, + "language_loss": 0.80302298, + "learning_rate": 0.00044930586015455207, + "loss": 0.81387937, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.30102539, + "step": 2840, + "time_per_iteration": 2.79626727104187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087336, + "balance_loss_mlp": 1.05717611, + "epoch": 0.5465563678337823, + "flos": 642208849920.0, + "grad_norm": 0.05566707414242676, + "language_loss": 0.89057064, + "learning_rate": 0.000448995933104179, + "loss": 0.90144402, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.30102539, + "step": 2841, + "time_per_iteration": 2.8602969646453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080566, + "balance_loss_mlp": 1.0502634, + "epoch": 0.5467487495190458, + "flos": 613852687872.0, + "grad_norm": 0.07080900039808569, + "language_loss": 0.80240697, + "learning_rate": 0.00044868602585534077, + "loss": 0.81321263, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.30297852, + "step": 2842, + "time_per_iteration": 2.9035747051239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078755, + "balance_loss_mlp": 1.04778409, + "epoch": 0.5469411312043093, + "flos": 460957312512.0, + "grad_norm": 0.061738359719804514, + "language_loss": 0.88582397, + "learning_rate": 0.0004483761385283541, + "loss": 0.89661151, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.30932617, + "step": 2843, + "time_per_iteration": 2.5193030834198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074267, + "balance_loss_mlp": 1.04448807, + "epoch": 0.5471335128895729, + "flos": 560930628096.0, + "grad_norm": 0.05447472334615201, + "language_loss": 0.81464523, + "learning_rate": 0.0004480662712435281, + "loss": 0.8253879, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.29736328, + "step": 2844, + "time_per_iteration": 2.731069326400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107206, + "balance_loss_mlp": 1.04254341, + "epoch": 0.5473258945748365, + "flos": 518437863936.0, + "grad_norm": 0.060615817798691185, + "language_loss": 0.8824929, + "learning_rate": 0.0004477564241211635, + "loss": 0.89321351, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.29467773, + "step": 2845, + "time_per_iteration": 2.5875682830810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079224, + "balance_loss_mlp": 1.04880142, + "epoch": 0.5475182762601001, + "flos": 433600722432.0, + "grad_norm": 0.0822753996114188, + "language_loss": 0.86914051, + "learning_rate": 0.0004474465972815541, + "loss": 0.87993276, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.30371094, + "step": 2846, + "time_per_iteration": 2.4777207374572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074275, + "balance_loss_mlp": 1.04406786, + "epoch": 0.5477106579453636, + "flos": 511308440064.0, + "grad_norm": 0.05432348028770475, + "language_loss": 0.87747157, + "learning_rate": 0.000447136790844985, + "loss": 0.88821435, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.30151367, + "step": 2847, + "time_per_iteration": 2.6856186389923096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076945, + "balance_loss_mlp": 1.04623675, + "epoch": 0.5479030396306271, + "flos": 675606541824.0, + "grad_norm": 0.055626256163384374, + "language_loss": 0.81023288, + "learning_rate": 0.00044682700493173385, + "loss": 0.8210023, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.30664062, + "step": 2848, + "time_per_iteration": 2.8167617321014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082333, + "balance_loss_mlp": 1.05229259, + "epoch": 0.5480954213158907, + "flos": 875720498688.0, + "grad_norm": 0.06111415202222153, + "language_loss": 0.80075896, + "learning_rate": 0.00044651723966207004, + "loss": 0.81158233, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.29980469, + "step": 2849, + "time_per_iteration": 3.0959999561309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084207, + "balance_loss_mlp": 1.05435705, + "epoch": 0.5482878030011543, + "flos": 621715433472.0, + "grad_norm": 0.05903862339795778, + "language_loss": 0.78441715, + "learning_rate": 0.00044620749515625536, + "loss": 0.79525924, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.2980957, + "step": 2850, + "time_per_iteration": 2.7892706394195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108025, + "balance_loss_mlp": 1.05001831, + "epoch": 0.5484801846864179, + "flos": 496946285568.0, + "grad_norm": 0.0673362889441577, + "language_loss": 0.84918725, + "learning_rate": 0.00044589777153454334, + "loss": 0.85998976, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.30175781, + "step": 2851, + "time_per_iteration": 2.771003007888794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083219, + "balance_loss_mlp": 1.05241561, + "epoch": 0.5486725663716814, + "flos": 442202582016.0, + "grad_norm": 0.05413608872240749, + "language_loss": 0.83428276, + "learning_rate": 0.00044558806891717895, + "loss": 0.84511489, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.30761719, + "step": 2852, + "time_per_iteration": 2.499460220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088115, + "balance_loss_mlp": 1.0584085, + "epoch": 0.548864948056945, + "flos": 654867224064.0, + "grad_norm": 0.06786065051926819, + "language_loss": 0.79808474, + "learning_rate": 0.0004452783874243998, + "loss": 0.80896592, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.29663086, + "step": 2853, + "time_per_iteration": 2.8307228088378906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084659, + "balance_loss_mlp": 1.05497599, + "epoch": 0.5490573297422086, + "flos": 545760958464.0, + "grad_norm": 0.06292410009946192, + "language_loss": 0.84795368, + "learning_rate": 0.00044496872717643475, + "loss": 0.85880023, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.29638672, + "step": 2854, + "time_per_iteration": 2.6626110076904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051353, + "balance_loss_mlp": 1.03819215, + "epoch": 0.5492497114274721, + "flos": 1589450245632.0, + "grad_norm": 0.03322747605543158, + "language_loss": 0.77089292, + "learning_rate": 0.00044465908829350453, + "loss": 0.78140646, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.13183594, + "step": 2855, + "time_per_iteration": 4.957303285598755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083951, + "balance_loss_mlp": 1.05448246, + "epoch": 0.5494420931127356, + "flos": 750567237120.0, + "grad_norm": 0.04982994122271322, + "language_loss": 0.81768692, + "learning_rate": 0.0004443494708958217, + "loss": 0.82852638, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.29443359, + "step": 2856, + "time_per_iteration": 3.005343437194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088352, + "balance_loss_mlp": 1.0585736, + "epoch": 0.5496344747979992, + "flos": 625704956928.0, + "grad_norm": 0.04689474861444355, + "language_loss": 0.80522525, + "learning_rate": 0.0004440398751035906, + "loss": 0.8161087, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.29736328, + "step": 2857, + "time_per_iteration": 2.868595838546753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095367, + "balance_loss_mlp": 1.06659007, + "epoch": 0.5498268564832628, + "flos": 522859553280.0, + "grad_norm": 0.07030492887566664, + "language_loss": 0.83409548, + "learning_rate": 0.00044373030103700645, + "loss": 0.8450492, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.28759766, + "step": 2858, + "time_per_iteration": 2.5910122394561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094102, + "balance_loss_mlp": 1.06508696, + "epoch": 0.5500192381685264, + "flos": 604290544128.0, + "grad_norm": 0.06946154028242445, + "language_loss": 0.79413795, + "learning_rate": 0.000443420748816257, + "loss": 0.80507904, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.28979492, + "step": 2859, + "time_per_iteration": 2.825594663619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096344, + "balance_loss_mlp": 1.06706619, + "epoch": 0.55021161985379, + "flos": 520246361088.0, + "grad_norm": 0.06600867884275338, + "language_loss": 0.78576386, + "learning_rate": 0.0004431112185615208, + "loss": 0.79672724, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.29248047, + "step": 2860, + "time_per_iteration": 2.786670446395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090723, + "balance_loss_mlp": 1.06154037, + "epoch": 0.5504040015390534, + "flos": 489426955776.0, + "grad_norm": 0.06889565209263777, + "language_loss": 0.79788846, + "learning_rate": 0.00044280171039296845, + "loss": 0.80879569, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.29174805, + "step": 2861, + "time_per_iteration": 2.634674072265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090878, + "balance_loss_mlp": 1.0620054, + "epoch": 0.550596383224317, + "flos": 575519745024.0, + "grad_norm": 0.05438680375258401, + "language_loss": 0.88480103, + "learning_rate": 0.0004424922244307616, + "loss": 0.89570987, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.28857422, + "step": 2862, + "time_per_iteration": 2.6849331855773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093044, + "balance_loss_mlp": 1.06328964, + "epoch": 0.5507887649095806, + "flos": 642149213184.0, + "grad_norm": 0.06984640427248112, + "language_loss": 0.81865609, + "learning_rate": 0.00044218276079505315, + "loss": 0.82958651, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.29711914, + "step": 2863, + "time_per_iteration": 2.9186837673187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092581, + "balance_loss_mlp": 1.06289792, + "epoch": 0.5509811465948442, + "flos": 531589450752.0, + "grad_norm": 0.06524866768544495, + "language_loss": 0.74926496, + "learning_rate": 0.0004418733196059876, + "loss": 0.76019078, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.29663086, + "step": 2864, + "time_per_iteration": 2.74560546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084987, + "balance_loss_mlp": 1.05635333, + "epoch": 0.5511735282801077, + "flos": 654439440384.0, + "grad_norm": 0.056184402553186, + "language_loss": 0.79785758, + "learning_rate": 0.0004415639009837008, + "loss": 0.80870748, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.28637695, + "step": 2865, + "time_per_iteration": 2.81969952583313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087597, + "balance_loss_mlp": 1.05908251, + "epoch": 0.5513659099653713, + "flos": 529222159872.0, + "grad_norm": 0.061494004909324176, + "language_loss": 0.81620675, + "learning_rate": 0.00044125450504831955, + "loss": 0.82708275, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.28540039, + "step": 2866, + "time_per_iteration": 2.739954948425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085385, + "balance_loss_mlp": 1.05586863, + "epoch": 0.5515582916506349, + "flos": 554594162688.0, + "grad_norm": 0.07127737838687996, + "language_loss": 0.81880403, + "learning_rate": 0.0004409451319199622, + "loss": 0.82965791, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.29467773, + "step": 2867, + "time_per_iteration": 2.6776282787323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077009, + "balance_loss_mlp": 1.0484705, + "epoch": 0.5517506733358984, + "flos": 735067298304.0, + "grad_norm": 0.06535442843844029, + "language_loss": 0.84516299, + "learning_rate": 0.0004406357817187381, + "loss": 0.85593313, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.28540039, + "step": 2868, + "time_per_iteration": 3.002542495727539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081101, + "balance_loss_mlp": 1.05170417, + "epoch": 0.551943055021162, + "flos": 1114861432320.0, + "grad_norm": 0.05667738365358171, + "language_loss": 0.81411439, + "learning_rate": 0.0004403264545647474, + "loss": 0.82492542, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.29370117, + "step": 2869, + "time_per_iteration": 3.523195505142212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080839, + "balance_loss_mlp": 1.05196702, + "epoch": 0.5521354367064255, + "flos": 544092083712.0, + "grad_norm": 0.062383704003679354, + "language_loss": 0.8429901, + "learning_rate": 0.00044001715057808154, + "loss": 0.85379851, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.28808594, + "step": 2870, + "time_per_iteration": 2.759244680404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085176, + "balance_loss_mlp": 1.05496836, + "epoch": 0.5523278183916891, + "flos": 935889845760.0, + "grad_norm": 0.05408626919612749, + "language_loss": 0.81631571, + "learning_rate": 0.0004397078698788232, + "loss": 0.82716751, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.30175781, + "step": 2871, + "time_per_iteration": 3.2238638401031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_mlp": 1.0167197, + "epoch": 0.5525202000769527, + "flos": 1465117645824.0, + "grad_norm": 0.017765030651381717, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81471765, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.12695312, + "step": 2872, + "time_per_iteration": 4.941680431365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084518, + "balance_loss_mlp": 1.05442953, + "epoch": 0.5527125817622163, + "flos": 489554993664.0, + "grad_norm": 0.06021715836391359, + "language_loss": 0.77858603, + "learning_rate": 0.00043908937882281343, + "loss": 0.78943121, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.30029297, + "step": 2873, + "time_per_iteration": 2.6475777626037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079069, + "balance_loss_mlp": 1.04845667, + "epoch": 0.5529049634474797, + "flos": 634606562304.0, + "grad_norm": 0.05779342240658392, + "language_loss": 0.82503784, + "learning_rate": 0.0004387801687061814, + "loss": 0.83582854, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.30566406, + "step": 2874, + "time_per_iteration": 2.8554017543792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078914, + "balance_loss_mlp": 1.04963589, + "epoch": 0.5530973451327433, + "flos": 580986086400.0, + "grad_norm": 0.0636526113513214, + "language_loss": 0.80157411, + "learning_rate": 0.0004384709823571958, + "loss": 0.81236321, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.29223633, + "step": 2875, + "time_per_iteration": 2.749535322189331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076752, + "balance_loss_mlp": 1.04764128, + "epoch": 0.5532897268180069, + "flos": 1122030144000.0, + "grad_norm": 0.06015536663517987, + "language_loss": 0.82898968, + "learning_rate": 0.0004381618198958932, + "loss": 0.8397572, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.29052734, + "step": 2876, + "time_per_iteration": 3.518888235092163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078853, + "balance_loss_mlp": 1.0494318, + "epoch": 0.5534821085032705, + "flos": 636965088768.0, + "grad_norm": 0.05611364502947972, + "language_loss": 0.83295852, + "learning_rate": 0.00043785268144230137, + "loss": 0.84374702, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.29418945, + "step": 2877, + "time_per_iteration": 2.8977479934692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078991, + "balance_loss_mlp": 1.04916453, + "epoch": 0.5536744901885341, + "flos": 570837597696.0, + "grad_norm": 0.07334940017367843, + "language_loss": 0.82020825, + "learning_rate": 0.00043754356711643837, + "loss": 0.83099812, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 0.29785156, + "step": 2878, + "time_per_iteration": 2.6804401874542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080304, + "balance_loss_mlp": 1.04964316, + "epoch": 0.5538668718737976, + "flos": 595418052096.0, + "grad_norm": 0.0625181232423103, + "language_loss": 0.84172422, + "learning_rate": 0.0004372344770383132, + "loss": 0.85252726, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.30615234, + "step": 2879, + "time_per_iteration": 2.80837345123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077442, + "balance_loss_mlp": 1.04766345, + "epoch": 0.5540592535590612, + "flos": 532324182528.0, + "grad_norm": 0.05711228581787917, + "language_loss": 0.82837629, + "learning_rate": 0.00043692541132792507, + "loss": 0.83915067, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.29736328, + "step": 2880, + "time_per_iteration": 2.7545833587646484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076783, + "balance_loss_mlp": 1.04738569, + "epoch": 0.5542516352443247, + "flos": 412398715392.0, + "grad_norm": 0.06446598855551679, + "language_loss": 0.83125883, + "learning_rate": 0.00043661637010526384, + "loss": 0.84202665, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.29370117, + "step": 2881, + "time_per_iteration": 2.4907724857330322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072171, + "balance_loss_mlp": 1.04139102, + "epoch": 0.5544440169295883, + "flos": 547341083136.0, + "grad_norm": 0.05841414515956175, + "language_loss": 0.82957321, + "learning_rate": 0.00043630735349031025, + "loss": 0.8402949, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.30737305, + "step": 2882, + "time_per_iteration": 2.6922152042388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071624, + "balance_loss_mlp": 1.04101133, + "epoch": 0.5546363986148518, + "flos": 621518994432.0, + "grad_norm": 0.05422763519754927, + "language_loss": 0.81816816, + "learning_rate": 0.00043599836160303495, + "loss": 0.82888442, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.30566406, + "step": 2883, + "time_per_iteration": 2.861325979232788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069587, + "balance_loss_mlp": 1.03914118, + "epoch": 0.5548287803001154, + "flos": 704972450304.0, + "grad_norm": 0.05987077775612136, + "language_loss": 0.77311337, + "learning_rate": 0.0004356893945633995, + "loss": 0.78380919, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.30395508, + "step": 2884, + "time_per_iteration": 2.964421510696411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070587, + "balance_loss_mlp": 1.03930664, + "epoch": 0.555021161985379, + "flos": 503952053760.0, + "grad_norm": 0.16390384373312603, + "language_loss": 0.81600153, + "learning_rate": 0.0004353804524913551, + "loss": 0.82670736, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.3125, + "step": 2885, + "time_per_iteration": 2.6043736934661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068449, + "balance_loss_mlp": 1.03721642, + "epoch": 0.5552135436706426, + "flos": 615782020608.0, + "grad_norm": 0.06199045057720987, + "language_loss": 0.81625175, + "learning_rate": 0.0004350715355068441, + "loss": 0.82693619, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.31225586, + "step": 2886, + "time_per_iteration": 2.7229857444763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072103, + "balance_loss_mlp": 1.04051256, + "epoch": 0.5554059253559062, + "flos": 463635933696.0, + "grad_norm": 0.06868325666686464, + "language_loss": 0.79814357, + "learning_rate": 0.00043476264372979847, + "loss": 0.80886459, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.31567383, + "step": 2887, + "time_per_iteration": 2.5191705226898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071885, + "balance_loss_mlp": 1.0417012, + "epoch": 0.5555983070411696, + "flos": 1561923813888.0, + "grad_norm": 0.07224884026335429, + "language_loss": 0.78504527, + "learning_rate": 0.0004344537772801408, + "loss": 0.79576409, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.30151367, + "step": 2888, + "time_per_iteration": 3.803917646408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032514, + "balance_loss_mlp": 1.02040219, + "epoch": 0.5557906887264332, + "flos": 1467093468672.0, + "grad_norm": 0.021049912274883148, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74454963, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.12109375, + "step": 2889, + "time_per_iteration": 4.967891216278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077464, + "balance_loss_mlp": 1.04613566, + "epoch": 0.5559830704116968, + "flos": 529575750144.0, + "grad_norm": 0.06601593716549485, + "language_loss": 0.83441556, + "learning_rate": 0.0004338361208426298, + "loss": 0.84519023, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.31298828, + "step": 2890, + "time_per_iteration": 2.6076786518096924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077476, + "balance_loss_mlp": 1.0466727, + "epoch": 0.5561754520969604, + "flos": 650895077376.0, + "grad_norm": 0.05044338716051736, + "language_loss": 0.81248903, + "learning_rate": 0.00043352733109457164, + "loss": 0.82326382, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.30761719, + "step": 2891, + "time_per_iteration": 2.893113136291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081411, + "balance_loss_mlp": 1.05148911, + "epoch": 0.556367833782224, + "flos": 733968801792.0, + "grad_norm": 0.05185548617134015, + "language_loss": 0.84650671, + "learning_rate": 0.00043321856715349244, + "loss": 0.8573209, + "num_input_tokens_seen": 241706272, + "router_z_loss_mlp": 0.29907227, + "step": 2892, + "time_per_iteration": 2.9470455646514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080498, + "balance_loss_mlp": 1.05024242, + "epoch": 0.5565602154674875, + "flos": 672120405504.0, + "grad_norm": 0.060968656189677554, + "language_loss": 0.80153251, + "learning_rate": 0.00043290982913926466, + "loss": 0.81233752, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.30249023, + "step": 2893, + "time_per_iteration": 2.801114559173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082896, + "balance_loss_mlp": 1.05283189, + "epoch": 0.556752597152751, + "flos": 585911162880.0, + "grad_norm": 0.06077441603872835, + "language_loss": 0.83792776, + "learning_rate": 0.0004326011171717514, + "loss": 0.84875673, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.30004883, + "step": 2894, + "time_per_iteration": 2.889112710952759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077209, + "balance_loss_mlp": 1.04762125, + "epoch": 0.5569449788380146, + "flos": 437549548032.0, + "grad_norm": 0.06532751979042353, + "language_loss": 0.81112337, + "learning_rate": 0.0004322924313708051, + "loss": 0.82189548, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.29614258, + "step": 2895, + "time_per_iteration": 2.5237138271331787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077612, + "balance_loss_mlp": 1.04895401, + "epoch": 0.5571373605232782, + "flos": 502002372096.0, + "grad_norm": 0.06395509577189365, + "language_loss": 0.84357458, + "learning_rate": 0.0004319837718562681, + "loss": 0.85435069, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.28686523, + "step": 2896, + "time_per_iteration": 2.6235451698303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081945, + "balance_loss_mlp": 1.05123627, + "epoch": 0.5573297422085417, + "flos": 577126010880.0, + "grad_norm": 0.07087835610959153, + "language_loss": 0.82998407, + "learning_rate": 0.0004316751387479726, + "loss": 0.8408035, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.30664062, + "step": 2897, + "time_per_iteration": 2.7460193634033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081079, + "balance_loss_mlp": 1.05115747, + "epoch": 0.5575221238938053, + "flos": 1343536754688.0, + "grad_norm": 0.06734561564060734, + "language_loss": 0.82601708, + "learning_rate": 0.0004313665321657409, + "loss": 0.83682787, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.29882812, + "step": 2898, + "time_per_iteration": 3.700585126876831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083979, + "balance_loss_mlp": 1.05393827, + "epoch": 0.5577145055790689, + "flos": 601680324096.0, + "grad_norm": 0.06408348461050545, + "language_loss": 0.79922706, + "learning_rate": 0.00043105795222938436, + "loss": 0.81006682, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.30004883, + "step": 2899, + "time_per_iteration": 2.785468816757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077879, + "balance_loss_mlp": 1.04776657, + "epoch": 0.5579068872643325, + "flos": 562353601536.0, + "grad_norm": 0.056878366734987945, + "language_loss": 0.78559703, + "learning_rate": 0.00043074939905870467, + "loss": 0.79637581, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.30078125, + "step": 2900, + "time_per_iteration": 2.6782429218292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081281, + "balance_loss_mlp": 1.05157411, + "epoch": 0.558099268949596, + "flos": 544292904960.0, + "grad_norm": 0.061480860141572814, + "language_loss": 0.806315, + "learning_rate": 0.0004304408727734927, + "loss": 0.81712782, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.296875, + "step": 2901, + "time_per_iteration": 2.6361851692199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089927, + "balance_loss_mlp": 1.05955291, + "epoch": 0.5582916506348595, + "flos": 552520825344.0, + "grad_norm": 0.045249909626423154, + "language_loss": 0.88812852, + "learning_rate": 0.0004301323734935288, + "loss": 0.89902782, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.3034668, + "step": 2902, + "time_per_iteration": 2.650801181793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085072, + "balance_loss_mlp": 1.05541265, + "epoch": 0.5584840323201231, + "flos": 543126007296.0, + "grad_norm": 0.061039385793722846, + "language_loss": 0.87144208, + "learning_rate": 0.000429823901338583, + "loss": 0.88229275, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.29638672, + "step": 2903, + "time_per_iteration": 2.603729486465454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108106, + "balance_loss_mlp": 1.05128181, + "epoch": 0.5586764140053867, + "flos": 815212118016.0, + "grad_norm": 0.060582508535745275, + "language_loss": 0.86712891, + "learning_rate": 0.00042951545642841513, + "loss": 0.87793946, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.29711914, + "step": 2904, + "time_per_iteration": 3.0844316482543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084583, + "balance_loss_mlp": 1.05437517, + "epoch": 0.5588687956906503, + "flos": 486196895232.0, + "grad_norm": 0.055991570648287706, + "language_loss": 0.86597067, + "learning_rate": 0.0004292070388827737, + "loss": 0.87681645, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.30175781, + "step": 2905, + "time_per_iteration": 2.561948537826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082655, + "balance_loss_mlp": 1.0526619, + "epoch": 0.5590611773759138, + "flos": 451809805824.0, + "grad_norm": 0.06056202554709599, + "language_loss": 0.80913132, + "learning_rate": 0.00042889864882139753, + "loss": 0.81995785, + "num_input_tokens_seen": 242737456, + "router_z_loss_mlp": 0.29956055, + "step": 2906, + "time_per_iteration": 2.584385871887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088672, + "balance_loss_mlp": 1.05913234, + "epoch": 0.5592535590611774, + "flos": 520693083648.0, + "grad_norm": 0.05654682862292604, + "language_loss": 0.81697655, + "learning_rate": 0.0004285902863640139, + "loss": 0.82786322, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.29516602, + "step": 2907, + "time_per_iteration": 2.598034620285034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084011, + "balance_loss_mlp": 1.05342221, + "epoch": 0.5594459407464409, + "flos": 552250192896.0, + "grad_norm": 0.05788374674587666, + "language_loss": 0.85753977, + "learning_rate": 0.00042828195163033966, + "loss": 0.86837995, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.30566406, + "step": 2908, + "time_per_iteration": 2.654411792755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081373, + "balance_loss_mlp": 1.05099869, + "epoch": 0.5596383224317045, + "flos": 484596421632.0, + "grad_norm": 0.05647224332708591, + "language_loss": 0.79214805, + "learning_rate": 0.0004279736447400812, + "loss": 0.80296183, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.30322266, + "step": 2909, + "time_per_iteration": 2.6054940223693848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108459, + "balance_loss_mlp": 1.05421579, + "epoch": 0.5598307041169681, + "flos": 610976217600.0, + "grad_norm": 0.05245180641385236, + "language_loss": 0.78436708, + "learning_rate": 0.00042766536581293385, + "loss": 0.79521292, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.3034668, + "step": 2910, + "time_per_iteration": 2.735391139984131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086034, + "balance_loss_mlp": 1.0553261, + "epoch": 0.5600230858022316, + "flos": 488585945088.0, + "grad_norm": 0.07209314448313818, + "language_loss": 0.79203892, + "learning_rate": 0.0004273571149685819, + "loss": 0.80289924, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.30664062, + "step": 2911, + "time_per_iteration": 2.7689387798309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081503, + "balance_loss_mlp": 1.05234432, + "epoch": 0.5602154674874952, + "flos": 598592858112.0, + "grad_norm": 0.05523073387542819, + "language_loss": 0.8391124, + "learning_rate": 0.00042704889232669937, + "loss": 0.84992743, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.29125977, + "step": 2912, + "time_per_iteration": 2.7328362464904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082045, + "balance_loss_mlp": 1.05288625, + "epoch": 0.5604078491727588, + "flos": 585697347072.0, + "grad_norm": 0.0608748772154565, + "language_loss": 0.85180819, + "learning_rate": 0.0004267406980069484, + "loss": 0.8626287, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.29150391, + "step": 2913, + "time_per_iteration": 2.6889522075653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083223, + "balance_loss_mlp": 1.05416012, + "epoch": 0.5606002308580224, + "flos": 540926042112.0, + "grad_norm": 0.0517518520900543, + "language_loss": 0.79621083, + "learning_rate": 0.0004264325321289808, + "loss": 0.80704308, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.2902832, + "step": 2914, + "time_per_iteration": 2.7854018211364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080994, + "balance_loss_mlp": 1.05145359, + "epoch": 0.5607926125432858, + "flos": 583654533120.0, + "grad_norm": 0.05874282962966631, + "language_loss": 0.86178029, + "learning_rate": 0.00042612439481243736, + "loss": 0.87259024, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.29516602, + "step": 2915, + "time_per_iteration": 2.7484261989593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082353, + "balance_loss_mlp": 1.05264628, + "epoch": 0.5609849942285494, + "flos": 627205095936.0, + "grad_norm": 0.06045457404054478, + "language_loss": 0.89827836, + "learning_rate": 0.00042581628617694735, + "loss": 0.90910184, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.29663086, + "step": 2916, + "time_per_iteration": 2.7450428009033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108385, + "balance_loss_mlp": 1.05376196, + "epoch": 0.561177375913813, + "flos": 588095161344.0, + "grad_norm": 0.06174360046329572, + "language_loss": 0.81716877, + "learning_rate": 0.0004255082063421296, + "loss": 0.82800722, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.30078125, + "step": 2917, + "time_per_iteration": 2.681556463241577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080705, + "balance_loss_mlp": 1.0505209, + "epoch": 0.5613697575990766, + "flos": 526774883328.0, + "grad_norm": 0.07215647610626674, + "language_loss": 0.85068524, + "learning_rate": 0.00042520015542759065, + "loss": 0.86149234, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.30151367, + "step": 2918, + "time_per_iteration": 2.838871717453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083881, + "balance_loss_mlp": 1.05379248, + "epoch": 0.5615621392843402, + "flos": 642351444480.0, + "grad_norm": 0.06380613116798055, + "language_loss": 0.88105166, + "learning_rate": 0.00042489213355292687, + "loss": 0.89189053, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.30053711, + "step": 2919, + "time_per_iteration": 2.882988214492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081698, + "balance_loss_mlp": 1.0521102, + "epoch": 0.5617545209696037, + "flos": 427524715008.0, + "grad_norm": 0.05903342570268675, + "language_loss": 0.80986512, + "learning_rate": 0.00042458414083772276, + "loss": 0.82068217, + "num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.29541016, + "step": 2920, + "time_per_iteration": 2.520209550857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107915, + "balance_loss_mlp": 1.04829907, + "epoch": 0.5619469026548672, + "flos": 568140037632.0, + "grad_norm": 0.05182413981421792, + "language_loss": 0.85047603, + "learning_rate": 0.000424276177401552, + "loss": 0.86126757, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.30810547, + "step": 2921, + "time_per_iteration": 2.777956008911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075106, + "balance_loss_mlp": 1.04435039, + "epoch": 0.5621392843401308, + "flos": 504947243520.0, + "grad_norm": 0.05854064719302618, + "language_loss": 0.85700345, + "learning_rate": 0.0004239682433639763, + "loss": 0.86775458, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.30712891, + "step": 2922, + "time_per_iteration": 2.658231019973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074103, + "balance_loss_mlp": 1.04344249, + "epoch": 0.5623316660253944, + "flos": 516744258048.0, + "grad_norm": 0.07532891292065343, + "language_loss": 0.85277867, + "learning_rate": 0.0004236603388445467, + "loss": 0.86351973, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.30639648, + "step": 2923, + "time_per_iteration": 2.5820417404174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073675, + "balance_loss_mlp": 1.04346776, + "epoch": 0.5625240477106579, + "flos": 605732456448.0, + "grad_norm": 0.05777778027932593, + "language_loss": 0.82139969, + "learning_rate": 0.00042335246396280166, + "loss": 0.83213639, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.30151367, + "step": 2924, + "time_per_iteration": 2.7298922538757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072793, + "balance_loss_mlp": 1.04198885, + "epoch": 0.5627164293959215, + "flos": 450203539968.0, + "grad_norm": 0.06950178029529624, + "language_loss": 0.90437222, + "learning_rate": 0.0004230446188382693, + "loss": 0.9151001, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.30761719, + "step": 2925, + "time_per_iteration": 2.533452033996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072327, + "balance_loss_mlp": 1.04133308, + "epoch": 0.5629088110811851, + "flos": 741734032896.0, + "grad_norm": 0.061159313769390204, + "language_loss": 0.80411077, + "learning_rate": 0.0004227368035904654, + "loss": 0.81483406, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.30957031, + "step": 2926, + "time_per_iteration": 2.953749895095825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070388, + "balance_loss_mlp": 1.04001379, + "epoch": 0.5631011927664487, + "flos": 496719323136.0, + "grad_norm": 0.05619049718209651, + "language_loss": 0.82702053, + "learning_rate": 0.00042242901833889474, + "loss": 0.83772445, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.30322266, + "step": 2927, + "time_per_iteration": 2.6141388416290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079835, + "balance_loss_mlp": 1.04977047, + "epoch": 0.5632935744517122, + "flos": 885774445056.0, + "grad_norm": 0.06403217415420936, + "language_loss": 0.86264247, + "learning_rate": 0.0004221212632030501, + "loss": 0.8734408, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.30004883, + "step": 2928, + "time_per_iteration": 3.0815889835357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079959, + "balance_loss_mlp": 1.04953694, + "epoch": 0.5634859561369757, + "flos": 604516096512.0, + "grad_norm": 0.0586888061552407, + "language_loss": 0.7995134, + "learning_rate": 0.0004218135383024124, + "loss": 0.81031299, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.30395508, + "step": 2929, + "time_per_iteration": 2.7041475772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074718, + "balance_loss_mlp": 1.04417634, + "epoch": 0.5636783378222393, + "flos": 453670737408.0, + "grad_norm": 0.06027811401713532, + "language_loss": 0.84979665, + "learning_rate": 0.0004215058437564511, + "loss": 0.86054391, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.30493164, + "step": 2930, + "time_per_iteration": 2.5627479553222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074654, + "balance_loss_mlp": 1.04427934, + "epoch": 0.5638707195075029, + "flos": 518206519296.0, + "grad_norm": 0.054381619158741505, + "language_loss": 0.8244099, + "learning_rate": 0.00042119817968462397, + "loss": 0.83515644, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.30322266, + "step": 2931, + "time_per_iteration": 2.5824992656707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076007, + "balance_loss_mlp": 1.04517913, + "epoch": 0.5640631011927665, + "flos": 564632142336.0, + "grad_norm": 0.06458971753482587, + "language_loss": 0.86743045, + "learning_rate": 0.0004208905462063766, + "loss": 0.87819058, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.30786133, + "step": 2932, + "time_per_iteration": 2.6889755725860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075474, + "balance_loss_mlp": 1.04447937, + "epoch": 0.56425548287803, + "flos": 516783545856.0, + "grad_norm": 0.05636003677155103, + "language_loss": 0.84317416, + "learning_rate": 0.00042058294344114315, + "loss": 0.85392892, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.30957031, + "step": 2933, + "time_per_iteration": 2.626492500305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073066, + "balance_loss_mlp": 1.0428108, + "epoch": 0.5644478645632935, + "flos": 853907415552.0, + "grad_norm": 0.05419859074132438, + "language_loss": 0.77552223, + "learning_rate": 0.0004202753715083456, + "loss": 0.78625292, + "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.30224609, + "step": 2934, + "time_per_iteration": 3.0855889320373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077905, + "balance_loss_mlp": 1.04767334, + "epoch": 0.5646402462485571, + "flos": 553175571456.0, + "grad_norm": 0.0600578906837947, + "language_loss": 0.81160748, + "learning_rate": 0.0004199678305273936, + "loss": 0.8223865, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.30200195, + "step": 2935, + "time_per_iteration": 2.680676221847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072428, + "balance_loss_mlp": 1.04176772, + "epoch": 0.5648326279338207, + "flos": 685661898240.0, + "grad_norm": 0.07403764487671594, + "language_loss": 0.81138289, + "learning_rate": 0.0004196603206176854, + "loss": 0.8221072, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.30615234, + "step": 2936, + "time_per_iteration": 2.930933713912964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084589, + "balance_loss_mlp": 1.05526328, + "epoch": 0.5650250096190843, + "flos": 802990291968.0, + "grad_norm": 0.06763515513860026, + "language_loss": 0.8344292, + "learning_rate": 0.000419352841898607, + "loss": 0.8452751, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.29272461, + "step": 2937, + "time_per_iteration": 2.983389377593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076783, + "balance_loss_mlp": 1.04714775, + "epoch": 0.5652173913043478, + "flos": 581787809280.0, + "grad_norm": 0.06159153322850295, + "language_loss": 0.77355075, + "learning_rate": 0.000419045394489532, + "loss": 0.78431857, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.29589844, + "step": 2938, + "time_per_iteration": 2.7125768661499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082739, + "balance_loss_mlp": 1.05229306, + "epoch": 0.5654097729896114, + "flos": 820269614592.0, + "grad_norm": 0.051986884313783496, + "language_loss": 0.76774859, + "learning_rate": 0.0004187379785098224, + "loss": 0.77857602, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.30395508, + "step": 2939, + "time_per_iteration": 3.127896547317505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078707, + "balance_loss_mlp": 1.04854691, + "epoch": 0.565602154674875, + "flos": 783826716672.0, + "grad_norm": 0.05965997721506439, + "language_loss": 0.83921504, + "learning_rate": 0.00041843059407882744, + "loss": 0.85000205, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.30126953, + "step": 2940, + "time_per_iteration": 2.97220778465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010812, + "balance_loss_mlp": 1.05113554, + "epoch": 0.5657945363601385, + "flos": 549418802688.0, + "grad_norm": 0.05367108270531433, + "language_loss": 0.82534146, + "learning_rate": 0.0004181232413158842, + "loss": 0.83615345, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.30004883, + "step": 2941, + "time_per_iteration": 2.642336368560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108333, + "balance_loss_mlp": 1.05405188, + "epoch": 0.5659869180454021, + "flos": 667826754048.0, + "grad_norm": 0.06412651995290534, + "language_loss": 0.82513189, + "learning_rate": 0.0004178159203403179, + "loss": 0.83596516, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.29272461, + "step": 2942, + "time_per_iteration": 2.856449842453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082217, + "balance_loss_mlp": 1.05260575, + "epoch": 0.5661792997306656, + "flos": 499707864576.0, + "grad_norm": 0.056771241115104176, + "language_loss": 0.81273901, + "learning_rate": 0.0004175086312714409, + "loss": 0.82356119, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.2956543, + "step": 2943, + "time_per_iteration": 2.62709903717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088098, + "balance_loss_mlp": 1.05898714, + "epoch": 0.5663716814159292, + "flos": 600922271232.0, + "grad_norm": 0.050224853353863855, + "language_loss": 0.83679438, + "learning_rate": 0.00041720137422855366, + "loss": 0.84767538, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.29052734, + "step": 2944, + "time_per_iteration": 2.730576515197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086164, + "balance_loss_mlp": 1.05710077, + "epoch": 0.5665640631011928, + "flos": 540728193024.0, + "grad_norm": 0.0578384318096137, + "language_loss": 0.78684467, + "learning_rate": 0.00041689414933094383, + "loss": 0.79770631, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.2902832, + "step": 2945, + "time_per_iteration": 2.6317861080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084966, + "balance_loss_mlp": 1.05483007, + "epoch": 0.5667564447864564, + "flos": 601655592960.0, + "grad_norm": 0.061631419209263724, + "language_loss": 0.80986917, + "learning_rate": 0.00041658695669788653, + "loss": 0.82071877, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.30102539, + "step": 2946, + "time_per_iteration": 2.766889810562134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083037, + "balance_loss_mlp": 1.05352092, + "epoch": 0.5669488264717198, + "flos": 659224894464.0, + "grad_norm": 0.08686938236765575, + "language_loss": 0.81373537, + "learning_rate": 0.00041627979644864453, + "loss": 0.82456571, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.29467773, + "step": 2947, + "time_per_iteration": 2.7937870025634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085685, + "balance_loss_mlp": 1.0563122, + "epoch": 0.5671412081569834, + "flos": 485158035456.0, + "grad_norm": 0.05686002455066826, + "language_loss": 0.81299067, + "learning_rate": 0.0004159726687024683, + "loss": 0.82384753, + "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.29345703, + "step": 2948, + "time_per_iteration": 2.636784791946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083527, + "balance_loss_mlp": 1.05417752, + "epoch": 0.567333589842247, + "flos": 729487475712.0, + "grad_norm": 0.057207156589959604, + "language_loss": 0.7857877, + "learning_rate": 0.00041566557357859506, + "loss": 0.79662293, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.29321289, + "step": 2949, + "time_per_iteration": 2.8607821464538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080781, + "balance_loss_mlp": 1.05131269, + "epoch": 0.5675259715275106, + "flos": 968471258112.0, + "grad_norm": 0.050618871180039625, + "language_loss": 0.79166919, + "learning_rate": 0.0004153585111962502, + "loss": 0.802477, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.29443359, + "step": 2950, + "time_per_iteration": 3.306715250015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108342, + "balance_loss_mlp": 1.05387974, + "epoch": 0.5677183532127742, + "flos": 564879453696.0, + "grad_norm": 0.08196542197504524, + "language_loss": 0.84189069, + "learning_rate": 0.0004150514816746453, + "loss": 0.85272491, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.29492188, + "step": 2951, + "time_per_iteration": 2.6732659339904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080966, + "balance_loss_mlp": 1.05190265, + "epoch": 0.5679107348980377, + "flos": 551432503296.0, + "grad_norm": 0.06474663434913709, + "language_loss": 0.85581088, + "learning_rate": 0.0004147444851329802, + "loss": 0.86662048, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.29003906, + "step": 2952, + "time_per_iteration": 2.647568941116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079758, + "balance_loss_mlp": 1.05081391, + "epoch": 0.5681031165833013, + "flos": 819115863552.0, + "grad_norm": 0.0574748240063073, + "language_loss": 0.85410154, + "learning_rate": 0.00041443752169044126, + "loss": 0.8648991, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.28955078, + "step": 2953, + "time_per_iteration": 3.018815040588379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081341, + "balance_loss_mlp": 1.05227828, + "epoch": 0.5682954982685648, + "flos": 617731702272.0, + "grad_norm": 0.05380576703697579, + "language_loss": 0.846789, + "learning_rate": 0.0004141305914662025, + "loss": 0.85760248, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.29052734, + "step": 2954, + "time_per_iteration": 2.7356324195861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088016, + "balance_loss_mlp": 1.05807066, + "epoch": 0.5684878799538284, + "flos": 647625729024.0, + "grad_norm": 0.05392421630137883, + "language_loss": 0.80538452, + "learning_rate": 0.0004138236945794246, + "loss": 0.81626463, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.29907227, + "step": 2955, + "time_per_iteration": 2.8904106616973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082907, + "balance_loss_mlp": 1.05439222, + "epoch": 0.5686802616390919, + "flos": 805615068672.0, + "grad_norm": 0.07320613099583566, + "language_loss": 0.83898306, + "learning_rate": 0.00041351683114925576, + "loss": 0.84981215, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.28491211, + "step": 2956, + "time_per_iteration": 3.0756330490112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085945, + "balance_loss_mlp": 1.05683398, + "epoch": 0.5688726433243555, + "flos": 546882776064.0, + "grad_norm": 0.05933823821942172, + "language_loss": 0.86556458, + "learning_rate": 0.0004132100012948308, + "loss": 0.87642407, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.29077148, + "step": 2957, + "time_per_iteration": 2.6803860664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085401, + "balance_loss_mlp": 1.05614674, + "epoch": 0.5690650250096191, + "flos": 486324933120.0, + "grad_norm": 0.06187903851247569, + "language_loss": 0.84050244, + "learning_rate": 0.00041290320513527145, + "loss": 0.85135645, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.29248047, + "step": 2958, + "time_per_iteration": 2.54225754737854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084682, + "balance_loss_mlp": 1.05545211, + "epoch": 0.5692574066948827, + "flos": 577184237568.0, + "grad_norm": 0.04955077863713089, + "language_loss": 0.85089266, + "learning_rate": 0.0004125964427896867, + "loss": 0.86173952, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.29199219, + "step": 2959, + "time_per_iteration": 2.716848611831665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083431, + "balance_loss_mlp": 1.0530802, + "epoch": 0.5694497883801463, + "flos": 454005388800.0, + "grad_norm": 0.0635030186812047, + "language_loss": 0.79277623, + "learning_rate": 0.0004122897143771723, + "loss": 0.80361056, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.30297852, + "step": 2960, + "time_per_iteration": 2.53230357170105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086179, + "balance_loss_mlp": 1.05628169, + "epoch": 0.5696421700654097, + "flos": 559251578880.0, + "grad_norm": 0.052407613892641675, + "language_loss": 0.81192493, + "learning_rate": 0.0004119830200168109, + "loss": 0.82278675, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.29858398, + "step": 2961, + "time_per_iteration": 2.684126377105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083551, + "balance_loss_mlp": 1.05355775, + "epoch": 0.5698345517506733, + "flos": 465314982912.0, + "grad_norm": 0.06121192976286501, + "language_loss": 0.88053119, + "learning_rate": 0.0004116763598276714, + "loss": 0.89136672, + "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.29956055, + "step": 2962, + "time_per_iteration": 2.531313180923462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108181, + "balance_loss_mlp": 1.05138803, + "epoch": 0.5700269334359369, + "flos": 605645116416.0, + "grad_norm": 0.069996546899228, + "language_loss": 0.8081792, + "learning_rate": 0.00041136973392881017, + "loss": 0.81899732, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.30395508, + "step": 2963, + "time_per_iteration": 2.8093085289001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083477, + "balance_loss_mlp": 1.05357933, + "epoch": 0.5702193151212005, + "flos": 562423412736.0, + "grad_norm": 0.06390032386968057, + "language_loss": 0.8227576, + "learning_rate": 0.00041106314243926983, + "loss": 0.8335923, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.29858398, + "step": 2964, + "time_per_iteration": 2.740004062652588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080188, + "balance_loss_mlp": 1.05062366, + "epoch": 0.570411696806464, + "flos": 522983208960.0, + "grad_norm": 0.060533570265575896, + "language_loss": 0.87250763, + "learning_rate": 0.0004107565854780798, + "loss": 0.88330954, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.29516602, + "step": 2965, + "time_per_iteration": 2.6749136447906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080245, + "balance_loss_mlp": 1.05111039, + "epoch": 0.5706040784917276, + "flos": 717911631360.0, + "grad_norm": 0.06664541213513904, + "language_loss": 0.80888879, + "learning_rate": 0.000410450063164256, + "loss": 0.81969118, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.29077148, + "step": 2966, + "time_per_iteration": 2.8448963165283203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081067, + "balance_loss_mlp": 1.05081153, + "epoch": 0.5707964601769911, + "flos": 476467425792.0, + "grad_norm": 0.06804112412049489, + "language_loss": 0.82108605, + "learning_rate": 0.00041014357561680115, + "loss": 0.83189678, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.30200195, + "step": 2967, + "time_per_iteration": 2.5226550102233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084124, + "balance_loss_mlp": 1.0544889, + "epoch": 0.5709888418622547, + "flos": 579823570944.0, + "grad_norm": 0.059986306134107735, + "language_loss": 0.86107051, + "learning_rate": 0.0004098371229547039, + "loss": 0.87191176, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.29589844, + "step": 2968, + "time_per_iteration": 2.7232651710510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046193, + "balance_loss_mlp": 1.03398585, + "epoch": 0.5711812235475183, + "flos": 1579108290048.0, + "grad_norm": 0.025451731838023718, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81057, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.12207031, + "step": 2969, + "time_per_iteration": 4.785320997238159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082869, + "balance_loss_mlp": 1.05330527, + "epoch": 0.5713736052327818, + "flos": 468259854336.0, + "grad_norm": 0.07178133530641487, + "language_loss": 0.80500889, + "learning_rate": 0.00040922432276247107, + "loss": 0.81583756, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.29516602, + "step": 2970, + "time_per_iteration": 2.5877230167388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086085, + "balance_loss_mlp": 1.05635428, + "epoch": 0.5715659869180454, + "flos": 537390443520.0, + "grad_norm": 0.05561639186548029, + "language_loss": 0.84452176, + "learning_rate": 0.0004089179754702457, + "loss": 0.85538256, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.29663086, + "step": 2971, + "time_per_iteration": 2.759932041168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084469, + "balance_loss_mlp": 1.05469072, + "epoch": 0.571758368603309, + "flos": 655778045952.0, + "grad_norm": 0.05716809371830958, + "language_loss": 0.79499936, + "learning_rate": 0.00040861166353919843, + "loss": 0.80584407, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.29711914, + "step": 2972, + "time_per_iteration": 2.856147050857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080407, + "balance_loss_mlp": 1.05213094, + "epoch": 0.5719507502885726, + "flos": 667609966080.0, + "grad_norm": 0.054720530113361164, + "language_loss": 0.81279707, + "learning_rate": 0.00040830538708824983, + "loss": 0.82360113, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.28295898, + "step": 2973, + "time_per_iteration": 2.9099643230438232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083682, + "balance_loss_mlp": 1.05414152, + "epoch": 0.572143131973836, + "flos": 476083312128.0, + "grad_norm": 0.059341772904328634, + "language_loss": 0.81557322, + "learning_rate": 0.000407999146236307, + "loss": 0.82641, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.29492188, + "step": 2974, + "time_per_iteration": 2.5506579875946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087372, + "balance_loss_mlp": 1.05807054, + "epoch": 0.5723355136590996, + "flos": 539255757312.0, + "grad_norm": 0.05823834072467256, + "language_loss": 0.8320694, + "learning_rate": 0.0004076929411022634, + "loss": 0.84294319, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.29248047, + "step": 2975, + "time_per_iteration": 2.6159043312072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081032, + "balance_loss_mlp": 1.05125356, + "epoch": 0.5725278953443632, + "flos": 823784864256.0, + "grad_norm": 0.059359253337435705, + "language_loss": 0.79102635, + "learning_rate": 0.0004073867718049982, + "loss": 0.80183673, + "num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.29736328, + "step": 2976, + "time_per_iteration": 3.104320526123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087781, + "balance_loss_mlp": 1.05745435, + "epoch": 0.5727202770296268, + "flos": 587155226112.0, + "grad_norm": 0.06002278348442279, + "language_loss": 0.82387239, + "learning_rate": 0.00040708063846337704, + "loss": 0.83475018, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.30273438, + "step": 2977, + "time_per_iteration": 2.7141377925872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088437, + "balance_loss_mlp": 1.05906403, + "epoch": 0.5729126587148904, + "flos": 446723195904.0, + "grad_norm": 0.05629415234265891, + "language_loss": 0.81140733, + "learning_rate": 0.00040677454119625143, + "loss": 0.82229173, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.29321289, + "step": 2978, + "time_per_iteration": 2.5579118728637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079501, + "balance_loss_mlp": 1.04967451, + "epoch": 0.5731050404001539, + "flos": 519206091264.0, + "grad_norm": 0.06287623577372331, + "language_loss": 0.82978582, + "learning_rate": 0.0004064684801224587, + "loss": 0.84058082, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.2980957, + "step": 2979, + "time_per_iteration": 2.6184630393981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080607, + "balance_loss_mlp": 1.05047131, + "epoch": 0.5732974220854175, + "flos": 504528224256.0, + "grad_norm": 0.049858532305801305, + "language_loss": 0.80364764, + "learning_rate": 0.00040616245536082224, + "loss": 0.81445372, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.30078125, + "step": 2980, + "time_per_iteration": 2.605652093887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075472, + "balance_loss_mlp": 1.04602742, + "epoch": 0.573489803770681, + "flos": 592187991552.0, + "grad_norm": 0.05649585275193457, + "language_loss": 0.81399214, + "learning_rate": 0.00040585646703015165, + "loss": 0.82474685, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.29418945, + "step": 2981, + "time_per_iteration": 2.8440651893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081482, + "balance_loss_mlp": 1.05103636, + "epoch": 0.5736821854559446, + "flos": 489672857088.0, + "grad_norm": 0.0633133856450646, + "language_loss": 0.78068441, + "learning_rate": 0.0004055505152492419, + "loss": 0.79149926, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.30419922, + "step": 2982, + "time_per_iteration": 2.7125117778778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076312, + "balance_loss_mlp": 1.0467, + "epoch": 0.5738745671412081, + "flos": 457895987712.0, + "grad_norm": 0.057765721767923175, + "language_loss": 0.74208528, + "learning_rate": 0.00040524460013687425, + "loss": 0.75284839, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.29589844, + "step": 2983, + "time_per_iteration": 2.7232775688171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080701, + "balance_loss_mlp": 1.05151832, + "epoch": 0.5740669488264717, + "flos": 580012655616.0, + "grad_norm": 0.049591997410844156, + "language_loss": 0.81157619, + "learning_rate": 0.0004049387218118155, + "loss": 0.82238322, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.29199219, + "step": 2984, + "time_per_iteration": 2.956636428833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080147, + "balance_loss_mlp": 1.04934323, + "epoch": 0.5742593305117353, + "flos": 524155898880.0, + "grad_norm": 0.06847869877575175, + "language_loss": 0.84987867, + "learning_rate": 0.00040463288039281777, + "loss": 0.8606801, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.30761719, + "step": 2985, + "time_per_iteration": 2.7503554821014404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013561, + "balance_loss_mlp": 1.00078201, + "epoch": 0.5744517121969989, + "flos": 1553033488896.0, + "grad_norm": 0.012095267017415088, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78889978, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.12792969, + "step": 2986, + "time_per_iteration": 5.030332565307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079255, + "balance_loss_mlp": 1.04981041, + "epoch": 0.5746440938822625, + "flos": 751600304640.0, + "grad_norm": 0.055809040190366505, + "language_loss": 0.82136881, + "learning_rate": 0.0004040213087479444, + "loss": 0.83216131, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.29443359, + "step": 2987, + "time_per_iteration": 2.926941156387329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087088, + "balance_loss_mlp": 1.05816782, + "epoch": 0.5748364755675259, + "flos": 501618258432.0, + "grad_norm": 0.06868722002267488, + "language_loss": 0.85331053, + "learning_rate": 0.0004037155787595018, + "loss": 0.8641814, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.28857422, + "step": 2988, + "time_per_iteration": 2.561497211456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085606, + "balance_loss_mlp": 1.05599451, + "epoch": 0.5750288572527895, + "flos": 503757024768.0, + "grad_norm": 0.05119655910511677, + "language_loss": 0.80321741, + "learning_rate": 0.000403409886151987, + "loss": 0.81407344, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.29589844, + "step": 2989, + "time_per_iteration": 2.9114019870758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013296, + "balance_loss_mlp": 1.00061202, + "epoch": 0.5752212389380531, + "flos": 1540541030400.0, + "grad_norm": 0.008836939301122537, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.83012402, + "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.12695312, + "step": 2990, + "time_per_iteration": 4.770756483078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013357, + "balance_loss_mlp": 1.00086439, + "epoch": 0.5754136206233167, + "flos": 1566499378176.0, + "grad_norm": 0.007697309180098509, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79211962, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.125, + "step": 2991, + "time_per_iteration": 4.786288499832153 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083775, + "balance_loss_mlp": 1.05537939, + "epoch": 0.5756060023085803, + "flos": 797806167552.0, + "grad_norm": 0.05348004588160335, + "language_loss": 0.76926208, + "learning_rate": 0.00040249303380173807, + "loss": 0.78009981, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.28369141, + "step": 2992, + "time_per_iteration": 3.0660438537597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010851, + "balance_loss_mlp": 1.05629849, + "epoch": 0.5757983839938438, + "flos": 587588802048.0, + "grad_norm": 0.06048493616630367, + "language_loss": 0.79311389, + "learning_rate": 0.00040218749190459126, + "loss": 0.80396485, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.28808594, + "step": 2993, + "time_per_iteration": 2.7251527309417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084541, + "balance_loss_mlp": 1.05514371, + "epoch": 0.5759907656791073, + "flos": 516576932352.0, + "grad_norm": 0.0697186971943442, + "language_loss": 0.82477212, + "learning_rate": 0.00040188198798162775, + "loss": 0.83561754, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.29370117, + "step": 2994, + "time_per_iteration": 2.6159136295318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081131, + "balance_loss_mlp": 1.05147123, + "epoch": 0.5761831473643709, + "flos": 586845305856.0, + "grad_norm": 0.057556686362034246, + "language_loss": 0.85848254, + "learning_rate": 0.000401576522151455, + "loss": 0.86929381, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.29614258, + "step": 2995, + "time_per_iteration": 2.811438798904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108636, + "balance_loss_mlp": 1.05775023, + "epoch": 0.5763755290496345, + "flos": 543619219968.0, + "grad_norm": 0.04540215088386673, + "language_loss": 0.82446247, + "learning_rate": 0.0004012710945326651, + "loss": 0.83532608, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.28613281, + "step": 2996, + "time_per_iteration": 2.778818368911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108663, + "balance_loss_mlp": 1.05790055, + "epoch": 0.576567910734898, + "flos": 625930509312.0, + "grad_norm": 0.049519109180824444, + "language_loss": 0.81129038, + "learning_rate": 0.0004009657052438355, + "loss": 0.82215673, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.28686523, + "step": 2997, + "time_per_iteration": 2.8787920475006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094954, + "balance_loss_mlp": 1.06612968, + "epoch": 0.5767602924201616, + "flos": 537985552896.0, + "grad_norm": 0.05906428447956742, + "language_loss": 0.85482752, + "learning_rate": 0.00040066035440352904, + "loss": 0.86577708, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.2878418, + "step": 2998, + "time_per_iteration": 2.634565591812134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045759, + "balance_loss_mlp": 1.03379035, + "epoch": 0.5769526741054252, + "flos": 1558969873920.0, + "grad_norm": 0.021537766013807906, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80338895, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.11962891, + "step": 2999, + "time_per_iteration": 4.964475393295288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090784, + "balance_loss_mlp": 1.06248331, + "epoch": 0.5771450557906888, + "flos": 467939759616.0, + "grad_norm": 0.06837432109358414, + "language_loss": 0.75964624, + "learning_rate": 0.00040004976854266145, + "loss": 0.77055407, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.28295898, + "step": 3000, + "time_per_iteration": 2.5489282608032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089485, + "balance_loss_mlp": 1.06006408, + "epoch": 0.5773374374759523, + "flos": 574288828416.0, + "grad_norm": 0.0545980885089623, + "language_loss": 0.81222647, + "learning_rate": 0.0003997445337591505, + "loss": 0.82312131, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.29370117, + "step": 3001, + "time_per_iteration": 2.6890947818756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108546, + "balance_loss_mlp": 1.05680251, + "epoch": 0.5775298191612158, + "flos": 528216795648.0, + "grad_norm": 0.06583721131765849, + "language_loss": 0.74093473, + "learning_rate": 0.0003994393378982635, + "loss": 0.75178933, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.28662109, + "step": 3002, + "time_per_iteration": 2.596644401550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043122, + "balance_loss_mlp": 1.03153443, + "epoch": 0.5777222008464794, + "flos": 1303178070528.0, + "grad_norm": 0.017943105040569007, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80581129, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.11572266, + "step": 3003, + "time_per_iteration": 4.826138257980347 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085564, + "balance_loss_mlp": 1.05666792, + "epoch": 0.577914582531743, + "flos": 603344816640.0, + "grad_norm": 0.058273014851323426, + "language_loss": 0.87901747, + "learning_rate": 0.0003988290634182961, + "loss": 0.88987309, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.28881836, + "step": 3004, + "time_per_iteration": 2.7604172229766846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088001, + "balance_loss_mlp": 1.06015372, + "epoch": 0.5781069642170066, + "flos": 486537338880.0, + "grad_norm": 0.06327449394997672, + "language_loss": 0.80677181, + "learning_rate": 0.0003985239850361453, + "loss": 0.81765187, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.27856445, + "step": 3005, + "time_per_iteration": 2.5994105339050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091534, + "balance_loss_mlp": 1.06256592, + "epoch": 0.5782993459022701, + "flos": 506016626688.0, + "grad_norm": 0.057065414052448256, + "language_loss": 0.84621793, + "learning_rate": 0.0003982189460504777, + "loss": 0.85713327, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.28930664, + "step": 3006, + "time_per_iteration": 2.722778797149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091061, + "balance_loss_mlp": 1.06261778, + "epoch": 0.5784917275875336, + "flos": 601872380928.0, + "grad_norm": 0.0654169545720973, + "language_loss": 0.79183024, + "learning_rate": 0.00039791394657971935, + "loss": 0.80274087, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.28442383, + "step": 3007, + "time_per_iteration": 2.7318689823150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089506, + "balance_loss_mlp": 1.06056237, + "epoch": 0.5786841092727972, + "flos": 521279428608.0, + "grad_norm": 0.06429658550493057, + "language_loss": 0.84402883, + "learning_rate": 0.00039760898674228205, + "loss": 0.85492396, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.28930664, + "step": 3008, + "time_per_iteration": 2.6548941135406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087482, + "balance_loss_mlp": 1.05884826, + "epoch": 0.5788764909580608, + "flos": 767047809024.0, + "grad_norm": 0.0525681924040606, + "language_loss": 0.80782068, + "learning_rate": 0.0003973040666565613, + "loss": 0.81869543, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.28588867, + "step": 3009, + "time_per_iteration": 3.065049171447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087663, + "balance_loss_mlp": 1.05972004, + "epoch": 0.5790688726433244, + "flos": 598786324992.0, + "grad_norm": 0.058928126410829465, + "language_loss": 0.81879556, + "learning_rate": 0.000396999186440938, + "loss": 0.82967222, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.27954102, + "step": 3010, + "time_per_iteration": 2.860755205154419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086781, + "balance_loss_mlp": 1.05871928, + "epoch": 0.5792612543285879, + "flos": 522805708800.0, + "grad_norm": 0.06775550082118927, + "language_loss": 0.84739363, + "learning_rate": 0.000396694346213777, + "loss": 0.85826147, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.28076172, + "step": 3011, + "time_per_iteration": 2.591801643371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077556, + "balance_loss_mlp": 1.04815888, + "epoch": 0.5794536360138515, + "flos": 876178805760.0, + "grad_norm": 0.09075774540794283, + "language_loss": 0.83682388, + "learning_rate": 0.0003963895460934276, + "loss": 0.84759945, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.29370117, + "step": 3012, + "time_per_iteration": 3.1549274921417236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080769, + "balance_loss_mlp": 1.05242133, + "epoch": 0.5796460176991151, + "flos": 401221541376.0, + "grad_norm": 0.07824771870324425, + "language_loss": 0.85031927, + "learning_rate": 0.00039608478619822376, + "loss": 0.86112702, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.28344727, + "step": 3013, + "time_per_iteration": 2.436859369277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108003, + "balance_loss_mlp": 1.05091906, + "epoch": 0.5798383993843786, + "flos": 618229297152.0, + "grad_norm": 0.07454312954276684, + "language_loss": 0.82720006, + "learning_rate": 0.00039578006664648394, + "loss": 0.83800036, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.29125977, + "step": 3014, + "time_per_iteration": 2.813934326171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082643, + "balance_loss_mlp": 1.05350864, + "epoch": 0.5800307810696421, + "flos": 843966950400.0, + "grad_norm": 0.07429538018047967, + "language_loss": 0.81169355, + "learning_rate": 0.0003954753875565105, + "loss": 0.82251996, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.29101562, + "step": 3015, + "time_per_iteration": 3.089141607284546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075928, + "balance_loss_mlp": 1.04674578, + "epoch": 0.5802231627549057, + "flos": 569005779456.0, + "grad_norm": 0.053240000714227444, + "language_loss": 0.8237859, + "learning_rate": 0.00039517074904659057, + "loss": 0.8345452, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.29125977, + "step": 3016, + "time_per_iteration": 2.7315711975097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081141, + "balance_loss_mlp": 1.05217314, + "epoch": 0.5804155444401693, + "flos": 660160447488.0, + "grad_norm": 0.0618256833307492, + "language_loss": 0.84621388, + "learning_rate": 0.00039486615123499535, + "loss": 0.85702527, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.28955078, + "step": 3017, + "time_per_iteration": 2.870152235031128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082579, + "balance_loss_mlp": 1.05342066, + "epoch": 0.5806079261254329, + "flos": 513726603264.0, + "grad_norm": 0.06092979313789558, + "language_loss": 0.85065556, + "learning_rate": 0.00039456159423997996, + "loss": 0.86148143, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.29125977, + "step": 3018, + "time_per_iteration": 2.6494932174682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078264, + "balance_loss_mlp": 1.04867649, + "epoch": 0.5808003078106965, + "flos": 528379739136.0, + "grad_norm": 0.05170574080230249, + "language_loss": 0.89520943, + "learning_rate": 0.00039425707817978406, + "loss": 0.90599209, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.29541016, + "step": 3019, + "time_per_iteration": 2.690485715866089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078627, + "balance_loss_mlp": 1.04894376, + "epoch": 0.58099268949596, + "flos": 476787520512.0, + "grad_norm": 0.06031161665678942, + "language_loss": 0.83372945, + "learning_rate": 0.00039395260317263124, + "loss": 0.84451568, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.29663086, + "step": 3020, + "time_per_iteration": 2.677818775177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076598, + "balance_loss_mlp": 1.0466764, + "epoch": 0.5811850711812235, + "flos": 517340777472.0, + "grad_norm": 0.056782275650517425, + "language_loss": 0.84907949, + "learning_rate": 0.0003936481693367291, + "loss": 0.8598454, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.29882812, + "step": 3021, + "time_per_iteration": 2.647017002105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084791, + "balance_loss_mlp": 1.05491698, + "epoch": 0.5813774528664871, + "flos": 616122464256.0, + "grad_norm": 0.06733027879749674, + "language_loss": 0.87502337, + "learning_rate": 0.0003933437767902697, + "loss": 0.88587123, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.29833984, + "step": 3022, + "time_per_iteration": 2.825965166091919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085273, + "balance_loss_mlp": 1.05706787, + "epoch": 0.5815698345517507, + "flos": 567194310144.0, + "grad_norm": 0.07318564796931465, + "language_loss": 0.78165317, + "learning_rate": 0.00039303942565142825, + "loss": 0.79250592, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.28222656, + "step": 3023, + "time_per_iteration": 2.7315845489501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087422, + "balance_loss_mlp": 1.0569042, + "epoch": 0.5817622162370142, + "flos": 562886102016.0, + "grad_norm": 0.052544940996134284, + "language_loss": 0.76741624, + "learning_rate": 0.0003927351160383644, + "loss": 0.77829051, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.3046875, + "step": 3024, + "time_per_iteration": 2.789477825164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085705, + "balance_loss_mlp": 1.05609322, + "epoch": 0.5819545979222778, + "flos": 458982899712.0, + "grad_norm": 0.07634686348045291, + "language_loss": 0.77796662, + "learning_rate": 0.000392430848069222, + "loss": 0.78882366, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.29589844, + "step": 3025, + "time_per_iteration": 2.5446279048919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085632, + "balance_loss_mlp": 1.05549598, + "epoch": 0.5821469796075414, + "flos": 541215613440.0, + "grad_norm": 0.05528071963535831, + "language_loss": 0.82223105, + "learning_rate": 0.00039212662186212795, + "loss": 0.83308738, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.30078125, + "step": 3026, + "time_per_iteration": 2.60878849029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079586, + "balance_loss_mlp": 1.04883003, + "epoch": 0.582339361292805, + "flos": 551994117120.0, + "grad_norm": 0.05052748911564131, + "language_loss": 0.76906562, + "learning_rate": 0.0003918224375351934, + "loss": 0.77986145, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.30737305, + "step": 3027, + "time_per_iteration": 2.709887742996216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083313, + "balance_loss_mlp": 1.05384469, + "epoch": 0.5825317429780685, + "flos": 496138770432.0, + "grad_norm": 0.05874903473435042, + "language_loss": 0.78473544, + "learning_rate": 0.0003915182952065135, + "loss": 0.79556859, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.29418945, + "step": 3028, + "time_per_iteration": 2.6885859966278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082208, + "balance_loss_mlp": 1.05250072, + "epoch": 0.582724124663332, + "flos": 563890056192.0, + "grad_norm": 0.06824855227929012, + "language_loss": 0.8751812, + "learning_rate": 0.0003912141949941664, + "loss": 0.88600326, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.296875, + "step": 3029, + "time_per_iteration": 2.7145774364471436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087726, + "balance_loss_mlp": 1.05799532, + "epoch": 0.5829165063485956, + "flos": 491888788992.0, + "grad_norm": 0.07682913079591057, + "language_loss": 0.82808822, + "learning_rate": 0.0003909101370162143, + "loss": 0.83896548, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.29711914, + "step": 3030, + "time_per_iteration": 2.6085238456726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063086, + "balance_loss_mlp": 1.05116475, + "epoch": 0.5831088880338592, + "flos": 1528134501888.0, + "grad_norm": 0.03433679117263603, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73496974, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.11914062, + "step": 3031, + "time_per_iteration": 4.894438028335571 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080641, + "balance_loss_mlp": 1.05076766, + "epoch": 0.5833012697191228, + "flos": 617712763392.0, + "grad_norm": 0.0542485247275347, + "language_loss": 0.8270607, + "learning_rate": 0.0003903021482356622, + "loss": 0.83786714, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.29833984, + "step": 3032, + "time_per_iteration": 2.8060503005981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079071, + "balance_loss_mlp": 1.04924476, + "epoch": 0.5834936514043862, + "flos": 767578899456.0, + "grad_norm": 0.06913224268253564, + "language_loss": 0.8243112, + "learning_rate": 0.00038999821766910465, + "loss": 0.8351019, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.2980957, + "step": 3033, + "time_per_iteration": 3.013117551803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079849, + "balance_loss_mlp": 1.04992783, + "epoch": 0.5836860330896498, + "flos": 458136096768.0, + "grad_norm": 0.06539568057172108, + "language_loss": 0.85596031, + "learning_rate": 0.00038969432980902606, + "loss": 0.86675882, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.29907227, + "step": 3034, + "time_per_iteration": 2.602159261703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047537, + "balance_loss_mlp": 1.03642654, + "epoch": 0.5838784147749134, + "flos": 1360485504000.0, + "grad_norm": 0.02505289654727371, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.8083204, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.11132812, + "step": 3035, + "time_per_iteration": 4.8551225662231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086213, + "balance_loss_mlp": 1.05664897, + "epoch": 0.584070796460177, + "flos": 566942616576.0, + "grad_norm": 0.05971096981290547, + "language_loss": 0.82545829, + "learning_rate": 0.00038908668268020953, + "loss": 0.8363204, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.29516602, + "step": 3036, + "time_per_iteration": 2.6712634563446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084003, + "balance_loss_mlp": 1.05455875, + "epoch": 0.5842631781454406, + "flos": 611188623360.0, + "grad_norm": 0.06020630991976339, + "language_loss": 0.84750116, + "learning_rate": 0.00038878292364738097, + "loss": 0.85834116, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.29418945, + "step": 3037, + "time_per_iteration": 2.774688959121704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087202, + "balance_loss_mlp": 1.05785298, + "epoch": 0.5844555598307041, + "flos": 463148513280.0, + "grad_norm": 0.06330434972052289, + "language_loss": 0.87235534, + "learning_rate": 0.0003884792077928508, + "loss": 0.88322735, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.29345703, + "step": 3038, + "time_per_iteration": 2.511212110519409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088952, + "balance_loss_mlp": 1.05957842, + "epoch": 0.5846479415159677, + "flos": 410005283328.0, + "grad_norm": 0.089824175631678, + "language_loss": 0.76556516, + "learning_rate": 0.0003881755352345322, + "loss": 0.77645469, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.29345703, + "step": 3039, + "time_per_iteration": 2.5297422409057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108977, + "balance_loss_mlp": 1.06039691, + "epoch": 0.5848403232012312, + "flos": 491056542720.0, + "grad_norm": 0.05409760120739159, + "language_loss": 0.8652333, + "learning_rate": 0.0003878719060903207, + "loss": 0.87613106, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.29345703, + "step": 3040, + "time_per_iteration": 2.5606369972229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108437, + "balance_loss_mlp": 1.05447245, + "epoch": 0.5850327048864948, + "flos": 584146335744.0, + "grad_norm": 0.07864155094531469, + "language_loss": 0.83092105, + "learning_rate": 0.0003875683204780961, + "loss": 0.84176469, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.29833984, + "step": 3041, + "time_per_iteration": 2.7069876194000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091497, + "balance_loss_mlp": 1.06128943, + "epoch": 0.5852250865717584, + "flos": 651253049856.0, + "grad_norm": 0.07084084705837652, + "language_loss": 0.85393965, + "learning_rate": 0.00038726477851572043, + "loss": 0.86485463, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.30175781, + "step": 3042, + "time_per_iteration": 2.785623788833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086169, + "balance_loss_mlp": 1.0566287, + "epoch": 0.5854174682570219, + "flos": 534332090880.0, + "grad_norm": 0.06883779110535396, + "language_loss": 0.80354905, + "learning_rate": 0.0003869612803210395, + "loss": 0.81441069, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.29541016, + "step": 3043, + "time_per_iteration": 2.635880708694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075998, + "balance_loss_mlp": 1.04643369, + "epoch": 0.5856098499422855, + "flos": 509501352960.0, + "grad_norm": 0.0705585022393511, + "language_loss": 0.83492166, + "learning_rate": 0.0003866578260118817, + "loss": 0.84568161, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.29541016, + "step": 3044, + "time_per_iteration": 2.58337664604187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074571, + "balance_loss_mlp": 1.04491138, + "epoch": 0.5858022316275491, + "flos": 593619729408.0, + "grad_norm": 0.06598081480709424, + "language_loss": 0.83220106, + "learning_rate": 0.0003863544157060581, + "loss": 0.84294677, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.29614258, + "step": 3045, + "time_per_iteration": 2.66916561126709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079474, + "balance_loss_mlp": 1.04998136, + "epoch": 0.5859946133128127, + "flos": 558829587456.0, + "grad_norm": 0.05207738102195899, + "language_loss": 0.82137144, + "learning_rate": 0.0003860510495213634, + "loss": 0.83216619, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.29492188, + "step": 3046, + "time_per_iteration": 2.8170437812805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072174, + "balance_loss_mlp": 1.04256272, + "epoch": 0.5861869949980761, + "flos": 553431647232.0, + "grad_norm": 0.07713217072038757, + "language_loss": 0.78373164, + "learning_rate": 0.0003857477275755746, + "loss": 0.79445338, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.29589844, + "step": 3047, + "time_per_iteration": 2.639801502227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077446, + "balance_loss_mlp": 1.04678559, + "epoch": 0.5863793766833397, + "flos": 718321886208.0, + "grad_norm": 0.05564403415338841, + "language_loss": 0.84011877, + "learning_rate": 0.00038544444998645167, + "loss": 0.8508932, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.30639648, + "step": 3048, + "time_per_iteration": 3.007289409637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076921, + "balance_loss_mlp": 1.04754782, + "epoch": 0.5865717583686033, + "flos": 472041354240.0, + "grad_norm": 0.06801965614795764, + "language_loss": 0.81586641, + "learning_rate": 0.00038514121687173767, + "loss": 0.8266356, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.29345703, + "step": 3049, + "time_per_iteration": 2.637277603149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072965, + "balance_loss_mlp": 1.04397368, + "epoch": 0.5867641400538669, + "flos": 813143162880.0, + "grad_norm": 0.0576990751755922, + "language_loss": 0.81892288, + "learning_rate": 0.00038483802834915807, + "loss": 0.82965243, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.28979492, + "step": 3050, + "time_per_iteration": 2.975592613220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075399, + "balance_loss_mlp": 1.04607356, + "epoch": 0.5869565217391305, + "flos": 486285645312.0, + "grad_norm": 0.09338183491699942, + "language_loss": 0.78599441, + "learning_rate": 0.00038453488453642074, + "loss": 0.79674846, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.29296875, + "step": 3051, + "time_per_iteration": 2.668680429458618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107507, + "balance_loss_mlp": 1.04581618, + "epoch": 0.587148903424394, + "flos": 569104704000.0, + "grad_norm": 0.18186948375192843, + "language_loss": 0.86825669, + "learning_rate": 0.00038423178555121697, + "loss": 0.87900746, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.29223633, + "step": 3052, + "time_per_iteration": 2.7119386196136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080518, + "balance_loss_mlp": 1.05202711, + "epoch": 0.5873412851096576, + "flos": 746948680704.0, + "grad_norm": 0.05190046933032045, + "language_loss": 0.85228276, + "learning_rate": 0.00038392873151121994, + "loss": 0.86308795, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.28466797, + "step": 3053, + "time_per_iteration": 3.0532052516937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075316, + "balance_loss_mlp": 1.04615784, + "epoch": 0.5875336667949211, + "flos": 527882144256.0, + "grad_norm": 0.06073215036153007, + "language_loss": 0.830441, + "learning_rate": 0.0003836257225340859, + "loss": 0.84119415, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.29125977, + "step": 3054, + "time_per_iteration": 2.6791739463806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077784, + "balance_loss_mlp": 1.04922152, + "epoch": 0.5877260484801847, + "flos": 823799420928.0, + "grad_norm": 0.053654559033963406, + "language_loss": 0.82283098, + "learning_rate": 0.00038332275873745336, + "loss": 0.83360887, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.28564453, + "step": 3055, + "time_per_iteration": 3.0826737880706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085261, + "balance_loss_mlp": 1.05646038, + "epoch": 0.5879184301654482, + "flos": 591325221888.0, + "grad_norm": 0.07874067829632751, + "language_loss": 0.82649648, + "learning_rate": 0.0003830198402389431, + "loss": 0.83734912, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.28759766, + "step": 3056, + "time_per_iteration": 2.71244215965271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080719, + "balance_loss_mlp": 1.06755841, + "epoch": 0.5881108118507118, + "flos": 1544953955328.0, + "grad_norm": 0.03508304466376378, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78429663, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.13183594, + "step": 3057, + "time_per_iteration": 4.991718053817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087181, + "balance_loss_mlp": 1.05900002, + "epoch": 0.5883031935359754, + "flos": 489348380160.0, + "grad_norm": 0.0604575145753954, + "language_loss": 0.83162987, + "learning_rate": 0.0003824141396066855, + "loss": 0.84250164, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.28198242, + "step": 3058, + "time_per_iteration": 2.62410044670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095213, + "balance_loss_mlp": 1.06605411, + "epoch": 0.588495575221239, + "flos": 582551654400.0, + "grad_norm": 0.05748148757470156, + "language_loss": 0.83195531, + "learning_rate": 0.000382111357708092, + "loss": 0.84290743, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.29125977, + "step": 3059, + "time_per_iteration": 2.741142511367798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099933, + "balance_loss_mlp": 1.07113242, + "epoch": 0.5886879569065026, + "flos": 660751174656.0, + "grad_norm": 0.07210182052791281, + "language_loss": 0.83736324, + "learning_rate": 0.00038180862157792864, + "loss": 0.84836257, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.28808594, + "step": 3060, + "time_per_iteration": 2.8028531074523926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095663, + "balance_loss_mlp": 1.06733847, + "epoch": 0.588880338591766, + "flos": 562392889344.0, + "grad_norm": 0.06185538750618477, + "language_loss": 0.82032192, + "learning_rate": 0.0003815059313337279, + "loss": 0.83127856, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.28295898, + "step": 3061, + "time_per_iteration": 2.661663055419922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092581, + "balance_loss_mlp": 1.0641377, + "epoch": 0.5890727202770296, + "flos": 554451568128.0, + "grad_norm": 0.054152956568787894, + "language_loss": 0.78217703, + "learning_rate": 0.00038120328709300436, + "loss": 0.7931028, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.28466797, + "step": 3062, + "time_per_iteration": 2.8524019718170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110051, + "balance_loss_mlp": 1.0717572, + "epoch": 0.5892651019622932, + "flos": 655226606592.0, + "grad_norm": 0.07045144115382113, + "language_loss": 0.83619386, + "learning_rate": 0.0003809006889732549, + "loss": 0.84719896, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.28759766, + "step": 3063, + "time_per_iteration": 2.818297863006592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093698, + "balance_loss_mlp": 1.06554079, + "epoch": 0.5894574836475568, + "flos": 452970911232.0, + "grad_norm": 0.07166208719676233, + "language_loss": 0.87752122, + "learning_rate": 0.0003805981370919589, + "loss": 0.88845825, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.28173828, + "step": 3064, + "time_per_iteration": 2.523397445678711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091821, + "balance_loss_mlp": 1.06352103, + "epoch": 0.5896498653328203, + "flos": 518763750912.0, + "grad_norm": 0.052273370645306905, + "language_loss": 0.83554685, + "learning_rate": 0.0003802956315665771, + "loss": 0.84646511, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.28320312, + "step": 3065, + "time_per_iteration": 2.7017621994018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091683, + "balance_loss_mlp": 1.06428885, + "epoch": 0.5898422470180839, + "flos": 548793169920.0, + "grad_norm": 0.09115739101573021, + "language_loss": 0.81856883, + "learning_rate": 0.0003799931725145529, + "loss": 0.82948571, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.27416992, + "step": 3066, + "time_per_iteration": 2.6396725177764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091771, + "balance_loss_mlp": 1.0635426, + "epoch": 0.5900346287033474, + "flos": 524046799872.0, + "grad_norm": 0.061744960378181175, + "language_loss": 0.85826695, + "learning_rate": 0.00037969076005331083, + "loss": 0.86918467, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.28271484, + "step": 3067, + "time_per_iteration": 2.7665817737579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088445, + "balance_loss_mlp": 1.05947697, + "epoch": 0.590227010388611, + "flos": 566893154304.0, + "grad_norm": 0.062191843713449865, + "language_loss": 0.87458771, + "learning_rate": 0.00037938839430025817, + "loss": 0.88547218, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.28930664, + "step": 3068, + "time_per_iteration": 2.645289897918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080639, + "balance_loss_mlp": 1.0527916, + "epoch": 0.5904193920738746, + "flos": 583053631488.0, + "grad_norm": 0.07692636502028646, + "language_loss": 0.85409123, + "learning_rate": 0.0003790860753727835, + "loss": 0.86489761, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 0.27856445, + "step": 3069, + "time_per_iteration": 2.831932544708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087722, + "balance_loss_mlp": 1.05966043, + "epoch": 0.5906117737591381, + "flos": 529428773376.0, + "grad_norm": 0.05698566021180351, + "language_loss": 0.82950222, + "learning_rate": 0.00037878380338825766, + "loss": 0.84037948, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.28076172, + "step": 3070, + "time_per_iteration": 2.6856610774993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094092, + "balance_loss_mlp": 1.06655455, + "epoch": 0.5908041554444017, + "flos": 683908655616.0, + "grad_norm": 0.05699607440456078, + "language_loss": 0.81377411, + "learning_rate": 0.00037848157846403287, + "loss": 0.82471496, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.27539062, + "step": 3071, + "time_per_iteration": 2.9222235679626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090999, + "balance_loss_mlp": 1.06291366, + "epoch": 0.5909965371296653, + "flos": 549719958528.0, + "grad_norm": 0.04993960868235579, + "language_loss": 0.8303259, + "learning_rate": 0.0003781794007174435, + "loss": 0.84123588, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.28076172, + "step": 3072, + "time_per_iteration": 2.8049426078796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047655, + "balance_loss_mlp": 1.03702164, + "epoch": 0.5911889188149289, + "flos": 1491544475136.0, + "grad_norm": 0.02139881306535856, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.7512219, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.10644531, + "step": 3073, + "time_per_iteration": 4.860798597335815 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086537, + "balance_loss_mlp": 1.05854619, + "epoch": 0.5913813005001923, + "flos": 487630043136.0, + "grad_norm": 0.0539637393269004, + "language_loss": 0.81219113, + "learning_rate": 0.0003775751872264152, + "loss": 0.8230564, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.28027344, + "step": 3074, + "time_per_iteration": 2.7820684909820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081353, + "balance_loss_mlp": 1.05267119, + "epoch": 0.5915736821854559, + "flos": 573034590720.0, + "grad_norm": 0.057314841017187666, + "language_loss": 0.87226552, + "learning_rate": 0.0003772731517165527, + "loss": 0.88307905, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.28686523, + "step": 3075, + "time_per_iteration": 2.8264849185943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082687, + "balance_loss_mlp": 1.05383801, + "epoch": 0.5917660638707195, + "flos": 789183959040.0, + "grad_norm": 0.06214529816255618, + "language_loss": 0.83813703, + "learning_rate": 0.0003769711638534784, + "loss": 0.84896386, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.28857422, + "step": 3076, + "time_per_iteration": 2.9739084243774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107611, + "balance_loss_mlp": 1.04769087, + "epoch": 0.5919584455559831, + "flos": 528487428096.0, + "grad_norm": 0.06330128127303343, + "language_loss": 0.78904676, + "learning_rate": 0.00037666922375443446, + "loss": 0.79980791, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.28417969, + "step": 3077, + "time_per_iteration": 2.611528158187866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076959, + "balance_loss_mlp": 1.04815805, + "epoch": 0.5921508272412467, + "flos": 560320962048.0, + "grad_norm": 0.0824489675783013, + "language_loss": 0.81633419, + "learning_rate": 0.00037636733153664396, + "loss": 0.82710373, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.2878418, + "step": 3078, + "time_per_iteration": 2.830021619796753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074589, + "balance_loss_mlp": 1.04547811, + "epoch": 0.5923432089265102, + "flos": 563008347648.0, + "grad_norm": 0.07220859459639119, + "language_loss": 0.79744393, + "learning_rate": 0.0003760654873173124, + "loss": 0.80818975, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.29077148, + "step": 3079, + "time_per_iteration": 2.7072205543518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069705, + "balance_loss_mlp": 1.04047441, + "epoch": 0.5925355906117737, + "flos": 495488406528.0, + "grad_norm": 0.0611483797885387, + "language_loss": 0.81661952, + "learning_rate": 0.00037576369121362566, + "loss": 0.82731652, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.29174805, + "step": 3080, + "time_per_iteration": 2.6135458946228027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073309, + "balance_loss_mlp": 1.0437448, + "epoch": 0.5927279722970373, + "flos": 565940072448.0, + "grad_norm": 0.05261928263256693, + "language_loss": 0.81494981, + "learning_rate": 0.0003754619433427516, + "loss": 0.82568288, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.29516602, + "step": 3081, + "time_per_iteration": 2.935394763946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074585, + "balance_loss_mlp": 1.04502153, + "epoch": 0.5929203539823009, + "flos": 666674413056.0, + "grad_norm": 0.07109600442573788, + "language_loss": 0.77291781, + "learning_rate": 0.0003751602438218392, + "loss": 0.78366369, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.29516602, + "step": 3082, + "time_per_iteration": 2.762129306793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107369, + "balance_loss_mlp": 1.04410219, + "epoch": 0.5931127356675644, + "flos": 555484635648.0, + "grad_norm": 0.07081310094320947, + "language_loss": 0.83719951, + "learning_rate": 0.0003748585927680186, + "loss": 0.84793639, + "num_input_tokens_seen": 257210592, + "router_z_loss_mlp": 0.29589844, + "step": 3083, + "time_per_iteration": 2.6607072353363037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072302, + "balance_loss_mlp": 1.04126024, + "epoch": 0.593305117352828, + "flos": 534932992512.0, + "grad_norm": 0.09668658910416093, + "language_loss": 0.82859874, + "learning_rate": 0.00037455699029840086, + "loss": 0.83932179, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.31005859, + "step": 3084, + "time_per_iteration": 2.641989231109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069753, + "balance_loss_mlp": 1.04014122, + "epoch": 0.5934974990380916, + "flos": 593683748352.0, + "grad_norm": 0.04958887884439868, + "language_loss": 0.84485245, + "learning_rate": 0.0003742554365300787, + "loss": 0.85554999, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.2956543, + "step": 3085, + "time_per_iteration": 2.8070170879364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074514, + "balance_loss_mlp": 1.0440923, + "epoch": 0.5936898807233552, + "flos": 712339011072.0, + "grad_norm": 0.06324229056117828, + "language_loss": 0.78341657, + "learning_rate": 0.0003739539315801255, + "loss": 0.79416168, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.30371094, + "step": 3086, + "time_per_iteration": 2.937530755996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076236, + "balance_loss_mlp": 1.04571867, + "epoch": 0.5938822624086187, + "flos": 391684128768.0, + "grad_norm": 0.06251001537840323, + "language_loss": 0.91790974, + "learning_rate": 0.000373652475565596, + "loss": 0.92867219, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.3046875, + "step": 3087, + "time_per_iteration": 2.484830856323242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072731, + "balance_loss_mlp": 1.0422616, + "epoch": 0.5940746440938822, + "flos": 480023373312.0, + "grad_norm": 0.06825336960690286, + "language_loss": 0.81144977, + "learning_rate": 0.00037335106860352587, + "loss": 0.82217705, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.3046875, + "step": 3088, + "time_per_iteration": 2.705796003341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079924, + "balance_loss_mlp": 1.04938293, + "epoch": 0.5942670257791458, + "flos": 483094872576.0, + "grad_norm": 0.05943406802659928, + "language_loss": 0.83409536, + "learning_rate": 0.00037304971081093146, + "loss": 0.84489465, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.30517578, + "step": 3089, + "time_per_iteration": 2.5424582958221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080838, + "balance_loss_mlp": 1.05015349, + "epoch": 0.5944594074644094, + "flos": 547656795648.0, + "grad_norm": 0.06149863143832335, + "language_loss": 0.80616403, + "learning_rate": 0.00037274840230481024, + "loss": 0.81697237, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.30664062, + "step": 3090, + "time_per_iteration": 2.7081451416015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073853, + "balance_loss_mlp": 1.04407477, + "epoch": 0.594651789149673, + "flos": 448943510016.0, + "grad_norm": 0.06332669517454644, + "language_loss": 0.79229522, + "learning_rate": 0.00037244714320214077, + "loss": 0.80303377, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.29736328, + "step": 3091, + "time_per_iteration": 2.5389420986175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080775, + "balance_loss_mlp": 1.05082965, + "epoch": 0.5948441708349365, + "flos": 595969491456.0, + "grad_norm": 0.061471299239273844, + "language_loss": 0.83137572, + "learning_rate": 0.000372145933619882, + "loss": 0.84218347, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.29931641, + "step": 3092, + "time_per_iteration": 2.8748533725738525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076811, + "balance_loss_mlp": 1.04657912, + "epoch": 0.5950365525202, + "flos": 548251905024.0, + "grad_norm": 0.05871713315937548, + "language_loss": 0.82114685, + "learning_rate": 0.000371844773674974, + "loss": 0.8319149, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.30224609, + "step": 3093, + "time_per_iteration": 2.6465840339660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082816, + "balance_loss_mlp": 1.05346692, + "epoch": 0.5952289342054636, + "flos": 654385595904.0, + "grad_norm": 0.0642067113719601, + "language_loss": 0.81621695, + "learning_rate": 0.0003715436634843375, + "loss": 0.82704508, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.29345703, + "step": 3094, + "time_per_iteration": 2.9084014892578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079615, + "balance_loss_mlp": 1.05007505, + "epoch": 0.5954213158907272, + "flos": 603055245312.0, + "grad_norm": 0.04814703484993394, + "language_loss": 0.80545932, + "learning_rate": 0.00037124260316487355, + "loss": 0.81625545, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.29516602, + "step": 3095, + "time_per_iteration": 2.8632538318634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075577, + "balance_loss_mlp": 1.04727709, + "epoch": 0.5956136975759908, + "flos": 486097970688.0, + "grad_norm": 0.060441576418101065, + "language_loss": 0.89618301, + "learning_rate": 0.0003709415928334643, + "loss": 0.90693879, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.28344727, + "step": 3096, + "time_per_iteration": 2.6276299953460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077506, + "balance_loss_mlp": 1.04813242, + "epoch": 0.5958060792612543, + "flos": 658462459392.0, + "grad_norm": 0.06311167084488892, + "language_loss": 0.80587751, + "learning_rate": 0.00037064063260697233, + "loss": 0.81665254, + "num_input_tokens_seen": 258228896, + "router_z_loss_mlp": 0.29345703, + "step": 3097, + "time_per_iteration": 2.893503427505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081151, + "balance_loss_mlp": 1.05151534, + "epoch": 0.5959984609465179, + "flos": 723201882624.0, + "grad_norm": 0.06048648768573219, + "language_loss": 0.78276408, + "learning_rate": 0.0003703397226022407, + "loss": 0.79357558, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.2956543, + "step": 3098, + "time_per_iteration": 3.0289156436920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034311, + "balance_loss_mlp": 1.02305758, + "epoch": 0.5961908426317815, + "flos": 1519010164224.0, + "grad_norm": 0.01734603550218104, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.76534188, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.11230469, + "step": 3099, + "time_per_iteration": 4.946389436721802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078376, + "balance_loss_mlp": 1.04978967, + "epoch": 0.596383224317045, + "flos": 532357678080.0, + "grad_norm": 0.05865367248717621, + "language_loss": 0.83124352, + "learning_rate": 0.0003697380537253339, + "loss": 0.84202731, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.28564453, + "step": 3100, + "time_per_iteration": 2.674445152282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083272, + "balance_loss_mlp": 1.05492401, + "epoch": 0.5965756060023086, + "flos": 590922169344.0, + "grad_norm": 0.050984632699602635, + "language_loss": 0.81265384, + "learning_rate": 0.0003694372950867471, + "loss": 0.82348651, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.28369141, + "step": 3101, + "time_per_iteration": 2.787538766860962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075715, + "balance_loss_mlp": 1.04772449, + "epoch": 0.5967679876875721, + "flos": 861701760000.0, + "grad_norm": 0.05184746467501943, + "language_loss": 0.77182555, + "learning_rate": 0.0003691365871370976, + "loss": 0.78258264, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.2800293, + "step": 3102, + "time_per_iteration": 3.016934871673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080662, + "balance_loss_mlp": 1.05271935, + "epoch": 0.5969603693728357, + "flos": 553574241792.0, + "grad_norm": 0.06482068820490762, + "language_loss": 0.85340202, + "learning_rate": 0.00036883592999313093, + "loss": 0.8642087, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.27978516, + "step": 3103, + "time_per_iteration": 2.689819812774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079629, + "balance_loss_mlp": 1.05218673, + "epoch": 0.5971527510580993, + "flos": 718345207296.0, + "grad_norm": 0.06496745505902583, + "language_loss": 0.79311585, + "learning_rate": 0.0003685353237715722, + "loss": 0.8039121, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.27490234, + "step": 3104, + "time_per_iteration": 2.87333083152771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083254, + "balance_loss_mlp": 1.05504966, + "epoch": 0.5973451327433629, + "flos": 647324573184.0, + "grad_norm": 0.051730016495621756, + "language_loss": 0.8144263, + "learning_rate": 0.0003682347685891274, + "loss": 0.82525891, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.28222656, + "step": 3105, + "time_per_iteration": 2.888319730758667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080866, + "balance_loss_mlp": 1.05228007, + "epoch": 0.5975375144286263, + "flos": 721374446592.0, + "grad_norm": 0.060164631065922125, + "language_loss": 0.80393469, + "learning_rate": 0.0003679342645624822, + "loss": 0.8147434, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.28564453, + "step": 3106, + "time_per_iteration": 3.0317325592041016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079501, + "balance_loss_mlp": 1.0513438, + "epoch": 0.5977298961138899, + "flos": 750616699392.0, + "grad_norm": 0.057913897832382336, + "language_loss": 0.81649029, + "learning_rate": 0.0003676338118083025, + "loss": 0.82728529, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.28198242, + "step": 3107, + "time_per_iteration": 2.9762744903564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083565, + "balance_loss_mlp": 1.05519295, + "epoch": 0.5979222777991535, + "flos": 530703360000.0, + "grad_norm": 0.05706871104479872, + "language_loss": 0.79560876, + "learning_rate": 0.0003673334104432347, + "loss": 0.80644441, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.28393555, + "step": 3108, + "time_per_iteration": 2.5976645946502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081192, + "balance_loss_mlp": 1.0530827, + "epoch": 0.5981146594844171, + "flos": 621459357696.0, + "grad_norm": 0.06092677674045173, + "language_loss": 0.83641863, + "learning_rate": 0.0003670330605839048, + "loss": 0.84723055, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.28125, + "step": 3109, + "time_per_iteration": 2.819420337677002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082632, + "balance_loss_mlp": 1.05480886, + "epoch": 0.5983070411696807, + "flos": 603309911040.0, + "grad_norm": 0.0537112811211955, + "language_loss": 0.76695013, + "learning_rate": 0.0003667327623469191, + "loss": 0.77777648, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.27832031, + "step": 3110, + "time_per_iteration": 2.766671657562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085165, + "balance_loss_mlp": 1.05753255, + "epoch": 0.5984994228549442, + "flos": 633187971072.0, + "grad_norm": 0.058546063064310164, + "language_loss": 0.77618361, + "learning_rate": 0.00036643251584886333, + "loss": 0.78703523, + "num_input_tokens_seen": 259454336, + "router_z_loss_mlp": 0.27661133, + "step": 3111, + "time_per_iteration": 2.789184808731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077786, + "balance_loss_mlp": 1.05105901, + "epoch": 0.5986918045402078, + "flos": 525026022912.0, + "grad_norm": 0.054896589550954444, + "language_loss": 0.81872785, + "learning_rate": 0.00036613232120630393, + "loss": 0.82950568, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.26782227, + "step": 3112, + "time_per_iteration": 2.5881965160369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081611, + "balance_loss_mlp": 1.05362022, + "epoch": 0.5988841862254713, + "flos": 482942103552.0, + "grad_norm": 0.07437964171487202, + "language_loss": 0.80355418, + "learning_rate": 0.00036583217853578643, + "loss": 0.81437027, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.27978516, + "step": 3113, + "time_per_iteration": 2.5409529209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083019, + "balance_loss_mlp": 1.05457568, + "epoch": 0.5990765679107349, + "flos": 1139658674688.0, + "grad_norm": 0.06261379626444472, + "language_loss": 0.77366924, + "learning_rate": 0.000365532087953837, + "loss": 0.78449941, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.28442383, + "step": 3114, + "time_per_iteration": 3.6426267623901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076465, + "balance_loss_mlp": 1.04842734, + "epoch": 0.5992689495959984, + "flos": 516729701376.0, + "grad_norm": 0.08299057980597005, + "language_loss": 0.88937151, + "learning_rate": 0.00036523204957696065, + "loss": 0.90013611, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.28051758, + "step": 3115, + "time_per_iteration": 2.594581365585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083776, + "balance_loss_mlp": 1.05623841, + "epoch": 0.599461331281262, + "flos": 744288998400.0, + "grad_norm": 0.06140193987839019, + "language_loss": 0.80620509, + "learning_rate": 0.00036493206352164324, + "loss": 0.81704283, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.27612305, + "step": 3116, + "time_per_iteration": 2.922367572784424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076912, + "balance_loss_mlp": 1.04942214, + "epoch": 0.5996537129665256, + "flos": 592078892544.0, + "grad_norm": 0.05345315057842072, + "language_loss": 0.85505688, + "learning_rate": 0.000364632129904349, + "loss": 0.86582601, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.27514648, + "step": 3117, + "time_per_iteration": 2.765380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077238, + "balance_loss_mlp": 1.04884195, + "epoch": 0.5998460946517892, + "flos": 558735045120.0, + "grad_norm": 0.05997451129778301, + "language_loss": 0.77705157, + "learning_rate": 0.00036433224884152283, + "loss": 0.78782398, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.28393555, + "step": 3118, + "time_per_iteration": 2.714597225189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078485, + "balance_loss_mlp": 1.05032814, + "epoch": 0.6000384763370528, + "flos": 484325789184.0, + "grad_norm": 0.06439508839737945, + "language_loss": 0.77913392, + "learning_rate": 0.00036403242044958875, + "loss": 0.78991878, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.28173828, + "step": 3119, + "time_per_iteration": 2.5515971183776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107346, + "balance_loss_mlp": 1.04563642, + "epoch": 0.6002308580223162, + "flos": 596490407424.0, + "grad_norm": 0.05980235429893482, + "language_loss": 0.91155994, + "learning_rate": 0.0003637326448449507, + "loss": 0.9222945, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.27832031, + "step": 3120, + "time_per_iteration": 2.7075581550598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075264, + "balance_loss_mlp": 1.04651034, + "epoch": 0.6004232397075798, + "flos": 544879249920.0, + "grad_norm": 0.046913105653204425, + "language_loss": 0.86206967, + "learning_rate": 0.00036343292214399177, + "loss": 0.87282228, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.28735352, + "step": 3121, + "time_per_iteration": 2.8623263835906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076118, + "balance_loss_mlp": 1.04786551, + "epoch": 0.6006156213928434, + "flos": 629647990272.0, + "grad_norm": 0.08364408748252802, + "language_loss": 0.77170986, + "learning_rate": 0.00036313325246307456, + "loss": 0.782471, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.28271484, + "step": 3122, + "time_per_iteration": 2.8064393997192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077233, + "balance_loss_mlp": 1.04845548, + "epoch": 0.600808003078107, + "flos": 582043885056.0, + "grad_norm": 0.05351137159491715, + "language_loss": 0.86973262, + "learning_rate": 0.0003628336359185411, + "loss": 0.88050497, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.28759766, + "step": 3123, + "time_per_iteration": 2.701089859008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074232, + "balance_loss_mlp": 1.04545498, + "epoch": 0.6010003847633705, + "flos": 634984883712.0, + "grad_norm": 0.061635029106804545, + "language_loss": 0.75553113, + "learning_rate": 0.000362534072626713, + "loss": 0.76627344, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.28759766, + "step": 3124, + "time_per_iteration": 2.7586216926574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076514, + "balance_loss_mlp": 1.04830909, + "epoch": 0.6011927664486341, + "flos": 718448514048.0, + "grad_norm": 0.05599212147105787, + "language_loss": 0.81046546, + "learning_rate": 0.00036223456270389093, + "loss": 0.82123059, + "num_input_tokens_seen": 260499616, + "router_z_loss_mlp": 0.2824707, + "step": 3125, + "time_per_iteration": 2.948882818222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074418, + "balance_loss_mlp": 1.04442525, + "epoch": 0.6013851481338977, + "flos": 498782486016.0, + "grad_norm": 0.05186484782469995, + "language_loss": 0.81019723, + "learning_rate": 0.00036193510626635517, + "loss": 0.82094145, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.29980469, + "step": 3126, + "time_per_iteration": 2.671576499938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073529, + "balance_loss_mlp": 1.04410863, + "epoch": 0.6015775298191612, + "flos": 749266509312.0, + "grad_norm": 0.05950376235873218, + "language_loss": 0.81565017, + "learning_rate": 0.0003616357034303649, + "loss": 0.82638544, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.29370117, + "step": 3127, + "time_per_iteration": 2.9371449947357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074144, + "balance_loss_mlp": 1.04541481, + "epoch": 0.6017699115044248, + "flos": 592764162048.0, + "grad_norm": 0.048316094410884414, + "language_loss": 0.78690076, + "learning_rate": 0.0003613363543121584, + "loss": 0.79764223, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.28735352, + "step": 3128, + "time_per_iteration": 2.873584508895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076962, + "balance_loss_mlp": 1.04766035, + "epoch": 0.6019622931896883, + "flos": 514839656448.0, + "grad_norm": 0.05627549899999149, + "language_loss": 0.8521632, + "learning_rate": 0.00036103705902795357, + "loss": 0.8629328, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.29248047, + "step": 3129, + "time_per_iteration": 2.721329689025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074169, + "balance_loss_mlp": 1.04434288, + "epoch": 0.6021546748749519, + "flos": 490219914240.0, + "grad_norm": 0.06933558951012796, + "language_loss": 0.7955035, + "learning_rate": 0.0003607378176939471, + "loss": 0.80624521, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.29785156, + "step": 3130, + "time_per_iteration": 2.672825574874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070174, + "balance_loss_mlp": 1.04118252, + "epoch": 0.6023470565602155, + "flos": 540763098624.0, + "grad_norm": 0.07276264365929157, + "language_loss": 0.82265472, + "learning_rate": 0.00036043863042631465, + "loss": 0.8333565, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.29003906, + "step": 3131, + "time_per_iteration": 2.724228858947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068676, + "balance_loss_mlp": 1.03918386, + "epoch": 0.6025394382454791, + "flos": 844660984320.0, + "grad_norm": 0.06054022798216566, + "language_loss": 0.76351178, + "learning_rate": 0.00036013949734121133, + "loss": 0.77419853, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.29467773, + "step": 3132, + "time_per_iteration": 3.1145389080047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068619, + "balance_loss_mlp": 1.03831553, + "epoch": 0.6027318199307425, + "flos": 576903430656.0, + "grad_norm": 0.061447218218141524, + "language_loss": 0.82303023, + "learning_rate": 0.00035984041855477043, + "loss": 0.83371639, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.30249023, + "step": 3133, + "time_per_iteration": 2.779906749725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024187, + "balance_loss_mlp": 1.01274288, + "epoch": 0.6029242016160061, + "flos": 1470160585728.0, + "grad_norm": 0.015590695702157922, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79734081, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.11425781, + "step": 3134, + "time_per_iteration": 4.933319091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064388, + "balance_loss_mlp": 1.03503895, + "epoch": 0.6031165833012697, + "flos": 480486062592.0, + "grad_norm": 0.05335614021413427, + "language_loss": 0.79509521, + "learning_rate": 0.00035924242434230637, + "loss": 0.80573905, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.29321289, + "step": 3135, + "time_per_iteration": 2.6558902263641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065788, + "balance_loss_mlp": 1.03691578, + "epoch": 0.6033089649865333, + "flos": 499220444160.0, + "grad_norm": 0.07899589356076418, + "language_loss": 0.78020877, + "learning_rate": 0.00035894350914844516, + "loss": 0.79086667, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.28881836, + "step": 3136, + "time_per_iteration": 2.631028175354004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068791, + "balance_loss_mlp": 1.03927457, + "epoch": 0.6035013466717969, + "flos": 556337230848.0, + "grad_norm": 0.06724246097152477, + "language_loss": 0.8242653, + "learning_rate": 0.0003586446487175703, + "loss": 0.83495319, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.29516602, + "step": 3137, + "time_per_iteration": 2.6988327503204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068468, + "balance_loss_mlp": 1.03866601, + "epoch": 0.6036937283570604, + "flos": 594536343552.0, + "grad_norm": 0.053597642089091506, + "language_loss": 0.85091925, + "learning_rate": 0.0003583458431657099, + "loss": 0.86160386, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.29760742, + "step": 3138, + "time_per_iteration": 2.761122941970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067135, + "balance_loss_mlp": 1.03735673, + "epoch": 0.603886110042324, + "flos": 540684523008.0, + "grad_norm": 0.06925518043051447, + "language_loss": 0.83323741, + "learning_rate": 0.00035804709260887056, + "loss": 0.84390879, + "num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.29711914, + "step": 3139, + "time_per_iteration": 2.664776563644409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069913, + "balance_loss_mlp": 1.04013443, + "epoch": 0.6040784917275875, + "flos": 518315618304.0, + "grad_norm": 0.05868516129691736, + "language_loss": 0.894665, + "learning_rate": 0.0003577483971630373, + "loss": 0.90536416, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.29760742, + "step": 3140, + "time_per_iteration": 2.659006118774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069941, + "balance_loss_mlp": 1.03982854, + "epoch": 0.6042708734128511, + "flos": 660436872192.0, + "grad_norm": 0.0462994946970423, + "language_loss": 0.85074717, + "learning_rate": 0.00035744975694417414, + "loss": 0.86144656, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.30078125, + "step": 3141, + "time_per_iteration": 2.9323952198028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073401, + "balance_loss_mlp": 1.04438555, + "epoch": 0.6044632550981146, + "flos": 572035018752.0, + "grad_norm": 0.06410322202016926, + "language_loss": 0.82079303, + "learning_rate": 0.00035715117206822344, + "loss": 0.83152711, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.28979492, + "step": 3142, + "time_per_iteration": 2.8329904079437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070447, + "balance_loss_mlp": 1.04145527, + "epoch": 0.6046556367833782, + "flos": 546420086784.0, + "grad_norm": 0.060439068049678774, + "language_loss": 0.80993617, + "learning_rate": 0.0003568526426511065, + "loss": 0.82064068, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.28979492, + "step": 3143, + "time_per_iteration": 2.695185899734497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072248, + "balance_loss_mlp": 1.0432328, + "epoch": 0.6048480184686418, + "flos": 776505235968.0, + "grad_norm": 0.06755719072358204, + "language_loss": 0.82702982, + "learning_rate": 0.000356554168808722, + "loss": 0.83775228, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.29003906, + "step": 3144, + "time_per_iteration": 2.9742469787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073343, + "balance_loss_mlp": 1.04537654, + "epoch": 0.6050404001539054, + "flos": 656837254656.0, + "grad_norm": 0.05422673748867178, + "language_loss": 0.84676063, + "learning_rate": 0.00035625575065694837, + "loss": 0.85749412, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.2800293, + "step": 3145, + "time_per_iteration": 2.8367791175842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077792, + "balance_loss_mlp": 1.04934883, + "epoch": 0.605232781839169, + "flos": 548710212096.0, + "grad_norm": 0.05280732268922785, + "language_loss": 0.77452278, + "learning_rate": 0.0003559573883116415, + "loss": 0.78530073, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.28466797, + "step": 3146, + "time_per_iteration": 2.701388120651245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075301, + "balance_loss_mlp": 1.04702449, + "epoch": 0.6054251635244324, + "flos": 605093677056.0, + "grad_norm": 0.04869973207051341, + "language_loss": 0.85634321, + "learning_rate": 0.00035565908188863604, + "loss": 0.86709619, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.28271484, + "step": 3147, + "time_per_iteration": 2.898590087890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076445, + "balance_loss_mlp": 1.04831183, + "epoch": 0.605617545209696, + "flos": 613398763008.0, + "grad_norm": 0.06327080100476104, + "language_loss": 0.79599166, + "learning_rate": 0.00035536083150374464, + "loss": 0.80675614, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.28149414, + "step": 3148, + "time_per_iteration": 2.771320343017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102059, + "balance_loss_mlp": 1.00905097, + "epoch": 0.6058099268949596, + "flos": 1497477888000.0, + "grad_norm": 0.011512942764516735, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75768542, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.11523438, + "step": 3149, + "time_per_iteration": 4.814287185668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077389, + "balance_loss_mlp": 1.04918396, + "epoch": 0.6060023085802232, + "flos": 670170723840.0, + "grad_norm": 0.05840631409964381, + "language_loss": 0.85528827, + "learning_rate": 0.0003547644993114475, + "loss": 0.86606216, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.28198242, + "step": 3150, + "time_per_iteration": 2.8378889560699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107623, + "balance_loss_mlp": 1.04795372, + "epoch": 0.6061946902654868, + "flos": 605885225472.0, + "grad_norm": 0.06870733473036895, + "language_loss": 0.7981267, + "learning_rate": 0.00035446641773555806, + "loss": 0.80888903, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.28295898, + "step": 3151, + "time_per_iteration": 2.7372798919677734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077461, + "balance_loss_mlp": 1.04916036, + "epoch": 0.6063870719507503, + "flos": 557568147456.0, + "grad_norm": 0.05718786699526154, + "language_loss": 0.86853182, + "learning_rate": 0.000354168392660816, + "loss": 0.87930644, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.28344727, + "step": 3152, + "time_per_iteration": 2.7871758937835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073815, + "balance_loss_mlp": 1.04558635, + "epoch": 0.6065794536360138, + "flos": 556874113536.0, + "grad_norm": 0.05898712641381182, + "language_loss": 0.82702786, + "learning_rate": 0.0003538704242029252, + "loss": 0.83776605, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.28222656, + "step": 3153, + "time_per_iteration": 2.700695753097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075106, + "balance_loss_mlp": 1.0467577, + "epoch": 0.6067718353212774, + "flos": 689836276224.0, + "grad_norm": 0.06128602508798912, + "language_loss": 0.7773366, + "learning_rate": 0.0003535725124775672, + "loss": 0.78808761, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.28344727, + "step": 3154, + "time_per_iteration": 2.8570618629455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076573, + "balance_loss_mlp": 1.0478195, + "epoch": 0.606964217006541, + "flos": 521531122176.0, + "grad_norm": 0.055885875690184536, + "language_loss": 0.86403567, + "learning_rate": 0.00035327465760040126, + "loss": 0.8748014, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.28710938, + "step": 3155, + "time_per_iteration": 2.6846063137054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072066, + "balance_loss_mlp": 1.04281223, + "epoch": 0.6071565986918045, + "flos": 641267504640.0, + "grad_norm": 0.06048889768089712, + "language_loss": 0.84499794, + "learning_rate": 0.00035297685968706526, + "loss": 0.85571855, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.29223633, + "step": 3156, + "time_per_iteration": 2.7771387100219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072214, + "balance_loss_mlp": 1.04453337, + "epoch": 0.6073489803770681, + "flos": 560315169792.0, + "grad_norm": 0.06250295268242392, + "language_loss": 0.83014715, + "learning_rate": 0.00035267911885317454, + "loss": 0.84086931, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.27709961, + "step": 3157, + "time_per_iteration": 2.656357765197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074037, + "balance_loss_mlp": 1.0442822, + "epoch": 0.6075413620623317, + "flos": 585810828288.0, + "grad_norm": 0.057378940891661595, + "language_loss": 0.81611866, + "learning_rate": 0.0003523814352143222, + "loss": 0.826859, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.29711914, + "step": 3158, + "time_per_iteration": 2.830617904663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077727, + "balance_loss_mlp": 1.04883063, + "epoch": 0.6077337437475953, + "flos": 630523906560.0, + "grad_norm": 0.0599841254590138, + "language_loss": 0.90816242, + "learning_rate": 0.00035208380888607937, + "loss": 0.91893965, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.28881836, + "step": 3159, + "time_per_iteration": 2.8117706775665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009022, + "balance_loss_mlp": 0.99786437, + "epoch": 0.6079261254328588, + "flos": 1467726455808.0, + "grad_norm": 0.007967889265398313, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80471009, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.11181641, + "step": 3160, + "time_per_iteration": 4.8633644580841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009246, + "balance_loss_mlp": 0.998088, + "epoch": 0.6081185071181223, + "flos": 1522233022464.0, + "grad_norm": 0.00797101191785885, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76701474, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.11181641, + "step": 3161, + "time_per_iteration": 5.046196460723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075203, + "balance_loss_mlp": 1.04611611, + "epoch": 0.6083108888033859, + "flos": 556041867264.0, + "grad_norm": 0.04533613724441275, + "language_loss": 0.81858671, + "learning_rate": 0.00035119127492038446, + "loss": 0.82933867, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.29077148, + "step": 3162, + "time_per_iteration": 2.815852403640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075143, + "balance_loss_mlp": 1.0469625, + "epoch": 0.6085032704886495, + "flos": 840819847680.0, + "grad_norm": 0.053216451363019494, + "language_loss": 0.82787645, + "learning_rate": 0.00035089387898984436, + "loss": 0.83862782, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.28198242, + "step": 3163, + "time_per_iteration": 3.059666156768799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075751, + "balance_loss_mlp": 1.04683065, + "epoch": 0.6086956521739131, + "flos": 684493590528.0, + "grad_norm": 0.06412835192713194, + "language_loss": 0.81799018, + "learning_rate": 0.0003505965409474343, + "loss": 0.82874769, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.28881836, + "step": 3164, + "time_per_iteration": 2.8909780979156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072573, + "balance_loss_mlp": 1.04374802, + "epoch": 0.6088880338591766, + "flos": 535533894144.0, + "grad_norm": 0.050432732030132946, + "language_loss": 0.86329949, + "learning_rate": 0.0003502992609085913, + "loss": 0.87402523, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.28808594, + "step": 3165, + "time_per_iteration": 2.66687273979187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074674, + "balance_loss_mlp": 1.04513407, + "epoch": 0.6090804155444401, + "flos": 731197048320.0, + "grad_norm": 0.053888239650619583, + "language_loss": 0.82507217, + "learning_rate": 0.00035000203898872954, + "loss": 0.83581889, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.29516602, + "step": 3166, + "time_per_iteration": 3.05118989944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072174, + "balance_loss_mlp": 1.04303908, + "epoch": 0.6092727972297037, + "flos": 698708768256.0, + "grad_norm": 0.06623841355558525, + "language_loss": 0.84253997, + "learning_rate": 0.0003497048753032406, + "loss": 0.85326171, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.29125977, + "step": 3167, + "time_per_iteration": 2.87467885017395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074156, + "balance_loss_mlp": 1.04473543, + "epoch": 0.6094651789149673, + "flos": 1051515869184.0, + "grad_norm": 0.05347521996771115, + "language_loss": 0.80754191, + "learning_rate": 0.000349407769967494, + "loss": 0.81828344, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.29394531, + "step": 3168, + "time_per_iteration": 3.3934104442596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074195, + "balance_loss_mlp": 1.04546547, + "epoch": 0.6096575606002309, + "flos": 502834618368.0, + "grad_norm": 0.10902305889023324, + "language_loss": 0.84663367, + "learning_rate": 0.0003491107230968361, + "loss": 0.85737562, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.28710938, + "step": 3169, + "time_per_iteration": 2.6888718605041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072271, + "balance_loss_mlp": 1.04351735, + "epoch": 0.6098499422854944, + "flos": 585339374592.0, + "grad_norm": 0.05661622017927931, + "language_loss": 0.81418574, + "learning_rate": 0.00034881373480659085, + "loss": 0.82490849, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.28735352, + "step": 3170, + "time_per_iteration": 2.820013999938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073009, + "balance_loss_mlp": 1.043993, + "epoch": 0.610042323970758, + "flos": 468968444928.0, + "grad_norm": 0.0573564735722831, + "language_loss": 0.78202963, + "learning_rate": 0.0003485168052120594, + "loss": 0.79275972, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.2902832, + "step": 3171, + "time_per_iteration": 2.5298008918762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108136, + "balance_loss_mlp": 1.05255914, + "epoch": 0.6102347056560216, + "flos": 513923042304.0, + "grad_norm": 0.06128596263952344, + "language_loss": 0.79907572, + "learning_rate": 0.00034821993442851973, + "loss": 0.80988932, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.28808594, + "step": 3172, + "time_per_iteration": 2.5819344520568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075018, + "balance_loss_mlp": 1.0474807, + "epoch": 0.6104270873412851, + "flos": 468776388096.0, + "grad_norm": 0.06156265055034652, + "language_loss": 0.82331789, + "learning_rate": 0.00034792312257122735, + "loss": 0.83406806, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.27612305, + "step": 3173, + "time_per_iteration": 2.621645212173462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070505, + "balance_loss_mlp": 1.04187059, + "epoch": 0.6106194690265486, + "flos": 549610859520.0, + "grad_norm": 0.059872220515584544, + "language_loss": 0.80486125, + "learning_rate": 0.00034762636975541506, + "loss": 0.8155663, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.28613281, + "step": 3174, + "time_per_iteration": 2.6323647499084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074186, + "balance_loss_mlp": 1.0451467, + "epoch": 0.6108118507118122, + "flos": 472602968064.0, + "grad_norm": 0.05798479282712576, + "language_loss": 0.81059682, + "learning_rate": 0.0003473296760962923, + "loss": 0.82133865, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.2902832, + "step": 3175, + "time_per_iteration": 2.679593324661255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018524, + "balance_loss_mlp": 1.007128, + "epoch": 0.6110042323970758, + "flos": 1444416205824.0, + "grad_norm": 0.01318817873369303, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79552263, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.11376953, + "step": 3176, + "time_per_iteration": 4.708170652389526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075937, + "balance_loss_mlp": 1.04811323, + "epoch": 0.6111966140823394, + "flos": 793807879680.0, + "grad_norm": 0.06988374073618883, + "language_loss": 0.81172955, + "learning_rate": 0.00034673646670883976, + "loss": 0.82248896, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.27832031, + "step": 3177, + "time_per_iteration": 3.0760982036590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018443, + "balance_loss_mlp": 1.00714159, + "epoch": 0.611388995767603, + "flos": 1556800432128.0, + "grad_norm": 0.012123406085696703, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76733464, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.11279297, + "step": 3178, + "time_per_iteration": 5.047900199890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077367, + "balance_loss_mlp": 1.04909086, + "epoch": 0.6115813774528664, + "flos": 711841416192.0, + "grad_norm": 0.06496983177026339, + "language_loss": 0.81433582, + "learning_rate": 0.0003461434953300865, + "loss": 0.82510948, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.28271484, + "step": 3179, + "time_per_iteration": 2.934129476547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075057, + "balance_loss_mlp": 1.0462321, + "epoch": 0.61177375913813, + "flos": 683963910144.0, + "grad_norm": 0.054564857541299305, + "language_loss": 0.81309831, + "learning_rate": 0.0003458470991817515, + "loss": 0.82384884, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.28808594, + "step": 3180, + "time_per_iteration": 2.9692420959472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080989, + "balance_loss_mlp": 1.05249786, + "epoch": 0.6119661408233936, + "flos": 511411746816.0, + "grad_norm": 0.056066758208496104, + "language_loss": 0.84904051, + "learning_rate": 0.0003455507628808802, + "loss": 0.85985035, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.28491211, + "step": 3181, + "time_per_iteration": 2.613642692565918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107824, + "balance_loss_mlp": 1.04986787, + "epoch": 0.6121585225086572, + "flos": 556548226560.0, + "grad_norm": 0.07624020954576015, + "language_loss": 0.84440458, + "learning_rate": 0.00034525448654252076, + "loss": 0.855187, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.28369141, + "step": 3182, + "time_per_iteration": 2.6653053760528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074575, + "balance_loss_mlp": 1.04701424, + "epoch": 0.6123509041939207, + "flos": 561585374208.0, + "grad_norm": 0.06355946830094689, + "language_loss": 0.82891977, + "learning_rate": 0.0003449582702816976, + "loss": 0.83966547, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.27587891, + "step": 3183, + "time_per_iteration": 2.6951351165771484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010823, + "balance_loss_mlp": 1.05404711, + "epoch": 0.6125432858791843, + "flos": 557789317632.0, + "grad_norm": 0.056298205322627685, + "language_loss": 0.82360494, + "learning_rate": 0.0003446621142134122, + "loss": 0.83442801, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.28271484, + "step": 3184, + "time_per_iteration": 2.6690409183502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077624, + "balance_loss_mlp": 1.04958582, + "epoch": 0.6127356675644479, + "flos": 414796529664.0, + "grad_norm": 0.06604074574998081, + "language_loss": 0.84192419, + "learning_rate": 0.0003443660184526424, + "loss": 0.85270047, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.28051758, + "step": 3185, + "time_per_iteration": 2.4451961517333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078036, + "balance_loss_mlp": 1.04949737, + "epoch": 0.6129280492497114, + "flos": 603547047936.0, + "grad_norm": 0.0548279179658957, + "language_loss": 0.86286807, + "learning_rate": 0.0003440699831143429, + "loss": 0.87364841, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.28515625, + "step": 3186, + "time_per_iteration": 2.7583630084991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078194, + "balance_loss_mlp": 1.04989386, + "epoch": 0.613120430934975, + "flos": 519492690432.0, + "grad_norm": 0.05592702907616355, + "language_loss": 0.81846583, + "learning_rate": 0.0003437740083134449, + "loss": 0.82924777, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.28344727, + "step": 3187, + "time_per_iteration": 2.6769111156463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107819, + "balance_loss_mlp": 1.05053306, + "epoch": 0.6133128126202385, + "flos": 510835576320.0, + "grad_norm": 0.07534478934925966, + "language_loss": 0.82936466, + "learning_rate": 0.00034347809416485574, + "loss": 0.84014654, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.27709961, + "step": 3188, + "time_per_iteration": 2.579110622406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079083, + "balance_loss_mlp": 1.05052042, + "epoch": 0.6135051943055021, + "flos": 607264528896.0, + "grad_norm": 0.05208625136089098, + "language_loss": 0.8201586, + "learning_rate": 0.0003431822407834597, + "loss": 0.83094943, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.28588867, + "step": 3189, + "time_per_iteration": 2.800846815109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079618, + "balance_loss_mlp": 1.05084062, + "epoch": 0.6136975759907657, + "flos": 1159750600704.0, + "grad_norm": 0.06054576051189374, + "language_loss": 0.84436607, + "learning_rate": 0.00034288644828411706, + "loss": 0.85516232, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 0.28735352, + "step": 3190, + "time_per_iteration": 3.459338426589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084345, + "balance_loss_mlp": 1.05513883, + "epoch": 0.6138899576760293, + "flos": 706631150592.0, + "grad_norm": 0.0818478077901872, + "language_loss": 0.75477004, + "learning_rate": 0.0003425907167816649, + "loss": 0.7656135, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.29150391, + "step": 3191, + "time_per_iteration": 2.874662399291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079884, + "balance_loss_mlp": 1.05148816, + "epoch": 0.6140823393612928, + "flos": 586151271936.0, + "grad_norm": 0.06137447834473829, + "language_loss": 0.84648186, + "learning_rate": 0.00034229504639091623, + "loss": 0.85728073, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.28393555, + "step": 3192, + "time_per_iteration": 2.768174171447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078463, + "balance_loss_mlp": 1.04906654, + "epoch": 0.6142747210465563, + "flos": 803759929344.0, + "grad_norm": 0.05748161960079173, + "language_loss": 0.80287862, + "learning_rate": 0.0003419994372266606, + "loss": 0.81366324, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.29345703, + "step": 3193, + "time_per_iteration": 3.1592228412628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079327, + "balance_loss_mlp": 1.05054975, + "epoch": 0.6144671027318199, + "flos": 529158140928.0, + "grad_norm": 0.04575030988697244, + "language_loss": 0.81596744, + "learning_rate": 0.00034170388940366335, + "loss": 0.82676071, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.2878418, + "step": 3194, + "time_per_iteration": 2.707101345062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078794, + "balance_loss_mlp": 1.05011201, + "epoch": 0.6146594844170835, + "flos": 805054864896.0, + "grad_norm": 0.05557650302359453, + "language_loss": 0.79986775, + "learning_rate": 0.0003414084030366667, + "loss": 0.81065571, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.28686523, + "step": 3195, + "time_per_iteration": 3.086768388748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070986, + "balance_loss_mlp": 1.04118395, + "epoch": 0.6148518661023471, + "flos": 501431993856.0, + "grad_norm": 0.05715110105949097, + "language_loss": 0.82949638, + "learning_rate": 0.0003411129782403883, + "loss": 0.84020627, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.29760742, + "step": 3196, + "time_per_iteration": 2.65775203704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078782, + "balance_loss_mlp": 1.04926562, + "epoch": 0.6150442477876106, + "flos": 510436905984.0, + "grad_norm": 0.06094401033818373, + "language_loss": 0.8473599, + "learning_rate": 0.0003408176151295225, + "loss": 0.8581478, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.29516602, + "step": 3197, + "time_per_iteration": 2.6118876934051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076412, + "balance_loss_mlp": 1.04806376, + "epoch": 0.6152366294728742, + "flos": 526758916608.0, + "grad_norm": 0.056153389528983695, + "language_loss": 0.7719816, + "learning_rate": 0.00034052231381873944, + "loss": 0.78274572, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.28320312, + "step": 3198, + "time_per_iteration": 2.6228411197662354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079605, + "balance_loss_mlp": 1.05066109, + "epoch": 0.6154290111581378, + "flos": 473055482880.0, + "grad_norm": 0.07032084774443613, + "language_loss": 0.84981108, + "learning_rate": 0.00034022707442268494, + "loss": 0.86060715, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.28955078, + "step": 3199, + "time_per_iteration": 2.6281561851501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081064, + "balance_loss_mlp": 1.05204892, + "epoch": 0.6156213928434013, + "flos": 550542030336.0, + "grad_norm": 0.04792292414356855, + "language_loss": 0.81849301, + "learning_rate": 0.0003399318970559813, + "loss": 0.82930362, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.28979492, + "step": 3200, + "time_per_iteration": 2.848755121231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083137, + "balance_loss_mlp": 1.05426502, + "epoch": 0.6158137745286649, + "flos": 750587586048.0, + "grad_norm": 0.06290240151644533, + "language_loss": 0.8428275, + "learning_rate": 0.00033963678183322656, + "loss": 0.85365885, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.28833008, + "step": 3201, + "time_per_iteration": 3.027029275894165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083363, + "balance_loss_mlp": 1.05396593, + "epoch": 0.6160061562139284, + "flos": 555544272384.0, + "grad_norm": 0.050860435501305326, + "language_loss": 0.8262167, + "learning_rate": 0.0003393417288689945, + "loss": 0.83705032, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.29370117, + "step": 3202, + "time_per_iteration": 2.6697185039520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108474, + "balance_loss_mlp": 1.05422282, + "epoch": 0.616198537899192, + "flos": 741856278528.0, + "grad_norm": 0.07354923140459588, + "language_loss": 0.75762349, + "learning_rate": 0.00033904673827783504, + "loss": 0.76847088, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.3046875, + "step": 3203, + "time_per_iteration": 2.9294135570526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083321, + "balance_loss_mlp": 1.05423403, + "epoch": 0.6163909195844556, + "flos": 478569876480.0, + "grad_norm": 0.060707114262551334, + "language_loss": 0.8162061, + "learning_rate": 0.00033875181017427357, + "loss": 0.82703936, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 0.2902832, + "step": 3204, + "time_per_iteration": 2.595367193222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078594, + "balance_loss_mlp": 1.04924512, + "epoch": 0.6165833012697192, + "flos": 531231478272.0, + "grad_norm": 0.054344968838841615, + "language_loss": 0.80957687, + "learning_rate": 0.00033845694467281133, + "loss": 0.82036287, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.29321289, + "step": 3205, + "time_per_iteration": 2.846841812133789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081783, + "balance_loss_mlp": 1.0531013, + "epoch": 0.6167756829549826, + "flos": 807384278016.0, + "grad_norm": 0.06726799818780427, + "language_loss": 0.83033085, + "learning_rate": 0.00033816214188792516, + "loss": 0.84114861, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.28686523, + "step": 3206, + "time_per_iteration": 3.1646995544433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078791, + "balance_loss_mlp": 1.05008507, + "epoch": 0.6169680646402462, + "flos": 488683459584.0, + "grad_norm": 0.05376278097292006, + "language_loss": 0.8520205, + "learning_rate": 0.00033786740193406784, + "loss": 0.86280841, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.28686523, + "step": 3207, + "time_per_iteration": 2.577228307723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075976, + "balance_loss_mlp": 1.04767549, + "epoch": 0.6171604463255098, + "flos": 618643934208.0, + "grad_norm": 0.056191099229546404, + "language_loss": 0.81319952, + "learning_rate": 0.00033757272492566736, + "loss": 0.82395929, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.28320312, + "step": 3208, + "time_per_iteration": 2.8721108436584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078583, + "balance_loss_mlp": 1.05013978, + "epoch": 0.6173528280107734, + "flos": 528600909312.0, + "grad_norm": 0.04893199519437597, + "language_loss": 0.87034678, + "learning_rate": 0.0003372781109771278, + "loss": 0.8811326, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.28442383, + "step": 3209, + "time_per_iteration": 2.7287070751190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077966, + "balance_loss_mlp": 1.04907, + "epoch": 0.617545209696037, + "flos": 596293968384.0, + "grad_norm": 0.04879640412841063, + "language_loss": 0.76108795, + "learning_rate": 0.0003369835602028281, + "loss": 0.77186757, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.28881836, + "step": 3210, + "time_per_iteration": 2.8439886569976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078621, + "balance_loss_mlp": 1.04924726, + "epoch": 0.6177375913813005, + "flos": 474848013312.0, + "grad_norm": 0.055192186653408186, + "language_loss": 0.79211128, + "learning_rate": 0.0003366890727171232, + "loss": 0.80289745, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.29345703, + "step": 3211, + "time_per_iteration": 2.6932919025421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082394, + "balance_loss_mlp": 1.0535692, + "epoch": 0.617929973066564, + "flos": 529546636800.0, + "grad_norm": 0.07153817197124837, + "language_loss": 0.78408551, + "learning_rate": 0.00033639464863434313, + "loss": 0.79490948, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.2878418, + "step": 3212, + "time_per_iteration": 2.6900713443756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040119, + "balance_loss_mlp": 1.02929533, + "epoch": 0.6181223547518276, + "flos": 1419361477632.0, + "grad_norm": 0.01617816391785494, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79482591, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.10839844, + "step": 3213, + "time_per_iteration": 4.7103211879730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077859, + "balance_loss_mlp": 1.04979765, + "epoch": 0.6183147364370912, + "flos": 739976408064.0, + "grad_norm": 0.0586976807946241, + "language_loss": 0.79730934, + "learning_rate": 0.00033580599113475543, + "loss": 0.80808794, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.28076172, + "step": 3214, + "time_per_iteration": 2.972890853881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076015, + "balance_loss_mlp": 1.04759574, + "epoch": 0.6185071181223547, + "flos": 381442507776.0, + "grad_norm": 0.06601952737269029, + "language_loss": 0.85816491, + "learning_rate": 0.00033551175794648507, + "loss": 0.86892509, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.28417969, + "step": 3215, + "time_per_iteration": 2.456907033920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072721, + "balance_loss_mlp": 1.04439735, + "epoch": 0.6186994998076183, + "flos": 463109225472.0, + "grad_norm": 0.062254504168561625, + "language_loss": 0.8188296, + "learning_rate": 0.00033521758861821365, + "loss": 0.82955682, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.28344727, + "step": 3216, + "time_per_iteration": 2.580777406692505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106911, + "balance_loss_mlp": 1.04071391, + "epoch": 0.6188918814928819, + "flos": 485029997568.0, + "grad_norm": 0.04883960048827372, + "language_loss": 0.88878882, + "learning_rate": 0.0003349234832641479, + "loss": 0.89947987, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.28417969, + "step": 3217, + "time_per_iteration": 2.5541629791259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074942, + "balance_loss_mlp": 1.04635608, + "epoch": 0.6190842631781455, + "flos": 656985641472.0, + "grad_norm": 0.06561076665766134, + "language_loss": 0.80879915, + "learning_rate": 0.00033462944199846975, + "loss": 0.81954861, + "num_input_tokens_seen": 268478512, + "router_z_loss_mlp": 0.28540039, + "step": 3218, + "time_per_iteration": 3.062703847885132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077215, + "balance_loss_mlp": 1.04848528, + "epoch": 0.619276644863409, + "flos": 403388011008.0, + "grad_norm": 0.06502548187197098, + "language_loss": 0.8618629, + "learning_rate": 0.00033433546493533606, + "loss": 0.87263501, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.28710938, + "step": 3219, + "time_per_iteration": 2.4797823429107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072308, + "balance_loss_mlp": 1.04443645, + "epoch": 0.6194690265486725, + "flos": 582807730176.0, + "grad_norm": 0.06173556799123847, + "language_loss": 0.840487, + "learning_rate": 0.00033404155218887897, + "loss": 0.85121012, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.27880859, + "step": 3220, + "time_per_iteration": 2.7182207107543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075139, + "balance_loss_mlp": 1.04733968, + "epoch": 0.6196614082339361, + "flos": 503963638272.0, + "grad_norm": 0.08803961295836986, + "language_loss": 0.87216806, + "learning_rate": 0.00033374770387320534, + "loss": 0.88291949, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.27856445, + "step": 3221, + "time_per_iteration": 2.7941346168518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078924, + "balance_loss_mlp": 1.05095768, + "epoch": 0.6198537899191997, + "flos": 575131249152.0, + "grad_norm": 0.055815039151530264, + "language_loss": 0.84867358, + "learning_rate": 0.00033345392010239737, + "loss": 0.8594628, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.27978516, + "step": 3222, + "time_per_iteration": 2.710803747177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082482, + "balance_loss_mlp": 1.05432487, + "epoch": 0.6200461716044633, + "flos": 592871851008.0, + "grad_norm": 0.05804972472550271, + "language_loss": 0.82259816, + "learning_rate": 0.0003331602009905118, + "loss": 0.83342302, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.28198242, + "step": 3223, + "time_per_iteration": 2.8335556983947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081003, + "balance_loss_mlp": 1.052917, + "epoch": 0.6202385532897268, + "flos": 665765001216.0, + "grad_norm": 0.05452675895151675, + "language_loss": 0.83620667, + "learning_rate": 0.00033286654665158085, + "loss": 0.84701669, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.28100586, + "step": 3224, + "time_per_iteration": 2.929290533065796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077713, + "balance_loss_mlp": 1.05038977, + "epoch": 0.6204309349749904, + "flos": 484709902848.0, + "grad_norm": 0.05879630449885449, + "language_loss": 0.87538344, + "learning_rate": 0.0003325729571996109, + "loss": 0.88616055, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.27368164, + "step": 3225, + "time_per_iteration": 2.6219499111175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078793, + "balance_loss_mlp": 1.04980159, + "epoch": 0.6206233166602539, + "flos": 583768014336.0, + "grad_norm": 0.06449737595715416, + "language_loss": 0.83818585, + "learning_rate": 0.000332279432748584, + "loss": 0.84897381, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.28955078, + "step": 3226, + "time_per_iteration": 2.7298083305358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082841, + "balance_loss_mlp": 1.054636, + "epoch": 0.6208156983455175, + "flos": 476669657088.0, + "grad_norm": 0.05904408165059124, + "language_loss": 0.87270737, + "learning_rate": 0.00033198597341245576, + "loss": 0.88353574, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.28222656, + "step": 3227, + "time_per_iteration": 2.5691256523132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108032, + "balance_loss_mlp": 1.05151939, + "epoch": 0.6210080800307811, + "flos": 788716887552.0, + "grad_norm": 0.053113519370634896, + "language_loss": 0.81682974, + "learning_rate": 0.00033169257930515763, + "loss": 0.82763296, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.2878418, + "step": 3228, + "time_per_iteration": 3.0353121757507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084583, + "balance_loss_mlp": 1.05587709, + "epoch": 0.6212004617160446, + "flos": 607514812416.0, + "grad_norm": 0.059839903219207714, + "language_loss": 0.82242584, + "learning_rate": 0.0003313992505405951, + "loss": 0.83327174, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.28686523, + "step": 3229, + "time_per_iteration": 2.720705270767212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075139, + "balance_loss_mlp": 1.04743469, + "epoch": 0.6213928434013082, + "flos": 586248786432.0, + "grad_norm": 0.0642388463301134, + "language_loss": 0.80858111, + "learning_rate": 0.0003311059872326487, + "loss": 0.81933248, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.27709961, + "step": 3230, + "time_per_iteration": 2.6720995903015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081662, + "balance_loss_mlp": 1.05352879, + "epoch": 0.6215852250865718, + "flos": 535819083264.0, + "grad_norm": 0.049445896607163295, + "language_loss": 0.78987181, + "learning_rate": 0.0003308127894951734, + "loss": 0.80068845, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.28149414, + "step": 3231, + "time_per_iteration": 2.63030743598938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107994, + "balance_loss_mlp": 1.05214071, + "epoch": 0.6217776067718354, + "flos": 617884471296.0, + "grad_norm": 0.07248200651444572, + "language_loss": 0.86507577, + "learning_rate": 0.00033051965744199834, + "loss": 0.87587512, + "num_input_tokens_seen": 269498784, + "router_z_loss_mlp": 0.27832031, + "step": 3232, + "time_per_iteration": 2.7564406394958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081036, + "balance_loss_mlp": 1.05302238, + "epoch": 0.6219699884570988, + "flos": 545570311680.0, + "grad_norm": 0.05351658478199456, + "language_loss": 0.90184295, + "learning_rate": 0.0003302265911869276, + "loss": 0.91265333, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.28051758, + "step": 3233, + "time_per_iteration": 2.9271633625030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081705, + "balance_loss_mlp": 1.05373812, + "epoch": 0.6221623701423624, + "flos": 480899289600.0, + "grad_norm": 0.056002159029406404, + "language_loss": 0.84084082, + "learning_rate": 0.0003299335908437397, + "loss": 0.85165787, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.2800293, + "step": 3234, + "time_per_iteration": 2.5909643173217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080844, + "balance_loss_mlp": 1.05228114, + "epoch": 0.622354751827626, + "flos": 379812920832.0, + "grad_norm": 0.06942928938800572, + "language_loss": 0.79645211, + "learning_rate": 0.0003296406565261873, + "loss": 0.80726051, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.28564453, + "step": 3235, + "time_per_iteration": 2.5319809913635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107822, + "balance_loss_mlp": 1.04927599, + "epoch": 0.6225471335128896, + "flos": 667570678272.0, + "grad_norm": 0.04882824212942084, + "language_loss": 0.8475616, + "learning_rate": 0.0003293477883479978, + "loss": 0.85834384, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.28955078, + "step": 3236, + "time_per_iteration": 2.8348751068115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079457, + "balance_loss_mlp": 1.05110943, + "epoch": 0.6227395151981532, + "flos": 770995224576.0, + "grad_norm": 0.06517457110491971, + "language_loss": 0.79784298, + "learning_rate": 0.0003290549864228727, + "loss": 0.80863756, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.28369141, + "step": 3237, + "time_per_iteration": 2.9205360412597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078288, + "balance_loss_mlp": 1.04934406, + "epoch": 0.6229318968834167, + "flos": 484104619008.0, + "grad_norm": 0.05190818630751583, + "language_loss": 0.86413801, + "learning_rate": 0.0003287622508644875, + "loss": 0.8749209, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.28930664, + "step": 3238, + "time_per_iteration": 2.7504210472106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107621, + "balance_loss_mlp": 1.04736114, + "epoch": 0.6231242785686802, + "flos": 462700380672.0, + "grad_norm": 0.06410601543922713, + "language_loss": 0.8596704, + "learning_rate": 0.0003284695817864923, + "loss": 0.8704325, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.28808594, + "step": 3239, + "time_per_iteration": 2.487185001373291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082288, + "balance_loss_mlp": 1.0541544, + "epoch": 0.6233166602539438, + "flos": 608809747968.0, + "grad_norm": 0.07028564715864687, + "language_loss": 0.83921337, + "learning_rate": 0.0003281769793025116, + "loss": 0.85003626, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.28149414, + "step": 3240, + "time_per_iteration": 2.7399847507476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107903, + "balance_loss_mlp": 1.05106378, + "epoch": 0.6235090419392074, + "flos": 438972521472.0, + "grad_norm": 0.06749958965512537, + "language_loss": 0.89295518, + "learning_rate": 0.00032788444352614346, + "loss": 0.90374541, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.27978516, + "step": 3241, + "time_per_iteration": 2.550497531890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079451, + "balance_loss_mlp": 1.05055451, + "epoch": 0.6237014236244709, + "flos": 504656262144.0, + "grad_norm": 0.05896628136636162, + "language_loss": 0.80561244, + "learning_rate": 0.0003275919745709606, + "loss": 0.81640697, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.28881836, + "step": 3242, + "time_per_iteration": 2.5805697441101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107483, + "balance_loss_mlp": 1.0460763, + "epoch": 0.6238938053097345, + "flos": 512648455680.0, + "grad_norm": 0.058276556279693525, + "language_loss": 0.8216207, + "learning_rate": 0.00032729957255050936, + "loss": 0.83236909, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.28759766, + "step": 3243, + "time_per_iteration": 2.6520867347717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075457, + "balance_loss_mlp": 1.0462271, + "epoch": 0.6240861869949981, + "flos": 736435017216.0, + "grad_norm": 0.0677841364318074, + "language_loss": 0.81232285, + "learning_rate": 0.0003270072375783102, + "loss": 0.82307744, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.29174805, + "step": 3244, + "time_per_iteration": 2.8922722339630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079597, + "balance_loss_mlp": 1.05098701, + "epoch": 0.6242785686802617, + "flos": 494464103424.0, + "grad_norm": 0.055818323982708785, + "language_loss": 0.7931875, + "learning_rate": 0.00032671496976785774, + "loss": 0.80398345, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.28613281, + "step": 3245, + "time_per_iteration": 2.6470372676849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071869, + "balance_loss_mlp": 1.04359281, + "epoch": 0.6244709503655252, + "flos": 745500976128.0, + "grad_norm": 0.04960718098470409, + "language_loss": 0.75533414, + "learning_rate": 0.0003264227692326205, + "loss": 0.76605284, + "num_input_tokens_seen": 270501680, + "router_z_loss_mlp": 0.28295898, + "step": 3246, + "time_per_iteration": 3.0302975177764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079718, + "balance_loss_mlp": 1.05010653, + "epoch": 0.6246633320507887, + "flos": 492366034944.0, + "grad_norm": 0.054579168692914876, + "language_loss": 0.85738158, + "learning_rate": 0.00032613063608604055, + "loss": 0.86817873, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.29589844, + "step": 3247, + "time_per_iteration": 2.529571771621704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080009, + "balance_loss_mlp": 1.05147064, + "epoch": 0.6248557137360523, + "flos": 517142928384.0, + "grad_norm": 0.054889772992989326, + "language_loss": 0.8363654, + "learning_rate": 0.0003258385704415343, + "loss": 0.84716547, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.28540039, + "step": 3248, + "time_per_iteration": 2.590259313583374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076974, + "balance_loss_mlp": 1.04745758, + "epoch": 0.6250480954213159, + "flos": 519098402304.0, + "grad_norm": 0.0554200225727057, + "language_loss": 0.82566541, + "learning_rate": 0.0003255465724124915, + "loss": 0.8364352, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.29492188, + "step": 3249, + "time_per_iteration": 2.6928865909576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079488, + "balance_loss_mlp": 1.05044842, + "epoch": 0.6252404771065795, + "flos": 515808705024.0, + "grad_norm": 0.051820175568143126, + "language_loss": 0.82984078, + "learning_rate": 0.00032525464211227587, + "loss": 0.84063572, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.2902832, + "step": 3250, + "time_per_iteration": 2.5911831855773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080369, + "balance_loss_mlp": 1.0519259, + "epoch": 0.6254328587918431, + "flos": 576647354880.0, + "grad_norm": 0.05767056492483943, + "language_loss": 0.85669184, + "learning_rate": 0.0003249627796542249, + "loss": 0.86749554, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.28442383, + "step": 3251, + "time_per_iteration": 2.6558287143707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073928, + "balance_loss_mlp": 1.04481697, + "epoch": 0.6256252404771065, + "flos": 597638366208.0, + "grad_norm": 0.0553994194583659, + "language_loss": 0.84238529, + "learning_rate": 0.00032467098515164943, + "loss": 0.85312456, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.29077148, + "step": 3252, + "time_per_iteration": 2.8710081577301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010798, + "balance_loss_mlp": 1.04992628, + "epoch": 0.6258176221623701, + "flos": 508034709504.0, + "grad_norm": 0.0724295756751151, + "language_loss": 0.83990276, + "learning_rate": 0.00032437925871783456, + "loss": 0.85070074, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.2980957, + "step": 3253, + "time_per_iteration": 2.680757761001587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077647, + "balance_loss_mlp": 1.04824996, + "epoch": 0.6260100038476337, + "flos": 639357110784.0, + "grad_norm": 0.06297548912406484, + "language_loss": 0.84215987, + "learning_rate": 0.00032408760046603803, + "loss": 0.85293639, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.29370117, + "step": 3254, + "time_per_iteration": 2.8605175018310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071718, + "balance_loss_mlp": 1.04308379, + "epoch": 0.6262023855328973, + "flos": 840648139776.0, + "grad_norm": 0.06707664571923276, + "language_loss": 0.77650177, + "learning_rate": 0.00032379601050949193, + "loss": 0.78721887, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.28613281, + "step": 3255, + "time_per_iteration": 3.0878231525421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107032, + "balance_loss_mlp": 1.04125643, + "epoch": 0.6263947672181608, + "flos": 521884712448.0, + "grad_norm": 0.055802614278498724, + "language_loss": 0.8790136, + "learning_rate": 0.0003235044889614013, + "loss": 0.8897168, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.29052734, + "step": 3256, + "time_per_iteration": 2.5939788818359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072592, + "balance_loss_mlp": 1.04302788, + "epoch": 0.6265871489034244, + "flos": 606747995136.0, + "grad_norm": 0.05515134857427489, + "language_loss": 0.83577603, + "learning_rate": 0.0003232130359349451, + "loss": 0.84650195, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.29541016, + "step": 3257, + "time_per_iteration": 2.8894662857055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068118, + "balance_loss_mlp": 1.03752887, + "epoch": 0.626779530588688, + "flos": 588208642560.0, + "grad_norm": 0.05130373708668117, + "language_loss": 0.81576669, + "learning_rate": 0.0003229216515432751, + "loss": 0.82644784, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.30566406, + "step": 3258, + "time_per_iteration": 2.756706476211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076382, + "balance_loss_mlp": 1.04586434, + "epoch": 0.6269719122739515, + "flos": 438381794304.0, + "grad_norm": 0.06660247735864482, + "language_loss": 0.79725903, + "learning_rate": 0.0003226303358995174, + "loss": 0.80802286, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.3046875, + "step": 3259, + "time_per_iteration": 2.67144775390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077975, + "balance_loss_mlp": 1.04760051, + "epoch": 0.6271642939592151, + "flos": 562590738432.0, + "grad_norm": 0.05404958184745656, + "language_loss": 0.88993442, + "learning_rate": 0.00032233908911677, + "loss": 0.90071416, + "num_input_tokens_seen": 271526768, + "router_z_loss_mlp": 0.30322266, + "step": 3260, + "time_per_iteration": 2.863938808441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073635, + "balance_loss_mlp": 1.0435462, + "epoch": 0.6273566756444786, + "flos": 514288217088.0, + "grad_norm": 0.053449532753106085, + "language_loss": 0.80614489, + "learning_rate": 0.0003220479113081053, + "loss": 0.81688124, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.30053711, + "step": 3261, + "time_per_iteration": 2.7604382038116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106913, + "balance_loss_mlp": 1.03846908, + "epoch": 0.6275490573297422, + "flos": 585195369984.0, + "grad_norm": 0.08212493062436176, + "language_loss": 0.78586102, + "learning_rate": 0.00032175680258656836, + "loss": 0.7965523, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.30615234, + "step": 3262, + "time_per_iteration": 2.6967196464538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071974, + "balance_loss_mlp": 1.04190898, + "epoch": 0.6277414390150058, + "flos": 559143889920.0, + "grad_norm": 0.05356215085141381, + "language_loss": 0.79812634, + "learning_rate": 0.00032146576306517794, + "loss": 0.80884606, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.30029297, + "step": 3263, + "time_per_iteration": 2.8093175888061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078395, + "balance_loss_mlp": 1.04866421, + "epoch": 0.6279338207002694, + "flos": 612423922176.0, + "grad_norm": 0.0554541143403023, + "language_loss": 0.80460787, + "learning_rate": 0.0003211747928569255, + "loss": 0.81539178, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.296875, + "step": 3264, + "time_per_iteration": 2.760589122772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076242, + "balance_loss_mlp": 1.04741764, + "epoch": 0.6281262023855329, + "flos": 625374687744.0, + "grad_norm": 0.05014640017162604, + "language_loss": 0.81306803, + "learning_rate": 0.0003208838920747754, + "loss": 0.82383049, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.28833008, + "step": 3265, + "time_per_iteration": 2.8798112869262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072039, + "balance_loss_mlp": 1.04342878, + "epoch": 0.6283185840707964, + "flos": 1123147579392.0, + "grad_norm": 0.0653184175681376, + "language_loss": 0.7620573, + "learning_rate": 0.0003205930608316656, + "loss": 0.77277768, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.28588867, + "step": 3266, + "time_per_iteration": 3.571838140487671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106721, + "balance_loss_mlp": 1.03900564, + "epoch": 0.62851096575606, + "flos": 514967694336.0, + "grad_norm": 0.0645756575705021, + "language_loss": 0.84763867, + "learning_rate": 0.00032030229924050673, + "loss": 0.85831082, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.2824707, + "step": 3267, + "time_per_iteration": 2.6483044624328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076916, + "balance_loss_mlp": 1.04732847, + "epoch": 0.6287033474413236, + "flos": 403949624832.0, + "grad_norm": 0.056929311189361634, + "language_loss": 0.79781055, + "learning_rate": 0.00032001160741418247, + "loss": 0.8085798, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.2956543, + "step": 3268, + "time_per_iteration": 2.6264944076538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_mlp": 1.04559875, + "epoch": 0.6288957291265872, + "flos": 525459598848.0, + "grad_norm": 0.06099991776651708, + "language_loss": 0.82100242, + "learning_rate": 0.0003197209854655494, + "loss": 0.83175737, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.29833984, + "step": 3269, + "time_per_iteration": 2.704279661178589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073063, + "balance_loss_mlp": 1.04454803, + "epoch": 0.6290881108118507, + "flos": 603414627840.0, + "grad_norm": 0.06377784920568129, + "language_loss": 0.74516416, + "learning_rate": 0.0003194304335074371, + "loss": 0.75589478, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.28515625, + "step": 3270, + "time_per_iteration": 2.82635235786438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072113, + "balance_loss_mlp": 1.04281116, + "epoch": 0.6292804924971143, + "flos": 437446241280.0, + "grad_norm": 0.054968431789037576, + "language_loss": 0.88535178, + "learning_rate": 0.0003191399516526475, + "loss": 0.89607286, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.29272461, + "step": 3271, + "time_per_iteration": 2.4927825927734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074321, + "balance_loss_mlp": 1.04575849, + "epoch": 0.6294728741823779, + "flos": 606368263680.0, + "grad_norm": 0.05221826851343204, + "language_loss": 0.79470003, + "learning_rate": 0.0003188495400139559, + "loss": 0.80544329, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.28540039, + "step": 3272, + "time_per_iteration": 2.764953851699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071949, + "balance_loss_mlp": 1.04312468, + "epoch": 0.6296652558676414, + "flos": 701220063744.0, + "grad_norm": 0.060799032420417454, + "language_loss": 0.84558678, + "learning_rate": 0.00031855919870411013, + "loss": 0.85630625, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.28808594, + "step": 3273, + "time_per_iteration": 2.823537588119507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071632, + "balance_loss_mlp": 1.04213953, + "epoch": 0.6298576375529049, + "flos": 523652511744.0, + "grad_norm": 0.05430009118151755, + "language_loss": 0.84791374, + "learning_rate": 0.0003182689278358305, + "loss": 0.85863006, + "num_input_tokens_seen": 272562992, + "router_z_loss_mlp": 0.29443359, + "step": 3274, + "time_per_iteration": 2.6649551391601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073347, + "balance_loss_mlp": 1.04416466, + "epoch": 0.6300500192381685, + "flos": 475723929600.0, + "grad_norm": 0.085227141064307, + "language_loss": 0.79910004, + "learning_rate": 0.0003179787275218105, + "loss": 0.80983347, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.29174805, + "step": 3275, + "time_per_iteration": 2.563103437423706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074447, + "balance_loss_mlp": 1.0460037, + "epoch": 0.6302424009234321, + "flos": 520629064704.0, + "grad_norm": 0.07197275527111574, + "language_loss": 0.84121722, + "learning_rate": 0.0003176885978747155, + "loss": 0.85196167, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.28466797, + "step": 3276, + "time_per_iteration": 2.634556293487549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076833, + "balance_loss_mlp": 1.04807937, + "epoch": 0.6304347826086957, + "flos": 694282696704.0, + "grad_norm": 0.05534578709936448, + "language_loss": 0.82750475, + "learning_rate": 0.0003173985390071839, + "loss": 0.83827305, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.28735352, + "step": 3277, + "time_per_iteration": 2.8998594284057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018796, + "balance_loss_mlp": 1.0069232, + "epoch": 0.6306271642939593, + "flos": 1466067755520.0, + "grad_norm": 0.01138839518784329, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.78919256, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.11865234, + "step": 3278, + "time_per_iteration": 4.791780233383179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076998, + "balance_loss_mlp": 1.04678988, + "epoch": 0.6308195459792227, + "flos": 601444597248.0, + "grad_norm": 0.07347882473000023, + "language_loss": 0.81146979, + "learning_rate": 0.00031681863406122704, + "loss": 0.82223976, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.30151367, + "step": 3279, + "time_per_iteration": 2.7681593894958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077607, + "balance_loss_mlp": 1.0484724, + "epoch": 0.6310119276644863, + "flos": 726514900992.0, + "grad_norm": 0.0604928742924753, + "language_loss": 0.85127562, + "learning_rate": 0.00031652878820794087, + "loss": 0.86205173, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.29101562, + "step": 3280, + "time_per_iteration": 2.9940550327301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078985, + "balance_loss_mlp": 1.04970694, + "epoch": 0.6312043093497499, + "flos": 519482515968.0, + "grad_norm": 0.06373938844251871, + "language_loss": 0.85768282, + "learning_rate": 0.00031623901358449627, + "loss": 0.86847264, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.29223633, + "step": 3281, + "time_per_iteration": 2.637016773223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080918, + "balance_loss_mlp": 1.05206895, + "epoch": 0.6313966910350135, + "flos": 530934704640.0, + "grad_norm": 0.0651224667912018, + "language_loss": 0.88407606, + "learning_rate": 0.0003159493103033936, + "loss": 0.89488524, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.28857422, + "step": 3282, + "time_per_iteration": 2.6074159145355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025661, + "balance_loss_mlp": 1.0136919, + "epoch": 0.631589072720277, + "flos": 1379113606656.0, + "grad_norm": 0.014583316572648261, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.80944717, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.11962891, + "step": 3283, + "time_per_iteration": 4.897862195968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081353, + "balance_loss_mlp": 1.05183721, + "epoch": 0.6317814544055406, + "flos": 624379497984.0, + "grad_norm": 0.07926250214207341, + "language_loss": 0.82117367, + "learning_rate": 0.0003153701182180776, + "loss": 0.83198726, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.29443359, + "step": 3284, + "time_per_iteration": 2.773768186569214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085168, + "balance_loss_mlp": 1.05653346, + "epoch": 0.6319738360908042, + "flos": 497876046336.0, + "grad_norm": 0.06299610541065176, + "language_loss": 0.81832671, + "learning_rate": 0.00031508062963872655, + "loss": 0.82917833, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.28613281, + "step": 3285, + "time_per_iteration": 2.5745344161987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083768, + "balance_loss_mlp": 1.05484831, + "epoch": 0.6321662177760677, + "flos": 579474362880.0, + "grad_norm": 0.0675003916655452, + "language_loss": 0.7940349, + "learning_rate": 0.0003147912128514423, + "loss": 0.80487257, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.2890625, + "step": 3286, + "time_per_iteration": 2.736119508743286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088711, + "balance_loss_mlp": 1.05976713, + "epoch": 0.6323585994613313, + "flos": 601207460352.0, + "grad_norm": 0.055334521213686955, + "language_loss": 0.87346876, + "learning_rate": 0.0003145018679685859, + "loss": 0.88435584, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.28881836, + "step": 3287, + "time_per_iteration": 2.747880697250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083794, + "balance_loss_mlp": 1.05515993, + "epoch": 0.6325509811465948, + "flos": 528261875712.0, + "grad_norm": 0.049981399044418943, + "language_loss": 0.8773675, + "learning_rate": 0.00031421259510249134, + "loss": 0.88820541, + "num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.28637695, + "step": 3288, + "time_per_iteration": 2.828601121902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087286, + "balance_loss_mlp": 1.05898595, + "epoch": 0.6327433628318584, + "flos": 573993464832.0, + "grad_norm": 0.05983667283250032, + "language_loss": 0.81054246, + "learning_rate": 0.00031392339436546414, + "loss": 0.82141531, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.28295898, + "step": 3289, + "time_per_iteration": 2.8950355052948 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083488, + "balance_loss_mlp": 1.05599856, + "epoch": 0.632935744517122, + "flos": 516833008128.0, + "grad_norm": 0.08046321176630551, + "language_loss": 0.83522916, + "learning_rate": 0.00031363426586978205, + "loss": 0.84606409, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.27539062, + "step": 3290, + "time_per_iteration": 2.842975378036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079426, + "balance_loss_mlp": 1.05234218, + "epoch": 0.6331281262023856, + "flos": 617180262912.0, + "grad_norm": 0.06320614545402135, + "language_loss": 0.84556788, + "learning_rate": 0.0003133452097276947, + "loss": 0.85636216, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.27148438, + "step": 3291, + "time_per_iteration": 2.7399022579193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079638, + "balance_loss_mlp": 1.05174291, + "epoch": 0.633320507887649, + "flos": 592665237504.0, + "grad_norm": 0.05133484594344534, + "language_loss": 0.83828831, + "learning_rate": 0.0003130562260514238, + "loss": 0.84908473, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.27929688, + "step": 3292, + "time_per_iteration": 2.782712936401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074308, + "balance_loss_mlp": 1.04538822, + "epoch": 0.6335128895729126, + "flos": 582064233984.0, + "grad_norm": 0.05681875015952551, + "language_loss": 0.81639814, + "learning_rate": 0.0003127673149531626, + "loss": 0.82714117, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.28881836, + "step": 3293, + "time_per_iteration": 2.8035476207733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072918, + "balance_loss_mlp": 1.04454613, + "epoch": 0.6337052712581762, + "flos": 452803585536.0, + "grad_norm": 0.24840448660881664, + "language_loss": 0.82970059, + "learning_rate": 0.0003124784765450762, + "loss": 0.84042978, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.28393555, + "step": 3294, + "time_per_iteration": 2.608938694000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077527, + "balance_loss_mlp": 1.04877377, + "epoch": 0.6338976529434398, + "flos": 573132105216.0, + "grad_norm": 0.05797118879251517, + "language_loss": 0.80332613, + "learning_rate": 0.0003121897109393017, + "loss": 0.81410146, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.28759766, + "step": 3295, + "time_per_iteration": 2.806485414505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075453, + "balance_loss_mlp": 1.04710555, + "epoch": 0.6340900346287034, + "flos": 508497398784.0, + "grad_norm": 0.05731717325491985, + "language_loss": 0.89463425, + "learning_rate": 0.0003119010182479481, + "loss": 0.90538877, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.28344727, + "step": 3296, + "time_per_iteration": 2.6082053184509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072414, + "balance_loss_mlp": 1.04430485, + "epoch": 0.6342824163139669, + "flos": 479505429504.0, + "grad_norm": 0.05711828874106615, + "language_loss": 0.82742012, + "learning_rate": 0.00031161239858309563, + "loss": 0.8381443, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.28149414, + "step": 3297, + "time_per_iteration": 2.563567638397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076965, + "balance_loss_mlp": 1.04818797, + "epoch": 0.6344747979992305, + "flos": 571762976256.0, + "grad_norm": 0.06150807271743663, + "language_loss": 0.8330332, + "learning_rate": 0.0003113238520567964, + "loss": 0.84380281, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.28759766, + "step": 3298, + "time_per_iteration": 2.6396591663360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075264, + "balance_loss_mlp": 1.04760718, + "epoch": 0.634667179684494, + "flos": 605629149696.0, + "grad_norm": 0.06211731206435071, + "language_loss": 0.81525218, + "learning_rate": 0.00031103537878107403, + "loss": 0.8260048, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.27709961, + "step": 3299, + "time_per_iteration": 2.7182040214538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076081, + "balance_loss_mlp": 1.04813862, + "epoch": 0.6348595613697576, + "flos": 646649478144.0, + "grad_norm": 0.09008856802474977, + "language_loss": 0.80391061, + "learning_rate": 0.0003107469788679238, + "loss": 0.81467146, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.27978516, + "step": 3300, + "time_per_iteration": 2.7851805686950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075354, + "balance_loss_mlp": 1.04688656, + "epoch": 0.6350519430550212, + "flos": 638776558080.0, + "grad_norm": 0.05422740840370266, + "language_loss": 0.86501485, + "learning_rate": 0.00031045865242931267, + "loss": 0.87576842, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.28466797, + "step": 3301, + "time_per_iteration": 2.810676097869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075733, + "balance_loss_mlp": 1.04755139, + "epoch": 0.6352443247402847, + "flos": 686091091968.0, + "grad_norm": 0.05423287831049679, + "language_loss": 0.82804501, + "learning_rate": 0.00031017039957717877, + "loss": 0.83880234, + "num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.28149414, + "step": 3302, + "time_per_iteration": 3.0281054973602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074651, + "balance_loss_mlp": 1.0450151, + "epoch": 0.6354367064255483, + "flos": 559173003264.0, + "grad_norm": 0.05349883160058106, + "language_loss": 0.88460255, + "learning_rate": 0.0003098822204234318, + "loss": 0.89534903, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.29589844, + "step": 3303, + "time_per_iteration": 2.666997194290161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075345, + "balance_loss_mlp": 1.04713964, + "epoch": 0.6356290881108119, + "flos": 979095582720.0, + "grad_norm": 0.06555082687836872, + "language_loss": 0.87261242, + "learning_rate": 0.00030959411507995273, + "loss": 0.88336587, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.2824707, + "step": 3304, + "time_per_iteration": 3.197598457336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075163, + "balance_loss_mlp": 1.04662395, + "epoch": 0.6358214697960755, + "flos": 528005799936.0, + "grad_norm": 0.0641703169727953, + "language_loss": 0.81063581, + "learning_rate": 0.00030930608365859407, + "loss": 0.82138741, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.28540039, + "step": 3305, + "time_per_iteration": 2.6621649265289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074885, + "balance_loss_mlp": 1.04713345, + "epoch": 0.6360138514813389, + "flos": 516547819008.0, + "grad_norm": 0.049948399084256474, + "language_loss": 0.87610269, + "learning_rate": 0.00030901812627117943, + "loss": 0.88685155, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.27783203, + "step": 3306, + "time_per_iteration": 2.612919807434082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077235, + "balance_loss_mlp": 1.04826725, + "epoch": 0.6362062331666025, + "flos": 466289823744.0, + "grad_norm": 0.06317558416619916, + "language_loss": 0.84607321, + "learning_rate": 0.000308730243029504, + "loss": 0.85684562, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.28955078, + "step": 3307, + "time_per_iteration": 2.5705294609069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072567, + "balance_loss_mlp": 1.04307485, + "epoch": 0.6363986148518661, + "flos": 549458090496.0, + "grad_norm": 0.05685632301598214, + "language_loss": 0.79783237, + "learning_rate": 0.0003084424340453339, + "loss": 0.80855805, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.29443359, + "step": 3308, + "time_per_iteration": 2.807271957397461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010765, + "balance_loss_mlp": 1.04784167, + "epoch": 0.6365909965371297, + "flos": 582772824576.0, + "grad_norm": 0.05758668896734757, + "language_loss": 0.81629676, + "learning_rate": 0.0003081546994304064, + "loss": 0.82706171, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.28637695, + "step": 3309, + "time_per_iteration": 2.7554562091827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076484, + "balance_loss_mlp": 1.04794574, + "epoch": 0.6367833782223933, + "flos": 530998723584.0, + "grad_norm": 0.06449450681570038, + "language_loss": 0.81813806, + "learning_rate": 0.0003078670392964298, + "loss": 0.82890296, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.28540039, + "step": 3310, + "time_per_iteration": 2.5969130992889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075995, + "balance_loss_mlp": 1.04721737, + "epoch": 0.6369757599076568, + "flos": 569237124096.0, + "grad_norm": 0.05473972875900602, + "language_loss": 0.82840186, + "learning_rate": 0.00030757945375508406, + "loss": 0.83916187, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.28759766, + "step": 3311, + "time_per_iteration": 2.663797616958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074659, + "balance_loss_mlp": 1.04507077, + "epoch": 0.6371681415929203, + "flos": 539684951040.0, + "grad_norm": 0.0598003061946429, + "language_loss": 0.8103205, + "learning_rate": 0.00030729194291801944, + "loss": 0.82106709, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.2956543, + "step": 3312, + "time_per_iteration": 2.6541266441345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070179, + "balance_loss_mlp": 1.04099667, + "epoch": 0.6373605232781839, + "flos": 483326217216.0, + "grad_norm": 0.06742420261969287, + "language_loss": 0.77177984, + "learning_rate": 0.00030700450689685787, + "loss": 0.78248155, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.29174805, + "step": 3313, + "time_per_iteration": 2.5699706077575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071745, + "balance_loss_mlp": 1.0428009, + "epoch": 0.6375529049634475, + "flos": 578273969664.0, + "grad_norm": 0.04829069116986981, + "language_loss": 0.85252231, + "learning_rate": 0.00030671714580319186, + "loss": 0.86323977, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.28930664, + "step": 3314, + "time_per_iteration": 2.840120553970337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075513, + "balance_loss_mlp": 1.04618776, + "epoch": 0.637745286648711, + "flos": 681953181696.0, + "grad_norm": 0.06110269335032462, + "language_loss": 0.83013022, + "learning_rate": 0.0003064298597485846, + "loss": 0.84088534, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.29296875, + "step": 3315, + "time_per_iteration": 2.852611541748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073342, + "balance_loss_mlp": 1.04463601, + "epoch": 0.6379376683339746, + "flos": 504385629696.0, + "grad_norm": 0.058531862616109036, + "language_loss": 0.83941239, + "learning_rate": 0.00030614264884457054, + "loss": 0.85014582, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.28710938, + "step": 3316, + "time_per_iteration": 2.636786699295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107333, + "balance_loss_mlp": 1.04429102, + "epoch": 0.6381300500192382, + "flos": 501771027456.0, + "grad_norm": 0.06311790142040714, + "language_loss": 0.7747215, + "learning_rate": 0.000305855513202655, + "loss": 0.78545475, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.2902832, + "step": 3317, + "time_per_iteration": 2.572878837585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073954, + "balance_loss_mlp": 1.04491496, + "epoch": 0.6383224317045018, + "flos": 400271431680.0, + "grad_norm": 0.06648512772878035, + "language_loss": 0.77336514, + "learning_rate": 0.0003055684529343138, + "loss": 0.7841047, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.29052734, + "step": 3318, + "time_per_iteration": 2.4436564445495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072959, + "balance_loss_mlp": 1.04427767, + "epoch": 0.6385148133897653, + "flos": 499131694080.0, + "grad_norm": 0.17585576995025723, + "language_loss": 0.78666025, + "learning_rate": 0.00030528146815099374, + "loss": 0.79738986, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.28686523, + "step": 3319, + "time_per_iteration": 2.633169174194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073512, + "balance_loss_mlp": 1.04463935, + "epoch": 0.6387071950750288, + "flos": 527409280512.0, + "grad_norm": 0.05914219973016666, + "language_loss": 0.72023094, + "learning_rate": 0.00030499455896411203, + "loss": 0.73096609, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.28881836, + "step": 3320, + "time_per_iteration": 2.6515796184539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064633, + "balance_loss_mlp": 1.05213952, + "epoch": 0.6388995767602924, + "flos": 1455200501760.0, + "grad_norm": 0.030989551650608328, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77365446, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.125, + "step": 3321, + "time_per_iteration": 4.949177980422974 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077103, + "balance_loss_mlp": 1.04768264, + "epoch": 0.639091958445556, + "flos": 603577571328.0, + "grad_norm": 0.05124764901012802, + "language_loss": 0.76538706, + "learning_rate": 0.0003044209678251865, + "loss": 0.77615809, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.29370117, + "step": 3322, + "time_per_iteration": 2.8691534996032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082016, + "balance_loss_mlp": 1.05257154, + "epoch": 0.6392843401308196, + "flos": 584230703616.0, + "grad_norm": 0.052110264896392484, + "language_loss": 0.84702694, + "learning_rate": 0.0003041342860958306, + "loss": 0.85784709, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.29443359, + "step": 3323, + "time_per_iteration": 2.764293670654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080288, + "balance_loss_mlp": 1.0508672, + "epoch": 0.6394767218160831, + "flos": 514420637184.0, + "grad_norm": 0.06415760622420662, + "language_loss": 0.91791111, + "learning_rate": 0.00030384768040828857, + "loss": 0.92871398, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.29418945, + "step": 3324, + "time_per_iteration": 2.676239252090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083947, + "balance_loss_mlp": 1.05457401, + "epoch": 0.6396691035013466, + "flos": 541471689216.0, + "grad_norm": 0.06537046066409105, + "language_loss": 0.85248572, + "learning_rate": 0.00030356115087383094, + "loss": 0.86332518, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.29321289, + "step": 3325, + "time_per_iteration": 2.6422836780548096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108456, + "balance_loss_mlp": 1.05523491, + "epoch": 0.6398614851866102, + "flos": 525282098688.0, + "grad_norm": 0.07261726527326764, + "language_loss": 0.85094643, + "learning_rate": 0.00030327469760369803, + "loss": 0.86179203, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.29345703, + "step": 3326, + "time_per_iteration": 2.618764877319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078424, + "balance_loss_mlp": 1.04943204, + "epoch": 0.6400538668718738, + "flos": 622704830976.0, + "grad_norm": 0.06406701351791282, + "language_loss": 0.85019833, + "learning_rate": 0.0003029883207091009, + "loss": 0.86098254, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.28979492, + "step": 3327, + "time_per_iteration": 2.699650764465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078628, + "balance_loss_mlp": 1.04961252, + "epoch": 0.6402462485571374, + "flos": 503096486400.0, + "grad_norm": 0.0560194788269582, + "language_loss": 0.77876812, + "learning_rate": 0.00030270202030122095, + "loss": 0.78955448, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.29003906, + "step": 3328, + "time_per_iteration": 2.6756327152252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079179, + "balance_loss_mlp": 1.04994857, + "epoch": 0.6404386302424009, + "flos": 818894693376.0, + "grad_norm": 0.07533630521216038, + "language_loss": 0.86165637, + "learning_rate": 0.00030241579649121, + "loss": 0.87244821, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.29199219, + "step": 3329, + "time_per_iteration": 2.988523244857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081549, + "balance_loss_mlp": 1.05286741, + "epoch": 0.6406310119276645, + "flos": 471568490496.0, + "grad_norm": 0.06215732096136448, + "language_loss": 0.79335475, + "learning_rate": 0.00030212964939018994, + "loss": 0.80417025, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.28662109, + "step": 3330, + "time_per_iteration": 2.536287307739258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079251, + "balance_loss_mlp": 1.05035472, + "epoch": 0.6408233936129281, + "flos": 425358245376.0, + "grad_norm": 0.05674161193515711, + "language_loss": 0.85566485, + "learning_rate": 0.0003018435791092527, + "loss": 0.86645734, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.28857422, + "step": 3331, + "time_per_iteration": 2.4944264888763428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079855, + "balance_loss_mlp": 1.05191207, + "epoch": 0.6410157752981916, + "flos": 549522109440.0, + "grad_norm": 0.05931339185061419, + "language_loss": 0.80892223, + "learning_rate": 0.00030155758575946083, + "loss": 0.81972075, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.27954102, + "step": 3332, + "time_per_iteration": 2.6625006198883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077272, + "balance_loss_mlp": 1.04797006, + "epoch": 0.6412081569834551, + "flos": 475659910656.0, + "grad_norm": 0.054973078138002, + "language_loss": 0.83676195, + "learning_rate": 0.0003012716694518467, + "loss": 0.84753466, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.29272461, + "step": 3333, + "time_per_iteration": 2.5685575008392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077896, + "balance_loss_mlp": 1.04976213, + "epoch": 0.6414005386687187, + "flos": 540645235200.0, + "grad_norm": 0.06333005970855973, + "language_loss": 0.84833503, + "learning_rate": 0.000300985830297413, + "loss": 0.85911405, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.28149414, + "step": 3334, + "time_per_iteration": 2.7106077671051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077366, + "balance_loss_mlp": 1.04875624, + "epoch": 0.6415929203539823, + "flos": 1040909073408.0, + "grad_norm": 0.05617575604142134, + "language_loss": 0.87391257, + "learning_rate": 0.00030070006840713205, + "loss": 0.88468629, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.28613281, + "step": 3335, + "time_per_iteration": 3.390854835510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076603, + "balance_loss_mlp": 1.04868436, + "epoch": 0.6417853020392459, + "flos": 648028781568.0, + "grad_norm": 0.055765507063515254, + "language_loss": 0.73336351, + "learning_rate": 0.000300414383891947, + "loss": 0.74412954, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.27954102, + "step": 3336, + "time_per_iteration": 2.8184750080108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074814, + "balance_loss_mlp": 1.04713416, + "epoch": 0.6419776837245095, + "flos": 500639035392.0, + "grad_norm": 0.04865343351033758, + "language_loss": 0.88524318, + "learning_rate": 0.00030012877686276973, + "loss": 0.89599127, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.27709961, + "step": 3337, + "time_per_iteration": 2.693716049194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077745, + "balance_loss_mlp": 1.04925418, + "epoch": 0.642170065409773, + "flos": 620331747840.0, + "grad_norm": 0.05071900601819844, + "language_loss": 0.8653757, + "learning_rate": 0.0002998432474304832, + "loss": 0.87615323, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.28540039, + "step": 3338, + "time_per_iteration": 2.785625696182251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014062, + "balance_loss_mlp": 1.00228393, + "epoch": 0.6423624470950365, + "flos": 1422767476224.0, + "grad_norm": 0.008511369807607439, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.80251408, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.11767578, + "step": 3339, + "time_per_iteration": 4.914938688278198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072832, + "balance_loss_mlp": 1.04531896, + "epoch": 0.6425548287803001, + "flos": 562082969088.0, + "grad_norm": 0.04920072731588192, + "language_loss": 0.88676053, + "learning_rate": 0.00029927242179996107, + "loss": 0.89748889, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.27539062, + "step": 3340, + "time_per_iteration": 2.6910037994384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075049, + "balance_loss_mlp": 1.04691517, + "epoch": 0.6427472104655637, + "flos": 585151699968.0, + "grad_norm": 0.050397080981132346, + "language_loss": 0.83332348, + "learning_rate": 0.0002989871258233398, + "loss": 0.84407395, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.28149414, + "step": 3341, + "time_per_iteration": 2.7581868171691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082012, + "balance_loss_mlp": 1.05337822, + "epoch": 0.6429395921508272, + "flos": 404067488256.0, + "grad_norm": 0.07038127558443963, + "language_loss": 0.82547259, + "learning_rate": 0.0002987019078868373, + "loss": 0.83629274, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.28613281, + "step": 3342, + "time_per_iteration": 2.4203991889953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075866, + "balance_loss_mlp": 1.04792297, + "epoch": 0.6431319738360908, + "flos": 548522537472.0, + "grad_norm": 0.05404588481803156, + "language_loss": 0.81465191, + "learning_rate": 0.00029841676810118484, + "loss": 0.8254106, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.27978516, + "step": 3343, + "time_per_iteration": 2.665461778640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073266, + "balance_loss_mlp": 1.04489374, + "epoch": 0.6433243555213544, + "flos": 793044034560.0, + "grad_norm": 0.05709994868865375, + "language_loss": 0.8727839, + "learning_rate": 0.0002981317065770839, + "loss": 0.88351655, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.28344727, + "step": 3344, + "time_per_iteration": 3.0409646034240723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074581, + "balance_loss_mlp": 1.04592359, + "epoch": 0.643516737206618, + "flos": 582762650112.0, + "grad_norm": 0.0669931178788996, + "language_loss": 0.80771047, + "learning_rate": 0.00029784672342520493, + "loss": 0.81845629, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.28662109, + "step": 3345, + "time_per_iteration": 2.69077730178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073767, + "balance_loss_mlp": 1.04541922, + "epoch": 0.6437091188918815, + "flos": 518501882880.0, + "grad_norm": 0.058634487951654345, + "language_loss": 0.83929563, + "learning_rate": 0.00029756181875618834, + "loss": 0.85003328, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.28369141, + "step": 3346, + "time_per_iteration": 2.5735673904418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107364, + "balance_loss_mlp": 1.04541159, + "epoch": 0.643901500577145, + "flos": 384736587264.0, + "grad_norm": 0.06920918115326812, + "language_loss": 0.83749354, + "learning_rate": 0.0002972769926806439, + "loss": 0.84823, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.28222656, + "step": 3347, + "time_per_iteration": 2.480320692062378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071427, + "balance_loss_mlp": 1.04248285, + "epoch": 0.6440938822624086, + "flos": 483478986240.0, + "grad_norm": 0.05946244063191617, + "language_loss": 0.88425148, + "learning_rate": 0.0002969922453091508, + "loss": 0.89496571, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.28930664, + "step": 3348, + "time_per_iteration": 2.5937469005584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107372, + "balance_loss_mlp": 1.04441822, + "epoch": 0.6442862639476722, + "flos": 540178163712.0, + "grad_norm": 0.04841561291850138, + "language_loss": 0.84831715, + "learning_rate": 0.00029670757675225777, + "loss": 0.85905439, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.29248047, + "step": 3349, + "time_per_iteration": 2.7379231452941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076606, + "balance_loss_mlp": 1.04754305, + "epoch": 0.6444786456329358, + "flos": 526651227648.0, + "grad_norm": 0.058104314548796505, + "language_loss": 0.79157209, + "learning_rate": 0.0002964229871204831, + "loss": 0.80233824, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.2902832, + "step": 3350, + "time_per_iteration": 2.6757731437683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076273, + "balance_loss_mlp": 1.04663801, + "epoch": 0.6446710273181993, + "flos": 697576776192.0, + "grad_norm": 0.06774074305303925, + "language_loss": 0.83398223, + "learning_rate": 0.00029613847652431403, + "loss": 0.84474498, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.29614258, + "step": 3351, + "time_per_iteration": 2.905512571334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072846, + "balance_loss_mlp": 1.04409289, + "epoch": 0.6448634090034628, + "flos": 624705384960.0, + "grad_norm": 0.05155589011440517, + "language_loss": 0.79040021, + "learning_rate": 0.0002958540450742078, + "loss": 0.80112863, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.28735352, + "step": 3352, + "time_per_iteration": 2.929170846939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070119, + "balance_loss_mlp": 1.04026914, + "epoch": 0.6450557906887264, + "flos": 600647256576.0, + "grad_norm": 0.05063101037277444, + "language_loss": 0.77325773, + "learning_rate": 0.0002955696928805901, + "loss": 0.78395891, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.2980957, + "step": 3353, + "time_per_iteration": 2.881626605987549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107236, + "balance_loss_mlp": 1.04229498, + "epoch": 0.64524817237399, + "flos": 645905981952.0, + "grad_norm": 0.059706275301968766, + "language_loss": 0.86282456, + "learning_rate": 0.0002952854200538563, + "loss": 0.87354815, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.30004883, + "step": 3354, + "time_per_iteration": 2.8391265869140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070707, + "balance_loss_mlp": 1.04047608, + "epoch": 0.6454405540592536, + "flos": 473173346304.0, + "grad_norm": 0.08701934847838336, + "language_loss": 0.81666923, + "learning_rate": 0.000295001226704371, + "loss": 0.82737631, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.30175781, + "step": 3355, + "time_per_iteration": 2.598177194595337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074307, + "balance_loss_mlp": 1.0440042, + "epoch": 0.6456329357445171, + "flos": 611548005888.0, + "grad_norm": 0.06424201750770815, + "language_loss": 0.82413089, + "learning_rate": 0.00029471711294246783, + "loss": 0.83487391, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.30273438, + "step": 3356, + "time_per_iteration": 2.813361644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069796, + "balance_loss_mlp": 1.03880155, + "epoch": 0.6458253174297807, + "flos": 731373138432.0, + "grad_norm": 0.06119276712520419, + "language_loss": 0.82436061, + "learning_rate": 0.0002944330788784494, + "loss": 0.83505857, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.30957031, + "step": 3357, + "time_per_iteration": 2.8810949325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073631, + "balance_loss_mlp": 1.04399514, + "epoch": 0.6460176991150443, + "flos": 570129007104.0, + "grad_norm": 0.06225888545708514, + "language_loss": 0.84205008, + "learning_rate": 0.00029414912462258786, + "loss": 0.8527863, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.29614258, + "step": 3358, + "time_per_iteration": 2.827125310897827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074894, + "balance_loss_mlp": 1.0442096, + "epoch": 0.6462100808003078, + "flos": 582890688000.0, + "grad_norm": 0.06476670861286221, + "language_loss": 0.81335187, + "learning_rate": 0.00029386525028512366, + "loss": 0.82410085, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.30664062, + "step": 3359, + "time_per_iteration": 2.750802993774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072661, + "balance_loss_mlp": 1.04195285, + "epoch": 0.6464024624855714, + "flos": 483647721984.0, + "grad_norm": 0.05574217129277394, + "language_loss": 0.86898518, + "learning_rate": 0.0002935814559762666, + "loss": 0.87971175, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.30664062, + "step": 3360, + "time_per_iteration": 2.778729200363159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071986, + "balance_loss_mlp": 1.04125416, + "epoch": 0.6465948441708349, + "flos": 527508205056.0, + "grad_norm": 0.05463243527184519, + "language_loss": 0.79309767, + "learning_rate": 0.0002932977418061957, + "loss": 0.80381751, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.30712891, + "step": 3361, + "time_per_iteration": 2.636300563812256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072531, + "balance_loss_mlp": 1.04284823, + "epoch": 0.6467872258560985, + "flos": 669121689600.0, + "grad_norm": 0.06447019250914547, + "language_loss": 0.80627209, + "learning_rate": 0.00029301410788505833, + "loss": 0.81699741, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.29638672, + "step": 3362, + "time_per_iteration": 2.7907180786132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071004, + "balance_loss_mlp": 1.04127288, + "epoch": 0.6469796075413621, + "flos": 431867828736.0, + "grad_norm": 0.06442175719622328, + "language_loss": 0.81014264, + "learning_rate": 0.00029273055432297126, + "loss": 0.8208527, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.29711914, + "step": 3363, + "time_per_iteration": 2.5577244758605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068782, + "balance_loss_mlp": 1.03835917, + "epoch": 0.6471719892266257, + "flos": 803413693440.0, + "grad_norm": 0.055871885274250355, + "language_loss": 0.80490357, + "learning_rate": 0.00029244708123001917, + "loss": 0.81559139, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.30395508, + "step": 3364, + "time_per_iteration": 2.938917636871338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065549, + "balance_loss_mlp": 1.0347929, + "epoch": 0.6473643709118891, + "flos": 576923779584.0, + "grad_norm": 0.060913516619686706, + "language_loss": 0.84265661, + "learning_rate": 0.0002921636887162565, + "loss": 0.85331213, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.30737305, + "step": 3365, + "time_per_iteration": 2.7420175075531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067245, + "balance_loss_mlp": 1.03718054, + "epoch": 0.6475567525971527, + "flos": 761079490560.0, + "grad_norm": 0.07220364495800281, + "language_loss": 0.84047341, + "learning_rate": 0.00029188037689170595, + "loss": 0.85114586, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.30029297, + "step": 3366, + "time_per_iteration": 2.941958427429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070259, + "balance_loss_mlp": 1.04026556, + "epoch": 0.6477491342824163, + "flos": 842754972672.0, + "grad_norm": 0.0698232037755488, + "language_loss": 0.84047693, + "learning_rate": 0.0002915971458663586, + "loss": 0.85117948, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.29931641, + "step": 3367, + "time_per_iteration": 3.0588743686676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064684, + "balance_loss_mlp": 1.03507257, + "epoch": 0.6479415159676799, + "flos": 884431457280.0, + "grad_norm": 0.048093531739852514, + "language_loss": 0.81804395, + "learning_rate": 0.00029131399575017494, + "loss": 0.82869077, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.2956543, + "step": 3368, + "time_per_iteration": 3.194119691848755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065444, + "balance_loss_mlp": 1.03576088, + "epoch": 0.6481338976529435, + "flos": 615211642368.0, + "grad_norm": 0.05082024761534885, + "language_loss": 0.85855007, + "learning_rate": 0.0002910309266530836, + "loss": 0.86920446, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.29638672, + "step": 3369, + "time_per_iteration": 2.7995903491973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069305, + "balance_loss_mlp": 1.03943157, + "epoch": 0.648326279338207, + "flos": 509757428736.0, + "grad_norm": 0.06123820960940181, + "language_loss": 0.85307527, + "learning_rate": 0.0002907479386849814, + "loss": 0.86376828, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.2980957, + "step": 3370, + "time_per_iteration": 2.6561813354492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070145, + "balance_loss_mlp": 1.03969884, + "epoch": 0.6485186610234706, + "flos": 702157026816.0, + "grad_norm": 0.06023552594522319, + "language_loss": 0.8010959, + "learning_rate": 0.0002904650319557339, + "loss": 0.81179738, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.30395508, + "step": 3371, + "time_per_iteration": 3.0036118030548096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069259, + "balance_loss_mlp": 1.03967094, + "epoch": 0.6487110427087341, + "flos": 560418476544.0, + "grad_norm": 0.06478850515629742, + "language_loss": 0.81106675, + "learning_rate": 0.0002901822065751758, + "loss": 0.82175934, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.29541016, + "step": 3372, + "time_per_iteration": 2.6287784576416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072808, + "balance_loss_mlp": 1.0429343, + "epoch": 0.6489034243939977, + "flos": 679801268736.0, + "grad_norm": 0.0516174175681091, + "language_loss": 0.854002, + "learning_rate": 0.0002898994626531093, + "loss": 0.86473012, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.29833984, + "step": 3373, + "time_per_iteration": 2.84863543510437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071305, + "balance_loss_mlp": 1.04181266, + "epoch": 0.6490958060792612, + "flos": 474172918272.0, + "grad_norm": 0.07661916167941812, + "language_loss": 0.88111019, + "learning_rate": 0.00028961680029930526, + "loss": 0.89182317, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.29443359, + "step": 3374, + "time_per_iteration": 2.5185511112213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068928, + "balance_loss_mlp": 1.03965008, + "epoch": 0.6492881877645248, + "flos": 588563642880.0, + "grad_norm": 0.05286852382904046, + "language_loss": 0.76929349, + "learning_rate": 0.00028933421962350317, + "loss": 0.77998275, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.29248047, + "step": 3375, + "time_per_iteration": 2.7406935691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071172, + "balance_loss_mlp": 1.04020166, + "epoch": 0.6494805694497884, + "flos": 642139038720.0, + "grad_norm": 0.05602089532541189, + "language_loss": 0.84000719, + "learning_rate": 0.0002890517207354104, + "loss": 0.85071886, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.30932617, + "step": 3376, + "time_per_iteration": 2.8145668506622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072679, + "balance_loss_mlp": 1.04263854, + "epoch": 0.649672951135052, + "flos": 531550162944.0, + "grad_norm": 0.05675413090178792, + "language_loss": 0.81828344, + "learning_rate": 0.0002887693037447029, + "loss": 0.82901019, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.30004883, + "step": 3377, + "time_per_iteration": 2.6432199478149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070436, + "balance_loss_mlp": 1.04082441, + "epoch": 0.6498653328203156, + "flos": 547124295168.0, + "grad_norm": 0.05935135112647285, + "language_loss": 0.82021838, + "learning_rate": 0.00028848696876102443, + "loss": 0.83092278, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.29541016, + "step": 3378, + "time_per_iteration": 2.6862215995788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065633, + "balance_loss_mlp": 1.03473437, + "epoch": 0.650057714505579, + "flos": 461996172288.0, + "grad_norm": 0.06179409995476596, + "language_loss": 0.83523512, + "learning_rate": 0.00028820471589398723, + "loss": 0.84589148, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.30859375, + "step": 3379, + "time_per_iteration": 2.5718047618865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070203, + "balance_loss_mlp": 1.03970945, + "epoch": 0.6502500961908426, + "flos": 509905815552.0, + "grad_norm": 0.06289552232740542, + "language_loss": 0.77402478, + "learning_rate": 0.00028792254525317196, + "loss": 0.78472686, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.30493164, + "step": 3380, + "time_per_iteration": 2.779308795928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071743, + "balance_loss_mlp": 1.0420599, + "epoch": 0.6504424778761062, + "flos": 579557320704.0, + "grad_norm": 0.05486106257478186, + "language_loss": 0.81240368, + "learning_rate": 0.00028764045694812645, + "loss": 0.82312119, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.29638672, + "step": 3381, + "time_per_iteration": 2.7430598735809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010701, + "balance_loss_mlp": 1.03936744, + "epoch": 0.6506348595613698, + "flos": 519206091264.0, + "grad_norm": 0.061364553922665516, + "language_loss": 0.76195431, + "learning_rate": 0.0002873584510883671, + "loss": 0.77265531, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.30688477, + "step": 3382, + "time_per_iteration": 2.575998306274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071659, + "balance_loss_mlp": 1.04085565, + "epoch": 0.6508272412466333, + "flos": 510048410112.0, + "grad_norm": 0.0719487575879366, + "language_loss": 0.85928071, + "learning_rate": 0.0002870765277833788, + "loss": 0.86999726, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.30761719, + "step": 3383, + "time_per_iteration": 2.7900807857513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069354, + "balance_loss_mlp": 1.03790629, + "epoch": 0.6510196229318969, + "flos": 625329607680.0, + "grad_norm": 0.06613356509687102, + "language_loss": 0.80323064, + "learning_rate": 0.00028679468714261347, + "loss": 0.81392419, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.31445312, + "step": 3384, + "time_per_iteration": 2.7730093002319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071035, + "balance_loss_mlp": 1.04132867, + "epoch": 0.6512120046171604, + "flos": 474453725184.0, + "grad_norm": 0.06288254960309916, + "language_loss": 0.76734459, + "learning_rate": 0.0002865129292754918, + "loss": 0.77805495, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.29663086, + "step": 3385, + "time_per_iteration": 2.6205520629882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075067, + "balance_loss_mlp": 1.04500234, + "epoch": 0.651404386302424, + "flos": 551561951232.0, + "grad_norm": 0.05411679726730615, + "language_loss": 0.81513727, + "learning_rate": 0.00028623125429140105, + "loss": 0.82588792, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.30004883, + "step": 3386, + "time_per_iteration": 2.88822340965271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067519, + "balance_loss_mlp": 1.03826463, + "epoch": 0.6515967679876876, + "flos": 523047227904.0, + "grad_norm": 0.05765553092239875, + "language_loss": 0.87005818, + "learning_rate": 0.00028594966229969785, + "loss": 0.88073337, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.29223633, + "step": 3387, + "time_per_iteration": 2.6889727115631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074347, + "balance_loss_mlp": 1.04413986, + "epoch": 0.6517891496729511, + "flos": 573590412288.0, + "grad_norm": 0.05935709634506938, + "language_loss": 0.81214345, + "learning_rate": 0.00028566815340970577, + "loss": 0.82288694, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.30151367, + "step": 3388, + "time_per_iteration": 2.7500782012939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107152, + "balance_loss_mlp": 1.04195595, + "epoch": 0.6519815313582147, + "flos": 555662135808.0, + "grad_norm": 0.058132495029724875, + "language_loss": 0.8099978, + "learning_rate": 0.0002853867277307162, + "loss": 0.82071304, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.29516602, + "step": 3389, + "time_per_iteration": 2.628153085708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072178, + "balance_loss_mlp": 1.04399705, + "epoch": 0.6521739130434783, + "flos": 480229986816.0, + "grad_norm": 0.062440592290717876, + "language_loss": 0.82432795, + "learning_rate": 0.00028510538537198824, + "loss": 0.83504969, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.28198242, + "step": 3390, + "time_per_iteration": 2.6273562908172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076807, + "balance_loss_mlp": 1.04805326, + "epoch": 0.6523662947287419, + "flos": 665380887552.0, + "grad_norm": 0.0630008208317628, + "language_loss": 0.86511409, + "learning_rate": 0.00028482412644274867, + "loss": 0.87588215, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.28759766, + "step": 3391, + "time_per_iteration": 2.986837148666382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073216, + "balance_loss_mlp": 1.04479647, + "epoch": 0.6525586764140053, + "flos": 548394499584.0, + "grad_norm": 0.07544653210913753, + "language_loss": 0.74115705, + "learning_rate": 0.00028454295105219207, + "loss": 0.75188923, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.28417969, + "step": 3392, + "time_per_iteration": 2.6882169246673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077343, + "balance_loss_mlp": 1.04837489, + "epoch": 0.6527510580992689, + "flos": 802529012736.0, + "grad_norm": 0.044597775660838994, + "language_loss": 0.79517299, + "learning_rate": 0.0002842618593094802, + "loss": 0.80594641, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.28979492, + "step": 3393, + "time_per_iteration": 3.160513401031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076464, + "balance_loss_mlp": 1.04785347, + "epoch": 0.6529434397845325, + "flos": 670864757760.0, + "grad_norm": 0.0655151623947296, + "language_loss": 0.80225992, + "learning_rate": 0.00028398085132374243, + "loss": 0.81302458, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.28588867, + "step": 3394, + "time_per_iteration": 2.799607753753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076411, + "balance_loss_mlp": 1.04861116, + "epoch": 0.6531358214697961, + "flos": 828043610112.0, + "grad_norm": 0.057447645264245936, + "language_loss": 0.83968282, + "learning_rate": 0.0002836999272040761, + "loss": 0.85044694, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.27832031, + "step": 3395, + "time_per_iteration": 3.1404569149017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076476, + "balance_loss_mlp": 1.04753208, + "epoch": 0.6533282031550597, + "flos": 487157179392.0, + "grad_norm": 0.07221192979592671, + "language_loss": 0.83835298, + "learning_rate": 0.00028341908705954575, + "loss": 0.84911776, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.28955078, + "step": 3396, + "time_per_iteration": 2.586735248565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024797, + "balance_loss_mlp": 1.01340032, + "epoch": 0.6535205848403232, + "flos": 1556908121088.0, + "grad_norm": 0.010103591992015052, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82786608, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.11376953, + "step": 3397, + "time_per_iteration": 4.801388740539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076371, + "balance_loss_mlp": 1.04754591, + "epoch": 0.6537129665255867, + "flos": 493464531456.0, + "grad_norm": 0.06325367812107179, + "language_loss": 0.78003663, + "learning_rate": 0.00028285765913198604, + "loss": 0.79080033, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.2878418, + "step": 3398, + "time_per_iteration": 2.583195209503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073367, + "balance_loss_mlp": 1.04530561, + "epoch": 0.6539053482108503, + "flos": 604718327808.0, + "grad_norm": 0.055960254103937936, + "language_loss": 0.81894422, + "learning_rate": 0.0002825770715669227, + "loss": 0.82967794, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.28076172, + "step": 3399, + "time_per_iteration": 2.706880569458008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075602, + "balance_loss_mlp": 1.04842257, + "epoch": 0.6540977298961139, + "flos": 577504332288.0, + "grad_norm": 0.06150139712068683, + "language_loss": 0.80872452, + "learning_rate": 0.00028229656841292634, + "loss": 0.81948054, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.2722168, + "step": 3400, + "time_per_iteration": 2.6799252033233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075202, + "balance_loss_mlp": 1.04687786, + "epoch": 0.6542901115813774, + "flos": 511500496896.0, + "grad_norm": 0.0638413236687058, + "language_loss": 0.76758403, + "learning_rate": 0.0002820161497788979, + "loss": 0.77833605, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.28320312, + "step": 3401, + "time_per_iteration": 2.6050028800964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074659, + "balance_loss_mlp": 1.04712176, + "epoch": 0.654482493266641, + "flos": 625201569792.0, + "grad_norm": 0.051478933847507014, + "language_loss": 0.87136239, + "learning_rate": 0.00028173581577370545, + "loss": 0.88210893, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.27563477, + "step": 3402, + "time_per_iteration": 2.7428696155548096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107348, + "balance_loss_mlp": 1.04618084, + "epoch": 0.6546748749519046, + "flos": 523712148480.0, + "grad_norm": 0.05196967996925013, + "language_loss": 0.79016143, + "learning_rate": 0.0002814555665061844, + "loss": 0.80089623, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.2734375, + "step": 3403, + "time_per_iteration": 2.68853759765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076544, + "balance_loss_mlp": 1.04914951, + "epoch": 0.6548672566371682, + "flos": 478945225728.0, + "grad_norm": 0.06812490536784549, + "language_loss": 0.77581179, + "learning_rate": 0.00028117540208513715, + "loss": 0.78657722, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.27416992, + "step": 3404, + "time_per_iteration": 2.668957233428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074239, + "balance_loss_mlp": 1.0468924, + "epoch": 0.6550596383224317, + "flos": 615732558336.0, + "grad_norm": 0.06109241421727743, + "language_loss": 0.85329819, + "learning_rate": 0.00028089532261933313, + "loss": 0.86404049, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.27368164, + "step": 3405, + "time_per_iteration": 2.764646053314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077427, + "balance_loss_mlp": 1.04910326, + "epoch": 0.6552520200076952, + "flos": 488594709504.0, + "grad_norm": 0.07801432785219843, + "language_loss": 0.85569102, + "learning_rate": 0.0002806153282175087, + "loss": 0.86646521, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.28369141, + "step": 3406, + "time_per_iteration": 2.612542152404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073707, + "balance_loss_mlp": 1.04547811, + "epoch": 0.6554444016929588, + "flos": 687310424064.0, + "grad_norm": 0.06580250942385472, + "language_loss": 0.82821441, + "learning_rate": 0.0002803354189883679, + "loss": 0.83895147, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.28222656, + "step": 3407, + "time_per_iteration": 2.8573250770568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079984, + "balance_loss_mlp": 1.0526377, + "epoch": 0.6556367833782224, + "flos": 542772417024.0, + "grad_norm": 0.04760286447801195, + "language_loss": 0.8549965, + "learning_rate": 0.00028005559504058053, + "loss": 0.86579633, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.27392578, + "step": 3408, + "time_per_iteration": 2.723130941390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075013, + "balance_loss_mlp": 1.04623616, + "epoch": 0.655829165063486, + "flos": 673237840896.0, + "grad_norm": 0.05982952663886069, + "language_loss": 0.76448226, + "learning_rate": 0.0002797758564827838, + "loss": 0.77523243, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.28759766, + "step": 3409, + "time_per_iteration": 2.8227314949035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077669, + "balance_loss_mlp": 1.04989326, + "epoch": 0.6560215467487496, + "flos": 531550162944.0, + "grad_norm": 0.0665853509575856, + "language_loss": 0.83799911, + "learning_rate": 0.0002794962034235824, + "loss": 0.8487758, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.27783203, + "step": 3410, + "time_per_iteration": 2.6031951904296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071217, + "balance_loss_mlp": 1.04303622, + "epoch": 0.656213928434013, + "flos": 591025476096.0, + "grad_norm": 0.05829437169655771, + "language_loss": 0.74215448, + "learning_rate": 0.00027921663597154695, + "loss": 0.75286669, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.28198242, + "step": 3411, + "time_per_iteration": 2.735642910003662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077612, + "balance_loss_mlp": 1.04981232, + "epoch": 0.6564063101192766, + "flos": 415564756992.0, + "grad_norm": 0.0845273006742278, + "language_loss": 0.8108443, + "learning_rate": 0.00027893715423521525, + "loss": 0.8216204, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.27832031, + "step": 3412, + "time_per_iteration": 2.4407780170440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079935, + "balance_loss_mlp": 1.05134881, + "epoch": 0.6565986918045402, + "flos": 453084392448.0, + "grad_norm": 0.06735556448920854, + "language_loss": 0.83940005, + "learning_rate": 0.00027865775832309163, + "loss": 0.85019946, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.28564453, + "step": 3413, + "time_per_iteration": 2.6473381519317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076667, + "balance_loss_mlp": 1.04870033, + "epoch": 0.6567910734898038, + "flos": 547483677696.0, + "grad_norm": 0.0593593517708546, + "language_loss": 0.85890168, + "learning_rate": 0.00027837844834364733, + "loss": 0.86966836, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.27978516, + "step": 3414, + "time_per_iteration": 2.632337808609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074793, + "balance_loss_mlp": 1.04663622, + "epoch": 0.6569834551750673, + "flos": 655207667712.0, + "grad_norm": 0.056143783747438114, + "language_loss": 0.86344767, + "learning_rate": 0.00027809922440532, + "loss": 0.87419558, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.28173828, + "step": 3415, + "time_per_iteration": 2.8158276081085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070852, + "balance_loss_mlp": 1.04152656, + "epoch": 0.6571758368603309, + "flos": 539399761920.0, + "grad_norm": 0.052293686608573205, + "language_loss": 0.80653661, + "learning_rate": 0.00027782008661651406, + "loss": 0.81724513, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.29272461, + "step": 3416, + "time_per_iteration": 2.769740104675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075321, + "balance_loss_mlp": 1.04706836, + "epoch": 0.6573682185455945, + "flos": 497088880128.0, + "grad_norm": 0.047338775202516, + "language_loss": 0.87086004, + "learning_rate": 0.00027754103508560013, + "loss": 0.88161325, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.2824707, + "step": 3417, + "time_per_iteration": 2.5982823371887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070746, + "balance_loss_mlp": 1.04204035, + "epoch": 0.657560600230858, + "flos": 447244111872.0, + "grad_norm": 0.07606703809766882, + "language_loss": 0.82847452, + "learning_rate": 0.0002772620699209163, + "loss": 0.83918196, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.28686523, + "step": 3418, + "time_per_iteration": 2.5715713500976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072273, + "balance_loss_mlp": 1.04387712, + "epoch": 0.6577529819161216, + "flos": 481696630272.0, + "grad_norm": 0.06477726519797523, + "language_loss": 0.79822147, + "learning_rate": 0.0002769831912307658, + "loss": 0.80894423, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.28393555, + "step": 3419, + "time_per_iteration": 2.554229974746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081387, + "balance_loss_mlp": 1.05339622, + "epoch": 0.6579453636013851, + "flos": 530589878784.0, + "grad_norm": 0.06482840979987209, + "language_loss": 0.80168855, + "learning_rate": 0.00027670439912341917, + "loss": 0.81250238, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.2800293, + "step": 3420, + "time_per_iteration": 2.6077942848205566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083063, + "balance_loss_mlp": 1.05385685, + "epoch": 0.6581377452866487, + "flos": 627737596416.0, + "grad_norm": 0.062198061395391364, + "language_loss": 0.83608246, + "learning_rate": 0.0002764256937071129, + "loss": 0.8469131, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.29199219, + "step": 3421, + "time_per_iteration": 2.7814555168151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079993, + "balance_loss_mlp": 1.0516932, + "epoch": 0.6583301269719123, + "flos": 548355211776.0, + "grad_norm": 0.06741584728715999, + "language_loss": 0.87078255, + "learning_rate": 0.00027614707509005036, + "loss": 0.8815825, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.28320312, + "step": 3422, + "time_per_iteration": 2.6582610607147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080132, + "balance_loss_mlp": 1.05216599, + "epoch": 0.6585225086571759, + "flos": 427268639232.0, + "grad_norm": 0.05422221992549149, + "language_loss": 0.79046404, + "learning_rate": 0.0002758685433804008, + "loss": 0.80126542, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.2800293, + "step": 3423, + "time_per_iteration": 2.518541097640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080526, + "balance_loss_mlp": 1.05196333, + "epoch": 0.6587148903424394, + "flos": 859264657920.0, + "grad_norm": 0.07879518089190286, + "language_loss": 0.79578894, + "learning_rate": 0.00027559009868630005, + "loss": 0.80659419, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.28564453, + "step": 3424, + "time_per_iteration": 3.0996036529541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079504, + "balance_loss_mlp": 1.0518713, + "epoch": 0.6589072720277029, + "flos": 805280417280.0, + "grad_norm": 0.05918528826128724, + "language_loss": 0.79852736, + "learning_rate": 0.0002753117411158491, + "loss": 0.80932236, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.27661133, + "step": 3425, + "time_per_iteration": 3.0297467708587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082154, + "balance_loss_mlp": 1.05392551, + "epoch": 0.6590996537129665, + "flos": 548355211776.0, + "grad_norm": 0.05414938091888711, + "language_loss": 0.89781225, + "learning_rate": 0.0002750334707771168, + "loss": 0.90863383, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.2824707, + "step": 3426, + "time_per_iteration": 2.639045476913452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082665, + "balance_loss_mlp": 1.05364943, + "epoch": 0.6592920353982301, + "flos": 453931195392.0, + "grad_norm": 0.06850883476210408, + "language_loss": 0.8080318, + "learning_rate": 0.0002747552877781369, + "loss": 0.81885844, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.28979492, + "step": 3427, + "time_per_iteration": 2.49623966217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077382, + "balance_loss_mlp": 1.04967833, + "epoch": 0.6594844170834937, + "flos": 566903328768.0, + "grad_norm": 0.05956339540339285, + "language_loss": 0.81955504, + "learning_rate": 0.0002744771922269097, + "loss": 0.83032882, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.27709961, + "step": 3428, + "time_per_iteration": 2.730713129043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071709, + "balance_loss_mlp": 1.04276502, + "epoch": 0.6596767987687572, + "flos": 1187452016640.0, + "grad_norm": 0.06328482299945191, + "language_loss": 0.82119536, + "learning_rate": 0.0002741991842314015, + "loss": 0.83191252, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.28930664, + "step": 3429, + "time_per_iteration": 3.479928970336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072277, + "balance_loss_mlp": 1.04433429, + "epoch": 0.6598691804540208, + "flos": 503247845376.0, + "grad_norm": 0.05605661810668252, + "language_loss": 0.85796869, + "learning_rate": 0.0002739212638995445, + "loss": 0.86869144, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.27954102, + "step": 3430, + "time_per_iteration": 2.606570243835449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074192, + "balance_loss_mlp": 1.04579639, + "epoch": 0.6600615621392844, + "flos": 531072916992.0, + "grad_norm": 0.06049343964764478, + "language_loss": 0.82845342, + "learning_rate": 0.00027364343133923696, + "loss": 0.83919537, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.28393555, + "step": 3431, + "time_per_iteration": 2.670698642730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010757, + "balance_loss_mlp": 1.04632664, + "epoch": 0.6602539438245479, + "flos": 565170435072.0, + "grad_norm": 0.060306061289427934, + "language_loss": 0.8290168, + "learning_rate": 0.0002733656866583431, + "loss": 0.83977377, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.29321289, + "step": 3432, + "time_per_iteration": 2.6917898654937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107317, + "balance_loss_mlp": 1.04413056, + "epoch": 0.6604463255098114, + "flos": 856802824704.0, + "grad_norm": 0.07899452936934231, + "language_loss": 0.83071327, + "learning_rate": 0.0002730880299646927, + "loss": 0.84144497, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.2902832, + "step": 3433, + "time_per_iteration": 3.028512954711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068262, + "balance_loss_mlp": 1.03898394, + "epoch": 0.660638707195075, + "flos": 674158837248.0, + "grad_norm": 0.05867349384550741, + "language_loss": 0.85263318, + "learning_rate": 0.0002728104613660821, + "loss": 0.86331582, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.29272461, + "step": 3434, + "time_per_iteration": 2.8600428104400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107468, + "balance_loss_mlp": 1.04666591, + "epoch": 0.6608310888803386, + "flos": 888572339712.0, + "grad_norm": 0.08754685065456504, + "language_loss": 0.82922065, + "learning_rate": 0.0002725329809702729, + "loss": 0.83996743, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.28051758, + "step": 3435, + "time_per_iteration": 3.2159268856048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068471, + "balance_loss_mlp": 1.04002786, + "epoch": 0.6610234705656022, + "flos": 1135909260288.0, + "grad_norm": 0.06770839009461412, + "language_loss": 0.76381433, + "learning_rate": 0.0002722555888849921, + "loss": 0.77449906, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.28417969, + "step": 3436, + "time_per_iteration": 3.435774564743042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071105, + "balance_loss_mlp": 1.04297185, + "epoch": 0.6612158522508658, + "flos": 467776816128.0, + "grad_norm": 0.05996981510942144, + "language_loss": 0.8029291, + "learning_rate": 0.00027197828521793334, + "loss": 0.81364018, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.28125, + "step": 3437, + "time_per_iteration": 2.5626087188720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010765, + "balance_loss_mlp": 1.04681671, + "epoch": 0.6614082339361292, + "flos": 571374480384.0, + "grad_norm": 0.059440388308285685, + "language_loss": 0.84535551, + "learning_rate": 0.0002717010700767552, + "loss": 0.85612053, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.29614258, + "step": 3438, + "time_per_iteration": 2.74114990234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071656, + "balance_loss_mlp": 1.04254496, + "epoch": 0.6616006156213928, + "flos": 498220872192.0, + "grad_norm": 0.07105561276386183, + "language_loss": 0.7574169, + "learning_rate": 0.00027142394356908226, + "loss": 0.76813346, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.29077148, + "step": 3439, + "time_per_iteration": 2.5588772296905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107167, + "balance_loss_mlp": 1.04289341, + "epoch": 0.6617929973066564, + "flos": 602124074496.0, + "grad_norm": 0.061991918055260026, + "language_loss": 0.84383535, + "learning_rate": 0.00027114690580250456, + "loss": 0.85455203, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.2878418, + "step": 3440, + "time_per_iteration": 2.770521879196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068436, + "balance_loss_mlp": 1.03996921, + "epoch": 0.66198537899192, + "flos": 522731515392.0, + "grad_norm": 0.055271996541099454, + "language_loss": 0.8711971, + "learning_rate": 0.0002708699568845776, + "loss": 0.88188148, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.28466797, + "step": 3441, + "time_per_iteration": 2.634669303894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020343, + "balance_loss_mlp": 1.00923228, + "epoch": 0.6621777606771835, + "flos": 1565421230592.0, + "grad_norm": 0.011806654304651203, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 0.80308127, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.11132812, + "step": 3442, + "time_per_iteration": 4.947353363037109 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074491, + "balance_loss_mlp": 1.04609489, + "epoch": 0.6623701423624471, + "flos": 526409708544.0, + "grad_norm": 0.055374659837301436, + "language_loss": 0.82784879, + "learning_rate": 0.0002703163260247261, + "loss": 0.83859372, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.28369141, + "step": 3443, + "time_per_iteration": 2.664637804031372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069476, + "balance_loss_mlp": 1.04041255, + "epoch": 0.6625625240477107, + "flos": 527921432064.0, + "grad_norm": 0.06501168506799739, + "language_loss": 0.81707942, + "learning_rate": 0.0002700396442977399, + "loss": 0.82777417, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.2902832, + "step": 3444, + "time_per_iteration": 2.616928815841675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069297, + "balance_loss_mlp": 1.04049635, + "epoch": 0.6627549057329742, + "flos": 472854661632.0, + "grad_norm": 0.054380463480794276, + "language_loss": 0.84038997, + "learning_rate": 0.0002697630518492817, + "loss": 0.85108292, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.28833008, + "step": 3445, + "time_per_iteration": 2.649937868118286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071356, + "balance_loss_mlp": 1.04207826, + "epoch": 0.6629472874182378, + "flos": 527743931904.0, + "grad_norm": 0.06943834744074738, + "language_loss": 0.85656464, + "learning_rate": 0.0002694865487867343, + "loss": 0.86727822, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.29223633, + "step": 3446, + "time_per_iteration": 2.624187707901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072189, + "balance_loss_mlp": 1.04241085, + "epoch": 0.6631396691035013, + "flos": 612906960384.0, + "grad_norm": 0.05377374950460666, + "language_loss": 0.84776872, + "learning_rate": 0.0002692101352174453, + "loss": 0.85849059, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.29736328, + "step": 3447, + "time_per_iteration": 2.786705255508423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066769, + "balance_loss_mlp": 1.03823054, + "epoch": 0.6633320507887649, + "flos": 609041092608.0, + "grad_norm": 0.06088849613608419, + "language_loss": 0.84652716, + "learning_rate": 0.00026893381124872787, + "loss": 0.8571949, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.28515625, + "step": 3448, + "time_per_iteration": 2.8100626468658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072364, + "balance_loss_mlp": 1.04272866, + "epoch": 0.6635244324740285, + "flos": 749342112768.0, + "grad_norm": 0.06845751497679059, + "language_loss": 0.80441087, + "learning_rate": 0.00026865757698786097, + "loss": 0.81513453, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.29589844, + "step": 3449, + "time_per_iteration": 3.046318531036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069481, + "balance_loss_mlp": 1.04065669, + "epoch": 0.6637168141592921, + "flos": 664222754304.0, + "grad_norm": 0.05206136562356657, + "language_loss": 0.81613761, + "learning_rate": 0.000268381432542088, + "loss": 0.82683241, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.28833008, + "step": 3450, + "time_per_iteration": 2.865903854370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107193, + "balance_loss_mlp": 1.04203212, + "epoch": 0.6639091958445555, + "flos": 606500683776.0, + "grad_norm": 0.0645327848257149, + "language_loss": 0.79875302, + "learning_rate": 0.00026810537801861807, + "loss": 0.80947232, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.29882812, + "step": 3451, + "time_per_iteration": 2.8374693393707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071564, + "balance_loss_mlp": 1.04173839, + "epoch": 0.6641015775298191, + "flos": 476452869120.0, + "grad_norm": 0.05151691249818879, + "language_loss": 0.8142612, + "learning_rate": 0.0002678294135246243, + "loss": 0.82497692, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.2980957, + "step": 3452, + "time_per_iteration": 2.839822769165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072033, + "balance_loss_mlp": 1.04313636, + "epoch": 0.6642939592150827, + "flos": 903746391552.0, + "grad_norm": 0.05848171422306997, + "language_loss": 0.86315292, + "learning_rate": 0.0002675535391672463, + "loss": 0.87387323, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.2890625, + "step": 3453, + "time_per_iteration": 3.184783458709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074712, + "balance_loss_mlp": 1.04574442, + "epoch": 0.6644863409003463, + "flos": 581527351296.0, + "grad_norm": 0.06167080451779571, + "language_loss": 0.86087596, + "learning_rate": 0.0002672777550535877, + "loss": 0.8716231, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.28979492, + "step": 3454, + "time_per_iteration": 2.8803153038024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071993, + "balance_loss_mlp": 1.0427866, + "epoch": 0.6646787225856099, + "flos": 478761933312.0, + "grad_norm": 0.05419695506055875, + "language_loss": 0.84890383, + "learning_rate": 0.00026700206129071747, + "loss": 0.85962379, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.29174805, + "step": 3455, + "time_per_iteration": 2.835059881210327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076439, + "balance_loss_mlp": 1.04749477, + "epoch": 0.6648711042708734, + "flos": 449676831744.0, + "grad_norm": 0.06321625044537839, + "language_loss": 0.88953322, + "learning_rate": 0.00026672645798566925, + "loss": 0.90029758, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.28930664, + "step": 3456, + "time_per_iteration": 3.0997443199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071835, + "balance_loss_mlp": 1.04277229, + "epoch": 0.665063485956137, + "flos": 858553095168.0, + "grad_norm": 0.055285478182730885, + "language_loss": 0.79457712, + "learning_rate": 0.00026645094524544225, + "loss": 0.80529541, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.2902832, + "step": 3457, + "time_per_iteration": 3.513991117477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107703, + "balance_loss_mlp": 1.0481813, + "epoch": 0.6652558676414005, + "flos": 604024293888.0, + "grad_norm": 0.045511024743111715, + "language_loss": 0.75222224, + "learning_rate": 0.00026617552317699945, + "loss": 0.7629925, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.28833008, + "step": 3458, + "time_per_iteration": 3.5000369548797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069192, + "balance_loss_mlp": 1.04062915, + "epoch": 0.6654482493266641, + "flos": 510141542400.0, + "grad_norm": 0.0575678465485099, + "language_loss": 0.8684063, + "learning_rate": 0.0002659001918872693, + "loss": 0.87909818, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.28564453, + "step": 3459, + "time_per_iteration": 3.1579606533050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076447, + "balance_loss_mlp": 1.04797983, + "epoch": 0.6656406310119277, + "flos": 565342142976.0, + "grad_norm": 0.057947477452726895, + "language_loss": 0.80655402, + "learning_rate": 0.0002656249514831449, + "loss": 0.8173185, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.28466797, + "step": 3460, + "time_per_iteration": 3.0136172771453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075105, + "balance_loss_mlp": 1.04527879, + "epoch": 0.6658330126971912, + "flos": 1023859533312.0, + "grad_norm": 0.05880599704270715, + "language_loss": 0.86742055, + "learning_rate": 0.00026534980207148416, + "loss": 0.87817168, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.2980957, + "step": 3461, + "time_per_iteration": 3.808920383453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070751, + "balance_loss_mlp": 1.04256988, + "epoch": 0.6660253943824548, + "flos": 816472147968.0, + "grad_norm": 0.06394653558237288, + "language_loss": 0.73634577, + "learning_rate": 0.0002650747437591097, + "loss": 0.74705327, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.28149414, + "step": 3462, + "time_per_iteration": 3.4438018798828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023937, + "balance_loss_mlp": 1.01258874, + "epoch": 0.6662177760677184, + "flos": 1495331767296.0, + "grad_norm": 0.01627441049927099, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.82903516, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.11328125, + "step": 3463, + "time_per_iteration": 5.9989097118377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069258, + "balance_loss_mlp": 1.04091001, + "epoch": 0.666410157752982, + "flos": 499875190272.0, + "grad_norm": 0.05970416842123876, + "language_loss": 0.86439729, + "learning_rate": 0.00026452490085933155, + "loss": 0.87508994, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.28393555, + "step": 3464, + "time_per_iteration": 3.074321985244751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069725, + "balance_loss_mlp": 1.04099607, + "epoch": 0.6666025394382454, + "flos": 480928402944.0, + "grad_norm": 0.06389669613772958, + "language_loss": 0.89814323, + "learning_rate": 0.00026425011648539614, + "loss": 0.90884054, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.28735352, + "step": 3465, + "time_per_iteration": 3.163724422454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067748, + "balance_loss_mlp": 1.0391376, + "epoch": 0.666794921123509, + "flos": 546395355648.0, + "grad_norm": 0.05866867334399115, + "language_loss": 0.82531869, + "learning_rate": 0.00026397542363768267, + "loss": 0.83599609, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.28588867, + "step": 3466, + "time_per_iteration": 3.15535044670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107073, + "balance_loss_mlp": 1.04202461, + "epoch": 0.6669873028087726, + "flos": 471750372864.0, + "grad_norm": 0.09718909208334105, + "language_loss": 0.81696969, + "learning_rate": 0.0002637008224228362, + "loss": 0.82767701, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.28710938, + "step": 3467, + "time_per_iteration": 3.1590065956115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072782, + "balance_loss_mlp": 1.04467225, + "epoch": 0.6671796844940362, + "flos": 547119912960.0, + "grad_norm": 0.045698097527158366, + "language_loss": 0.84370708, + "learning_rate": 0.00026342631294746653, + "loss": 0.85443497, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.28100586, + "step": 3468, + "time_per_iteration": 3.2474896907806396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106771, + "balance_loss_mlp": 1.03933835, + "epoch": 0.6673720661792998, + "flos": 1069867547136.0, + "grad_norm": 0.048489338364625344, + "language_loss": 0.80841875, + "learning_rate": 0.0002631518953181476, + "loss": 0.81909585, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.28369141, + "step": 3469, + "time_per_iteration": 3.989240884780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020296, + "balance_loss_mlp": 1.00837493, + "epoch": 0.6675644478645633, + "flos": 1522963372032.0, + "grad_norm": 0.017053008774153198, + "language_loss": 0.76325285, + "learning_rate": 0.000262877569641418, + "loss": 0.7734558, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.11914062, + "step": 3470, + "time_per_iteration": 5.7656426429748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079857, + "balance_loss_mlp": 1.05081761, + "epoch": 0.6677568295498268, + "flos": 579410343936.0, + "grad_norm": 0.06105820471136532, + "language_loss": 0.80315661, + "learning_rate": 0.00026260333602377985, + "loss": 0.81395519, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.29003906, + "step": 3471, + "time_per_iteration": 3.3436222076416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072083, + "balance_loss_mlp": 1.04383063, + "epoch": 0.6679492112350904, + "flos": 383722458624.0, + "grad_norm": 0.05421906937668894, + "language_loss": 0.87085468, + "learning_rate": 0.0002623291945717007, + "loss": 0.88157558, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.28271484, + "step": 3472, + "time_per_iteration": 3.1183881759643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071602, + "balance_loss_mlp": 1.04234779, + "epoch": 0.668141592920354, + "flos": 1150297555968.0, + "grad_norm": 0.04666604751333496, + "language_loss": 0.84075844, + "learning_rate": 0.00026205514539161175, + "loss": 0.85147452, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.29248047, + "step": 3473, + "time_per_iteration": 3.790060043334961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072596, + "balance_loss_mlp": 1.04386711, + "epoch": 0.6683339746056175, + "flos": 560804000256.0, + "grad_norm": 0.05776060177542925, + "language_loss": 0.84147954, + "learning_rate": 0.00026178118858990773, + "loss": 0.85220551, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.28686523, + "step": 3474, + "time_per_iteration": 3.4138669967651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071797, + "balance_loss_mlp": 1.04259038, + "epoch": 0.6685263562908811, + "flos": 514051080192.0, + "grad_norm": 0.05528533566381529, + "language_loss": 0.83995008, + "learning_rate": 0.0002615073242729483, + "loss": 0.85066801, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.29223633, + "step": 3475, + "time_per_iteration": 3.199012279510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070975, + "balance_loss_mlp": 1.0421505, + "epoch": 0.6687187379761447, + "flos": 629466107904.0, + "grad_norm": 0.04758123025754447, + "language_loss": 0.84358716, + "learning_rate": 0.0002612335525470573, + "loss": 0.85429692, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.2878418, + "step": 3476, + "time_per_iteration": 3.4972333908081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074194, + "balance_loss_mlp": 1.04572678, + "epoch": 0.6689111196614083, + "flos": 535312723968.0, + "grad_norm": 0.06222514745321995, + "language_loss": 0.78151464, + "learning_rate": 0.0002609598735185221, + "loss": 0.79225659, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.28466797, + "step": 3477, + "time_per_iteration": 3.1121668815612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070595, + "balance_loss_mlp": 1.04186535, + "epoch": 0.6691035013466718, + "flos": 602758471680.0, + "grad_norm": 0.05831077718695847, + "language_loss": 0.83306509, + "learning_rate": 0.00026068628729359445, + "loss": 0.84377104, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.28686523, + "step": 3478, + "time_per_iteration": 3.4748337268829346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075594, + "balance_loss_mlp": 1.04653037, + "epoch": 0.6692958830319353, + "flos": 632539017216.0, + "grad_norm": 0.053072339735848705, + "language_loss": 0.75823909, + "learning_rate": 0.00026041279397848996, + "loss": 0.76899505, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.29003906, + "step": 3479, + "time_per_iteration": 3.3513095378875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071758, + "balance_loss_mlp": 1.04279053, + "epoch": 0.6694882647171989, + "flos": 645153721344.0, + "grad_norm": 0.11523786601732237, + "language_loss": 0.82653117, + "learning_rate": 0.00026013939367938797, + "loss": 0.83724874, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.28930664, + "step": 3480, + "time_per_iteration": 3.341496467590332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070446, + "balance_loss_mlp": 1.0417881, + "epoch": 0.6696806464024625, + "flos": 569292378624.0, + "grad_norm": 0.05240024743638074, + "language_loss": 0.81095958, + "learning_rate": 0.00025986608650243204, + "loss": 0.82166409, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.28613281, + "step": 3481, + "time_per_iteration": 3.534395933151245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073143, + "balance_loss_mlp": 1.04417491, + "epoch": 0.6698730280877261, + "flos": 622386146304.0, + "grad_norm": 0.04897639091923761, + "language_loss": 0.79360926, + "learning_rate": 0.0002595928725537293, + "loss": 0.80434066, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.28930664, + "step": 3482, + "time_per_iteration": 3.4163737297058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070599, + "balance_loss_mlp": 1.04179811, + "epoch": 0.6700654097729896, + "flos": 502258447872.0, + "grad_norm": 0.05847572955345742, + "language_loss": 0.88153374, + "learning_rate": 0.0002593197519393509, + "loss": 0.89223981, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.28833008, + "step": 3483, + "time_per_iteration": 3.162363052368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067677, + "balance_loss_mlp": 1.03851843, + "epoch": 0.6702577914582531, + "flos": 623567600640.0, + "grad_norm": 0.04895962963004684, + "language_loss": 0.79643184, + "learning_rate": 0.00025904672476533165, + "loss": 0.80710858, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.29125977, + "step": 3484, + "time_per_iteration": 3.329540967941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073016, + "balance_loss_mlp": 1.0442394, + "epoch": 0.6704501731435167, + "flos": 456033646080.0, + "grad_norm": 0.055271412051917726, + "language_loss": 0.82509005, + "learning_rate": 0.0002587737911376704, + "loss": 0.8358202, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.28759766, + "step": 3485, + "time_per_iteration": 3.2979683876037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106941, + "balance_loss_mlp": 1.04063249, + "epoch": 0.6706425548287803, + "flos": 542973238272.0, + "grad_norm": 0.05525585278416293, + "language_loss": 0.8399781, + "learning_rate": 0.00025850095116232885, + "loss": 0.85067225, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.28759766, + "step": 3486, + "time_per_iteration": 3.26407790184021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069925, + "balance_loss_mlp": 1.04012239, + "epoch": 0.6708349365140439, + "flos": 633631721472.0, + "grad_norm": 0.05884470939634603, + "language_loss": 0.78008693, + "learning_rate": 0.000258228204945233, + "loss": 0.79078615, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.29760742, + "step": 3487, + "time_per_iteration": 3.2713074684143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069596, + "balance_loss_mlp": 1.04122472, + "epoch": 0.6710273181993074, + "flos": 640459989504.0, + "grad_norm": 0.08825995079793632, + "language_loss": 0.84371996, + "learning_rate": 0.00025795555259227254, + "loss": 0.85441601, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.28369141, + "step": 3488, + "time_per_iteration": 3.2798845767974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107088, + "balance_loss_mlp": 1.04253244, + "epoch": 0.671219699884571, + "flos": 553673166336.0, + "grad_norm": 0.04912618775842026, + "language_loss": 0.8368836, + "learning_rate": 0.00025768299420930046, + "loss": 0.84759241, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.28369141, + "step": 3489, + "time_per_iteration": 3.548513174057007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070862, + "balance_loss_mlp": 1.04191756, + "epoch": 0.6714120815698346, + "flos": 731191256064.0, + "grad_norm": 0.0542630721977733, + "language_loss": 0.83150196, + "learning_rate": 0.0002574105299021332, + "loss": 0.84221053, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.28930664, + "step": 3490, + "time_per_iteration": 3.264068365097046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072429, + "balance_loss_mlp": 1.04398608, + "epoch": 0.6716044632550981, + "flos": 688344901632.0, + "grad_norm": 0.04887866872345111, + "language_loss": 0.84103191, + "learning_rate": 0.00025713815977655084, + "loss": 0.85175616, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.28466797, + "step": 3491, + "time_per_iteration": 3.480595827102661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067719, + "balance_loss_mlp": 1.03848863, + "epoch": 0.6717968449403616, + "flos": 460391316480.0, + "grad_norm": 0.061790986714500215, + "language_loss": 0.84740448, + "learning_rate": 0.0002568658839382969, + "loss": 0.8580817, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.29199219, + "step": 3492, + "time_per_iteration": 3.149390935897827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071869, + "balance_loss_mlp": 1.04366422, + "epoch": 0.6719892266256252, + "flos": 501362182656.0, + "grad_norm": 0.060742623870238814, + "language_loss": 0.84422779, + "learning_rate": 0.00025659370249307814, + "loss": 0.85494649, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 0.28198242, + "step": 3493, + "time_per_iteration": 3.043328285217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067893, + "balance_loss_mlp": 1.03840065, + "epoch": 0.6721816083108888, + "flos": 683223386112.0, + "grad_norm": 0.32090754121455606, + "language_loss": 0.85042375, + "learning_rate": 0.00025632161554656473, + "loss": 0.86110264, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.29492188, + "step": 3494, + "time_per_iteration": 3.370725393295288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071555, + "balance_loss_mlp": 1.04256368, + "epoch": 0.6723739899961524, + "flos": 585544578048.0, + "grad_norm": 0.056395041319593345, + "language_loss": 0.82224226, + "learning_rate": 0.00025604962320439017, + "loss": 0.8329578, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.28955078, + "step": 3495, + "time_per_iteration": 3.1383168697357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069781, + "balance_loss_mlp": 1.04155231, + "epoch": 0.672566371681416, + "flos": 506336721408.0, + "grad_norm": 0.05570764429404915, + "language_loss": 0.82211316, + "learning_rate": 0.0002557777255721516, + "loss": 0.832811, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.2824707, + "step": 3496, + "time_per_iteration": 3.2747058868408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073188, + "balance_loss_mlp": 1.0451498, + "epoch": 0.6727587533666795, + "flos": 535405856256.0, + "grad_norm": 0.06368144256739344, + "language_loss": 0.8063888, + "learning_rate": 0.0002555059227554087, + "loss": 0.81712067, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.28027344, + "step": 3497, + "time_per_iteration": 3.241708278656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078052, + "balance_loss_mlp": 1.04920387, + "epoch": 0.672951135051943, + "flos": 602532919296.0, + "grad_norm": 0.05624574913237251, + "language_loss": 0.77828801, + "learning_rate": 0.00025523421485968453, + "loss": 0.78906852, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.28833008, + "step": 3498, + "time_per_iteration": 3.4185025691986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071507, + "balance_loss_mlp": 1.04327822, + "epoch": 0.6731435167372066, + "flos": 810976693248.0, + "grad_norm": 0.05832714819515366, + "language_loss": 0.85479802, + "learning_rate": 0.00025496260199046585, + "loss": 0.86551309, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.28271484, + "step": 3499, + "time_per_iteration": 3.398684501647949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074374, + "balance_loss_mlp": 1.04531085, + "epoch": 0.6733358984224702, + "flos": 611306486784.0, + "grad_norm": 0.0606160593453579, + "language_loss": 0.84417158, + "learning_rate": 0.000254691084253202, + "loss": 0.85491526, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.29052734, + "step": 3500, + "time_per_iteration": 3.204657554626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075309, + "balance_loss_mlp": 1.04641259, + "epoch": 0.6735282801077337, + "flos": 558636120576.0, + "grad_norm": 0.05651280486547688, + "language_loss": 0.7721619, + "learning_rate": 0.00025441966175330567, + "loss": 0.782915, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.2890625, + "step": 3501, + "time_per_iteration": 3.280398368835449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079946, + "balance_loss_mlp": 1.05078757, + "epoch": 0.6737206617929973, + "flos": 672134962176.0, + "grad_norm": 0.09712144532107508, + "language_loss": 0.79372454, + "learning_rate": 0.00025414833459615183, + "loss": 0.804524, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.29174805, + "step": 3502, + "time_per_iteration": 3.221496343612671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079859, + "balance_loss_mlp": 1.0510819, + "epoch": 0.6739130434782609, + "flos": 633148683264.0, + "grad_norm": 0.05864951358988012, + "language_loss": 0.80395651, + "learning_rate": 0.0002538771028870796, + "loss": 0.81475508, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.28759766, + "step": 3503, + "time_per_iteration": 3.3205838203430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075878, + "balance_loss_mlp": 1.04710114, + "epoch": 0.6741054251635245, + "flos": 531171841536.0, + "grad_norm": 0.060463290728931994, + "language_loss": 0.81723624, + "learning_rate": 0.0002536059667313903, + "loss": 0.827995, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.2878418, + "step": 3504, + "time_per_iteration": 3.39898419380188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107324, + "balance_loss_mlp": 1.04415321, + "epoch": 0.674297806848788, + "flos": 542343223296.0, + "grad_norm": 0.056146401144420426, + "language_loss": 0.89261472, + "learning_rate": 0.0002533349262343483, + "loss": 0.90334713, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.29077148, + "step": 3505, + "time_per_iteration": 3.3431026935577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072084, + "balance_loss_mlp": 1.04342639, + "epoch": 0.6744901885340515, + "flos": 463291107840.0, + "grad_norm": 0.0612472301672692, + "language_loss": 0.82005084, + "learning_rate": 0.0002530639815011807, + "loss": 0.83077168, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.28662109, + "step": 3506, + "time_per_iteration": 2.985283374786377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070171, + "balance_loss_mlp": 1.04220426, + "epoch": 0.6746825702193151, + "flos": 631533652992.0, + "grad_norm": 0.059607136715137135, + "language_loss": 0.84537947, + "learning_rate": 0.0002527931326370781, + "loss": 0.85608113, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 0.27978516, + "step": 3507, + "time_per_iteration": 3.1282057762145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071183, + "balance_loss_mlp": 1.04271555, + "epoch": 0.6748749519045787, + "flos": 670835644416.0, + "grad_norm": 0.05533021024656612, + "language_loss": 0.82755983, + "learning_rate": 0.00025252237974719276, + "loss": 0.83827162, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.28491211, + "step": 3508, + "time_per_iteration": 3.260610580444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066579, + "balance_loss_mlp": 1.03813529, + "epoch": 0.6750673335898423, + "flos": 766756827648.0, + "grad_norm": 0.05860673503825768, + "language_loss": 0.80004764, + "learning_rate": 0.00025225172293664056, + "loss": 0.81071347, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.28442383, + "step": 3509, + "time_per_iteration": 3.373530864715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017561, + "balance_loss_mlp": 1.00540209, + "epoch": 0.6752597152751059, + "flos": 1511786198016.0, + "grad_norm": 0.014769475443499856, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.77950692, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.12158203, + "step": 3510, + "time_per_iteration": 6.158355951309204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080364, + "balance_loss_mlp": 1.05111003, + "epoch": 0.6754520969603693, + "flos": 686990329344.0, + "grad_norm": 0.06842841117996161, + "language_loss": 0.84400952, + "learning_rate": 0.00025171069797381106, + "loss": 0.8548131, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.29248047, + "step": 3511, + "time_per_iteration": 3.2980220317840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070527, + "balance_loss_mlp": 1.04234552, + "epoch": 0.6756444786456329, + "flos": 500318940672.0, + "grad_norm": 0.0575194424100886, + "language_loss": 0.81909519, + "learning_rate": 0.00025144033003157864, + "loss": 0.82980049, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.28173828, + "step": 3512, + "time_per_iteration": 3.140373706817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071116, + "balance_loss_mlp": 1.04319715, + "epoch": 0.6758368603308965, + "flos": 492357270528.0, + "grad_norm": 0.07351376561683495, + "language_loss": 0.78513837, + "learning_rate": 0.00025117005858876806, + "loss": 0.7958495, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.27978516, + "step": 3513, + "time_per_iteration": 3.3946895599365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070978, + "balance_loss_mlp": 1.04212952, + "epoch": 0.6760292420161601, + "flos": 555657753600.0, + "grad_norm": 0.056817312971520956, + "language_loss": 0.85350752, + "learning_rate": 0.000250899883750308, + "loss": 0.86421728, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.28881836, + "step": 3514, + "time_per_iteration": 3.2081196308135986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071843, + "balance_loss_mlp": 1.04368556, + "epoch": 0.6762216237014236, + "flos": 607322755584.0, + "grad_norm": 0.05856137084704242, + "language_loss": 0.81469542, + "learning_rate": 0.00025062980562109006, + "loss": 0.82541388, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.28173828, + "step": 3515, + "time_per_iteration": 3.234687566757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070317, + "balance_loss_mlp": 1.04268479, + "epoch": 0.6764140053866872, + "flos": 533501254656.0, + "grad_norm": 0.0684742974897707, + "language_loss": 0.8283475, + "learning_rate": 0.0002503598243059677, + "loss": 0.83905065, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.27685547, + "step": 3516, + "time_per_iteration": 3.276319980621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107498, + "balance_loss_mlp": 1.04684663, + "epoch": 0.6766063870719508, + "flos": 504548573184.0, + "grad_norm": 0.05816726448499056, + "language_loss": 0.80307925, + "learning_rate": 0.0002500899399097568, + "loss": 0.81382906, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.28149414, + "step": 3517, + "time_per_iteration": 3.361901044845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073931, + "balance_loss_mlp": 1.0454638, + "epoch": 0.6767987687572143, + "flos": 512923470336.0, + "grad_norm": 0.06530995059631492, + "language_loss": 0.85096681, + "learning_rate": 0.0002498201525372359, + "loss": 0.86170614, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.28491211, + "step": 3518, + "time_per_iteration": 3.10380220413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010719, + "balance_loss_mlp": 1.04421926, + "epoch": 0.6769911504424779, + "flos": 524780121600.0, + "grad_norm": 0.061284941283787836, + "language_loss": 0.83024853, + "learning_rate": 0.00024955046229314584, + "loss": 0.84096754, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.27709961, + "step": 3519, + "time_per_iteration": 3.1552722454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069226, + "balance_loss_mlp": 1.04195142, + "epoch": 0.6771835321277414, + "flos": 449662275072.0, + "grad_norm": 0.06591388650746736, + "language_loss": 0.87507355, + "learning_rate": 0.00024928086928218947, + "loss": 0.88576579, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.27307129, + "step": 3520, + "time_per_iteration": 3.176281452178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073411, + "balance_loss_mlp": 1.04553986, + "epoch": 0.677375913813005, + "flos": 709020200448.0, + "grad_norm": 0.06204053550598198, + "language_loss": 0.76553816, + "learning_rate": 0.00024901137360903216, + "loss": 0.7762723, + "num_input_tokens_seen": 291998752, + "router_z_loss_mlp": 0.27905273, + "step": 3521, + "time_per_iteration": 3.2491977214813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075413, + "balance_loss_mlp": 1.04773283, + "epoch": 0.6775682954982686, + "flos": 428189635584.0, + "grad_norm": 0.06068405228401802, + "language_loss": 0.80714798, + "learning_rate": 0.00024874197537830115, + "loss": 0.81790209, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.27734375, + "step": 3522, + "time_per_iteration": 3.2800705432891846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069929, + "balance_loss_mlp": 1.04258251, + "epoch": 0.6777606771835322, + "flos": 437677585920.0, + "grad_norm": 0.0705299171766763, + "language_loss": 0.83310688, + "learning_rate": 0.00024847267469458684, + "loss": 0.84380615, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.27392578, + "step": 3523, + "time_per_iteration": 3.044410228729248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072093, + "balance_loss_mlp": 1.04400754, + "epoch": 0.6779530588687956, + "flos": 775106993664.0, + "grad_norm": 0.05514098679922032, + "language_loss": 0.77547973, + "learning_rate": 0.00024820347166244034, + "loss": 0.78620064, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.28100586, + "step": 3524, + "time_per_iteration": 3.3789007663726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074799, + "balance_loss_mlp": 1.04697526, + "epoch": 0.6781454405540592, + "flos": 571502518272.0, + "grad_norm": 0.05352508807919392, + "language_loss": 0.84795761, + "learning_rate": 0.0002479343663863755, + "loss": 0.85870552, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.27856445, + "step": 3525, + "time_per_iteration": 3.242717742919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072571, + "balance_loss_mlp": 1.04462886, + "epoch": 0.6783378222393228, + "flos": 484788478464.0, + "grad_norm": 0.06320153638070183, + "language_loss": 0.76689994, + "learning_rate": 0.00024766535897086876, + "loss": 0.77762568, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.27929688, + "step": 3526, + "time_per_iteration": 3.28702712059021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107187, + "balance_loss_mlp": 1.04366529, + "epoch": 0.6785302039245864, + "flos": 482592895488.0, + "grad_norm": 0.06947465366955115, + "language_loss": 0.79284716, + "learning_rate": 0.0002473964495203578, + "loss": 0.80356586, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.28222656, + "step": 3527, + "time_per_iteration": 3.2413079738616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107552, + "balance_loss_mlp": 1.0474577, + "epoch": 0.67872258560985, + "flos": 524451262464.0, + "grad_norm": 0.05313281252101078, + "language_loss": 0.8542428, + "learning_rate": 0.0002471276381392425, + "loss": 0.86499798, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.28076172, + "step": 3528, + "time_per_iteration": 3.3680808544158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033221, + "balance_loss_mlp": 1.02044225, + "epoch": 0.6789149672951135, + "flos": 1551786605568.0, + "grad_norm": 0.015931191486776266, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.79221857, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.12792969, + "step": 3529, + "time_per_iteration": 5.628952741622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069556, + "balance_loss_mlp": 1.04094601, + "epoch": 0.6791073489803771, + "flos": 741088051200.0, + "grad_norm": 0.06736468086197074, + "language_loss": 0.84283829, + "learning_rate": 0.00024659031000260826, + "loss": 0.85353386, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.28588867, + "step": 3530, + "time_per_iteration": 2.8723843097686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069609, + "balance_loss_mlp": 1.04080772, + "epoch": 0.6792997306656406, + "flos": 576095915520.0, + "grad_norm": 0.0688001707056691, + "language_loss": 0.80604416, + "learning_rate": 0.0002463217934556985, + "loss": 0.81674021, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.28808594, + "step": 3531, + "time_per_iteration": 2.7234296798706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102575, + "balance_loss_mlp": 1.01316202, + "epoch": 0.6794921123509042, + "flos": 1502538356736.0, + "grad_norm": 0.012819798724274224, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.77557838, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.12597656, + "step": 3532, + "time_per_iteration": 4.774993181228638 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069098, + "balance_loss_mlp": 1.04098845, + "epoch": 0.6796844940361677, + "flos": 698620018176.0, + "grad_norm": 0.07494627627994242, + "language_loss": 0.83949304, + "learning_rate": 0.0002457850559259306, + "loss": 0.85018402, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.28125, + "step": 3533, + "time_per_iteration": 2.854862928390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069128, + "balance_loss_mlp": 1.04123271, + "epoch": 0.6798768757214313, + "flos": 552496094208.0, + "grad_norm": 0.05955036314433414, + "language_loss": 0.81432045, + "learning_rate": 0.00024551683515145275, + "loss": 0.82501173, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.27905273, + "step": 3534, + "time_per_iteration": 2.662670612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068932, + "balance_loss_mlp": 1.04084659, + "epoch": 0.6800692574066949, + "flos": 522677670912.0, + "grad_norm": 0.05698546166287553, + "language_loss": 0.86435509, + "learning_rate": 0.0002452487131761014, + "loss": 0.87504447, + "num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.28125, + "step": 3535, + "time_per_iteration": 2.7052507400512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068803, + "balance_loss_mlp": 1.0406456, + "epoch": 0.6802616390919585, + "flos": 573747563520.0, + "grad_norm": 0.2007355544417899, + "language_loss": 0.79636157, + "learning_rate": 0.00024498069010397093, + "loss": 0.80704963, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.28198242, + "step": 3536, + "time_per_iteration": 2.6741490364074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073159, + "balance_loss_mlp": 1.04452467, + "epoch": 0.6804540207772221, + "flos": 487915232256.0, + "grad_norm": 0.06175774783534356, + "language_loss": 0.85386938, + "learning_rate": 0.00024471276603911697, + "loss": 0.86460102, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.28613281, + "step": 3537, + "time_per_iteration": 2.582512378692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106927, + "balance_loss_mlp": 1.04049325, + "epoch": 0.6806464024624855, + "flos": 578307465216.0, + "grad_norm": 0.05665258990060116, + "language_loss": 0.79265833, + "learning_rate": 0.0002444449410855572, + "loss": 0.80335104, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.28759766, + "step": 3538, + "time_per_iteration": 2.7172720432281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075887, + "balance_loss_mlp": 1.04689479, + "epoch": 0.6808387841477491, + "flos": 553456378368.0, + "grad_norm": 0.04143612880488866, + "language_loss": 0.84057069, + "learning_rate": 0.00024417721534727033, + "loss": 0.85132951, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.29003906, + "step": 3539, + "time_per_iteration": 2.6684606075286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072025, + "balance_loss_mlp": 1.04322374, + "epoch": 0.6810311658330127, + "flos": 426613893120.0, + "grad_norm": 0.07425691047539493, + "language_loss": 0.82827783, + "learning_rate": 0.00024390958892819687, + "loss": 0.83899808, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.28759766, + "step": 3540, + "time_per_iteration": 2.4658186435699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107288, + "balance_loss_mlp": 1.04481781, + "epoch": 0.6812235475182763, + "flos": 571956443136.0, + "grad_norm": 0.05780068585896815, + "language_loss": 0.80981314, + "learning_rate": 0.0002436420619322381, + "loss": 0.82054192, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.28100586, + "step": 3541, + "time_per_iteration": 2.8231966495513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077487, + "balance_loss_mlp": 1.04835224, + "epoch": 0.6814159292035398, + "flos": 501648781824.0, + "grad_norm": 0.05333594930296874, + "language_loss": 0.82771194, + "learning_rate": 0.0002433746344632577, + "loss": 0.83848679, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.29101562, + "step": 3542, + "time_per_iteration": 2.6959166526794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071587, + "balance_loss_mlp": 1.04259515, + "epoch": 0.6816083108888034, + "flos": 765176702976.0, + "grad_norm": 0.224573626709811, + "language_loss": 0.80137914, + "learning_rate": 0.00024310730662508006, + "loss": 0.81209499, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.28955078, + "step": 3543, + "time_per_iteration": 3.0683388710021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075151, + "balance_loss_mlp": 1.04639745, + "epoch": 0.681800692574067, + "flos": 479205683712.0, + "grad_norm": 0.05641923702729484, + "language_loss": 0.87227619, + "learning_rate": 0.0002428400785214911, + "loss": 0.88302767, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.28759766, + "step": 3544, + "time_per_iteration": 2.602978467941284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075917, + "balance_loss_mlp": 1.04830861, + "epoch": 0.6819930742593305, + "flos": 691298537472.0, + "grad_norm": 0.05415791739342902, + "language_loss": 0.82201838, + "learning_rate": 0.00024257295025623794, + "loss": 0.83277762, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.27636719, + "step": 3545, + "time_per_iteration": 2.8973493576049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079854, + "balance_loss_mlp": 1.05074358, + "epoch": 0.6821854559445941, + "flos": 677783185920.0, + "grad_norm": 0.05879535961793021, + "language_loss": 0.8075946, + "learning_rate": 0.00024230592193302892, + "loss": 0.81839317, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.29077148, + "step": 3546, + "time_per_iteration": 2.8674380779266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079529, + "balance_loss_mlp": 1.0514431, + "epoch": 0.6823778376298576, + "flos": 461956884480.0, + "grad_norm": 0.05930658835110869, + "language_loss": 0.84390098, + "learning_rate": 0.00024203899365553372, + "loss": 0.85469627, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.28100586, + "step": 3547, + "time_per_iteration": 2.570162773132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104935, + "balance_loss_mlp": 1.03785849, + "epoch": 0.6825702193151212, + "flos": 1474582427136.0, + "grad_norm": 0.024142362504210636, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.7778371, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.11474609, + "step": 3548, + "time_per_iteration": 4.54862117767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077697, + "balance_loss_mlp": 1.0492295, + "epoch": 0.6827626010003848, + "flos": 722791627776.0, + "grad_norm": 0.05396480474730288, + "language_loss": 0.82952201, + "learning_rate": 0.00024150543765216848, + "loss": 0.84029901, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.28442383, + "step": 3549, + "time_per_iteration": 2.8922061920166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081348, + "balance_loss_mlp": 1.05261874, + "epoch": 0.6829549826856484, + "flos": 558596832768.0, + "grad_norm": 0.08705135979463063, + "language_loss": 0.83172846, + "learning_rate": 0.00024123881013344352, + "loss": 0.84254193, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.28735352, + "step": 3550, + "time_per_iteration": 2.674441337585449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081968, + "balance_loss_mlp": 1.05381048, + "epoch": 0.6831473643709118, + "flos": 624635573760.0, + "grad_norm": 0.052816648102186906, + "language_loss": 0.79533482, + "learning_rate": 0.00024097228307472202, + "loss": 0.80615449, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.28173828, + "step": 3551, + "time_per_iteration": 2.810211181640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108367, + "balance_loss_mlp": 1.0561564, + "epoch": 0.6833397460561754, + "flos": 713553960960.0, + "grad_norm": 0.06537057112409075, + "language_loss": 0.82174456, + "learning_rate": 0.00024070585657947846, + "loss": 0.83258128, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.27563477, + "step": 3552, + "time_per_iteration": 2.903355598449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108401, + "balance_loss_mlp": 1.05487537, + "epoch": 0.683532127741439, + "flos": 464449241088.0, + "grad_norm": 0.04571103673496298, + "language_loss": 0.85090339, + "learning_rate": 0.00024043953075114934, + "loss": 0.86174351, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.29150391, + "step": 3553, + "time_per_iteration": 2.683868169784546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085174, + "balance_loss_mlp": 1.05711174, + "epoch": 0.6837245094267026, + "flos": 581979866112.0, + "grad_norm": 0.06261928817671675, + "language_loss": 0.88604438, + "learning_rate": 0.00024017330569313128, + "loss": 0.89689612, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.28051758, + "step": 3554, + "time_per_iteration": 2.7235445976257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085006, + "balance_loss_mlp": 1.05611026, + "epoch": 0.6839168911119662, + "flos": 793836993024.0, + "grad_norm": 0.05900054168258606, + "language_loss": 0.74906945, + "learning_rate": 0.0002399071815087821, + "loss": 0.75991952, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.28857422, + "step": 3555, + "time_per_iteration": 3.0646519660949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085121, + "balance_loss_mlp": 1.05579519, + "epoch": 0.6841092727972297, + "flos": 579734820864.0, + "grad_norm": 0.06151916899658477, + "language_loss": 0.84067833, + "learning_rate": 0.00023964115830142025, + "loss": 0.85152954, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.29321289, + "step": 3556, + "time_per_iteration": 2.670454740524292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086273, + "balance_loss_mlp": 1.05785322, + "epoch": 0.6843016544824932, + "flos": 383530401792.0, + "grad_norm": 0.07044194962998349, + "language_loss": 0.87372839, + "learning_rate": 0.00023937523617432522, + "loss": 0.8845911, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.28393555, + "step": 3557, + "time_per_iteration": 2.442620038986206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079062, + "balance_loss_mlp": 1.05073762, + "epoch": 0.6844940361677568, + "flos": 1438474332672.0, + "grad_norm": 0.11887051887526623, + "language_loss": 0.86776745, + "learning_rate": 0.00023910941523073705, + "loss": 0.8785581, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.28320312, + "step": 3558, + "time_per_iteration": 3.9105570316314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080627, + "balance_loss_mlp": 1.05211186, + "epoch": 0.6846864178530204, + "flos": 520614508032.0, + "grad_norm": 0.05794224336416494, + "language_loss": 0.86635411, + "learning_rate": 0.0002388436955738566, + "loss": 0.87716037, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.28540039, + "step": 3559, + "time_per_iteration": 2.7885656356811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010825, + "balance_loss_mlp": 1.05310321, + "epoch": 0.6848787995382839, + "flos": 717626442240.0, + "grad_norm": 0.06653025521174674, + "language_loss": 0.81589997, + "learning_rate": 0.00023857807730684523, + "loss": 0.82672501, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.29394531, + "step": 3560, + "time_per_iteration": 2.8988590240478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082565, + "balance_loss_mlp": 1.05378819, + "epoch": 0.6850711812235475, + "flos": 510787524096.0, + "grad_norm": 0.07668578233950803, + "language_loss": 0.82023144, + "learning_rate": 0.00023831256053282547, + "loss": 0.83105713, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.2878418, + "step": 3561, + "time_per_iteration": 2.644080877304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082938, + "balance_loss_mlp": 1.05380273, + "epoch": 0.6852635629088111, + "flos": 667832546304.0, + "grad_norm": 0.07104594234153103, + "language_loss": 0.78454512, + "learning_rate": 0.00023804714535488003, + "loss": 0.79537451, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.29150391, + "step": 3562, + "time_per_iteration": 2.8966143131256104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042929, + "balance_loss_mlp": 1.03124619, + "epoch": 0.6854559445940747, + "flos": 1522136918016.0, + "grad_norm": 0.023182514695526305, + "language_loss": 0.7980963, + "learning_rate": 0.0002377818318760519, + "loss": 0.80852556, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.11669922, + "step": 3563, + "time_per_iteration": 4.932991027832031 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078302, + "balance_loss_mlp": 1.04947758, + "epoch": 0.6856483262793382, + "flos": 453970483200.0, + "grad_norm": 0.05956770996074772, + "language_loss": 0.8101843, + "learning_rate": 0.00023751662019934488, + "loss": 0.82096732, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.2878418, + "step": 3564, + "time_per_iteration": 2.49049711227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080425, + "balance_loss_mlp": 1.05214906, + "epoch": 0.6858407079646017, + "flos": 615269869056.0, + "grad_norm": 0.05086931810535688, + "language_loss": 0.78869629, + "learning_rate": 0.00023725151042772364, + "loss": 0.79950058, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.28271484, + "step": 3565, + "time_per_iteration": 2.7470548152923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079752, + "balance_loss_mlp": 1.04959226, + "epoch": 0.6860330896498653, + "flos": 465793638912.0, + "grad_norm": 0.07206608311036458, + "language_loss": 0.83451784, + "learning_rate": 0.00023698650266411276, + "loss": 0.8453154, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.30102539, + "step": 3566, + "time_per_iteration": 2.6310577392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079469, + "balance_loss_mlp": 1.04949975, + "epoch": 0.6862254713351289, + "flos": 863879814144.0, + "grad_norm": 0.05434580355598899, + "language_loss": 0.83292013, + "learning_rate": 0.00023672159701139755, + "loss": 0.84371483, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.29931641, + "step": 3567, + "time_per_iteration": 3.2131478786468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081005, + "balance_loss_mlp": 1.05160773, + "epoch": 0.6864178530203925, + "flos": 446905078272.0, + "grad_norm": 0.11905493017863943, + "language_loss": 0.8579241, + "learning_rate": 0.00023645679357242296, + "loss": 0.86873412, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.29370117, + "step": 3568, + "time_per_iteration": 2.536799192428589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079259, + "balance_loss_mlp": 1.04881263, + "epoch": 0.6866102347056561, + "flos": 424034196480.0, + "grad_norm": 0.0572051056650869, + "language_loss": 0.83415657, + "learning_rate": 0.00023619209244999534, + "loss": 0.84494913, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.30395508, + "step": 3569, + "time_per_iteration": 2.6000583171844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071372, + "balance_loss_mlp": 1.0414027, + "epoch": 0.6868026163909196, + "flos": 472134486528.0, + "grad_norm": 0.07852810593031194, + "language_loss": 0.84651816, + "learning_rate": 0.0002359274937468806, + "loss": 0.85723186, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.29931641, + "step": 3570, + "time_per_iteration": 2.57413387298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075523, + "balance_loss_mlp": 1.04479098, + "epoch": 0.6869949980761831, + "flos": 463937089536.0, + "grad_norm": 0.05388106388486604, + "language_loss": 0.77385354, + "learning_rate": 0.00023566299756580512, + "loss": 0.78460878, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.30688477, + "step": 3571, + "time_per_iteration": 2.6366066932678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076128, + "balance_loss_mlp": 1.04491949, + "epoch": 0.6871873797614467, + "flos": 426012991488.0, + "grad_norm": 0.07115585873088184, + "language_loss": 0.78295314, + "learning_rate": 0.0002353986040094551, + "loss": 0.79371446, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 0.31176758, + "step": 3572, + "time_per_iteration": 2.503833532333374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070084, + "balance_loss_mlp": 1.03882694, + "epoch": 0.6873797614467103, + "flos": 443394210816.0, + "grad_norm": 0.06984885351733894, + "language_loss": 0.79368085, + "learning_rate": 0.00023513431318047796, + "loss": 0.80438167, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.31225586, + "step": 3573, + "time_per_iteration": 2.568976879119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107429, + "balance_loss_mlp": 1.04293847, + "epoch": 0.6875721431319738, + "flos": 991927074816.0, + "grad_norm": 0.060417226210131056, + "language_loss": 0.76676512, + "learning_rate": 0.00023487012518147977, + "loss": 0.77750802, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.31323242, + "step": 3574, + "time_per_iteration": 3.229848861694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069454, + "balance_loss_mlp": 1.03836417, + "epoch": 0.6877645248172374, + "flos": 1285031900160.0, + "grad_norm": 0.06028735388663287, + "language_loss": 0.84485316, + "learning_rate": 0.00023460604011502772, + "loss": 0.85554767, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.31054688, + "step": 3575, + "time_per_iteration": 3.6276612281799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068189, + "balance_loss_mlp": 1.03640747, + "epoch": 0.687956906502501, + "flos": 876360688128.0, + "grad_norm": 0.059284706265635014, + "language_loss": 0.85573983, + "learning_rate": 0.00023434205808364845, + "loss": 0.8664217, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.31762695, + "step": 3576, + "time_per_iteration": 3.154609203338623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073627, + "balance_loss_mlp": 1.04146445, + "epoch": 0.6881492881877646, + "flos": 563038871040.0, + "grad_norm": 0.06862311945477588, + "language_loss": 0.85635597, + "learning_rate": 0.00023407817918982932, + "loss": 0.86709225, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.3215332, + "step": 3577, + "time_per_iteration": 2.770382881164551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065226, + "balance_loss_mlp": 1.03480327, + "epoch": 0.6883416698730281, + "flos": 794782720512.0, + "grad_norm": 0.05501523594648703, + "language_loss": 0.78652638, + "learning_rate": 0.00023381440353601718, + "loss": 0.79717863, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.30371094, + "step": 3578, + "time_per_iteration": 3.0038936138153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068554, + "balance_loss_mlp": 1.03674912, + "epoch": 0.6885340515582916, + "flos": 723308161536.0, + "grad_norm": 0.07314782332090318, + "language_loss": 0.85671222, + "learning_rate": 0.00023355073122461822, + "loss": 0.86739773, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.31787109, + "step": 3579, + "time_per_iteration": 2.901097059249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068864, + "balance_loss_mlp": 1.03798902, + "epoch": 0.6887264332435552, + "flos": 1010529036288.0, + "grad_norm": 0.05988205540841198, + "language_loss": 0.82838941, + "learning_rate": 0.00023328716235799973, + "loss": 0.83907801, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.30834961, + "step": 3580, + "time_per_iteration": 3.3144712448120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068509, + "balance_loss_mlp": 1.03734803, + "epoch": 0.6889188149288188, + "flos": 584993138688.0, + "grad_norm": 0.05209228569629584, + "language_loss": 0.83578706, + "learning_rate": 0.00023302369703848803, + "loss": 0.84647214, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.3112793, + "step": 3581, + "time_per_iteration": 2.7352983951568604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072888, + "balance_loss_mlp": 1.04153562, + "epoch": 0.6891111966140824, + "flos": 635831686656.0, + "grad_norm": 0.06738914955836864, + "language_loss": 0.80107218, + "learning_rate": 0.00023276033536836937, + "loss": 0.81180108, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.31323242, + "step": 3582, + "time_per_iteration": 2.8315579891204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069685, + "balance_loss_mlp": 1.03849971, + "epoch": 0.6893035782993459, + "flos": 495011160576.0, + "grad_norm": 0.07822330365866909, + "language_loss": 0.84485823, + "learning_rate": 0.00023249707744988984, + "loss": 0.85555506, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.31176758, + "step": 3583, + "time_per_iteration": 2.6693801879882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070485, + "balance_loss_mlp": 1.03927565, + "epoch": 0.6894959599846094, + "flos": 457983327744.0, + "grad_norm": 0.09035135761218806, + "language_loss": 0.82157326, + "learning_rate": 0.00023223392338525529, + "loss": 0.83227813, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.31176758, + "step": 3584, + "time_per_iteration": 2.6018331050872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071087, + "balance_loss_mlp": 1.03997374, + "epoch": 0.689688341669873, + "flos": 504740630016.0, + "grad_norm": 0.07744993578546541, + "language_loss": 0.78292501, + "learning_rate": 0.00023197087327663107, + "loss": 0.79363585, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.31079102, + "step": 3585, + "time_per_iteration": 2.6550607681274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073164, + "balance_loss_mlp": 1.04259896, + "epoch": 0.6898807233551366, + "flos": 763584993792.0, + "grad_norm": 0.06125478015545225, + "language_loss": 0.80901551, + "learning_rate": 0.00023170792722614243, + "loss": 0.81974715, + "num_input_tokens_seen": 297338352, + "router_z_loss_mlp": 0.30541992, + "step": 3586, + "time_per_iteration": 2.9460513591766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071475, + "balance_loss_mlp": 1.04057574, + "epoch": 0.6900731050404002, + "flos": 583030310400.0, + "grad_norm": 0.05047941445610664, + "language_loss": 0.83664584, + "learning_rate": 0.00023144508533587377, + "loss": 0.84736061, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.30859375, + "step": 3587, + "time_per_iteration": 2.856055498123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073863, + "balance_loss_mlp": 1.04320216, + "epoch": 0.6902654867256637, + "flos": 711531495936.0, + "grad_norm": 0.06477764746614291, + "language_loss": 0.78527439, + "learning_rate": 0.0002311823477078698, + "loss": 0.796013, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.30615234, + "step": 3588, + "time_per_iteration": 3.003086805343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075947, + "balance_loss_mlp": 1.04569197, + "epoch": 0.6904578684109273, + "flos": 596816294400.0, + "grad_norm": 0.08587382139418309, + "language_loss": 0.8476119, + "learning_rate": 0.00023091971444413428, + "loss": 0.85837138, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.30224609, + "step": 3589, + "time_per_iteration": 2.81282114982605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080015, + "balance_loss_mlp": 1.04909205, + "epoch": 0.6906502500961909, + "flos": 584757411840.0, + "grad_norm": 0.06247314370450002, + "language_loss": 0.82250512, + "learning_rate": 0.00023065718564663012, + "loss": 0.83330524, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.30883789, + "step": 3590, + "time_per_iteration": 2.7536580562591553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038243, + "balance_loss_mlp": 1.02656031, + "epoch": 0.6908426317814544, + "flos": 1587001559040.0, + "grad_norm": 0.017663884765429294, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.74949831, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.11669922, + "step": 3591, + "time_per_iteration": 4.997710704803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076225, + "balance_loss_mlp": 1.04732895, + "epoch": 0.6910350134667179, + "flos": 500525554176.0, + "grad_norm": 0.06051074258589463, + "language_loss": 0.80712819, + "learning_rate": 0.0002301324418579666, + "loss": 0.81789041, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.28881836, + "step": 3592, + "time_per_iteration": 2.6742522716522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034728, + "balance_loss_mlp": 1.02309299, + "epoch": 0.6912273951519815, + "flos": 1408462138368.0, + "grad_norm": 0.018187638305653092, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.79723203, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.11621094, + "step": 3593, + "time_per_iteration": 4.769122123718262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077785, + "balance_loss_mlp": 1.04865015, + "epoch": 0.6914197768372451, + "flos": 634961562624.0, + "grad_norm": 0.06768771188848043, + "language_loss": 0.80975646, + "learning_rate": 0.00022960811715677415, + "loss": 0.82053435, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.29101562, + "step": 3594, + "time_per_iteration": 2.8826262950897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073934, + "balance_loss_mlp": 1.04472804, + "epoch": 0.6916121585225087, + "flos": 557755822080.0, + "grad_norm": 0.06319085560184597, + "language_loss": 0.81575662, + "learning_rate": 0.00022934611221845608, + "loss": 0.82649601, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.29150391, + "step": 3595, + "time_per_iteration": 2.8295226097106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076251, + "balance_loss_mlp": 1.04663992, + "epoch": 0.6918045402077723, + "flos": 528887508480.0, + "grad_norm": 0.06812021191327418, + "language_loss": 0.7816391, + "learning_rate": 0.00022908421235729609, + "loss": 0.79240167, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.29589844, + "step": 3596, + "time_per_iteration": 2.6967883110046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072978, + "balance_loss_mlp": 1.04343832, + "epoch": 0.6919969218930357, + "flos": 570083927040.0, + "grad_norm": 0.05588162703096273, + "language_loss": 0.85190284, + "learning_rate": 0.0002288224176749728, + "loss": 0.86263263, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.29492188, + "step": 3597, + "time_per_iteration": 2.640408515930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076769, + "balance_loss_mlp": 1.04775333, + "epoch": 0.6921893035782993, + "flos": 683006598144.0, + "grad_norm": 0.0641823490668264, + "language_loss": 0.78313982, + "learning_rate": 0.00022856072827312385, + "loss": 0.79390752, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.28979492, + "step": 3598, + "time_per_iteration": 2.840587854385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072445, + "balance_loss_mlp": 1.0432148, + "epoch": 0.6923816852635629, + "flos": 546484105728.0, + "grad_norm": 0.07324523845521881, + "language_loss": 0.76861233, + "learning_rate": 0.00022829914425334598, + "loss": 0.77933681, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.29223633, + "step": 3599, + "time_per_iteration": 2.6705574989318848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068561, + "balance_loss_mlp": 1.03871107, + "epoch": 0.6925740669488265, + "flos": 509782159872.0, + "grad_norm": 0.06707330247170458, + "language_loss": 0.80270433, + "learning_rate": 0.0002280376657171956, + "loss": 0.8133899, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.2980957, + "step": 3600, + "time_per_iteration": 2.691218852996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070739, + "balance_loss_mlp": 1.04091287, + "epoch": 0.69276644863409, + "flos": 869053764096.0, + "grad_norm": 0.05961595039117338, + "language_loss": 0.76559889, + "learning_rate": 0.00022777629276618706, + "loss": 0.77630627, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.2980957, + "step": 3601, + "time_per_iteration": 3.166266679763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073223, + "balance_loss_mlp": 1.0433017, + "epoch": 0.6929588303193536, + "flos": 625486758912.0, + "grad_norm": 0.05590734740319096, + "language_loss": 0.7759192, + "learning_rate": 0.0002275150255017947, + "loss": 0.78665143, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.29882812, + "step": 3602, + "time_per_iteration": 2.8251349925994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018234, + "balance_loss_mlp": 1.00593138, + "epoch": 0.6931512120046172, + "flos": 1544530553856.0, + "grad_norm": 0.021195340578823645, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76750904, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.12304688, + "step": 3603, + "time_per_iteration": 4.9793617725372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015265, + "balance_loss_mlp": 1.00286758, + "epoch": 0.6933435936898807, + "flos": 1447460001792.0, + "grad_norm": 0.02110962500083285, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.76142371, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.12353516, + "step": 3604, + "time_per_iteration": 4.700538873672485 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072674, + "balance_loss_mlp": 1.04251432, + "epoch": 0.6935359753751443, + "flos": 540639442944.0, + "grad_norm": 0.0788112373404933, + "language_loss": 0.8439424, + "learning_rate": 0.0002267318588424379, + "loss": 0.85466921, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.30151367, + "step": 3605, + "time_per_iteration": 2.595975160598755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067214, + "balance_loss_mlp": 1.03688753, + "epoch": 0.6937283570604078, + "flos": 719074146816.0, + "grad_norm": 0.060784014113104926, + "language_loss": 0.87543291, + "learning_rate": 0.00022647101533842845, + "loss": 0.88610506, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.30297852, + "step": 3606, + "time_per_iteration": 2.8924877643585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072933, + "balance_loss_mlp": 1.04255819, + "epoch": 0.6939207387456714, + "flos": 521909443584.0, + "grad_norm": 0.06196096561897257, + "language_loss": 0.76276547, + "learning_rate": 0.00022621027802778872, + "loss": 0.77349472, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.30322266, + "step": 3607, + "time_per_iteration": 2.625544309616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064019, + "balance_loss_mlp": 1.03402638, + "epoch": 0.694113120430935, + "flos": 535100318208.0, + "grad_norm": 0.05568531242453984, + "language_loss": 0.78539741, + "learning_rate": 0.00022594964701174586, + "loss": 0.79603761, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.29956055, + "step": 3608, + "time_per_iteration": 2.617882490158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073585, + "balance_loss_mlp": 1.04363918, + "epoch": 0.6943055021161986, + "flos": 523101072384.0, + "grad_norm": 0.06276821144872391, + "language_loss": 0.84534574, + "learning_rate": 0.00022568912239148586, + "loss": 0.8560816, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.29882812, + "step": 3609, + "time_per_iteration": 2.6177947521209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068336, + "balance_loss_mlp": 1.03836668, + "epoch": 0.694497883801462, + "flos": 484637119488.0, + "grad_norm": 0.056081647762310796, + "language_loss": 0.81555855, + "learning_rate": 0.00022542870426815344, + "loss": 0.82624191, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.29907227, + "step": 3610, + "time_per_iteration": 2.7079262733459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065817, + "balance_loss_mlp": 1.03646755, + "epoch": 0.6946902654867256, + "flos": 461238119424.0, + "grad_norm": 0.0593152321810988, + "language_loss": 0.85921854, + "learning_rate": 0.00022516839274285173, + "loss": 0.86987674, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.29321289, + "step": 3611, + "time_per_iteration": 2.5142312049865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068225, + "balance_loss_mlp": 1.03689671, + "epoch": 0.6948826471719892, + "flos": 512603375616.0, + "grad_norm": 0.07495855617451591, + "language_loss": 0.75130123, + "learning_rate": 0.00022490818791664265, + "loss": 0.76198351, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.31298828, + "step": 3612, + "time_per_iteration": 2.6149849891662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067927, + "balance_loss_mlp": 1.03771973, + "epoch": 0.6950750288572528, + "flos": 556917783552.0, + "grad_norm": 0.05072032327743767, + "language_loss": 0.85225737, + "learning_rate": 0.00022464808989054676, + "loss": 0.86293662, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.30151367, + "step": 3613, + "time_per_iteration": 2.6458423137664795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062852, + "balance_loss_mlp": 1.03331208, + "epoch": 0.6952674105425164, + "flos": 542215185408.0, + "grad_norm": 0.07224132209133893, + "language_loss": 0.76020145, + "learning_rate": 0.00022438809876554284, + "loss": 0.77082992, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.29516602, + "step": 3614, + "time_per_iteration": 2.6633236408233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106639, + "balance_loss_mlp": 1.03720808, + "epoch": 0.6954597922277799, + "flos": 546465166848.0, + "grad_norm": 0.05675110425477687, + "language_loss": 0.80015868, + "learning_rate": 0.00022412821464256873, + "loss": 0.81082261, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.29174805, + "step": 3615, + "time_per_iteration": 2.726789712905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063431, + "balance_loss_mlp": 1.03396273, + "epoch": 0.6956521739130435, + "flos": 519255553536.0, + "grad_norm": 0.06271109335257424, + "language_loss": 0.82397133, + "learning_rate": 0.00022386843762252023, + "loss": 0.83460569, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.29418945, + "step": 3616, + "time_per_iteration": 2.6123175621032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106886, + "balance_loss_mlp": 1.03781807, + "epoch": 0.695844555598307, + "flos": 466029365760.0, + "grad_norm": 0.06387852157141136, + "language_loss": 0.79405069, + "learning_rate": 0.00022360876780625193, + "loss": 0.8047393, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.31030273, + "step": 3617, + "time_per_iteration": 2.548015832901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067668, + "balance_loss_mlp": 1.03798532, + "epoch": 0.6960369372835706, + "flos": 600347510784.0, + "grad_norm": 0.0476690799196669, + "language_loss": 0.7988438, + "learning_rate": 0.00022334920529457604, + "loss": 0.80952054, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.296875, + "step": 3618, + "time_per_iteration": 2.899250030517578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066336, + "balance_loss_mlp": 1.0357945, + "epoch": 0.6962293189688342, + "flos": 643927186944.0, + "grad_norm": 0.054798101167174096, + "language_loss": 0.87429041, + "learning_rate": 0.00022308975018826423, + "loss": 0.88495374, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.30517578, + "step": 3619, + "time_per_iteration": 2.96332049369812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070043, + "balance_loss_mlp": 1.04016924, + "epoch": 0.6964217006540977, + "flos": 638524864512.0, + "grad_norm": 0.06421164682139191, + "language_loss": 0.85025704, + "learning_rate": 0.00022283040258804564, + "loss": 0.86095744, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.29858398, + "step": 3620, + "time_per_iteration": 2.7818944454193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067101, + "balance_loss_mlp": 1.03703606, + "epoch": 0.6966140823393613, + "flos": 651864125952.0, + "grad_norm": 0.06644285191513807, + "language_loss": 0.83246511, + "learning_rate": 0.00022257116259460802, + "loss": 0.84313607, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.30004883, + "step": 3621, + "time_per_iteration": 2.870532989501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068386, + "balance_loss_mlp": 1.03901291, + "epoch": 0.6968064640246249, + "flos": 704160552960.0, + "grad_norm": 0.06921875901681852, + "language_loss": 0.81326395, + "learning_rate": 0.00022231203030859725, + "loss": 0.82394779, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.29321289, + "step": 3622, + "time_per_iteration": 2.980616807937622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069183, + "balance_loss_mlp": 1.03923714, + "epoch": 0.6969988457098885, + "flos": 492312190464.0, + "grad_norm": 0.06079999883636956, + "language_loss": 0.83173907, + "learning_rate": 0.00022205300583061737, + "loss": 0.84243095, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.29882812, + "step": 3623, + "time_per_iteration": 2.579345226287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040478, + "balance_loss_mlp": 1.02855718, + "epoch": 0.6971912273951519, + "flos": 1351839974400.0, + "grad_norm": 0.01990235236243219, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83878684, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.11914062, + "step": 3624, + "time_per_iteration": 4.92698335647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106745, + "balance_loss_mlp": 1.03705204, + "epoch": 0.6973836090804155, + "flos": 602182301184.0, + "grad_norm": 0.06709425474580019, + "language_loss": 0.77051836, + "learning_rate": 0.00022153528070095735, + "loss": 0.7811929, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.3034668, + "step": 3625, + "time_per_iteration": 2.732236385345459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072165, + "balance_loss_mlp": 1.04262519, + "epoch": 0.6975759907656791, + "flos": 523805280768.0, + "grad_norm": 0.06819853082306866, + "language_loss": 0.88156587, + "learning_rate": 0.00022127658025027568, + "loss": 0.89228755, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.29516602, + "step": 3626, + "time_per_iteration": 2.6894659996032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072819, + "balance_loss_mlp": 1.04275477, + "epoch": 0.6977683724509427, + "flos": 480672327168.0, + "grad_norm": 0.06462671043275556, + "language_loss": 0.84997016, + "learning_rate": 0.00022101798800962258, + "loss": 0.8606984, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.30004883, + "step": 3627, + "time_per_iteration": 2.578765392303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067981, + "balance_loss_mlp": 1.03732049, + "epoch": 0.6979607541362063, + "flos": 522372132864.0, + "grad_norm": 0.07388726632037217, + "language_loss": 0.7899543, + "learning_rate": 0.00022075950407939227, + "loss": 0.80063409, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.30639648, + "step": 3628, + "time_per_iteration": 2.615227699279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074402, + "balance_loss_mlp": 1.04519582, + "epoch": 0.6981531358214698, + "flos": 547818329088.0, + "grad_norm": 0.07136749331855524, + "language_loss": 0.82724559, + "learning_rate": 0.0002205011285599367, + "loss": 0.83798957, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.29150391, + "step": 3629, + "time_per_iteration": 2.623537063598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068631, + "balance_loss_mlp": 1.0383997, + "epoch": 0.6983455175067333, + "flos": 699747628032.0, + "grad_norm": 0.053682643938984226, + "language_loss": 0.80428958, + "learning_rate": 0.00022024286155156658, + "loss": 0.81497598, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.30224609, + "step": 3630, + "time_per_iteration": 2.8577961921691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074555, + "balance_loss_mlp": 1.04472852, + "epoch": 0.6985378991919969, + "flos": 484819001856.0, + "grad_norm": 0.05341661710184385, + "language_loss": 0.85616398, + "learning_rate": 0.00021998470315454994, + "loss": 0.8669095, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.2980957, + "step": 3631, + "time_per_iteration": 2.6452653408050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068304, + "balance_loss_mlp": 1.03902662, + "epoch": 0.6987302808772605, + "flos": 558503700480.0, + "grad_norm": 0.06182978984642289, + "language_loss": 0.86509019, + "learning_rate": 0.00021972665346911275, + "loss": 0.87577331, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.29296875, + "step": 3632, + "time_per_iteration": 2.7207632064819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073072, + "balance_loss_mlp": 1.04400849, + "epoch": 0.698922662562524, + "flos": 483350948352.0, + "grad_norm": 0.05617398494873169, + "language_loss": 0.79707497, + "learning_rate": 0.00021946871259543877, + "loss": 0.80780566, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.29052734, + "step": 3633, + "time_per_iteration": 2.574397325515747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073801, + "balance_loss_mlp": 1.04488051, + "epoch": 0.6991150442477876, + "flos": 718586726400.0, + "grad_norm": 0.05654795894092567, + "language_loss": 0.83115089, + "learning_rate": 0.00021921088063366957, + "loss": 0.8418889, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.28930664, + "step": 3634, + "time_per_iteration": 2.9441816806793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073179, + "balance_loss_mlp": 1.04452109, + "epoch": 0.6993074259330512, + "flos": 488871134208.0, + "grad_norm": 0.05955924970323312, + "language_loss": 0.8162455, + "learning_rate": 0.00021895315768390435, + "loss": 0.82697725, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.28662109, + "step": 3635, + "time_per_iteration": 2.62445068359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107784, + "balance_loss_mlp": 1.04932475, + "epoch": 0.6994998076183148, + "flos": 717745715712.0, + "grad_norm": 0.054016227636185014, + "language_loss": 0.88036686, + "learning_rate": 0.00021869554384619999, + "loss": 0.89114523, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.28491211, + "step": 3636, + "time_per_iteration": 3.0029518604278564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107865, + "balance_loss_mlp": 1.05037308, + "epoch": 0.6996921893035783, + "flos": 578730866688.0, + "grad_norm": 0.06391776997203466, + "language_loss": 0.80659258, + "learning_rate": 0.00021843803922057115, + "loss": 0.81737912, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 0.28295898, + "step": 3637, + "time_per_iteration": 2.7211790084838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107883, + "balance_loss_mlp": 1.05110145, + "epoch": 0.6998845709888418, + "flos": 518369462784.0, + "grad_norm": 0.0662212795858457, + "language_loss": 0.81642038, + "learning_rate": 0.00021818064390698977, + "loss": 0.82720864, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.27758789, + "step": 3638, + "time_per_iteration": 2.5884149074554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081934, + "balance_loss_mlp": 1.05303788, + "epoch": 0.7000769526741054, + "flos": 620666399232.0, + "grad_norm": 0.06374773426861974, + "language_loss": 0.86868232, + "learning_rate": 0.0002179233580053861, + "loss": 0.8795017, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.2890625, + "step": 3639, + "time_per_iteration": 2.753732681274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076492, + "balance_loss_mlp": 1.04776227, + "epoch": 0.700269334359369, + "flos": 559670598144.0, + "grad_norm": 0.059265612347706345, + "language_loss": 0.85829276, + "learning_rate": 0.00021766618161564688, + "loss": 0.86905766, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.28710938, + "step": 3640, + "time_per_iteration": 2.7745206356048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084226, + "balance_loss_mlp": 1.05575871, + "epoch": 0.7004617160446326, + "flos": 483090490368.0, + "grad_norm": 0.15690200420977896, + "language_loss": 0.87115562, + "learning_rate": 0.00021740911483761677, + "loss": 0.88199788, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.28417969, + "step": 3641, + "time_per_iteration": 2.563645362854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080559, + "balance_loss_mlp": 1.05292678, + "epoch": 0.7006540977298961, + "flos": 696647015424.0, + "grad_norm": 0.051778810892446146, + "language_loss": 0.92034602, + "learning_rate": 0.00021715215777109837, + "loss": 0.93115163, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.27685547, + "step": 3642, + "time_per_iteration": 2.9448609352111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082689, + "balance_loss_mlp": 1.05481815, + "epoch": 0.7008464794151597, + "flos": 504528224256.0, + "grad_norm": 0.0649670876424198, + "language_loss": 0.84332794, + "learning_rate": 0.00021689531051585103, + "loss": 0.85415483, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.27905273, + "step": 3643, + "time_per_iteration": 2.5947420597076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080325, + "balance_loss_mlp": 1.05185759, + "epoch": 0.7010388611004232, + "flos": 536985980928.0, + "grad_norm": 0.05881899099988506, + "language_loss": 0.80633974, + "learning_rate": 0.00021663857317159196, + "loss": 0.81714302, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.28466797, + "step": 3644, + "time_per_iteration": 2.6077582836151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084664, + "balance_loss_mlp": 1.0568645, + "epoch": 0.7012312427856868, + "flos": 546996257280.0, + "grad_norm": 0.05176536936587348, + "language_loss": 0.81858003, + "learning_rate": 0.00021638194583799487, + "loss": 0.82942665, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.27832031, + "step": 3645, + "time_per_iteration": 2.661813735961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081277, + "balance_loss_mlp": 1.05335796, + "epoch": 0.7014236244709504, + "flos": 941020125696.0, + "grad_norm": 0.06125341159179279, + "language_loss": 0.82837009, + "learning_rate": 0.00021612542861469176, + "loss": 0.83918285, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.27954102, + "step": 3646, + "time_per_iteration": 3.218862771987915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086908, + "balance_loss_mlp": 1.05860782, + "epoch": 0.7016160061562139, + "flos": 524908159488.0, + "grad_norm": 0.06205257588419687, + "language_loss": 0.82430637, + "learning_rate": 0.00021586902160127135, + "loss": 0.83517551, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.28271484, + "step": 3647, + "time_per_iteration": 2.5945966243743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087682, + "balance_loss_mlp": 1.05938208, + "epoch": 0.7018083878414775, + "flos": 373170917376.0, + "grad_norm": 0.07384041678105348, + "language_loss": 0.74226022, + "learning_rate": 0.00021561272489727974, + "loss": 0.75313699, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.28320312, + "step": 3648, + "time_per_iteration": 2.423347234725952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090091, + "balance_loss_mlp": 1.06241107, + "epoch": 0.7020007695267411, + "flos": 527522761728.0, + "grad_norm": 0.0540045704658738, + "language_loss": 0.80522048, + "learning_rate": 0.0002153565386022199, + "loss": 0.8161214, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.27734375, + "step": 3649, + "time_per_iteration": 2.634904623031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089135, + "balance_loss_mlp": 1.06112039, + "epoch": 0.7021931512120047, + "flos": 689850832896.0, + "grad_norm": 0.1599503630973746, + "language_loss": 0.8250525, + "learning_rate": 0.00021510046281555262, + "loss": 0.83594382, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.28027344, + "step": 3650, + "time_per_iteration": 2.824385643005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087214, + "balance_loss_mlp": 1.05922353, + "epoch": 0.7023855328972681, + "flos": 639499705344.0, + "grad_norm": 0.06982952600277435, + "language_loss": 0.81099337, + "learning_rate": 0.0002148444976366949, + "loss": 0.82186544, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 0.27978516, + "step": 3651, + "time_per_iteration": 2.7480077743530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090659, + "balance_loss_mlp": 1.06297851, + "epoch": 0.7025779145825317, + "flos": 560674552320.0, + "grad_norm": 0.06340286287585739, + "language_loss": 0.82626015, + "learning_rate": 0.00021458864316502136, + "loss": 0.83716673, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.27734375, + "step": 3652, + "time_per_iteration": 2.699397087097168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085576, + "balance_loss_mlp": 1.0581578, + "epoch": 0.7027702962677953, + "flos": 447214998528.0, + "grad_norm": 0.06356802688225487, + "language_loss": 0.87087834, + "learning_rate": 0.0002143328994998634, + "loss": 0.88173407, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.2746582, + "step": 3653, + "time_per_iteration": 2.4910500049591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108223, + "balance_loss_mlp": 1.05347681, + "epoch": 0.7029626779530589, + "flos": 622198471680.0, + "grad_norm": 0.1133092603860293, + "language_loss": 0.78451055, + "learning_rate": 0.00021407726674050982, + "loss": 0.79533285, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.28735352, + "step": 3654, + "time_per_iteration": 2.8789288997650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086422, + "balance_loss_mlp": 1.0578599, + "epoch": 0.7031550596383225, + "flos": 629307546624.0, + "grad_norm": 0.054147023301355804, + "language_loss": 0.86789209, + "learning_rate": 0.0002138217449862061, + "loss": 0.87875628, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.28540039, + "step": 3655, + "time_per_iteration": 2.7385337352752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108677, + "balance_loss_mlp": 1.05932784, + "epoch": 0.703347441323586, + "flos": 530589878784.0, + "grad_norm": 0.06738898601128132, + "language_loss": 0.78017962, + "learning_rate": 0.00021356633433615403, + "loss": 0.79104733, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.2746582, + "step": 3656, + "time_per_iteration": 2.5828328132629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086039, + "balance_loss_mlp": 1.05778599, + "epoch": 0.7035398230088495, + "flos": 693264185856.0, + "grad_norm": 0.05385272242156959, + "language_loss": 0.83434522, + "learning_rate": 0.0002133110348895133, + "loss": 0.84520566, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.28271484, + "step": 3657, + "time_per_iteration": 2.978156805038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081393, + "balance_loss_mlp": 1.05316448, + "epoch": 0.7037322046941131, + "flos": 967628837376.0, + "grad_norm": 0.05837559854624073, + "language_loss": 0.84898746, + "learning_rate": 0.0002130558467453999, + "loss": 0.85980141, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.28198242, + "step": 3658, + "time_per_iteration": 3.3442087173461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087911, + "balance_loss_mlp": 1.05875289, + "epoch": 0.7039245863793767, + "flos": 502598891520.0, + "grad_norm": 0.19942638133943547, + "language_loss": 0.84606349, + "learning_rate": 0.0002128007700028865, + "loss": 0.85694265, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.29125977, + "step": 3659, + "time_per_iteration": 2.742828607559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088765, + "balance_loss_mlp": 1.06072712, + "epoch": 0.7041169680646402, + "flos": 465709271040.0, + "grad_norm": 0.06314927243304276, + "language_loss": 0.84402716, + "learning_rate": 0.00021254580476100276, + "loss": 0.85491478, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.28051758, + "step": 3660, + "time_per_iteration": 2.565272569656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087079, + "balance_loss_mlp": 1.0595659, + "epoch": 0.7043093497499038, + "flos": 631897417728.0, + "grad_norm": 0.06296941062799823, + "language_loss": 0.78639442, + "learning_rate": 0.00021229095111873497, + "loss": 0.79726517, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.27539062, + "step": 3661, + "time_per_iteration": 2.842556953430176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088789, + "balance_loss_mlp": 1.06072736, + "epoch": 0.7045017314351674, + "flos": 542639996928.0, + "grad_norm": 0.05444300541547984, + "language_loss": 0.86236918, + "learning_rate": 0.0002120362091750261, + "loss": 0.87325704, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.28100586, + "step": 3662, + "time_per_iteration": 2.810499668121338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082914, + "balance_loss_mlp": 1.05518591, + "epoch": 0.704694113120431, + "flos": 428012135424.0, + "grad_norm": 0.0593931077751887, + "language_loss": 0.86978149, + "learning_rate": 0.00021178157902877566, + "loss": 0.88061064, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.27758789, + "step": 3663, + "time_per_iteration": 2.4574224948883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092262, + "balance_loss_mlp": 1.06415284, + "epoch": 0.7048864948056945, + "flos": 650253477888.0, + "grad_norm": 0.0751363020635885, + "language_loss": 0.86745709, + "learning_rate": 0.0002115270607788397, + "loss": 0.87837976, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.28125, + "step": 3664, + "time_per_iteration": 2.7495899200439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087732, + "balance_loss_mlp": 1.05981338, + "epoch": 0.705078876490958, + "flos": 412330314240.0, + "grad_norm": 0.07034018625942835, + "language_loss": 0.85685182, + "learning_rate": 0.00021127265452403133, + "loss": 0.86772919, + "num_input_tokens_seen": 303856336, + "router_z_loss_mlp": 0.27954102, + "step": 3665, + "time_per_iteration": 2.5029428005218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046453, + "balance_loss_mlp": 1.03472269, + "epoch": 0.7052712581762216, + "flos": 1419266783232.0, + "grad_norm": 0.01645523461712921, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85138083, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.1171875, + "step": 3666, + "time_per_iteration": 4.882653474807739 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084641, + "balance_loss_mlp": 1.05729461, + "epoch": 0.7054636398614852, + "flos": 492795228672.0, + "grad_norm": 0.05492799595906871, + "language_loss": 0.82834661, + "learning_rate": 0.00021076417839483065, + "loss": 0.83919299, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.27392578, + "step": 3667, + "time_per_iteration": 2.8046011924743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084673, + "balance_loss_mlp": 1.05622983, + "epoch": 0.7056560215467488, + "flos": 450228271104.0, + "grad_norm": 0.057239687513416834, + "language_loss": 0.84952044, + "learning_rate": 0.00021051010871784589, + "loss": 0.86036718, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.28442383, + "step": 3668, + "time_per_iteration": 2.547053098678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084529, + "balance_loss_mlp": 1.05634761, + "epoch": 0.7058484032320124, + "flos": 565426510848.0, + "grad_norm": 0.050223334888513216, + "language_loss": 0.78893518, + "learning_rate": 0.0002102561514308045, + "loss": 0.79978049, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.28173828, + "step": 3669, + "time_per_iteration": 2.752600908279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081831, + "balance_loss_mlp": 1.05446088, + "epoch": 0.7060407849172758, + "flos": 566736003072.0, + "grad_norm": 0.06177474978046869, + "language_loss": 0.82231724, + "learning_rate": 0.00021000230663230135, + "loss": 0.8331356, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.27441406, + "step": 3670, + "time_per_iteration": 2.7295479774475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107972, + "balance_loss_mlp": 1.05213535, + "epoch": 0.7062331666025394, + "flos": 468505755648.0, + "grad_norm": 0.06597526409708185, + "language_loss": 0.82935393, + "learning_rate": 0.00020974857442088762, + "loss": 0.84015119, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.27612305, + "step": 3671, + "time_per_iteration": 2.6223764419555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087916, + "balance_loss_mlp": 1.05999768, + "epoch": 0.706425548287803, + "flos": 595042702848.0, + "grad_norm": 0.061832347037407955, + "language_loss": 0.88995802, + "learning_rate": 0.00020949495489507104, + "loss": 0.90083718, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.27954102, + "step": 3672, + "time_per_iteration": 2.6759605407714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084185, + "balance_loss_mlp": 1.0569576, + "epoch": 0.7066179299730666, + "flos": 475566778368.0, + "grad_norm": 0.08160392795168159, + "language_loss": 0.84611428, + "learning_rate": 0.00020924144815331525, + "loss": 0.85695612, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.27270508, + "step": 3673, + "time_per_iteration": 2.5533270835876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087334, + "balance_loss_mlp": 1.05991554, + "epoch": 0.7068103116583301, + "flos": 506153428992.0, + "grad_norm": 0.06771134911837604, + "language_loss": 0.8321439, + "learning_rate": 0.00020898805429404044, + "loss": 0.84301728, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.2746582, + "step": 3674, + "time_per_iteration": 2.6267168521881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086456, + "balance_loss_mlp": 1.05860853, + "epoch": 0.7070026933435937, + "flos": 679028659200.0, + "grad_norm": 0.074333129961205, + "language_loss": 0.78350407, + "learning_rate": 0.0002087347734156228, + "loss": 0.79436862, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.27880859, + "step": 3675, + "time_per_iteration": 2.879998207092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081334, + "balance_loss_mlp": 1.05415416, + "epoch": 0.7071950750288573, + "flos": 471981717504.0, + "grad_norm": 0.05100324832046891, + "language_loss": 0.79745239, + "learning_rate": 0.00020848160561639452, + "loss": 0.80826575, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.2722168, + "step": 3676, + "time_per_iteration": 2.6603164672851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084996, + "balance_loss_mlp": 1.05733955, + "epoch": 0.7073874567141208, + "flos": 473507997696.0, + "grad_norm": 0.054459225189570165, + "language_loss": 0.85905212, + "learning_rate": 0.0002082285509946445, + "loss": 0.86990213, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.27685547, + "step": 3677, + "time_per_iteration": 2.553056240081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084321, + "balance_loss_mlp": 1.05664098, + "epoch": 0.7075798383993844, + "flos": 545589250560.0, + "grad_norm": 0.062290106460759526, + "language_loss": 0.83324182, + "learning_rate": 0.00020797560964861683, + "loss": 0.84408498, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.27709961, + "step": 3678, + "time_per_iteration": 2.792145013809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087022, + "balance_loss_mlp": 1.05907917, + "epoch": 0.7077722200846479, + "flos": 661766713344.0, + "grad_norm": 0.06608494347958908, + "language_loss": 0.806409, + "learning_rate": 0.0002077227816765122, + "loss": 0.81727922, + "num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.27954102, + "step": 3679, + "time_per_iteration": 4.414989709854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047559, + "balance_loss_mlp": 1.03525627, + "epoch": 0.7079646017699115, + "flos": 1529128129536.0, + "grad_norm": 0.01304969035368713, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.77495277, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.12255859, + "step": 3680, + "time_per_iteration": 4.77666163444519 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082723, + "balance_loss_mlp": 1.05544841, + "epoch": 0.7081569834551751, + "flos": 621217838592.0, + "grad_norm": 0.07037612396181211, + "language_loss": 0.7852788, + "learning_rate": 0.00020721746624665383, + "loss": 0.7961061, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.27319336, + "step": 3681, + "time_per_iteration": 2.7164971828460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081164, + "balance_loss_mlp": 1.05338836, + "epoch": 0.7083493651404387, + "flos": 794280743424.0, + "grad_norm": 0.047491060798417466, + "language_loss": 0.80214369, + "learning_rate": 0.00020696497898508114, + "loss": 0.81295532, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.27807617, + "step": 3682, + "time_per_iteration": 3.0300755500793457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089218, + "balance_loss_mlp": 1.06165683, + "epoch": 0.7085417468257021, + "flos": 813394856448.0, + "grad_norm": 0.37225594130432843, + "language_loss": 0.77676904, + "learning_rate": 0.00020671260548979316, + "loss": 0.78766119, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.27587891, + "step": 3683, + "time_per_iteration": 3.0000338554382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108452, + "balance_loss_mlp": 1.05715001, + "epoch": 0.7087341285109657, + "flos": 700259779584.0, + "grad_norm": 0.05966278900445413, + "language_loss": 0.84945965, + "learning_rate": 0.00020646034585876982, + "loss": 0.86030483, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.27441406, + "step": 3684, + "time_per_iteration": 2.8507392406463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080546, + "balance_loss_mlp": 1.05243671, + "epoch": 0.7089265101962293, + "flos": 596211010560.0, + "grad_norm": 0.050873107987967195, + "language_loss": 0.84335744, + "learning_rate": 0.00020620820018994718, + "loss": 0.85416293, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.28125, + "step": 3685, + "time_per_iteration": 2.8229713439941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082628, + "balance_loss_mlp": 1.05385077, + "epoch": 0.7091188918814929, + "flos": 486842876928.0, + "grad_norm": 0.07162313361599233, + "language_loss": 0.82926023, + "learning_rate": 0.00020595616858121675, + "loss": 0.84008658, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.2878418, + "step": 3686, + "time_per_iteration": 2.694638967514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079578, + "balance_loss_mlp": 1.05158722, + "epoch": 0.7093112735667565, + "flos": 599833949184.0, + "grad_norm": 0.06190114046391337, + "language_loss": 0.80535042, + "learning_rate": 0.00020570425113042586, + "loss": 0.81614614, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.28027344, + "step": 3687, + "time_per_iteration": 2.7041516304016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079239, + "balance_loss_mlp": 1.05074835, + "epoch": 0.70950365525202, + "flos": 505577258496.0, + "grad_norm": 0.06733246833768769, + "language_loss": 0.85552853, + "learning_rate": 0.0002054524479353776, + "loss": 0.86632097, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.28540039, + "step": 3688, + "time_per_iteration": 2.6622695922851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079477, + "balance_loss_mlp": 1.05122447, + "epoch": 0.7096960369372836, + "flos": 731846002176.0, + "grad_norm": 0.09171480616774523, + "language_loss": 0.81669426, + "learning_rate": 0.00020520075909383063, + "loss": 0.82748902, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.28271484, + "step": 3689, + "time_per_iteration": 2.885802745819092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085524, + "balance_loss_mlp": 1.05684257, + "epoch": 0.7098884186225471, + "flos": 971685351936.0, + "grad_norm": 0.058367776122323904, + "language_loss": 0.80585086, + "learning_rate": 0.00020494918470349916, + "loss": 0.81670618, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.28662109, + "step": 3690, + "time_per_iteration": 3.297044038772583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078519, + "balance_loss_mlp": 1.05038536, + "epoch": 0.7100808003078107, + "flos": 504001516032.0, + "grad_norm": 0.0682429606540151, + "language_loss": 0.85554057, + "learning_rate": 0.00020469772486205297, + "loss": 0.8663258, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.28149414, + "step": 3691, + "time_per_iteration": 2.602031707763672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082342, + "balance_loss_mlp": 1.05354142, + "epoch": 0.7102731819930742, + "flos": 540073446912.0, + "grad_norm": 0.05487079427914329, + "language_loss": 0.81415904, + "learning_rate": 0.0002044463796671177, + "loss": 0.82498252, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.2878418, + "step": 3692, + "time_per_iteration": 2.665280342102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086192, + "balance_loss_mlp": 1.05724823, + "epoch": 0.7104655636783378, + "flos": 620066907648.0, + "grad_norm": 0.06500857460791332, + "language_loss": 0.80369031, + "learning_rate": 0.00020419514921627408, + "loss": 0.81455219, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.28930664, + "step": 3693, + "time_per_iteration": 2.83823299407959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080501, + "balance_loss_mlp": 1.05251122, + "epoch": 0.7106579453636014, + "flos": 557060378112.0, + "grad_norm": 0.05808556039270617, + "language_loss": 0.77408904, + "learning_rate": 0.00020394403360705855, + "loss": 0.78489405, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.2800293, + "step": 3694, + "time_per_iteration": 2.6939644813537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085807, + "balance_loss_mlp": 1.05569434, + "epoch": 0.710850327048865, + "flos": 512795432448.0, + "grad_norm": 0.06287788377881579, + "language_loss": 0.87703514, + "learning_rate": 0.00020369303293696228, + "loss": 0.88789326, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.30078125, + "step": 3695, + "time_per_iteration": 2.588268995285034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083208, + "balance_loss_mlp": 1.05474114, + "epoch": 0.7110427087341286, + "flos": 423398389248.0, + "grad_norm": 0.06448607356035771, + "language_loss": 0.78199911, + "learning_rate": 0.00020344214730343304, + "loss": 0.79283124, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.28466797, + "step": 3696, + "time_per_iteration": 2.6181139945983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073393, + "balance_loss_mlp": 1.04511678, + "epoch": 0.711235090419392, + "flos": 577107072000.0, + "grad_norm": 0.05437568169477665, + "language_loss": 0.79383552, + "learning_rate": 0.00020319137680387296, + "loss": 0.80456948, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.28271484, + "step": 3697, + "time_per_iteration": 2.925847291946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077248, + "balance_loss_mlp": 1.04844677, + "epoch": 0.7114274721046556, + "flos": 447830456832.0, + "grad_norm": 0.07105325547979466, + "language_loss": 0.80237764, + "learning_rate": 0.0002029407215356398, + "loss": 0.81315017, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.28808594, + "step": 3698, + "time_per_iteration": 3.9760594367980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077498, + "balance_loss_mlp": 1.04829144, + "epoch": 0.7116198537899192, + "flos": 621680527872.0, + "grad_norm": 0.06046542117195041, + "language_loss": 0.82863748, + "learning_rate": 0.00020269018159604663, + "loss": 0.83941245, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.29150391, + "step": 3699, + "time_per_iteration": 2.704861640930176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071741, + "balance_loss_mlp": 1.04336905, + "epoch": 0.7118122354751828, + "flos": 498476947968.0, + "grad_norm": 0.053095463302870675, + "language_loss": 0.818941, + "learning_rate": 0.00020243975708236162, + "loss": 0.82965839, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.28393555, + "step": 3700, + "time_per_iteration": 2.6019287109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010692, + "balance_loss_mlp": 1.0402801, + "epoch": 0.7120046171604463, + "flos": 572438071296.0, + "grad_norm": 0.06895358170102628, + "language_loss": 0.86096191, + "learning_rate": 0.00020218944809180818, + "loss": 0.87165391, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.2890625, + "step": 3701, + "time_per_iteration": 2.69789719581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066667, + "balance_loss_mlp": 1.0383426, + "epoch": 0.7121969988457099, + "flos": 572388609024.0, + "grad_norm": 0.048938239682891294, + "language_loss": 0.84783876, + "learning_rate": 0.00020193925472156493, + "loss": 0.85850537, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.28320312, + "step": 3702, + "time_per_iteration": 2.7333877086639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105286, + "balance_loss_mlp": 1.04036713, + "epoch": 0.7123893805309734, + "flos": 1522585050624.0, + "grad_norm": 0.026752885046143426, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.75342035, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.125, + "step": 3703, + "time_per_iteration": 4.899750232696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066879, + "balance_loss_mlp": 1.0373385, + "epoch": 0.712581762216237, + "flos": 614779476480.0, + "grad_norm": 0.05613195068078556, + "language_loss": 0.83530253, + "learning_rate": 0.00020143921523049863, + "loss": 0.84597135, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.29467773, + "step": 3704, + "time_per_iteration": 2.9570298194885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067522, + "balance_loss_mlp": 1.03860188, + "epoch": 0.7127741439015006, + "flos": 597504536064.0, + "grad_norm": 0.05853421015843179, + "language_loss": 0.83969504, + "learning_rate": 0.00020118936930380837, + "loss": 0.85037029, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.2890625, + "step": 3705, + "time_per_iteration": 2.750566005706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068067, + "balance_loss_mlp": 1.03876543, + "epoch": 0.7129665255867641, + "flos": 537138749952.0, + "grad_norm": 0.07045372312262692, + "language_loss": 0.80809951, + "learning_rate": 0.0002009396393856932, + "loss": 0.81878018, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.29272461, + "step": 3706, + "time_per_iteration": 2.6755757331848145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106429, + "balance_loss_mlp": 1.03560829, + "epoch": 0.7131589072720277, + "flos": 526173981696.0, + "grad_norm": 0.06196520847148758, + "language_loss": 0.82349885, + "learning_rate": 0.00020069002557310673, + "loss": 0.83414185, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.28662109, + "step": 3707, + "time_per_iteration": 2.737092971801758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065669, + "balance_loss_mlp": 1.03734505, + "epoch": 0.7133512889572913, + "flos": 530626194432.0, + "grad_norm": 0.06289073454443639, + "language_loss": 0.77148253, + "learning_rate": 0.00020044052796295807, + "loss": 0.78213924, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.28320312, + "step": 3708, + "time_per_iteration": 2.858578681945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066902, + "balance_loss_mlp": 1.03783917, + "epoch": 0.7135436706425549, + "flos": 503282750976.0, + "grad_norm": 0.05709228954993964, + "language_loss": 0.8160665, + "learning_rate": 0.00020019114665211063, + "loss": 0.8267355, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.29052734, + "step": 3709, + "time_per_iteration": 2.6008872985839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070493, + "balance_loss_mlp": 1.04128647, + "epoch": 0.7137360523278183, + "flos": 515719954944.0, + "grad_norm": 0.05827837383265674, + "language_loss": 0.81244481, + "learning_rate": 0.00019994188173738276, + "loss": 0.82314974, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.29174805, + "step": 3710, + "time_per_iteration": 2.6042001247406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068317, + "balance_loss_mlp": 1.03861022, + "epoch": 0.7139284340130819, + "flos": 510103664640.0, + "grad_norm": 0.056315014070009634, + "language_loss": 0.80933827, + "learning_rate": 0.0001996927333155477, + "loss": 0.82002145, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.29663086, + "step": 3711, + "time_per_iteration": 2.748624086380005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010683, + "balance_loss_mlp": 1.03947508, + "epoch": 0.7141208156983455, + "flos": 889896388608.0, + "grad_norm": 0.061443099278046684, + "language_loss": 0.85405827, + "learning_rate": 0.00019944370148333346, + "loss": 0.86474121, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.2878418, + "step": 3712, + "time_per_iteration": 3.1557986736297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072206, + "balance_loss_mlp": 1.04316652, + "epoch": 0.7143131973836091, + "flos": 535504780800.0, + "grad_norm": 0.048833627959222234, + "language_loss": 0.79702485, + "learning_rate": 0.00019919478633742278, + "loss": 0.80774689, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.29052734, + "step": 3713, + "time_per_iteration": 2.667795419692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071876, + "balance_loss_mlp": 1.04252636, + "epoch": 0.7145055790688727, + "flos": 473429422080.0, + "grad_norm": 0.0703082286681538, + "language_loss": 0.85178196, + "learning_rate": 0.00019894598797445302, + "loss": 0.86250067, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.29345703, + "step": 3714, + "time_per_iteration": 2.5345022678375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107178, + "balance_loss_mlp": 1.04333699, + "epoch": 0.7146979607541362, + "flos": 570227931648.0, + "grad_norm": 0.05625862990353456, + "language_loss": 0.8199116, + "learning_rate": 0.00019869730649101615, + "loss": 0.83062935, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.28417969, + "step": 3715, + "time_per_iteration": 2.8149824142456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079135, + "balance_loss_mlp": 1.04988086, + "epoch": 0.7148903424393998, + "flos": 839299359744.0, + "grad_norm": 0.071816789410327, + "language_loss": 0.72405577, + "learning_rate": 0.00019844874198365943, + "loss": 0.73484713, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.29199219, + "step": 3716, + "time_per_iteration": 3.0852138996124268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069692, + "balance_loss_mlp": 1.04070067, + "epoch": 0.7150827241246633, + "flos": 541560439296.0, + "grad_norm": 0.05756859715120925, + "language_loss": 0.83796489, + "learning_rate": 0.00019820029454888362, + "loss": 0.84866184, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.28979492, + "step": 3717, + "time_per_iteration": 2.7309763431549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031662, + "balance_loss_mlp": 1.01916921, + "epoch": 0.7152751058099269, + "flos": 1582803859968.0, + "grad_norm": 0.017203742332568887, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.75552928, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.125, + "step": 3718, + "time_per_iteration": 5.044423580169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077558, + "balance_loss_mlp": 1.04777932, + "epoch": 0.7154674874951905, + "flos": 517167659520.0, + "grad_norm": 0.056277438983796696, + "language_loss": 0.79924434, + "learning_rate": 0.0001977037512828529, + "loss": 0.81001997, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.29760742, + "step": 3719, + "time_per_iteration": 2.5888805389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069135, + "balance_loss_mlp": 1.04059625, + "epoch": 0.715659869180454, + "flos": 602246320128.0, + "grad_norm": 0.0550224121073684, + "language_loss": 0.86091673, + "learning_rate": 0.0001974556556443734, + "loss": 0.87160814, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.28540039, + "step": 3720, + "time_per_iteration": 2.7241830825805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074341, + "balance_loss_mlp": 1.04575443, + "epoch": 0.7158522508657176, + "flos": 531403186176.0, + "grad_norm": 0.06173575943164377, + "language_loss": 0.88796955, + "learning_rate": 0.00019720767746402547, + "loss": 0.89871293, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.28564453, + "step": 3721, + "time_per_iteration": 2.721775770187378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075436, + "balance_loss_mlp": 1.04725444, + "epoch": 0.7160446325509812, + "flos": 557301897216.0, + "grad_norm": 0.08488248506445442, + "language_loss": 0.79925454, + "learning_rate": 0.00019695981683808222, + "loss": 0.81000888, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.28173828, + "step": 3722, + "time_per_iteration": 2.7333226203918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077529, + "balance_loss_mlp": 1.04989624, + "epoch": 0.7162370142362448, + "flos": 690664140288.0, + "grad_norm": 0.055390897958499746, + "language_loss": 0.85177088, + "learning_rate": 0.00019671207386277225, + "loss": 0.86254621, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.27636719, + "step": 3723, + "time_per_iteration": 2.924482583999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076762, + "balance_loss_mlp": 1.04800856, + "epoch": 0.7164293959215082, + "flos": 793772974080.0, + "grad_norm": 0.06210467424192018, + "language_loss": 0.78391171, + "learning_rate": 0.0001964644486342777, + "loss": 0.79467928, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.28735352, + "step": 3724, + "time_per_iteration": 2.958444833755493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077347, + "balance_loss_mlp": 1.04926085, + "epoch": 0.7166217776067718, + "flos": 493922838528.0, + "grad_norm": 0.0530875998345761, + "language_loss": 0.86440647, + "learning_rate": 0.00019621694124873524, + "loss": 0.87518001, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.28125, + "step": 3725, + "time_per_iteration": 2.6775362491607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035969, + "balance_loss_mlp": 1.02366674, + "epoch": 0.7168141592920354, + "flos": 1400337524736.0, + "grad_norm": 0.0197496536520254, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.77576053, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.12255859, + "step": 3726, + "time_per_iteration": 4.876794338226318 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079559, + "balance_loss_mlp": 1.05085373, + "epoch": 0.717006540977299, + "flos": 792789368832.0, + "grad_norm": 0.05459811074333738, + "language_loss": 0.77077997, + "learning_rate": 0.00019572228039082428, + "loss": 0.78157556, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.28686523, + "step": 3727, + "time_per_iteration": 3.094959020614624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078104, + "balance_loss_mlp": 1.04982781, + "epoch": 0.7171989226625626, + "flos": 554525761536.0, + "grad_norm": 0.05087577266454216, + "language_loss": 0.83556503, + "learning_rate": 0.0001954751271105002, + "loss": 0.84634602, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.28295898, + "step": 3728, + "time_per_iteration": 2.8009090423583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077296, + "balance_loss_mlp": 1.04956806, + "epoch": 0.717391304347826, + "flos": 555628640256.0, + "grad_norm": 0.058127871838067766, + "language_loss": 0.80794644, + "learning_rate": 0.00019522809205721687, + "loss": 0.81871945, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.27758789, + "step": 3729, + "time_per_iteration": 2.7567226886749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070359, + "balance_loss_mlp": 1.0422256, + "epoch": 0.7175836860330896, + "flos": 538582072320.0, + "grad_norm": 0.06552906350513053, + "language_loss": 0.82629025, + "learning_rate": 0.0001949811753268816, + "loss": 0.83699387, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 0.28149414, + "step": 3730, + "time_per_iteration": 2.7015092372894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074245, + "balance_loss_mlp": 1.04594445, + "epoch": 0.7177760677183532, + "flos": 515385303552.0, + "grad_norm": 0.0651237840260159, + "language_loss": 0.82088923, + "learning_rate": 0.00019473437701535634, + "loss": 0.83163166, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.28295898, + "step": 3731, + "time_per_iteration": 2.5865840911865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072269, + "balance_loss_mlp": 1.04425454, + "epoch": 0.7179684494036168, + "flos": 674414913024.0, + "grad_norm": 0.05867613657807477, + "language_loss": 0.89630008, + "learning_rate": 0.00019448769721845677, + "loss": 0.90702283, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.28051758, + "step": 3732, + "time_per_iteration": 2.800302743911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073645, + "balance_loss_mlp": 1.04503512, + "epoch": 0.7181608310888803, + "flos": 469672653312.0, + "grad_norm": 0.07249060183275255, + "language_loss": 0.85536152, + "learning_rate": 0.00019424113603195203, + "loss": 0.86609799, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.28662109, + "step": 3733, + "time_per_iteration": 2.5308837890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074406, + "balance_loss_mlp": 1.04589128, + "epoch": 0.7183532127741439, + "flos": 593645870592.0, + "grad_norm": 0.05588376049508018, + "language_loss": 0.80217636, + "learning_rate": 0.0001939946935515657, + "loss": 0.81292045, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.28515625, + "step": 3734, + "time_per_iteration": 2.8359925746917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077355, + "balance_loss_mlp": 1.04910207, + "epoch": 0.7185455944594075, + "flos": 498669004800.0, + "grad_norm": 0.0705810174200004, + "language_loss": 0.80242217, + "learning_rate": 0.0001937483698729755, + "loss": 0.81319571, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.28271484, + "step": 3735, + "time_per_iteration": 2.64072322845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108005, + "balance_loss_mlp": 1.05070114, + "epoch": 0.718737976144671, + "flos": 814590867456.0, + "grad_norm": 0.04976646958682061, + "language_loss": 0.81962895, + "learning_rate": 0.0001935021650918128, + "loss": 0.83042943, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.29321289, + "step": 3736, + "time_per_iteration": 3.0010826587677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010734, + "balance_loss_mlp": 1.04431319, + "epoch": 0.7189303578299346, + "flos": 438100987392.0, + "grad_norm": 0.062249035117782556, + "language_loss": 0.86910063, + "learning_rate": 0.0001932560793036625, + "loss": 0.87983465, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.29077148, + "step": 3737, + "time_per_iteration": 2.512890577316284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075707, + "balance_loss_mlp": 1.04766941, + "epoch": 0.7191227395151981, + "flos": 549137995776.0, + "grad_norm": 0.09579716691171304, + "language_loss": 0.86528683, + "learning_rate": 0.00019301011260406382, + "loss": 0.87604392, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.28051758, + "step": 3738, + "time_per_iteration": 2.624567985534668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077112, + "balance_loss_mlp": 1.04897833, + "epoch": 0.7193151212004617, + "flos": 626653656576.0, + "grad_norm": 0.050336885468814714, + "language_loss": 0.79622293, + "learning_rate": 0.00019276426508850936, + "loss": 0.80699408, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.28149414, + "step": 3739, + "time_per_iteration": 2.719663619995117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074558, + "balance_loss_mlp": 1.04597163, + "epoch": 0.7195075028857253, + "flos": 740719904256.0, + "grad_norm": 0.05223198929463843, + "language_loss": 0.80390334, + "learning_rate": 0.00019251853685244564, + "loss": 0.81464887, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.28564453, + "step": 3740, + "time_per_iteration": 3.006769895553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076457, + "balance_loss_mlp": 1.048967, + "epoch": 0.7196998845709889, + "flos": 802523220480.0, + "grad_norm": 0.08129460448533303, + "language_loss": 0.80554307, + "learning_rate": 0.00019227292799127283, + "loss": 0.81630766, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.27539062, + "step": 3741, + "time_per_iteration": 3.0326223373413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073379, + "balance_loss_mlp": 1.04560351, + "epoch": 0.7198922662562524, + "flos": 924786865152.0, + "grad_norm": 0.06791942956347788, + "language_loss": 0.78745782, + "learning_rate": 0.00019202743860034454, + "loss": 0.79819167, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.27807617, + "step": 3742, + "time_per_iteration": 3.2729034423828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072089, + "balance_loss_mlp": 1.04445601, + "epoch": 0.7200846479415159, + "flos": 579838127616.0, + "grad_norm": 0.05486250950239536, + "language_loss": 0.83459806, + "learning_rate": 0.00019178206877496873, + "loss": 0.84531891, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.27636719, + "step": 3743, + "time_per_iteration": 2.7013559341430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070767, + "balance_loss_mlp": 1.04291999, + "epoch": 0.7202770296267795, + "flos": 557410996224.0, + "grad_norm": 0.04899238240269426, + "language_loss": 0.84932864, + "learning_rate": 0.0001915368186104059, + "loss": 0.86003625, + "num_input_tokens_seen": 310479776, + "router_z_loss_mlp": 0.27880859, + "step": 3744, + "time_per_iteration": 2.726893663406372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073873, + "balance_loss_mlp": 1.04621649, + "epoch": 0.7204694113120431, + "flos": 672248443392.0, + "grad_norm": 0.06348773508617375, + "language_loss": 0.80724853, + "learning_rate": 0.0001912916882018706, + "loss": 0.81798726, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.27685547, + "step": 3745, + "time_per_iteration": 2.78125262260437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073398, + "balance_loss_mlp": 1.0459559, + "epoch": 0.7206617929973067, + "flos": 798845027328.0, + "grad_norm": 0.06464144105655711, + "language_loss": 0.79121184, + "learning_rate": 0.00019104667764453125, + "loss": 0.80194581, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.2746582, + "step": 3746, + "time_per_iteration": 3.033304214477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072935, + "balance_loss_mlp": 1.04549301, + "epoch": 0.7208541746825702, + "flos": 531638913024.0, + "grad_norm": 0.050415961986803856, + "language_loss": 0.80573905, + "learning_rate": 0.00019080178703350926, + "loss": 0.81646842, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.2746582, + "step": 3747, + "time_per_iteration": 2.6518349647521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074166, + "balance_loss_mlp": 1.0458895, + "epoch": 0.7210465563678338, + "flos": 534883530240.0, + "grad_norm": 0.07572692948457345, + "language_loss": 0.83004916, + "learning_rate": 0.00019055701646387952, + "loss": 0.84079087, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.28271484, + "step": 3748, + "time_per_iteration": 2.7013447284698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031767, + "balance_loss_mlp": 1.01970267, + "epoch": 0.7212389380530974, + "flos": 1533076955136.0, + "grad_norm": 0.013786087553885988, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.81504452, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.12060547, + "step": 3749, + "time_per_iteration": 4.794643878936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073194, + "balance_loss_mlp": 1.0453701, + "epoch": 0.7214313197383609, + "flos": 461277407232.0, + "grad_norm": 0.05812194439124776, + "language_loss": 0.86448663, + "learning_rate": 0.00019006783582886368, + "loss": 0.87521857, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.27832031, + "step": 3750, + "time_per_iteration": 2.5275614261627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075263, + "balance_loss_mlp": 1.04653358, + "epoch": 0.7216237014236244, + "flos": 1036691025408.0, + "grad_norm": 0.060767017514705764, + "language_loss": 0.82905239, + "learning_rate": 0.00018982342595339437, + "loss": 0.83980501, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.28686523, + "step": 3751, + "time_per_iteration": 3.522578239440918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070907, + "balance_loss_mlp": 1.04239237, + "epoch": 0.721816083108888, + "flos": 895578107904.0, + "grad_norm": 0.05765271863237157, + "language_loss": 0.82075769, + "learning_rate": 0.00018957913649915076, + "loss": 0.83146673, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.28515625, + "step": 3752, + "time_per_iteration": 3.1765124797821045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070534, + "balance_loss_mlp": 1.04187584, + "epoch": 0.7220084647941516, + "flos": 523066166784.0, + "grad_norm": 0.07973276687690374, + "language_loss": 0.79905254, + "learning_rate": 0.00018933496756097428, + "loss": 0.80975789, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.28662109, + "step": 3753, + "time_per_iteration": 2.625577926635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074027, + "balance_loss_mlp": 1.04508317, + "epoch": 0.7222008464794152, + "flos": 815757765120.0, + "grad_norm": 0.06908288105531452, + "language_loss": 0.81582409, + "learning_rate": 0.0001890909192336603, + "loss": 0.82656443, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.28930664, + "step": 3754, + "time_per_iteration": 3.0871572494506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072767, + "balance_loss_mlp": 1.04444289, + "epoch": 0.7223932281646788, + "flos": 748725244416.0, + "grad_norm": 0.057964315435078954, + "language_loss": 0.70292032, + "learning_rate": 0.00018884699161195623, + "loss": 0.71364796, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.28320312, + "step": 3755, + "time_per_iteration": 2.9729976654052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072672, + "balance_loss_mlp": 1.0435853, + "epoch": 0.7225856098499422, + "flos": 745132829184.0, + "grad_norm": 0.07379868606686546, + "language_loss": 0.7706269, + "learning_rate": 0.00018860318479056327, + "loss": 0.78135359, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.29077148, + "step": 3756, + "time_per_iteration": 3.15751576423645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073497, + "balance_loss_mlp": 1.04491067, + "epoch": 0.7227779915352058, + "flos": 547055894016.0, + "grad_norm": 0.05587751331143294, + "language_loss": 0.83529603, + "learning_rate": 0.00018835949886413555, + "loss": 0.84603095, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.28588867, + "step": 3757, + "time_per_iteration": 2.6880505084991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074304, + "balance_loss_mlp": 1.04509711, + "epoch": 0.7229703732204694, + "flos": 530230496256.0, + "grad_norm": 0.08262826949591631, + "language_loss": 0.78295088, + "learning_rate": 0.0001881159339272806, + "loss": 0.7936939, + "num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.29150391, + "step": 3758, + "time_per_iteration": 2.636491060256958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107004, + "balance_loss_mlp": 1.04193068, + "epoch": 0.723162754905733, + "flos": 528103314432.0, + "grad_norm": 0.05735396724489517, + "language_loss": 0.78920448, + "learning_rate": 0.00018787249007455858, + "loss": 0.79990494, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.28100586, + "step": 3759, + "time_per_iteration": 2.5969340801239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070444, + "balance_loss_mlp": 1.04140496, + "epoch": 0.7233551365909965, + "flos": 654571860480.0, + "grad_norm": 0.07167982163737877, + "language_loss": 0.71580899, + "learning_rate": 0.00018762916740048302, + "loss": 0.72651339, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.28979492, + "step": 3760, + "time_per_iteration": 2.7852694988250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071982, + "balance_loss_mlp": 1.04332376, + "epoch": 0.7235475182762601, + "flos": 522097118208.0, + "grad_norm": 0.05118431145994858, + "language_loss": 0.8598392, + "learning_rate": 0.0001873859659995195, + "loss": 0.87055904, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.28637695, + "step": 3761, + "time_per_iteration": 2.719606399536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107496, + "balance_loss_mlp": 1.04639769, + "epoch": 0.7237398999615237, + "flos": 608883941376.0, + "grad_norm": 0.051413796044389046, + "language_loss": 0.83093852, + "learning_rate": 0.0001871428859660878, + "loss": 0.84168816, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.28564453, + "step": 3762, + "time_per_iteration": 2.7558627128601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107143, + "balance_loss_mlp": 1.04329658, + "epoch": 0.7239322816467872, + "flos": 658664690688.0, + "grad_norm": 0.057793734831364726, + "language_loss": 0.81882715, + "learning_rate": 0.00018689992739455975, + "loss": 0.82954144, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.28149414, + "step": 3763, + "time_per_iteration": 2.90240740776062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070949, + "balance_loss_mlp": 1.04131389, + "epoch": 0.7241246633320508, + "flos": 968869928448.0, + "grad_norm": 0.047782863980039225, + "language_loss": 0.85763133, + "learning_rate": 0.00018665709037926027, + "loss": 0.86834085, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.29614258, + "step": 3764, + "time_per_iteration": 3.3121178150177 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069943, + "balance_loss_mlp": 1.04157126, + "epoch": 0.7243170450173143, + "flos": 514745114112.0, + "grad_norm": 0.06618029737842872, + "language_loss": 0.84513265, + "learning_rate": 0.00018641437501446694, + "loss": 0.8558321, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.28417969, + "step": 3765, + "time_per_iteration": 2.5711514949798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070577, + "balance_loss_mlp": 1.04172814, + "epoch": 0.7245094267025779, + "flos": 559482923520.0, + "grad_norm": 0.0702086558887849, + "language_loss": 0.82573164, + "learning_rate": 0.0001861717813944104, + "loss": 0.83643746, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.28833008, + "step": 3766, + "time_per_iteration": 2.6380386352539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072686, + "balance_loss_mlp": 1.04386163, + "epoch": 0.7247018083878415, + "flos": 612359903232.0, + "grad_norm": 0.0720480056079547, + "language_loss": 0.79527569, + "learning_rate": 0.00018592930961327365, + "loss": 0.8060025, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.28833008, + "step": 3767, + "time_per_iteration": 2.757380723953247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071618, + "balance_loss_mlp": 1.04238808, + "epoch": 0.7248941900731051, + "flos": 634379599872.0, + "grad_norm": 0.08594162637632567, + "language_loss": 0.87979633, + "learning_rate": 0.00018568695976519273, + "loss": 0.89051247, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.29199219, + "step": 3768, + "time_per_iteration": 2.793536424636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072488, + "balance_loss_mlp": 1.04332972, + "epoch": 0.7250865717583687, + "flos": 424718055936.0, + "grad_norm": 0.06891867665937222, + "language_loss": 0.80339336, + "learning_rate": 0.00018544473194425593, + "loss": 0.81411815, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.29125977, + "step": 3769, + "time_per_iteration": 2.5053606033325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106866, + "balance_loss_mlp": 1.03942966, + "epoch": 0.7252789534436321, + "flos": 634794236928.0, + "grad_norm": 0.0628085761222727, + "language_loss": 0.78636301, + "learning_rate": 0.00018520262624450485, + "loss": 0.79704964, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.29174805, + "step": 3770, + "time_per_iteration": 2.8609566688537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073738, + "balance_loss_mlp": 1.0450325, + "epoch": 0.7254713351288957, + "flos": 616895073792.0, + "grad_norm": 0.04686882151976468, + "language_loss": 0.87040436, + "learning_rate": 0.00018496064275993324, + "loss": 0.88114178, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.28710938, + "step": 3771, + "time_per_iteration": 2.754624605178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067155, + "balance_loss_mlp": 1.03916478, + "epoch": 0.7256637168141593, + "flos": 766662285312.0, + "grad_norm": 0.06312025626452938, + "language_loss": 0.81491023, + "learning_rate": 0.00018471878158448686, + "loss": 0.82558179, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.2800293, + "step": 3772, + "time_per_iteration": 2.9370291233062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074719, + "balance_loss_mlp": 1.04641891, + "epoch": 0.7258560984994229, + "flos": 495268646400.0, + "grad_norm": 0.04821073170159266, + "language_loss": 0.83998889, + "learning_rate": 0.00018447704281206512, + "loss": 0.85073608, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.28344727, + "step": 3773, + "time_per_iteration": 2.8460988998413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073582, + "balance_loss_mlp": 1.04382753, + "epoch": 0.7260484801846864, + "flos": 529802712576.0, + "grad_norm": 0.22097506803040057, + "language_loss": 0.82744718, + "learning_rate": 0.0001842354265365191, + "loss": 0.83818305, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.29711914, + "step": 3774, + "time_per_iteration": 2.728426694869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107092, + "balance_loss_mlp": 1.04281068, + "epoch": 0.72624086186995, + "flos": 624679243776.0, + "grad_norm": 0.06612065150918205, + "language_loss": 0.8084085, + "learning_rate": 0.0001839939328516526, + "loss": 0.81911772, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.28100586, + "step": 3775, + "time_per_iteration": 2.730315923690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074711, + "balance_loss_mlp": 1.04631519, + "epoch": 0.7264332435552135, + "flos": 716203468800.0, + "grad_norm": 0.06548969982492862, + "language_loss": 0.81234205, + "learning_rate": 0.0001837525618512218, + "loss": 0.82308918, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.28369141, + "step": 3776, + "time_per_iteration": 2.8894991874694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069253, + "balance_loss_mlp": 1.04159606, + "epoch": 0.7266256252404771, + "flos": 680736821760.0, + "grad_norm": 0.059408980610910087, + "language_loss": 0.8289094, + "learning_rate": 0.00018351131362893519, + "loss": 0.83960199, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.27685547, + "step": 3777, + "time_per_iteration": 2.829299211502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072659, + "balance_loss_mlp": 1.04423952, + "epoch": 0.7268180069257407, + "flos": 518654651904.0, + "grad_norm": 0.07569647287253554, + "language_loss": 0.8052032, + "learning_rate": 0.00018327018827845364, + "loss": 0.81592977, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.28417969, + "step": 3778, + "time_per_iteration": 2.605602502822876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070858, + "balance_loss_mlp": 1.04279566, + "epoch": 0.7270103886110042, + "flos": 512411318784.0, + "grad_norm": 0.07105004265912586, + "language_loss": 0.87327212, + "learning_rate": 0.00018302918589339036, + "loss": 0.88398075, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.28051758, + "step": 3779, + "time_per_iteration": 2.644178628921509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073648, + "balance_loss_mlp": 1.04506147, + "epoch": 0.7272027702962678, + "flos": 546395355648.0, + "grad_norm": 0.05454287579555899, + "language_loss": 0.89820325, + "learning_rate": 0.00018278830656731054, + "loss": 0.90893972, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.28588867, + "step": 3780, + "time_per_iteration": 2.642853260040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067965, + "balance_loss_mlp": 1.03935504, + "epoch": 0.7273951519815314, + "flos": 592772926464.0, + "grad_norm": 0.049235223582258895, + "language_loss": 0.86383229, + "learning_rate": 0.00018254755039373222, + "loss": 0.87451196, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.28613281, + "step": 3781, + "time_per_iteration": 2.7858738899230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072596, + "balance_loss_mlp": 1.04377079, + "epoch": 0.727587533666795, + "flos": 605732456448.0, + "grad_norm": 0.06238056381578398, + "language_loss": 0.8331604, + "learning_rate": 0.0001823069174661252, + "loss": 0.84388638, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.2878418, + "step": 3782, + "time_per_iteration": 2.7796318531036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075989, + "balance_loss_mlp": 1.0479033, + "epoch": 0.7277799153520584, + "flos": 512770701312.0, + "grad_norm": 0.05705801102125677, + "language_loss": 0.78309739, + "learning_rate": 0.00018206640787791112, + "loss": 0.79385734, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.28125, + "step": 3783, + "time_per_iteration": 2.602808952331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072706, + "balance_loss_mlp": 1.04411936, + "epoch": 0.727972297037322, + "flos": 537498132480.0, + "grad_norm": 0.06294847174499694, + "language_loss": 0.85954249, + "learning_rate": 0.00018182602172246416, + "loss": 0.87026954, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.28588867, + "step": 3784, + "time_per_iteration": 2.6015853881835938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076895, + "balance_loss_mlp": 1.04823709, + "epoch": 0.7281646787225856, + "flos": 534780223488.0, + "grad_norm": 0.06092859331592059, + "language_loss": 0.76170594, + "learning_rate": 0.00018158575909311075, + "loss": 0.77247488, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.28637695, + "step": 3785, + "time_per_iteration": 2.646030902862549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010794, + "balance_loss_mlp": 1.05038452, + "epoch": 0.7283570604078492, + "flos": 624767993856.0, + "grad_norm": 0.06146036016272455, + "language_loss": 0.79553497, + "learning_rate": 0.000181345620083129, + "loss": 0.80632889, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.29003906, + "step": 3786, + "time_per_iteration": 2.792757034301758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079884, + "balance_loss_mlp": 1.0520606, + "epoch": 0.7285494420931128, + "flos": 533904307200.0, + "grad_norm": 0.04915125322890423, + "language_loss": 0.86502135, + "learning_rate": 0.00018110560478574927, + "loss": 0.87582016, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.27856445, + "step": 3787, + "time_per_iteration": 2.6800973415374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074424, + "balance_loss_mlp": 1.04538465, + "epoch": 0.7287418237783763, + "flos": 666251011584.0, + "grad_norm": 0.0704647078753348, + "language_loss": 0.80134165, + "learning_rate": 0.0001808657132941533, + "loss": 0.81208593, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.2902832, + "step": 3788, + "time_per_iteration": 2.770371675491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075695, + "balance_loss_mlp": 1.04741848, + "epoch": 0.7289342054636399, + "flos": 550344181248.0, + "grad_norm": 0.07634779758427546, + "language_loss": 0.8289668, + "learning_rate": 0.00018062594570147572, + "loss": 0.83972371, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.28295898, + "step": 3789, + "time_per_iteration": 2.5850260257720947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077852, + "balance_loss_mlp": 1.05000448, + "epoch": 0.7291265871489034, + "flos": 687620344320.0, + "grad_norm": 0.05162370165887138, + "language_loss": 0.85260105, + "learning_rate": 0.00018038630210080243, + "loss": 0.8633796, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.27880859, + "step": 3790, + "time_per_iteration": 2.837209701538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075748, + "balance_loss_mlp": 1.04744744, + "epoch": 0.729318968834167, + "flos": 572388609024.0, + "grad_norm": 0.05876653681305703, + "language_loss": 0.849635, + "learning_rate": 0.0001801467825851712, + "loss": 0.86039245, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.28295898, + "step": 3791, + "time_per_iteration": 2.7689332962036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076086, + "balance_loss_mlp": 1.04778624, + "epoch": 0.7295113505194305, + "flos": 585786097152.0, + "grad_norm": 0.058290229022120006, + "language_loss": 0.7850548, + "learning_rate": 0.00017990738724757172, + "loss": 0.79581565, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.28320312, + "step": 3792, + "time_per_iteration": 2.870572090148926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078653, + "balance_loss_mlp": 1.05092454, + "epoch": 0.7297037322046941, + "flos": 706872669696.0, + "grad_norm": 0.05184173418469221, + "language_loss": 0.81961739, + "learning_rate": 0.00017966811618094598, + "loss": 0.83040386, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.27758789, + "step": 3793, + "time_per_iteration": 2.9314723014831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078553, + "balance_loss_mlp": 1.05044341, + "epoch": 0.7298961138899577, + "flos": 487039315968.0, + "grad_norm": 0.061838028009129596, + "language_loss": 0.8480593, + "learning_rate": 0.00017942896947818664, + "loss": 0.85884488, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.28125, + "step": 3794, + "time_per_iteration": 2.5791871547698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104707, + "balance_loss_mlp": 1.0351969, + "epoch": 0.7300884955752213, + "flos": 1365102222336.0, + "grad_norm": 0.022620155773541276, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.75872123, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.11865234, + "step": 3795, + "time_per_iteration": 4.875161647796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071538, + "balance_loss_mlp": 1.04383409, + "epoch": 0.7302808772604849, + "flos": 531550162944.0, + "grad_norm": 0.07025171922085349, + "language_loss": 0.85040843, + "learning_rate": 0.00017895104953559947, + "loss": 0.8611238, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.27734375, + "step": 3796, + "time_per_iteration": 2.625335216522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077716, + "balance_loss_mlp": 1.05027366, + "epoch": 0.7304732589457483, + "flos": 435949074432.0, + "grad_norm": 0.07017117998144913, + "language_loss": 0.89488584, + "learning_rate": 0.00017871227648131672, + "loss": 0.90566301, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.27490234, + "step": 3797, + "time_per_iteration": 2.4892690181732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075327, + "balance_loss_mlp": 1.04743159, + "epoch": 0.7306656406310119, + "flos": 451376229888.0, + "grad_norm": 0.0555809148766967, + "language_loss": 0.82792765, + "learning_rate": 0.0001784736281619907, + "loss": 0.83868086, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.27905273, + "step": 3798, + "time_per_iteration": 2.616964101791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077231, + "balance_loss_mlp": 1.04964578, + "epoch": 0.7308580223162755, + "flos": 511756572672.0, + "grad_norm": 0.06137974721906842, + "language_loss": 0.74274546, + "learning_rate": 0.00017823510467027232, + "loss": 0.75351775, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.27636719, + "step": 3799, + "time_per_iteration": 2.744365692138672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074556, + "balance_loss_mlp": 1.04558766, + "epoch": 0.7310504040015391, + "flos": 375209349120.0, + "grad_norm": 0.06884438361049809, + "language_loss": 0.78208685, + "learning_rate": 0.00017799670609876516, + "loss": 0.79283237, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.28930664, + "step": 3800, + "time_per_iteration": 2.505571126937866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072835, + "balance_loss_mlp": 1.04465413, + "epoch": 0.7312427856868026, + "flos": 549073976832.0, + "grad_norm": 0.05034282557889911, + "language_loss": 0.88874984, + "learning_rate": 0.00017775843254002366, + "loss": 0.8994782, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.28222656, + "step": 3801, + "time_per_iteration": 2.7557313442230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076377, + "balance_loss_mlp": 1.04802942, + "epoch": 0.7314351673720662, + "flos": 766880483328.0, + "grad_norm": 0.053157012048244724, + "language_loss": 0.8399632, + "learning_rate": 0.00017752028408655367, + "loss": 0.85072702, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.28344727, + "step": 3802, + "time_per_iteration": 3.03664231300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074125, + "balance_loss_mlp": 1.04551435, + "epoch": 0.7316275490573297, + "flos": 486492258816.0, + "grad_norm": 0.05941466781290568, + "language_loss": 0.85240817, + "learning_rate": 0.00017728226083081272, + "loss": 0.8631494, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.28564453, + "step": 3803, + "time_per_iteration": 2.557260513305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073414, + "balance_loss_mlp": 1.04554248, + "epoch": 0.7318199307425933, + "flos": 473183520768.0, + "grad_norm": 0.0569157917316084, + "language_loss": 0.8142879, + "learning_rate": 0.00017704436286520965, + "loss": 0.8250221, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.27929688, + "step": 3804, + "time_per_iteration": 2.531374454498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073136, + "balance_loss_mlp": 1.04500246, + "epoch": 0.7320123124278569, + "flos": 549202014720.0, + "grad_norm": 0.0615002003094314, + "language_loss": 0.84243524, + "learning_rate": 0.0001768065902821046, + "loss": 0.85316658, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.28149414, + "step": 3805, + "time_per_iteration": 2.7219231128692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070301, + "balance_loss_mlp": 1.04226291, + "epoch": 0.7322046941131204, + "flos": 570502946304.0, + "grad_norm": 0.050852375433721335, + "language_loss": 0.82159758, + "learning_rate": 0.00017656894317380907, + "loss": 0.83230054, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.28051758, + "step": 3806, + "time_per_iteration": 2.7360239028930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019748, + "balance_loss_mlp": 1.00816071, + "epoch": 0.732397075798384, + "flos": 1468334559744.0, + "grad_norm": 0.009321700757662343, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.77051014, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.11572266, + "step": 3807, + "time_per_iteration": 5.0339789390563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075379, + "balance_loss_mlp": 1.04662561, + "epoch": 0.7325894574836476, + "flos": 464620948992.0, + "grad_norm": 0.06770486672009031, + "language_loss": 0.83718252, + "learning_rate": 0.00017609402575064875, + "loss": 0.84793627, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.28710938, + "step": 3808, + "time_per_iteration": 2.5397021770477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073042, + "balance_loss_mlp": 1.04490852, + "epoch": 0.7327818391689112, + "flos": 495246887424.0, + "grad_norm": 0.07767281717141156, + "language_loss": 0.81099665, + "learning_rate": 0.00017585675562016367, + "loss": 0.8217271, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 0.28149414, + "step": 3809, + "time_per_iteration": 2.578652858734131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019398, + "balance_loss_mlp": 1.00781119, + "epoch": 0.7329742208541746, + "flos": 1432694794752.0, + "grad_norm": 0.0100864336281573, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78232253, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.11572266, + "step": 3810, + "time_per_iteration": 4.869556903839111 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069034, + "balance_loss_mlp": 1.04092479, + "epoch": 0.7331666025394382, + "flos": 496645129728.0, + "grad_norm": 0.16551466638387613, + "language_loss": 0.85115767, + "learning_rate": 0.00017538259298196474, + "loss": 0.861848, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.28100586, + "step": 3811, + "time_per_iteration": 2.5746755599975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074051, + "balance_loss_mlp": 1.04551268, + "epoch": 0.7333589842247018, + "flos": 538247420928.0, + "grad_norm": 0.05568772928725353, + "language_loss": 0.81749296, + "learning_rate": 0.00017514570065833745, + "loss": 0.82823348, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.28540039, + "step": 3812, + "time_per_iteration": 2.74574613571167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073496, + "balance_loss_mlp": 1.04495704, + "epoch": 0.7335513659099654, + "flos": 490825198080.0, + "grad_norm": 0.06483425891488107, + "language_loss": 0.80511057, + "learning_rate": 0.00017490893445433426, + "loss": 0.81584549, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.28564453, + "step": 3813, + "time_per_iteration": 2.5976309776306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079068, + "balance_loss_mlp": 1.05026746, + "epoch": 0.733743747595229, + "flos": 561876355584.0, + "grad_norm": 0.07334965322780891, + "language_loss": 0.81267703, + "learning_rate": 0.00017467229446187587, + "loss": 0.82346773, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.2878418, + "step": 3814, + "time_per_iteration": 2.6907997131347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078482, + "balance_loss_mlp": 1.05044413, + "epoch": 0.7339361292804925, + "flos": 538315822080.0, + "grad_norm": 0.052639307044854956, + "language_loss": 0.81764507, + "learning_rate": 0.00017443578077283424, + "loss": 0.82842994, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.28027344, + "step": 3815, + "time_per_iteration": 2.65816593170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077176, + "balance_loss_mlp": 1.04882812, + "epoch": 0.734128510965756, + "flos": 548198060544.0, + "grad_norm": 0.062049617931530306, + "language_loss": 0.84998393, + "learning_rate": 0.0001741993934790319, + "loss": 0.86075574, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.28344727, + "step": 3816, + "time_per_iteration": 2.738459348678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074176, + "balance_loss_mlp": 1.04594707, + "epoch": 0.7343208926510196, + "flos": 539783875584.0, + "grad_norm": 0.06367069815606033, + "language_loss": 0.8424527, + "learning_rate": 0.00017396313267224273, + "loss": 0.85319448, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.2824707, + "step": 3817, + "time_per_iteration": 2.7235686779022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079748, + "balance_loss_mlp": 1.05144763, + "epoch": 0.7345132743362832, + "flos": 570827423232.0, + "grad_norm": 0.05690847114233298, + "language_loss": 0.88229644, + "learning_rate": 0.0001737269984441912, + "loss": 0.89309394, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.28320312, + "step": 3818, + "time_per_iteration": 2.664562225341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079058, + "balance_loss_mlp": 1.05140162, + "epoch": 0.7347056560215467, + "flos": 545135325696.0, + "grad_norm": 0.059530599678457814, + "language_loss": 0.85132968, + "learning_rate": 0.00017349099088655263, + "loss": 0.86212027, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.27661133, + "step": 3819, + "time_per_iteration": 2.713716506958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079811, + "balance_loss_mlp": 1.05153477, + "epoch": 0.7348980377068103, + "flos": 595668335616.0, + "grad_norm": 0.07896802475478679, + "language_loss": 0.80594087, + "learning_rate": 0.00017325511009095375, + "loss": 0.81673896, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.28271484, + "step": 3820, + "time_per_iteration": 2.729605197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075678, + "balance_loss_mlp": 1.04766417, + "epoch": 0.7350904193920739, + "flos": 538291090944.0, + "grad_norm": 0.05267126362138293, + "language_loss": 0.83587992, + "learning_rate": 0.00017301935614897113, + "loss": 0.84663677, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.28051758, + "step": 3821, + "time_per_iteration": 2.6848647594451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076838, + "balance_loss_mlp": 1.0488472, + "epoch": 0.7352828010773375, + "flos": 512712474624.0, + "grad_norm": 0.0534844061316339, + "language_loss": 0.81780893, + "learning_rate": 0.00017278372915213274, + "loss": 0.82857728, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.28027344, + "step": 3822, + "time_per_iteration": 2.650430679321289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031331, + "balance_loss_mlp": 1.01945734, + "epoch": 0.735475182762601, + "flos": 1552965087744.0, + "grad_norm": 0.013429842271997025, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.80925179, + "num_input_tokens_seen": 316967104, + "router_z_loss_mlp": 0.11865234, + "step": 3823, + "time_per_iteration": 4.986204385757446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079335, + "balance_loss_mlp": 1.05139256, + "epoch": 0.7356675644478645, + "flos": 680984133120.0, + "grad_norm": 0.05755686388123544, + "language_loss": 0.80487376, + "learning_rate": 0.00017231285635975314, + "loss": 0.81566715, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.27929688, + "step": 3824, + "time_per_iteration": 2.952411413192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107638, + "balance_loss_mlp": 1.04755485, + "epoch": 0.7358599461331281, + "flos": 514961902080.0, + "grad_norm": 0.0735633923389538, + "language_loss": 0.82809317, + "learning_rate": 0.00017207761074702115, + "loss": 0.83885694, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.28808594, + "step": 3825, + "time_per_iteration": 2.6093246936798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080498, + "balance_loss_mlp": 1.05093431, + "epoch": 0.7360523278183917, + "flos": 443739036672.0, + "grad_norm": 0.05450452025217221, + "language_loss": 0.83744037, + "learning_rate": 0.0001718424924450514, + "loss": 0.84824538, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.29516602, + "step": 3826, + "time_per_iteration": 2.625596046447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072132, + "balance_loss_mlp": 1.04387975, + "epoch": 0.7362447095036553, + "flos": 603142585344.0, + "grad_norm": 0.04900180424478287, + "language_loss": 0.85697591, + "learning_rate": 0.00017160750154512482, + "loss": 0.86769724, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.2824707, + "step": 3827, + "time_per_iteration": 4.115647554397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077067, + "balance_loss_mlp": 1.04912448, + "epoch": 0.7364370911889189, + "flos": 552807424512.0, + "grad_norm": 0.04912825481573526, + "language_loss": 0.83176559, + "learning_rate": 0.0001713726381384731, + "loss": 0.84253627, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.27954102, + "step": 3828, + "time_per_iteration": 2.794640302658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070043, + "balance_loss_mlp": 1.04140913, + "epoch": 0.7366294728741823, + "flos": 448830028800.0, + "grad_norm": 0.06936682542859615, + "language_loss": 0.80874848, + "learning_rate": 0.00017113790231627812, + "loss": 0.81944889, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.28637695, + "step": 3829, + "time_per_iteration": 2.5032026767730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023029, + "balance_loss_mlp": 1.01086962, + "epoch": 0.7368218545594459, + "flos": 1534705132032.0, + "grad_norm": 0.00938038964712245, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80281258, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.12158203, + "step": 3830, + "time_per_iteration": 4.790278911590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107551, + "balance_loss_mlp": 1.04701948, + "epoch": 0.7370142362447095, + "flos": 515164133376.0, + "grad_norm": 0.05667126288905575, + "language_loss": 0.81707335, + "learning_rate": 0.00017066881378973936, + "loss": 0.82782841, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.28491211, + "step": 3831, + "time_per_iteration": 2.6234376430511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072767, + "balance_loss_mlp": 1.0442524, + "epoch": 0.7372066179299731, + "flos": 500531346432.0, + "grad_norm": 0.05465479593854143, + "language_loss": 0.82744801, + "learning_rate": 0.00017043446126751189, + "loss": 0.83817565, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.28540039, + "step": 3832, + "time_per_iteration": 2.68343186378479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069311, + "balance_loss_mlp": 1.04089189, + "epoch": 0.7373989996152366, + "flos": 557814048768.0, + "grad_norm": 0.15091194873702685, + "language_loss": 0.76596999, + "learning_rate": 0.00017020023669397376, + "loss": 0.77666306, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.28442383, + "step": 3833, + "time_per_iteration": 2.709726572036743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080144, + "balance_loss_mlp": 1.05141497, + "epoch": 0.7375913813005002, + "flos": 506527368192.0, + "grad_norm": 0.054777149599410456, + "language_loss": 0.81358391, + "learning_rate": 0.0001699661401600589, + "loss": 0.82438534, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.28759766, + "step": 3834, + "time_per_iteration": 2.5703024864196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074935, + "balance_loss_mlp": 1.04680145, + "epoch": 0.7377837629857638, + "flos": 485940819456.0, + "grad_norm": 0.05177646885601935, + "language_loss": 0.78090227, + "learning_rate": 0.00016973217175665205, + "loss": 0.79165161, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.28125, + "step": 3835, + "time_per_iteration": 2.567094564437866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033946, + "balance_loss_mlp": 1.02178645, + "epoch": 0.7379761446710273, + "flos": 1413900776448.0, + "grad_norm": 0.015599325923103721, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.8220011, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.12158203, + "step": 3836, + "time_per_iteration": 4.926120281219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079166, + "balance_loss_mlp": 1.05046034, + "epoch": 0.7381685263562909, + "flos": 629445758976.0, + "grad_norm": 0.08209233600612638, + "language_loss": 0.83787167, + "learning_rate": 0.00016926461970465047, + "loss": 0.84866333, + "num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.28710938, + "step": 3837, + "time_per_iteration": 2.8248865604400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079264, + "balance_loss_mlp": 1.0512259, + "epoch": 0.7383609080415544, + "flos": 738869147136.0, + "grad_norm": 0.0447245395908081, + "language_loss": 0.84287, + "learning_rate": 0.00016903103623757516, + "loss": 0.85366273, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.28051758, + "step": 3838, + "time_per_iteration": 3.0732860565185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076489, + "balance_loss_mlp": 1.04818845, + "epoch": 0.738553289726818, + "flos": 549945510912.0, + "grad_norm": 0.060261467227696625, + "language_loss": 0.801202, + "learning_rate": 0.00016879758126404738, + "loss": 0.8119669, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.28295898, + "step": 3839, + "time_per_iteration": 2.6999428272247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081913, + "balance_loss_mlp": 1.05420828, + "epoch": 0.7387456714120816, + "flos": 909925705728.0, + "grad_norm": 0.0717530150127342, + "language_loss": 0.80011249, + "learning_rate": 0.00016856425487470216, + "loss": 0.81093156, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.27758789, + "step": 3840, + "time_per_iteration": 3.0798532962799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075668, + "balance_loss_mlp": 1.047153, + "epoch": 0.7389380530973452, + "flos": 852308352000.0, + "grad_norm": 0.06037669736072389, + "language_loss": 0.79319191, + "learning_rate": 0.00016833105716012486, + "loss": 0.80394864, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.28540039, + "step": 3841, + "time_per_iteration": 3.125180244445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069813, + "balance_loss_mlp": 1.04144144, + "epoch": 0.7391304347826086, + "flos": 816678761472.0, + "grad_norm": 0.05821002881472178, + "language_loss": 0.84839195, + "learning_rate": 0.00016809798821085088, + "loss": 0.85909009, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.28344727, + "step": 3842, + "time_per_iteration": 2.9953746795654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082054, + "balance_loss_mlp": 1.05303824, + "epoch": 0.7393228164678722, + "flos": 572541378048.0, + "grad_norm": 0.054657255359861566, + "language_loss": 0.89063728, + "learning_rate": 0.00016786504811736565, + "loss": 0.90145791, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.28979492, + "step": 3843, + "time_per_iteration": 2.7037930488586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077429, + "balance_loss_mlp": 1.04869962, + "epoch": 0.7395151981531358, + "flos": 684903845376.0, + "grad_norm": 0.06408695288095054, + "language_loss": 0.82701367, + "learning_rate": 0.00016763223697010442, + "loss": 0.83778793, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.28710938, + "step": 3844, + "time_per_iteration": 2.9637320041656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107492, + "balance_loss_mlp": 1.0469532, + "epoch": 0.7397075798383994, + "flos": 556095711744.0, + "grad_norm": 0.05096747285284615, + "language_loss": 0.84036589, + "learning_rate": 0.00016739955485945256, + "loss": 0.85111511, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.2800293, + "step": 3845, + "time_per_iteration": 2.698608160018921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070906, + "balance_loss_mlp": 1.04255807, + "epoch": 0.739899961523663, + "flos": 546523393536.0, + "grad_norm": 0.07070386524494449, + "language_loss": 0.85914421, + "learning_rate": 0.00016716700187574513, + "loss": 0.86985326, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.28369141, + "step": 3846, + "time_per_iteration": 2.686567544937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075336, + "balance_loss_mlp": 1.04787064, + "epoch": 0.7400923432089265, + "flos": 608913054720.0, + "grad_norm": 0.09697778830761983, + "language_loss": 0.83608466, + "learning_rate": 0.0001669345781092675, + "loss": 0.846838, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.27490234, + "step": 3847, + "time_per_iteration": 2.705946445465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075753, + "balance_loss_mlp": 1.04742908, + "epoch": 0.7402847248941901, + "flos": 590715555840.0, + "grad_norm": 0.07758942034588075, + "language_loss": 0.87070894, + "learning_rate": 0.0001667022836502546, + "loss": 0.88146651, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.28320312, + "step": 3848, + "time_per_iteration": 2.727207899093628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074969, + "balance_loss_mlp": 1.04657388, + "epoch": 0.7404771065794536, + "flos": 477136728576.0, + "grad_norm": 0.06324539449596041, + "language_loss": 0.82776666, + "learning_rate": 0.00016647011858889077, + "loss": 0.83851635, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.28369141, + "step": 3849, + "time_per_iteration": 2.552164077758789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074172, + "balance_loss_mlp": 1.04577661, + "epoch": 0.7406694882647172, + "flos": 496192614912.0, + "grad_norm": 0.0765277016597007, + "language_loss": 0.86005962, + "learning_rate": 0.00016623808301531056, + "loss": 0.87080133, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.28417969, + "step": 3850, + "time_per_iteration": 2.6483278274536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073128, + "balance_loss_mlp": 1.04551888, + "epoch": 0.7408618699499807, + "flos": 561925817856.0, + "grad_norm": 0.06196174014296942, + "language_loss": 0.79140496, + "learning_rate": 0.00016600617701959842, + "loss": 0.8021363, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.27636719, + "step": 3851, + "time_per_iteration": 2.850390911102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024795, + "balance_loss_mlp": 1.01268303, + "epoch": 0.7410542516352443, + "flos": 1387421512704.0, + "grad_norm": 0.012000469023036765, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.79868609, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.12109375, + "step": 3852, + "time_per_iteration": 5.050019979476929 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074852, + "balance_loss_mlp": 1.04628921, + "epoch": 0.7412466333205079, + "flos": 669697860096.0, + "grad_norm": 0.08114806024349476, + "language_loss": 0.80909729, + "learning_rate": 0.00016554275412186315, + "loss": 0.8198458, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.28564453, + "step": 3853, + "time_per_iteration": 2.866884708404541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071217, + "balance_loss_mlp": 1.04265463, + "epoch": 0.7414390150057715, + "flos": 489038459904.0, + "grad_norm": 0.09161546445880692, + "language_loss": 0.80530989, + "learning_rate": 0.0001653112373997568, + "loss": 0.8160221, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.28588867, + "step": 3854, + "time_per_iteration": 2.6828300952911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075016, + "balance_loss_mlp": 1.04712129, + "epoch": 0.7416313966910351, + "flos": 599119566336.0, + "grad_norm": 0.06308625069628188, + "language_loss": 0.74284655, + "learning_rate": 0.0001650798506153517, + "loss": 0.75359672, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.27929688, + "step": 3855, + "time_per_iteration": 2.6935112476348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073152, + "balance_loss_mlp": 1.04473197, + "epoch": 0.7418237783762985, + "flos": 542279204352.0, + "grad_norm": 0.08209880324062359, + "language_loss": 0.84122801, + "learning_rate": 0.00016484859385848023, + "loss": 0.85195947, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.28442383, + "step": 3856, + "time_per_iteration": 2.620311975479126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073651, + "balance_loss_mlp": 1.04501677, + "epoch": 0.7420161600615621, + "flos": 543865121280.0, + "grad_norm": 0.06689669498305581, + "language_loss": 0.76970744, + "learning_rate": 0.0001646174672189243, + "loss": 0.78044391, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.28613281, + "step": 3857, + "time_per_iteration": 2.6914920806884766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106941, + "balance_loss_mlp": 1.04087138, + "epoch": 0.7422085417468257, + "flos": 526921860096.0, + "grad_norm": 0.07125061218981377, + "language_loss": 0.80480021, + "learning_rate": 0.00016438647078641488, + "loss": 0.8154943, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.28515625, + "step": 3858, + "time_per_iteration": 2.6275553703308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069955, + "balance_loss_mlp": 1.04103458, + "epoch": 0.7424009234320893, + "flos": 508404266496.0, + "grad_norm": 0.0650961492971168, + "language_loss": 0.83072245, + "learning_rate": 0.00016415560465063344, + "loss": 0.84142196, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.28930664, + "step": 3859, + "time_per_iteration": 2.732268810272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068048, + "balance_loss_mlp": 1.03886604, + "epoch": 0.7425933051173528, + "flos": 512347299840.0, + "grad_norm": 0.07578384946449068, + "language_loss": 0.78930503, + "learning_rate": 0.0001639248689012095, + "loss": 0.79998553, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.29101562, + "step": 3860, + "time_per_iteration": 2.571627378463745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071332, + "balance_loss_mlp": 1.04188704, + "epoch": 0.7427856868026164, + "flos": 458034200064.0, + "grad_norm": 0.06018469098837617, + "language_loss": 0.87730241, + "learning_rate": 0.00016369426362772271, + "loss": 0.88801575, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.29394531, + "step": 3861, + "time_per_iteration": 2.803495407104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107102, + "balance_loss_mlp": 1.04219532, + "epoch": 0.74297806848788, + "flos": 604728502272.0, + "grad_norm": 0.05947124800099814, + "language_loss": 0.80541736, + "learning_rate": 0.00016346378891970233, + "loss": 0.81612754, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.28833008, + "step": 3862, + "time_per_iteration": 2.8671751022338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071209, + "balance_loss_mlp": 1.04183578, + "epoch": 0.7431704501731435, + "flos": 890971564032.0, + "grad_norm": 0.05726542490411253, + "language_loss": 0.80970359, + "learning_rate": 0.00016323344486662633, + "loss": 0.82041574, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.29345703, + "step": 3863, + "time_per_iteration": 3.310399055480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067129, + "balance_loss_mlp": 1.03808928, + "epoch": 0.7433628318584071, + "flos": 591867896832.0, + "grad_norm": 0.05550567007056857, + "language_loss": 0.7837103, + "learning_rate": 0.00016300323155792247, + "loss": 0.79438156, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.29003906, + "step": 3864, + "time_per_iteration": 2.9007768630981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065912, + "balance_loss_mlp": 1.03658676, + "epoch": 0.7435552135436706, + "flos": 476896619520.0, + "grad_norm": 0.0566624200483065, + "language_loss": 0.8859086, + "learning_rate": 0.00016277314908296687, + "loss": 0.8965677, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.29296875, + "step": 3865, + "time_per_iteration": 2.6249654293060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066459, + "balance_loss_mlp": 1.03741968, + "epoch": 0.7437475952289342, + "flos": 672874076160.0, + "grad_norm": 0.08514855435260649, + "language_loss": 0.76358485, + "learning_rate": 0.00016254319753108604, + "loss": 0.77424943, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.2902832, + "step": 3866, + "time_per_iteration": 2.816335678100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070215, + "balance_loss_mlp": 1.04029381, + "epoch": 0.7439399769141978, + "flos": 770094577152.0, + "grad_norm": 0.06451588447838245, + "language_loss": 0.76624024, + "learning_rate": 0.00016231337699155492, + "loss": 0.77694237, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.29858398, + "step": 3867, + "time_per_iteration": 2.9624359607696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068647, + "balance_loss_mlp": 1.03965509, + "epoch": 0.7441323585994614, + "flos": 647462785536.0, + "grad_norm": 0.05724025816545972, + "language_loss": 0.78232771, + "learning_rate": 0.0001620836875535977, + "loss": 0.79301417, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.28930664, + "step": 3868, + "time_per_iteration": 2.847935199737549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064683, + "balance_loss_mlp": 1.03502417, + "epoch": 0.7443247402847248, + "flos": 565091859456.0, + "grad_norm": 0.05959682093806377, + "language_loss": 0.8083024, + "learning_rate": 0.00016185412930638766, + "loss": 0.81894922, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.29614258, + "step": 3869, + "time_per_iteration": 2.8403937816619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066357, + "balance_loss_mlp": 1.03738952, + "epoch": 0.7445171219699884, + "flos": 578243446272.0, + "grad_norm": 0.07528663769221765, + "language_loss": 0.82963836, + "learning_rate": 0.00016162470233904765, + "loss": 0.84030193, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.28955078, + "step": 3870, + "time_per_iteration": 2.7301175594329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065915, + "balance_loss_mlp": 1.03685129, + "epoch": 0.744709503655252, + "flos": 618588679680.0, + "grad_norm": 0.055174574386506046, + "language_loss": 0.8203845, + "learning_rate": 0.00016139540674064856, + "loss": 0.83104366, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.2902832, + "step": 3871, + "time_per_iteration": 2.728790760040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070553, + "balance_loss_mlp": 1.0411799, + "epoch": 0.7449018853405156, + "flos": 528355008000.0, + "grad_norm": 0.05299342012379109, + "language_loss": 0.77625883, + "learning_rate": 0.00016116624260021113, + "loss": 0.78696442, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.29321289, + "step": 3872, + "time_per_iteration": 2.7653627395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064492, + "balance_loss_mlp": 1.0351187, + "epoch": 0.7450942670257792, + "flos": 433088570880.0, + "grad_norm": 0.05882503001296847, + "language_loss": 0.8393743, + "learning_rate": 0.0001609372100067046, + "loss": 0.85001922, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.29345703, + "step": 3873, + "time_per_iteration": 2.556082010269165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062558, + "balance_loss_mlp": 1.03318477, + "epoch": 0.7452866487110427, + "flos": 696562647552.0, + "grad_norm": 0.0629532265793869, + "language_loss": 0.84404862, + "learning_rate": 0.0001607083090490475, + "loss": 0.85467416, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.29296875, + "step": 3874, + "time_per_iteration": 2.8703696727752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068049, + "balance_loss_mlp": 1.0391767, + "epoch": 0.7454790303963063, + "flos": 511944247296.0, + "grad_norm": 0.07079518805711353, + "language_loss": 0.79695952, + "learning_rate": 0.00016047953981610714, + "loss": 0.80764002, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.28857422, + "step": 3875, + "time_per_iteration": 2.7114357948303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006736, + "balance_loss_mlp": 0.99467212, + "epoch": 0.7456714120815698, + "flos": 1325221088256.0, + "grad_norm": 0.007120969619793637, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.80736375, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.12060547, + "step": 3876, + "time_per_iteration": 4.9630632400512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061746, + "balance_loss_mlp": 1.03232551, + "epoch": 0.7458637937668334, + "flos": 721397767680.0, + "grad_norm": 0.06112785741663116, + "language_loss": 0.81022239, + "learning_rate": 0.0001600223968795889, + "loss": 0.82083988, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.29394531, + "step": 3877, + "time_per_iteration": 2.8622119426727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006979, + "balance_loss_mlp": 0.99501073, + "epoch": 0.746056175452097, + "flos": 1500761793024.0, + "grad_norm": 0.005911171092350221, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.76703048, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.11962891, + "step": 3878, + "time_per_iteration": 4.92147159576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064327, + "balance_loss_mlp": 1.03521585, + "epoch": 0.7462485571373605, + "flos": 519984493056.0, + "grad_norm": 0.0740832902187226, + "language_loss": 0.81523597, + "learning_rate": 0.00015956578190706483, + "loss": 0.82587922, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.29077148, + "step": 3879, + "time_per_iteration": 2.673748016357422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065097, + "balance_loss_mlp": 1.03529429, + "epoch": 0.7464409388226241, + "flos": 480967690752.0, + "grad_norm": 0.05926630999911606, + "language_loss": 0.75906825, + "learning_rate": 0.00015933767262892468, + "loss": 0.76971918, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.29760742, + "step": 3880, + "time_per_iteration": 2.7114145755767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069606, + "balance_loss_mlp": 1.03937459, + "epoch": 0.7466333205078877, + "flos": 486516989952.0, + "grad_norm": 0.07620522972756824, + "language_loss": 0.81981504, + "learning_rate": 0.00015910969560762927, + "loss": 0.83051109, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.30175781, + "step": 3881, + "time_per_iteration": 2.5965123176574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067971, + "balance_loss_mlp": 1.03790677, + "epoch": 0.7468257021931513, + "flos": 611015505408.0, + "grad_norm": 0.05603078059754119, + "language_loss": 0.83325368, + "learning_rate": 0.00015888185093168727, + "loss": 0.84393334, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.30053711, + "step": 3882, + "time_per_iteration": 2.732828378677368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067064, + "balance_loss_mlp": 1.03709519, + "epoch": 0.7470180838784147, + "flos": 533204481024.0, + "grad_norm": 0.06025549136597994, + "language_loss": 0.8122552, + "learning_rate": 0.00015865413868955581, + "loss": 0.82292587, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.29931641, + "step": 3883, + "time_per_iteration": 2.6130521297454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066146, + "balance_loss_mlp": 1.03577161, + "epoch": 0.7472104655636783, + "flos": 739005949440.0, + "grad_norm": 0.0544206071008422, + "language_loss": 0.8260529, + "learning_rate": 0.00015842655896964054, + "loss": 0.83671433, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.30322266, + "step": 3884, + "time_per_iteration": 3.0686898231506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068928, + "balance_loss_mlp": 1.03912604, + "epoch": 0.7474028472489419, + "flos": 640007474688.0, + "grad_norm": 0.07023161322090775, + "language_loss": 0.73560184, + "learning_rate": 0.00015819911186029567, + "loss": 0.7462911, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.29785156, + "step": 3885, + "time_per_iteration": 2.7895405292510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067979, + "balance_loss_mlp": 1.03808117, + "epoch": 0.7475952289342055, + "flos": 589980824064.0, + "grad_norm": 0.059238744927090525, + "language_loss": 0.86428809, + "learning_rate": 0.00015797179744982443, + "loss": 0.87496781, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.29833984, + "step": 3886, + "time_per_iteration": 2.7247395515441895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068837, + "balance_loss_mlp": 1.03986931, + "epoch": 0.7477876106194691, + "flos": 487935581184.0, + "grad_norm": 0.04858811748134261, + "language_loss": 0.78711867, + "learning_rate": 0.00015774461582647765, + "loss": 0.79780704, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.28930664, + "step": 3887, + "time_per_iteration": 2.633619785308838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066597, + "balance_loss_mlp": 1.0372951, + "epoch": 0.7479799923047326, + "flos": 554470507008.0, + "grad_norm": 0.06558254439957789, + "language_loss": 0.80900019, + "learning_rate": 0.00015751756707845505, + "loss": 0.81966615, + "num_input_tokens_seen": 322472512, + "router_z_loss_mlp": 0.29272461, + "step": 3888, + "time_per_iteration": 2.606644630432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066336, + "balance_loss_mlp": 1.03703403, + "epoch": 0.7481723739899961, + "flos": 767037634560.0, + "grad_norm": 0.05503127509914209, + "language_loss": 0.88178474, + "learning_rate": 0.00015729065129390502, + "loss": 0.89244807, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.29296875, + "step": 3889, + "time_per_iteration": 2.997523784637451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067289, + "balance_loss_mlp": 1.03891718, + "epoch": 0.7483647556752597, + "flos": 495926364672.0, + "grad_norm": 0.06469395023850445, + "language_loss": 0.82209432, + "learning_rate": 0.0001570638685609241, + "loss": 0.83276725, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.28369141, + "step": 3890, + "time_per_iteration": 2.569988250732422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106895, + "balance_loss_mlp": 1.03950548, + "epoch": 0.7485571373605233, + "flos": 472607350272.0, + "grad_norm": 0.06811331087467534, + "language_loss": 0.80319339, + "learning_rate": 0.00015683721896755693, + "loss": 0.81388295, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.29443359, + "step": 3891, + "time_per_iteration": 2.5164339542388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026235, + "balance_loss_mlp": 1.01455247, + "epoch": 0.7487495190457868, + "flos": 1553619833856.0, + "grad_norm": 0.016089611749753062, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.8323673, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.11669922, + "step": 3892, + "time_per_iteration": 4.94329047203064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071321, + "balance_loss_mlp": 1.04273486, + "epoch": 0.7489419007310504, + "flos": 581566639104.0, + "grad_norm": 0.05717636586120892, + "language_loss": 0.85079896, + "learning_rate": 0.00015638431955158528, + "loss": 0.86151218, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.28588867, + "step": 3893, + "time_per_iteration": 2.6895976066589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067708, + "balance_loss_mlp": 1.03823924, + "epoch": 0.749134282416314, + "flos": 567297616896.0, + "grad_norm": 0.05490928633036113, + "language_loss": 0.80953169, + "learning_rate": 0.00015615806990481186, + "loss": 0.82020867, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.29394531, + "step": 3894, + "time_per_iteration": 2.7377114295959473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066836, + "balance_loss_mlp": 1.03796339, + "epoch": 0.7493266641015776, + "flos": 532786871808.0, + "grad_norm": 0.04620973196436286, + "language_loss": 0.843225, + "learning_rate": 0.00015593195374931452, + "loss": 0.8538934, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.28808594, + "step": 3895, + "time_per_iteration": 2.7463459968566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066974, + "balance_loss_mlp": 1.03781486, + "epoch": 0.7495190457868411, + "flos": 523338209280.0, + "grad_norm": 0.06172140758760985, + "language_loss": 0.79870188, + "learning_rate": 0.00015570597117287922, + "loss": 0.80937159, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.29125977, + "step": 3896, + "time_per_iteration": 2.698322057723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065177, + "balance_loss_mlp": 1.03585148, + "epoch": 0.7497114274721046, + "flos": 513937598976.0, + "grad_norm": 0.06184521079833043, + "language_loss": 0.77818131, + "learning_rate": 0.0001554801222632406, + "loss": 0.78883302, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.29296875, + "step": 3897, + "time_per_iteration": 2.5883569717407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067337, + "balance_loss_mlp": 1.03872728, + "epoch": 0.7499038091573682, + "flos": 494759467008.0, + "grad_norm": 0.05373326836284952, + "language_loss": 0.8491286, + "learning_rate": 0.00015525440710808052, + "loss": 0.85980201, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.28588867, + "step": 3898, + "time_per_iteration": 2.628744125366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063318, + "balance_loss_mlp": 1.03415978, + "epoch": 0.7500961908426318, + "flos": 737326900224.0, + "grad_norm": 0.060715179246677825, + "language_loss": 0.77859104, + "learning_rate": 0.00015502882579502953, + "loss": 0.78922421, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.29101562, + "step": 3899, + "time_per_iteration": 2.9461636543273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106545, + "balance_loss_mlp": 1.03576672, + "epoch": 0.7502885725278954, + "flos": 533117140992.0, + "grad_norm": 0.04885018850646455, + "language_loss": 0.84403229, + "learning_rate": 0.00015480337841166592, + "loss": 0.85468674, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.29638672, + "step": 3900, + "time_per_iteration": 2.712470531463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071559, + "balance_loss_mlp": 1.04287767, + "epoch": 0.7504809542131589, + "flos": 589017567744.0, + "grad_norm": 0.062426881340490126, + "language_loss": 0.83192408, + "learning_rate": 0.00015457806504551647, + "loss": 0.84263968, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.28686523, + "step": 3901, + "time_per_iteration": 2.8195760250091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065, + "balance_loss_mlp": 1.0360322, + "epoch": 0.7506733358984224, + "flos": 511293883392.0, + "grad_norm": 0.11477974594715189, + "language_loss": 0.78299713, + "learning_rate": 0.0001543528857840554, + "loss": 0.79364717, + "num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.28955078, + "step": 3902, + "time_per_iteration": 2.630005121231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069882, + "balance_loss_mlp": 1.04155791, + "epoch": 0.750865717583686, + "flos": 538990917120.0, + "grad_norm": 0.06709872205496833, + "language_loss": 0.80052483, + "learning_rate": 0.000154127840714705, + "loss": 0.81122363, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.28320312, + "step": 3903, + "time_per_iteration": 2.7631478309631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064518, + "balance_loss_mlp": 1.03574109, + "epoch": 0.7510580992689496, + "flos": 476339387904.0, + "grad_norm": 0.0656362631946546, + "language_loss": 0.81441653, + "learning_rate": 0.00015390292992483557, + "loss": 0.82506168, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.28759766, + "step": 3904, + "time_per_iteration": 2.5295097827911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069456, + "balance_loss_mlp": 1.0401783, + "epoch": 0.7512504809542132, + "flos": 578755597824.0, + "grad_norm": 0.05357678642302426, + "language_loss": 0.84239411, + "learning_rate": 0.00015367815350176523, + "loss": 0.85308868, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.29223633, + "step": 3905, + "time_per_iteration": 2.774902582168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065573, + "balance_loss_mlp": 1.03674817, + "epoch": 0.7514428626394767, + "flos": 418435435008.0, + "grad_norm": 0.052651193007747205, + "language_loss": 0.82780552, + "learning_rate": 0.00015345351153275987, + "loss": 0.83846122, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.28808594, + "step": 3906, + "time_per_iteration": 2.514157772064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068641, + "balance_loss_mlp": 1.03964877, + "epoch": 0.7516352443247403, + "flos": 640736414208.0, + "grad_norm": 0.05447043379457725, + "language_loss": 0.80753815, + "learning_rate": 0.00015322900410503332, + "loss": 0.81822455, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.28955078, + "step": 3907, + "time_per_iteration": 2.8011515140533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070258, + "balance_loss_mlp": 1.04150474, + "epoch": 0.7518276260100039, + "flos": 580700897280.0, + "grad_norm": 0.13484252880290531, + "language_loss": 0.77137792, + "learning_rate": 0.00015300463130574703, + "loss": 0.78208047, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.28710938, + "step": 3908, + "time_per_iteration": 2.8607709407806396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068663, + "balance_loss_mlp": 1.03983819, + "epoch": 0.7520200076952674, + "flos": 687025234944.0, + "grad_norm": 0.04704882043674688, + "language_loss": 0.82268852, + "learning_rate": 0.00015278039322201033, + "loss": 0.8333751, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.28808594, + "step": 3909, + "time_per_iteration": 2.9650497436523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068127, + "balance_loss_mlp": 1.04047048, + "epoch": 0.7522123893805309, + "flos": 486196895232.0, + "grad_norm": 0.0655524275561889, + "language_loss": 0.79742765, + "learning_rate": 0.00015255628994088004, + "loss": 0.80810893, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.27685547, + "step": 3910, + "time_per_iteration": 2.5476014614105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073189, + "balance_loss_mlp": 1.04410195, + "epoch": 0.7524047710657945, + "flos": 818581800960.0, + "grad_norm": 0.059223553783327845, + "language_loss": 0.74873102, + "learning_rate": 0.00015233232154936082, + "loss": 0.75946289, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.29101562, + "step": 3911, + "time_per_iteration": 3.244593858718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070034, + "balance_loss_mlp": 1.04204392, + "epoch": 0.7525971527510581, + "flos": 699191806464.0, + "grad_norm": 0.05757806259910298, + "language_loss": 0.76233411, + "learning_rate": 0.0001521084881344048, + "loss": 0.77303445, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.27978516, + "step": 3912, + "time_per_iteration": 2.874175548553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068448, + "balance_loss_mlp": 1.03988528, + "epoch": 0.7527895344363217, + "flos": 633497891328.0, + "grad_norm": 0.058305123662607664, + "language_loss": 0.8657366, + "learning_rate": 0.00015188478978291208, + "loss": 0.87642109, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.28564453, + "step": 3913, + "time_per_iteration": 2.76914119720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072508, + "balance_loss_mlp": 1.04387414, + "epoch": 0.7529819161215853, + "flos": 562555832832.0, + "grad_norm": 0.05696914319302461, + "language_loss": 0.8621434, + "learning_rate": 0.00015166122658173014, + "loss": 0.87286842, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.28637695, + "step": 3914, + "time_per_iteration": 2.7666819095611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069582, + "balance_loss_mlp": 1.04121017, + "epoch": 0.7531742978068487, + "flos": 690344045568.0, + "grad_norm": 0.05613078933144466, + "language_loss": 0.88230741, + "learning_rate": 0.00015143779861765332, + "loss": 0.89300323, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.28369141, + "step": 3915, + "time_per_iteration": 2.9440953731536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068792, + "balance_loss_mlp": 1.04058695, + "epoch": 0.7533666794921123, + "flos": 680800840704.0, + "grad_norm": 0.0540096565314657, + "language_loss": 0.81303173, + "learning_rate": 0.00015121450597742458, + "loss": 0.82371962, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.28198242, + "step": 3916, + "time_per_iteration": 2.8476526737213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067282, + "balance_loss_mlp": 1.03871989, + "epoch": 0.7535590611773759, + "flos": 623384308224.0, + "grad_norm": 0.0625846652791648, + "language_loss": 0.78284335, + "learning_rate": 0.00015099134874773369, + "loss": 0.79351616, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.28613281, + "step": 3917, + "time_per_iteration": 2.7236275672912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066499, + "balance_loss_mlp": 1.03791249, + "epoch": 0.7537514428626395, + "flos": 519162421248.0, + "grad_norm": 0.06623718225432344, + "language_loss": 0.80174196, + "learning_rate": 0.00015076832701521793, + "loss": 0.81240696, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.28588867, + "step": 3918, + "time_per_iteration": 2.7410969734191895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070943, + "balance_loss_mlp": 1.04238045, + "epoch": 0.753943824547903, + "flos": 723309571584.0, + "grad_norm": 0.06658372042006708, + "language_loss": 0.81702781, + "learning_rate": 0.000150545440866462, + "loss": 0.82773727, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.28540039, + "step": 3919, + "time_per_iteration": 2.9761922359466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069188, + "balance_loss_mlp": 1.04143584, + "epoch": 0.7541362062331666, + "flos": 437318203392.0, + "grad_norm": 0.07410111643216553, + "language_loss": 0.78494799, + "learning_rate": 0.000150322690387998, + "loss": 0.79563987, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.27758789, + "step": 3920, + "time_per_iteration": 2.516460657119751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071675, + "balance_loss_mlp": 1.04316044, + "epoch": 0.7543285879184302, + "flos": 565007491584.0, + "grad_norm": 0.05131276366098942, + "language_loss": 0.74961436, + "learning_rate": 0.00015010007566630535, + "loss": 0.76033103, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.28491211, + "step": 3921, + "time_per_iteration": 2.7329115867614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071062, + "balance_loss_mlp": 1.04383469, + "epoch": 0.7545209696036937, + "flos": 520781833728.0, + "grad_norm": 0.07801712247115837, + "language_loss": 0.81558347, + "learning_rate": 0.00014987759678781077, + "loss": 0.82629412, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.27246094, + "step": 3922, + "time_per_iteration": 2.611708641052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071556, + "balance_loss_mlp": 1.04370856, + "epoch": 0.7547133512889573, + "flos": 615782020608.0, + "grad_norm": 0.05153768257221068, + "language_loss": 0.82422328, + "learning_rate": 0.00014965525383888795, + "loss": 0.83493882, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.27856445, + "step": 3923, + "time_per_iteration": 2.7729198932647705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072323, + "balance_loss_mlp": 1.04433274, + "epoch": 0.7549057329742208, + "flos": 750522157056.0, + "grad_norm": 0.0575234231525959, + "language_loss": 0.7209577, + "learning_rate": 0.00014943304690585851, + "loss": 0.73168093, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.2800293, + "step": 3924, + "time_per_iteration": 2.9442129135131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071679, + "balance_loss_mlp": 1.04378402, + "epoch": 0.7550981146594844, + "flos": 514193674752.0, + "grad_norm": 0.07421500953939195, + "language_loss": 0.79421008, + "learning_rate": 0.0001492109760749908, + "loss": 0.80492687, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.27905273, + "step": 3925, + "time_per_iteration": 2.643162965774536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071892, + "balance_loss_mlp": 1.04392564, + "epoch": 0.755290496344748, + "flos": 521756674560.0, + "grad_norm": 0.059903848409534166, + "language_loss": 0.79955506, + "learning_rate": 0.00014898904143250002, + "loss": 0.81027395, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.27978516, + "step": 3926, + "time_per_iteration": 2.6683785915374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013114, + "balance_loss_mlp": 1.00157464, + "epoch": 0.7554828780300116, + "flos": 1413845521920.0, + "grad_norm": 0.014723160486699832, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76768315, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.11523438, + "step": 3927, + "time_per_iteration": 4.920205354690552 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071852, + "balance_loss_mlp": 1.04331291, + "epoch": 0.7556752597152752, + "flos": 556676264448.0, + "grad_norm": 0.05563270173237852, + "language_loss": 0.80196631, + "learning_rate": 0.0001485455810572474, + "loss": 0.81268483, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.28540039, + "step": 3928, + "time_per_iteration": 2.6541106700897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073533, + "balance_loss_mlp": 1.04499388, + "epoch": 0.7558676414005386, + "flos": 563363347968.0, + "grad_norm": 0.04999178273670638, + "language_loss": 0.84088999, + "learning_rate": 0.00014832405549665236, + "loss": 0.85162532, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.28564453, + "step": 3929, + "time_per_iteration": 2.6799492835998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070648, + "balance_loss_mlp": 1.04203749, + "epoch": 0.7560600230858022, + "flos": 561089189376.0, + "grad_norm": 0.061253165396126415, + "language_loss": 0.78636932, + "learning_rate": 0.00014810266646876746, + "loss": 0.79707581, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.28613281, + "step": 3930, + "time_per_iteration": 2.7644495964050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068729, + "balance_loss_mlp": 1.03980851, + "epoch": 0.7562524047710658, + "flos": 719232708096.0, + "grad_norm": 0.0768252646204266, + "language_loss": 0.77379584, + "learning_rate": 0.00014788141405954364, + "loss": 0.78448313, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.28930664, + "step": 3931, + "time_per_iteration": 2.996284246444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072748, + "balance_loss_mlp": 1.04418492, + "epoch": 0.7564447864563294, + "flos": 543086719488.0, + "grad_norm": 0.07792136157882237, + "language_loss": 0.84719956, + "learning_rate": 0.00014766029835487865, + "loss": 0.85792696, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.28564453, + "step": 3932, + "time_per_iteration": 2.7055630683898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010723, + "balance_loss_mlp": 1.04326117, + "epoch": 0.7566371681415929, + "flos": 725484805632.0, + "grad_norm": 0.0830870815556461, + "language_loss": 0.79488772, + "learning_rate": 0.0001474393194406173, + "loss": 0.80561072, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.29052734, + "step": 3933, + "time_per_iteration": 2.8866286277770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075045, + "balance_loss_mlp": 1.04583836, + "epoch": 0.7568295498268565, + "flos": 576274825728.0, + "grad_norm": 0.06997934005865011, + "language_loss": 0.79262674, + "learning_rate": 0.00014721847740255112, + "loss": 0.80337715, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.29174805, + "step": 3934, + "time_per_iteration": 2.8177120685577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013149, + "balance_loss_mlp": 1.00151432, + "epoch": 0.75702193151212, + "flos": 1519273594368.0, + "grad_norm": 0.018539216642102736, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.74925071, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.11621094, + "step": 3935, + "time_per_iteration": 4.663410186767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070009, + "balance_loss_mlp": 1.04085028, + "epoch": 0.7572143131973836, + "flos": 525218079744.0, + "grad_norm": 0.08081636486404137, + "language_loss": 0.7884202, + "learning_rate": 0.00014677720429790526, + "loss": 0.79912031, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.29125977, + "step": 3936, + "time_per_iteration": 2.5801281929016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106807, + "balance_loss_mlp": 1.03791022, + "epoch": 0.7574066948826472, + "flos": 550467836928.0, + "grad_norm": 0.05183566311050574, + "language_loss": 0.8430894, + "learning_rate": 0.0001465567734026429, + "loss": 0.85377008, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.30126953, + "step": 3937, + "time_per_iteration": 2.711367607116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071543, + "balance_loss_mlp": 1.0420028, + "epoch": 0.7575990765679107, + "flos": 395682416640.0, + "grad_norm": 0.061048992240079196, + "language_loss": 0.82235777, + "learning_rate": 0.00014633647972621034, + "loss": 0.83307326, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.29492188, + "step": 3938, + "time_per_iteration": 2.4616081714630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067246, + "balance_loss_mlp": 1.03861201, + "epoch": 0.7577914582531743, + "flos": 584742855168.0, + "grad_norm": 0.05374365085178841, + "language_loss": 0.86112857, + "learning_rate": 0.00014611632335413354, + "loss": 0.87180108, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.28637695, + "step": 3939, + "time_per_iteration": 2.815455436706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061142, + "balance_loss_mlp": 1.03296053, + "epoch": 0.7579838399384379, + "flos": 820604265984.0, + "grad_norm": 0.05753060969911492, + "language_loss": 0.82291019, + "learning_rate": 0.00014589630437188456, + "loss": 0.8335216, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.28222656, + "step": 3940, + "time_per_iteration": 3.190596580505371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065405, + "balance_loss_mlp": 1.03698504, + "epoch": 0.7581762216237015, + "flos": 443664843264.0, + "grad_norm": 0.07206463977261317, + "language_loss": 0.78593653, + "learning_rate": 0.00014567642286488253, + "loss": 0.79659057, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.28466797, + "step": 3941, + "time_per_iteration": 2.5607380867004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071035, + "balance_loss_mlp": 1.04073191, + "epoch": 0.7583686033089649, + "flos": 540624886272.0, + "grad_norm": 0.06381401552287866, + "language_loss": 0.79120469, + "learning_rate": 0.00014545667891849258, + "loss": 0.80191505, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.30249023, + "step": 3942, + "time_per_iteration": 2.6117217540740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071107, + "balance_loss_mlp": 1.04192472, + "epoch": 0.7585609849942285, + "flos": 522332845056.0, + "grad_norm": 0.05226186971292142, + "language_loss": 0.82272542, + "learning_rate": 0.00014523707261802733, + "loss": 0.83343649, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.29174805, + "step": 3943, + "time_per_iteration": 2.665384292602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072852, + "balance_loss_mlp": 1.04448068, + "epoch": 0.7587533666794921, + "flos": 541599727104.0, + "grad_norm": 0.07358446075620559, + "language_loss": 0.81266546, + "learning_rate": 0.00014501760404874527, + "loss": 0.823394, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.28344727, + "step": 3944, + "time_per_iteration": 2.723860263824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076046, + "balance_loss_mlp": 1.04698288, + "epoch": 0.7589457483647557, + "flos": 606131126784.0, + "grad_norm": 0.059139493232711386, + "language_loss": 0.85488701, + "learning_rate": 0.00014479827329585176, + "loss": 0.86564749, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.29052734, + "step": 3945, + "time_per_iteration": 2.6966402530670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070791, + "balance_loss_mlp": 1.04260945, + "epoch": 0.7591381300500193, + "flos": 554821125120.0, + "grad_norm": 0.05454852499248085, + "language_loss": 0.84753144, + "learning_rate": 0.00014457908044449846, + "loss": 0.85823941, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.28173828, + "step": 3946, + "time_per_iteration": 2.751542329788208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069083, + "balance_loss_mlp": 1.0412122, + "epoch": 0.7593305117352828, + "flos": 529399660032.0, + "grad_norm": 0.057352771815407315, + "language_loss": 0.82947516, + "learning_rate": 0.00014436002557978371, + "loss": 0.84016603, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.27856445, + "step": 3947, + "time_per_iteration": 2.8199281692504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024854, + "balance_loss_mlp": 1.0139817, + "epoch": 0.7595228934205464, + "flos": 1502020412928.0, + "grad_norm": 0.01569529231199887, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77667999, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.10888672, + "step": 3948, + "time_per_iteration": 4.886767387390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071405, + "balance_loss_mlp": 1.04217458, + "epoch": 0.7597152751058099, + "flos": 455290149888.0, + "grad_norm": 0.052184618076363286, + "language_loss": 0.79761183, + "learning_rate": 0.0001439223301503945, + "loss": 0.80832583, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.29223633, + "step": 3949, + "time_per_iteration": 2.524615526199341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107458, + "balance_loss_mlp": 1.04644656, + "epoch": 0.7599076567910735, + "flos": 685135190016.0, + "grad_norm": 0.06319987538441409, + "language_loss": 0.76281846, + "learning_rate": 0.00014370368975564834, + "loss": 0.77356422, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.28112793, + "step": 3950, + "time_per_iteration": 2.9306294918060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073257, + "balance_loss_mlp": 1.045434, + "epoch": 0.760100038476337, + "flos": 532092837888.0, + "grad_norm": 0.07868227598634299, + "language_loss": 0.83049744, + "learning_rate": 0.00014348518768739766, + "loss": 0.84123003, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.27832031, + "step": 3951, + "time_per_iteration": 2.7313663959503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027286, + "balance_loss_mlp": 1.01646149, + "epoch": 0.7602924201616006, + "flos": 1470952134144.0, + "grad_norm": 0.015467940128204082, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.77755326, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.10839844, + "step": 3952, + "time_per_iteration": 4.869096994400024 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107331, + "balance_loss_mlp": 1.04593956, + "epoch": 0.7604848018468642, + "flos": 774280539648.0, + "grad_norm": 0.05530347415553069, + "language_loss": 0.86385798, + "learning_rate": 0.00014304859886964867, + "loss": 0.87459111, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 0.27441406, + "step": 3953, + "time_per_iteration": 3.04145884513855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073306, + "balance_loss_mlp": 1.04591215, + "epoch": 0.7606771835321278, + "flos": 557917355520.0, + "grad_norm": 0.05036114884340379, + "language_loss": 0.83556843, + "learning_rate": 0.00014283051228964878, + "loss": 0.8463015, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.27416992, + "step": 3954, + "time_per_iteration": 2.694143772125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072565, + "balance_loss_mlp": 1.0455761, + "epoch": 0.7608695652173914, + "flos": 525139504128.0, + "grad_norm": 0.07332559246133831, + "language_loss": 0.82520175, + "learning_rate": 0.00014261256437514197, + "loss": 0.83592749, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.27026367, + "step": 3955, + "time_per_iteration": 2.644928455352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081954, + "balance_loss_mlp": 1.05405927, + "epoch": 0.7610619469026548, + "flos": 614757717504.0, + "grad_norm": 0.0938811683144382, + "language_loss": 0.82110238, + "learning_rate": 0.0001423947552107428, + "loss": 0.83192188, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.27929688, + "step": 3956, + "time_per_iteration": 2.7390809059143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_mlp": 1.0495677, + "epoch": 0.7612543285879184, + "flos": 862992313344.0, + "grad_norm": 0.058156679645763765, + "language_loss": 0.77027428, + "learning_rate": 0.00014217708488101243, + "loss": 0.78105605, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.28637695, + "step": 3957, + "time_per_iteration": 3.068586587905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078853, + "balance_loss_mlp": 1.0505054, + "epoch": 0.761446710273182, + "flos": 553392359424.0, + "grad_norm": 0.051838175229669575, + "language_loss": 0.76812273, + "learning_rate": 0.0001419595534704579, + "loss": 0.77891129, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.28369141, + "step": 3958, + "time_per_iteration": 2.6755166053771973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078829, + "balance_loss_mlp": 1.05176806, + "epoch": 0.7616390919584456, + "flos": 467107513344.0, + "grad_norm": 0.08007848421566002, + "language_loss": 0.80974507, + "learning_rate": 0.00014174216106353237, + "loss": 0.82053339, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.27124023, + "step": 3959, + "time_per_iteration": 2.6076533794403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077365, + "balance_loss_mlp": 1.04985189, + "epoch": 0.7618314736437091, + "flos": 498181584384.0, + "grad_norm": 0.05778330536162942, + "language_loss": 0.75894332, + "learning_rate": 0.00014152490774463512, + "loss": 0.76971698, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.27539062, + "step": 3960, + "time_per_iteration": 2.690720558166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079227, + "balance_loss_mlp": 1.05211914, + "epoch": 0.7620238553289727, + "flos": 434319487488.0, + "grad_norm": 0.07078023204432035, + "language_loss": 0.86778873, + "learning_rate": 0.00014130779359811135, + "loss": 0.87858105, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.27148438, + "step": 3961, + "time_per_iteration": 2.479097843170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074576, + "balance_loss_mlp": 1.04672933, + "epoch": 0.7622162370142362, + "flos": 663962296320.0, + "grad_norm": 0.053637952879954945, + "language_loss": 0.85656244, + "learning_rate": 0.0001410908187082521, + "loss": 0.86730814, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.27856445, + "step": 3962, + "time_per_iteration": 2.8493921756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073404, + "balance_loss_mlp": 1.04527116, + "epoch": 0.7624086186994998, + "flos": 557700567552.0, + "grad_norm": 0.06361910700745704, + "language_loss": 0.82962865, + "learning_rate": 0.0001408739831592949, + "loss": 0.84036273, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.28149414, + "step": 3963, + "time_per_iteration": 2.670091152191162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072269, + "balance_loss_mlp": 1.04530358, + "epoch": 0.7626010003847634, + "flos": 628844857344.0, + "grad_norm": 0.06318704886131189, + "language_loss": 0.77098757, + "learning_rate": 0.0001406572870354224, + "loss": 0.78171021, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.27001953, + "step": 3964, + "time_per_iteration": 2.8136370182037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076051, + "balance_loss_mlp": 1.04758406, + "epoch": 0.7627933820700269, + "flos": 437716873728.0, + "grad_norm": 0.08123777777865493, + "language_loss": 0.87067986, + "learning_rate": 0.00014044073042076337, + "loss": 0.88144034, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.28491211, + "step": 3965, + "time_per_iteration": 2.601212739944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077118, + "balance_loss_mlp": 1.04948556, + "epoch": 0.7629857637552905, + "flos": 532456602624.0, + "grad_norm": 0.044562098322040423, + "language_loss": 0.88958192, + "learning_rate": 0.00014022431339939302, + "loss": 0.90035319, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.27636719, + "step": 3966, + "time_per_iteration": 2.6651570796966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069045, + "balance_loss_mlp": 1.04119754, + "epoch": 0.7631781454405541, + "flos": 679737249792.0, + "grad_norm": 0.09228261412980937, + "language_loss": 0.77959037, + "learning_rate": 0.00014000803605533163, + "loss": 0.79028082, + "num_input_tokens_seen": 329034960, + "router_z_loss_mlp": 0.27856445, + "step": 3967, + "time_per_iteration": 2.8413825035095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070975, + "balance_loss_mlp": 1.04367566, + "epoch": 0.7633705271258177, + "flos": 507246133248.0, + "grad_norm": 0.08332228620070425, + "language_loss": 0.83150613, + "learning_rate": 0.00013979189847254553, + "loss": 0.8422159, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.27294922, + "step": 3968, + "time_per_iteration": 2.578245162963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071362, + "balance_loss_mlp": 1.04282331, + "epoch": 0.7635629088110811, + "flos": 618574123008.0, + "grad_norm": 0.06392054280336681, + "language_loss": 0.80515426, + "learning_rate": 0.00013957590073494674, + "loss": 0.8158679, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.28540039, + "step": 3969, + "time_per_iteration": 2.7899181842803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069143, + "balance_loss_mlp": 1.04182076, + "epoch": 0.7637552904963447, + "flos": 638140750848.0, + "grad_norm": 0.08725250729100972, + "language_loss": 0.7866261, + "learning_rate": 0.0001393600429263931, + "loss": 0.7973175, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.2734375, + "step": 3970, + "time_per_iteration": 2.7429044246673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010211, + "balance_loss_mlp": 0.99867129, + "epoch": 0.7639476721816083, + "flos": 1562359905792.0, + "grad_norm": 0.0172148744606984, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.75755095, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.11523438, + "step": 3971, + "time_per_iteration": 4.9502363204956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066404, + "balance_loss_mlp": 1.03834224, + "epoch": 0.7641400538668719, + "flos": 495729925632.0, + "grad_norm": 0.05751268278268784, + "language_loss": 0.81411171, + "learning_rate": 0.0001389287474315804, + "loss": 0.8247757, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.28076172, + "step": 3972, + "time_per_iteration": 2.6566832065582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070413, + "balance_loss_mlp": 1.04213631, + "epoch": 0.7643324355521355, + "flos": 578173635072.0, + "grad_norm": 0.05008758615727923, + "language_loss": 0.8002165, + "learning_rate": 0.00013871330991276505, + "loss": 0.81092072, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.28295898, + "step": 3973, + "time_per_iteration": 2.7023086547851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071476, + "balance_loss_mlp": 1.04334247, + "epoch": 0.764524817237399, + "flos": 784472698368.0, + "grad_norm": 0.061481835950818894, + "language_loss": 0.80452615, + "learning_rate": 0.00013849801265788247, + "loss": 0.81524092, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.28149414, + "step": 3974, + "time_per_iteration": 2.997316837310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067079, + "balance_loss_mlp": 1.03861213, + "epoch": 0.7647171989226625, + "flos": 526025594880.0, + "grad_norm": 0.07226378616877399, + "language_loss": 0.82833815, + "learning_rate": 0.00013828285575051818, + "loss": 0.83900893, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.28466797, + "step": 3975, + "time_per_iteration": 2.588979721069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062123, + "balance_loss_mlp": 1.03437066, + "epoch": 0.7649095806079261, + "flos": 554589780480.0, + "grad_norm": 0.06463560472951296, + "language_loss": 0.83791184, + "learning_rate": 0.0001380678392742035, + "loss": 0.84853303, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.27783203, + "step": 3976, + "time_per_iteration": 2.734581708908081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061699, + "balance_loss_mlp": 1.03378069, + "epoch": 0.7651019622931897, + "flos": 648836296704.0, + "grad_norm": 0.05082413379641715, + "language_loss": 0.84568453, + "learning_rate": 0.00013785296331241526, + "loss": 0.85630155, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.27954102, + "step": 3977, + "time_per_iteration": 2.9020192623138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065476, + "balance_loss_mlp": 1.03727102, + "epoch": 0.7652943439784533, + "flos": 1046034971136.0, + "grad_norm": 0.0974531570465959, + "language_loss": 0.86962479, + "learning_rate": 0.00013763822794857583, + "loss": 0.88027954, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.28222656, + "step": 3978, + "time_per_iteration": 3.2940611839294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066351, + "balance_loss_mlp": 1.03847969, + "epoch": 0.7654867256637168, + "flos": 504085883904.0, + "grad_norm": 0.06678664441020601, + "language_loss": 0.89705759, + "learning_rate": 0.00013742363326605278, + "loss": 0.9077211, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.27880859, + "step": 3979, + "time_per_iteration": 2.717656135559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064419, + "balance_loss_mlp": 1.03473556, + "epoch": 0.7656791073489804, + "flos": 574422658560.0, + "grad_norm": 0.10335635669358377, + "language_loss": 0.78531003, + "learning_rate": 0.00013720917934815935, + "loss": 0.79595423, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.296875, + "step": 3980, + "time_per_iteration": 2.7627711296081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067901, + "balance_loss_mlp": 1.03960097, + "epoch": 0.765871489034244, + "flos": 492568266240.0, + "grad_norm": 0.07286561915101249, + "language_loss": 0.82861632, + "learning_rate": 0.00013699486627815344, + "loss": 0.83929539, + "num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.28295898, + "step": 3981, + "time_per_iteration": 2.612478494644165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068523, + "balance_loss_mlp": 1.04024673, + "epoch": 0.7660638707195075, + "flos": 485769111552.0, + "grad_norm": 0.05570598750158071, + "language_loss": 0.82202697, + "learning_rate": 0.00013678069413923928, + "loss": 0.83271217, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.28320312, + "step": 3982, + "time_per_iteration": 2.586998701095581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067122, + "balance_loss_mlp": 1.03844047, + "epoch": 0.766256252404771, + "flos": 444059131392.0, + "grad_norm": 0.07121708811283338, + "language_loss": 0.81735259, + "learning_rate": 0.00013656666301456555, + "loss": 0.82802379, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.28662109, + "step": 3983, + "time_per_iteration": 2.574695587158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066984, + "balance_loss_mlp": 1.03906524, + "epoch": 0.7664486340900346, + "flos": 484922308608.0, + "grad_norm": 0.055314975613937604, + "language_loss": 0.83996785, + "learning_rate": 0.0001363527729872267, + "loss": 0.85063773, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.27929688, + "step": 3984, + "time_per_iteration": 2.6829311847686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069719, + "balance_loss_mlp": 1.04191911, + "epoch": 0.7666410157752982, + "flos": 645905981952.0, + "grad_norm": 0.061166263195475266, + "language_loss": 0.76441991, + "learning_rate": 0.00013613902414026207, + "loss": 0.77511704, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.27832031, + "step": 3985, + "time_per_iteration": 2.7802467346191406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067154, + "balance_loss_mlp": 1.03947425, + "epoch": 0.7668333974605618, + "flos": 773964827136.0, + "grad_norm": 0.05402447635552578, + "language_loss": 0.82339627, + "learning_rate": 0.00013592541655665642, + "loss": 0.83406782, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.27709961, + "step": 3986, + "time_per_iteration": 2.9866812229156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070906, + "balance_loss_mlp": 1.04255819, + "epoch": 0.7670257791458254, + "flos": 613200913920.0, + "grad_norm": 0.07328879507268711, + "language_loss": 0.85332406, + "learning_rate": 0.00013571195031933947, + "loss": 0.86403316, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.28320312, + "step": 3987, + "time_per_iteration": 2.673912525177002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016637, + "balance_loss_mlp": 1.00533557, + "epoch": 0.7672181608310888, + "flos": 1484608670208.0, + "grad_norm": 0.005208486185004438, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.81497979, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.11279297, + "step": 3988, + "time_per_iteration": 4.698279619216919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069975, + "balance_loss_mlp": 1.04217589, + "epoch": 0.7674105425163524, + "flos": 610449509376.0, + "grad_norm": 0.06677874529098146, + "language_loss": 0.85441434, + "learning_rate": 0.00013528544221501655, + "loss": 0.86511409, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.27832031, + "step": 3989, + "time_per_iteration": 2.7262814044952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079413, + "balance_loss_mlp": 1.05132711, + "epoch": 0.767602924201616, + "flos": 844857423360.0, + "grad_norm": 0.06376913662917556, + "language_loss": 0.81445122, + "learning_rate": 0.00013507240051359586, + "loss": 0.82524538, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.28100586, + "step": 3990, + "time_per_iteration": 3.0680136680603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076429, + "balance_loss_mlp": 1.04944038, + "epoch": 0.7677953058868796, + "flos": 526857841152.0, + "grad_norm": 0.06248947721820998, + "language_loss": 0.85939497, + "learning_rate": 0.00013485950048963425, + "loss": 0.87015927, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.27026367, + "step": 3991, + "time_per_iteration": 2.652700424194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072898, + "balance_loss_mlp": 1.04629004, + "epoch": 0.7679876875721431, + "flos": 923161660416.0, + "grad_norm": 0.05838140649114419, + "language_loss": 0.82813108, + "learning_rate": 0.00013464674222578643, + "loss": 0.83886003, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.26660156, + "step": 3992, + "time_per_iteration": 3.199664354324341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078059, + "balance_loss_mlp": 1.05028319, + "epoch": 0.7681800692574067, + "flos": 457855289856.0, + "grad_norm": 0.060819943301615054, + "language_loss": 0.8307544, + "learning_rate": 0.00013443412580465292, + "loss": 0.84153497, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.27783203, + "step": 3993, + "time_per_iteration": 2.6216468811035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077791, + "balance_loss_mlp": 1.04999137, + "epoch": 0.7683724509426703, + "flos": 658113251328.0, + "grad_norm": 0.05683440391019819, + "language_loss": 0.83944607, + "learning_rate": 0.00013422165130877857, + "loss": 0.85022402, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.27807617, + "step": 3994, + "time_per_iteration": 2.8932595252990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077464, + "balance_loss_mlp": 1.05011749, + "epoch": 0.7685648326279338, + "flos": 555021946368.0, + "grad_norm": 0.058104534387139244, + "language_loss": 0.80272782, + "learning_rate": 0.00013400931882065327, + "loss": 0.81350249, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.27392578, + "step": 3995, + "time_per_iteration": 2.6307244300842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107594, + "balance_loss_mlp": 1.04768717, + "epoch": 0.7687572143131974, + "flos": 687070315008.0, + "grad_norm": 0.08323850441020555, + "language_loss": 0.80980253, + "learning_rate": 0.0001337971284227118, + "loss": 0.82056189, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.28222656, + "step": 3996, + "time_per_iteration": 3.022775411605835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025127, + "balance_loss_mlp": 1.01415932, + "epoch": 0.7689495959984609, + "flos": 1488653448192.0, + "grad_norm": 0.008597329334489423, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.7714355, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.10986328, + "step": 3997, + "time_per_iteration": 4.959140777587891 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073396, + "balance_loss_mlp": 1.0458113, + "epoch": 0.7691419776837245, + "flos": 570133389312.0, + "grad_norm": 0.05719845249799778, + "language_loss": 0.80268121, + "learning_rate": 0.0001333731742268438, + "loss": 0.81341517, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.27636719, + "step": 3998, + "time_per_iteration": 2.6925253868103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078887, + "balance_loss_mlp": 1.05142081, + "epoch": 0.7693343593689881, + "flos": 519812785152.0, + "grad_norm": 0.05688018347037518, + "language_loss": 0.85395527, + "learning_rate": 0.0001331614105935109, + "loss": 0.86474419, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.27514648, + "step": 3999, + "time_per_iteration": 2.653233051300049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076072, + "balance_loss_mlp": 1.04843915, + "epoch": 0.7695267410542517, + "flos": 660086254080.0, + "grad_norm": 0.05160358655207702, + "language_loss": 0.84470475, + "learning_rate": 0.00013294978937954883, + "loss": 0.85546547, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.27685547, + "step": 4000, + "time_per_iteration": 2.776451349258423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073445, + "balance_loss_mlp": 1.04562187, + "epoch": 0.7697191227395151, + "flos": 546548124672.0, + "grad_norm": 0.08124921192431957, + "language_loss": 0.8516435, + "learning_rate": 0.00013273831066711655, + "loss": 0.862378, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.27856445, + "step": 4001, + "time_per_iteration": 2.624626874923706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075558, + "balance_loss_mlp": 1.04903352, + "epoch": 0.7699115044247787, + "flos": 540339697152.0, + "grad_norm": 0.06596404445695028, + "language_loss": 0.79911482, + "learning_rate": 0.00013252697453831747, + "loss": 0.80987036, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.26574707, + "step": 4002, + "time_per_iteration": 2.714096784591675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072085, + "balance_loss_mlp": 1.04480982, + "epoch": 0.7701038861100423, + "flos": 562635818496.0, + "grad_norm": 0.05249171180112231, + "language_loss": 0.82409763, + "learning_rate": 0.00013231578107519916, + "loss": 0.83481848, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.27319336, + "step": 4003, + "time_per_iteration": 2.8834095001220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073589, + "balance_loss_mlp": 1.04602814, + "epoch": 0.7702962677953059, + "flos": 481490016768.0, + "grad_norm": 0.06222122285204978, + "language_loss": 0.82945186, + "learning_rate": 0.00013210473035975422, + "loss": 0.84018773, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.27587891, + "step": 4004, + "time_per_iteration": 2.5676841735839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075243, + "balance_loss_mlp": 1.04756224, + "epoch": 0.7704886494805695, + "flos": 770036350464.0, + "grad_norm": 0.09382472586261968, + "language_loss": 0.85468185, + "learning_rate": 0.0001318938224739201, + "loss": 0.86543441, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.27734375, + "step": 4005, + "time_per_iteration": 3.032289743423462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072979, + "balance_loss_mlp": 1.04544115, + "epoch": 0.770681031165833, + "flos": 600912096768.0, + "grad_norm": 0.05515917324758249, + "language_loss": 0.83841556, + "learning_rate": 0.00013168305749957843, + "loss": 0.84914535, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.27587891, + "step": 4006, + "time_per_iteration": 2.739898920059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074884, + "balance_loss_mlp": 1.04765701, + "epoch": 0.7708734128510966, + "flos": 495862345728.0, + "grad_norm": 0.05387672734187661, + "language_loss": 0.8264026, + "learning_rate": 0.00013147243551855532, + "loss": 0.83715147, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.27270508, + "step": 4007, + "time_per_iteration": 2.5597212314605713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071212, + "balance_loss_mlp": 1.04398441, + "epoch": 0.7710657945363601, + "flos": 567012427776.0, + "grad_norm": 0.05404718589625755, + "language_loss": 0.80486447, + "learning_rate": 0.00013126195661262148, + "loss": 0.81557661, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.27270508, + "step": 4008, + "time_per_iteration": 2.744112968444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070425, + "balance_loss_mlp": 1.043365, + "epoch": 0.7712581762216237, + "flos": 604251256320.0, + "grad_norm": 0.04619128213129889, + "language_loss": 0.86330914, + "learning_rate": 0.00013105162086349216, + "loss": 0.87401342, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.27099609, + "step": 4009, + "time_per_iteration": 2.801823616027832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072467, + "balance_loss_mlp": 1.04526305, + "epoch": 0.7714505579068872, + "flos": 530620402176.0, + "grad_norm": 0.04727817553621032, + "language_loss": 0.86132288, + "learning_rate": 0.00013084142835282687, + "loss": 0.8720476, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.2722168, + "step": 4010, + "time_per_iteration": 2.6556901931762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020489, + "balance_loss_mlp": 1.00937891, + "epoch": 0.7716429395921508, + "flos": 1421414313984.0, + "grad_norm": 0.005772893743499722, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.80904853, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.11132812, + "step": 4011, + "time_per_iteration": 4.782922744750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073052, + "balance_loss_mlp": 1.04520464, + "epoch": 0.7718353212774144, + "flos": 578140139520.0, + "grad_norm": 0.05569724258515983, + "language_loss": 0.89507568, + "learning_rate": 0.0001304214733732485, + "loss": 0.90580624, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.27832031, + "step": 4012, + "time_per_iteration": 2.715064525604248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075719, + "balance_loss_mlp": 1.04758501, + "epoch": 0.772027702962678, + "flos": 510486368256.0, + "grad_norm": 0.06797042537174566, + "language_loss": 0.82429183, + "learning_rate": 0.00013021171106737672, + "loss": 0.83504903, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.28125, + "step": 4013, + "time_per_iteration": 2.658712863922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076283, + "balance_loss_mlp": 1.04867363, + "epoch": 0.7722200846479416, + "flos": 525391197696.0, + "grad_norm": 0.05000868356723149, + "language_loss": 0.7937907, + "learning_rate": 0.00013000209232605071, + "loss": 0.80455357, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.27636719, + "step": 4014, + "time_per_iteration": 2.6712594032287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073016, + "balance_loss_mlp": 1.04535961, + "epoch": 0.772412466333205, + "flos": 479348278272.0, + "grad_norm": 0.05134661435861922, + "language_loss": 0.79622269, + "learning_rate": 0.0001297926172306519, + "loss": 0.80695289, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.27685547, + "step": 4015, + "time_per_iteration": 2.610283613204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071717, + "balance_loss_mlp": 1.04420376, + "epoch": 0.7726048480184686, + "flos": 905284256256.0, + "grad_norm": 0.05687508890981391, + "language_loss": 0.78788078, + "learning_rate": 0.0001295832858625055, + "loss": 0.79859793, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.27539062, + "step": 4016, + "time_per_iteration": 3.2706351280212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068156, + "balance_loss_mlp": 1.04054761, + "epoch": 0.7727972297037322, + "flos": 631085520384.0, + "grad_norm": 0.052610449581979135, + "language_loss": 0.69848269, + "learning_rate": 0.00012937409830288154, + "loss": 0.70916426, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.27636719, + "step": 4017, + "time_per_iteration": 2.8540306091308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069412, + "balance_loss_mlp": 1.04220808, + "epoch": 0.7729896113889958, + "flos": 414565185024.0, + "grad_norm": 0.0635987545876438, + "language_loss": 0.85103798, + "learning_rate": 0.00012916505463299362, + "loss": 0.86173213, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.27246094, + "step": 4018, + "time_per_iteration": 2.495150089263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070913, + "balance_loss_mlp": 1.0439713, + "epoch": 0.7731819930742593, + "flos": 668609538048.0, + "grad_norm": 0.05824058585066258, + "language_loss": 0.7791152, + "learning_rate": 0.00012895615493399972, + "loss": 0.78982437, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.26977539, + "step": 4019, + "time_per_iteration": 2.813354015350342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070454, + "balance_loss_mlp": 1.04334593, + "epoch": 0.7733743747595229, + "flos": 489604455936.0, + "grad_norm": 0.0836786402257782, + "language_loss": 0.82400632, + "learning_rate": 0.00012874739928700192, + "loss": 0.83471084, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.27148438, + "step": 4020, + "time_per_iteration": 2.559576988220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068211, + "balance_loss_mlp": 1.0395534, + "epoch": 0.7735667564447865, + "flos": 659294705664.0, + "grad_norm": 0.06159530150970634, + "language_loss": 0.79701376, + "learning_rate": 0.00012853878777304624, + "loss": 0.80769587, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.28662109, + "step": 4021, + "time_per_iteration": 2.8569796085357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072395, + "balance_loss_mlp": 1.04528701, + "epoch": 0.77375913813005, + "flos": 533106966528.0, + "grad_norm": 0.052906319794948725, + "language_loss": 0.84479654, + "learning_rate": 0.000128330320473123, + "loss": 0.85552055, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.27172852, + "step": 4022, + "time_per_iteration": 2.715498208999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013154, + "balance_loss_mlp": 1.0020442, + "epoch": 0.7739515198153136, + "flos": 1519260447744.0, + "grad_norm": 0.015943225392078396, + "language_loss": 0.783319, + "learning_rate": 0.00012812199746816628, + "loss": 0.79345053, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.11132812, + "step": 4023, + "time_per_iteration": 4.888492107391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073117, + "balance_loss_mlp": 1.04610443, + "epoch": 0.7741439015005771, + "flos": 639819800064.0, + "grad_norm": 0.06091537077025671, + "language_loss": 0.81350756, + "learning_rate": 0.0001279138188390543, + "loss": 0.82423878, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.27050781, + "step": 4024, + "time_per_iteration": 2.766850233078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073924, + "balance_loss_mlp": 1.04622006, + "epoch": 0.7743362831858407, + "flos": 665546803200.0, + "grad_norm": 0.05776515915351722, + "language_loss": 0.86359525, + "learning_rate": 0.00012770578466660915, + "loss": 0.87433445, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.27758789, + "step": 4025, + "time_per_iteration": 2.8906335830688477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074784, + "balance_loss_mlp": 1.04703164, + "epoch": 0.7745286648711043, + "flos": 562453936128.0, + "grad_norm": 0.05700523887714171, + "language_loss": 0.81593072, + "learning_rate": 0.0001274978950315968, + "loss": 0.82667857, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.27807617, + "step": 4026, + "time_per_iteration": 2.8301045894622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077832, + "balance_loss_mlp": 1.05058098, + "epoch": 0.7747210465563679, + "flos": 516651125760.0, + "grad_norm": 0.0689539575699981, + "language_loss": 0.82650018, + "learning_rate": 0.00012729015001472716, + "loss": 0.83727849, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.27258301, + "step": 4027, + "time_per_iteration": 2.6426851749420166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071106, + "balance_loss_mlp": 1.04371142, + "epoch": 0.7749134282416313, + "flos": 633921292800.0, + "grad_norm": 0.05627311162483831, + "language_loss": 0.81452388, + "learning_rate": 0.00012708254969665418, + "loss": 0.82523495, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.27416992, + "step": 4028, + "time_per_iteration": 2.7853105068206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107679, + "balance_loss_mlp": 1.04922891, + "epoch": 0.7751058099268949, + "flos": 495118849536.0, + "grad_norm": 0.06575328123428556, + "language_loss": 0.83176428, + "learning_rate": 0.00012687509415797526, + "loss": 0.84253216, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.27587891, + "step": 4029, + "time_per_iteration": 2.5962271690368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075768, + "balance_loss_mlp": 1.04808736, + "epoch": 0.7752981916121585, + "flos": 510048410112.0, + "grad_norm": 0.0626546531948414, + "language_loss": 0.81091148, + "learning_rate": 0.00012666778347923208, + "loss": 0.82166916, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.27709961, + "step": 4030, + "time_per_iteration": 2.647709369659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078164, + "balance_loss_mlp": 1.04998243, + "epoch": 0.7754905732974221, + "flos": 497295493632.0, + "grad_norm": 0.044509265947171146, + "language_loss": 0.83753759, + "learning_rate": 0.0001264606177409092, + "loss": 0.84831923, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.28198242, + "step": 4031, + "time_per_iteration": 2.6404576301574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071409, + "balance_loss_mlp": 1.04437256, + "epoch": 0.7756829549826857, + "flos": 480486062592.0, + "grad_norm": 0.05920145784509139, + "language_loss": 0.85917544, + "learning_rate": 0.00012625359702343609, + "loss": 0.86988962, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 0.27075195, + "step": 4032, + "time_per_iteration": 2.7071335315704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107336, + "balance_loss_mlp": 1.04641843, + "epoch": 0.7758753366679492, + "flos": 552368056320.0, + "grad_norm": 0.0993215607804505, + "language_loss": 0.84452856, + "learning_rate": 0.00012604672140718504, + "loss": 0.85526216, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.26965332, + "step": 4033, + "time_per_iteration": 2.6153743267059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075256, + "balance_loss_mlp": 1.04738498, + "epoch": 0.7760677183532128, + "flos": 703529127936.0, + "grad_norm": 0.05917686409446163, + "language_loss": 0.77727896, + "learning_rate": 0.00012583999097247233, + "loss": 0.78803158, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.27905273, + "step": 4034, + "time_per_iteration": 2.876141309738159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075852, + "balance_loss_mlp": 1.04836273, + "epoch": 0.7762601000384763, + "flos": 523218935808.0, + "grad_norm": 0.07262055787937163, + "language_loss": 0.80052263, + "learning_rate": 0.0001256334057995578, + "loss": 0.8112812, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.27514648, + "step": 4035, + "time_per_iteration": 2.7490179538726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072491, + "balance_loss_mlp": 1.04526329, + "epoch": 0.7764524817237399, + "flos": 557262609408.0, + "grad_norm": 0.050638434505961206, + "language_loss": 0.8468259, + "learning_rate": 0.000125426965968645, + "loss": 0.8575508, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.27294922, + "step": 4036, + "time_per_iteration": 2.7155818939208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077213, + "balance_loss_mlp": 1.04967546, + "epoch": 0.7766448634090035, + "flos": 579454013952.0, + "grad_norm": 0.06589986489431957, + "language_loss": 0.82292032, + "learning_rate": 0.00012522067155988092, + "loss": 0.83369249, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.27563477, + "step": 4037, + "time_per_iteration": 2.712575912475586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072416, + "balance_loss_mlp": 1.0448314, + "epoch": 0.776837245094267, + "flos": 635300596224.0, + "grad_norm": 0.05822255331252486, + "language_loss": 0.75269878, + "learning_rate": 0.00012501452265335617, + "loss": 0.76342297, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.27612305, + "step": 4038, + "time_per_iteration": 2.8041534423828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_mlp": 1.04810321, + "epoch": 0.7770296267795306, + "flos": 614398334976.0, + "grad_norm": 0.05653078531335044, + "language_loss": 0.82581437, + "learning_rate": 0.0001248085193291047, + "loss": 0.83656931, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.27441406, + "step": 4039, + "time_per_iteration": 2.7838690280914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107999, + "balance_loss_mlp": 1.05230999, + "epoch": 0.7772220084647942, + "flos": 878438407680.0, + "grad_norm": 0.05606519790253506, + "language_loss": 0.82265162, + "learning_rate": 0.00012460266166710443, + "loss": 0.83345151, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.27734375, + "step": 4040, + "time_per_iteration": 3.1491823196411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077311, + "balance_loss_mlp": 1.04989266, + "epoch": 0.7774143901500578, + "flos": 839293567488.0, + "grad_norm": 0.05703190402159479, + "language_loss": 0.77674973, + "learning_rate": 0.00012439694974727633, + "loss": 0.78752279, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.27441406, + "step": 4041, + "time_per_iteration": 3.0976173877716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070951, + "balance_loss_mlp": 1.04358041, + "epoch": 0.7776067718353212, + "flos": 567878169600.0, + "grad_norm": 0.05364031630438029, + "language_loss": 0.80233228, + "learning_rate": 0.00012419138364948458, + "loss": 0.81304181, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.27392578, + "step": 4042, + "time_per_iteration": 2.7326791286468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070758, + "balance_loss_mlp": 1.04345894, + "epoch": 0.7777991535205848, + "flos": 745627603968.0, + "grad_norm": 0.0558907311125614, + "language_loss": 0.82470769, + "learning_rate": 0.00012398596345353702, + "loss": 0.83541524, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.2734375, + "step": 4043, + "time_per_iteration": 2.8787214756011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075612, + "balance_loss_mlp": 1.04824162, + "epoch": 0.7779915352058484, + "flos": 537799288320.0, + "grad_norm": 0.06132046127544376, + "language_loss": 0.83480489, + "learning_rate": 0.0001237806892391851, + "loss": 0.84556091, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.27416992, + "step": 4044, + "time_per_iteration": 2.7494754791259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072105, + "balance_loss_mlp": 1.04540193, + "epoch": 0.778183916891112, + "flos": 634497463296.0, + "grad_norm": 0.05685464217024924, + "language_loss": 0.80689287, + "learning_rate": 0.0001235755610861233, + "loss": 0.81761396, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.26757812, + "step": 4045, + "time_per_iteration": 2.812063694000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107437, + "balance_loss_mlp": 1.04711854, + "epoch": 0.7783762985763756, + "flos": 588400699392.0, + "grad_norm": 0.053935102157053175, + "language_loss": 0.85224533, + "learning_rate": 0.0001233705790739893, + "loss": 0.86298895, + "num_input_tokens_seen": 335640512, + "router_z_loss_mlp": 0.27270508, + "step": 4046, + "time_per_iteration": 2.7485461235046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074247, + "balance_loss_mlp": 1.04697168, + "epoch": 0.7785686802616391, + "flos": 930261970944.0, + "grad_norm": 0.0673066847398555, + "language_loss": 0.74977076, + "learning_rate": 0.0001231657432823643, + "loss": 0.76051325, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.27319336, + "step": 4047, + "time_per_iteration": 3.1984071731567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074697, + "balance_loss_mlp": 1.04661131, + "epoch": 0.7787610619469026, + "flos": 497679607296.0, + "grad_norm": 0.06151594222397662, + "language_loss": 0.78487623, + "learning_rate": 0.0001229610537907725, + "loss": 0.79562324, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.28100586, + "step": 4048, + "time_per_iteration": 2.6014962196350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072933, + "balance_loss_mlp": 1.04379785, + "epoch": 0.7789534436321662, + "flos": 515385303552.0, + "grad_norm": 0.0760421254177005, + "language_loss": 0.90244645, + "learning_rate": 0.00012275651067868143, + "loss": 0.91317576, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.29077148, + "step": 4049, + "time_per_iteration": 2.598532199859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069715, + "balance_loss_mlp": 1.04196286, + "epoch": 0.7791458253174298, + "flos": 988081555968.0, + "grad_norm": 0.05867585212414032, + "language_loss": 0.80266809, + "learning_rate": 0.00012255211402550182, + "loss": 0.81336522, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.27807617, + "step": 4050, + "time_per_iteration": 3.223078727722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070769, + "balance_loss_mlp": 1.04299307, + "epoch": 0.7793382070026933, + "flos": 628756107264.0, + "grad_norm": 0.07400928475776686, + "language_loss": 0.76817232, + "learning_rate": 0.00012234786391058727, + "loss": 0.77888, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.27783203, + "step": 4051, + "time_per_iteration": 4.367919683456421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073785, + "balance_loss_mlp": 1.04565179, + "epoch": 0.7795305886879569, + "flos": 531500700672.0, + "grad_norm": 0.08184044182039507, + "language_loss": 0.84765863, + "learning_rate": 0.0001221437604132352, + "loss": 0.85839653, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.28149414, + "step": 4052, + "time_per_iteration": 2.619694948196411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070872, + "balance_loss_mlp": 1.04369259, + "epoch": 0.7797229703732205, + "flos": 611690600448.0, + "grad_norm": 0.061094221003680546, + "language_loss": 0.81091797, + "learning_rate": 0.0001219398036126852, + "loss": 0.82162666, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.2722168, + "step": 4053, + "time_per_iteration": 2.7424631118774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072046, + "balance_loss_mlp": 1.04391217, + "epoch": 0.7799153520584841, + "flos": 871758526464.0, + "grad_norm": 0.051190100857480304, + "language_loss": 0.77992457, + "learning_rate": 0.00012173599358812027, + "loss": 0.790645, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.28149414, + "step": 4054, + "time_per_iteration": 3.277557849884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070645, + "balance_loss_mlp": 1.04303575, + "epoch": 0.7801077337437476, + "flos": 583348995072.0, + "grad_norm": 0.06092142653213725, + "language_loss": 0.82466495, + "learning_rate": 0.0001215323304186668, + "loss": 0.83537143, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.27587891, + "step": 4055, + "time_per_iteration": 2.7477025985717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074017, + "balance_loss_mlp": 1.0459547, + "epoch": 0.7803001154290111, + "flos": 600887365632.0, + "grad_norm": 0.06830093744875644, + "language_loss": 0.8764962, + "learning_rate": 0.00012132881418339364, + "loss": 0.88723636, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.28076172, + "step": 4056, + "time_per_iteration": 2.7418453693389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009047, + "balance_loss_mlp": 0.99779409, + "epoch": 0.7804924971142747, + "flos": 1478743506432.0, + "grad_norm": 0.016207473772952577, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.7852661, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.11230469, + "step": 4057, + "time_per_iteration": 4.85454535484314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065459, + "balance_loss_mlp": 1.03661036, + "epoch": 0.7806848787995383, + "flos": 630075773952.0, + "grad_norm": 0.062259886670719244, + "language_loss": 0.77044684, + "learning_rate": 0.00012092222283137944, + "loss": 0.78110135, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.28833008, + "step": 4058, + "time_per_iteration": 2.764766216278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008333, + "balance_loss_mlp": 0.99707937, + "epoch": 0.7808772604848019, + "flos": 1416800567808.0, + "grad_norm": 0.01618194632849119, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.79914641, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.11230469, + "step": 4059, + "time_per_iteration": 4.825545310974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069706, + "balance_loss_mlp": 1.0414772, + "epoch": 0.7810696421700654, + "flos": 731345435136.0, + "grad_norm": 0.07523837399490399, + "language_loss": 0.83462268, + "learning_rate": 0.00012051622016348856, + "loss": 0.84531975, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.2824707, + "step": 4060, + "time_per_iteration": 3.045809507369995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068871, + "balance_loss_mlp": 1.04018903, + "epoch": 0.781262023855329, + "flos": 424718055936.0, + "grad_norm": 0.06174241135408443, + "language_loss": 0.84242803, + "learning_rate": 0.00012031343978315539, + "loss": 0.85311675, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.28662109, + "step": 4061, + "time_per_iteration": 2.4845006465911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068535, + "balance_loss_mlp": 1.04099798, + "epoch": 0.7814544055405925, + "flos": 500767073280.0, + "grad_norm": 0.1392477950837379, + "language_loss": 0.82486379, + "learning_rate": 0.00012011080681021774, + "loss": 0.83554912, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.27563477, + "step": 4062, + "time_per_iteration": 2.6524341106414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070849, + "balance_loss_mlp": 1.04295421, + "epoch": 0.7816467872258561, + "flos": 462212960256.0, + "grad_norm": 0.07233679581194719, + "language_loss": 0.86375731, + "learning_rate": 0.00011990832132334512, + "loss": 0.87446582, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.27954102, + "step": 4063, + "time_per_iteration": 2.519162654876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069214, + "balance_loss_mlp": 1.04112792, + "epoch": 0.7818391689111197, + "flos": 740497324032.0, + "grad_norm": 0.07068900898467687, + "language_loss": 0.82369703, + "learning_rate": 0.00011970598340114897, + "loss": 0.83438915, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.28100586, + "step": 4064, + "time_per_iteration": 2.9242045879364014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067385, + "balance_loss_mlp": 1.03875041, + "epoch": 0.7820315505963832, + "flos": 547386163200.0, + "grad_norm": 0.07366274029850052, + "language_loss": 0.83860916, + "learning_rate": 0.00011950379312218396, + "loss": 0.84928298, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.28637695, + "step": 4065, + "time_per_iteration": 2.7022647857666016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070383, + "balance_loss_mlp": 1.04191554, + "epoch": 0.7822239322816468, + "flos": 728665403904.0, + "grad_norm": 0.07812712198170087, + "language_loss": 0.86016601, + "learning_rate": 0.00011930175056494719, + "loss": 0.87086987, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.28466797, + "step": 4066, + "time_per_iteration": 2.885648488998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071634, + "balance_loss_mlp": 1.04276156, + "epoch": 0.7824163139669104, + "flos": 451774900224.0, + "grad_norm": 0.0475815127648597, + "language_loss": 0.75548607, + "learning_rate": 0.00011909985580787885, + "loss": 0.76620239, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.28881836, + "step": 4067, + "time_per_iteration": 2.717013120651245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066242, + "balance_loss_mlp": 1.0379895, + "epoch": 0.782608695652174, + "flos": 540207277056.0, + "grad_norm": 0.05385008636564137, + "language_loss": 0.80856502, + "learning_rate": 0.00011889810892936137, + "loss": 0.8192274, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.28295898, + "step": 4068, + "time_per_iteration": 2.7350502014160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069144, + "balance_loss_mlp": 1.04105842, + "epoch": 0.7828010773374374, + "flos": 500029369344.0, + "grad_norm": 0.0661010913051719, + "language_loss": 0.77266741, + "learning_rate": 0.00011869651000771959, + "loss": 0.78335881, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.28100586, + "step": 4069, + "time_per_iteration": 2.8502442836761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065751, + "balance_loss_mlp": 1.03747416, + "epoch": 0.782993459022701, + "flos": 600542539776.0, + "grad_norm": 0.06957531868653906, + "language_loss": 0.82841384, + "learning_rate": 0.00011849505912122117, + "loss": 0.83907133, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.28271484, + "step": 4070, + "time_per_iteration": 2.7242653369903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069163, + "balance_loss_mlp": 1.0401957, + "epoch": 0.7831858407079646, + "flos": 809702106624.0, + "grad_norm": 0.061542243963481506, + "language_loss": 0.77626544, + "learning_rate": 0.00011829375634807654, + "loss": 0.78695703, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.28955078, + "step": 4071, + "time_per_iteration": 3.18316650390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067908, + "balance_loss_mlp": 1.03920245, + "epoch": 0.7833782223932282, + "flos": 806240701440.0, + "grad_norm": 0.06527363578820362, + "language_loss": 0.8108483, + "learning_rate": 0.00011809260176643821, + "loss": 0.82152736, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.28662109, + "step": 4072, + "time_per_iteration": 3.0564231872558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071131, + "balance_loss_mlp": 1.04318857, + "epoch": 0.7835706040784918, + "flos": 520614508032.0, + "grad_norm": 0.0688544484419534, + "language_loss": 0.83763361, + "learning_rate": 0.00011789159545440131, + "loss": 0.84834492, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.27978516, + "step": 4073, + "time_per_iteration": 2.6478123664855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070096, + "balance_loss_mlp": 1.04208159, + "epoch": 0.7837629857637552, + "flos": 505322592768.0, + "grad_norm": 0.05456504974378336, + "language_loss": 0.82081753, + "learning_rate": 0.00011769073749000348, + "loss": 0.83151847, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.2800293, + "step": 4074, + "time_per_iteration": 2.7911314964294434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069906, + "balance_loss_mlp": 1.041749, + "epoch": 0.7839553674490188, + "flos": 515872723968.0, + "grad_norm": 0.07358433801147621, + "language_loss": 0.76115894, + "learning_rate": 0.0001174900279512246, + "loss": 0.77185798, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.28149414, + "step": 4075, + "time_per_iteration": 2.593980312347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070027, + "balance_loss_mlp": 1.04110718, + "epoch": 0.7841477491342824, + "flos": 506399330304.0, + "grad_norm": 0.055342987139179775, + "language_loss": 0.81843507, + "learning_rate": 0.00011728946691598707, + "loss": 0.82913536, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.2890625, + "step": 4076, + "time_per_iteration": 2.6213133335113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067587, + "balance_loss_mlp": 1.03902483, + "epoch": 0.784340130819546, + "flos": 719320048128.0, + "grad_norm": 0.06016705026128457, + "language_loss": 0.76231396, + "learning_rate": 0.00011708905446215561, + "loss": 0.77298987, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.28540039, + "step": 4077, + "time_per_iteration": 2.89338755607605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069715, + "balance_loss_mlp": 1.04110491, + "epoch": 0.7845325125048095, + "flos": 514174735872.0, + "grad_norm": 0.052498050136505506, + "language_loss": 0.80255234, + "learning_rate": 0.00011688879066753711, + "loss": 0.81324947, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.28564453, + "step": 4078, + "time_per_iteration": 2.691178560256958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067519, + "balance_loss_mlp": 1.04007649, + "epoch": 0.7847248941900731, + "flos": 465866422272.0, + "grad_norm": 0.06922222458803326, + "language_loss": 0.87530267, + "learning_rate": 0.00011668867560988122, + "loss": 0.88597786, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.2746582, + "step": 4079, + "time_per_iteration": 2.5730109214782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067247, + "balance_loss_mlp": 1.03870857, + "epoch": 0.7849172758753367, + "flos": 502766217216.0, + "grad_norm": 0.07036419305284744, + "language_loss": 0.84369481, + "learning_rate": 0.00011648870936687916, + "loss": 0.85436726, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.28540039, + "step": 4080, + "time_per_iteration": 2.763648271560669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069029, + "balance_loss_mlp": 1.04053807, + "epoch": 0.7851096575606002, + "flos": 531742219776.0, + "grad_norm": 0.07246870648451295, + "language_loss": 0.78439957, + "learning_rate": 0.00011628889201616461, + "loss": 0.79508984, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.28515625, + "step": 4081, + "time_per_iteration": 2.6238608360290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070508, + "balance_loss_mlp": 1.04208827, + "epoch": 0.7853020392458638, + "flos": 569685256704.0, + "grad_norm": 0.05558757362509338, + "language_loss": 0.81841099, + "learning_rate": 0.00011608922363531393, + "loss": 0.82911611, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.28417969, + "step": 4082, + "time_per_iteration": 2.6667022705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074509, + "balance_loss_mlp": 1.04639971, + "epoch": 0.7854944209311273, + "flos": 832228162560.0, + "grad_norm": 0.07344619623899691, + "language_loss": 0.83384395, + "learning_rate": 0.00011588970430184504, + "loss": 0.84458899, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.28100586, + "step": 4083, + "time_per_iteration": 3.0444436073303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069927, + "balance_loss_mlp": 1.04212761, + "epoch": 0.7856868026163909, + "flos": 559660423680.0, + "grad_norm": 0.045313213286836455, + "language_loss": 0.81620705, + "learning_rate": 0.00011569033409321822, + "loss": 0.82690632, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.27807617, + "step": 4084, + "time_per_iteration": 2.7107021808624268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074024, + "balance_loss_mlp": 1.04605722, + "epoch": 0.7858791843016545, + "flos": 544972382208.0, + "grad_norm": 0.06179602249028764, + "language_loss": 0.73075098, + "learning_rate": 0.00011549111308683591, + "loss": 0.7414912, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.2800293, + "step": 4085, + "time_per_iteration": 2.674802780151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.04991984, + "epoch": 0.7860715659869181, + "flos": 380787761664.0, + "grad_norm": 0.06384285931580107, + "language_loss": 0.80674589, + "learning_rate": 0.00011529204136004251, + "loss": 0.8175236, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.27905273, + "step": 4086, + "time_per_iteration": 2.485450029373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073353, + "balance_loss_mlp": 1.04600596, + "epoch": 0.7862639476721817, + "flos": 567173961216.0, + "grad_norm": 0.056474664391545235, + "language_loss": 0.84569514, + "learning_rate": 0.00011509311899012459, + "loss": 0.85642868, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.27392578, + "step": 4087, + "time_per_iteration": 2.6641156673431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072601, + "balance_loss_mlp": 1.04475415, + "epoch": 0.7864563293574451, + "flos": 544968000000.0, + "grad_norm": 0.09344860836240211, + "language_loss": 0.78010523, + "learning_rate": 0.00011489434605431053, + "loss": 0.79083121, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.27880859, + "step": 4088, + "time_per_iteration": 2.646610736846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071704, + "balance_loss_mlp": 1.04390407, + "epoch": 0.7866487110427087, + "flos": 563260041216.0, + "grad_norm": 0.06168893422677419, + "language_loss": 0.81236577, + "learning_rate": 0.0001146957226297708, + "loss": 0.8230828, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.27807617, + "step": 4089, + "time_per_iteration": 2.7216711044311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106999, + "balance_loss_mlp": 1.04147482, + "epoch": 0.7868410927279723, + "flos": 727849124352.0, + "grad_norm": 0.05015677705021027, + "language_loss": 0.76367462, + "learning_rate": 0.00011449724879361827, + "loss": 0.77437449, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.28515625, + "step": 4090, + "time_per_iteration": 2.9962027072906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070207, + "balance_loss_mlp": 1.04212117, + "epoch": 0.7870334744132359, + "flos": 521082989568.0, + "grad_norm": 0.07758144969638558, + "language_loss": 0.73733866, + "learning_rate": 0.00011429892462290687, + "loss": 0.74804068, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.28100586, + "step": 4091, + "time_per_iteration": 2.7208704948425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071413, + "balance_loss_mlp": 1.04413819, + "epoch": 0.7872258560984994, + "flos": 451173998592.0, + "grad_norm": 0.05584477685741542, + "language_loss": 0.83089757, + "learning_rate": 0.00011410075019463295, + "loss": 0.84161168, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.27319336, + "step": 4092, + "time_per_iteration": 2.608442544937134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070388, + "balance_loss_mlp": 1.04168272, + "epoch": 0.787418237783763, + "flos": 514932788736.0, + "grad_norm": 0.05394381148222231, + "language_loss": 0.79899406, + "learning_rate": 0.00011390272558573461, + "loss": 0.80969799, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.28710938, + "step": 4093, + "time_per_iteration": 2.6670477390289307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070092, + "balance_loss_mlp": 1.04183984, + "epoch": 0.7876106194690266, + "flos": 484837940736.0, + "grad_norm": 0.04973668631858953, + "language_loss": 0.79517233, + "learning_rate": 0.00011370485087309202, + "loss": 0.80587327, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.2824707, + "step": 4094, + "time_per_iteration": 2.651747703552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107091, + "balance_loss_mlp": 1.04229987, + "epoch": 0.7878030011542901, + "flos": 542570185728.0, + "grad_norm": 0.05872791575225344, + "language_loss": 0.78693342, + "learning_rate": 0.00011350712613352688, + "loss": 0.79764247, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.28613281, + "step": 4095, + "time_per_iteration": 2.6549277305603027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069142, + "balance_loss_mlp": 1.04072237, + "epoch": 0.7879953828395537, + "flos": 516488182272.0, + "grad_norm": 0.07961293490995022, + "language_loss": 0.79440165, + "learning_rate": 0.00011330955144380283, + "loss": 0.80509305, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.28417969, + "step": 4096, + "time_per_iteration": 2.6206085681915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070121, + "balance_loss_mlp": 1.04217863, + "epoch": 0.7881877645248172, + "flos": 582004597248.0, + "grad_norm": 0.06633225025055933, + "language_loss": 0.86351848, + "learning_rate": 0.00011311212688062483, + "loss": 0.87421972, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.27929688, + "step": 4097, + "time_per_iteration": 2.781184673309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069633, + "balance_loss_mlp": 1.0408082, + "epoch": 0.7883801462100808, + "flos": 588883737600.0, + "grad_norm": 0.07192838384326647, + "language_loss": 0.77839339, + "learning_rate": 0.0001129148525206402, + "loss": 0.78908968, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.28808594, + "step": 4098, + "time_per_iteration": 2.8173389434814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067449, + "balance_loss_mlp": 1.03931606, + "epoch": 0.7885725278953444, + "flos": 481475460096.0, + "grad_norm": 0.11237603320949716, + "language_loss": 0.86339819, + "learning_rate": 0.00011271772844043759, + "loss": 0.87407273, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.28125, + "step": 4099, + "time_per_iteration": 2.7524821758270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069791, + "balance_loss_mlp": 1.04127622, + "epoch": 0.788764909580608, + "flos": 756470126592.0, + "grad_norm": 0.06946640589316219, + "language_loss": 0.75986981, + "learning_rate": 0.00011252075471654727, + "loss": 0.77056766, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.28515625, + "step": 4100, + "time_per_iteration": 2.947204351425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071355, + "balance_loss_mlp": 1.04262543, + "epoch": 0.7889572912658714, + "flos": 702225427968.0, + "grad_norm": 0.05611482280761958, + "language_loss": 0.7798807, + "learning_rate": 0.00011232393142544133, + "loss": 0.79059422, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.28710938, + "step": 4101, + "time_per_iteration": 2.95438551902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068821, + "balance_loss_mlp": 1.04037809, + "epoch": 0.789149672951135, + "flos": 736047931392.0, + "grad_norm": 0.06028554523946094, + "language_loss": 0.83136284, + "learning_rate": 0.00011212725864353323, + "loss": 0.84205109, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.28417969, + "step": 4102, + "time_per_iteration": 3.067315101623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015622, + "balance_loss_mlp": 1.00370073, + "epoch": 0.7893420546363986, + "flos": 1480626349056.0, + "grad_norm": 0.009770361918426226, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.77351552, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.11914062, + "step": 4103, + "time_per_iteration": 4.903147220611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069345, + "balance_loss_mlp": 1.04016232, + "epoch": 0.7895344363216622, + "flos": 508821875712.0, + "grad_norm": 0.06690395183564687, + "language_loss": 0.75603718, + "learning_rate": 0.00011173436491267291, + "loss": 0.76673061, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.29150391, + "step": 4104, + "time_per_iteration": 2.607632637023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064374, + "balance_loss_mlp": 1.0360018, + "epoch": 0.7897268180069258, + "flos": 541727764992.0, + "grad_norm": 0.055969758992029287, + "language_loss": 0.81935525, + "learning_rate": 0.0001115381441162554, + "loss": 0.82999897, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.28393555, + "step": 4105, + "time_per_iteration": 2.6217761039733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014508, + "balance_loss_mlp": 1.00268257, + "epoch": 0.7899191996921893, + "flos": 1411924953600.0, + "grad_norm": 0.0095479570502747, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.74598229, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.11816406, + "step": 4106, + "time_per_iteration": 4.9060986042022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063636, + "balance_loss_mlp": 1.03524053, + "epoch": 0.7901115813774529, + "flos": 622547679744.0, + "grad_norm": 0.04917500811755106, + "language_loss": 0.84986818, + "learning_rate": 0.00011114615504234465, + "loss": 0.86050451, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.28393555, + "step": 4107, + "time_per_iteration": 2.760727882385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068402, + "balance_loss_mlp": 1.03931451, + "epoch": 0.7903039630627164, + "flos": 645232296960.0, + "grad_norm": 0.062643238447281, + "language_loss": 0.81024301, + "learning_rate": 0.00011095038691703468, + "loss": 0.82092702, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.29077148, + "step": 4108, + "time_per_iteration": 2.8416430950164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065528, + "balance_loss_mlp": 1.03758597, + "epoch": 0.79049634474798, + "flos": 594054715392.0, + "grad_norm": 0.059690498019966905, + "language_loss": 0.824301, + "learning_rate": 0.00011075476983417998, + "loss": 0.83495629, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.27978516, + "step": 4109, + "time_per_iteration": 2.879164695739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106677, + "balance_loss_mlp": 1.03742075, + "epoch": 0.7906887264332435, + "flos": 715784449536.0, + "grad_norm": 0.06625307097230863, + "language_loss": 0.77845091, + "learning_rate": 0.00011055930386972579, + "loss": 0.78911859, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.29272461, + "step": 4110, + "time_per_iteration": 2.8940486907958984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010668, + "balance_loss_mlp": 1.03761721, + "epoch": 0.7908811081185071, + "flos": 789553516032.0, + "grad_norm": 0.05640022184839657, + "language_loss": 0.78389466, + "learning_rate": 0.00011036398909955863, + "loss": 0.79456264, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 0.29150391, + "step": 4111, + "time_per_iteration": 2.9704418182373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066133, + "balance_loss_mlp": 1.03795147, + "epoch": 0.7910734898037707, + "flos": 641612330496.0, + "grad_norm": 0.05533152430131226, + "language_loss": 0.81315625, + "learning_rate": 0.00011016882559950648, + "loss": 0.82381761, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.28173828, + "step": 4112, + "time_per_iteration": 2.8546900749206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064394, + "balance_loss_mlp": 1.03561699, + "epoch": 0.7912658714890343, + "flos": 669057670656.0, + "grad_norm": 0.06990273723133285, + "language_loss": 0.80328232, + "learning_rate": 0.00010997381344533853, + "loss": 0.81392628, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.28759766, + "step": 4113, + "time_per_iteration": 2.7969515323638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069119, + "balance_loss_mlp": 1.04031801, + "epoch": 0.7914582531742979, + "flos": 557504128512.0, + "grad_norm": 0.061948681643476444, + "language_loss": 0.80212009, + "learning_rate": 0.00010977895271276517, + "loss": 0.81281132, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.28808594, + "step": 4114, + "time_per_iteration": 2.7396297454833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064232, + "balance_loss_mlp": 1.03552604, + "epoch": 0.7916506348595613, + "flos": 569784181248.0, + "grad_norm": 0.06188955891536592, + "language_loss": 0.80402255, + "learning_rate": 0.00010958424347743807, + "loss": 0.8146649, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.28710938, + "step": 4115, + "time_per_iteration": 2.7420108318328857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071293, + "balance_loss_mlp": 1.04337442, + "epoch": 0.7918430165448249, + "flos": 717966885888.0, + "grad_norm": 0.07461075198544243, + "language_loss": 0.80391407, + "learning_rate": 0.00010938968581494991, + "loss": 0.81462699, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.27929688, + "step": 4116, + "time_per_iteration": 2.941556692123413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072505, + "balance_loss_mlp": 1.04418087, + "epoch": 0.7920353982300885, + "flos": 553377802752.0, + "grad_norm": 0.12071106309265658, + "language_loss": 0.78737396, + "learning_rate": 0.000109195279800835, + "loss": 0.79809904, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.28344727, + "step": 4117, + "time_per_iteration": 2.7312655448913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067391, + "balance_loss_mlp": 1.03901899, + "epoch": 0.7922277799153521, + "flos": 809766125568.0, + "grad_norm": 0.06211546650741466, + "language_loss": 0.76734632, + "learning_rate": 0.00010900102551056834, + "loss": 0.77802026, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.28344727, + "step": 4118, + "time_per_iteration": 3.061748504638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073876, + "balance_loss_mlp": 1.04590917, + "epoch": 0.7924201616006156, + "flos": 421128612864.0, + "grad_norm": 0.05658815463494319, + "language_loss": 0.84763014, + "learning_rate": 0.00010880692301956601, + "loss": 0.85836887, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.27978516, + "step": 4119, + "time_per_iteration": 2.504396677017212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070809, + "balance_loss_mlp": 1.04241323, + "epoch": 0.7926125432858792, + "flos": 617541055488.0, + "grad_norm": 0.052435339334051444, + "language_loss": 0.85989153, + "learning_rate": 0.00010861297240318518, + "loss": 0.87059963, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.28393555, + "step": 4120, + "time_per_iteration": 2.851905584335327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107072, + "balance_loss_mlp": 1.04296827, + "epoch": 0.7928049249711427, + "flos": 602207032320.0, + "grad_norm": 0.06531293240023527, + "language_loss": 0.86884111, + "learning_rate": 0.00010841917373672444, + "loss": 0.87954831, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.27783203, + "step": 4121, + "time_per_iteration": 2.72057843208313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074221, + "balance_loss_mlp": 1.04561055, + "epoch": 0.7929973066564063, + "flos": 655724201472.0, + "grad_norm": 0.0659209843425975, + "language_loss": 0.78515911, + "learning_rate": 0.00010822552709542293, + "loss": 0.7959013, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.28588867, + "step": 4122, + "time_per_iteration": 2.8345208168029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068379, + "balance_loss_mlp": 1.04067445, + "epoch": 0.7931896883416699, + "flos": 536139177984.0, + "grad_norm": 0.053977644004353675, + "language_loss": 0.86079139, + "learning_rate": 0.0001080320325544612, + "loss": 0.87147516, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.27734375, + "step": 4123, + "time_per_iteration": 2.734748601913452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073545, + "balance_loss_mlp": 1.04591262, + "epoch": 0.7933820700269334, + "flos": 497836758528.0, + "grad_norm": 0.05342076952837262, + "language_loss": 0.82945108, + "learning_rate": 0.00010783869018895997, + "loss": 0.84018654, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.27661133, + "step": 4124, + "time_per_iteration": 2.5848159790039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071922, + "balance_loss_mlp": 1.04438472, + "epoch": 0.793574451712197, + "flos": 537217325568.0, + "grad_norm": 0.05760976665940277, + "language_loss": 0.84397703, + "learning_rate": 0.00010764550007398189, + "loss": 0.85469627, + "num_input_tokens_seen": 342040496, + "router_z_loss_mlp": 0.27563477, + "step": 4125, + "time_per_iteration": 2.613123655319214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076104, + "balance_loss_mlp": 1.04797053, + "epoch": 0.7937668333974606, + "flos": 488043270144.0, + "grad_norm": 0.05267738869669298, + "language_loss": 0.81016707, + "learning_rate": 0.00010745246228452982, + "loss": 0.82092816, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.28173828, + "step": 4126, + "time_per_iteration": 2.5770304203033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072331, + "balance_loss_mlp": 1.04460263, + "epoch": 0.7939592150827242, + "flos": 527163379200.0, + "grad_norm": 0.053184738741740976, + "language_loss": 0.8170619, + "learning_rate": 0.00010725957689554771, + "loss": 0.82778513, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.27734375, + "step": 4127, + "time_per_iteration": 2.774044990539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073736, + "balance_loss_mlp": 1.04579329, + "epoch": 0.7941515967679876, + "flos": 541428019200.0, + "grad_norm": 0.047011204892956564, + "language_loss": 0.84647489, + "learning_rate": 0.00010706684398192013, + "loss": 0.85721219, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.27978516, + "step": 4128, + "time_per_iteration": 2.74668025970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070127, + "balance_loss_mlp": 1.0423516, + "epoch": 0.7943439784532512, + "flos": 518104622592.0, + "grad_norm": 0.061789852182866596, + "language_loss": 0.82038182, + "learning_rate": 0.00010687426361847313, + "loss": 0.83108312, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.27807617, + "step": 4129, + "time_per_iteration": 2.7684710025787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075571, + "balance_loss_mlp": 1.04777122, + "epoch": 0.7945363601385148, + "flos": 508768031232.0, + "grad_norm": 0.056918102150188964, + "language_loss": 0.85627353, + "learning_rate": 0.00010668183587997254, + "loss": 0.86702919, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.27807617, + "step": 4130, + "time_per_iteration": 2.6196768283843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071124, + "balance_loss_mlp": 1.04289508, + "epoch": 0.7947287418237784, + "flos": 650918398464.0, + "grad_norm": 0.052989144266830976, + "language_loss": 0.77423567, + "learning_rate": 0.0001064895608411256, + "loss": 0.78494692, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.28222656, + "step": 4131, + "time_per_iteration": 2.822084903717041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070721, + "balance_loss_mlp": 1.04275465, + "epoch": 0.794921123509042, + "flos": 695726019072.0, + "grad_norm": 0.05398038812171178, + "language_loss": 0.80283594, + "learning_rate": 0.00010629743857657998, + "loss": 0.81354314, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.27954102, + "step": 4132, + "time_per_iteration": 2.9548959732055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018993, + "balance_loss_mlp": 1.00807393, + "epoch": 0.7951135051943055, + "flos": 1402161988608.0, + "grad_norm": 0.012201686903541073, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.71617663, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.109375, + "step": 4133, + "time_per_iteration": 4.596825122833252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077524, + "balance_loss_mlp": 1.04950953, + "epoch": 0.795305886879569, + "flos": 809745776640.0, + "grad_norm": 0.1291273106507343, + "language_loss": 0.82121062, + "learning_rate": 0.00010591365266868802, + "loss": 0.83198583, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.28027344, + "step": 4134, + "time_per_iteration": 2.997457981109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019784, + "balance_loss_mlp": 1.00886476, + "epoch": 0.7954982685648326, + "flos": 1425205988352.0, + "grad_norm": 0.01121858900173578, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76531565, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.109375, + "step": 4135, + "time_per_iteration": 4.933257818222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070322, + "balance_loss_mlp": 1.0421412, + "epoch": 0.7956906502500962, + "flos": 389670428160.0, + "grad_norm": 0.07786925051397248, + "language_loss": 0.78780544, + "learning_rate": 0.00010553047875229166, + "loss": 0.7985087, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.28198242, + "step": 4136, + "time_per_iteration": 2.5145680904388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072473, + "balance_loss_mlp": 1.04522216, + "epoch": 0.7958830319353598, + "flos": 515321284608.0, + "grad_norm": 0.08712242528713769, + "language_loss": 0.83510804, + "learning_rate": 0.00010533912147689328, + "loss": 0.84583282, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.27270508, + "step": 4137, + "time_per_iteration": 2.6298136711120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076228, + "balance_loss_mlp": 1.04814243, + "epoch": 0.7960754136206233, + "flos": 493695876096.0, + "grad_norm": 0.06714788693393858, + "language_loss": 0.82280171, + "learning_rate": 0.00010514791742243656, + "loss": 0.83356392, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.28100586, + "step": 4138, + "time_per_iteration": 2.5997424125671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073026, + "balance_loss_mlp": 1.04553676, + "epoch": 0.7962677953058869, + "flos": 655409899008.0, + "grad_norm": 0.06696972519058896, + "language_loss": 0.82444674, + "learning_rate": 0.00010495686666315341, + "loss": 0.83517706, + "num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.27514648, + "step": 4139, + "time_per_iteration": 2.8953542709350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074691, + "balance_loss_mlp": 1.04662871, + "epoch": 0.7964601769911505, + "flos": 542126435328.0, + "grad_norm": 0.07236671578874358, + "language_loss": 0.77130395, + "learning_rate": 0.00010476596927321635, + "loss": 0.78205085, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.28076172, + "step": 4140, + "time_per_iteration": 2.6313490867614746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107407, + "balance_loss_mlp": 1.04591274, + "epoch": 0.796652558676414, + "flos": 537356947968.0, + "grad_norm": 0.07734927138109192, + "language_loss": 0.80230534, + "learning_rate": 0.00010457522532673835, + "loss": 0.81304598, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.28173828, + "step": 4141, + "time_per_iteration": 2.8211119174957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073839, + "balance_loss_mlp": 1.0459199, + "epoch": 0.7968449403616775, + "flos": 474852395520.0, + "grad_norm": 0.05569229872202348, + "language_loss": 0.83232534, + "learning_rate": 0.00010438463489777272, + "loss": 0.84306371, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.27954102, + "step": 4142, + "time_per_iteration": 2.6115970611572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074531, + "balance_loss_mlp": 1.04665971, + "epoch": 0.7970373220469411, + "flos": 567336904704.0, + "grad_norm": 0.06331690376736109, + "language_loss": 0.77703011, + "learning_rate": 0.00010419419806031316, + "loss": 0.7877754, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.27880859, + "step": 4143, + "time_per_iteration": 2.7046220302581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074634, + "balance_loss_mlp": 1.04664397, + "epoch": 0.7972297037322047, + "flos": 555924003840.0, + "grad_norm": 0.04909390704775502, + "language_loss": 0.83792174, + "learning_rate": 0.00010400391488829403, + "loss": 0.8486681, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.2800293, + "step": 4144, + "time_per_iteration": 2.790830612182617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076091, + "balance_loss_mlp": 1.04788637, + "epoch": 0.7974220854174683, + "flos": 575899476480.0, + "grad_norm": 0.05483263194538034, + "language_loss": 0.86199546, + "learning_rate": 0.00010381378545558984, + "loss": 0.87275642, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.2824707, + "step": 4145, + "time_per_iteration": 2.7284913063049316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069763, + "balance_loss_mlp": 1.04203475, + "epoch": 0.7976144671027319, + "flos": 482824240128.0, + "grad_norm": 0.05322555202635646, + "language_loss": 0.84398592, + "learning_rate": 0.00010362380983601505, + "loss": 0.85468352, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.27758789, + "step": 4146, + "time_per_iteration": 2.546143054962158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067282, + "balance_loss_mlp": 1.03938699, + "epoch": 0.7978068487879953, + "flos": 1077420372480.0, + "grad_norm": 0.05187096482218071, + "language_loss": 0.78898019, + "learning_rate": 0.00010343398810332477, + "loss": 0.79965299, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.27905273, + "step": 4147, + "time_per_iteration": 3.4553234577178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072596, + "balance_loss_mlp": 1.04465318, + "epoch": 0.7979992304732589, + "flos": 733421744640.0, + "grad_norm": 0.0650162065800976, + "language_loss": 0.84200764, + "learning_rate": 0.00010324432033121467, + "loss": 0.85273361, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.2800293, + "step": 4148, + "time_per_iteration": 2.9164648056030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070872, + "balance_loss_mlp": 1.04207134, + "epoch": 0.7981916121585225, + "flos": 415531261440.0, + "grad_norm": 0.06518493190513895, + "language_loss": 0.83341253, + "learning_rate": 0.00010305480659332005, + "loss": 0.84412122, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.28808594, + "step": 4149, + "time_per_iteration": 2.6469006538391113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071138, + "balance_loss_mlp": 1.04290879, + "epoch": 0.7983839938437861, + "flos": 465019619328.0, + "grad_norm": 0.06242001263980543, + "language_loss": 0.83330691, + "learning_rate": 0.00010286544696321682, + "loss": 0.84401828, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.28222656, + "step": 4150, + "time_per_iteration": 2.5429742336273193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073902, + "balance_loss_mlp": 1.04543519, + "epoch": 0.7985763755290496, + "flos": 510304485888.0, + "grad_norm": 0.06754113423442079, + "language_loss": 0.79446447, + "learning_rate": 0.00010267624151442073, + "loss": 0.80520344, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.28417969, + "step": 4151, + "time_per_iteration": 2.6111056804656982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107675, + "balance_loss_mlp": 1.04852068, + "epoch": 0.7987687572143132, + "flos": 1010243847168.0, + "grad_norm": 0.0631421524171095, + "language_loss": 0.80901897, + "learning_rate": 0.000102487190320388, + "loss": 0.81978643, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.2824707, + "step": 4152, + "time_per_iteration": 3.323118209838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068338, + "balance_loss_mlp": 1.04015708, + "epoch": 0.7989611388995768, + "flos": 1020662968320.0, + "grad_norm": 0.0589010586848655, + "language_loss": 0.79593813, + "learning_rate": 0.00010229829345451475, + "loss": 0.80662155, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.28198242, + "step": 4153, + "time_per_iteration": 3.364107370376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071047, + "balance_loss_mlp": 1.04329467, + "epoch": 0.7991535205848403, + "flos": 1100915476992.0, + "grad_norm": 0.06516359919102382, + "language_loss": 0.79660934, + "learning_rate": 0.00010210955099013724, + "loss": 0.80731982, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.27758789, + "step": 4154, + "time_per_iteration": 3.413896322250366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070605, + "balance_loss_mlp": 1.04247141, + "epoch": 0.7993459022701039, + "flos": 834454268928.0, + "grad_norm": 0.06322395894070157, + "language_loss": 0.76450896, + "learning_rate": 0.00010192096300053167, + "loss": 0.77521503, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.28149414, + "step": 4155, + "time_per_iteration": 3.1282687187194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069737, + "balance_loss_mlp": 1.04179418, + "epoch": 0.7995382839553674, + "flos": 522417212928.0, + "grad_norm": 0.4084707213419165, + "language_loss": 0.8520155, + "learning_rate": 0.00010173252955891477, + "loss": 0.8627128, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.27929688, + "step": 4156, + "time_per_iteration": 2.78415584564209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074464, + "balance_loss_mlp": 1.04685545, + "epoch": 0.799730665640631, + "flos": 537562151424.0, + "grad_norm": 0.06643949206963136, + "language_loss": 0.72880185, + "learning_rate": 0.00010154425073844253, + "loss": 0.73954648, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.27612305, + "step": 4157, + "time_per_iteration": 2.73618221282959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068843, + "balance_loss_mlp": 1.04032815, + "epoch": 0.7999230473258946, + "flos": 504809031168.0, + "grad_norm": 0.05290023006148714, + "language_loss": 0.82135558, + "learning_rate": 0.00010135612661221138, + "loss": 0.83204401, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.28515625, + "step": 4158, + "time_per_iteration": 2.554800510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068433, + "balance_loss_mlp": 1.04008496, + "epoch": 0.8001154290111582, + "flos": 1026935414784.0, + "grad_norm": 0.060322834717302515, + "language_loss": 0.81768221, + "learning_rate": 0.00010116815725325751, + "loss": 0.82836652, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.28344727, + "step": 4159, + "time_per_iteration": 3.2874691486358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077912, + "balance_loss_mlp": 1.04949212, + "epoch": 0.8003078106964217, + "flos": 750567237120.0, + "grad_norm": 0.0534649619029418, + "language_loss": 0.80202901, + "learning_rate": 0.00010098034273455725, + "loss": 0.8128081, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.28417969, + "step": 4160, + "time_per_iteration": 2.9733405113220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071823, + "balance_loss_mlp": 1.04402316, + "epoch": 0.8005001923816852, + "flos": 488201831424.0, + "grad_norm": 0.059729691811872904, + "language_loss": 0.79879338, + "learning_rate": 0.00010079268312902662, + "loss": 0.80951154, + "num_input_tokens_seen": 345015392, + "router_z_loss_mlp": 0.27832031, + "step": 4161, + "time_per_iteration": 2.668815851211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075695, + "balance_loss_mlp": 1.04875386, + "epoch": 0.8006925740669488, + "flos": 512983107072.0, + "grad_norm": 0.06045129484574589, + "language_loss": 0.81970763, + "learning_rate": 0.0001006051785095215, + "loss": 0.8304646, + "num_input_tokens_seen": 345086640, + "router_z_loss_mlp": 0.26977539, + "step": 4162, + "time_per_iteration": 2.653640031814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106761, + "balance_loss_mlp": 1.03988147, + "epoch": 0.8008849557522124, + "flos": 578243446272.0, + "grad_norm": 0.0602464092340954, + "language_loss": 0.79306024, + "learning_rate": 0.0001004178289488376, + "loss": 0.80373633, + "num_input_tokens_seen": 345159616, + "router_z_loss_mlp": 0.27783203, + "step": 4163, + "time_per_iteration": 2.732161283493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069798, + "balance_loss_mlp": 1.04183149, + "epoch": 0.801077337437476, + "flos": 478466569728.0, + "grad_norm": 0.05584875383121944, + "language_loss": 0.83879602, + "learning_rate": 0.0001002306345197106, + "loss": 0.84949404, + "num_input_tokens_seen": 345225536, + "router_z_loss_mlp": 0.2800293, + "step": 4164, + "time_per_iteration": 2.541621685028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072063, + "balance_loss_mlp": 1.04419172, + "epoch": 0.8012697191227395, + "flos": 676384943616.0, + "grad_norm": 0.06393234311197828, + "language_loss": 0.79935479, + "learning_rate": 0.00010004359529481571, + "loss": 0.8100754, + "num_input_tokens_seen": 345302960, + "router_z_loss_mlp": 0.27880859, + "step": 4165, + "time_per_iteration": 2.879521369934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076388, + "balance_loss_mlp": 1.04808736, + "epoch": 0.8014621008080031, + "flos": 1294624567296.0, + "grad_norm": 0.05702716092084167, + "language_loss": 0.82164598, + "learning_rate": 9.985671134676804e-05, + "loss": 0.83240986, + "num_input_tokens_seen": 345397792, + "router_z_loss_mlp": 0.28320312, + "step": 4166, + "time_per_iteration": 3.7128407955169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074533, + "balance_loss_mlp": 1.04687643, + "epoch": 0.8016544824932667, + "flos": 511579072512.0, + "grad_norm": 0.07676481935953286, + "language_loss": 0.82921106, + "learning_rate": 9.966998274812234e-05, + "loss": 0.8399564, + "num_input_tokens_seen": 345465440, + "router_z_loss_mlp": 0.27685547, + "step": 4167, + "time_per_iteration": 2.6149368286132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074371, + "balance_loss_mlp": 1.04676175, + "epoch": 0.8018468641785302, + "flos": 535434969600.0, + "grad_norm": 0.07175891193928671, + "language_loss": 0.8114351, + "learning_rate": 9.948340957137308e-05, + "loss": 0.82217884, + "num_input_tokens_seen": 345533072, + "router_z_loss_mlp": 0.27636719, + "step": 4168, + "time_per_iteration": 2.6559274196624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079124, + "balance_loss_mlp": 1.05177772, + "epoch": 0.8020392458637937, + "flos": 1023025876992.0, + "grad_norm": 0.0825865132585856, + "language_loss": 0.7948184, + "learning_rate": 9.929699188895447e-05, + "loss": 0.80560958, + "num_input_tokens_seen": 345622208, + "router_z_loss_mlp": 0.27416992, + "step": 4169, + "time_per_iteration": 3.292508363723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036816, + "balance_loss_mlp": 1.02546716, + "epoch": 0.8022316275490573, + "flos": 1560993748992.0, + "grad_norm": 0.019591021786405507, + "language_loss": 0.78054404, + "learning_rate": 9.911072977324009e-05, + "loss": 0.79091221, + "num_input_tokens_seen": 345852544, + "router_z_loss_mlp": 0.11328125, + "step": 4170, + "time_per_iteration": 4.99972677230835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079157, + "balance_loss_mlp": 1.05030823, + "epoch": 0.8024240092343209, + "flos": 420473866752.0, + "grad_norm": 0.06556949465152317, + "language_loss": 0.83036101, + "learning_rate": 9.89246232965435e-05, + "loss": 0.84115261, + "num_input_tokens_seen": 345917328, + "router_z_loss_mlp": 0.28833008, + "step": 4171, + "time_per_iteration": 2.4891555309295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077075, + "balance_loss_mlp": 1.04839337, + "epoch": 0.8026163909195845, + "flos": 763506418176.0, + "grad_norm": 0.06284126709301016, + "language_loss": 0.78710306, + "learning_rate": 9.873867253111762e-05, + "loss": 0.7978738, + "num_input_tokens_seen": 345995936, + "router_z_loss_mlp": 0.28686523, + "step": 4172, + "time_per_iteration": 2.9779157638549805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034161, + "balance_loss_mlp": 1.02285993, + "epoch": 0.8028087726048481, + "flos": 1518044087808.0, + "grad_norm": 0.018943841707913467, + "language_loss": 0.80264562, + "learning_rate": 9.855287754915503e-05, + "loss": 0.81298721, + "num_input_tokens_seen": 346232720, + "router_z_loss_mlp": 0.11279297, + "step": 4173, + "time_per_iteration": 4.92714524269104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079527, + "balance_loss_mlp": 1.051108, + "epoch": 0.8030011542901115, + "flos": 517620174336.0, + "grad_norm": 0.07028962600154551, + "language_loss": 0.8832283, + "learning_rate": 9.836723842278733e-05, + "loss": 0.8940236, + "num_input_tokens_seen": 346298208, + "router_z_loss_mlp": 0.28417969, + "step": 4174, + "time_per_iteration": 2.6065914630889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075163, + "balance_loss_mlp": 1.04717231, + "epoch": 0.8031935359753751, + "flos": 545356495872.0, + "grad_norm": 0.06309539904613753, + "language_loss": 0.7796675, + "learning_rate": 9.818175522408646e-05, + "loss": 0.7904191, + "num_input_tokens_seen": 346370080, + "router_z_loss_mlp": 0.27966309, + "step": 4175, + "time_per_iteration": 2.6612541675567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075178, + "balance_loss_mlp": 1.04694939, + "epoch": 0.8033859176606387, + "flos": 603266241024.0, + "grad_norm": 0.047657193754151006, + "language_loss": 0.84480703, + "learning_rate": 9.79964280250632e-05, + "loss": 0.85555875, + "num_input_tokens_seen": 346442432, + "router_z_loss_mlp": 0.28222656, + "step": 4176, + "time_per_iteration": 2.7781217098236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076287, + "balance_loss_mlp": 1.0484159, + "epoch": 0.8035782993459023, + "flos": 565579279872.0, + "grad_norm": 0.07387261504337528, + "language_loss": 0.81488836, + "learning_rate": 9.781125689766795e-05, + "loss": 0.82565117, + "num_input_tokens_seen": 346513088, + "router_z_loss_mlp": 0.27905273, + "step": 4177, + "time_per_iteration": 2.6964521408081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073463, + "balance_loss_mlp": 1.04540133, + "epoch": 0.8037706810311658, + "flos": 538177609728.0, + "grad_norm": 0.057863226460369684, + "language_loss": 0.84295249, + "learning_rate": 9.762624191379054e-05, + "loss": 0.85368717, + "num_input_tokens_seen": 346581376, + "router_z_loss_mlp": 0.28051758, + "step": 4178, + "time_per_iteration": 2.618422269821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070523, + "balance_loss_mlp": 1.04231787, + "epoch": 0.8039630627164294, + "flos": 514937170944.0, + "grad_norm": 0.05803558735521543, + "language_loss": 0.79554057, + "learning_rate": 9.744138314526014e-05, + "loss": 0.8062458, + "num_input_tokens_seen": 346653328, + "router_z_loss_mlp": 0.28222656, + "step": 4179, + "time_per_iteration": 2.637068510055542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023515, + "balance_loss_mlp": 1.01240516, + "epoch": 0.804155444401693, + "flos": 1478061209088.0, + "grad_norm": 0.008294306940635323, + "language_loss": 0.74733561, + "learning_rate": 9.725668066384535e-05, + "loss": 0.7575708, + "num_input_tokens_seen": 346873264, + "router_z_loss_mlp": 0.11132812, + "step": 4180, + "time_per_iteration": 4.895167827606201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070457, + "balance_loss_mlp": 1.04163229, + "epoch": 0.8043478260869565, + "flos": 520909871616.0, + "grad_norm": 0.06869839727522731, + "language_loss": 0.7746588, + "learning_rate": 9.707213454125396e-05, + "loss": 0.78536338, + "num_input_tokens_seen": 346946272, + "router_z_loss_mlp": 0.2878418, + "step": 4181, + "time_per_iteration": 2.636059045791626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071714, + "balance_loss_mlp": 1.04296076, + "epoch": 0.8045402077722201, + "flos": 545170231296.0, + "grad_norm": 0.061080671459635506, + "language_loss": 0.80578196, + "learning_rate": 9.688774484913298e-05, + "loss": 0.81649911, + "num_input_tokens_seen": 347024048, + "router_z_loss_mlp": 0.28710938, + "step": 4182, + "time_per_iteration": 2.781472682952881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073737, + "balance_loss_mlp": 1.04460168, + "epoch": 0.8047325894574836, + "flos": 678059610624.0, + "grad_norm": 0.06915536366998667, + "language_loss": 0.73871112, + "learning_rate": 9.670351165906921e-05, + "loss": 0.74944854, + "num_input_tokens_seen": 347108736, + "router_z_loss_mlp": 0.29150391, + "step": 4183, + "time_per_iteration": 2.9433372020721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069336, + "balance_loss_mlp": 1.04103541, + "epoch": 0.8049249711427472, + "flos": 586952994816.0, + "grad_norm": 0.057442229187810216, + "language_loss": 0.78591096, + "learning_rate": 9.65194350425882e-05, + "loss": 0.79660439, + "num_input_tokens_seen": 347184192, + "router_z_loss_mlp": 0.28320312, + "step": 4184, + "time_per_iteration": 2.753244400024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065285, + "balance_loss_mlp": 1.03734207, + "epoch": 0.8051173528280108, + "flos": 813824050176.0, + "grad_norm": 0.055690130588938895, + "language_loss": 0.77644128, + "learning_rate": 9.633551507115452e-05, + "loss": 0.78709412, + "num_input_tokens_seen": 347282336, + "router_z_loss_mlp": 0.27978516, + "step": 4185, + "time_per_iteration": 3.116245746612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071248, + "balance_loss_mlp": 1.04249442, + "epoch": 0.8053097345132744, + "flos": 725371324416.0, + "grad_norm": 0.05368141398175553, + "language_loss": 0.77715063, + "learning_rate": 9.615175181617259e-05, + "loss": 0.78786314, + "num_input_tokens_seen": 347364800, + "router_z_loss_mlp": 0.28735352, + "step": 4186, + "time_per_iteration": 2.9494264125823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067801, + "balance_loss_mlp": 1.03900027, + "epoch": 0.805502116198538, + "flos": 747706733568.0, + "grad_norm": 0.07244263091625658, + "language_loss": 0.81652725, + "learning_rate": 9.596814534898552e-05, + "loss": 0.8272053, + "num_input_tokens_seen": 347443328, + "router_z_loss_mlp": 0.2878418, + "step": 4187, + "time_per_iteration": 2.979442596435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061733, + "balance_loss_mlp": 1.03312325, + "epoch": 0.8056944978838014, + "flos": 639953630208.0, + "grad_norm": 0.06907450450610357, + "language_loss": 0.87470937, + "learning_rate": 9.578469574087561e-05, + "loss": 0.88532674, + "num_input_tokens_seen": 347522064, + "router_z_loss_mlp": 0.28637695, + "step": 4188, + "time_per_iteration": 2.804840564727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069501, + "balance_loss_mlp": 1.04017591, + "epoch": 0.805886879569065, + "flos": 644344796160.0, + "grad_norm": 0.0767121628935675, + "language_loss": 0.78102624, + "learning_rate": 9.560140306306436e-05, + "loss": 0.79172122, + "num_input_tokens_seen": 347597200, + "router_z_loss_mlp": 0.29296875, + "step": 4189, + "time_per_iteration": 2.763796329498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070889, + "balance_loss_mlp": 1.04175389, + "epoch": 0.8060792612543286, + "flos": 660928674816.0, + "grad_norm": 0.06679116415647134, + "language_loss": 0.81191343, + "learning_rate": 9.541826738671233e-05, + "loss": 0.8226223, + "num_input_tokens_seen": 347676928, + "router_z_loss_mlp": 0.29125977, + "step": 4190, + "time_per_iteration": 2.810873031616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106476, + "balance_loss_mlp": 1.03603029, + "epoch": 0.8062716429395922, + "flos": 454842017280.0, + "grad_norm": 0.06652333597663049, + "language_loss": 0.8252098, + "learning_rate": 9.523528878291904e-05, + "loss": 0.83585739, + "num_input_tokens_seen": 347741552, + "router_z_loss_mlp": 0.28735352, + "step": 4191, + "time_per_iteration": 2.5331108570098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071695, + "balance_loss_mlp": 1.04260826, + "epoch": 0.8064640246248557, + "flos": 526153632768.0, + "grad_norm": 0.07127869186789165, + "language_loss": 0.85161996, + "learning_rate": 9.50524673227231e-05, + "loss": 0.86233693, + "num_input_tokens_seen": 347807008, + "router_z_loss_mlp": 0.29052734, + "step": 4192, + "time_per_iteration": 2.652766466140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066547, + "balance_loss_mlp": 1.03776956, + "epoch": 0.8066564063101193, + "flos": 864726617088.0, + "grad_norm": 0.048096998874408305, + "language_loss": 0.82061756, + "learning_rate": 9.486980307710208e-05, + "loss": 0.83128297, + "num_input_tokens_seen": 347895728, + "router_z_loss_mlp": 0.28735352, + "step": 4193, + "time_per_iteration": 3.1492722034454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064872, + "balance_loss_mlp": 1.03559446, + "epoch": 0.8068487879953828, + "flos": 530261019648.0, + "grad_norm": 0.05222546458111616, + "language_loss": 0.8172397, + "learning_rate": 9.468729611697246e-05, + "loss": 0.82788843, + "num_input_tokens_seen": 347970368, + "router_z_loss_mlp": 0.29272461, + "step": 4194, + "time_per_iteration": 2.7544384002685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067388, + "balance_loss_mlp": 1.0379194, + "epoch": 0.8070411696806464, + "flos": 565918313472.0, + "grad_norm": 0.04982276198281567, + "language_loss": 0.81616491, + "learning_rate": 9.450494651319003e-05, + "loss": 0.82683873, + "num_input_tokens_seen": 348039040, + "router_z_loss_mlp": 0.29443359, + "step": 4195, + "time_per_iteration": 2.707900285720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063545, + "balance_loss_mlp": 1.03467226, + "epoch": 0.80723355136591, + "flos": 986176954368.0, + "grad_norm": 0.04761294147814613, + "language_loss": 0.79018849, + "learning_rate": 9.432275433654885e-05, + "loss": 0.80082393, + "num_input_tokens_seen": 348126064, + "router_z_loss_mlp": 0.28857422, + "step": 4196, + "time_per_iteration": 3.3168561458587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066531, + "balance_loss_mlp": 1.0380404, + "epoch": 0.8074259330511735, + "flos": 566682158592.0, + "grad_norm": 0.05760757559429525, + "language_loss": 0.82881331, + "learning_rate": 9.414071965778221e-05, + "loss": 0.83947861, + "num_input_tokens_seen": 348205888, + "router_z_loss_mlp": 0.28491211, + "step": 4197, + "time_per_iteration": 2.8094139099121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064923, + "balance_loss_mlp": 1.03605068, + "epoch": 0.8076183147364371, + "flos": 494391320064.0, + "grad_norm": 0.05415863808022291, + "language_loss": 0.79741108, + "learning_rate": 9.395884254756242e-05, + "loss": 0.80806035, + "num_input_tokens_seen": 348278608, + "router_z_loss_mlp": 0.28881836, + "step": 4198, + "time_per_iteration": 2.7344775199890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065792, + "balance_loss_mlp": 1.03694367, + "epoch": 0.8078106964217007, + "flos": 419798771712.0, + "grad_norm": 0.0525166714503648, + "language_loss": 0.79778922, + "learning_rate": 9.377712307650044e-05, + "loss": 0.80844712, + "num_input_tokens_seen": 348341312, + "router_z_loss_mlp": 0.28808594, + "step": 4199, + "time_per_iteration": 2.481445550918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065581, + "balance_loss_mlp": 1.03642273, + "epoch": 0.8080030781069643, + "flos": 527281242624.0, + "grad_norm": 0.12008878488060483, + "language_loss": 0.82967323, + "learning_rate": 9.359556131514602e-05, + "loss": 0.84032905, + "num_input_tokens_seen": 348409184, + "router_z_loss_mlp": 0.29125977, + "step": 4200, + "time_per_iteration": 2.603832960128784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069975, + "balance_loss_mlp": 1.04081631, + "epoch": 0.8081954597922277, + "flos": 543898616832.0, + "grad_norm": 0.05544324871835158, + "language_loss": 0.81466305, + "learning_rate": 9.341415733398733e-05, + "loss": 0.8253628, + "num_input_tokens_seen": 348480832, + "router_z_loss_mlp": 0.29150391, + "step": 4201, + "time_per_iteration": 2.6372344493865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066578, + "balance_loss_mlp": 1.03768134, + "epoch": 0.8083878414774913, + "flos": 640593819648.0, + "grad_norm": 0.06923511846840386, + "language_loss": 0.75673985, + "learning_rate": 9.323291120345207e-05, + "loss": 0.76740557, + "num_input_tokens_seen": 348559232, + "router_z_loss_mlp": 0.28857422, + "step": 4202, + "time_per_iteration": 2.844560384750366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065127, + "balance_loss_mlp": 1.03606391, + "epoch": 0.8085802231627549, + "flos": 705292545024.0, + "grad_norm": 0.06954281652768038, + "language_loss": 0.72733068, + "learning_rate": 9.305182299390614e-05, + "loss": 0.73798198, + "num_input_tokens_seen": 348638960, + "router_z_loss_mlp": 0.2902832, + "step": 4203, + "time_per_iteration": 2.8883166313171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064496, + "balance_loss_mlp": 1.0347656, + "epoch": 0.8087726048480185, + "flos": 419538313728.0, + "grad_norm": 0.06243903224540148, + "language_loss": 0.88454056, + "learning_rate": 9.287089277565409e-05, + "loss": 0.89518553, + "num_input_tokens_seen": 348704816, + "router_z_loss_mlp": 0.296875, + "step": 4204, + "time_per_iteration": 2.5257723331451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067541, + "balance_loss_mlp": 1.03912127, + "epoch": 0.8089649865332821, + "flos": 508493016576.0, + "grad_norm": 0.055666133519853146, + "language_loss": 0.87257159, + "learning_rate": 9.269012061893922e-05, + "loss": 0.88324702, + "num_input_tokens_seen": 348783504, + "router_z_loss_mlp": 0.28417969, + "step": 4205, + "time_per_iteration": 2.764925956726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067535, + "balance_loss_mlp": 1.03842449, + "epoch": 0.8091573682185456, + "flos": 456960434688.0, + "grad_norm": 0.058789121979447925, + "language_loss": 0.84584945, + "learning_rate": 9.250950659394386e-05, + "loss": 0.85652483, + "num_input_tokens_seen": 348858272, + "router_z_loss_mlp": 0.29077148, + "step": 4206, + "time_per_iteration": 2.687206506729126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066053, + "balance_loss_mlp": 1.03734708, + "epoch": 0.8093497499038091, + "flos": 524977970688.0, + "grad_norm": 0.05245178609019049, + "language_loss": 0.76937735, + "learning_rate": 9.232905077078824e-05, + "loss": 0.78003788, + "num_input_tokens_seen": 348934432, + "router_z_loss_mlp": 0.28686523, + "step": 4207, + "time_per_iteration": 2.723975896835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068651, + "balance_loss_mlp": 1.04037452, + "epoch": 0.8095421315890727, + "flos": 489377493504.0, + "grad_norm": 0.07111499051035935, + "language_loss": 0.76618123, + "learning_rate": 9.214875321953164e-05, + "loss": 0.77686775, + "num_input_tokens_seen": 349003856, + "router_z_loss_mlp": 0.28271484, + "step": 4208, + "time_per_iteration": 2.615595817565918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069008, + "balance_loss_mlp": 1.04001641, + "epoch": 0.8097345132743363, + "flos": 624817456128.0, + "grad_norm": 0.05731599003072511, + "language_loss": 0.8059206, + "learning_rate": 9.196861401017164e-05, + "loss": 0.81661069, + "num_input_tokens_seen": 349080544, + "router_z_loss_mlp": 0.28930664, + "step": 4209, + "time_per_iteration": 2.8043084144592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106721, + "balance_loss_mlp": 1.0376935, + "epoch": 0.8099268949595998, + "flos": 615393524736.0, + "grad_norm": 0.06359903235103676, + "language_loss": 0.79155213, + "learning_rate": 9.178863321264475e-05, + "loss": 0.80222422, + "num_input_tokens_seen": 349159072, + "router_z_loss_mlp": 0.29467773, + "step": 4210, + "time_per_iteration": 2.79875111579895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065017, + "balance_loss_mlp": 1.03633547, + "epoch": 0.8101192766448634, + "flos": 479383183872.0, + "grad_norm": 0.056055581706419104, + "language_loss": 0.79616201, + "learning_rate": 9.160881089682566e-05, + "loss": 0.80681217, + "num_input_tokens_seen": 349230176, + "router_z_loss_mlp": 0.28686523, + "step": 4211, + "time_per_iteration": 2.6358375549316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065264, + "balance_loss_mlp": 1.03648686, + "epoch": 0.810311658330127, + "flos": 517078909440.0, + "grad_norm": 0.05344256107518821, + "language_loss": 0.86847901, + "learning_rate": 9.142914713252725e-05, + "loss": 0.87913167, + "num_input_tokens_seen": 349299760, + "router_z_loss_mlp": 0.28759766, + "step": 4212, + "time_per_iteration": 2.6177706718444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069065, + "balance_loss_mlp": 1.04055011, + "epoch": 0.8105040400153906, + "flos": 575481867264.0, + "grad_norm": 0.04499674927197359, + "language_loss": 0.8394531, + "learning_rate": 9.124964198950159e-05, + "loss": 0.85014379, + "num_input_tokens_seen": 349379712, + "router_z_loss_mlp": 0.28515625, + "step": 4213, + "time_per_iteration": 2.7992186546325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064046, + "balance_loss_mlp": 1.0357455, + "epoch": 0.8106964217006541, + "flos": 638658694656.0, + "grad_norm": 0.0596272682353905, + "language_loss": 0.84905821, + "learning_rate": 9.107029553743862e-05, + "loss": 0.85969865, + "num_input_tokens_seen": 349460320, + "router_z_loss_mlp": 0.28320312, + "step": 4214, + "time_per_iteration": 2.8410491943359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072599, + "balance_loss_mlp": 1.04463267, + "epoch": 0.8108888033859176, + "flos": 579237225984.0, + "grad_norm": 0.07027285717141396, + "language_loss": 0.81110525, + "learning_rate": 9.089110784596672e-05, + "loss": 0.82183123, + "num_input_tokens_seen": 349527648, + "router_z_loss_mlp": 0.2800293, + "step": 4215, + "time_per_iteration": 2.6683573722839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062091, + "balance_loss_mlp": 1.03414786, + "epoch": 0.8110811850711812, + "flos": 559612371456.0, + "grad_norm": 0.052081038567736, + "language_loss": 0.83540303, + "learning_rate": 9.071207898465284e-05, + "loss": 0.84602392, + "num_input_tokens_seen": 349606912, + "router_z_loss_mlp": 0.27978516, + "step": 4216, + "time_per_iteration": 2.7824838161468506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019607, + "balance_loss_mlp": 1.00854468, + "epoch": 0.8112735667564448, + "flos": 1517160969216.0, + "grad_norm": 0.011434458590002855, + "language_loss": 0.77260417, + "learning_rate": 9.053320902300205e-05, + "loss": 0.78280026, + "num_input_tokens_seen": 349827040, + "router_z_loss_mlp": 0.11083984, + "step": 4217, + "time_per_iteration": 4.637202978134155 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065471, + "balance_loss_mlp": 1.03657508, + "epoch": 0.8114659484417084, + "flos": 616048270848.0, + "grad_norm": 0.07350914645250498, + "language_loss": 0.85149193, + "learning_rate": 9.035449803045792e-05, + "loss": 0.86214668, + "num_input_tokens_seen": 349900080, + "router_z_loss_mlp": 0.2890625, + "step": 4218, + "time_per_iteration": 2.782702684402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067391, + "balance_loss_mlp": 1.0393765, + "epoch": 0.8116583301269719, + "flos": 649624872960.0, + "grad_norm": 0.048207191207865485, + "language_loss": 0.7901873, + "learning_rate": 9.017594607640211e-05, + "loss": 0.80086124, + "num_input_tokens_seen": 349983568, + "router_z_loss_mlp": 0.27990723, + "step": 4219, + "time_per_iteration": 2.930854558944702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066977, + "balance_loss_mlp": 1.03798532, + "epoch": 0.8118507118122354, + "flos": 552811806720.0, + "grad_norm": 0.059588246465710766, + "language_loss": 0.80647886, + "learning_rate": 8.999755323015463e-05, + "loss": 0.81714863, + "num_input_tokens_seen": 350054928, + "router_z_loss_mlp": 0.28979492, + "step": 4220, + "time_per_iteration": 2.711641550064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067087, + "balance_loss_mlp": 1.03857219, + "epoch": 0.812043093497499, + "flos": 543854946816.0, + "grad_norm": 0.050033964999099186, + "language_loss": 0.87859094, + "learning_rate": 8.981931956097384e-05, + "loss": 0.88926178, + "num_input_tokens_seen": 350127872, + "router_z_loss_mlp": 0.28540039, + "step": 4221, + "time_per_iteration": 2.6416759490966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066094, + "balance_loss_mlp": 1.0373888, + "epoch": 0.8122354751827626, + "flos": 583113268224.0, + "grad_norm": 0.05826144530446981, + "language_loss": 0.83350205, + "learning_rate": 8.964124513805628e-05, + "loss": 0.844163, + "num_input_tokens_seen": 350206592, + "router_z_loss_mlp": 0.28735352, + "step": 4222, + "time_per_iteration": 2.8018221855163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020384, + "balance_loss_mlp": 1.00932121, + "epoch": 0.8124278568680262, + "flos": 1529747970048.0, + "grad_norm": 0.011965334136789936, + "language_loss": 0.78250074, + "learning_rate": 8.94633300305363e-05, + "loss": 0.79270458, + "num_input_tokens_seen": 350436048, + "router_z_loss_mlp": 0.11083984, + "step": 4223, + "time_per_iteration": 5.00577974319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067514, + "balance_loss_mlp": 1.03826034, + "epoch": 0.8126202385532897, + "flos": 432640438272.0, + "grad_norm": 0.06449105451981865, + "language_loss": 0.79671866, + "learning_rate": 8.928557430748668e-05, + "loss": 0.80739379, + "num_input_tokens_seen": 350501376, + "router_z_loss_mlp": 0.29248047, + "step": 4224, + "time_per_iteration": 2.5818302631378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018632, + "balance_loss_mlp": 1.00756931, + "epoch": 0.8128126202385533, + "flos": 1547098665984.0, + "grad_norm": 0.01031409207183129, + "language_loss": 0.76495624, + "learning_rate": 8.910797803791854e-05, + "loss": 0.77514255, + "num_input_tokens_seen": 350735232, + "router_z_loss_mlp": 0.11083984, + "step": 4225, + "time_per_iteration": 4.809314727783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069, + "balance_loss_mlp": 1.04081905, + "epoch": 0.8130050019238169, + "flos": 528064026624.0, + "grad_norm": 0.053998637656794475, + "language_loss": 0.88875234, + "learning_rate": 8.893054129078077e-05, + "loss": 0.89944232, + "num_input_tokens_seen": 350805088, + "router_z_loss_mlp": 0.28173828, + "step": 4226, + "time_per_iteration": 2.647254705429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067755, + "balance_loss_mlp": 1.0394311, + "epoch": 0.8131973836090804, + "flos": 542850992640.0, + "grad_norm": 0.06919588802005232, + "language_loss": 0.79975605, + "learning_rate": 8.875326413496037e-05, + "loss": 0.81043363, + "num_input_tokens_seen": 350876896, + "router_z_loss_mlp": 0.28320312, + "step": 4227, + "time_per_iteration": 2.726672410964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070162, + "balance_loss_mlp": 1.0421958, + "epoch": 0.8133897652943439, + "flos": 576223953408.0, + "grad_norm": 0.0543859382223631, + "language_loss": 0.82038212, + "learning_rate": 8.857614663928249e-05, + "loss": 0.83108377, + "num_input_tokens_seen": 350948400, + "router_z_loss_mlp": 0.2800293, + "step": 4228, + "time_per_iteration": 2.6778459548950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072299, + "balance_loss_mlp": 1.04404676, + "epoch": 0.8135821469796075, + "flos": 578937480192.0, + "grad_norm": 0.061060781274094984, + "language_loss": 0.78928632, + "learning_rate": 8.839918887251025e-05, + "loss": 0.80000931, + "num_input_tokens_seen": 351023328, + "router_z_loss_mlp": 0.28222656, + "step": 4229, + "time_per_iteration": 2.7937610149383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069859, + "balance_loss_mlp": 1.04208326, + "epoch": 0.8137745286648711, + "flos": 650023543296.0, + "grad_norm": 0.05733446372690566, + "language_loss": 0.83721739, + "learning_rate": 8.822239090334472e-05, + "loss": 0.84791595, + "num_input_tokens_seen": 351108672, + "router_z_loss_mlp": 0.27783203, + "step": 4230, + "time_per_iteration": 2.929072141647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068776, + "balance_loss_mlp": 1.03980827, + "epoch": 0.8139669103501347, + "flos": 701579446272.0, + "grad_norm": 0.055172445682410025, + "language_loss": 0.75769949, + "learning_rate": 8.804575280042493e-05, + "loss": 0.7683872, + "num_input_tokens_seen": 351185056, + "router_z_loss_mlp": 0.28955078, + "step": 4231, + "time_per_iteration": 2.9424638748168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107569, + "balance_loss_mlp": 1.04748487, + "epoch": 0.8141592920353983, + "flos": 649933383168.0, + "grad_norm": 0.06203096167120011, + "language_loss": 0.83420956, + "learning_rate": 8.786927463232774e-05, + "loss": 0.84496653, + "num_input_tokens_seen": 351255856, + "router_z_loss_mlp": 0.28198242, + "step": 4232, + "time_per_iteration": 2.758073091506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010716, + "balance_loss_mlp": 1.04377663, + "epoch": 0.8143516737206618, + "flos": 536577136128.0, + "grad_norm": 0.060605640781893975, + "language_loss": 0.81175333, + "learning_rate": 8.769295646756853e-05, + "loss": 0.82246929, + "num_input_tokens_seen": 351322336, + "router_z_loss_mlp": 0.27856445, + "step": 4233, + "time_per_iteration": 2.5830631256103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068573, + "balance_loss_mlp": 1.04041553, + "epoch": 0.8145440554059253, + "flos": 508117667328.0, + "grad_norm": 0.06950622119523395, + "language_loss": 0.82293272, + "learning_rate": 8.751679837459963e-05, + "loss": 0.83361846, + "num_input_tokens_seen": 351387440, + "router_z_loss_mlp": 0.28149414, + "step": 4234, + "time_per_iteration": 2.5787734985351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069076, + "balance_loss_mlp": 1.04125214, + "epoch": 0.8147364370911889, + "flos": 634720043520.0, + "grad_norm": 0.06263020713850469, + "language_loss": 0.86699188, + "learning_rate": 8.734080042181181e-05, + "loss": 0.87768269, + "num_input_tokens_seen": 351464192, + "router_z_loss_mlp": 0.27856445, + "step": 4235, + "time_per_iteration": 2.821223735809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066499, + "balance_loss_mlp": 1.03743625, + "epoch": 0.8149288187764525, + "flos": 422576317440.0, + "grad_norm": 0.0652768049797803, + "language_loss": 0.78442669, + "learning_rate": 8.716496267753343e-05, + "loss": 0.79509175, + "num_input_tokens_seen": 351528016, + "router_z_loss_mlp": 0.29052734, + "step": 4236, + "time_per_iteration": 2.4675498008728027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106622, + "balance_loss_mlp": 1.03853941, + "epoch": 0.8151212004617161, + "flos": 597150945792.0, + "grad_norm": 0.07602341505053914, + "language_loss": 0.81648099, + "learning_rate": 8.698928521003097e-05, + "loss": 0.82714319, + "num_input_tokens_seen": 351601648, + "router_z_loss_mlp": 0.27709961, + "step": 4237, + "time_per_iteration": 2.7590246200561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014981, + "balance_loss_mlp": 1.00382304, + "epoch": 0.8153135821469796, + "flos": 1478563186176.0, + "grad_norm": 0.008637050381311823, + "language_loss": 0.77852845, + "learning_rate": 8.681376808750835e-05, + "loss": 0.78867829, + "num_input_tokens_seen": 351826720, + "router_z_loss_mlp": 0.11181641, + "step": 4238, + "time_per_iteration": 4.97124171257019 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071451, + "balance_loss_mlp": 1.04236352, + "epoch": 0.8155059638322432, + "flos": 436870070784.0, + "grad_norm": 0.06276879844041765, + "language_loss": 0.82607353, + "learning_rate": 8.663841137810741e-05, + "loss": 0.83678806, + "num_input_tokens_seen": 351891760, + "router_z_loss_mlp": 0.29052734, + "step": 4239, + "time_per_iteration": 2.5108706951141357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067786, + "balance_loss_mlp": 1.04036808, + "epoch": 0.8156983455175068, + "flos": 794034842112.0, + "grad_norm": 0.05812108506294299, + "language_loss": 0.85652077, + "learning_rate": 8.646321514990763e-05, + "loss": 0.86719859, + "num_input_tokens_seen": 351977504, + "router_z_loss_mlp": 0.27490234, + "step": 4240, + "time_per_iteration": 3.0461056232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069826, + "balance_loss_mlp": 1.04166925, + "epoch": 0.8158907272027703, + "flos": 685685219328.0, + "grad_norm": 0.05086326935086867, + "language_loss": 0.81733894, + "learning_rate": 8.628817947092616e-05, + "loss": 0.8280372, + "num_input_tokens_seen": 352050176, + "router_z_loss_mlp": 0.28173828, + "step": 4241, + "time_per_iteration": 2.8256101608276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068535, + "balance_loss_mlp": 1.04071116, + "epoch": 0.8160831088880338, + "flos": 486812353536.0, + "grad_norm": 0.07447614758134384, + "language_loss": 0.84482515, + "learning_rate": 8.611330440911797e-05, + "loss": 0.85551053, + "num_input_tokens_seen": 352116848, + "router_z_loss_mlp": 0.27832031, + "step": 4242, + "time_per_iteration": 2.5818676948547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069793, + "balance_loss_mlp": 1.04144478, + "epoch": 0.8162754905732974, + "flos": 464635505664.0, + "grad_norm": 0.058835558932383195, + "language_loss": 0.80352938, + "learning_rate": 8.593859003237558e-05, + "loss": 0.81422722, + "num_input_tokens_seen": 352185056, + "router_z_loss_mlp": 0.28369141, + "step": 4243, + "time_per_iteration": 2.5835306644439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012284, + "balance_loss_mlp": 1.00117409, + "epoch": 0.816467872258561, + "flos": 1238879577600.0, + "grad_norm": 0.007644288971294211, + "language_loss": 0.75285125, + "learning_rate": 8.576403640852904e-05, + "loss": 0.76297402, + "num_input_tokens_seen": 352397648, + "router_z_loss_mlp": 0.11132812, + "step": 4244, + "time_per_iteration": 4.721221446990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072248, + "balance_loss_mlp": 1.04399562, + "epoch": 0.8166602539438246, + "flos": 686862291456.0, + "grad_norm": 0.059716392905671066, + "language_loss": 0.86529738, + "learning_rate": 8.558964360534615e-05, + "loss": 0.87601984, + "num_input_tokens_seen": 352478272, + "router_z_loss_mlp": 0.2824707, + "step": 4245, + "time_per_iteration": 2.9283206462860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013346, + "balance_loss_mlp": 1.00228322, + "epoch": 0.8168526356290882, + "flos": 1489674779136.0, + "grad_norm": 0.007574465559788524, + "language_loss": 0.72974741, + "learning_rate": 8.541541169053219e-05, + "loss": 0.73988086, + "num_input_tokens_seen": 352707104, + "router_z_loss_mlp": 0.11083984, + "step": 4246, + "time_per_iteration": 4.933375358581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070413, + "balance_loss_mlp": 1.04230392, + "epoch": 0.8170450173143516, + "flos": 577927733760.0, + "grad_norm": 0.046146442587004816, + "language_loss": 0.84699905, + "learning_rate": 8.524134073172984e-05, + "loss": 0.85770321, + "num_input_tokens_seen": 352779248, + "router_z_loss_mlp": 0.28125, + "step": 4247, + "time_per_iteration": 2.73640513420105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072398, + "balance_loss_mlp": 1.04476547, + "epoch": 0.8172373989996152, + "flos": 570985984512.0, + "grad_norm": 0.057815489386057996, + "language_loss": 0.84281337, + "learning_rate": 8.506743079651974e-05, + "loss": 0.85353732, + "num_input_tokens_seen": 352856784, + "router_z_loss_mlp": 0.27685547, + "step": 4248, + "time_per_iteration": 2.7503533363342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070358, + "balance_loss_mlp": 1.04289269, + "epoch": 0.8174297806848788, + "flos": 528576178176.0, + "grad_norm": 0.05981419977857885, + "language_loss": 0.80560964, + "learning_rate": 8.489368195241948e-05, + "loss": 0.81631327, + "num_input_tokens_seen": 352926496, + "router_z_loss_mlp": 0.27514648, + "step": 4249, + "time_per_iteration": 2.633897066116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066258, + "balance_loss_mlp": 1.03798175, + "epoch": 0.8176221623701424, + "flos": 568819514880.0, + "grad_norm": 0.05344644300420973, + "language_loss": 0.78959692, + "learning_rate": 8.47200942668846e-05, + "loss": 0.80025947, + "num_input_tokens_seen": 353005312, + "router_z_loss_mlp": 0.28295898, + "step": 4250, + "time_per_iteration": 2.801112174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106823, + "balance_loss_mlp": 1.03904736, + "epoch": 0.8178145440554059, + "flos": 656226178560.0, + "grad_norm": 0.06435055632963133, + "language_loss": 0.80169028, + "learning_rate": 8.454666780730735e-05, + "loss": 0.81237257, + "num_input_tokens_seen": 353085120, + "router_z_loss_mlp": 0.29174805, + "step": 4251, + "time_per_iteration": 2.854274272918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072819, + "balance_loss_mlp": 1.04451823, + "epoch": 0.8180069257406695, + "flos": 545643095040.0, + "grad_norm": 0.047060822290908425, + "language_loss": 0.87586474, + "learning_rate": 8.437340264101828e-05, + "loss": 0.88659286, + "num_input_tokens_seen": 353160992, + "router_z_loss_mlp": 0.28271484, + "step": 4252, + "time_per_iteration": 2.7088351249694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072153, + "balance_loss_mlp": 1.04359007, + "epoch": 0.818199307425933, + "flos": 618987350016.0, + "grad_norm": 0.07063067234583648, + "language_loss": 0.84892482, + "learning_rate": 8.420029883528474e-05, + "loss": 0.85964632, + "num_input_tokens_seen": 353233328, + "router_z_loss_mlp": 0.28588867, + "step": 4253, + "time_per_iteration": 2.7312068939208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107228, + "balance_loss_mlp": 1.04436111, + "epoch": 0.8183916891111966, + "flos": 647291077632.0, + "grad_norm": 0.06397953963457907, + "language_loss": 0.77154791, + "learning_rate": 8.402735645731157e-05, + "loss": 0.78227079, + "num_input_tokens_seen": 353310592, + "router_z_loss_mlp": 0.27929688, + "step": 4254, + "time_per_iteration": 2.9217798709869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069202, + "balance_loss_mlp": 1.0413785, + "epoch": 0.8185840707964602, + "flos": 498875618304.0, + "grad_norm": 0.06114349210328935, + "language_loss": 0.77897936, + "learning_rate": 8.385457557424098e-05, + "loss": 0.78967136, + "num_input_tokens_seen": 353376544, + "router_z_loss_mlp": 0.27856445, + "step": 4255, + "time_per_iteration": 2.5912728309631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072136, + "balance_loss_mlp": 1.04407382, + "epoch": 0.8187764524817237, + "flos": 785885497344.0, + "grad_norm": 0.04533193109086393, + "language_loss": 0.79436147, + "learning_rate": 8.368195625315251e-05, + "loss": 0.8050828, + "num_input_tokens_seen": 353461200, + "router_z_loss_mlp": 0.28051758, + "step": 4256, + "time_per_iteration": 3.0689914226531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067872, + "balance_loss_mlp": 1.03961968, + "epoch": 0.8189688341669873, + "flos": 550443105792.0, + "grad_norm": 0.04938986425067683, + "language_loss": 0.80494475, + "learning_rate": 8.350949856106283e-05, + "loss": 0.81562352, + "num_input_tokens_seen": 353538608, + "router_z_loss_mlp": 0.28271484, + "step": 4257, + "time_per_iteration": 2.8081703186035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016419, + "balance_loss_mlp": 1.00545204, + "epoch": 0.8191612158522509, + "flos": 1351247837184.0, + "grad_norm": 0.007513111853899237, + "language_loss": 0.71149343, + "learning_rate": 8.333720256492599e-05, + "loss": 0.72165769, + "num_input_tokens_seen": 353766960, + "router_z_loss_mlp": 0.10986328, + "step": 4258, + "time_per_iteration": 4.860759973526001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066755, + "balance_loss_mlp": 1.03884852, + "epoch": 0.8193535975375145, + "flos": 543997541376.0, + "grad_norm": 0.09856847418015399, + "language_loss": 0.83568203, + "learning_rate": 8.316506833163318e-05, + "loss": 0.8463496, + "num_input_tokens_seen": 353833552, + "router_z_loss_mlp": 0.27893066, + "step": 4259, + "time_per_iteration": 2.6318304538726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067228, + "balance_loss_mlp": 1.0395236, + "epoch": 0.8195459792227779, + "flos": 865361014272.0, + "grad_norm": 0.04796086797261532, + "language_loss": 0.8533324, + "learning_rate": 8.299309592801297e-05, + "loss": 0.86400461, + "num_input_tokens_seen": 353915520, + "router_z_loss_mlp": 0.27709961, + "step": 4260, + "time_per_iteration": 3.097459554672241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107212, + "balance_loss_mlp": 1.04343772, + "epoch": 0.8197383609080415, + "flos": 569015953920.0, + "grad_norm": 0.06519487649428121, + "language_loss": 0.81389135, + "learning_rate": 8.282128542083101e-05, + "loss": 0.82461256, + "num_input_tokens_seen": 353992048, + "router_z_loss_mlp": 0.28686523, + "step": 4261, + "time_per_iteration": 2.7116708755493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067185, + "balance_loss_mlp": 1.03905129, + "epoch": 0.8199307425933051, + "flos": 530546208768.0, + "grad_norm": 0.0751813797891333, + "language_loss": 0.85112655, + "learning_rate": 8.264963687678978e-05, + "loss": 0.86179835, + "num_input_tokens_seen": 354064848, + "router_z_loss_mlp": 0.28100586, + "step": 4262, + "time_per_iteration": 2.6388864517211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069284, + "balance_loss_mlp": 1.04086471, + "epoch": 0.8201231242785687, + "flos": 566781083136.0, + "grad_norm": 0.08342870078967202, + "language_loss": 0.85002542, + "learning_rate": 8.247815036252921e-05, + "loss": 0.86071831, + "num_input_tokens_seen": 354138848, + "router_z_loss_mlp": 0.28393555, + "step": 4263, + "time_per_iteration": 2.720921516418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068453, + "balance_loss_mlp": 1.04067707, + "epoch": 0.8203155059638323, + "flos": 1230037913088.0, + "grad_norm": 0.05275924450375894, + "language_loss": 0.83059227, + "learning_rate": 8.230682594462652e-05, + "loss": 0.84127676, + "num_input_tokens_seen": 354227696, + "router_z_loss_mlp": 0.27807617, + "step": 4264, + "time_per_iteration": 3.537928819656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065474, + "balance_loss_mlp": 1.03722143, + "epoch": 0.8205078876490958, + "flos": 573929445888.0, + "grad_norm": 0.07194471944274317, + "language_loss": 0.79793882, + "learning_rate": 8.213566368959558e-05, + "loss": 0.80859357, + "num_input_tokens_seen": 354298400, + "router_z_loss_mlp": 0.2824707, + "step": 4265, + "time_per_iteration": 2.677060604095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070943, + "balance_loss_mlp": 1.04238069, + "epoch": 0.8207002693343594, + "flos": 931005467136.0, + "grad_norm": 0.05368978054218888, + "language_loss": 0.78217483, + "learning_rate": 8.196466366388744e-05, + "loss": 0.79288435, + "num_input_tokens_seen": 354385024, + "router_z_loss_mlp": 0.28564453, + "step": 4266, + "time_per_iteration": 3.2091941833496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069743, + "balance_loss_mlp": 1.04175258, + "epoch": 0.8208926510196229, + "flos": 549300939264.0, + "grad_norm": 0.05227458424297275, + "language_loss": 0.80184317, + "learning_rate": 8.179382593389029e-05, + "loss": 0.81254053, + "num_input_tokens_seen": 354456384, + "router_z_loss_mlp": 0.27966309, + "step": 4267, + "time_per_iteration": 2.6503403186798096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071065, + "balance_loss_mlp": 1.04224026, + "epoch": 0.8210850327048865, + "flos": 647876012544.0, + "grad_norm": 0.055684588368156915, + "language_loss": 0.81990433, + "learning_rate": 8.162315056592918e-05, + "loss": 0.83061492, + "num_input_tokens_seen": 354531296, + "router_z_loss_mlp": 0.28833008, + "step": 4268, + "time_per_iteration": 2.8474974632263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065399, + "balance_loss_mlp": 1.03712273, + "epoch": 0.82127741439015, + "flos": 601227809280.0, + "grad_norm": 0.05335039866685649, + "language_loss": 0.81779087, + "learning_rate": 8.145263762626615e-05, + "loss": 0.82844484, + "num_input_tokens_seen": 354605680, + "router_z_loss_mlp": 0.28271484, + "step": 4269, + "time_per_iteration": 2.7657508850097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072754, + "balance_loss_mlp": 1.04412019, + "epoch": 0.8214697960754136, + "flos": 474577380864.0, + "grad_norm": 0.05697164885970493, + "language_loss": 0.83394897, + "learning_rate": 8.128228718110015e-05, + "loss": 0.84467655, + "num_input_tokens_seen": 354678160, + "router_z_loss_mlp": 0.28637695, + "step": 4270, + "time_per_iteration": 2.7368545532226562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069379, + "balance_loss_mlp": 1.04169905, + "epoch": 0.8216621777606772, + "flos": 903288084480.0, + "grad_norm": 0.06652407290888228, + "language_loss": 0.84682125, + "learning_rate": 8.11120992965671e-05, + "loss": 0.85751498, + "num_input_tokens_seen": 354751024, + "router_z_loss_mlp": 0.27734375, + "step": 4271, + "time_per_iteration": 3.138782024383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067361, + "balance_loss_mlp": 1.0394659, + "epoch": 0.8218545594459408, + "flos": 514203849216.0, + "grad_norm": 0.05826092076561135, + "language_loss": 0.81998187, + "learning_rate": 8.094207403873998e-05, + "loss": 0.83065546, + "num_input_tokens_seen": 354819408, + "router_z_loss_mlp": 0.27929688, + "step": 4272, + "time_per_iteration": 2.597888231277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068521, + "balance_loss_mlp": 1.03998256, + "epoch": 0.8220469411312044, + "flos": 494282221056.0, + "grad_norm": 0.05026815750554843, + "language_loss": 0.86299402, + "learning_rate": 8.077221147362829e-05, + "loss": 0.87367922, + "num_input_tokens_seen": 354887376, + "router_z_loss_mlp": 0.28515625, + "step": 4273, + "time_per_iteration": 2.562731981277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067588, + "balance_loss_mlp": 1.03883505, + "epoch": 0.8222393228164678, + "flos": 386223579648.0, + "grad_norm": 0.07057858042680534, + "language_loss": 0.89472818, + "learning_rate": 8.060251166717835e-05, + "loss": 0.90540403, + "num_input_tokens_seen": 354948288, + "router_z_loss_mlp": 0.28710938, + "step": 4274, + "time_per_iteration": 2.3851494789123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072526, + "balance_loss_mlp": 1.0439868, + "epoch": 0.8224317045017314, + "flos": 536331234816.0, + "grad_norm": 0.057023216193292044, + "language_loss": 0.87000436, + "learning_rate": 8.043297468527383e-05, + "loss": 0.88072956, + "num_input_tokens_seen": 355016912, + "router_z_loss_mlp": 0.28588867, + "step": 4275, + "time_per_iteration": 2.6285390853881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067324, + "balance_loss_mlp": 1.03897595, + "epoch": 0.822624086186995, + "flos": 554637832704.0, + "grad_norm": 0.060348854107393414, + "language_loss": 0.82261753, + "learning_rate": 8.02636005937346e-05, + "loss": 0.83329076, + "num_input_tokens_seen": 355085936, + "router_z_loss_mlp": 0.28369141, + "step": 4276, + "time_per_iteration": 2.6405022144317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064105, + "balance_loss_mlp": 1.03544676, + "epoch": 0.8228164678722586, + "flos": 539296455168.0, + "grad_norm": 0.060894679283369814, + "language_loss": 0.79943031, + "learning_rate": 8.009438945831771e-05, + "loss": 0.81007135, + "num_input_tokens_seen": 355161984, + "router_z_loss_mlp": 0.28637695, + "step": 4277, + "time_per_iteration": 2.6903491020202637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069836, + "balance_loss_mlp": 1.04134488, + "epoch": 0.8230088495575221, + "flos": 473001638400.0, + "grad_norm": 0.06253294625851578, + "language_loss": 0.7949158, + "learning_rate": 7.992534134471641e-05, + "loss": 0.80561417, + "num_input_tokens_seen": 355234544, + "router_z_loss_mlp": 0.28515625, + "step": 4278, + "time_per_iteration": 2.727847099304199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068873, + "balance_loss_mlp": 1.04066813, + "epoch": 0.8232012312427857, + "flos": 591403797504.0, + "grad_norm": 0.07862072734011541, + "language_loss": 0.82629663, + "learning_rate": 7.975645631856127e-05, + "loss": 0.83698535, + "num_input_tokens_seen": 355302896, + "router_z_loss_mlp": 0.28222656, + "step": 4279, + "time_per_iteration": 2.7080447673797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067816, + "balance_loss_mlp": 1.03942037, + "epoch": 0.8233936129280492, + "flos": 572359495680.0, + "grad_norm": 0.05419892783143061, + "language_loss": 0.74572438, + "learning_rate": 7.958773444541916e-05, + "loss": 0.75640255, + "num_input_tokens_seen": 355377040, + "router_z_loss_mlp": 0.28417969, + "step": 4280, + "time_per_iteration": 2.7673287391662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071165, + "balance_loss_mlp": 1.04324651, + "epoch": 0.8235859946133128, + "flos": 730986052608.0, + "grad_norm": 0.05042929375958854, + "language_loss": 0.78113925, + "learning_rate": 7.941917579079383e-05, + "loss": 0.79185092, + "num_input_tokens_seen": 355461616, + "router_z_loss_mlp": 0.27905273, + "step": 4281, + "time_per_iteration": 3.041469097137451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070743, + "balance_loss_mlp": 1.04334915, + "epoch": 0.8237783762985764, + "flos": 570044639232.0, + "grad_norm": 0.0829894991194988, + "language_loss": 0.81421649, + "learning_rate": 7.92507804201253e-05, + "loss": 0.82492399, + "num_input_tokens_seen": 355532480, + "router_z_loss_mlp": 0.27416992, + "step": 4282, + "time_per_iteration": 2.722827434539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021268, + "balance_loss_mlp": 1.01049173, + "epoch": 0.8239707579838399, + "flos": 1465437740544.0, + "grad_norm": 0.01007107364223027, + "language_loss": 0.75297678, + "learning_rate": 7.908254839879092e-05, + "loss": 0.76318944, + "num_input_tokens_seen": 355768752, + "router_z_loss_mlp": 0.10791016, + "step": 4283, + "time_per_iteration": 5.00859522819519 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064022, + "balance_loss_mlp": 1.0352931, + "epoch": 0.8241631396691035, + "flos": 467068225536.0, + "grad_norm": 0.060969567614712394, + "language_loss": 0.80811769, + "learning_rate": 7.89144797921037e-05, + "loss": 0.81875789, + "num_input_tokens_seen": 355838800, + "router_z_loss_mlp": 0.28710938, + "step": 4284, + "time_per_iteration": 2.6598501205444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019005, + "balance_loss_mlp": 1.0081805, + "epoch": 0.8243555213543671, + "flos": 1538648165376.0, + "grad_norm": 0.008520908509729544, + "language_loss": 0.77934271, + "learning_rate": 7.874657466531388e-05, + "loss": 0.78953278, + "num_input_tokens_seen": 356069280, + "router_z_loss_mlp": 0.10839844, + "step": 4285, + "time_per_iteration": 4.975403308868408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106329, + "balance_loss_mlp": 1.03549051, + "epoch": 0.8245479030396307, + "flos": 797072845824.0, + "grad_norm": 0.046519887355449104, + "language_loss": 0.82528639, + "learning_rate": 7.85788330836078e-05, + "loss": 0.83591926, + "num_input_tokens_seen": 356164528, + "router_z_loss_mlp": 0.27807617, + "step": 4286, + "time_per_iteration": 3.1330010890960693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068205, + "balance_loss_mlp": 1.03985691, + "epoch": 0.8247402847248941, + "flos": 645793910784.0, + "grad_norm": 0.05584365846418652, + "language_loss": 0.76650226, + "learning_rate": 7.841125511210878e-05, + "loss": 0.77718425, + "num_input_tokens_seen": 356243600, + "router_z_loss_mlp": 0.28344727, + "step": 4287, + "time_per_iteration": 2.874102830886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067715, + "balance_loss_mlp": 1.03965342, + "epoch": 0.8249326664101577, + "flos": 604123218432.0, + "grad_norm": 0.046467705900978235, + "language_loss": 0.79150665, + "learning_rate": 7.824384081587637e-05, + "loss": 0.80218387, + "num_input_tokens_seen": 356320320, + "router_z_loss_mlp": 0.28076172, + "step": 4288, + "time_per_iteration": 2.766347646713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071486, + "balance_loss_mlp": 1.04344761, + "epoch": 0.8251250480954213, + "flos": 824006034432.0, + "grad_norm": 0.07598367215213916, + "language_loss": 0.85994101, + "learning_rate": 7.807659025990637e-05, + "loss": 0.87065583, + "num_input_tokens_seen": 356406928, + "router_z_loss_mlp": 0.28076172, + "step": 4289, + "time_per_iteration": 3.083522319793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066482, + "balance_loss_mlp": 1.03877819, + "epoch": 0.8253174297806849, + "flos": 757060853760.0, + "grad_norm": 0.06810151606712053, + "language_loss": 0.78171742, + "learning_rate": 7.790950350913112e-05, + "loss": 0.79238224, + "num_input_tokens_seen": 356481456, + "router_z_loss_mlp": 0.27758789, + "step": 4290, + "time_per_iteration": 2.9262516498565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068204, + "balance_loss_mlp": 1.03983259, + "epoch": 0.8255098114659485, + "flos": 794090096640.0, + "grad_norm": 0.050696526133381645, + "language_loss": 0.87615943, + "learning_rate": 7.774258062841971e-05, + "loss": 0.88684154, + "num_input_tokens_seen": 356568736, + "router_z_loss_mlp": 0.28369141, + "step": 4291, + "time_per_iteration": 3.1552226543426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066383, + "balance_loss_mlp": 1.03846407, + "epoch": 0.825702193151212, + "flos": 710102730240.0, + "grad_norm": 0.05400695782122637, + "language_loss": 0.7710315, + "learning_rate": 7.757582168257731e-05, + "loss": 0.78169525, + "num_input_tokens_seen": 356643328, + "router_z_loss_mlp": 0.27954102, + "step": 4292, + "time_per_iteration": 2.874351739883423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066694, + "balance_loss_mlp": 1.03920412, + "epoch": 0.8258945748364755, + "flos": 683076409344.0, + "grad_norm": 0.05651405628127392, + "language_loss": 0.80610162, + "learning_rate": 7.740922673634537e-05, + "loss": 0.81676853, + "num_input_tokens_seen": 356723824, + "router_z_loss_mlp": 0.27514648, + "step": 4293, + "time_per_iteration": 2.913649559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064767, + "balance_loss_mlp": 1.03641856, + "epoch": 0.8260869565217391, + "flos": 594284649984.0, + "grad_norm": 0.0655769338996001, + "language_loss": 0.79001105, + "learning_rate": 7.724279585440186e-05, + "loss": 0.8006587, + "num_input_tokens_seen": 356796512, + "router_z_loss_mlp": 0.28369141, + "step": 4294, + "time_per_iteration": 2.6959924697875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106916, + "balance_loss_mlp": 1.0408597, + "epoch": 0.8262793382070027, + "flos": 651189030912.0, + "grad_norm": 0.06271254598374965, + "language_loss": 0.85122335, + "learning_rate": 7.707652910136098e-05, + "loss": 0.86191493, + "num_input_tokens_seen": 356868624, + "router_z_loss_mlp": 0.28320312, + "step": 4295, + "time_per_iteration": 2.778247594833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106329, + "balance_loss_mlp": 1.03472757, + "epoch": 0.8264717198922663, + "flos": 538665030144.0, + "grad_norm": 0.06229356932536235, + "language_loss": 0.84610021, + "learning_rate": 7.691042654177315e-05, + "loss": 0.85673308, + "num_input_tokens_seen": 356934368, + "router_z_loss_mlp": 0.28564453, + "step": 4296, + "time_per_iteration": 2.631758689880371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066177, + "balance_loss_mlp": 1.0383538, + "epoch": 0.8266641015775298, + "flos": 538689761280.0, + "grad_norm": 0.05860018207960959, + "language_loss": 0.75458044, + "learning_rate": 7.674448824012514e-05, + "loss": 0.76524222, + "num_input_tokens_seen": 357005536, + "router_z_loss_mlp": 0.27807617, + "step": 4297, + "time_per_iteration": 2.6441447734832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066814, + "balance_loss_mlp": 1.03894281, + "epoch": 0.8268564832627934, + "flos": 585077506560.0, + "grad_norm": 0.10598149445543782, + "language_loss": 0.83691001, + "learning_rate": 7.657871426083979e-05, + "loss": 0.84757817, + "num_input_tokens_seen": 357082160, + "router_z_loss_mlp": 0.27905273, + "step": 4298, + "time_per_iteration": 2.7704553604125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063706, + "balance_loss_mlp": 1.0360496, + "epoch": 0.827048864948057, + "flos": 430434680832.0, + "grad_norm": 0.06384628613684656, + "language_loss": 0.84164608, + "learning_rate": 7.641310466827667e-05, + "loss": 0.85228312, + "num_input_tokens_seen": 357146928, + "router_z_loss_mlp": 0.27685547, + "step": 4299, + "time_per_iteration": 2.4719276428222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066203, + "balance_loss_mlp": 1.03866601, + "epoch": 0.8272412466333205, + "flos": 1387915181568.0, + "grad_norm": 0.05066688700219157, + "language_loss": 0.85216463, + "learning_rate": 7.624765952673069e-05, + "loss": 0.86282665, + "num_input_tokens_seen": 357236768, + "router_z_loss_mlp": 0.27563477, + "step": 4300, + "time_per_iteration": 3.7521331310272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066598, + "balance_loss_mlp": 1.03889418, + "epoch": 0.827433628318584, + "flos": 537952057344.0, + "grad_norm": 0.054637515745130344, + "language_loss": 0.82762563, + "learning_rate": 7.608237890043335e-05, + "loss": 0.83829165, + "num_input_tokens_seen": 357307568, + "router_z_loss_mlp": 0.27734375, + "step": 4301, + "time_per_iteration": 2.718935966491699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069831, + "balance_loss_mlp": 1.04172134, + "epoch": 0.8276260100038476, + "flos": 730404089856.0, + "grad_norm": 0.062402863690690924, + "language_loss": 0.77286649, + "learning_rate": 7.59172628535526e-05, + "loss": 0.78356481, + "num_input_tokens_seen": 357387712, + "router_z_loss_mlp": 0.28125, + "step": 4302, + "time_per_iteration": 2.979245185852051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069858, + "balance_loss_mlp": 1.04239202, + "epoch": 0.8278183916891112, + "flos": 870713874432.0, + "grad_norm": 0.0506617431069229, + "language_loss": 0.82704937, + "learning_rate": 7.575231145019196e-05, + "loss": 0.83774793, + "num_input_tokens_seen": 357473360, + "router_z_loss_mlp": 0.27490234, + "step": 4303, + "time_per_iteration": 3.2166168689727783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067752, + "balance_loss_mlp": 1.04016745, + "epoch": 0.8280107733743748, + "flos": 594255536640.0, + "grad_norm": 0.04830635372046053, + "language_loss": 0.77627051, + "learning_rate": 7.558752475439134e-05, + "loss": 0.78694797, + "num_input_tokens_seen": 357548432, + "router_z_loss_mlp": 0.27612305, + "step": 4304, + "time_per_iteration": 2.784526824951172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074219, + "balance_loss_mlp": 1.04625297, + "epoch": 0.8282031550596384, + "flos": 768253994496.0, + "grad_norm": 0.06238860390142307, + "language_loss": 0.84069538, + "learning_rate": 7.542290283012653e-05, + "loss": 0.85143757, + "num_input_tokens_seen": 357625968, + "router_z_loss_mlp": 0.27978516, + "step": 4305, + "time_per_iteration": 3.015488624572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064963, + "balance_loss_mlp": 1.03675771, + "epoch": 0.8283955367449019, + "flos": 695775481344.0, + "grad_norm": 0.05683033196778672, + "language_loss": 0.77687621, + "learning_rate": 7.525844574130947e-05, + "loss": 0.78752589, + "num_input_tokens_seen": 357705824, + "router_z_loss_mlp": 0.28222656, + "step": 4306, + "time_per_iteration": 2.9644808769226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066712, + "balance_loss_mlp": 1.03919816, + "epoch": 0.8285879184301654, + "flos": 660304452096.0, + "grad_norm": 0.06215000066971459, + "language_loss": 0.82671452, + "learning_rate": 7.509415355178806e-05, + "loss": 0.83738166, + "num_input_tokens_seen": 357787040, + "router_z_loss_mlp": 0.27514648, + "step": 4307, + "time_per_iteration": 2.9103474617004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071596, + "balance_loss_mlp": 1.04320002, + "epoch": 0.828780300115429, + "flos": 558444063744.0, + "grad_norm": 0.06487976021582191, + "language_loss": 0.77909887, + "learning_rate": 7.493002632534618e-05, + "loss": 0.78981483, + "num_input_tokens_seen": 357856960, + "router_z_loss_mlp": 0.28417969, + "step": 4308, + "time_per_iteration": 2.667210340499878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067943, + "balance_loss_mlp": 1.03940439, + "epoch": 0.8289726818006926, + "flos": 830613132288.0, + "grad_norm": 0.05657563872509185, + "language_loss": 0.81739187, + "learning_rate": 7.476606412570352e-05, + "loss": 0.82807136, + "num_input_tokens_seen": 357937760, + "router_z_loss_mlp": 0.28540039, + "step": 4309, + "time_per_iteration": 3.112323760986328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068169, + "balance_loss_mlp": 1.04036903, + "epoch": 0.8291650634859561, + "flos": 731974040064.0, + "grad_norm": 0.06578058701317972, + "language_loss": 0.81024778, + "learning_rate": 7.460226701651624e-05, + "loss": 0.82092953, + "num_input_tokens_seen": 358012480, + "router_z_loss_mlp": 0.27807617, + "step": 4310, + "time_per_iteration": 2.8983981609344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106771, + "balance_loss_mlp": 1.03902817, + "epoch": 0.8293574451712197, + "flos": 860521715712.0, + "grad_norm": 0.047369684545673044, + "language_loss": 0.81142193, + "learning_rate": 7.443863506137566e-05, + "loss": 0.82209897, + "num_input_tokens_seen": 358100720, + "router_z_loss_mlp": 0.28662109, + "step": 4311, + "time_per_iteration": 3.1817171573638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068307, + "balance_loss_mlp": 1.04048347, + "epoch": 0.8295498268564833, + "flos": 494874358272.0, + "grad_norm": 0.047477241670426974, + "language_loss": 0.81896996, + "learning_rate": 7.427516832380948e-05, + "loss": 0.82965302, + "num_input_tokens_seen": 358180496, + "router_z_loss_mlp": 0.27856445, + "step": 4312, + "time_per_iteration": 2.823458671569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067657, + "balance_loss_mlp": 1.04007173, + "epoch": 0.8297422085417469, + "flos": 554176553472.0, + "grad_norm": 0.05048838223449801, + "language_loss": 0.77711129, + "learning_rate": 7.4111866867281e-05, + "loss": 0.78778785, + "num_input_tokens_seen": 358261104, + "router_z_loss_mlp": 0.27612305, + "step": 4313, + "time_per_iteration": 2.7841291427612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064359, + "balance_loss_mlp": 1.03624964, + "epoch": 0.8299345902270104, + "flos": 1247001523200.0, + "grad_norm": 0.053354105207562584, + "language_loss": 0.77411175, + "learning_rate": 7.39487307551896e-05, + "loss": 0.78475529, + "num_input_tokens_seen": 358356368, + "router_z_loss_mlp": 0.28100586, + "step": 4314, + "time_per_iteration": 3.7357640266418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071472, + "balance_loss_mlp": 1.04350495, + "epoch": 0.8301269719122739, + "flos": 584974199808.0, + "grad_norm": 0.06431532292793385, + "language_loss": 0.83130819, + "learning_rate": 7.378576005087034e-05, + "loss": 0.8420229, + "num_input_tokens_seen": 358429104, + "router_z_loss_mlp": 0.2800293, + "step": 4315, + "time_per_iteration": 2.7655749320983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065082, + "balance_loss_mlp": 1.03692484, + "epoch": 0.8303193535975375, + "flos": 509472239616.0, + "grad_norm": 0.05482661069569197, + "language_loss": 0.85277319, + "learning_rate": 7.362295481759412e-05, + "loss": 0.863424, + "num_input_tokens_seen": 358501344, + "router_z_loss_mlp": 0.28198242, + "step": 4316, + "time_per_iteration": 2.6888644695281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065395, + "balance_loss_mlp": 1.03711891, + "epoch": 0.8305117352828011, + "flos": 580375010304.0, + "grad_norm": 0.06137401051825932, + "language_loss": 0.83732426, + "learning_rate": 7.346031511856722e-05, + "loss": 0.84797823, + "num_input_tokens_seen": 358575584, + "router_z_loss_mlp": 0.28271484, + "step": 4317, + "time_per_iteration": 2.73391056060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106906, + "balance_loss_mlp": 1.04035425, + "epoch": 0.8307041169680647, + "flos": 481372153344.0, + "grad_norm": 0.368897655418688, + "language_loss": 0.78677309, + "learning_rate": 7.329784101693232e-05, + "loss": 0.79746372, + "num_input_tokens_seen": 358644304, + "router_z_loss_mlp": 0.28686523, + "step": 4318, + "time_per_iteration": 2.6239781379699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071574, + "balance_loss_mlp": 1.04284477, + "epoch": 0.8308964986533282, + "flos": 624319861248.0, + "grad_norm": 0.05860908770024719, + "language_loss": 0.83063138, + "learning_rate": 7.313553257576727e-05, + "loss": 0.84134716, + "num_input_tokens_seen": 358712384, + "router_z_loss_mlp": 0.28662109, + "step": 4319, + "time_per_iteration": 2.7097573280334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068434, + "balance_loss_mlp": 1.04025316, + "epoch": 0.8310888803385917, + "flos": 826974226944.0, + "grad_norm": 0.06711883496181308, + "language_loss": 0.78550565, + "learning_rate": 7.297338985808589e-05, + "loss": 0.79618996, + "num_input_tokens_seen": 358789264, + "router_z_loss_mlp": 0.28222656, + "step": 4320, + "time_per_iteration": 3.0357778072357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107015, + "balance_loss_mlp": 1.0416826, + "epoch": 0.8312812620238553, + "flos": 583443537408.0, + "grad_norm": 0.05319992693282762, + "language_loss": 0.81702912, + "learning_rate": 7.281141292683746e-05, + "loss": 0.82773066, + "num_input_tokens_seen": 358868976, + "router_z_loss_mlp": 0.28491211, + "step": 4321, + "time_per_iteration": 2.8347558975219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070933, + "balance_loss_mlp": 1.04227519, + "epoch": 0.8314736437091189, + "flos": 1115165560320.0, + "grad_norm": 0.06107038935899217, + "language_loss": 0.74773026, + "learning_rate": 7.26496018449071e-05, + "loss": 0.75843954, + "num_input_tokens_seen": 358953600, + "router_z_loss_mlp": 0.28613281, + "step": 4322, + "time_per_iteration": 3.407073497772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071357, + "balance_loss_mlp": 1.04262769, + "epoch": 0.8316660253943825, + "flos": 517295697408.0, + "grad_norm": 0.07290266812750479, + "language_loss": 0.8181231, + "learning_rate": 7.248795667511543e-05, + "loss": 0.82883668, + "num_input_tokens_seen": 359028768, + "router_z_loss_mlp": 0.28710938, + "step": 4323, + "time_per_iteration": 2.848313093185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070108, + "balance_loss_mlp": 1.04257011, + "epoch": 0.831858407079646, + "flos": 794989334016.0, + "grad_norm": 0.05477920158119857, + "language_loss": 0.78024107, + "learning_rate": 7.232647748021864e-05, + "loss": 0.79094219, + "num_input_tokens_seen": 359116208, + "router_z_loss_mlp": 0.27563477, + "step": 4324, + "time_per_iteration": 3.032369375228882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076179, + "balance_loss_mlp": 1.048141, + "epoch": 0.8320507887649096, + "flos": 549699609600.0, + "grad_norm": 0.05911320807574519, + "language_loss": 0.82844627, + "learning_rate": 7.216516432290843e-05, + "loss": 0.83920801, + "num_input_tokens_seen": 359189552, + "router_z_loss_mlp": 0.28076172, + "step": 4325, + "time_per_iteration": 2.675576686859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074419, + "balance_loss_mlp": 1.04580855, + "epoch": 0.8322431704501732, + "flos": 479160603648.0, + "grad_norm": 0.06505000909529828, + "language_loss": 0.81961429, + "learning_rate": 7.20040172658123e-05, + "loss": 0.83035839, + "num_input_tokens_seen": 359253008, + "router_z_loss_mlp": 0.28588867, + "step": 4326, + "time_per_iteration": 2.6014811992645264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072863, + "balance_loss_mlp": 1.04430079, + "epoch": 0.8324355521354367, + "flos": 572157264384.0, + "grad_norm": 0.04659300495959616, + "language_loss": 0.8545717, + "learning_rate": 7.184303637149308e-05, + "loss": 0.86530042, + "num_input_tokens_seen": 359326368, + "router_z_loss_mlp": 0.28564453, + "step": 4327, + "time_per_iteration": 2.686389446258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070747, + "balance_loss_mlp": 1.04311395, + "epoch": 0.8326279338207002, + "flos": 503208557568.0, + "grad_norm": 0.0509990045281191, + "language_loss": 0.82115221, + "learning_rate": 7.168222170244888e-05, + "loss": 0.83185971, + "num_input_tokens_seen": 359394192, + "router_z_loss_mlp": 0.27685547, + "step": 4328, + "time_per_iteration": 2.6402134895324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070183, + "balance_loss_mlp": 1.04157257, + "epoch": 0.8328203155059638, + "flos": 605442885120.0, + "grad_norm": 0.04952821718361573, + "language_loss": 0.80924785, + "learning_rate": 7.152157332111364e-05, + "loss": 0.81994963, + "num_input_tokens_seen": 359476016, + "router_z_loss_mlp": 0.28588867, + "step": 4329, + "time_per_iteration": 2.9259705543518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068872, + "balance_loss_mlp": 1.04033327, + "epoch": 0.8330126971912274, + "flos": 697469087232.0, + "grad_norm": 0.04841901744892354, + "language_loss": 0.85735106, + "learning_rate": 7.136109128985663e-05, + "loss": 0.86803973, + "num_input_tokens_seen": 359554048, + "router_z_loss_mlp": 0.28564453, + "step": 4330, + "time_per_iteration": 2.9183027744293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107316, + "balance_loss_mlp": 1.0450027, + "epoch": 0.833205078876491, + "flos": 493799182848.0, + "grad_norm": 0.054568548047455055, + "language_loss": 0.86655569, + "learning_rate": 7.120077567098249e-05, + "loss": 0.87728733, + "num_input_tokens_seen": 359621440, + "router_z_loss_mlp": 0.28149414, + "step": 4331, + "time_per_iteration": 2.5831360816955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069703, + "balance_loss_mlp": 1.04176021, + "epoch": 0.8333974605617546, + "flos": 482568164352.0, + "grad_norm": 0.055811576976186876, + "language_loss": 0.8251605, + "learning_rate": 7.104062652673115e-05, + "loss": 0.83585751, + "num_input_tokens_seen": 359690320, + "router_z_loss_mlp": 0.27954102, + "step": 4332, + "time_per_iteration": 2.5941505432128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070936, + "balance_loss_mlp": 1.0433507, + "epoch": 0.833589842247018, + "flos": 686517465600.0, + "grad_norm": 0.06675763221573856, + "language_loss": 0.82810611, + "learning_rate": 7.088064391927818e-05, + "loss": 0.83881545, + "num_input_tokens_seen": 359759888, + "router_z_loss_mlp": 0.27612305, + "step": 4333, + "time_per_iteration": 2.8070662021636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071446, + "balance_loss_mlp": 1.04231119, + "epoch": 0.8337822239322816, + "flos": 881377486848.0, + "grad_norm": 0.06204820087732955, + "language_loss": 0.82370806, + "learning_rate": 7.072082791073419e-05, + "loss": 0.83442253, + "num_input_tokens_seen": 359836544, + "router_z_loss_mlp": 0.29101562, + "step": 4334, + "time_per_iteration": 3.121647834777832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106899, + "balance_loss_mlp": 1.0413332, + "epoch": 0.8339746056175452, + "flos": 496940493312.0, + "grad_norm": 0.0625443757441557, + "language_loss": 0.8238197, + "learning_rate": 7.056117856314531e-05, + "loss": 0.83450961, + "num_input_tokens_seen": 359903024, + "router_z_loss_mlp": 0.27685547, + "step": 4335, + "time_per_iteration": 2.6120407581329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074986, + "balance_loss_mlp": 1.04601824, + "epoch": 0.8341669873028088, + "flos": 510244849152.0, + "grad_norm": 0.06721642404221422, + "language_loss": 0.86205637, + "learning_rate": 7.040169593849289e-05, + "loss": 0.87280619, + "num_input_tokens_seen": 359971200, + "router_z_loss_mlp": 0.28979492, + "step": 4336, + "time_per_iteration": 2.663907289505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072023, + "balance_loss_mlp": 1.04348373, + "epoch": 0.8343593689880723, + "flos": 692017302528.0, + "grad_norm": 0.06048352118494476, + "language_loss": 0.84131467, + "learning_rate": 7.024238009869366e-05, + "loss": 0.85203493, + "num_input_tokens_seen": 360042560, + "router_z_loss_mlp": 0.28540039, + "step": 4337, + "time_per_iteration": 2.83551287651062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074517, + "balance_loss_mlp": 1.04602623, + "epoch": 0.8345517506733359, + "flos": 552132329472.0, + "grad_norm": 0.07231250032753044, + "language_loss": 0.78381979, + "learning_rate": 7.008323110559956e-05, + "loss": 0.79456496, + "num_input_tokens_seen": 360118048, + "router_z_loss_mlp": 0.28491211, + "step": 4338, + "time_per_iteration": 2.792090654373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073321, + "balance_loss_mlp": 1.04401958, + "epoch": 0.8347441323585995, + "flos": 591750033408.0, + "grad_norm": 0.05928271157327828, + "language_loss": 0.76391554, + "learning_rate": 6.992424902099754e-05, + "loss": 0.77464879, + "num_input_tokens_seen": 360192528, + "router_z_loss_mlp": 0.29296875, + "step": 4339, + "time_per_iteration": 2.8094851970672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071335, + "balance_loss_mlp": 1.04346359, + "epoch": 0.834936514043863, + "flos": 614625297408.0, + "grad_norm": 0.08334347707601203, + "language_loss": 0.84719282, + "learning_rate": 6.976543390660983e-05, + "loss": 0.85790616, + "num_input_tokens_seen": 360266880, + "router_z_loss_mlp": 0.27905273, + "step": 4340, + "time_per_iteration": 2.7984797954559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068548, + "balance_loss_mlp": 1.04096282, + "epoch": 0.8351288957291266, + "flos": 467590551552.0, + "grad_norm": 0.05919982272479659, + "language_loss": 0.79683816, + "learning_rate": 6.960678582409424e-05, + "loss": 0.80752361, + "num_input_tokens_seen": 360336336, + "router_z_loss_mlp": 0.27612305, + "step": 4341, + "time_per_iteration": 2.6437861919403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068887, + "balance_loss_mlp": 1.04046774, + "epoch": 0.8353212774143901, + "flos": 509063394816.0, + "grad_norm": 0.05870432477932672, + "language_loss": 0.78877068, + "learning_rate": 6.944830483504328e-05, + "loss": 0.79945958, + "num_input_tokens_seen": 360409776, + "router_z_loss_mlp": 0.28417969, + "step": 4342, + "time_per_iteration": 2.666900157928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068798, + "balance_loss_mlp": 1.04121327, + "epoch": 0.8355136590996537, + "flos": 687477749760.0, + "grad_norm": 0.05888286602994688, + "language_loss": 0.80899429, + "learning_rate": 6.928999100098483e-05, + "loss": 0.81968236, + "num_input_tokens_seen": 360486800, + "router_z_loss_mlp": 0.27612305, + "step": 4343, + "time_per_iteration": 2.825812339782715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070236, + "balance_loss_mlp": 1.04217434, + "epoch": 0.8357060407849173, + "flos": 984019249152.0, + "grad_norm": 0.07015017683216763, + "language_loss": 0.83694071, + "learning_rate": 6.913184438338138e-05, + "loss": 0.84764308, + "num_input_tokens_seen": 360568624, + "router_z_loss_mlp": 0.28076172, + "step": 4344, + "time_per_iteration": 3.2398900985717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071314, + "balance_loss_mlp": 1.04315686, + "epoch": 0.8358984224701809, + "flos": 842657458176.0, + "grad_norm": 0.04900467059707895, + "language_loss": 0.8505708, + "learning_rate": 6.89738650436313e-05, + "loss": 0.86128396, + "num_input_tokens_seen": 360652384, + "router_z_loss_mlp": 0.28149414, + "step": 4345, + "time_per_iteration": 3.166189432144165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071828, + "balance_loss_mlp": 1.04424298, + "epoch": 0.8360908041554445, + "flos": 625945065984.0, + "grad_norm": 0.05480008181708294, + "language_loss": 0.81788313, + "learning_rate": 6.881605304306748e-05, + "loss": 0.82860136, + "num_input_tokens_seen": 360723200, + "router_z_loss_mlp": 0.27612305, + "step": 4346, + "time_per_iteration": 2.732534170150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067997, + "balance_loss_mlp": 1.03967237, + "epoch": 0.8362831858407079, + "flos": 575781613056.0, + "grad_norm": 0.05694009909818929, + "language_loss": 0.84824663, + "learning_rate": 6.865840844295796e-05, + "loss": 0.85892665, + "num_input_tokens_seen": 360798240, + "router_z_loss_mlp": 0.28344727, + "step": 4347, + "time_per_iteration": 2.7295114994049072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068483, + "balance_loss_mlp": 1.03946793, + "epoch": 0.8364755675259715, + "flos": 833434348032.0, + "grad_norm": 0.07161579074567852, + "language_loss": 0.80623019, + "learning_rate": 6.850093130450569e-05, + "loss": 0.81691504, + "num_input_tokens_seen": 360873552, + "router_z_loss_mlp": 0.29003906, + "step": 4348, + "time_per_iteration": 3.0577757358551025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070469, + "balance_loss_mlp": 1.04193068, + "epoch": 0.8366679492112351, + "flos": 582211210752.0, + "grad_norm": 0.05716211740110942, + "language_loss": 0.86482334, + "learning_rate": 6.834362168884912e-05, + "loss": 0.8755281, + "num_input_tokens_seen": 360940800, + "router_z_loss_mlp": 0.28540039, + "step": 4349, + "time_per_iteration": 2.68066143989563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069343, + "balance_loss_mlp": 1.04018426, + "epoch": 0.8368603308964987, + "flos": 611434524672.0, + "grad_norm": 0.061462223772575715, + "language_loss": 0.87587225, + "learning_rate": 6.818647965706076e-05, + "loss": 0.88656569, + "num_input_tokens_seen": 361014368, + "router_z_loss_mlp": 0.29125977, + "step": 4350, + "time_per_iteration": 2.7892367839813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107107, + "balance_loss_mlp": 1.04310322, + "epoch": 0.8370527125817622, + "flos": 507014788608.0, + "grad_norm": 0.05225473338782787, + "language_loss": 0.8561269, + "learning_rate": 6.802950527014884e-05, + "loss": 0.86683762, + "num_input_tokens_seen": 361087184, + "router_z_loss_mlp": 0.2800293, + "step": 4351, + "time_per_iteration": 2.7321066856384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066463, + "balance_loss_mlp": 1.03787637, + "epoch": 0.8372450942670258, + "flos": 770621285376.0, + "grad_norm": 0.049979512165668406, + "language_loss": 0.825046, + "learning_rate": 6.787269858905603e-05, + "loss": 0.83571064, + "num_input_tokens_seen": 361160720, + "router_z_loss_mlp": 0.28564453, + "step": 4352, + "time_per_iteration": 2.9381721019744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070418, + "balance_loss_mlp": 1.04192686, + "epoch": 0.8374374759522893, + "flos": 579005881344.0, + "grad_norm": 0.053029874390874816, + "language_loss": 0.84654623, + "learning_rate": 6.771605967466033e-05, + "loss": 0.85725045, + "num_input_tokens_seen": 361234432, + "router_z_loss_mlp": 0.28491211, + "step": 4353, + "time_per_iteration": 2.691183090209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066821, + "balance_loss_mlp": 1.03847289, + "epoch": 0.8376298576375529, + "flos": 787781334528.0, + "grad_norm": 0.08828757782414506, + "language_loss": 0.82668114, + "learning_rate": 6.755958858777434e-05, + "loss": 0.83734941, + "num_input_tokens_seen": 361309376, + "router_z_loss_mlp": 0.28344727, + "step": 4354, + "time_per_iteration": 2.9823262691497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067001, + "balance_loss_mlp": 1.03827119, + "epoch": 0.8378222393228165, + "flos": 577337006592.0, + "grad_norm": 0.05380745974011456, + "language_loss": 0.80749297, + "learning_rate": 6.74032853891452e-05, + "loss": 0.81816292, + "num_input_tokens_seen": 361386768, + "router_z_loss_mlp": 0.28710938, + "step": 4355, + "time_per_iteration": 2.7626209259033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067012, + "balance_loss_mlp": 1.03883111, + "epoch": 0.83801462100808, + "flos": 480618482688.0, + "grad_norm": 0.05633813219245277, + "language_loss": 0.81979787, + "learning_rate": 6.724715013945548e-05, + "loss": 0.83046794, + "num_input_tokens_seen": 361456704, + "router_z_loss_mlp": 0.28198242, + "step": 4356, + "time_per_iteration": 2.6264963150024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069337, + "balance_loss_mlp": 1.04089344, + "epoch": 0.8382070026933436, + "flos": 550523091456.0, + "grad_norm": 0.05081476567396691, + "language_loss": 0.89207625, + "learning_rate": 6.709118289932226e-05, + "loss": 0.90276963, + "num_input_tokens_seen": 361533648, + "router_z_loss_mlp": 0.28442383, + "step": 4357, + "time_per_iteration": 2.842620849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107024, + "balance_loss_mlp": 1.04205918, + "epoch": 0.8383993843786072, + "flos": 624655922688.0, + "grad_norm": 0.07040298629212442, + "language_loss": 0.8180182, + "learning_rate": 6.693538372929725e-05, + "loss": 0.82872057, + "num_input_tokens_seen": 361614256, + "router_z_loss_mlp": 0.28198242, + "step": 4358, + "time_per_iteration": 2.916688919067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063253, + "balance_loss_mlp": 1.03504848, + "epoch": 0.8385917660638708, + "flos": 490928504832.0, + "grad_norm": 0.06176298645937789, + "language_loss": 0.86094594, + "learning_rate": 6.677975268986719e-05, + "loss": 0.87157845, + "num_input_tokens_seen": 361679008, + "router_z_loss_mlp": 0.28222656, + "step": 4359, + "time_per_iteration": 2.5380067825317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065272, + "balance_loss_mlp": 1.03675675, + "epoch": 0.8387841477491342, + "flos": 466659380736.0, + "grad_norm": 0.05670082707538084, + "language_loss": 0.86943793, + "learning_rate": 6.662428984145336e-05, + "loss": 0.88009059, + "num_input_tokens_seen": 361747600, + "router_z_loss_mlp": 0.28515625, + "step": 4360, + "time_per_iteration": 2.5779833793640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013763, + "balance_loss_mlp": 1.00289118, + "epoch": 0.8389765294343978, + "flos": 1563339128832.0, + "grad_norm": 0.010559991711123677, + "language_loss": 0.71780187, + "learning_rate": 6.646899524441175e-05, + "loss": 0.72793949, + "num_input_tokens_seen": 361983104, + "router_z_loss_mlp": 0.10888672, + "step": 4361, + "time_per_iteration": 4.992979526519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065453, + "balance_loss_mlp": 1.03729582, + "epoch": 0.8391689111196614, + "flos": 601849059840.0, + "grad_norm": 0.04961232743748672, + "language_loss": 0.8271215, + "learning_rate": 6.631386895903308e-05, + "loss": 0.83777601, + "num_input_tokens_seen": 362065824, + "router_z_loss_mlp": 0.28125, + "step": 4362, + "time_per_iteration": 2.8584516048431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064915, + "balance_loss_mlp": 1.0364244, + "epoch": 0.839361292804925, + "flos": 442818040320.0, + "grad_norm": 0.06952447203418213, + "language_loss": 0.80247456, + "learning_rate": 6.615891104554261e-05, + "loss": 0.8131237, + "num_input_tokens_seen": 362128240, + "router_z_loss_mlp": 0.28491211, + "step": 4363, + "time_per_iteration": 2.55979585647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065438, + "balance_loss_mlp": 1.03654134, + "epoch": 0.8395536744901886, + "flos": 593885979648.0, + "grad_norm": 0.05610159931926655, + "language_loss": 0.82741809, + "learning_rate": 6.600412156410057e-05, + "loss": 0.83807242, + "num_input_tokens_seen": 362198256, + "router_z_loss_mlp": 0.28881836, + "step": 4364, + "time_per_iteration": 2.7487361431121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065297, + "balance_loss_mlp": 1.03752112, + "epoch": 0.8397460561754521, + "flos": 889462812672.0, + "grad_norm": 0.05813866241406409, + "language_loss": 0.85143423, + "learning_rate": 6.58495005748016e-05, + "loss": 0.86208725, + "num_input_tokens_seen": 362279792, + "router_z_loss_mlp": 0.27783203, + "step": 4365, + "time_per_iteration": 3.1682748794555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066254, + "balance_loss_mlp": 1.03759646, + "epoch": 0.8399384378607156, + "flos": 553239590400.0, + "grad_norm": 0.056651294792781116, + "language_loss": 0.89333951, + "learning_rate": 6.569504813767463e-05, + "loss": 0.90400201, + "num_input_tokens_seen": 362351712, + "router_z_loss_mlp": 0.28637695, + "step": 4366, + "time_per_iteration": 2.639616012573242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062691, + "balance_loss_mlp": 1.03386617, + "epoch": 0.8401308195459792, + "flos": 518664826368.0, + "grad_norm": 0.04871038923450433, + "language_loss": 0.83355534, + "learning_rate": 6.554076431268341e-05, + "loss": 0.84418219, + "num_input_tokens_seen": 362423424, + "router_z_loss_mlp": 0.28808594, + "step": 4367, + "time_per_iteration": 2.6365461349487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067679, + "balance_loss_mlp": 1.03925979, + "epoch": 0.8403232012312428, + "flos": 684593925120.0, + "grad_norm": 0.053676716876516345, + "language_loss": 0.80734771, + "learning_rate": 6.538664915972648e-05, + "loss": 0.81802452, + "num_input_tokens_seen": 362514704, + "router_z_loss_mlp": 0.28417969, + "step": 4368, + "time_per_iteration": 3.066606044769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067558, + "balance_loss_mlp": 1.03925812, + "epoch": 0.8405155829165063, + "flos": 577424346624.0, + "grad_norm": 0.06042544525246531, + "language_loss": 0.77456969, + "learning_rate": 6.523270273863652e-05, + "loss": 0.78524524, + "num_input_tokens_seen": 362581296, + "router_z_loss_mlp": 0.28320312, + "step": 4369, + "time_per_iteration": 2.682929515838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064765, + "balance_loss_mlp": 1.03591669, + "epoch": 0.8407079646017699, + "flos": 456393028608.0, + "grad_norm": 0.061853619977902334, + "language_loss": 0.87804818, + "learning_rate": 6.507892510918079e-05, + "loss": 0.88869584, + "num_input_tokens_seen": 362648304, + "router_z_loss_mlp": 0.28857422, + "step": 4370, + "time_per_iteration": 2.565526008605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068309, + "balance_loss_mlp": 1.03977025, + "epoch": 0.8409003462870335, + "flos": 534647803392.0, + "grad_norm": 0.06142629372428209, + "language_loss": 0.81581974, + "learning_rate": 6.492531633106114e-05, + "loss": 0.8265028, + "num_input_tokens_seen": 362721264, + "router_z_loss_mlp": 0.28515625, + "step": 4371, + "time_per_iteration": 2.7487144470214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065942, + "balance_loss_mlp": 1.03757024, + "epoch": 0.8410927279722971, + "flos": 556475443200.0, + "grad_norm": 0.0604641524505276, + "language_loss": 0.77816391, + "learning_rate": 6.477187646391374e-05, + "loss": 0.78882331, + "num_input_tokens_seen": 362795312, + "router_z_loss_mlp": 0.28369141, + "step": 4372, + "time_per_iteration": 2.717592477798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011247, + "balance_loss_mlp": 1.00027978, + "epoch": 0.8412851096575606, + "flos": 1548963979776.0, + "grad_norm": 0.008659597915800551, + "language_loss": 0.77679121, + "learning_rate": 6.461860556730925e-05, + "loss": 0.78690368, + "num_input_tokens_seen": 363026272, + "router_z_loss_mlp": 0.10986328, + "step": 4373, + "time_per_iteration": 4.928239583969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065882, + "balance_loss_mlp": 1.03791547, + "epoch": 0.8414774913428241, + "flos": 551777329152.0, + "grad_norm": 0.06413098641466736, + "language_loss": 0.78880799, + "learning_rate": 6.446550370075271e-05, + "loss": 0.79946685, + "num_input_tokens_seen": 363098384, + "router_z_loss_mlp": 0.27978516, + "step": 4374, + "time_per_iteration": 2.7013869285583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066394, + "balance_loss_mlp": 1.0385704, + "epoch": 0.8416698730280877, + "flos": 572752373760.0, + "grad_norm": 0.061373783777205176, + "language_loss": 0.77249122, + "learning_rate": 6.431257092368336e-05, + "loss": 0.78315514, + "num_input_tokens_seen": 363170960, + "router_z_loss_mlp": 0.27832031, + "step": 4375, + "time_per_iteration": 2.693763017654419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066918, + "balance_loss_mlp": 1.03818846, + "epoch": 0.8418622547133513, + "flos": 758405251584.0, + "grad_norm": 0.06754827285553786, + "language_loss": 0.79854172, + "learning_rate": 6.415980729547543e-05, + "loss": 0.8092109, + "num_input_tokens_seen": 363242000, + "router_z_loss_mlp": 0.28710938, + "step": 4376, + "time_per_iteration": 2.9242076873779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063623, + "balance_loss_mlp": 1.03532255, + "epoch": 0.8420546363986149, + "flos": 1073717448192.0, + "grad_norm": 0.06121521331908178, + "language_loss": 0.72551686, + "learning_rate": 6.40072128754366e-05, + "loss": 0.73615307, + "num_input_tokens_seen": 363340288, + "router_z_loss_mlp": 0.28295898, + "step": 4377, + "time_per_iteration": 3.4273428916931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106422, + "balance_loss_mlp": 1.03601491, + "epoch": 0.8422470180838784, + "flos": 525632716800.0, + "grad_norm": 0.0571677989475448, + "language_loss": 0.82702553, + "learning_rate": 6.385478772280933e-05, + "loss": 0.83766776, + "num_input_tokens_seen": 363416208, + "router_z_loss_mlp": 0.28198242, + "step": 4378, + "time_per_iteration": 2.815692901611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069501, + "balance_loss_mlp": 1.04043794, + "epoch": 0.842439399769142, + "flos": 600552714240.0, + "grad_norm": 0.05646259355458583, + "language_loss": 0.82160503, + "learning_rate": 6.370253189677038e-05, + "loss": 0.83230007, + "num_input_tokens_seen": 363492864, + "router_z_loss_mlp": 0.2902832, + "step": 4379, + "time_per_iteration": 2.7401773929595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067014, + "balance_loss_mlp": 1.03783143, + "epoch": 0.8426317814544055, + "flos": 551935890432.0, + "grad_norm": 0.05535422523937343, + "language_loss": 0.86565614, + "learning_rate": 6.355044545643073e-05, + "loss": 0.87632632, + "num_input_tokens_seen": 363572000, + "router_z_loss_mlp": 0.29150391, + "step": 4380, + "time_per_iteration": 2.7968811988830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063585, + "balance_loss_mlp": 1.03526044, + "epoch": 0.8428241631396691, + "flos": 678531064320.0, + "grad_norm": 0.06626388248762789, + "language_loss": 0.77773583, + "learning_rate": 6.33985284608356e-05, + "loss": 0.78837168, + "num_input_tokens_seen": 363646480, + "router_z_loss_mlp": 0.28320312, + "step": 4381, + "time_per_iteration": 2.8227858543395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064902, + "balance_loss_mlp": 1.03676867, + "epoch": 0.8430165448249327, + "flos": 753365131776.0, + "grad_norm": 0.04710188444733999, + "language_loss": 0.79544091, + "learning_rate": 6.324678096896435e-05, + "loss": 0.80608988, + "num_input_tokens_seen": 363737552, + "router_z_loss_mlp": 0.28149414, + "step": 4382, + "time_per_iteration": 3.1052286624908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067685, + "balance_loss_mlp": 1.03943205, + "epoch": 0.8432089265101962, + "flos": 698817867264.0, + "grad_norm": 0.054966422102889954, + "language_loss": 0.8069393, + "learning_rate": 6.30952030397306e-05, + "loss": 0.81761611, + "num_input_tokens_seen": 363816016, + "router_z_loss_mlp": 0.28271484, + "step": 4383, + "time_per_iteration": 2.9371731281280518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065323, + "balance_loss_mlp": 1.03690386, + "epoch": 0.8434013081954598, + "flos": 485513035776.0, + "grad_norm": 0.061244567985189666, + "language_loss": 0.84519708, + "learning_rate": 6.294379473198208e-05, + "loss": 0.85585028, + "num_input_tokens_seen": 363888192, + "router_z_loss_mlp": 0.28393555, + "step": 4384, + "time_per_iteration": 2.651392936706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067975, + "balance_loss_mlp": 1.03957903, + "epoch": 0.8435936898807234, + "flos": 520372988928.0, + "grad_norm": 0.06241169246185324, + "language_loss": 0.85226697, + "learning_rate": 6.279255610450068e-05, + "loss": 0.86294675, + "num_input_tokens_seen": 363953904, + "router_z_loss_mlp": 0.28442383, + "step": 4385, + "time_per_iteration": 2.6139471530914307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069425, + "balance_loss_mlp": 1.04119599, + "epoch": 0.843786071565987, + "flos": 785604690432.0, + "grad_norm": 0.05527502128053877, + "language_loss": 0.80296469, + "learning_rate": 6.264148721600254e-05, + "loss": 0.81365895, + "num_input_tokens_seen": 364031552, + "router_z_loss_mlp": 0.2824707, + "step": 4386, + "time_per_iteration": 2.9919278621673584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005481, + "balance_loss_mlp": 0.9944663, + "epoch": 0.8439784532512504, + "flos": 1445472442368.0, + "grad_norm": 0.008775712178222237, + "language_loss": 0.75836509, + "learning_rate": 6.24905881251378e-05, + "loss": 0.76841992, + "num_input_tokens_seen": 364256480, + "router_z_loss_mlp": 0.11035156, + "step": 4387, + "time_per_iteration": 4.946225166320801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065228, + "balance_loss_mlp": 1.03621244, + "epoch": 0.844170834936514, + "flos": 708384393216.0, + "grad_norm": 0.057945273917654624, + "language_loss": 0.82541668, + "learning_rate": 6.23398588904906e-05, + "loss": 0.83606899, + "num_input_tokens_seen": 364329696, + "router_z_loss_mlp": 0.28955078, + "step": 4388, + "time_per_iteration": 2.8812713623046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067854, + "balance_loss_mlp": 1.03983974, + "epoch": 0.8443632166217776, + "flos": 483183622656.0, + "grad_norm": 0.057167711375516135, + "language_loss": 0.79827619, + "learning_rate": 6.218929957057922e-05, + "loss": 0.80895472, + "num_input_tokens_seen": 364400944, + "router_z_loss_mlp": 0.28027344, + "step": 4389, + "time_per_iteration": 2.6971452236175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070953, + "balance_loss_mlp": 1.04274869, + "epoch": 0.8445555983070412, + "flos": 678388469760.0, + "grad_norm": 0.12061127187216408, + "language_loss": 0.80305707, + "learning_rate": 6.2038910223856e-05, + "loss": 0.81376654, + "num_input_tokens_seen": 364475744, + "router_z_loss_mlp": 0.2824707, + "step": 4390, + "time_per_iteration": 2.8095004558563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072504, + "balance_loss_mlp": 1.04401278, + "epoch": 0.8447479799923048, + "flos": 741143305728.0, + "grad_norm": 0.05847375315335963, + "language_loss": 0.74079317, + "learning_rate": 6.18886909087073e-05, + "loss": 0.75151819, + "num_input_tokens_seen": 364557248, + "router_z_loss_mlp": 0.28466797, + "step": 4391, + "time_per_iteration": 2.9872703552246094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010687, + "balance_loss_mlp": 1.04137695, + "epoch": 0.8449403616775683, + "flos": 952897125888.0, + "grad_norm": 0.05269410537387695, + "language_loss": 0.80129778, + "learning_rate": 6.173864168345344e-05, + "loss": 0.81198478, + "num_input_tokens_seen": 364647856, + "router_z_loss_mlp": 0.2734375, + "step": 4392, + "time_per_iteration": 3.261303186416626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071231, + "balance_loss_mlp": 1.04235804, + "epoch": 0.8451327433628318, + "flos": 657054042624.0, + "grad_norm": 0.06670363382703816, + "language_loss": 0.71812409, + "learning_rate": 6.158876260634871e-05, + "loss": 0.72883642, + "num_input_tokens_seen": 364728848, + "router_z_loss_mlp": 0.28857422, + "step": 4393, + "time_per_iteration": 2.943547010421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067618, + "balance_loss_mlp": 1.04055786, + "epoch": 0.8453251250480954, + "flos": 445880775168.0, + "grad_norm": 0.0616456163749573, + "language_loss": 0.83441478, + "learning_rate": 6.143905373558112e-05, + "loss": 0.84509093, + "num_input_tokens_seen": 364794032, + "router_z_loss_mlp": 0.27124023, + "step": 4394, + "time_per_iteration": 2.5297040939331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073842, + "balance_loss_mlp": 1.04475522, + "epoch": 0.845517506733359, + "flos": 542491610112.0, + "grad_norm": 0.0736590019033433, + "language_loss": 0.70597637, + "learning_rate": 6.128951512927305e-05, + "loss": 0.7167148, + "num_input_tokens_seen": 364868624, + "router_z_loss_mlp": 0.2902832, + "step": 4395, + "time_per_iteration": 2.6587178707122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068217, + "balance_loss_mlp": 1.04053688, + "epoch": 0.8457098884186226, + "flos": 502175490048.0, + "grad_norm": 0.050987666257807054, + "language_loss": 0.84470797, + "learning_rate": 6.114014684548046e-05, + "loss": 0.85539019, + "num_input_tokens_seen": 364938208, + "router_z_loss_mlp": 0.27709961, + "step": 4396, + "time_per_iteration": 2.6455705165863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069179, + "balance_loss_mlp": 1.04106975, + "epoch": 0.8459022701038861, + "flos": 448643764224.0, + "grad_norm": 0.05256963604665797, + "language_loss": 0.79372364, + "learning_rate": 6.099094894219326e-05, + "loss": 0.80441546, + "num_input_tokens_seen": 365009440, + "router_z_loss_mlp": 0.28125, + "step": 4397, + "time_per_iteration": 2.692250967025757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068183, + "balance_loss_mlp": 1.040622, + "epoch": 0.8460946517891497, + "flos": 742855850496.0, + "grad_norm": 0.060494887314633476, + "language_loss": 0.74907923, + "learning_rate": 6.0841921477335194e-05, + "loss": 0.7597611, + "num_input_tokens_seen": 365085904, + "router_z_loss_mlp": 0.27587891, + "step": 4398, + "time_per_iteration": 2.89249849319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066532, + "balance_loss_mlp": 1.03763604, + "epoch": 0.8462870334744133, + "flos": 552939844608.0, + "grad_norm": 0.04890785740935349, + "language_loss": 0.79848468, + "learning_rate": 6.069306450876389e-05, + "loss": 0.80915004, + "num_input_tokens_seen": 365163600, + "router_z_loss_mlp": 0.28833008, + "step": 4399, + "time_per_iteration": 2.771097421646118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008457, + "balance_loss_mlp": 0.99753761, + "epoch": 0.8464794151596768, + "flos": 1564033162752.0, + "grad_norm": 0.008986072179428414, + "language_loss": 0.81708568, + "learning_rate": 6.054437809427071e-05, + "loss": 0.82717025, + "num_input_tokens_seen": 365384528, + "router_z_loss_mlp": 0.109375, + "step": 4400, + "time_per_iteration": 4.860820055007935 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065203, + "balance_loss_mlp": 1.03702164, + "epoch": 0.8466717968449403, + "flos": 549930954240.0, + "grad_norm": 0.05293623699929889, + "language_loss": 0.79682398, + "learning_rate": 6.039586229158084e-05, + "loss": 0.80747598, + "num_input_tokens_seen": 365453760, + "router_z_loss_mlp": 0.28222656, + "step": 4401, + "time_per_iteration": 2.6743388175964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067709, + "balance_loss_mlp": 1.03919387, + "epoch": 0.8468641785302039, + "flos": 551625970176.0, + "grad_norm": 0.06578160446582347, + "language_loss": 0.8447904, + "learning_rate": 6.024751715835314e-05, + "loss": 0.85546756, + "num_input_tokens_seen": 365532416, + "router_z_loss_mlp": 0.28515625, + "step": 4402, + "time_per_iteration": 2.833575963973999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107146, + "balance_loss_mlp": 1.0428741, + "epoch": 0.8470565602154675, + "flos": 572384226816.0, + "grad_norm": 0.06284331857121975, + "language_loss": 0.87002754, + "learning_rate": 6.009934275218049e-05, + "loss": 0.88074219, + "num_input_tokens_seen": 365603776, + "router_z_loss_mlp": 0.28588867, + "step": 4403, + "time_per_iteration": 2.729248285293579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072184, + "balance_loss_mlp": 1.04428864, + "epoch": 0.8472489419007311, + "flos": 472597175808.0, + "grad_norm": 0.06686068658621137, + "language_loss": 0.84025908, + "learning_rate": 5.995133913058936e-05, + "loss": 0.85098088, + "num_input_tokens_seen": 365670432, + "router_z_loss_mlp": 0.27929688, + "step": 4404, + "time_per_iteration": 2.5385451316833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066659, + "balance_loss_mlp": 1.03804839, + "epoch": 0.8474413235859947, + "flos": 797682511872.0, + "grad_norm": 0.061353729013317905, + "language_loss": 0.79223871, + "learning_rate": 5.980350635103954e-05, + "loss": 0.80290532, + "num_input_tokens_seen": 365741584, + "router_z_loss_mlp": 0.28613281, + "step": 4405, + "time_per_iteration": 2.973203420639038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072442, + "balance_loss_mlp": 1.04457116, + "epoch": 0.8476337052712581, + "flos": 502130409984.0, + "grad_norm": 0.06582777621595964, + "language_loss": 0.80370855, + "learning_rate": 5.9655844470924866e-05, + "loss": 0.81443298, + "num_input_tokens_seen": 365805344, + "router_z_loss_mlp": 0.27929688, + "step": 4406, + "time_per_iteration": 2.5676045417785645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106656, + "balance_loss_mlp": 1.03864169, + "epoch": 0.8478260869565217, + "flos": 931586019840.0, + "grad_norm": 0.04644248356743638, + "language_loss": 0.83144867, + "learning_rate": 5.9508353547573e-05, + "loss": 0.84211433, + "num_input_tokens_seen": 365890976, + "router_z_loss_mlp": 0.27954102, + "step": 4407, + "time_per_iteration": 3.197460412979126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067365, + "balance_loss_mlp": 1.03937507, + "epoch": 0.8480184686417853, + "flos": 708502256640.0, + "grad_norm": 0.05623164949383599, + "language_loss": 0.80978203, + "learning_rate": 5.9361033638244855e-05, + "loss": 0.82045567, + "num_input_tokens_seen": 365968912, + "router_z_loss_mlp": 0.2800293, + "step": 4408, + "time_per_iteration": 2.885713815689087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066856, + "balance_loss_mlp": 1.03819788, + "epoch": 0.8482108503270489, + "flos": 614152433664.0, + "grad_norm": 0.052126844540241135, + "language_loss": 0.82540518, + "learning_rate": 5.9213884800135066e-05, + "loss": 0.83607376, + "num_input_tokens_seen": 366047680, + "router_z_loss_mlp": 0.28686523, + "step": 4409, + "time_per_iteration": 2.788428783416748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071329, + "balance_loss_mlp": 1.04326701, + "epoch": 0.8484032320123124, + "flos": 530752822272.0, + "grad_norm": 0.07423031491114718, + "language_loss": 0.81877828, + "learning_rate": 5.906690709037194e-05, + "loss": 0.82949162, + "num_input_tokens_seen": 366118720, + "router_z_loss_mlp": 0.28100586, + "step": 4410, + "time_per_iteration": 2.687079429626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101158, + "balance_loss_mlp": 1.00056553, + "epoch": 0.848595613697576, + "flos": 1541930508288.0, + "grad_norm": 0.009407978937322712, + "language_loss": 0.76296914, + "learning_rate": 5.892010056601726e-05, + "loss": 0.77308494, + "num_input_tokens_seen": 366346928, + "router_z_loss_mlp": 0.11035156, + "step": 4411, + "time_per_iteration": 4.916358232498169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066224, + "balance_loss_mlp": 1.0385201, + "epoch": 0.8487879953828396, + "flos": 677025133056.0, + "grad_norm": 0.05688304553915402, + "language_loss": 0.73515522, + "learning_rate": 5.877346528406635e-05, + "loss": 0.74581748, + "num_input_tokens_seen": 366422848, + "router_z_loss_mlp": 0.27734375, + "step": 4412, + "time_per_iteration": 2.943319797515869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066758, + "balance_loss_mlp": 1.03905368, + "epoch": 0.8489803770681031, + "flos": 503425345536.0, + "grad_norm": 0.06238044069939686, + "language_loss": 0.79501128, + "learning_rate": 5.8627001301448105e-05, + "loss": 0.80567884, + "num_input_tokens_seen": 366492016, + "router_z_loss_mlp": 0.27734375, + "step": 4413, + "time_per_iteration": 2.701700448989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069998, + "balance_loss_mlp": 1.04212689, + "epoch": 0.8491727587533667, + "flos": 562896276480.0, + "grad_norm": 0.056348175066762846, + "language_loss": 0.76581597, + "learning_rate": 5.84807086750247e-05, + "loss": 0.77651596, + "num_input_tokens_seen": 366566400, + "router_z_loss_mlp": 0.27905273, + "step": 4414, + "time_per_iteration": 2.7571372985839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071841, + "balance_loss_mlp": 1.04325449, + "epoch": 0.8493651404386302, + "flos": 459544513536.0, + "grad_norm": 0.06822958630668063, + "language_loss": 0.77977884, + "learning_rate": 5.833458746159243e-05, + "loss": 0.79049724, + "num_input_tokens_seen": 366634016, + "router_z_loss_mlp": 0.28588867, + "step": 4415, + "time_per_iteration": 2.5809409618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071242, + "balance_loss_mlp": 1.04294157, + "epoch": 0.8495575221238938, + "flos": 460928199168.0, + "grad_norm": 0.07027331723408024, + "language_loss": 0.81720734, + "learning_rate": 5.818863771788013e-05, + "loss": 0.82791978, + "num_input_tokens_seen": 366704384, + "router_z_loss_mlp": 0.28320312, + "step": 4416, + "time_per_iteration": 2.6256165504455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069693, + "balance_loss_mlp": 1.04189312, + "epoch": 0.8497499038091574, + "flos": 870353081856.0, + "grad_norm": 0.06359252463002799, + "language_loss": 0.81217146, + "learning_rate": 5.8042859500550604e-05, + "loss": 0.82286835, + "num_input_tokens_seen": 366785456, + "router_z_loss_mlp": 0.27807617, + "step": 4417, + "time_per_iteration": 3.1099212169647217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071152, + "balance_loss_mlp": 1.04246998, + "epoch": 0.849942285494421, + "flos": 779258050560.0, + "grad_norm": 0.0545072417760316, + "language_loss": 0.77756029, + "learning_rate": 5.789725286620018e-05, + "loss": 0.78827178, + "num_input_tokens_seen": 366862848, + "router_z_loss_mlp": 0.28637695, + "step": 4418, + "time_per_iteration": 2.990246534347534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067448, + "balance_loss_mlp": 1.03909969, + "epoch": 0.8501346671796844, + "flos": 513544720896.0, + "grad_norm": 0.06431104376952325, + "language_loss": 0.84794027, + "learning_rate": 5.775181787135819e-05, + "loss": 0.85861474, + "num_input_tokens_seen": 366934800, + "router_z_loss_mlp": 0.28369141, + "step": 4419, + "time_per_iteration": 2.6921567916870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072793, + "balance_loss_mlp": 1.04392087, + "epoch": 0.850327048864948, + "flos": 621149437440.0, + "grad_norm": 0.05225981984620765, + "language_loss": 0.83629984, + "learning_rate": 5.76065545724877e-05, + "loss": 0.84702778, + "num_input_tokens_seen": 367015152, + "router_z_loss_mlp": 0.28833008, + "step": 4420, + "time_per_iteration": 2.843939781188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070105, + "balance_loss_mlp": 1.04254413, + "epoch": 0.8505194305502116, + "flos": 773890633728.0, + "grad_norm": 0.056819561081510095, + "language_loss": 0.79734492, + "learning_rate": 5.746146302598454e-05, + "loss": 0.80804604, + "num_input_tokens_seen": 367092192, + "router_z_loss_mlp": 0.27587891, + "step": 4421, + "time_per_iteration": 3.0194528102874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069285, + "balance_loss_mlp": 1.04024506, + "epoch": 0.8507118122354752, + "flos": 465019619328.0, + "grad_norm": 0.05129689451431866, + "language_loss": 0.86400151, + "learning_rate": 5.731654328817859e-05, + "loss": 0.87469435, + "num_input_tokens_seen": 367159744, + "router_z_loss_mlp": 0.29052734, + "step": 4422, + "time_per_iteration": 2.5654053688049316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068694, + "balance_loss_mlp": 1.04001236, + "epoch": 0.8509041939207388, + "flos": 534150208512.0, + "grad_norm": 0.05425122775065133, + "language_loss": 0.84819269, + "learning_rate": 5.717179541533257e-05, + "loss": 0.85887969, + "num_input_tokens_seen": 367226384, + "router_z_loss_mlp": 0.28662109, + "step": 4423, + "time_per_iteration": 2.692744255065918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068587, + "balance_loss_mlp": 1.04031062, + "epoch": 0.8510965756060023, + "flos": 583466858496.0, + "grad_norm": 0.05733874896237715, + "language_loss": 0.84372598, + "learning_rate": 5.702721946364264e-05, + "loss": 0.85441184, + "num_input_tokens_seen": 367294768, + "router_z_loss_mlp": 0.28295898, + "step": 4424, + "time_per_iteration": 2.7339928150177 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106796, + "balance_loss_mlp": 1.03923082, + "epoch": 0.8512889572912659, + "flos": 600548332032.0, + "grad_norm": 0.05652647663414663, + "language_loss": 0.77350199, + "learning_rate": 5.688281548923796e-05, + "loss": 0.78418159, + "num_input_tokens_seen": 367372368, + "router_z_loss_mlp": 0.28735352, + "step": 4425, + "time_per_iteration": 2.757702589035034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068547, + "balance_loss_mlp": 1.03977025, + "epoch": 0.8514813389765294, + "flos": 654474345984.0, + "grad_norm": 0.056980152168257754, + "language_loss": 0.78801835, + "learning_rate": 5.673858354818151e-05, + "loss": 0.79870379, + "num_input_tokens_seen": 367452656, + "router_z_loss_mlp": 0.28759766, + "step": 4426, + "time_per_iteration": 2.8438169956207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065367, + "balance_loss_mlp": 1.03742433, + "epoch": 0.851673720661793, + "flos": 429538415616.0, + "grad_norm": 0.06363966604968568, + "language_loss": 0.78092206, + "learning_rate": 5.6594523696468726e-05, + "loss": 0.79157573, + "num_input_tokens_seen": 367517808, + "router_z_loss_mlp": 0.27954102, + "step": 4427, + "time_per_iteration": 2.528083562850952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069727, + "balance_loss_mlp": 1.04075956, + "epoch": 0.8518661023470565, + "flos": 641277679104.0, + "grad_norm": 0.06417237419479298, + "language_loss": 0.79616511, + "learning_rate": 5.645063599002875e-05, + "loss": 0.80686241, + "num_input_tokens_seen": 367591728, + "router_z_loss_mlp": 0.28979492, + "step": 4428, + "time_per_iteration": 2.7901835441589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067232, + "balance_loss_mlp": 1.03828812, + "epoch": 0.8520584840323201, + "flos": 561880737792.0, + "grad_norm": 0.0607366331567848, + "language_loss": 0.79741931, + "learning_rate": 5.630692048472363e-05, + "loss": 0.80809164, + "num_input_tokens_seen": 367664496, + "router_z_loss_mlp": 0.28930664, + "step": 4429, + "time_per_iteration": 2.685030698776245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071453, + "balance_loss_mlp": 1.04329574, + "epoch": 0.8522508657175837, + "flos": 526793822208.0, + "grad_norm": 0.06567497707339605, + "language_loss": 0.78606403, + "learning_rate": 5.61633772363489e-05, + "loss": 0.7967785, + "num_input_tokens_seen": 367735584, + "router_z_loss_mlp": 0.28198242, + "step": 4430, + "time_per_iteration": 2.594611644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063898, + "balance_loss_mlp": 1.03497767, + "epoch": 0.8524432474028473, + "flos": 498875618304.0, + "grad_norm": 0.05326350302130372, + "language_loss": 0.80760658, + "learning_rate": 5.602000630063298e-05, + "loss": 0.81824553, + "num_input_tokens_seen": 367801136, + "router_z_loss_mlp": 0.2890625, + "step": 4431, + "time_per_iteration": 2.5856552124023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106631, + "balance_loss_mlp": 1.03834355, + "epoch": 0.8526356290881109, + "flos": 421089325056.0, + "grad_norm": 0.073571565136352, + "language_loss": 0.79417092, + "learning_rate": 5.587680773323706e-05, + "loss": 0.80483401, + "num_input_tokens_seen": 367865312, + "router_z_loss_mlp": 0.27954102, + "step": 4432, + "time_per_iteration": 2.480302095413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069706, + "balance_loss_mlp": 1.0413816, + "epoch": 0.8528280107733743, + "flos": 507078807552.0, + "grad_norm": 0.05899053033855359, + "language_loss": 0.80417913, + "learning_rate": 5.5733781589756115e-05, + "loss": 0.8148762, + "num_input_tokens_seen": 367931104, + "router_z_loss_mlp": 0.28320312, + "step": 4433, + "time_per_iteration": 2.5961601734161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067688, + "balance_loss_mlp": 1.03950715, + "epoch": 0.8530203924586379, + "flos": 445663987200.0, + "grad_norm": 0.07402673493705796, + "language_loss": 0.82934564, + "learning_rate": 5.5590927925717684e-05, + "loss": 0.84002256, + "num_input_tokens_seen": 367995520, + "router_z_loss_mlp": 0.28198242, + "step": 4434, + "time_per_iteration": 2.504897356033325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067566, + "balance_loss_mlp": 1.03945613, + "epoch": 0.8532127741439015, + "flos": 657452712960.0, + "grad_norm": 0.06775200512771863, + "language_loss": 0.83294642, + "learning_rate": 5.54482467965825e-05, + "loss": 0.84362209, + "num_input_tokens_seen": 368073664, + "router_z_loss_mlp": 0.28100586, + "step": 4435, + "time_per_iteration": 2.8722753524780273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060812, + "balance_loss_mlp": 1.03201151, + "epoch": 0.8534051558291651, + "flos": 535750682112.0, + "grad_norm": 0.049124463523354554, + "language_loss": 0.83115995, + "learning_rate": 5.5305738257744264e-05, + "loss": 0.84176803, + "num_input_tokens_seen": 368147536, + "router_z_loss_mlp": 0.2878418, + "step": 4436, + "time_per_iteration": 2.7586135864257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069476, + "balance_loss_mlp": 1.04081857, + "epoch": 0.8535975375144286, + "flos": 532741791744.0, + "grad_norm": 0.07253609135717012, + "language_loss": 0.78917527, + "learning_rate": 5.5163402364529655e-05, + "loss": 0.79987001, + "num_input_tokens_seen": 368218672, + "router_z_loss_mlp": 0.28637695, + "step": 4437, + "time_per_iteration": 2.665250301361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064267, + "balance_loss_mlp": 1.03591907, + "epoch": 0.8537899191996922, + "flos": 573861044736.0, + "grad_norm": 0.06315024185021119, + "language_loss": 0.82323515, + "learning_rate": 5.502123917219848e-05, + "loss": 0.83387786, + "num_input_tokens_seen": 368287056, + "router_z_loss_mlp": 0.28344727, + "step": 4438, + "time_per_iteration": 2.6837167739868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068421, + "balance_loss_mlp": 1.04019177, + "epoch": 0.8539823008849557, + "flos": 464759161344.0, + "grad_norm": 0.0810478140018265, + "language_loss": 0.83188379, + "learning_rate": 5.48792487359433e-05, + "loss": 0.84256798, + "num_input_tokens_seen": 368358400, + "router_z_loss_mlp": 0.28271484, + "step": 4439, + "time_per_iteration": 2.6771581172943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065679, + "balance_loss_mlp": 1.03687835, + "epoch": 0.8541746825702193, + "flos": 554441393664.0, + "grad_norm": 0.05580742758143019, + "language_loss": 0.8114894, + "learning_rate": 5.4737431110889745e-05, + "loss": 0.82214624, + "num_input_tokens_seen": 368427168, + "router_z_loss_mlp": 0.2878418, + "step": 4440, + "time_per_iteration": 2.703986644744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066517, + "balance_loss_mlp": 1.0385263, + "epoch": 0.8543670642554829, + "flos": 546101402112.0, + "grad_norm": 0.07237493250834019, + "language_loss": 0.77604347, + "learning_rate": 5.4595786352096165e-05, + "loss": 0.78670859, + "num_input_tokens_seen": 368503584, + "router_z_loss_mlp": 0.28027344, + "step": 4441, + "time_per_iteration": 2.809252977371216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106809, + "balance_loss_mlp": 1.03988481, + "epoch": 0.8545594459407464, + "flos": 511766747136.0, + "grad_norm": 0.05090580444418766, + "language_loss": 0.82180196, + "learning_rate": 5.4454314514554236e-05, + "loss": 0.83248281, + "num_input_tokens_seen": 368576976, + "router_z_loss_mlp": 0.28222656, + "step": 4442, + "time_per_iteration": 2.6570944786071777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069096, + "balance_loss_mlp": 1.04098618, + "epoch": 0.85475182762601, + "flos": 420961287168.0, + "grad_norm": 0.060096294700318055, + "language_loss": 0.81646609, + "learning_rate": 5.431301565318786e-05, + "loss": 0.82715702, + "num_input_tokens_seen": 368641664, + "router_z_loss_mlp": 0.28149414, + "step": 4443, + "time_per_iteration": 2.5243723392486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065725, + "balance_loss_mlp": 1.03792512, + "epoch": 0.8549442093112736, + "flos": 389222295552.0, + "grad_norm": 0.06469608643018868, + "language_loss": 0.773826, + "learning_rate": 5.41718898228542e-05, + "loss": 0.78448325, + "num_input_tokens_seen": 368705616, + "router_z_loss_mlp": 0.27807617, + "step": 4444, + "time_per_iteration": 2.51920223236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063699, + "balance_loss_mlp": 1.03558922, + "epoch": 0.8551365909965372, + "flos": 605620385280.0, + "grad_norm": 0.059194132325457664, + "language_loss": 0.79776013, + "learning_rate": 5.403093707834334e-05, + "loss": 0.80839705, + "num_input_tokens_seen": 368779664, + "router_z_loss_mlp": 0.28125, + "step": 4445, + "time_per_iteration": 2.801943063735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066012, + "balance_loss_mlp": 1.03671026, + "epoch": 0.8553289726818007, + "flos": 503912765952.0, + "grad_norm": 0.05844778654273943, + "language_loss": 0.78704023, + "learning_rate": 5.3890157474377865e-05, + "loss": 0.79770029, + "num_input_tokens_seen": 368846656, + "router_z_loss_mlp": 0.29272461, + "step": 4446, + "time_per_iteration": 2.6274378299713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067948, + "balance_loss_mlp": 1.03914738, + "epoch": 0.8555213543670642, + "flos": 556735901184.0, + "grad_norm": 0.0545348209619519, + "language_loss": 0.759287, + "learning_rate": 5.374955106561324e-05, + "loss": 0.76996648, + "num_input_tokens_seen": 368923712, + "router_z_loss_mlp": 0.28808594, + "step": 4447, + "time_per_iteration": 2.781522274017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066487, + "balance_loss_mlp": 1.03852105, + "epoch": 0.8557137360523278, + "flos": 547843060224.0, + "grad_norm": 0.05508059918721569, + "language_loss": 0.74790716, + "learning_rate": 5.360911790663775e-05, + "loss": 0.7585721, + "num_input_tokens_seen": 368994496, + "router_z_loss_mlp": 0.2800293, + "step": 4448, + "time_per_iteration": 2.681140184402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106353, + "balance_loss_mlp": 1.03518176, + "epoch": 0.8559061177375914, + "flos": 727853506560.0, + "grad_norm": 0.05884214790792896, + "language_loss": 0.78717124, + "learning_rate": 5.346885805197238e-05, + "loss": 0.7978065, + "num_input_tokens_seen": 369077088, + "router_z_loss_mlp": 0.28369141, + "step": 4449, + "time_per_iteration": 3.0732901096343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068527, + "balance_loss_mlp": 1.03967822, + "epoch": 0.856098499422855, + "flos": 535608087552.0, + "grad_norm": 0.06758405280159155, + "language_loss": 0.82919681, + "learning_rate": 5.332877155607085e-05, + "loss": 0.83988202, + "num_input_tokens_seen": 369147680, + "router_z_loss_mlp": 0.28857422, + "step": 4450, + "time_per_iteration": 2.658113479614258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071731, + "balance_loss_mlp": 1.04352653, + "epoch": 0.8562908811081185, + "flos": 573388180992.0, + "grad_norm": 0.06293317417138165, + "language_loss": 0.83193231, + "learning_rate": 5.3188858473319504e-05, + "loss": 0.84264964, + "num_input_tokens_seen": 369224320, + "router_z_loss_mlp": 0.28222656, + "step": 4451, + "time_per_iteration": 2.7371931076049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106554, + "balance_loss_mlp": 1.03700137, + "epoch": 0.856483262793382, + "flos": 781391024640.0, + "grad_norm": 0.06311736302267067, + "language_loss": 0.80342978, + "learning_rate": 5.3049118858037426e-05, + "loss": 0.81408519, + "num_input_tokens_seen": 369315744, + "router_z_loss_mlp": 0.28564453, + "step": 4452, + "time_per_iteration": 3.095228433609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106368, + "balance_loss_mlp": 1.03533196, + "epoch": 0.8566756444786456, + "flos": 455585513472.0, + "grad_norm": 0.057168939084114495, + "language_loss": 0.84728843, + "learning_rate": 5.290955276447651e-05, + "loss": 0.85792524, + "num_input_tokens_seen": 369382800, + "router_z_loss_mlp": 0.28344727, + "step": 4453, + "time_per_iteration": 2.595768690109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068236, + "balance_loss_mlp": 1.04072213, + "epoch": 0.8568680261639092, + "flos": 449150123520.0, + "grad_norm": 0.058366089298651294, + "language_loss": 0.8424089, + "learning_rate": 5.277016024682091e-05, + "loss": 0.85309124, + "num_input_tokens_seen": 369447312, + "router_z_loss_mlp": 0.27514648, + "step": 4454, + "time_per_iteration": 2.5411229133605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068899, + "balance_loss_mlp": 1.04107571, + "epoch": 0.8570604078491728, + "flos": 479736774144.0, + "grad_norm": 0.07223117728599122, + "language_loss": 0.82632047, + "learning_rate": 5.2630941359187665e-05, + "loss": 0.83700949, + "num_input_tokens_seen": 369512800, + "router_z_loss_mlp": 0.27856445, + "step": 4455, + "time_per_iteration": 2.5366690158843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066311, + "balance_loss_mlp": 1.03765273, + "epoch": 0.8572527895344363, + "flos": 505695121920.0, + "grad_norm": 0.061147295474926186, + "language_loss": 0.84813732, + "learning_rate": 5.249189615562627e-05, + "loss": 0.85880041, + "num_input_tokens_seen": 369580720, + "router_z_loss_mlp": 0.28613281, + "step": 4456, + "time_per_iteration": 2.5954558849334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065891, + "balance_loss_mlp": 1.03771043, + "epoch": 0.8574451712196999, + "flos": 786688630272.0, + "grad_norm": 0.05061557722226465, + "language_loss": 0.83000439, + "learning_rate": 5.235302469011905e-05, + "loss": 0.84066331, + "num_input_tokens_seen": 369672544, + "router_z_loss_mlp": 0.28222656, + "step": 4457, + "time_per_iteration": 3.1139042377471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065238, + "balance_loss_mlp": 1.03629398, + "epoch": 0.8576375529049635, + "flos": 508980436992.0, + "grad_norm": 0.05994421710631203, + "language_loss": 0.75134158, + "learning_rate": 5.2214327016580575e-05, + "loss": 0.761994, + "num_input_tokens_seen": 369745776, + "router_z_loss_mlp": 0.28881836, + "step": 4458, + "time_per_iteration": 2.6730198860168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007078, + "balance_loss_mlp": 0.99591976, + "epoch": 0.857829934590227, + "flos": 1459996130304.0, + "grad_norm": 0.007822702191887595, + "language_loss": 0.84767288, + "learning_rate": 5.207580318885802e-05, + "loss": 0.85774368, + "num_input_tokens_seen": 369975200, + "router_z_loss_mlp": 0.11181641, + "step": 4459, + "time_per_iteration": 4.979666233062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067045, + "balance_loss_mlp": 1.03779101, + "epoch": 0.8580223162754905, + "flos": 479057296896.0, + "grad_norm": 0.05274398336577564, + "language_loss": 0.89012241, + "learning_rate": 5.193745326073118e-05, + "loss": 0.90079284, + "num_input_tokens_seen": 370043296, + "router_z_loss_mlp": 0.29223633, + "step": 4460, + "time_per_iteration": 2.6836555004119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067615, + "balance_loss_mlp": 1.03931475, + "epoch": 0.8582146979607541, + "flos": 705926942208.0, + "grad_norm": 0.07596315948303173, + "language_loss": 0.79420805, + "learning_rate": 5.179927728591227e-05, + "loss": 0.8048842, + "num_input_tokens_seen": 370111152, + "router_z_loss_mlp": 0.28295898, + "step": 4461, + "time_per_iteration": 2.853403329849243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066518, + "balance_loss_mlp": 1.03807497, + "epoch": 0.8584070796460177, + "flos": 764826084864.0, + "grad_norm": 0.06387758845808282, + "language_loss": 0.82548052, + "learning_rate": 5.1661275318045874e-05, + "loss": 0.83614576, + "num_input_tokens_seen": 370190272, + "router_z_loss_mlp": 0.28442383, + "step": 4462, + "time_per_iteration": 2.9871556758880615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070395, + "balance_loss_mlp": 1.04204726, + "epoch": 0.8585994613312813, + "flos": 586535385600.0, + "grad_norm": 0.057586538294609683, + "language_loss": 0.8564322, + "learning_rate": 5.152344741070919e-05, + "loss": 0.86713612, + "num_input_tokens_seen": 370267056, + "router_z_loss_mlp": 0.28369141, + "step": 4463, + "time_per_iteration": 2.8135032653808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065659, + "balance_loss_mlp": 1.03678679, + "epoch": 0.8587918430165449, + "flos": 607993468416.0, + "grad_norm": 0.05234014265771045, + "language_loss": 0.78836596, + "learning_rate": 5.138579361741169e-05, + "loss": 0.79902256, + "num_input_tokens_seen": 370344176, + "router_z_loss_mlp": 0.28881836, + "step": 4464, + "time_per_iteration": 2.7817888259887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068079, + "balance_loss_mlp": 1.03963614, + "epoch": 0.8589842247018084, + "flos": 588710619648.0, + "grad_norm": 0.06230218500152689, + "language_loss": 0.8085956, + "learning_rate": 5.124831399159535e-05, + "loss": 0.81927645, + "num_input_tokens_seen": 370414224, + "router_z_loss_mlp": 0.28466797, + "step": 4465, + "time_per_iteration": 2.691600799560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065653, + "balance_loss_mlp": 1.03768635, + "epoch": 0.8591766063870719, + "flos": 543609045504.0, + "grad_norm": 0.07971528973299408, + "language_loss": 0.78662705, + "learning_rate": 5.1111008586634475e-05, + "loss": 0.79728359, + "num_input_tokens_seen": 370484736, + "router_z_loss_mlp": 0.27978516, + "step": 4466, + "time_per_iteration": 2.647693157196045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106499, + "balance_loss_mlp": 1.03661847, + "epoch": 0.8593689880723355, + "flos": 493499437056.0, + "grad_norm": 0.057340189460979636, + "language_loss": 0.80966145, + "learning_rate": 5.0973877455835816e-05, + "loss": 0.82031131, + "num_input_tokens_seen": 370556512, + "router_z_loss_mlp": 0.28369141, + "step": 4467, + "time_per_iteration": 2.670189619064331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070601, + "balance_loss_mlp": 1.04294395, + "epoch": 0.8595613697575991, + "flos": 533652613632.0, + "grad_norm": 0.07143678371041538, + "language_loss": 0.83760196, + "learning_rate": 5.083692065243822e-05, + "loss": 0.84830797, + "num_input_tokens_seen": 370622880, + "router_z_loss_mlp": 0.27685547, + "step": 4468, + "time_per_iteration": 2.6147608757019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068362, + "balance_loss_mlp": 1.04034781, + "epoch": 0.8597537514428626, + "flos": 617347588608.0, + "grad_norm": 0.061866552118211966, + "language_loss": 0.75730455, + "learning_rate": 5.070013822961328e-05, + "loss": 0.7679882, + "num_input_tokens_seen": 370691632, + "router_z_loss_mlp": 0.28027344, + "step": 4469, + "time_per_iteration": 2.7232584953308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064035, + "balance_loss_mlp": 1.03580678, + "epoch": 0.8599461331281262, + "flos": 608450365440.0, + "grad_norm": 0.05685660271928497, + "language_loss": 0.83694613, + "learning_rate": 5.056353024046462e-05, + "loss": 0.84758651, + "num_input_tokens_seen": 370764848, + "router_z_loss_mlp": 0.2824707, + "step": 4470, + "time_per_iteration": 2.777681827545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068517, + "balance_loss_mlp": 1.04009736, + "epoch": 0.8601385148133898, + "flos": 550979988480.0, + "grad_norm": 0.05506266431023708, + "language_loss": 0.82577848, + "learning_rate": 5.042709673802786e-05, + "loss": 0.83646369, + "num_input_tokens_seen": 370832496, + "router_z_loss_mlp": 0.28417969, + "step": 4471, + "time_per_iteration": 2.6651957035064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106752, + "balance_loss_mlp": 1.03836131, + "epoch": 0.8603308964986534, + "flos": 580907510784.0, + "grad_norm": 0.06361138287055206, + "language_loss": 0.8119573, + "learning_rate": 5.0290837775271494e-05, + "loss": 0.82263255, + "num_input_tokens_seen": 370917104, + "router_z_loss_mlp": 0.29125977, + "step": 4472, + "time_per_iteration": 2.867305278778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068349, + "balance_loss_mlp": 1.04002476, + "epoch": 0.8605232781839169, + "flos": 628731376128.0, + "grad_norm": 0.061969617336128574, + "language_loss": 0.75447845, + "learning_rate": 5.0154753405095846e-05, + "loss": 0.76516187, + "num_input_tokens_seen": 370984512, + "router_z_loss_mlp": 0.28344727, + "step": 4473, + "time_per_iteration": 2.791969060897827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064299, + "balance_loss_mlp": 1.03607023, + "epoch": 0.8607156598691804, + "flos": 467904854016.0, + "grad_norm": 0.06996386665919671, + "language_loss": 0.77089655, + "learning_rate": 5.0018843680333604e-05, + "loss": 0.78153956, + "num_input_tokens_seen": 371049664, + "router_z_loss_mlp": 0.2824707, + "step": 4474, + "time_per_iteration": 2.5247669219970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064708, + "balance_loss_mlp": 1.03655124, + "epoch": 0.860908041554444, + "flos": 488142194688.0, + "grad_norm": 0.07527102079674898, + "language_loss": 0.82489771, + "learning_rate": 4.988310865374945e-05, + "loss": 0.83554482, + "num_input_tokens_seen": 371120704, + "router_z_loss_mlp": 0.28149414, + "step": 4475, + "time_per_iteration": 2.6851634979248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067248, + "balance_loss_mlp": 1.03987718, + "epoch": 0.8611004232397076, + "flos": 591827198976.0, + "grad_norm": 0.06066793633900129, + "language_loss": 0.80281663, + "learning_rate": 4.974754837804057e-05, + "loss": 0.81348914, + "num_input_tokens_seen": 371189376, + "router_z_loss_mlp": 0.27392578, + "step": 4476, + "time_per_iteration": 2.7129712104797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068054, + "balance_loss_mlp": 1.04018247, + "epoch": 0.8612928049249712, + "flos": 773857138176.0, + "grad_norm": 0.055176333782017764, + "language_loss": 0.85914743, + "learning_rate": 4.9612162905836036e-05, + "loss": 0.86982793, + "num_input_tokens_seen": 371275184, + "router_z_loss_mlp": 0.27905273, + "step": 4477, + "time_per_iteration": 3.055014133453369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067412, + "balance_loss_mlp": 1.03868282, + "epoch": 0.8614851866102347, + "flos": 537291518976.0, + "grad_norm": 0.06049058254958562, + "language_loss": 0.82140207, + "learning_rate": 4.947695228969718e-05, + "loss": 0.83207619, + "num_input_tokens_seen": 371347920, + "router_z_loss_mlp": 0.28710938, + "step": 4478, + "time_per_iteration": 2.6869184970855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066452, + "balance_loss_mlp": 1.03889072, + "epoch": 0.8616775682954982, + "flos": 565647681024.0, + "grad_norm": 0.08549280129733618, + "language_loss": 0.79003942, + "learning_rate": 4.934191658211729e-05, + "loss": 0.800704, + "num_input_tokens_seen": 371419728, + "router_z_loss_mlp": 0.27587891, + "step": 4479, + "time_per_iteration": 2.6531260013580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065181, + "balance_loss_mlp": 1.03638005, + "epoch": 0.8618699499807618, + "flos": 481351804416.0, + "grad_norm": 0.11855450332692621, + "language_loss": 0.81331623, + "learning_rate": 4.92070558355221e-05, + "loss": 0.82396805, + "num_input_tokens_seen": 371488768, + "router_z_loss_mlp": 0.2878418, + "step": 4480, + "time_per_iteration": 2.6510956287384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066433, + "balance_loss_mlp": 1.03684497, + "epoch": 0.8620623316660254, + "flos": 649214618112.0, + "grad_norm": 0.07320616066460611, + "language_loss": 0.74202549, + "learning_rate": 4.9072370102269226e-05, + "loss": 0.75268984, + "num_input_tokens_seen": 371560144, + "router_z_loss_mlp": 0.2956543, + "step": 4481, + "time_per_iteration": 2.761094331741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065154, + "balance_loss_mlp": 1.03706789, + "epoch": 0.862254713351289, + "flos": 751457710080.0, + "grad_norm": 0.06277275556700077, + "language_loss": 0.8580991, + "learning_rate": 4.893785943464801e-05, + "loss": 0.86875063, + "num_input_tokens_seen": 371635920, + "router_z_loss_mlp": 0.28100586, + "step": 4482, + "time_per_iteration": 2.967822790145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069099, + "balance_loss_mlp": 1.03998828, + "epoch": 0.8624470950365525, + "flos": 841147144704.0, + "grad_norm": 0.06010002710742802, + "language_loss": 0.77420175, + "learning_rate": 4.880352388488024e-05, + "loss": 0.78489274, + "num_input_tokens_seen": 371727664, + "router_z_loss_mlp": 0.29101562, + "step": 4483, + "time_per_iteration": 3.2577385902404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067825, + "balance_loss_mlp": 1.03957295, + "epoch": 0.8626394767218161, + "flos": 754470982656.0, + "grad_norm": 0.07300953897576297, + "language_loss": 0.82941705, + "learning_rate": 4.866936350511969e-05, + "loss": 0.84009528, + "num_input_tokens_seen": 371800832, + "router_z_loss_mlp": 0.28222656, + "step": 4484, + "time_per_iteration": 2.9013171195983887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067464, + "balance_loss_mlp": 1.03856742, + "epoch": 0.8628318584070797, + "flos": 703268669952.0, + "grad_norm": 0.06168064749637158, + "language_loss": 0.82346129, + "learning_rate": 4.853537834745203e-05, + "loss": 0.83413589, + "num_input_tokens_seen": 371871472, + "router_z_loss_mlp": 0.28881836, + "step": 4485, + "time_per_iteration": 2.921997308731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068391, + "balance_loss_mlp": 1.03954196, + "epoch": 0.8630242400923432, + "flos": 471006876672.0, + "grad_norm": 0.061195678734605755, + "language_loss": 0.77577496, + "learning_rate": 4.840156846389487e-05, + "loss": 0.78645885, + "num_input_tokens_seen": 371936512, + "router_z_loss_mlp": 0.28808594, + "step": 4486, + "time_per_iteration": 2.5501646995544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067068, + "balance_loss_mlp": 1.0388155, + "epoch": 0.8632166217776067, + "flos": 963965200896.0, + "grad_norm": 0.07614172176482971, + "language_loss": 0.77287424, + "learning_rate": 4.826793390639783e-05, + "loss": 0.7835449, + "num_input_tokens_seen": 372018032, + "router_z_loss_mlp": 0.2824707, + "step": 4487, + "time_per_iteration": 3.2161014080047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065505, + "balance_loss_mlp": 1.03665614, + "epoch": 0.8634090034628703, + "flos": 767583281664.0, + "grad_norm": 0.06353304542331387, + "language_loss": 0.78799319, + "learning_rate": 4.813447472684246e-05, + "loss": 0.79864818, + "num_input_tokens_seen": 372092176, + "router_z_loss_mlp": 0.28833008, + "step": 4488, + "time_per_iteration": 2.9450225830078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065894, + "balance_loss_mlp": 1.03697419, + "epoch": 0.8636013851481339, + "flos": 520310380032.0, + "grad_norm": 0.06251575685184195, + "language_loss": 0.82971573, + "learning_rate": 4.800119097704214e-05, + "loss": 0.84037471, + "num_input_tokens_seen": 372166880, + "router_z_loss_mlp": 0.28881836, + "step": 4489, + "time_per_iteration": 2.740370512008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067401, + "balance_loss_mlp": 1.03917265, + "epoch": 0.8637937668333975, + "flos": 631858129920.0, + "grad_norm": 0.06333852042335102, + "language_loss": 0.80451763, + "learning_rate": 4.7868082708742324e-05, + "loss": 0.81519163, + "num_input_tokens_seen": 372234608, + "router_z_loss_mlp": 0.28198242, + "step": 4490, + "time_per_iteration": 2.7359256744384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068762, + "balance_loss_mlp": 1.04010427, + "epoch": 0.8639861485186611, + "flos": 855739233792.0, + "grad_norm": 0.05047353967061317, + "language_loss": 0.76060426, + "learning_rate": 4.773514997362e-05, + "loss": 0.77129185, + "num_input_tokens_seen": 372314704, + "router_z_loss_mlp": 0.28662109, + "step": 4491, + "time_per_iteration": 3.0797441005706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071015, + "balance_loss_mlp": 1.04261971, + "epoch": 0.8641785302039245, + "flos": 481017153024.0, + "grad_norm": 0.05674318342180607, + "language_loss": 0.77455688, + "learning_rate": 4.7602392823284605e-05, + "loss": 0.785267, + "num_input_tokens_seen": 372374848, + "router_z_loss_mlp": 0.28417969, + "step": 4492, + "time_per_iteration": 2.520038366317749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070128, + "balance_loss_mlp": 1.04135144, + "epoch": 0.8643709118891881, + "flos": 504385629696.0, + "grad_norm": 0.06254727528350278, + "language_loss": 0.80063522, + "learning_rate": 4.746981130927675e-05, + "loss": 0.81133652, + "num_input_tokens_seen": 372442432, + "router_z_loss_mlp": 0.2878418, + "step": 4493, + "time_per_iteration": 2.5938947200775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065619, + "balance_loss_mlp": 1.03712773, + "epoch": 0.8645632935744517, + "flos": 552074102784.0, + "grad_norm": 0.055240372629072394, + "language_loss": 0.82212245, + "learning_rate": 4.733740548306908e-05, + "loss": 0.83277869, + "num_input_tokens_seen": 372520048, + "router_z_loss_mlp": 0.28466797, + "step": 4494, + "time_per_iteration": 2.77341365814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066304, + "balance_loss_mlp": 1.03869498, + "epoch": 0.8647556752597153, + "flos": 524489140224.0, + "grad_norm": 0.15546869391129756, + "language_loss": 0.84280682, + "learning_rate": 4.7205175396066336e-05, + "loss": 0.85346985, + "num_input_tokens_seen": 372587968, + "router_z_loss_mlp": 0.27636719, + "step": 4495, + "time_per_iteration": 2.574237108230591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070237, + "balance_loss_mlp": 1.04196072, + "epoch": 0.8649480569449788, + "flos": 787403013120.0, + "grad_norm": 0.05684902230614366, + "language_loss": 0.81967145, + "learning_rate": 4.707312109960471e-05, + "loss": 0.83037388, + "num_input_tokens_seen": 372672544, + "router_z_loss_mlp": 0.28271484, + "step": 4496, + "time_per_iteration": 3.0772690773010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066128, + "balance_loss_mlp": 1.03770816, + "epoch": 0.8651404386302424, + "flos": 763531149312.0, + "grad_norm": 0.05956401155270589, + "language_loss": 0.76680404, + "learning_rate": 4.694124264495225e-05, + "loss": 0.77746534, + "num_input_tokens_seen": 372751296, + "router_z_loss_mlp": 0.28417969, + "step": 4497, + "time_per_iteration": 3.0376369953155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064599, + "balance_loss_mlp": 1.03603673, + "epoch": 0.865332820315506, + "flos": 539620932096.0, + "grad_norm": 0.05886756519779109, + "language_loss": 0.82413983, + "learning_rate": 4.680954008330851e-05, + "loss": 0.83478582, + "num_input_tokens_seen": 372825264, + "router_z_loss_mlp": 0.28564453, + "step": 4498, + "time_per_iteration": 2.8252370357513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004967, + "balance_loss_mlp": 0.99366641, + "epoch": 0.8655252020007695, + "flos": 1475874390528.0, + "grad_norm": 0.009480995024256391, + "language_loss": 0.79174447, + "learning_rate": 4.667801346580519e-05, + "loss": 0.80179417, + "num_input_tokens_seen": 373052000, + "router_z_loss_mlp": 0.11279297, + "step": 4499, + "time_per_iteration": 4.7803168296813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066203, + "balance_loss_mlp": 1.03714013, + "epoch": 0.8657175836860331, + "flos": 517094876160.0, + "grad_norm": 0.05771110198912223, + "language_loss": 0.82750368, + "learning_rate": 4.6546662843505396e-05, + "loss": 0.83816576, + "num_input_tokens_seen": 373124128, + "router_z_loss_mlp": 0.29052734, + "step": 4500, + "time_per_iteration": 2.737542152404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067037, + "balance_loss_mlp": 1.03892779, + "epoch": 0.8659099653712966, + "flos": 590247074304.0, + "grad_norm": 0.05908664540109528, + "language_loss": 0.80244732, + "learning_rate": 4.641548826740394e-05, + "loss": 0.81311762, + "num_input_tokens_seen": 373195472, + "router_z_loss_mlp": 0.28149414, + "step": 4501, + "time_per_iteration": 2.7165422439575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064384, + "balance_loss_mlp": 1.03613114, + "epoch": 0.8661023470565602, + "flos": 590168498688.0, + "grad_norm": 0.06739029778355735, + "language_loss": 0.87976968, + "learning_rate": 4.628448978842731e-05, + "loss": 0.89041352, + "num_input_tokens_seen": 373273504, + "router_z_loss_mlp": 0.28271484, + "step": 4502, + "time_per_iteration": 2.880788803100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062872, + "balance_loss_mlp": 1.03440487, + "epoch": 0.8662947287418238, + "flos": 567405305856.0, + "grad_norm": 0.04997855335218525, + "language_loss": 0.79264534, + "learning_rate": 4.61536674574336e-05, + "loss": 0.80327404, + "num_input_tokens_seen": 373346032, + "router_z_loss_mlp": 0.28491211, + "step": 4503, + "time_per_iteration": 2.7816219329833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065958, + "balance_loss_mlp": 1.03703749, + "epoch": 0.8664871104270874, + "flos": 515661728256.0, + "grad_norm": 0.046072741879525626, + "language_loss": 0.82059586, + "learning_rate": 4.6023021325212636e-05, + "loss": 0.83125544, + "num_input_tokens_seen": 373419968, + "router_z_loss_mlp": 0.2890625, + "step": 4504, + "time_per_iteration": 2.8134889602661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068809, + "balance_loss_mlp": 1.04010344, + "epoch": 0.866679492112351, + "flos": 556973038080.0, + "grad_norm": 0.052643351801927495, + "language_loss": 0.78038937, + "learning_rate": 4.589255144248561e-05, + "loss": 0.79107749, + "num_input_tokens_seen": 373502448, + "router_z_loss_mlp": 0.28710938, + "step": 4505, + "time_per_iteration": 2.845855712890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062538, + "balance_loss_mlp": 1.03399956, + "epoch": 0.8668718737976144, + "flos": 722145646080.0, + "grad_norm": 0.07179310361545532, + "language_loss": 0.81647635, + "learning_rate": 4.57622578599054e-05, + "loss": 0.82710177, + "num_input_tokens_seen": 373581184, + "router_z_loss_mlp": 0.28515625, + "step": 4506, + "time_per_iteration": 2.9011623859405518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065183, + "balance_loss_mlp": 1.03633463, + "epoch": 0.867064255482878, + "flos": 600424676352.0, + "grad_norm": 0.07537486330186009, + "language_loss": 0.84679854, + "learning_rate": 4.5632140628056705e-05, + "loss": 0.85745037, + "num_input_tokens_seen": 373652272, + "router_z_loss_mlp": 0.28833008, + "step": 4507, + "time_per_iteration": 2.6858415603637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059998, + "balance_loss_mlp": 1.03184044, + "epoch": 0.8672566371681416, + "flos": 803177966592.0, + "grad_norm": 0.05593310912213693, + "language_loss": 0.76031673, + "learning_rate": 4.550219979745529e-05, + "loss": 0.7709167, + "num_input_tokens_seen": 373734896, + "router_z_loss_mlp": 0.28125, + "step": 4508, + "time_per_iteration": 3.0288636684417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063296, + "balance_loss_mlp": 1.03439939, + "epoch": 0.8674490188534052, + "flos": 627072675840.0, + "grad_norm": 0.06601583141232006, + "language_loss": 0.83780628, + "learning_rate": 4.5372435418548905e-05, + "loss": 0.84843922, + "num_input_tokens_seen": 373806960, + "router_z_loss_mlp": 0.2890625, + "step": 4509, + "time_per_iteration": 2.739122152328491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106569, + "balance_loss_mlp": 1.0366981, + "epoch": 0.8676414005386687, + "flos": 727489741824.0, + "grad_norm": 0.058760625067754736, + "language_loss": 0.86417365, + "learning_rate": 4.524284754171615e-05, + "loss": 0.87483048, + "num_input_tokens_seen": 373888352, + "router_z_loss_mlp": 0.28979492, + "step": 4510, + "time_per_iteration": 2.9747283458709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065116, + "balance_loss_mlp": 1.03671992, + "epoch": 0.8678337822239323, + "flos": 539676186624.0, + "grad_norm": 0.06474660971141381, + "language_loss": 0.80936235, + "learning_rate": 4.5113436217267765e-05, + "loss": 0.82001352, + "num_input_tokens_seen": 373962112, + "router_z_loss_mlp": 0.28393555, + "step": 4511, + "time_per_iteration": 2.756804943084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064399, + "balance_loss_mlp": 1.03576529, + "epoch": 0.8680261639091958, + "flos": 507270864384.0, + "grad_norm": 0.06943904903889057, + "language_loss": 0.79382515, + "learning_rate": 4.4984201495445744e-05, + "loss": 0.80446917, + "num_input_tokens_seen": 374028256, + "router_z_loss_mlp": 0.28613281, + "step": 4512, + "time_per_iteration": 2.5936288833618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066971, + "balance_loss_mlp": 1.03824139, + "epoch": 0.8682185455944594, + "flos": 486871990272.0, + "grad_norm": 0.05745344948747144, + "language_loss": 0.80955023, + "learning_rate": 4.4855143426423275e-05, + "loss": 0.82021987, + "num_input_tokens_seen": 374100080, + "router_z_loss_mlp": 0.28710938, + "step": 4513, + "time_per_iteration": 2.6303818225860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061644, + "balance_loss_mlp": 1.03324854, + "epoch": 0.868410927279723, + "flos": 603413217792.0, + "grad_norm": 0.0600055800011045, + "language_loss": 0.80860162, + "learning_rate": 4.472626206030528e-05, + "loss": 0.8192181, + "num_input_tokens_seen": 374174368, + "router_z_loss_mlp": 0.28417969, + "step": 4514, + "time_per_iteration": 2.6981005668640137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065518, + "balance_loss_mlp": 1.03612089, + "epoch": 0.8686033089649865, + "flos": 1118552772096.0, + "grad_norm": 0.07859483635461387, + "language_loss": 0.8481617, + "learning_rate": 4.4597557447127846e-05, + "loss": 0.85881692, + "num_input_tokens_seen": 374257328, + "router_z_loss_mlp": 0.29370117, + "step": 4515, + "time_per_iteration": 3.3843672275543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106426, + "balance_loss_mlp": 1.03491068, + "epoch": 0.8687956906502501, + "flos": 567750131712.0, + "grad_norm": 0.06750496140695705, + "language_loss": 0.83204174, + "learning_rate": 4.446902963685862e-05, + "loss": 0.84268439, + "num_input_tokens_seen": 374327936, + "router_z_loss_mlp": 0.29321289, + "step": 4516, + "time_per_iteration": 2.724592447280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066807, + "balance_loss_mlp": 1.03798199, + "epoch": 0.8689880723355137, + "flos": 544071734784.0, + "grad_norm": 0.055889262061819295, + "language_loss": 0.8429358, + "learning_rate": 4.4340678679396454e-05, + "loss": 0.85360384, + "num_input_tokens_seen": 374400496, + "router_z_loss_mlp": 0.28833008, + "step": 4517, + "time_per_iteration": 2.689141035079956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062359, + "balance_loss_mlp": 1.0338918, + "epoch": 0.8691804540207773, + "flos": 457185987072.0, + "grad_norm": 0.05116361101584782, + "language_loss": 0.86430299, + "learning_rate": 4.4212504624571495e-05, + "loss": 0.87492657, + "num_input_tokens_seen": 374470528, + "router_z_loss_mlp": 0.28466797, + "step": 4518, + "time_per_iteration": 2.6708133220672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067852, + "balance_loss_mlp": 1.03909898, + "epoch": 0.8693728357060407, + "flos": 591591472128.0, + "grad_norm": 0.055626041012955325, + "language_loss": 0.79863721, + "learning_rate": 4.40845075221456e-05, + "loss": 0.80931574, + "num_input_tokens_seen": 374542656, + "router_z_loss_mlp": 0.28735352, + "step": 4519, + "time_per_iteration": 2.6947073936462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063641, + "balance_loss_mlp": 1.03498292, + "epoch": 0.8695652173913043, + "flos": 679949655552.0, + "grad_norm": 0.0650046136300286, + "language_loss": 0.79432595, + "learning_rate": 4.395668742181164e-05, + "loss": 0.8049624, + "num_input_tokens_seen": 374617232, + "router_z_loss_mlp": 0.28662109, + "step": 4520, + "time_per_iteration": 2.923346519470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065559, + "balance_loss_mlp": 1.03740191, + "epoch": 0.8697575990765679, + "flos": 492120133632.0, + "grad_norm": 0.06406228380921414, + "language_loss": 0.78534073, + "learning_rate": 4.38290443731934e-05, + "loss": 0.79599631, + "num_input_tokens_seen": 374681888, + "router_z_loss_mlp": 0.28149414, + "step": 4521, + "time_per_iteration": 2.5783751010894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066142, + "balance_loss_mlp": 1.03819942, + "epoch": 0.8699499807618315, + "flos": 526690515456.0, + "grad_norm": 0.06561086282942073, + "language_loss": 0.8186453, + "learning_rate": 4.370157842584671e-05, + "loss": 0.82930666, + "num_input_tokens_seen": 374750464, + "router_z_loss_mlp": 0.27954102, + "step": 4522, + "time_per_iteration": 2.690821647644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066904, + "balance_loss_mlp": 1.03915191, + "epoch": 0.8701423624470951, + "flos": 813981201408.0, + "grad_norm": 0.0550322760692221, + "language_loss": 0.79950142, + "learning_rate": 4.357428962925808e-05, + "loss": 0.81017047, + "num_input_tokens_seen": 374836064, + "router_z_loss_mlp": 0.27783203, + "step": 4523, + "time_per_iteration": 3.158989191055298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065303, + "balance_loss_mlp": 1.03633487, + "epoch": 0.8703347441323586, + "flos": 556519113216.0, + "grad_norm": 0.052059598956666925, + "language_loss": 0.88351029, + "learning_rate": 4.344717803284542e-05, + "loss": 0.89416325, + "num_input_tokens_seen": 374903392, + "router_z_loss_mlp": 0.28979492, + "step": 4524, + "time_per_iteration": 2.662280559539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068125, + "balance_loss_mlp": 1.03982425, + "epoch": 0.8705271258176221, + "flos": 585151699968.0, + "grad_norm": 0.06832788170157987, + "language_loss": 0.84436864, + "learning_rate": 4.3320243685957825e-05, + "loss": 0.85504991, + "num_input_tokens_seen": 374985904, + "router_z_loss_mlp": 0.28295898, + "step": 4525, + "time_per_iteration": 2.825425863265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060384, + "balance_loss_mlp": 1.03210771, + "epoch": 0.8707195075028857, + "flos": 668896137216.0, + "grad_norm": 0.05033137477703021, + "language_loss": 0.85244215, + "learning_rate": 4.3193486637875536e-05, + "loss": 0.86304605, + "num_input_tokens_seen": 375062992, + "router_z_loss_mlp": 0.28271484, + "step": 4526, + "time_per_iteration": 2.86771297454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068825, + "balance_loss_mlp": 1.03997612, + "epoch": 0.8709118891881493, + "flos": 520122705408.0, + "grad_norm": 0.055440044956439165, + "language_loss": 0.83684516, + "learning_rate": 4.306690693781007e-05, + "loss": 0.8475334, + "num_input_tokens_seen": 375139296, + "router_z_loss_mlp": 0.28833008, + "step": 4527, + "time_per_iteration": 2.7739171981811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064105, + "balance_loss_mlp": 1.03594756, + "epoch": 0.8711042708734128, + "flos": 552944226816.0, + "grad_norm": 0.06369806188789202, + "language_loss": 0.8152144, + "learning_rate": 4.294050463490401e-05, + "loss": 0.82585543, + "num_input_tokens_seen": 375206576, + "router_z_loss_mlp": 0.28149414, + "step": 4528, + "time_per_iteration": 2.632436513900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069291, + "balance_loss_mlp": 1.04079986, + "epoch": 0.8712966525586764, + "flos": 501933970944.0, + "grad_norm": 0.06599031197153508, + "language_loss": 0.82279682, + "learning_rate": 4.281427977823094e-05, + "loss": 0.83348972, + "num_input_tokens_seen": 375279008, + "router_z_loss_mlp": 0.28491211, + "step": 4529, + "time_per_iteration": 2.7143640518188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066133, + "balance_loss_mlp": 1.03783274, + "epoch": 0.87148903424394, + "flos": 803739580416.0, + "grad_norm": 0.05606476399314808, + "language_loss": 0.73884034, + "learning_rate": 4.268823241679593e-05, + "loss": 0.74950159, + "num_input_tokens_seen": 375368512, + "router_z_loss_mlp": 0.28320312, + "step": 4530, + "time_per_iteration": 3.0463168621063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068204, + "balance_loss_mlp": 1.03940248, + "epoch": 0.8716814159292036, + "flos": 773088910848.0, + "grad_norm": 0.04934837250946328, + "language_loss": 0.85875851, + "learning_rate": 4.256236259953489e-05, + "loss": 0.86944056, + "num_input_tokens_seen": 375450528, + "router_z_loss_mlp": 0.2878418, + "step": 4531, + "time_per_iteration": 3.0251410007476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068315, + "balance_loss_mlp": 1.03944206, + "epoch": 0.8718737976144671, + "flos": 486595565568.0, + "grad_norm": 0.0657223096896028, + "language_loss": 0.84869027, + "learning_rate": 4.243667037531468e-05, + "loss": 0.85937339, + "num_input_tokens_seen": 375518256, + "router_z_loss_mlp": 0.28857422, + "step": 4532, + "time_per_iteration": 2.6163856983184814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061697, + "balance_loss_mlp": 1.03339648, + "epoch": 0.8720661792997306, + "flos": 583850972160.0, + "grad_norm": 0.05979867957502993, + "language_loss": 0.78658783, + "learning_rate": 4.2311155792933264e-05, + "loss": 0.79720485, + "num_input_tokens_seen": 375588112, + "router_z_loss_mlp": 0.28344727, + "step": 4533, + "time_per_iteration": 2.747187614440918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100617, + "balance_loss_mlp": 0.99477404, + "epoch": 0.8722585609849942, + "flos": 1495180560384.0, + "grad_norm": 0.008421507852118055, + "language_loss": 0.80966806, + "learning_rate": 4.2185818901119946e-05, + "loss": 0.81972969, + "num_input_tokens_seen": 375814496, + "router_z_loss_mlp": 0.11376953, + "step": 4534, + "time_per_iteration": 4.8016557693481445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065965, + "balance_loss_mlp": 1.03764045, + "epoch": 0.8724509426702578, + "flos": 595885123584.0, + "grad_norm": 0.0532252497000328, + "language_loss": 0.86752987, + "learning_rate": 4.206065974853479e-05, + "loss": 0.87818944, + "num_input_tokens_seen": 375885440, + "router_z_loss_mlp": 0.28320312, + "step": 4535, + "time_per_iteration": 2.77604603767395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066314, + "balance_loss_mlp": 1.03722727, + "epoch": 0.8726433243555214, + "flos": 443408767488.0, + "grad_norm": 0.3308825948130969, + "language_loss": 0.80913717, + "learning_rate": 4.193567838376888e-05, + "loss": 0.81980032, + "num_input_tokens_seen": 375952640, + "router_z_loss_mlp": 0.29052734, + "step": 4536, + "time_per_iteration": 2.5680594444274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061952, + "balance_loss_mlp": 1.03381848, + "epoch": 0.8728357060407849, + "flos": 552919495680.0, + "grad_norm": 0.08036350588218866, + "language_loss": 0.82172877, + "learning_rate": 4.181087485534402e-05, + "loss": 0.83234823, + "num_input_tokens_seen": 376021648, + "router_z_loss_mlp": 0.28149414, + "step": 4537, + "time_per_iteration": 2.6538937091827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063406, + "balance_loss_mlp": 1.03527319, + "epoch": 0.8730280877260485, + "flos": 627506251776.0, + "grad_norm": 0.12203372842904991, + "language_loss": 0.78675759, + "learning_rate": 4.16862492117136e-05, + "loss": 0.79739171, + "num_input_tokens_seen": 376102304, + "router_z_loss_mlp": 0.28149414, + "step": 4538, + "time_per_iteration": 2.832740306854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065695, + "balance_loss_mlp": 1.03718042, + "epoch": 0.873220469411312, + "flos": 535106110464.0, + "grad_norm": 0.0606145940241532, + "language_loss": 0.80030394, + "learning_rate": 4.156180150126143e-05, + "loss": 0.81096089, + "num_input_tokens_seen": 376177072, + "router_z_loss_mlp": 0.28540039, + "step": 4539, + "time_per_iteration": 2.7213377952575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065902, + "balance_loss_mlp": 1.03745842, + "epoch": 0.8734128510965756, + "flos": 561605723136.0, + "grad_norm": 0.07538210093780918, + "language_loss": 0.84122992, + "learning_rate": 4.143753177230242e-05, + "loss": 0.8518889, + "num_input_tokens_seen": 376251376, + "router_z_loss_mlp": 0.28442383, + "step": 4540, + "time_per_iteration": 2.6960151195526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063066, + "balance_loss_mlp": 1.03531361, + "epoch": 0.8736052327818392, + "flos": 686134761984.0, + "grad_norm": 0.05595611742808352, + "language_loss": 0.79501259, + "learning_rate": 4.131344007308224e-05, + "loss": 0.8056432, + "num_input_tokens_seen": 376337104, + "router_z_loss_mlp": 0.27783203, + "step": 4541, + "time_per_iteration": 3.0171802043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106403, + "balance_loss_mlp": 1.03544354, + "epoch": 0.8737976144671027, + "flos": 531384247296.0, + "grad_norm": 0.0683699884933183, + "language_loss": 0.81357133, + "learning_rate": 4.1189526451777816e-05, + "loss": 0.8242116, + "num_input_tokens_seen": 376415456, + "router_z_loss_mlp": 0.28564453, + "step": 4542, + "time_per_iteration": 2.805901527404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062407, + "balance_loss_mlp": 1.03434491, + "epoch": 0.8739899961523663, + "flos": 575308749312.0, + "grad_norm": 0.06249925001654303, + "language_loss": 0.81543392, + "learning_rate": 4.106579095649649e-05, + "loss": 0.82605791, + "num_input_tokens_seen": 376494880, + "router_z_loss_mlp": 0.28051758, + "step": 4543, + "time_per_iteration": 2.851834774017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065841, + "balance_loss_mlp": 1.03773165, + "epoch": 0.8741823778376299, + "flos": 731009373696.0, + "grad_norm": 0.06977869245266767, + "language_loss": 0.76428318, + "learning_rate": 4.094223363527666e-05, + "loss": 0.77494162, + "num_input_tokens_seen": 376571760, + "router_z_loss_mlp": 0.28125, + "step": 4544, + "time_per_iteration": 2.925771713256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066026, + "balance_loss_mlp": 1.03803515, + "epoch": 0.8743747595228935, + "flos": 566795639808.0, + "grad_norm": 0.07306890014877584, + "language_loss": 0.83605403, + "learning_rate": 4.081885453608747e-05, + "loss": 0.84671426, + "num_input_tokens_seen": 376644464, + "router_z_loss_mlp": 0.2800293, + "step": 4545, + "time_per_iteration": 2.7709672451019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065089, + "balance_loss_mlp": 1.03702736, + "epoch": 0.8745671412081569, + "flos": 493115323392.0, + "grad_norm": 0.06204561243136525, + "language_loss": 0.82155466, + "learning_rate": 4.0695653706829095e-05, + "loss": 0.83220559, + "num_input_tokens_seen": 376709584, + "router_z_loss_mlp": 0.28076172, + "step": 4546, + "time_per_iteration": 2.565824270248413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063172, + "balance_loss_mlp": 1.03525329, + "epoch": 0.8747595228934205, + "flos": 523883856384.0, + "grad_norm": 0.05603700784394243, + "language_loss": 0.83347672, + "learning_rate": 4.057263119533233e-05, + "loss": 0.84410846, + "num_input_tokens_seen": 376779472, + "router_z_loss_mlp": 0.27929688, + "step": 4547, + "time_per_iteration": 2.639770746231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067003, + "balance_loss_mlp": 1.03803515, + "epoch": 0.8749519045786841, + "flos": 743999427072.0, + "grad_norm": 0.061070440855238696, + "language_loss": 0.79543126, + "learning_rate": 4.044978704935853e-05, + "loss": 0.80610132, + "num_input_tokens_seen": 376863408, + "router_z_loss_mlp": 0.28955078, + "step": 4548, + "time_per_iteration": 3.0035946369171143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067764, + "balance_loss_mlp": 1.04046547, + "epoch": 0.8751442862639477, + "flos": 594003843072.0, + "grad_norm": 0.05314972905968755, + "language_loss": 0.79939222, + "learning_rate": 4.032712131660027e-05, + "loss": 0.8100698, + "num_input_tokens_seen": 376942080, + "router_z_loss_mlp": 0.2734375, + "step": 4549, + "time_per_iteration": 2.8230674266815186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072257, + "balance_loss_mlp": 1.04369426, + "epoch": 0.8753366679492113, + "flos": 496285747200.0, + "grad_norm": 0.05669282479345713, + "language_loss": 0.78479946, + "learning_rate": 4.020463404468055e-05, + "loss": 0.79552203, + "num_input_tokens_seen": 377015696, + "router_z_loss_mlp": 0.28564453, + "step": 4550, + "time_per_iteration": 2.7423791885375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066403, + "balance_loss_mlp": 1.03803086, + "epoch": 0.8755290496344748, + "flos": 489619012608.0, + "grad_norm": 0.06250704180116129, + "language_loss": 0.81786513, + "learning_rate": 4.0082325281153074e-05, + "loss": 0.82852924, + "num_input_tokens_seen": 377081424, + "router_z_loss_mlp": 0.28344727, + "step": 4551, + "time_per_iteration": 2.567431688308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068494, + "balance_loss_mlp": 1.04014564, + "epoch": 0.8757214313197383, + "flos": 591557976576.0, + "grad_norm": 0.06565865323727363, + "language_loss": 0.81568277, + "learning_rate": 3.9960195073502345e-05, + "loss": 0.82636774, + "num_input_tokens_seen": 377159360, + "router_z_loss_mlp": 0.28344727, + "step": 4552, + "time_per_iteration": 2.8340234756469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065547, + "balance_loss_mlp": 1.03698468, + "epoch": 0.8759138130050019, + "flos": 976456249344.0, + "grad_norm": 0.06629709551141487, + "language_loss": 0.78052419, + "learning_rate": 3.9838243469143555e-05, + "loss": 0.79117966, + "num_input_tokens_seen": 377240704, + "router_z_loss_mlp": 0.28540039, + "step": 4553, + "time_per_iteration": 3.2071568965911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065752, + "balance_loss_mlp": 1.0381906, + "epoch": 0.8761061946902655, + "flos": 802405357056.0, + "grad_norm": 0.05351335594773902, + "language_loss": 0.77677572, + "learning_rate": 3.971647051542243e-05, + "loss": 0.78743327, + "num_input_tokens_seen": 377324176, + "router_z_loss_mlp": 0.27612305, + "step": 4554, + "time_per_iteration": 3.0603485107421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067851, + "balance_loss_mlp": 1.04000342, + "epoch": 0.8762985763755291, + "flos": 698158738944.0, + "grad_norm": 0.05819539441060988, + "language_loss": 0.74314624, + "learning_rate": 3.95948762596155e-05, + "loss": 0.75382471, + "num_input_tokens_seen": 377403440, + "router_z_loss_mlp": 0.27856445, + "step": 4555, + "time_per_iteration": 2.964545249938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010664, + "balance_loss_mlp": 1.03843403, + "epoch": 0.8764909580607926, + "flos": 629416645632.0, + "grad_norm": 0.057392192725221856, + "language_loss": 0.80310047, + "learning_rate": 3.9473460748929765e-05, + "loss": 0.81376451, + "num_input_tokens_seen": 377483440, + "router_z_loss_mlp": 0.27978516, + "step": 4556, + "time_per_iteration": 2.91851806640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106548, + "balance_loss_mlp": 1.03787112, + "epoch": 0.8766833397460562, + "flos": 481297959936.0, + "grad_norm": 0.05571794139590596, + "language_loss": 0.80284274, + "learning_rate": 3.935222403050304e-05, + "loss": 0.81349754, + "num_input_tokens_seen": 377554688, + "router_z_loss_mlp": 0.27636719, + "step": 4557, + "time_per_iteration": 2.686192274093628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067446, + "balance_loss_mlp": 1.03878832, + "epoch": 0.8768757214313198, + "flos": 407514336768.0, + "grad_norm": 0.06835264680371789, + "language_loss": 0.78205043, + "learning_rate": 3.923116615140354e-05, + "loss": 0.79272485, + "num_input_tokens_seen": 377617616, + "router_z_loss_mlp": 0.28662109, + "step": 4558, + "time_per_iteration": 2.441237211227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106727, + "balance_loss_mlp": 1.03861248, + "epoch": 0.8770681031165833, + "flos": 582314517504.0, + "grad_norm": 0.057418492817462405, + "language_loss": 0.8179571, + "learning_rate": 3.9110287158630076e-05, + "loss": 0.82862979, + "num_input_tokens_seen": 377685888, + "router_z_loss_mlp": 0.28637695, + "step": 4559, + "time_per_iteration": 2.6915462017059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069669, + "balance_loss_mlp": 1.04122567, + "epoch": 0.8772604848018468, + "flos": 508437762048.0, + "grad_norm": 0.05352883186200444, + "language_loss": 0.80551112, + "learning_rate": 3.8989587099111875e-05, + "loss": 0.81620783, + "num_input_tokens_seen": 377755744, + "router_z_loss_mlp": 0.28442383, + "step": 4560, + "time_per_iteration": 2.67244029045105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068433, + "balance_loss_mlp": 1.04027581, + "epoch": 0.8774528664871104, + "flos": 408617215488.0, + "grad_norm": 0.06358979412347537, + "language_loss": 0.84776622, + "learning_rate": 3.886906601970913e-05, + "loss": 0.85845059, + "num_input_tokens_seen": 377818880, + "router_z_loss_mlp": 0.28173828, + "step": 4561, + "time_per_iteration": 2.491192102432251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069485, + "balance_loss_mlp": 1.04161429, + "epoch": 0.877645248172374, + "flos": 500589573120.0, + "grad_norm": 0.06737162506432262, + "language_loss": 0.83147556, + "learning_rate": 3.8748723967212184e-05, + "loss": 0.84217036, + "num_input_tokens_seen": 377893280, + "router_z_loss_mlp": 0.27880859, + "step": 4562, + "time_per_iteration": 2.684629440307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066768, + "balance_loss_mlp": 1.03861117, + "epoch": 0.8778376298576376, + "flos": 632857701888.0, + "grad_norm": 0.059369689749274944, + "language_loss": 0.78097963, + "learning_rate": 3.862856098834189e-05, + "loss": 0.79164732, + "num_input_tokens_seen": 377972912, + "router_z_loss_mlp": 0.28173828, + "step": 4563, + "time_per_iteration": 2.8923280239105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072367, + "balance_loss_mlp": 1.04442441, + "epoch": 0.8780300115429012, + "flos": 533707868160.0, + "grad_norm": 0.05558389562769292, + "language_loss": 0.80053449, + "learning_rate": 3.850857712974976e-05, + "loss": 0.81125814, + "num_input_tokens_seen": 378054000, + "router_z_loss_mlp": 0.27954102, + "step": 4564, + "time_per_iteration": 2.823686361312866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069649, + "balance_loss_mlp": 1.04175413, + "epoch": 0.8782223932281646, + "flos": 511411746816.0, + "grad_norm": 0.05672637464801372, + "language_loss": 0.7727713, + "learning_rate": 3.838877243801758e-05, + "loss": 0.78346777, + "num_input_tokens_seen": 378120336, + "router_z_loss_mlp": 0.27929688, + "step": 4565, + "time_per_iteration": 2.5881996154785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010687, + "balance_loss_mlp": 1.04049492, + "epoch": 0.8784147749134282, + "flos": 780333225984.0, + "grad_norm": 0.05732086037838532, + "language_loss": 0.69910938, + "learning_rate": 3.826914695965766e-05, + "loss": 0.70979643, + "num_input_tokens_seen": 378216672, + "router_z_loss_mlp": 0.28222656, + "step": 4566, + "time_per_iteration": 3.1945879459381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066133, + "balance_loss_mlp": 1.03730834, + "epoch": 0.8786071565986918, + "flos": 560738571264.0, + "grad_norm": 0.06580168201373691, + "language_loss": 0.75722879, + "learning_rate": 3.814970074111279e-05, + "loss": 0.76789016, + "num_input_tokens_seen": 378287536, + "router_z_loss_mlp": 0.28808594, + "step": 4567, + "time_per_iteration": 2.7322754859924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070458, + "balance_loss_mlp": 1.04337335, + "epoch": 0.8787995382839554, + "flos": 603148377600.0, + "grad_norm": 0.05172796640220285, + "language_loss": 0.77077734, + "learning_rate": 3.8030433828755926e-05, + "loss": 0.78148192, + "num_input_tokens_seen": 378362128, + "router_z_loss_mlp": 0.27148438, + "step": 4568, + "time_per_iteration": 2.776970386505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068428, + "balance_loss_mlp": 1.04062867, + "epoch": 0.8789919199692189, + "flos": 559970343936.0, + "grad_norm": 0.059324275843292064, + "language_loss": 0.84837639, + "learning_rate": 3.7911346268890924e-05, + "loss": 0.85906065, + "num_input_tokens_seen": 378435696, + "router_z_loss_mlp": 0.27832031, + "step": 4569, + "time_per_iteration": 2.6863996982574463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065069, + "balance_loss_mlp": 1.03810334, + "epoch": 0.8791843016544825, + "flos": 538857086976.0, + "grad_norm": 0.07165107779737093, + "language_loss": 0.81886643, + "learning_rate": 3.7792438107751405e-05, + "loss": 0.82951707, + "num_input_tokens_seen": 378505664, + "router_z_loss_mlp": 0.27026367, + "step": 4570, + "time_per_iteration": 2.611616611480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071174, + "balance_loss_mlp": 1.04273033, + "epoch": 0.8793766833397461, + "flos": 1008275226624.0, + "grad_norm": 0.0558494404755544, + "language_loss": 0.79366511, + "learning_rate": 3.767370939150167e-05, + "loss": 0.80437684, + "num_input_tokens_seen": 378598016, + "router_z_loss_mlp": 0.28442383, + "step": 4571, + "time_per_iteration": 3.3873000144958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073047, + "balance_loss_mlp": 1.04481804, + "epoch": 0.8795690650250096, + "flos": 678320068608.0, + "grad_norm": 0.056063442839823466, + "language_loss": 0.80823278, + "learning_rate": 3.755516016623628e-05, + "loss": 0.81896329, + "num_input_tokens_seen": 378676176, + "router_z_loss_mlp": 0.28222656, + "step": 4572, + "time_per_iteration": 2.893048048019409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066399, + "balance_loss_mlp": 1.03793228, + "epoch": 0.8797614467102732, + "flos": 453202255872.0, + "grad_norm": 0.06304464607757537, + "language_loss": 0.88333166, + "learning_rate": 3.7436790477980157e-05, + "loss": 0.89399564, + "num_input_tokens_seen": 378737952, + "router_z_loss_mlp": 0.28442383, + "step": 4573, + "time_per_iteration": 2.5377988815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071187, + "balance_loss_mlp": 1.04367304, + "epoch": 0.8799538283955367, + "flos": 550649719296.0, + "grad_norm": 0.05552176218492619, + "language_loss": 0.84267652, + "learning_rate": 3.7318600372688526e-05, + "loss": 0.85338843, + "num_input_tokens_seen": 378806704, + "router_z_loss_mlp": 0.27563477, + "step": 4574, + "time_per_iteration": 2.6662659645080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068687, + "balance_loss_mlp": 1.04053009, + "epoch": 0.8801462100808003, + "flos": 807072947712.0, + "grad_norm": 0.05728401921436289, + "language_loss": 0.83839577, + "learning_rate": 3.720058989624681e-05, + "loss": 0.84908265, + "num_input_tokens_seen": 378887616, + "router_z_loss_mlp": 0.28173828, + "step": 4575, + "time_per_iteration": 3.076876640319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070952, + "balance_loss_mlp": 1.04296148, + "epoch": 0.8803385917660639, + "flos": 768366065664.0, + "grad_norm": 0.0517828102810866, + "language_loss": 0.84589469, + "learning_rate": 3.708275909447079e-05, + "loss": 0.85660422, + "num_input_tokens_seen": 378964656, + "router_z_loss_mlp": 0.28027344, + "step": 4576, + "time_per_iteration": 2.9635534286499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068372, + "balance_loss_mlp": 1.04016733, + "epoch": 0.8805309734513275, + "flos": 567070654464.0, + "grad_norm": 0.053989075143044706, + "language_loss": 0.81054318, + "learning_rate": 3.696510801310632e-05, + "loss": 0.82122689, + "num_input_tokens_seen": 379036752, + "router_z_loss_mlp": 0.28186035, + "step": 4577, + "time_per_iteration": 2.752592086791992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069025, + "balance_loss_mlp": 1.04008079, + "epoch": 0.880723355136591, + "flos": 679481174016.0, + "grad_norm": 0.06232126145742502, + "language_loss": 0.81594551, + "learning_rate": 3.6847636697829755e-05, + "loss": 0.82663572, + "num_input_tokens_seen": 379106480, + "router_z_loss_mlp": 0.28979492, + "step": 4578, + "time_per_iteration": 2.814424991607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107019, + "balance_loss_mlp": 1.04248548, + "epoch": 0.8809157368218545, + "flos": 565347935232.0, + "grad_norm": 0.0557636314762692, + "language_loss": 0.78824782, + "learning_rate": 3.673034519424734e-05, + "loss": 0.79894972, + "num_input_tokens_seen": 379182544, + "router_z_loss_mlp": 0.27734375, + "step": 4579, + "time_per_iteration": 2.785956382751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071039, + "balance_loss_mlp": 1.04309607, + "epoch": 0.8811081185071181, + "flos": 515153958912.0, + "grad_norm": 0.05030651493634772, + "language_loss": 0.75700289, + "learning_rate": 3.661323354789586e-05, + "loss": 0.76771331, + "num_input_tokens_seen": 379255856, + "router_z_loss_mlp": 0.27954102, + "step": 4580, + "time_per_iteration": 2.6824047565460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068401, + "balance_loss_mlp": 1.04019618, + "epoch": 0.8813005001923817, + "flos": 594067862016.0, + "grad_norm": 0.07015298891450013, + "language_loss": 0.8114329, + "learning_rate": 3.649630180424191e-05, + "loss": 0.82211691, + "num_input_tokens_seen": 379322704, + "router_z_loss_mlp": 0.28198242, + "step": 4581, + "time_per_iteration": 2.7086069583892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062221, + "balance_loss_mlp": 1.03425419, + "epoch": 0.8814928818776453, + "flos": 666630743040.0, + "grad_norm": 0.05665802928284555, + "language_loss": 0.79123235, + "learning_rate": 3.637955000868254e-05, + "loss": 0.80185449, + "num_input_tokens_seen": 379395008, + "router_z_loss_mlp": 0.27978516, + "step": 4582, + "time_per_iteration": 2.8371450901031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071423, + "balance_loss_mlp": 1.04393387, + "epoch": 0.8816852635629088, + "flos": 608873766912.0, + "grad_norm": 0.054118790146548024, + "language_loss": 0.8546508, + "learning_rate": 3.626297820654467e-05, + "loss": 0.86536503, + "num_input_tokens_seen": 379465824, + "router_z_loss_mlp": 0.27514648, + "step": 4583, + "time_per_iteration": 2.717241048812866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067589, + "balance_loss_mlp": 1.03990829, + "epoch": 0.8818776452481724, + "flos": 480131062272.0, + "grad_norm": 0.05987245648604073, + "language_loss": 0.81967342, + "learning_rate": 3.614658644308572e-05, + "loss": 0.83034927, + "num_input_tokens_seen": 379534960, + "router_z_loss_mlp": 0.27709961, + "step": 4584, + "time_per_iteration": 2.6413609981536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071593, + "balance_loss_mlp": 1.04243433, + "epoch": 0.882070026933436, + "flos": 1044985936896.0, + "grad_norm": 0.05789017209637249, + "language_loss": 0.73687112, + "learning_rate": 3.60303747634928e-05, + "loss": 0.74758708, + "num_input_tokens_seen": 379617456, + "router_z_loss_mlp": 0.29125977, + "step": 4585, + "time_per_iteration": 3.304685592651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066651, + "balance_loss_mlp": 1.03928089, + "epoch": 0.8822624086186995, + "flos": 474153979392.0, + "grad_norm": 0.054265855941406795, + "language_loss": 0.79589009, + "learning_rate": 3.591434321288345e-05, + "loss": 0.80655658, + "num_input_tokens_seen": 379687792, + "router_z_loss_mlp": 0.27441406, + "step": 4586, + "time_per_iteration": 2.697514533996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067914, + "balance_loss_mlp": 1.04042411, + "epoch": 0.882454790303963, + "flos": 653725057536.0, + "grad_norm": 0.06096374939952472, + "language_loss": 0.81569088, + "learning_rate": 3.579849183630485e-05, + "loss": 0.82637, + "num_input_tokens_seen": 379761120, + "router_z_loss_mlp": 0.27514648, + "step": 4587, + "time_per_iteration": 2.8024706840515137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063916, + "balance_loss_mlp": 1.03544879, + "epoch": 0.8826471719892266, + "flos": 470081498112.0, + "grad_norm": 0.05869577114957185, + "language_loss": 0.78408635, + "learning_rate": 3.568282067873468e-05, + "loss": 0.79472554, + "num_input_tokens_seen": 379829008, + "router_z_loss_mlp": 0.28442383, + "step": 4588, + "time_per_iteration": 2.578707695007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068701, + "balance_loss_mlp": 1.04040098, + "epoch": 0.8828395536744902, + "flos": 468501373440.0, + "grad_norm": 0.05231035203284282, + "language_loss": 0.83738208, + "learning_rate": 3.556732978508048e-05, + "loss": 0.84806907, + "num_input_tokens_seen": 379899584, + "router_z_loss_mlp": 0.28295898, + "step": 4589, + "time_per_iteration": 2.68972110748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065866, + "balance_loss_mlp": 1.03809047, + "epoch": 0.8830319353597538, + "flos": 721044177408.0, + "grad_norm": 0.08332250868993829, + "language_loss": 0.81341159, + "learning_rate": 3.545201920017971e-05, + "loss": 0.82407022, + "num_input_tokens_seen": 379979440, + "router_z_loss_mlp": 0.27783203, + "step": 4590, + "time_per_iteration": 2.9407219886779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107042, + "balance_loss_mlp": 1.04295468, + "epoch": 0.8832243170450174, + "flos": 443049384960.0, + "grad_norm": 0.0678203863525127, + "language_loss": 0.81142139, + "learning_rate": 3.5336888968799996e-05, + "loss": 0.82212561, + "num_input_tokens_seen": 380046944, + "router_z_loss_mlp": 0.2746582, + "step": 4591, + "time_per_iteration": 2.568373680114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067355, + "balance_loss_mlp": 1.03934026, + "epoch": 0.8834166987302808, + "flos": 566293662720.0, + "grad_norm": 0.06220789514953692, + "language_loss": 0.81893933, + "learning_rate": 3.5221939135638756e-05, + "loss": 0.82961291, + "num_input_tokens_seen": 380118048, + "router_z_loss_mlp": 0.28027344, + "step": 4592, + "time_per_iteration": 2.756255626678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067482, + "balance_loss_mlp": 1.03975368, + "epoch": 0.8836090804155444, + "flos": 609022153728.0, + "grad_norm": 0.07096792150900852, + "language_loss": 0.81740928, + "learning_rate": 3.510716974532352e-05, + "loss": 0.82808411, + "num_input_tokens_seen": 380192416, + "router_z_loss_mlp": 0.27734375, + "step": 4593, + "time_per_iteration": 2.7616682052612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068676, + "balance_loss_mlp": 1.04020929, + "epoch": 0.883801462100808, + "flos": 556804302336.0, + "grad_norm": 0.06039187959757844, + "language_loss": 0.80636853, + "learning_rate": 3.4992580842411745e-05, + "loss": 0.81705528, + "num_input_tokens_seen": 380264432, + "router_z_loss_mlp": 0.28491211, + "step": 4594, + "time_per_iteration": 2.658634662628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068628, + "balance_loss_mlp": 1.03965974, + "epoch": 0.8839938437860716, + "flos": 515936742912.0, + "grad_norm": 0.07933366210250277, + "language_loss": 0.77274346, + "learning_rate": 3.487817247139064e-05, + "loss": 0.78342974, + "num_input_tokens_seen": 380334192, + "router_z_loss_mlp": 0.28955078, + "step": 4595, + "time_per_iteration": 2.599109649658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064003, + "balance_loss_mlp": 1.03620315, + "epoch": 0.8841862254713351, + "flos": 713386635264.0, + "grad_norm": 0.06401274650303065, + "language_loss": 0.7867049, + "learning_rate": 3.47639446766777e-05, + "loss": 0.79734492, + "num_input_tokens_seen": 380407504, + "router_z_loss_mlp": 0.27807617, + "step": 4596, + "time_per_iteration": 2.8454713821411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067855, + "balance_loss_mlp": 1.04062724, + "epoch": 0.8843786071565987, + "flos": 833626404864.0, + "grad_norm": 0.07003048981837431, + "language_loss": 0.82647777, + "learning_rate": 3.4649897502620095e-05, + "loss": 0.8371563, + "num_input_tokens_seen": 380486272, + "router_z_loss_mlp": 0.27270508, + "step": 4597, + "time_per_iteration": 3.039944887161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069696, + "balance_loss_mlp": 1.04196787, + "epoch": 0.8845709888418622, + "flos": 656562240000.0, + "grad_norm": 0.04759555258989633, + "language_loss": 0.82870215, + "learning_rate": 3.453603099349462e-05, + "loss": 0.8393991, + "num_input_tokens_seen": 380568480, + "router_z_loss_mlp": 0.27734375, + "step": 4598, + "time_per_iteration": 2.924360513687134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067102, + "balance_loss_mlp": 1.03937411, + "epoch": 0.8847633705271258, + "flos": 523038463488.0, + "grad_norm": 0.0554469987198936, + "language_loss": 0.81217462, + "learning_rate": 3.442234519350823e-05, + "loss": 0.82284564, + "num_input_tokens_seen": 380643088, + "router_z_loss_mlp": 0.27734375, + "step": 4599, + "time_per_iteration": 2.7385177612304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069138, + "balance_loss_mlp": 1.04188693, + "epoch": 0.8849557522123894, + "flos": 548330480640.0, + "grad_norm": 0.0620233262866808, + "language_loss": 0.84279031, + "learning_rate": 3.430884014679786e-05, + "loss": 0.85348165, + "num_input_tokens_seen": 380714512, + "router_z_loss_mlp": 0.27246094, + "step": 4600, + "time_per_iteration": 2.678050994873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069344, + "balance_loss_mlp": 1.04185414, + "epoch": 0.8851481338976529, + "flos": 622070433792.0, + "grad_norm": 0.051582270147677196, + "language_loss": 0.83688784, + "learning_rate": 3.4195515897429974e-05, + "loss": 0.84758127, + "num_input_tokens_seen": 380789168, + "router_z_loss_mlp": 0.27563477, + "step": 4601, + "time_per_iteration": 2.8480563163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067523, + "balance_loss_mlp": 1.03929448, + "epoch": 0.8853405155829165, + "flos": 444123150336.0, + "grad_norm": 0.056068366837892174, + "language_loss": 0.80678725, + "learning_rate": 3.408237248940088e-05, + "loss": 0.81746256, + "num_input_tokens_seen": 380856992, + "router_z_loss_mlp": 0.2824707, + "step": 4602, + "time_per_iteration": 2.5683131217956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065058, + "balance_loss_mlp": 1.03682971, + "epoch": 0.8855328972681801, + "flos": 730152396288.0, + "grad_norm": 0.05740540609560926, + "language_loss": 0.77796984, + "learning_rate": 3.396940996663683e-05, + "loss": 0.78862035, + "num_input_tokens_seen": 380930480, + "router_z_loss_mlp": 0.28222656, + "step": 4603, + "time_per_iteration": 2.897857666015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067098, + "balance_loss_mlp": 1.03936982, + "epoch": 0.8857252789534437, + "flos": 487132448256.0, + "grad_norm": 0.058129014822259635, + "language_loss": 0.79058081, + "learning_rate": 3.385662837299375e-05, + "loss": 0.80125177, + "num_input_tokens_seen": 380994192, + "router_z_loss_mlp": 0.27758789, + "step": 4604, + "time_per_iteration": 2.5698628425598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070359, + "balance_loss_mlp": 1.04284549, + "epoch": 0.8859176606387072, + "flos": 508290785280.0, + "grad_norm": 0.05786101716363267, + "language_loss": 0.81376195, + "learning_rate": 3.374402775225727e-05, + "loss": 0.82446557, + "num_input_tokens_seen": 381066848, + "router_z_loss_mlp": 0.27563477, + "step": 4605, + "time_per_iteration": 2.6911301612854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069989, + "balance_loss_mlp": 1.04142654, + "epoch": 0.8861100423239707, + "flos": 516370318848.0, + "grad_norm": 0.054307106950923195, + "language_loss": 0.85590959, + "learning_rate": 3.3631608148142925e-05, + "loss": 0.86660945, + "num_input_tokens_seen": 381138816, + "router_z_loss_mlp": 0.28588867, + "step": 4606, + "time_per_iteration": 2.6767466068267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107098, + "balance_loss_mlp": 1.04394376, + "epoch": 0.8863024240092343, + "flos": 626692944384.0, + "grad_norm": 0.0544909967817947, + "language_loss": 0.79524022, + "learning_rate": 3.3519369604295746e-05, + "loss": 0.80595005, + "num_input_tokens_seen": 381208448, + "router_z_loss_mlp": 0.27075195, + "step": 4607, + "time_per_iteration": 2.716878652572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069998, + "balance_loss_mlp": 1.0418644, + "epoch": 0.8864948056944979, + "flos": 766564770816.0, + "grad_norm": 0.1476541452919149, + "language_loss": 0.83357704, + "learning_rate": 3.340731216429083e-05, + "loss": 0.84427702, + "num_input_tokens_seen": 381289712, + "router_z_loss_mlp": 0.28173828, + "step": 4608, + "time_per_iteration": 2.9715864658355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021373, + "balance_loss_mlp": 1.00988162, + "epoch": 0.8866871873797615, + "flos": 1501500907008.0, + "grad_norm": 0.01167151488770453, + "language_loss": 0.78830957, + "learning_rate": 3.329543587163253e-05, + "loss": 0.79852331, + "num_input_tokens_seen": 381520848, + "router_z_loss_mlp": 0.11474609, + "step": 4609, + "time_per_iteration": 4.834856748580933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068118, + "balance_loss_mlp": 1.04015195, + "epoch": 0.886879569065025, + "flos": 811164367872.0, + "grad_norm": 0.08488688908533946, + "language_loss": 0.81698787, + "learning_rate": 3.3183740769755e-05, + "loss": 0.82766908, + "num_input_tokens_seen": 381603008, + "router_z_loss_mlp": 0.27978516, + "step": 4610, + "time_per_iteration": 3.034174680709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020022, + "balance_loss_mlp": 1.0085299, + "epoch": 0.8870719507502886, + "flos": 1581994934784.0, + "grad_norm": 0.010974826258400936, + "language_loss": 0.7691083, + "learning_rate": 3.307222690202238e-05, + "loss": 0.77930856, + "num_input_tokens_seen": 381844336, + "router_z_loss_mlp": 0.11474609, + "step": 4611, + "time_per_iteration": 4.9730494022369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069377, + "balance_loss_mlp": 1.04114866, + "epoch": 0.8872643324355521, + "flos": 633743792640.0, + "grad_norm": 0.0611353937593657, + "language_loss": 0.75024319, + "learning_rate": 3.296089431172811e-05, + "loss": 0.76093698, + "num_input_tokens_seen": 381918576, + "router_z_loss_mlp": 0.2824707, + "step": 4612, + "time_per_iteration": 2.746696710586548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069726, + "balance_loss_mlp": 1.04214144, + "epoch": 0.8874567141208157, + "flos": 535498988544.0, + "grad_norm": 0.06235030961125674, + "language_loss": 0.82855523, + "learning_rate": 3.284974304209532e-05, + "loss": 0.83925247, + "num_input_tokens_seen": 381987296, + "router_z_loss_mlp": 0.27636719, + "step": 4613, + "time_per_iteration": 2.637052536010742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067959, + "balance_loss_mlp": 1.03958726, + "epoch": 0.8876490958060793, + "flos": 1565700931584.0, + "grad_norm": 0.06402411256852786, + "language_loss": 0.7942912, + "learning_rate": 3.27387731362766e-05, + "loss": 0.8049708, + "num_input_tokens_seen": 382091744, + "router_z_loss_mlp": 0.28369141, + "step": 4614, + "time_per_iteration": 3.923633575439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067535, + "balance_loss_mlp": 1.03921044, + "epoch": 0.8878414774913428, + "flos": 636343838208.0, + "grad_norm": 0.05096135201935837, + "language_loss": 0.85021508, + "learning_rate": 3.2627984637354444e-05, + "loss": 0.86089039, + "num_input_tokens_seen": 382169600, + "router_z_loss_mlp": 0.28344727, + "step": 4615, + "time_per_iteration": 2.779921054840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063179, + "balance_loss_mlp": 1.03480697, + "epoch": 0.8880338591766064, + "flos": 496182440448.0, + "grad_norm": 0.06545341443379886, + "language_loss": 0.81585425, + "learning_rate": 3.251737758834084e-05, + "loss": 0.82648605, + "num_input_tokens_seen": 382238336, + "router_z_loss_mlp": 0.28393555, + "step": 4616, + "time_per_iteration": 2.616635322570801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071314, + "balance_loss_mlp": 1.04289412, + "epoch": 0.88822624086187, + "flos": 542599299072.0, + "grad_norm": 0.057599266414533334, + "language_loss": 0.79628587, + "learning_rate": 3.2406952032177086e-05, + "loss": 0.80699897, + "num_input_tokens_seen": 382308560, + "router_z_loss_mlp": 0.28393555, + "step": 4617, + "time_per_iteration": 2.6929566860198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065504, + "balance_loss_mlp": 1.03744173, + "epoch": 0.8884186225471336, + "flos": 551560541184.0, + "grad_norm": 0.06545285558813568, + "language_loss": 0.84187359, + "learning_rate": 3.229670801173418e-05, + "loss": 0.85252863, + "num_input_tokens_seen": 382377504, + "router_z_loss_mlp": 0.28076172, + "step": 4618, + "time_per_iteration": 2.689328908920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018165, + "balance_loss_mlp": 1.00662541, + "epoch": 0.888611004232397, + "flos": 1564417276416.0, + "grad_norm": 0.008722298990841466, + "language_loss": 0.78512192, + "learning_rate": 3.218664556981288e-05, + "loss": 0.79530358, + "num_input_tokens_seen": 382615728, + "router_z_loss_mlp": 0.11523438, + "step": 4619, + "time_per_iteration": 5.016630172729492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071208, + "balance_loss_mlp": 1.04338467, + "epoch": 0.8888033859176606, + "flos": 766678252032.0, + "grad_norm": 0.0582454521799534, + "language_loss": 0.82567924, + "learning_rate": 3.207676474914301e-05, + "loss": 0.83639133, + "num_input_tokens_seen": 382695552, + "router_z_loss_mlp": 0.27856445, + "step": 4620, + "time_per_iteration": 3.0133860111236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068087, + "balance_loss_mlp": 1.04040623, + "epoch": 0.8889957676029242, + "flos": 933727758336.0, + "grad_norm": 0.05884213021471634, + "language_loss": 0.83990335, + "learning_rate": 3.1967065592384105e-05, + "loss": 0.85058427, + "num_input_tokens_seen": 382775824, + "router_z_loss_mlp": 0.27758789, + "step": 4621, + "time_per_iteration": 3.167980670928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068617, + "balance_loss_mlp": 1.04069793, + "epoch": 0.8891881492881878, + "flos": 589317313536.0, + "grad_norm": 0.09170475766074285, + "language_loss": 0.81411701, + "learning_rate": 3.1857548142125104e-05, + "loss": 0.82480323, + "num_input_tokens_seen": 382854464, + "router_z_loss_mlp": 0.27954102, + "step": 4622, + "time_per_iteration": 2.7863264083862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067138, + "balance_loss_mlp": 1.03888595, + "epoch": 0.8893805309734514, + "flos": 540438621696.0, + "grad_norm": 0.06461743401993036, + "language_loss": 0.82403553, + "learning_rate": 3.174821244088466e-05, + "loss": 0.8347069, + "num_input_tokens_seen": 382925088, + "router_z_loss_mlp": 0.2824707, + "step": 4623, + "time_per_iteration": 2.731494903564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106871, + "balance_loss_mlp": 1.04057622, + "epoch": 0.8895729126587149, + "flos": 559827749376.0, + "grad_norm": 0.06194328064505052, + "language_loss": 0.81727606, + "learning_rate": 3.163905853111054e-05, + "loss": 0.82796311, + "num_input_tokens_seen": 382998640, + "router_z_loss_mlp": 0.28173828, + "step": 4624, + "time_per_iteration": 2.7517242431640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070888, + "balance_loss_mlp": 1.04284978, + "epoch": 0.8897652943439784, + "flos": 609873338880.0, + "grad_norm": 0.050549853414559504, + "language_loss": 0.8105303, + "learning_rate": 3.153008645517996e-05, + "loss": 0.82123923, + "num_input_tokens_seen": 383076000, + "router_z_loss_mlp": 0.28015137, + "step": 4625, + "time_per_iteration": 2.775944948196411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068366, + "balance_loss_mlp": 1.0394454, + "epoch": 0.889957676029242, + "flos": 917455209984.0, + "grad_norm": 0.06186267612969521, + "language_loss": 0.7697686, + "learning_rate": 3.142129625539969e-05, + "loss": 0.78045225, + "num_input_tokens_seen": 383166640, + "router_z_loss_mlp": 0.2890625, + "step": 4626, + "time_per_iteration": 3.221770763397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067678, + "balance_loss_mlp": 1.03944921, + "epoch": 0.8901500577145056, + "flos": 488452114944.0, + "grad_norm": 0.0559809171048545, + "language_loss": 0.80048203, + "learning_rate": 3.131268797400588e-05, + "loss": 0.81115878, + "num_input_tokens_seen": 383232928, + "router_z_loss_mlp": 0.28222656, + "step": 4627, + "time_per_iteration": 2.563779354095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068508, + "balance_loss_mlp": 1.04042268, + "epoch": 0.8903424393997691, + "flos": 733332994560.0, + "grad_norm": 0.0592114438847255, + "language_loss": 0.80764806, + "learning_rate": 3.120426165316398e-05, + "loss": 0.81833315, + "num_input_tokens_seen": 383314352, + "router_z_loss_mlp": 0.28125, + "step": 4628, + "time_per_iteration": 2.9863662719726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066862, + "balance_loss_mlp": 1.0390867, + "epoch": 0.8905348210850327, + "flos": 519546534912.0, + "grad_norm": 0.05729478499656057, + "language_loss": 0.81872827, + "learning_rate": 3.109601733496881e-05, + "loss": 0.8293969, + "num_input_tokens_seen": 383384848, + "router_z_loss_mlp": 0.27783203, + "step": 4629, + "time_per_iteration": 2.655174970626831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064198, + "balance_loss_mlp": 1.03639805, + "epoch": 0.8907272027702963, + "flos": 578672640000.0, + "grad_norm": 0.052285049581706246, + "language_loss": 0.79457366, + "learning_rate": 3.098795506144458e-05, + "loss": 0.8052156, + "num_input_tokens_seen": 383463360, + "router_z_loss_mlp": 0.27832031, + "step": 4630, + "time_per_iteration": 2.840730667114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067701, + "balance_loss_mlp": 1.04030657, + "epoch": 0.8909195844555599, + "flos": 893258869248.0, + "grad_norm": 0.059465272064999686, + "language_loss": 0.79709071, + "learning_rate": 3.088007487454475e-05, + "loss": 0.80776775, + "num_input_tokens_seen": 383542080, + "router_z_loss_mlp": 0.27441406, + "step": 4631, + "time_per_iteration": 3.1187219619750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070472, + "balance_loss_mlp": 1.0419575, + "epoch": 0.8911119661408234, + "flos": 549596302848.0, + "grad_norm": 0.05514247139292472, + "language_loss": 0.84210968, + "learning_rate": 3.077237681615208e-05, + "loss": 0.85281444, + "num_input_tokens_seen": 383613056, + "router_z_loss_mlp": 0.28540039, + "step": 4632, + "time_per_iteration": 2.695281505584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070837, + "balance_loss_mlp": 1.04258442, + "epoch": 0.8913043478260869, + "flos": 480884732928.0, + "grad_norm": 0.07098805305903529, + "language_loss": 0.83367896, + "learning_rate": 3.066486092807874e-05, + "loss": 0.84438735, + "num_input_tokens_seen": 383683280, + "router_z_loss_mlp": 0.2824707, + "step": 4633, + "time_per_iteration": 2.674928665161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067506, + "balance_loss_mlp": 1.03934908, + "epoch": 0.8914967295113505, + "flos": 484317024768.0, + "grad_norm": 0.05060462387255462, + "language_loss": 0.85151595, + "learning_rate": 3.055752725206601e-05, + "loss": 0.86219102, + "num_input_tokens_seen": 383754624, + "router_z_loss_mlp": 0.28149414, + "step": 4634, + "time_per_iteration": 2.6783857345581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069245, + "balance_loss_mlp": 1.04096866, + "epoch": 0.8916891111966141, + "flos": 445432642560.0, + "grad_norm": 0.05975693569548975, + "language_loss": 0.81291115, + "learning_rate": 3.0450375829784714e-05, + "loss": 0.82360363, + "num_input_tokens_seen": 383821984, + "router_z_loss_mlp": 0.28295898, + "step": 4635, + "time_per_iteration": 2.5965147018432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068237, + "balance_loss_mlp": 1.04027081, + "epoch": 0.8918814928818777, + "flos": 563751843840.0, + "grad_norm": 0.049564759273153264, + "language_loss": 0.78083277, + "learning_rate": 3.034340670283453e-05, + "loss": 0.79151511, + "num_input_tokens_seen": 383890880, + "router_z_loss_mlp": 0.27978516, + "step": 4636, + "time_per_iteration": 2.771043062210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067686, + "balance_loss_mlp": 1.03971982, + "epoch": 0.8920738745671412, + "flos": 575672514048.0, + "grad_norm": 0.06662483831427483, + "language_loss": 0.80982053, + "learning_rate": 3.0236619912744513e-05, + "loss": 0.82049739, + "num_input_tokens_seen": 383962480, + "router_z_loss_mlp": 0.2800293, + "step": 4637, + "time_per_iteration": 2.6836137771606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067244, + "balance_loss_mlp": 1.04018307, + "epoch": 0.8922662562524047, + "flos": 619898171904.0, + "grad_norm": 0.055977453987363854, + "language_loss": 0.84088302, + "learning_rate": 3.0130015500973163e-05, + "loss": 0.85155547, + "num_input_tokens_seen": 384033616, + "router_z_loss_mlp": 0.27124023, + "step": 4638, + "time_per_iteration": 2.7201523780822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066044, + "balance_loss_mlp": 1.0383637, + "epoch": 0.8924586379376683, + "flos": 583330056192.0, + "grad_norm": 0.06318368415584479, + "language_loss": 0.7920469, + "learning_rate": 3.0023593508907877e-05, + "loss": 0.80270731, + "num_input_tokens_seen": 384108848, + "router_z_loss_mlp": 0.27709961, + "step": 4639, + "time_per_iteration": 2.7709860801696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072181, + "balance_loss_mlp": 1.04509687, + "epoch": 0.8926510196229319, + "flos": 524922716160.0, + "grad_norm": 0.043960833871696636, + "language_loss": 0.81677014, + "learning_rate": 2.991735397786538e-05, + "loss": 0.827492, + "num_input_tokens_seen": 384185728, + "router_z_loss_mlp": 0.27148438, + "step": 4640, + "time_per_iteration": 2.8300883769989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068463, + "balance_loss_mlp": 1.04016221, + "epoch": 0.8928434013081955, + "flos": 486428239872.0, + "grad_norm": 0.06172673252481555, + "language_loss": 0.80732042, + "learning_rate": 2.981129694909146e-05, + "loss": 0.81800508, + "num_input_tokens_seen": 384251552, + "router_z_loss_mlp": 0.28320312, + "step": 4641, + "time_per_iteration": 2.5496692657470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015529, + "balance_loss_mlp": 1.00403714, + "epoch": 0.893035782993459, + "flos": 1447580837376.0, + "grad_norm": 0.007735138982934367, + "language_loss": 0.80330861, + "learning_rate": 2.970542246376118e-05, + "loss": 0.81346381, + "num_input_tokens_seen": 384472176, + "router_z_loss_mlp": 0.11474609, + "step": 4642, + "time_per_iteration": 4.7214789390563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071184, + "balance_loss_mlp": 1.04440916, + "epoch": 0.8932281646787226, + "flos": 611040236544.0, + "grad_norm": 0.06230768103154438, + "language_loss": 0.80826664, + "learning_rate": 2.95997305629786e-05, + "loss": 0.81897843, + "num_input_tokens_seen": 384544224, + "router_z_loss_mlp": 0.26794434, + "step": 4643, + "time_per_iteration": 2.776540756225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070751, + "balance_loss_mlp": 1.04316592, + "epoch": 0.8934205463639862, + "flos": 565494912000.0, + "grad_norm": 0.0560545196954126, + "language_loss": 0.84422594, + "learning_rate": 2.9494221287776957e-05, + "loss": 0.85493338, + "num_input_tokens_seen": 384611728, + "router_z_loss_mlp": 0.27636719, + "step": 4644, + "time_per_iteration": 2.64113450050354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068593, + "balance_loss_mlp": 1.04084074, + "epoch": 0.8936129280492497, + "flos": 488181482496.0, + "grad_norm": 0.07046093085577981, + "language_loss": 0.77728665, + "learning_rate": 2.9388894679118484e-05, + "loss": 0.78797263, + "num_input_tokens_seen": 384678048, + "router_z_loss_mlp": 0.27807617, + "step": 4645, + "time_per_iteration": 2.557194232940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070503, + "balance_loss_mlp": 1.04298949, + "epoch": 0.8938053097345132, + "flos": 886095949824.0, + "grad_norm": 0.05665952535342083, + "language_loss": 0.80841428, + "learning_rate": 2.9283750777894912e-05, + "loss": 0.81911927, + "num_input_tokens_seen": 384766768, + "router_z_loss_mlp": 0.27514648, + "step": 4646, + "time_per_iteration": 3.204979181289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069223, + "balance_loss_mlp": 1.04173374, + "epoch": 0.8939976914197768, + "flos": 592999888896.0, + "grad_norm": 0.057759153184874165, + "language_loss": 0.84277451, + "learning_rate": 2.9178789624926427e-05, + "loss": 0.85346675, + "num_input_tokens_seen": 384842352, + "router_z_loss_mlp": 0.27539062, + "step": 4647, + "time_per_iteration": 2.7343509197235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067768, + "balance_loss_mlp": 1.03944361, + "epoch": 0.8941900731050404, + "flos": 522983208960.0, + "grad_norm": 0.059380857059797024, + "language_loss": 0.80891001, + "learning_rate": 2.9074011260962706e-05, + "loss": 0.81958771, + "num_input_tokens_seen": 384912048, + "router_z_loss_mlp": 0.28320312, + "step": 4648, + "time_per_iteration": 2.6367506980895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066382, + "balance_loss_mlp": 1.03853464, + "epoch": 0.894382454790304, + "flos": 800247651840.0, + "grad_norm": 0.05523040639644092, + "language_loss": 0.81081837, + "learning_rate": 2.8969415726682158e-05, + "loss": 0.82148218, + "num_input_tokens_seen": 384986560, + "router_z_loss_mlp": 0.27856445, + "step": 4649, + "time_per_iteration": 3.030062198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065779, + "balance_loss_mlp": 1.03788459, + "epoch": 0.8945748364755676, + "flos": 478782282240.0, + "grad_norm": 0.06268111355606142, + "language_loss": 0.84919488, + "learning_rate": 2.8865003062692517e-05, + "loss": 0.85985267, + "num_input_tokens_seen": 385057376, + "router_z_loss_mlp": 0.27929688, + "step": 4650, + "time_per_iteration": 2.59285569190979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068569, + "balance_loss_mlp": 1.04065061, + "epoch": 0.894767218160831, + "flos": 508507573248.0, + "grad_norm": 0.07694794065746953, + "language_loss": 0.82904601, + "learning_rate": 2.876077330953042e-05, + "loss": 0.83973163, + "num_input_tokens_seen": 385130880, + "router_z_loss_mlp": 0.27929688, + "step": 4651, + "time_per_iteration": 2.6908295154571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070543, + "balance_loss_mlp": 1.04274344, + "epoch": 0.8949595998460946, + "flos": 685557181440.0, + "grad_norm": 0.05647102417455385, + "language_loss": 0.81656528, + "learning_rate": 2.8656726507661378e-05, + "loss": 0.82727075, + "num_input_tokens_seen": 385205808, + "router_z_loss_mlp": 0.27807617, + "step": 4652, + "time_per_iteration": 2.8482308387756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066242, + "balance_loss_mlp": 1.03751302, + "epoch": 0.8951519815313582, + "flos": 799578349056.0, + "grad_norm": 0.057302160059149884, + "language_loss": 0.77321589, + "learning_rate": 2.855286269747981e-05, + "loss": 0.78387833, + "num_input_tokens_seen": 385283616, + "router_z_loss_mlp": 0.28735352, + "step": 4653, + "time_per_iteration": 3.002678632736206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066194, + "balance_loss_mlp": 1.0369159, + "epoch": 0.8953443632166218, + "flos": 666443068416.0, + "grad_norm": 0.059263332900696505, + "language_loss": 0.86105883, + "learning_rate": 2.8449181919309398e-05, + "loss": 0.87172079, + "num_input_tokens_seen": 385357488, + "router_z_loss_mlp": 0.29272461, + "step": 4654, + "time_per_iteration": 2.810746908187866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062957, + "balance_loss_mlp": 1.0353719, + "epoch": 0.8955367449018854, + "flos": 644670683136.0, + "grad_norm": 0.05592703355481017, + "language_loss": 0.83190131, + "learning_rate": 2.8345684213402556e-05, + "loss": 0.84253091, + "num_input_tokens_seen": 385431280, + "router_z_loss_mlp": 0.27636719, + "step": 4655, + "time_per_iteration": 2.8701395988464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067585, + "balance_loss_mlp": 1.03911805, + "epoch": 0.8957291265871489, + "flos": 808353326592.0, + "grad_norm": 0.06040680854300063, + "language_loss": 0.77388299, + "learning_rate": 2.8242369619940644e-05, + "loss": 0.78455889, + "num_input_tokens_seen": 385509840, + "router_z_loss_mlp": 0.28442383, + "step": 4656, + "time_per_iteration": 3.0514414310455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066606, + "balance_loss_mlp": 1.0381391, + "epoch": 0.8959215082724125, + "flos": 518664826368.0, + "grad_norm": 0.05687998716555397, + "language_loss": 0.76916766, + "learning_rate": 2.813923817903391e-05, + "loss": 0.77983367, + "num_input_tokens_seen": 385580384, + "router_z_loss_mlp": 0.28515625, + "step": 4657, + "time_per_iteration": 2.6414825916290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067184, + "balance_loss_mlp": 1.03921711, + "epoch": 0.896113889957676, + "flos": 476669657088.0, + "grad_norm": 0.0528545629927777, + "language_loss": 0.77033144, + "learning_rate": 2.8036289930721603e-05, + "loss": 0.78100324, + "num_input_tokens_seen": 385649184, + "router_z_loss_mlp": 0.28027344, + "step": 4658, + "time_per_iteration": 2.6311020851135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067219, + "balance_loss_mlp": 1.03827536, + "epoch": 0.8963062716429396, + "flos": 517911155712.0, + "grad_norm": 0.05569810882559681, + "language_loss": 0.83101171, + "learning_rate": 2.7933524914971697e-05, + "loss": 0.84168386, + "num_input_tokens_seen": 385717072, + "router_z_loss_mlp": 0.28955078, + "step": 4659, + "time_per_iteration": 2.645664930343628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065748, + "balance_loss_mlp": 1.03768659, + "epoch": 0.8964986533282031, + "flos": 508231148544.0, + "grad_norm": 0.06041289923786583, + "language_loss": 0.8144539, + "learning_rate": 2.7830943171681113e-05, + "loss": 0.82511139, + "num_input_tokens_seen": 385788880, + "router_z_loss_mlp": 0.28076172, + "step": 4660, + "time_per_iteration": 2.699507713317871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066914, + "balance_loss_mlp": 1.03849435, + "epoch": 0.8966910350134667, + "flos": 535819083264.0, + "grad_norm": 0.0649780627361528, + "language_loss": 0.80980611, + "learning_rate": 2.77285447406756e-05, + "loss": 0.82047522, + "num_input_tokens_seen": 385854240, + "router_z_loss_mlp": 0.28417969, + "step": 4661, + "time_per_iteration": 2.6589531898498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066329, + "balance_loss_mlp": 1.03771877, + "epoch": 0.8968834166987303, + "flos": 722909491200.0, + "grad_norm": 0.05835442407396343, + "language_loss": 0.84337735, + "learning_rate": 2.7626329661709914e-05, + "loss": 0.85404074, + "num_input_tokens_seen": 385926080, + "router_z_loss_mlp": 0.28588867, + "step": 4662, + "time_per_iteration": 2.8895277976989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064424, + "balance_loss_mlp": 1.03702998, + "epoch": 0.8970757983839939, + "flos": 681372628992.0, + "grad_norm": 0.049817716882638224, + "language_loss": 0.83679664, + "learning_rate": 2.7524297974467372e-05, + "loss": 0.84744084, + "num_input_tokens_seen": 386005696, + "router_z_loss_mlp": 0.27392578, + "step": 4663, + "time_per_iteration": 2.90505313873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065764, + "balance_loss_mlp": 1.03763032, + "epoch": 0.8972681800692575, + "flos": 612758573568.0, + "grad_norm": 0.07386226147596868, + "language_loss": 0.75563216, + "learning_rate": 2.742244971856006e-05, + "loss": 0.76628977, + "num_input_tokens_seen": 386073248, + "router_z_loss_mlp": 0.28173828, + "step": 4664, + "time_per_iteration": 2.7474899291992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106565, + "balance_loss_mlp": 1.03689647, + "epoch": 0.8974605617545209, + "flos": 572064132096.0, + "grad_norm": 0.05719601329646282, + "language_loss": 0.8326844, + "learning_rate": 2.732078493352913e-05, + "loss": 0.84334087, + "num_input_tokens_seen": 386148528, + "router_z_loss_mlp": 0.28735352, + "step": 4665, + "time_per_iteration": 2.728703737258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064005, + "balance_loss_mlp": 1.03525186, + "epoch": 0.8976529434397845, + "flos": 520147436544.0, + "grad_norm": 0.055681345294375295, + "language_loss": 0.87152803, + "learning_rate": 2.721930365884434e-05, + "loss": 0.88216805, + "num_input_tokens_seen": 386218528, + "router_z_loss_mlp": 0.28735352, + "step": 4666, + "time_per_iteration": 2.663864850997925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066696, + "balance_loss_mlp": 1.03911066, + "epoch": 0.8978453251250481, + "flos": 471124740096.0, + "grad_norm": 0.08211330217280415, + "language_loss": 0.82403785, + "learning_rate": 2.7118005933904176e-05, + "loss": 0.83470482, + "num_input_tokens_seen": 386284704, + "router_z_loss_mlp": 0.27612305, + "step": 4667, + "time_per_iteration": 2.6915175914764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068319, + "balance_loss_mlp": 1.04042363, + "epoch": 0.8980377068103117, + "flos": 591370301952.0, + "grad_norm": 0.11004700264832698, + "language_loss": 0.81857389, + "learning_rate": 2.7016891798035904e-05, + "loss": 0.82925701, + "num_input_tokens_seen": 386356128, + "router_z_loss_mlp": 0.27929688, + "step": 4668, + "time_per_iteration": 2.777339458465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069369, + "balance_loss_mlp": 1.04102135, + "epoch": 0.8982300884955752, + "flos": 767287918080.0, + "grad_norm": 0.05198746071964672, + "language_loss": 0.82804859, + "learning_rate": 2.691596129049556e-05, + "loss": 0.83874226, + "num_input_tokens_seen": 386434048, + "router_z_loss_mlp": 0.28344727, + "step": 4669, + "time_per_iteration": 2.9581100940704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068748, + "balance_loss_mlp": 1.040519, + "epoch": 0.8984224701808388, + "flos": 844189530624.0, + "grad_norm": 0.06146517202916762, + "language_loss": 0.77403522, + "learning_rate": 2.681521445046775e-05, + "loss": 0.78472269, + "num_input_tokens_seen": 386532384, + "router_z_loss_mlp": 0.2824707, + "step": 4670, + "time_per_iteration": 3.214451789855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106956, + "balance_loss_mlp": 1.04204607, + "epoch": 0.8986148518661023, + "flos": 757303782912.0, + "grad_norm": 0.05628437855404085, + "language_loss": 0.76025915, + "learning_rate": 2.6714651317065963e-05, + "loss": 0.77095473, + "num_input_tokens_seen": 386627120, + "router_z_loss_mlp": 0.27539062, + "step": 4671, + "time_per_iteration": 3.131769895553589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064602, + "balance_loss_mlp": 1.03580141, + "epoch": 0.8988072335513659, + "flos": 562801734144.0, + "grad_norm": 0.05509278789905922, + "language_loss": 0.76818681, + "learning_rate": 2.6614271929332133e-05, + "loss": 0.77883279, + "num_input_tokens_seen": 386700192, + "router_z_loss_mlp": 0.28808594, + "step": 4672, + "time_per_iteration": 2.6790685653686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066065, + "balance_loss_mlp": 1.03874218, + "epoch": 0.8989996152366295, + "flos": 492440228352.0, + "grad_norm": 0.05781833096517719, + "language_loss": 0.86723161, + "learning_rate": 2.6514076326237147e-05, + "loss": 0.87789226, + "num_input_tokens_seen": 386764256, + "router_z_loss_mlp": 0.2734375, + "step": 4673, + "time_per_iteration": 2.5404884815216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066669, + "balance_loss_mlp": 1.03865457, + "epoch": 0.899191996921893, + "flos": 542303935488.0, + "grad_norm": 0.0639009848289485, + "language_loss": 0.75673521, + "learning_rate": 2.6414064546680438e-05, + "loss": 0.76740181, + "num_input_tokens_seen": 386835792, + "router_z_loss_mlp": 0.28027344, + "step": 4674, + "time_per_iteration": 2.6745707988739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066722, + "balance_loss_mlp": 1.03916073, + "epoch": 0.8993843786071566, + "flos": 471081070080.0, + "grad_norm": 0.05958404754424956, + "language_loss": 0.79837209, + "learning_rate": 2.631423662948984e-05, + "loss": 0.80903935, + "num_input_tokens_seen": 386904368, + "router_z_loss_mlp": 0.27612305, + "step": 4675, + "time_per_iteration": 2.5648069381713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062537, + "balance_loss_mlp": 1.03449929, + "epoch": 0.8995767602924202, + "flos": 526454788608.0, + "grad_norm": 0.058069364250127556, + "language_loss": 0.82527149, + "learning_rate": 2.621459261342196e-05, + "loss": 0.83589685, + "num_input_tokens_seen": 386977872, + "router_z_loss_mlp": 0.28051758, + "step": 4676, + "time_per_iteration": 2.7322497367858887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065223, + "balance_loss_mlp": 1.0367316, + "epoch": 0.8997691419776838, + "flos": 557365916160.0, + "grad_norm": 0.05633383072499603, + "language_loss": 0.84505248, + "learning_rate": 2.6115132537162245e-05, + "loss": 0.85570467, + "num_input_tokens_seen": 387052080, + "router_z_loss_mlp": 0.28491211, + "step": 4677, + "time_per_iteration": 2.6816530227661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069952, + "balance_loss_mlp": 1.04186583, + "epoch": 0.8999615236629472, + "flos": 638722713600.0, + "grad_norm": 0.058231914931515895, + "language_loss": 0.80479538, + "learning_rate": 2.601585643932436e-05, + "loss": 0.81549489, + "num_input_tokens_seen": 387129712, + "router_z_loss_mlp": 0.28076172, + "step": 4678, + "time_per_iteration": 2.8522558212280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014105, + "balance_loss_mlp": 1.002756, + "epoch": 0.9001539053482108, + "flos": 1430743703040.0, + "grad_norm": 0.00862832057213614, + "language_loss": 0.85784018, + "learning_rate": 2.5916764358450862e-05, + "loss": 0.86798131, + "num_input_tokens_seen": 387356560, + "router_z_loss_mlp": 0.11328125, + "step": 4679, + "time_per_iteration": 4.799229860305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064336, + "balance_loss_mlp": 1.0357976, + "epoch": 0.9003462870334744, + "flos": 566589026304.0, + "grad_norm": 0.0588723941053944, + "language_loss": 0.80009788, + "learning_rate": 2.5817856333012425e-05, + "loss": 0.81074125, + "num_input_tokens_seen": 387438640, + "router_z_loss_mlp": 0.28564453, + "step": 4680, + "time_per_iteration": 2.879063606262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064946, + "balance_loss_mlp": 1.03712296, + "epoch": 0.900538668718738, + "flos": 538394397696.0, + "grad_norm": 0.0697908395177343, + "language_loss": 0.7863133, + "learning_rate": 2.5719132401408883e-05, + "loss": 0.79696274, + "num_input_tokens_seen": 387507088, + "router_z_loss_mlp": 0.27832031, + "step": 4681, + "time_per_iteration": 2.651343584060669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066763, + "balance_loss_mlp": 1.03867733, + "epoch": 0.9007310504040016, + "flos": 488146576896.0, + "grad_norm": 0.06354903246037491, + "language_loss": 0.8607623, + "learning_rate": 2.5620592601968028e-05, + "loss": 0.87142992, + "num_input_tokens_seen": 387574160, + "router_z_loss_mlp": 0.28076172, + "step": 4682, + "time_per_iteration": 2.546644449234009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064911, + "balance_loss_mlp": 1.0366112, + "epoch": 0.9009234320892651, + "flos": 652593065472.0, + "grad_norm": 0.06043677066691621, + "language_loss": 0.78744268, + "learning_rate": 2.5522236972946532e-05, + "loss": 0.79809177, + "num_input_tokens_seen": 387652528, + "router_z_loss_mlp": 0.28320312, + "step": 4683, + "time_per_iteration": 2.8712375164031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068774, + "balance_loss_mlp": 1.04054499, + "epoch": 0.9011158137745287, + "flos": 545302651392.0, + "grad_norm": 0.05161746499741545, + "language_loss": 0.85312754, + "learning_rate": 2.5424065552529295e-05, + "loss": 0.86381531, + "num_input_tokens_seen": 387723520, + "router_z_loss_mlp": 0.2824707, + "step": 4684, + "time_per_iteration": 2.6262335777282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106342, + "balance_loss_mlp": 1.03588235, + "epoch": 0.9013081954597922, + "flos": 559429079040.0, + "grad_norm": 0.06544642746870727, + "language_loss": 0.82523555, + "learning_rate": 2.532607837883011e-05, + "loss": 0.83586979, + "num_input_tokens_seen": 387793664, + "router_z_loss_mlp": 0.27563477, + "step": 4685, + "time_per_iteration": 2.6898350715637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066665, + "balance_loss_mlp": 1.03848374, + "epoch": 0.9015005771450558, + "flos": 728330752512.0, + "grad_norm": 0.04796674200603937, + "language_loss": 0.8107928, + "learning_rate": 2.5228275489890706e-05, + "loss": 0.82145953, + "num_input_tokens_seen": 387871008, + "router_z_loss_mlp": 0.28173828, + "step": 4686, + "time_per_iteration": 2.9521684646606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069741, + "balance_loss_mlp": 1.04172671, + "epoch": 0.9016929588303193, + "flos": 517148720640.0, + "grad_norm": 0.05256226767629222, + "language_loss": 0.8077606, + "learning_rate": 2.5130656923681605e-05, + "loss": 0.81845802, + "num_input_tokens_seen": 387950832, + "router_z_loss_mlp": 0.2800293, + "step": 4687, + "time_per_iteration": 2.84675669670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067737, + "balance_loss_mlp": 1.04000878, + "epoch": 0.9018853405155829, + "flos": 622031145984.0, + "grad_norm": 0.04949583001041346, + "language_loss": 0.8596499, + "learning_rate": 2.503322271810171e-05, + "loss": 0.87032723, + "num_input_tokens_seen": 388029792, + "router_z_loss_mlp": 0.27734375, + "step": 4688, + "time_per_iteration": 2.883434534072876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063716, + "balance_loss_mlp": 1.03551149, + "epoch": 0.9020777222008465, + "flos": 523022496768.0, + "grad_norm": 0.05482141018442068, + "language_loss": 0.77574694, + "learning_rate": 2.4935972910978378e-05, + "loss": 0.78638411, + "num_input_tokens_seen": 388095872, + "router_z_loss_mlp": 0.28198242, + "step": 4689, + "time_per_iteration": 2.601212739944458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063021, + "balance_loss_mlp": 1.03581715, + "epoch": 0.9022701038861101, + "flos": 633419315712.0, + "grad_norm": 0.04920852715445459, + "language_loss": 0.81768286, + "learning_rate": 2.4838907540067346e-05, + "loss": 0.82831311, + "num_input_tokens_seen": 388171632, + "router_z_loss_mlp": 0.27270508, + "step": 4690, + "time_per_iteration": 2.818192481994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067502, + "balance_loss_mlp": 1.03984523, + "epoch": 0.9024624855713737, + "flos": 513036951552.0, + "grad_norm": 0.055226262822308456, + "language_loss": 0.84412956, + "learning_rate": 2.474202664305253e-05, + "loss": 0.85480458, + "num_input_tokens_seen": 388242240, + "router_z_loss_mlp": 0.27685547, + "step": 4691, + "time_per_iteration": 2.6131467819213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069251, + "balance_loss_mlp": 1.04195166, + "epoch": 0.9026548672566371, + "flos": 477152695296.0, + "grad_norm": 0.05811897986593017, + "language_loss": 0.86162984, + "learning_rate": 2.464533025754673e-05, + "loss": 0.87232238, + "num_input_tokens_seen": 388310960, + "router_z_loss_mlp": 0.27368164, + "step": 4692, + "time_per_iteration": 2.6586062908172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106917, + "balance_loss_mlp": 1.04120314, + "epoch": 0.9028472489419007, + "flos": 661701284352.0, + "grad_norm": 0.1426451694737163, + "language_loss": 0.73884237, + "learning_rate": 2.454881842109058e-05, + "loss": 0.74953413, + "num_input_tokens_seen": 388387280, + "router_z_loss_mlp": 0.28027344, + "step": 4693, + "time_per_iteration": 2.838524580001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067774, + "balance_loss_mlp": 1.0398314, + "epoch": 0.9030396306271643, + "flos": 534332090880.0, + "grad_norm": 0.05783209602584723, + "language_loss": 0.81908751, + "learning_rate": 2.4452491171153445e-05, + "loss": 0.8297652, + "num_input_tokens_seen": 388456992, + "router_z_loss_mlp": 0.27978516, + "step": 4694, + "time_per_iteration": 2.674063205718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070492, + "balance_loss_mlp": 1.04264498, + "epoch": 0.9032320123124279, + "flos": 800695784448.0, + "grad_norm": 0.05843241181066569, + "language_loss": 0.82359844, + "learning_rate": 2.43563485451328e-05, + "loss": 0.83430338, + "num_input_tokens_seen": 388534896, + "router_z_loss_mlp": 0.27856445, + "step": 4695, + "time_per_iteration": 2.9802277088165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067562, + "balance_loss_mlp": 1.03997672, + "epoch": 0.9034243939976914, + "flos": 553673166336.0, + "grad_norm": 0.0750205070767636, + "language_loss": 0.76441383, + "learning_rate": 2.426039058035451e-05, + "loss": 0.77508944, + "num_input_tokens_seen": 388606640, + "router_z_loss_mlp": 0.27636719, + "step": 4696, + "time_per_iteration": 2.6411380767822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069995, + "balance_loss_mlp": 1.0430541, + "epoch": 0.903616775682955, + "flos": 503656690176.0, + "grad_norm": 0.05696319477889627, + "language_loss": 0.82816821, + "learning_rate": 2.4164617314072823e-05, + "loss": 0.83886814, + "num_input_tokens_seen": 388675920, + "router_z_loss_mlp": 0.26977539, + "step": 4697, + "time_per_iteration": 2.603687286376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070428, + "balance_loss_mlp": 1.04215193, + "epoch": 0.9038091573682185, + "flos": 436058173440.0, + "grad_norm": 0.05485008828996457, + "language_loss": 0.78603637, + "learning_rate": 2.406902878347017e-05, + "loss": 0.79674065, + "num_input_tokens_seen": 388743968, + "router_z_loss_mlp": 0.28295898, + "step": 4698, + "time_per_iteration": 2.638568162918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067091, + "balance_loss_mlp": 1.03898168, + "epoch": 0.9040015390534821, + "flos": 532648659456.0, + "grad_norm": 0.06473187414525833, + "language_loss": 0.81159961, + "learning_rate": 2.3973625025657253e-05, + "loss": 0.82227051, + "num_input_tokens_seen": 388810784, + "router_z_loss_mlp": 0.28125, + "step": 4699, + "time_per_iteration": 2.6460814476013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062513, + "balance_loss_mlp": 1.034904, + "epoch": 0.9041939207387457, + "flos": 564028268544.0, + "grad_norm": 0.06800466298182273, + "language_loss": 0.80023026, + "learning_rate": 2.3878406077673275e-05, + "loss": 0.81085545, + "num_input_tokens_seen": 388885072, + "router_z_loss_mlp": 0.27661133, + "step": 4700, + "time_per_iteration": 2.7773025035858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071491, + "balance_loss_mlp": 1.04259431, + "epoch": 0.9043863024240092, + "flos": 515257265664.0, + "grad_norm": 0.06969733527966859, + "language_loss": 0.77433765, + "learning_rate": 2.3783371976485447e-05, + "loss": 0.78505254, + "num_input_tokens_seen": 388951184, + "router_z_loss_mlp": 0.2890625, + "step": 4701, + "time_per_iteration": 2.6053738594055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017578, + "balance_loss_mlp": 1.00627708, + "epoch": 0.9045786841092728, + "flos": 1277243043840.0, + "grad_norm": 0.006832227810148578, + "language_loss": 0.72929788, + "learning_rate": 2.368852275898914e-05, + "loss": 0.73947364, + "num_input_tokens_seen": 389170752, + "router_z_loss_mlp": 0.11279297, + "step": 4702, + "time_per_iteration": 4.971631288528442 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068474, + "balance_loss_mlp": 1.04062688, + "epoch": 0.9047710657945364, + "flos": 585569309184.0, + "grad_norm": 0.06435638379504488, + "language_loss": 0.82813382, + "learning_rate": 2.3593858462008178e-05, + "loss": 0.83881855, + "num_input_tokens_seen": 389239600, + "router_z_loss_mlp": 0.27856445, + "step": 4703, + "time_per_iteration": 2.6877286434173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065706, + "balance_loss_mlp": 1.03797805, + "epoch": 0.9049634474798, + "flos": 571655287296.0, + "grad_norm": 0.0636995704600701, + "language_loss": 0.79728121, + "learning_rate": 2.3499379122294495e-05, + "loss": 0.80793828, + "num_input_tokens_seen": 389316032, + "router_z_loss_mlp": 0.27758789, + "step": 4704, + "time_per_iteration": 2.728874444961548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066082, + "balance_loss_mlp": 1.0389502, + "epoch": 0.9051558291650635, + "flos": 572353703424.0, + "grad_norm": 0.07413050035901024, + "language_loss": 0.74390441, + "learning_rate": 2.3405084776528307e-05, + "loss": 0.75456524, + "num_input_tokens_seen": 389383504, + "router_z_loss_mlp": 0.27172852, + "step": 4705, + "time_per_iteration": 2.6595373153686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069932, + "balance_loss_mlp": 1.04249048, + "epoch": 0.905348210850327, + "flos": 540280060416.0, + "grad_norm": 0.06136999404791905, + "language_loss": 0.7903558, + "learning_rate": 2.331097546131783e-05, + "loss": 0.80105507, + "num_input_tokens_seen": 389454592, + "router_z_loss_mlp": 0.2746582, + "step": 4706, + "time_per_iteration": 2.6509690284729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072106, + "balance_loss_mlp": 1.04435396, + "epoch": 0.9055405925355906, + "flos": 516128799744.0, + "grad_norm": 0.06082277115431439, + "language_loss": 0.81760788, + "learning_rate": 2.321705121319956e-05, + "loss": 0.82832897, + "num_input_tokens_seen": 389519696, + "router_z_loss_mlp": 0.27758789, + "step": 4707, + "time_per_iteration": 2.5796375274658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068736, + "balance_loss_mlp": 1.04031706, + "epoch": 0.9057329742208542, + "flos": 914249880576.0, + "grad_norm": 0.40882184585938774, + "language_loss": 0.84702176, + "learning_rate": 2.3123312068638104e-05, + "loss": 0.85770917, + "num_input_tokens_seen": 389603568, + "router_z_loss_mlp": 0.28393555, + "step": 4708, + "time_per_iteration": 3.1743359565734863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065478, + "balance_loss_mlp": 1.03798819, + "epoch": 0.9059253559061178, + "flos": 904884175872.0, + "grad_norm": 0.056239877647307326, + "language_loss": 0.82753253, + "learning_rate": 2.3029758064026295e-05, + "loss": 0.83818728, + "num_input_tokens_seen": 389687504, + "router_z_loss_mlp": 0.27490234, + "step": 4709, + "time_per_iteration": 3.1511998176574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106885, + "balance_loss_mlp": 1.04059744, + "epoch": 0.9061177375913813, + "flos": 664218372096.0, + "grad_norm": 0.060791344660506334, + "language_loss": 0.77237535, + "learning_rate": 2.2936389235684918e-05, + "loss": 0.78306377, + "num_input_tokens_seen": 389764880, + "router_z_loss_mlp": 0.28222656, + "step": 4710, + "time_per_iteration": 2.859652519226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069474, + "balance_loss_mlp": 1.04150796, + "epoch": 0.9063101192766448, + "flos": 565318821888.0, + "grad_norm": 0.057932581818472106, + "language_loss": 0.82433301, + "learning_rate": 2.2843205619862972e-05, + "loss": 0.83502775, + "num_input_tokens_seen": 389838304, + "router_z_loss_mlp": 0.2800293, + "step": 4711, + "time_per_iteration": 2.746431589126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065463, + "balance_loss_mlp": 1.03825986, + "epoch": 0.9065025009619084, + "flos": 727064930304.0, + "grad_norm": 0.05998205753786282, + "language_loss": 0.78935313, + "learning_rate": 2.2750207252737742e-05, + "loss": 0.80000776, + "num_input_tokens_seen": 389908592, + "router_z_loss_mlp": 0.27246094, + "step": 4712, + "time_per_iteration": 2.9277284145355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066785, + "balance_loss_mlp": 1.03896201, + "epoch": 0.906694882647172, + "flos": 531254799360.0, + "grad_norm": 0.06374980878280882, + "language_loss": 0.80104047, + "learning_rate": 2.265739417041418e-05, + "loss": 0.81170833, + "num_input_tokens_seen": 389979040, + "router_z_loss_mlp": 0.27856445, + "step": 4713, + "time_per_iteration": 2.678513765335083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066642, + "balance_loss_mlp": 1.03865206, + "epoch": 0.9068872643324356, + "flos": 429563146752.0, + "grad_norm": 0.06604765219045201, + "language_loss": 0.85026371, + "learning_rate": 2.2564766408925574e-05, + "loss": 0.86093009, + "num_input_tokens_seen": 390046080, + "router_z_loss_mlp": 0.27978516, + "step": 4714, + "time_per_iteration": 2.612898349761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070435, + "balance_loss_mlp": 1.04153872, + "epoch": 0.9070796460176991, + "flos": 588095161344.0, + "grad_norm": 0.06185083154796718, + "language_loss": 0.79640901, + "learning_rate": 2.2472324004233214e-05, + "loss": 0.80711341, + "num_input_tokens_seen": 390122176, + "router_z_loss_mlp": 0.28857422, + "step": 4715, + "time_per_iteration": 2.751021146774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071678, + "balance_loss_mlp": 1.04323435, + "epoch": 0.9072720277029627, + "flos": 571314843648.0, + "grad_norm": 0.0598768727197482, + "language_loss": 0.7539562, + "learning_rate": 2.2380066992226446e-05, + "loss": 0.76467299, + "num_input_tokens_seen": 390195216, + "router_z_loss_mlp": 0.28417969, + "step": 4716, + "time_per_iteration": 2.694836378097534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068538, + "balance_loss_mlp": 1.04131055, + "epoch": 0.9074644093882263, + "flos": 555534097920.0, + "grad_norm": 0.057382736808796596, + "language_loss": 0.88150144, + "learning_rate": 2.2287995408722617e-05, + "loss": 0.89218676, + "num_input_tokens_seen": 390263216, + "router_z_loss_mlp": 0.27270508, + "step": 4717, + "time_per_iteration": 2.6780333518981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065963, + "balance_loss_mlp": 1.03773427, + "epoch": 0.9076567910734898, + "flos": 640701508608.0, + "grad_norm": 0.05537146753326694, + "language_loss": 0.82323325, + "learning_rate": 2.2196109289467083e-05, + "loss": 0.83389294, + "num_input_tokens_seen": 390337360, + "router_z_loss_mlp": 0.2824707, + "step": 4718, + "time_per_iteration": 2.8005425930023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071332, + "balance_loss_mlp": 1.04367542, + "epoch": 0.9078491727587533, + "flos": 733635560448.0, + "grad_norm": 0.05081373139618053, + "language_loss": 0.81615859, + "learning_rate": 2.2104408670133193e-05, + "loss": 0.82687193, + "num_input_tokens_seen": 390427728, + "router_z_loss_mlp": 0.27709961, + "step": 4719, + "time_per_iteration": 3.0667171478271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070774, + "balance_loss_mlp": 1.04249716, + "epoch": 0.9080415544440169, + "flos": 654464171520.0, + "grad_norm": 0.05232334449211869, + "language_loss": 0.86633104, + "learning_rate": 2.2012893586322245e-05, + "loss": 0.87703872, + "num_input_tokens_seen": 390504736, + "router_z_loss_mlp": 0.28295898, + "step": 4720, + "time_per_iteration": 2.834167003631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066772, + "balance_loss_mlp": 1.03842413, + "epoch": 0.9082339361292805, + "flos": 597180059136.0, + "grad_norm": 0.051475992936928554, + "language_loss": 0.7933374, + "learning_rate": 2.1921564073563604e-05, + "loss": 0.80400515, + "num_input_tokens_seen": 390582048, + "router_z_loss_mlp": 0.28344727, + "step": 4721, + "time_per_iteration": 2.7342042922973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107087, + "balance_loss_mlp": 1.0426892, + "epoch": 0.9084263178145441, + "flos": 504154285056.0, + "grad_norm": 0.05504270564610462, + "language_loss": 0.8449378, + "learning_rate": 2.183042016731457e-05, + "loss": 0.85564649, + "num_input_tokens_seen": 390652976, + "router_z_loss_mlp": 0.28198242, + "step": 4722, + "time_per_iteration": 2.6053569316864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065295, + "balance_loss_mlp": 1.03692365, + "epoch": 0.9086186994998077, + "flos": 549763628544.0, + "grad_norm": 0.05937577628275322, + "language_loss": 0.8047967, + "learning_rate": 2.1739461902960223e-05, + "loss": 0.8154496, + "num_input_tokens_seen": 390726832, + "router_z_loss_mlp": 0.28393555, + "step": 4723, + "time_per_iteration": 2.7014620304107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106455, + "balance_loss_mlp": 1.03710771, + "epoch": 0.9088110811850711, + "flos": 1133620545024.0, + "grad_norm": 0.05397670601774565, + "language_loss": 0.7509287, + "learning_rate": 2.1648689315813763e-05, + "loss": 0.76157427, + "num_input_tokens_seen": 390824480, + "router_z_loss_mlp": 0.27490234, + "step": 4724, + "time_per_iteration": 3.5497186183929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067508, + "balance_loss_mlp": 1.03966045, + "epoch": 0.9090034628703347, + "flos": 556725726720.0, + "grad_norm": 0.06845603595782776, + "language_loss": 0.77022469, + "learning_rate": 2.155810244111628e-05, + "loss": 0.78089976, + "num_input_tokens_seen": 390897552, + "router_z_loss_mlp": 0.27856445, + "step": 4725, + "time_per_iteration": 2.6870620250701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066604, + "balance_loss_mlp": 1.03894758, + "epoch": 0.9091958445555983, + "flos": 543697795584.0, + "grad_norm": 0.06585038795867323, + "language_loss": 0.84378177, + "learning_rate": 2.146770131403658e-05, + "loss": 0.85444778, + "num_input_tokens_seen": 390969008, + "router_z_loss_mlp": 0.27709961, + "step": 4726, + "time_per_iteration": 2.6953237056732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065669, + "balance_loss_mlp": 1.0382266, + "epoch": 0.9093882262408619, + "flos": 525858269184.0, + "grad_norm": 0.06459966077589527, + "language_loss": 0.8105191, + "learning_rate": 2.1377485969671594e-05, + "loss": 0.82117581, + "num_input_tokens_seen": 391038880, + "router_z_loss_mlp": 0.27490234, + "step": 4727, + "time_per_iteration": 2.6618425846099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066073, + "balance_loss_mlp": 1.03808236, + "epoch": 0.9095806079261254, + "flos": 548266461696.0, + "grad_norm": 0.06317641405801941, + "language_loss": 0.81712091, + "learning_rate": 2.1287456443046084e-05, + "loss": 0.82778162, + "num_input_tokens_seen": 391106720, + "router_z_loss_mlp": 0.28027344, + "step": 4728, + "time_per_iteration": 2.6596298217773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065232, + "balance_loss_mlp": 1.03681278, + "epoch": 0.909772989611389, + "flos": 572260571136.0, + "grad_norm": 0.058766129071798213, + "language_loss": 0.84501958, + "learning_rate": 2.1197612769112528e-05, + "loss": 0.85567194, + "num_input_tokens_seen": 391178128, + "router_z_loss_mlp": 0.28417969, + "step": 4729, + "time_per_iteration": 2.7634377479553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064416, + "balance_loss_mlp": 1.03675914, + "epoch": 0.9099653712966526, + "flos": 561546086400.0, + "grad_norm": 0.07496188001150708, + "language_loss": 0.79495102, + "learning_rate": 2.1107954982751254e-05, + "loss": 0.80559516, + "num_input_tokens_seen": 391248848, + "router_z_loss_mlp": 0.27685547, + "step": 4730, + "time_per_iteration": 2.6741318702697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068767, + "balance_loss_mlp": 1.04056227, + "epoch": 0.9101577529819161, + "flos": 1093377208320.0, + "grad_norm": 0.06415098680348416, + "language_loss": 0.80079752, + "learning_rate": 2.101848311877069e-05, + "loss": 0.81148523, + "num_input_tokens_seen": 391328000, + "router_z_loss_mlp": 0.28186035, + "step": 4731, + "time_per_iteration": 3.351849317550659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067585, + "balance_loss_mlp": 1.03904653, + "epoch": 0.9103501346671797, + "flos": 445215854592.0, + "grad_norm": 0.062092517001545014, + "language_loss": 0.81994462, + "learning_rate": 2.092919721190678e-05, + "loss": 0.83062047, + "num_input_tokens_seen": 391391616, + "router_z_loss_mlp": 0.28491211, + "step": 4732, + "time_per_iteration": 2.600543737411499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068188, + "balance_loss_mlp": 1.03950608, + "epoch": 0.9105425163524432, + "flos": 500510997504.0, + "grad_norm": 0.06287463201438012, + "language_loss": 0.77314079, + "learning_rate": 2.0840097296823346e-05, + "loss": 0.78382266, + "num_input_tokens_seen": 391461312, + "router_z_loss_mlp": 0.28662109, + "step": 4733, + "time_per_iteration": 2.6176583766937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065533, + "balance_loss_mlp": 1.03747129, + "epoch": 0.9107348980377068, + "flos": 657206811648.0, + "grad_norm": 0.06599891093057128, + "language_loss": 0.83865237, + "learning_rate": 2.0751183408112162e-05, + "loss": 0.84930772, + "num_input_tokens_seen": 391542192, + "router_z_loss_mlp": 0.28076172, + "step": 4734, + "time_per_iteration": 2.8651516437530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106646, + "balance_loss_mlp": 1.03863621, + "epoch": 0.9109272797229704, + "flos": 553406916096.0, + "grad_norm": 0.07392297365567703, + "language_loss": 0.84923166, + "learning_rate": 2.066245558029256e-05, + "loss": 0.85989624, + "num_input_tokens_seen": 391609968, + "router_z_loss_mlp": 0.27856445, + "step": 4735, + "time_per_iteration": 2.628058433532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068003, + "balance_loss_mlp": 1.04001248, + "epoch": 0.911119661408234, + "flos": 518757958656.0, + "grad_norm": 0.06239826153412266, + "language_loss": 0.84246588, + "learning_rate": 2.057391384781182e-05, + "loss": 0.85314584, + "num_input_tokens_seen": 391681264, + "router_z_loss_mlp": 0.2800293, + "step": 4736, + "time_per_iteration": 2.6526265144348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066493, + "balance_loss_mlp": 1.03790689, + "epoch": 0.9113120430934974, + "flos": 554111124480.0, + "grad_norm": 0.05558966408971301, + "language_loss": 0.83016825, + "learning_rate": 2.0485558245044834e-05, + "loss": 0.84083319, + "num_input_tokens_seen": 391751392, + "router_z_loss_mlp": 0.28564453, + "step": 4737, + "time_per_iteration": 2.6624600887298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064489, + "balance_loss_mlp": 1.03611708, + "epoch": 0.911504424778761, + "flos": 501624050688.0, + "grad_norm": 0.06145383290776928, + "language_loss": 0.8102991, + "learning_rate": 2.0397388806294216e-05, + "loss": 0.82094395, + "num_input_tokens_seen": 391823952, + "router_z_loss_mlp": 0.28369141, + "step": 4738, + "time_per_iteration": 2.7522430419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072434, + "balance_loss_mlp": 1.04391873, + "epoch": 0.9116968064640246, + "flos": 610823448576.0, + "grad_norm": 0.052355603259844785, + "language_loss": 0.82169437, + "learning_rate": 2.0309405565790527e-05, + "loss": 0.8324188, + "num_input_tokens_seen": 391895264, + "router_z_loss_mlp": 0.28515625, + "step": 4739, + "time_per_iteration": 2.7757930755615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068519, + "balance_loss_mlp": 1.04029012, + "epoch": 0.9118891881492882, + "flos": 572625745920.0, + "grad_norm": 0.06553339401592277, + "language_loss": 0.82452631, + "learning_rate": 2.0221608557691895e-05, + "loss": 0.83521152, + "num_input_tokens_seen": 391973040, + "router_z_loss_mlp": 0.28222656, + "step": 4740, + "time_per_iteration": 2.8027913570404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064254, + "balance_loss_mlp": 1.03695512, + "epoch": 0.9120815698345518, + "flos": 635659978752.0, + "grad_norm": 0.055176500742542135, + "language_loss": 0.77731133, + "learning_rate": 2.0133997816083992e-05, + "loss": 0.78795385, + "num_input_tokens_seen": 392048160, + "router_z_loss_mlp": 0.27319336, + "step": 4741, + "time_per_iteration": 2.8225715160369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066383, + "balance_loss_mlp": 1.03879797, + "epoch": 0.9122739515198153, + "flos": 701988291072.0, + "grad_norm": 0.06489997535399476, + "language_loss": 0.85749066, + "learning_rate": 2.0046573374980447e-05, + "loss": 0.86815447, + "num_input_tokens_seen": 392128960, + "router_z_loss_mlp": 0.27587891, + "step": 4742, + "time_per_iteration": 2.8944971561431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071381, + "balance_loss_mlp": 1.04346228, + "epoch": 0.9124663332050789, + "flos": 524435295744.0, + "grad_norm": 0.06824129090140331, + "language_loss": 0.8727017, + "learning_rate": 1.995933526832239e-05, + "loss": 0.88341552, + "num_input_tokens_seen": 392195008, + "router_z_loss_mlp": 0.27954102, + "step": 4743, + "time_per_iteration": 2.675344705581665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063959, + "balance_loss_mlp": 1.03596842, + "epoch": 0.9126587148903424, + "flos": 563033078784.0, + "grad_norm": 0.06616707154942209, + "language_loss": 0.82495749, + "learning_rate": 1.9872283529978662e-05, + "loss": 0.83559716, + "num_input_tokens_seen": 392265168, + "router_z_loss_mlp": 0.2800293, + "step": 4744, + "time_per_iteration": 2.6696653366088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063829, + "balance_loss_mlp": 1.03543317, + "epoch": 0.912851096575606, + "flos": 505695121920.0, + "grad_norm": 0.06073199145800207, + "language_loss": 0.80086148, + "learning_rate": 1.978541819374574e-05, + "loss": 0.81149977, + "num_input_tokens_seen": 392329456, + "router_z_loss_mlp": 0.28393555, + "step": 4745, + "time_per_iteration": 2.578810930252075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067604, + "balance_loss_mlp": 1.03970885, + "epoch": 0.9130434782608695, + "flos": 550472219136.0, + "grad_norm": 0.05936218936651509, + "language_loss": 0.82134587, + "learning_rate": 1.9698739293347755e-05, + "loss": 0.83202189, + "num_input_tokens_seen": 392397792, + "router_z_loss_mlp": 0.27905273, + "step": 4746, + "time_per_iteration": 2.6668622493743896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068401, + "balance_loss_mlp": 1.04005289, + "epoch": 0.9132358599461331, + "flos": 468737100288.0, + "grad_norm": 0.05782738716134406, + "language_loss": 0.83086479, + "learning_rate": 1.9612246862436456e-05, + "loss": 0.84154886, + "num_input_tokens_seen": 392462928, + "router_z_loss_mlp": 0.28344727, + "step": 4747, + "time_per_iteration": 2.540804147720337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060155, + "balance_loss_mlp": 1.03197372, + "epoch": 0.9134282416313967, + "flos": 505847890944.0, + "grad_norm": 0.06478397348859542, + "language_loss": 0.79643875, + "learning_rate": 1.9525940934591148e-05, + "loss": 0.80704033, + "num_input_tokens_seen": 392531840, + "router_z_loss_mlp": 0.28173828, + "step": 4748, + "time_per_iteration": 2.716663122177124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106669, + "balance_loss_mlp": 1.03808033, + "epoch": 0.9136206233166603, + "flos": 604540827648.0, + "grad_norm": 0.06062197289258145, + "language_loss": 0.84058869, + "learning_rate": 1.9439821543318748e-05, + "loss": 0.85125566, + "num_input_tokens_seen": 392602464, + "router_z_loss_mlp": 0.28613281, + "step": 4749, + "time_per_iteration": 2.7612674236297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066673, + "balance_loss_mlp": 1.03830099, + "epoch": 0.9138130050019239, + "flos": 561467510784.0, + "grad_norm": 0.05734158508663424, + "language_loss": 0.83067048, + "learning_rate": 1.9353888722053793e-05, + "loss": 0.84133726, + "num_input_tokens_seen": 392669872, + "router_z_loss_mlp": 0.28369141, + "step": 4750, + "time_per_iteration": 2.6877682209014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065085, + "balance_loss_mlp": 1.03695142, + "epoch": 0.9140053866871873, + "flos": 689811545088.0, + "grad_norm": 0.051169949793753604, + "language_loss": 0.89908755, + "learning_rate": 1.9268142504158426e-05, + "loss": 0.90973842, + "num_input_tokens_seen": 392744256, + "router_z_loss_mlp": 0.28173828, + "step": 4751, + "time_per_iteration": 2.8582828044891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064321, + "balance_loss_mlp": 1.03659272, + "epoch": 0.9141977683724509, + "flos": 550734087168.0, + "grad_norm": 0.050902490738550396, + "language_loss": 0.83958328, + "learning_rate": 1.9182582922922186e-05, + "loss": 0.85022652, + "num_input_tokens_seen": 392816832, + "router_z_loss_mlp": 0.27758789, + "step": 4752, + "time_per_iteration": 2.700676918029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067305, + "balance_loss_mlp": 1.0389812, + "epoch": 0.9143901500577145, + "flos": 539831927808.0, + "grad_norm": 0.05842457753383261, + "language_loss": 0.7560339, + "learning_rate": 1.9097210011562228e-05, + "loss": 0.76670694, + "num_input_tokens_seen": 392886304, + "router_z_loss_mlp": 0.28320312, + "step": 4753, + "time_per_iteration": 2.669036626815796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064355, + "balance_loss_mlp": 1.03615046, + "epoch": 0.9145825317429781, + "flos": 528512159232.0, + "grad_norm": 0.05637418626712114, + "language_loss": 0.80865467, + "learning_rate": 1.9012023803223366e-05, + "loss": 0.81929827, + "num_input_tokens_seen": 392955872, + "router_z_loss_mlp": 0.2824707, + "step": 4754, + "time_per_iteration": 2.6243560314178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065748, + "balance_loss_mlp": 1.03723359, + "epoch": 0.9147749134282416, + "flos": 514538500608.0, + "grad_norm": 0.06489204553695826, + "language_loss": 0.7878328, + "learning_rate": 1.892702433097776e-05, + "loss": 0.79849029, + "num_input_tokens_seen": 393025776, + "router_z_loss_mlp": 0.28515625, + "step": 4755, + "time_per_iteration": 2.6430461406707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066615, + "balance_loss_mlp": 1.03900671, + "epoch": 0.9149672951135052, + "flos": 514174735872.0, + "grad_norm": 0.05282624485424685, + "language_loss": 0.85728586, + "learning_rate": 1.8842211627825233e-05, + "loss": 0.86795199, + "num_input_tokens_seen": 393095936, + "router_z_loss_mlp": 0.27661133, + "step": 4756, + "time_per_iteration": 2.6440606117248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067698, + "balance_loss_mlp": 1.03958797, + "epoch": 0.9151596767987688, + "flos": 576781185024.0, + "grad_norm": 0.054456788510216333, + "language_loss": 0.81087077, + "learning_rate": 1.8757585726692727e-05, + "loss": 0.82154775, + "num_input_tokens_seen": 393166816, + "router_z_loss_mlp": 0.28125, + "step": 4757, + "time_per_iteration": 2.7387938499450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010674, + "balance_loss_mlp": 1.03940928, + "epoch": 0.9153520584840323, + "flos": 619051368960.0, + "grad_norm": 0.05987446215333431, + "language_loss": 0.8248508, + "learning_rate": 1.8673146660435182e-05, + "loss": 0.8355248, + "num_input_tokens_seen": 393242176, + "router_z_loss_mlp": 0.2800293, + "step": 4758, + "time_per_iteration": 2.7512242794036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066234, + "balance_loss_mlp": 1.0381248, + "epoch": 0.9155444401692959, + "flos": 468687638016.0, + "grad_norm": 0.05345227999499999, + "language_loss": 0.82700217, + "learning_rate": 1.8588894461834704e-05, + "loss": 0.83766448, + "num_input_tokens_seen": 393311792, + "router_z_loss_mlp": 0.28125, + "step": 4759, + "time_per_iteration": 2.608558177947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012165, + "balance_loss_mlp": 1.0007689, + "epoch": 0.9157368218545594, + "flos": 1409931601920.0, + "grad_norm": 0.005458035356382807, + "language_loss": 0.7481907, + "learning_rate": 1.8504829163600855e-05, + "loss": 0.75831234, + "num_input_tokens_seen": 393535648, + "router_z_loss_mlp": 0.11376953, + "step": 4760, + "time_per_iteration": 4.859896898269653 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012165, + "balance_loss_mlp": 1.00076854, + "epoch": 0.915929203539823, + "flos": 1521195572736.0, + "grad_norm": 0.005460828305516296, + "language_loss": 0.79576051, + "learning_rate": 1.8420950798370584e-05, + "loss": 0.80588222, + "num_input_tokens_seen": 393767040, + "router_z_loss_mlp": 0.11376953, + "step": 4761, + "time_per_iteration": 4.915686368942261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066102, + "balance_loss_mlp": 1.03811181, + "epoch": 0.9161215852250866, + "flos": 535480049664.0, + "grad_norm": 0.06450242723998267, + "language_loss": 0.80469358, + "learning_rate": 1.8337259398708616e-05, + "loss": 0.81535459, + "num_input_tokens_seen": 393841232, + "router_z_loss_mlp": 0.2800293, + "step": 4762, + "time_per_iteration": 2.695746421813965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066403, + "balance_loss_mlp": 1.03860331, + "epoch": 0.9163139669103502, + "flos": 590350381056.0, + "grad_norm": 0.06265162160460012, + "language_loss": 0.80308187, + "learning_rate": 1.8253754997106632e-05, + "loss": 0.81374586, + "num_input_tokens_seen": 393910512, + "router_z_loss_mlp": 0.27807617, + "step": 4763, + "time_per_iteration": 2.71388840675354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064606, + "balance_loss_mlp": 1.03549504, + "epoch": 0.9165063485956138, + "flos": 821627159040.0, + "grad_norm": 0.04894944964333379, + "language_loss": 0.84645033, + "learning_rate": 1.817043762598397e-05, + "loss": 0.85709637, + "num_input_tokens_seen": 393988624, + "router_z_loss_mlp": 0.29125977, + "step": 4764, + "time_per_iteration": 3.070787191390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064261, + "balance_loss_mlp": 1.03619957, + "epoch": 0.9166987302808772, + "flos": 524932890624.0, + "grad_norm": 0.06771662225705596, + "language_loss": 0.81886947, + "learning_rate": 1.8087307317687264e-05, + "loss": 0.82951206, + "num_input_tokens_seen": 394059184, + "router_z_loss_mlp": 0.28100586, + "step": 4765, + "time_per_iteration": 2.6674678325653076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066384, + "balance_loss_mlp": 1.03829777, + "epoch": 0.9168911119661408, + "flos": 654784266240.0, + "grad_norm": 0.31016783948163307, + "language_loss": 0.84433573, + "learning_rate": 1.800436410449058e-05, + "loss": 0.85499954, + "num_input_tokens_seen": 394142160, + "router_z_loss_mlp": 0.28125, + "step": 4766, + "time_per_iteration": 2.902374267578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067891, + "balance_loss_mlp": 1.03980517, + "epoch": 0.9170834936514044, + "flos": 491504675328.0, + "grad_norm": 0.06885835194999351, + "language_loss": 0.84648538, + "learning_rate": 1.7921608018595436e-05, + "loss": 0.85716426, + "num_input_tokens_seen": 394207056, + "router_z_loss_mlp": 0.28100586, + "step": 4767, + "time_per_iteration": 2.538447141647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062538, + "balance_loss_mlp": 1.03500056, + "epoch": 0.917275875336668, + "flos": 627756535296.0, + "grad_norm": 0.061905177796194595, + "language_loss": 0.80407935, + "learning_rate": 1.7839039092130415e-05, + "loss": 0.81470478, + "num_input_tokens_seen": 394275456, + "router_z_loss_mlp": 0.27587891, + "step": 4768, + "time_per_iteration": 2.7707064151763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013969, + "balance_loss_mlp": 1.00262046, + "epoch": 0.9174682570219315, + "flos": 1517176935936.0, + "grad_norm": 0.007402133526123785, + "language_loss": 0.78180236, + "learning_rate": 1.7756657357151762e-05, + "loss": 0.79194206, + "num_input_tokens_seen": 394503808, + "router_z_loss_mlp": 0.11328125, + "step": 4769, + "time_per_iteration": 4.909727096557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065476, + "balance_loss_mlp": 1.03796232, + "epoch": 0.917660638707195, + "flos": 559749173760.0, + "grad_norm": 0.05411764777194339, + "language_loss": 0.85018283, + "learning_rate": 1.7674462845642835e-05, + "loss": 0.86083758, + "num_input_tokens_seen": 394573776, + "router_z_loss_mlp": 0.27490234, + "step": 4770, + "time_per_iteration": 2.6659553050994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067691, + "balance_loss_mlp": 1.03958189, + "epoch": 0.9178530203924586, + "flos": 447022941696.0, + "grad_norm": 0.05527017290762028, + "language_loss": 0.8355031, + "learning_rate": 1.7592455589514387e-05, + "loss": 0.84617996, + "num_input_tokens_seen": 394637600, + "router_z_loss_mlp": 0.28149414, + "step": 4771, + "time_per_iteration": 2.547340154647827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064625, + "balance_loss_mlp": 1.03634822, + "epoch": 0.9180454020777222, + "flos": 465734002176.0, + "grad_norm": 0.06001441010446878, + "language_loss": 0.80642879, + "learning_rate": 1.7510635620604453e-05, + "loss": 0.81707501, + "num_input_tokens_seen": 394707344, + "router_z_loss_mlp": 0.28295898, + "step": 4772, + "time_per_iteration": 2.5512609481811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064768, + "balance_loss_mlp": 1.03744531, + "epoch": 0.9182377837629858, + "flos": 596023335936.0, + "grad_norm": 0.06645756172963627, + "language_loss": 0.87070215, + "learning_rate": 1.74290029706784e-05, + "loss": 0.8813498, + "num_input_tokens_seen": 394786368, + "router_z_loss_mlp": 0.2734375, + "step": 4773, + "time_per_iteration": 2.845562219619751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066198, + "balance_loss_mlp": 1.03706312, + "epoch": 0.9184301654482493, + "flos": 996251249664.0, + "grad_norm": 0.05732398262370566, + "language_loss": 0.82560432, + "learning_rate": 1.734755767142876e-05, + "loss": 0.83626628, + "num_input_tokens_seen": 394876976, + "router_z_loss_mlp": 0.29125977, + "step": 4774, + "time_per_iteration": 3.335674524307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064966, + "balance_loss_mlp": 1.03685653, + "epoch": 0.9186225471335129, + "flos": 508600705536.0, + "grad_norm": 0.04683275579109834, + "language_loss": 0.84353292, + "learning_rate": 1.7266299754475467e-05, + "loss": 0.8541826, + "num_input_tokens_seen": 394949024, + "router_z_loss_mlp": 0.28100586, + "step": 4775, + "time_per_iteration": 2.6537563800811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065661, + "balance_loss_mlp": 1.03690755, + "epoch": 0.9188149288187765, + "flos": 940011789312.0, + "grad_norm": 0.05975738892977174, + "language_loss": 0.7872526, + "learning_rate": 1.718522925136551e-05, + "loss": 0.79790926, + "num_input_tokens_seen": 395044352, + "router_z_loss_mlp": 0.28759766, + "step": 4776, + "time_per_iteration": 3.2783892154693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060807, + "balance_loss_mlp": 1.03284085, + "epoch": 0.91900731050404, + "flos": 583402839552.0, + "grad_norm": 0.05439818019426215, + "language_loss": 0.83903718, + "learning_rate": 1.7104346193573484e-05, + "loss": 0.84964526, + "num_input_tokens_seen": 395113824, + "router_z_loss_mlp": 0.2800293, + "step": 4777, + "time_per_iteration": 2.7296934127807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063777, + "balance_loss_mlp": 1.03614461, + "epoch": 0.9191996921893035, + "flos": 580941006336.0, + "grad_norm": 0.06874414122365366, + "language_loss": 0.79326808, + "learning_rate": 1.7023650612500828e-05, + "loss": 0.80390579, + "num_input_tokens_seen": 395184496, + "router_z_loss_mlp": 0.27661133, + "step": 4778, + "time_per_iteration": 2.688161849975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106495, + "balance_loss_mlp": 1.03626871, + "epoch": 0.9193920738745671, + "flos": 908566751232.0, + "grad_norm": 0.06089128327905484, + "language_loss": 0.80218065, + "learning_rate": 1.6943142539476374e-05, + "loss": 0.81283021, + "num_input_tokens_seen": 395263760, + "router_z_loss_mlp": 0.28710938, + "step": 4779, + "time_per_iteration": 3.1092312335968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017454, + "balance_loss_mlp": 1.00615335, + "epoch": 0.9195844555598307, + "flos": 1557557074944.0, + "grad_norm": 0.00875582721078654, + "language_loss": 0.79795396, + "learning_rate": 1.686282200575606e-05, + "loss": 0.80812848, + "num_input_tokens_seen": 395482384, + "router_z_loss_mlp": 0.11279297, + "step": 4780, + "time_per_iteration": 4.68978214263916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064328, + "balance_loss_mlp": 1.03633761, + "epoch": 0.9197768372450943, + "flos": 473813535744.0, + "grad_norm": 0.07949185177818381, + "language_loss": 0.78788704, + "learning_rate": 1.678268904252317e-05, + "loss": 0.79853034, + "num_input_tokens_seen": 395550384, + "router_z_loss_mlp": 0.2800293, + "step": 4781, + "time_per_iteration": 2.57722544670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064183, + "balance_loss_mlp": 1.03614509, + "epoch": 0.9199692189303579, + "flos": 856622352384.0, + "grad_norm": 0.0600534002616839, + "language_loss": 0.84106392, + "learning_rate": 1.6702743680888088e-05, + "loss": 0.85170579, + "num_input_tokens_seen": 395632320, + "router_z_loss_mlp": 0.28051758, + "step": 4782, + "time_per_iteration": 3.200462818145752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069787, + "balance_loss_mlp": 1.04131949, + "epoch": 0.9201616006156214, + "flos": 504144110592.0, + "grad_norm": 0.06544428358770026, + "language_loss": 0.7733472, + "learning_rate": 1.6622985951888327e-05, + "loss": 0.78404504, + "num_input_tokens_seen": 395703856, + "router_z_loss_mlp": 0.28442383, + "step": 4783, + "time_per_iteration": 2.631211042404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064893, + "balance_loss_mlp": 1.03697455, + "epoch": 0.9203539823008849, + "flos": 548503598592.0, + "grad_norm": 0.052623927296135346, + "language_loss": 0.84798336, + "learning_rate": 1.6543415886488554e-05, + "loss": 0.85863233, + "num_input_tokens_seen": 395779456, + "router_z_loss_mlp": 0.27929688, + "step": 4784, + "time_per_iteration": 2.7127954959869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065976, + "balance_loss_mlp": 1.03824794, + "epoch": 0.9205463639861485, + "flos": 539738795520.0, + "grad_norm": 0.10290216948314322, + "language_loss": 0.82366759, + "learning_rate": 1.6464033515580624e-05, + "loss": 0.83432734, + "num_input_tokens_seen": 395849584, + "router_z_loss_mlp": 0.27734375, + "step": 4785, + "time_per_iteration": 2.635606527328491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064956, + "balance_loss_mlp": 1.03684688, + "epoch": 0.9207387456714121, + "flos": 799367353344.0, + "grad_norm": 0.0634001865555332, + "language_loss": 0.77785552, + "learning_rate": 1.6384838869983488e-05, + "loss": 0.78850508, + "num_input_tokens_seen": 395943712, + "router_z_loss_mlp": 0.28125, + "step": 4786, + "time_per_iteration": 3.0849039554595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067889, + "balance_loss_mlp": 1.03977942, + "epoch": 0.9209311273566756, + "flos": 502607655936.0, + "grad_norm": 0.05967809159970589, + "language_loss": 0.78313106, + "learning_rate": 1.630583198044333e-05, + "loss": 0.79380995, + "num_input_tokens_seen": 396013168, + "router_z_loss_mlp": 0.28125, + "step": 4787, + "time_per_iteration": 2.667234182357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066454, + "balance_loss_mlp": 1.03841579, + "epoch": 0.9211235090419392, + "flos": 569059623936.0, + "grad_norm": 0.06420045870845124, + "language_loss": 0.82682192, + "learning_rate": 1.6227012877633173e-05, + "loss": 0.83748651, + "num_input_tokens_seen": 396082032, + "router_z_loss_mlp": 0.28051758, + "step": 4788, + "time_per_iteration": 2.7027649879455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066666, + "balance_loss_mlp": 1.03879452, + "epoch": 0.9213158907272028, + "flos": 806205795840.0, + "grad_norm": 0.07333708723290279, + "language_loss": 0.82389617, + "learning_rate": 1.6148381592153538e-05, + "loss": 0.8345629, + "num_input_tokens_seen": 396157984, + "router_z_loss_mlp": 0.27880859, + "step": 4789, + "time_per_iteration": 3.0040667057037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064674, + "balance_loss_mlp": 1.03623104, + "epoch": 0.9215082724124664, + "flos": 490441084416.0, + "grad_norm": 0.05377371866871187, + "language_loss": 0.75874245, + "learning_rate": 1.6069938154531618e-05, + "loss": 0.76938921, + "num_input_tokens_seen": 396223840, + "router_z_loss_mlp": 0.28442383, + "step": 4790, + "time_per_iteration": 2.5756027698516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011385, + "balance_loss_mlp": 1.0000844, + "epoch": 0.9217006540977299, + "flos": 1513648539648.0, + "grad_norm": 0.004042788422816454, + "language_loss": 0.77070266, + "learning_rate": 1.599168259522188e-05, + "loss": 0.7808165, + "num_input_tokens_seen": 396458288, + "router_z_loss_mlp": 0.11279297, + "step": 4791, + "time_per_iteration": 4.973644018173218 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067727, + "balance_loss_mlp": 1.03909278, + "epoch": 0.9218930357829934, + "flos": 743471308800.0, + "grad_norm": 0.057601458706605435, + "language_loss": 0.76279974, + "learning_rate": 1.5913614944605804e-05, + "loss": 0.77347702, + "num_input_tokens_seen": 396536208, + "router_z_loss_mlp": 0.28613281, + "step": 4792, + "time_per_iteration": 2.9516866207122803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060038, + "balance_loss_mlp": 1.03223848, + "epoch": 0.922085417468257, + "flos": 452803585536.0, + "grad_norm": 0.05595873441646043, + "language_loss": 0.80235362, + "learning_rate": 1.5835735232992032e-05, + "loss": 0.81295407, + "num_input_tokens_seen": 396599984, + "router_z_loss_mlp": 0.27807617, + "step": 4793, + "time_per_iteration": 2.5641462802886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062665, + "balance_loss_mlp": 1.03479338, + "epoch": 0.9222777991535206, + "flos": 500003228160.0, + "grad_norm": 0.059727249910016274, + "language_loss": 0.85150099, + "learning_rate": 1.575804349061616e-05, + "loss": 0.86212766, + "num_input_tokens_seen": 396664592, + "router_z_loss_mlp": 0.27880859, + "step": 4794, + "time_per_iteration": 2.593534231185913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069605, + "balance_loss_mlp": 1.04101813, + "epoch": 0.9224701808387842, + "flos": 527704644096.0, + "grad_norm": 0.10466348651463832, + "language_loss": 0.78718358, + "learning_rate": 1.5680539747640722e-05, + "loss": 0.79787964, + "num_input_tokens_seen": 396729472, + "router_z_loss_mlp": 0.28540039, + "step": 4795, + "time_per_iteration": 2.6495256423950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066319, + "balance_loss_mlp": 1.03882968, + "epoch": 0.9226625625240477, + "flos": 874272794112.0, + "grad_norm": 0.05332169239610604, + "language_loss": 0.75038373, + "learning_rate": 1.5603224034155315e-05, + "loss": 0.76104683, + "num_input_tokens_seen": 396810384, + "router_z_loss_mlp": 0.27563477, + "step": 4796, + "time_per_iteration": 3.1437277793884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067759, + "balance_loss_mlp": 1.04029274, + "epoch": 0.9228549442093112, + "flos": 502529080320.0, + "grad_norm": 0.06430684376904929, + "language_loss": 0.88128197, + "learning_rate": 1.5526096380176657e-05, + "loss": 0.89195955, + "num_input_tokens_seen": 396875472, + "router_z_loss_mlp": 0.27490234, + "step": 4797, + "time_per_iteration": 2.5683467388153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106672, + "balance_loss_mlp": 1.03853893, + "epoch": 0.9230473258945748, + "flos": 599705911296.0, + "grad_norm": 0.05378810065548013, + "language_loss": 0.8519541, + "learning_rate": 1.544915681564829e-05, + "loss": 0.86262131, + "num_input_tokens_seen": 396949888, + "router_z_loss_mlp": 0.28173828, + "step": 4798, + "time_per_iteration": 2.7920963764190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068406, + "balance_loss_mlp": 1.04027247, + "epoch": 0.9232397075798384, + "flos": 822168423936.0, + "grad_norm": 0.056516225106437785, + "language_loss": 0.79586041, + "learning_rate": 1.5372405370440822e-05, + "loss": 0.80654448, + "num_input_tokens_seen": 397027504, + "router_z_loss_mlp": 0.28149414, + "step": 4799, + "time_per_iteration": 3.106332302093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063986, + "balance_loss_mlp": 1.03666353, + "epoch": 0.923432089265102, + "flos": 706719900672.0, + "grad_norm": 0.056508741932020275, + "language_loss": 0.84739339, + "learning_rate": 1.5295842074351805e-05, + "loss": 0.8580333, + "num_input_tokens_seen": 397101600, + "router_z_loss_mlp": 0.2734375, + "step": 4800, + "time_per_iteration": 2.9519155025482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067591, + "balance_loss_mlp": 1.03986311, + "epoch": 0.9236244709503655, + "flos": 701554715136.0, + "grad_norm": 0.066698066809805, + "language_loss": 0.76543391, + "learning_rate": 1.5219466957105798e-05, + "loss": 0.77610976, + "num_input_tokens_seen": 397170880, + "router_z_loss_mlp": 0.27758789, + "step": 4801, + "time_per_iteration": 2.848271131515503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067135, + "balance_loss_mlp": 1.03895378, + "epoch": 0.9238168526356291, + "flos": 514780019712.0, + "grad_norm": 0.06767841567088752, + "language_loss": 0.83995795, + "learning_rate": 1.5143280048354136e-05, + "loss": 0.85062933, + "num_input_tokens_seen": 397242272, + "router_z_loss_mlp": 0.28198242, + "step": 4802, + "time_per_iteration": 2.6541097164154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067162, + "balance_loss_mlp": 1.03881443, + "epoch": 0.9240092343208927, + "flos": 491789864448.0, + "grad_norm": 0.05859515366880438, + "language_loss": 0.81224668, + "learning_rate": 1.5067281377675213e-05, + "loss": 0.8229183, + "num_input_tokens_seen": 397308032, + "router_z_loss_mlp": 0.28369141, + "step": 4803, + "time_per_iteration": 2.564298391342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064444, + "balance_loss_mlp": 1.03745484, + "epoch": 0.9242016160061562, + "flos": 646915728384.0, + "grad_norm": 0.0605331222731167, + "language_loss": 0.73584902, + "learning_rate": 1.4991470974574484e-05, + "loss": 0.74649346, + "num_input_tokens_seen": 397390944, + "router_z_loss_mlp": 0.27026367, + "step": 4804, + "time_per_iteration": 2.9267265796661377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065448, + "balance_loss_mlp": 1.0383395, + "epoch": 0.9243939976914197, + "flos": 729094597632.0, + "grad_norm": 0.06139671428886114, + "language_loss": 0.79408431, + "learning_rate": 1.4915848868484016e-05, + "loss": 0.80473882, + "num_input_tokens_seen": 397468128, + "router_z_loss_mlp": 0.27148438, + "step": 4805, + "time_per_iteration": 3.0169341564178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068038, + "balance_loss_mlp": 1.0397377, + "epoch": 0.9245863793766833, + "flos": 452006244864.0, + "grad_norm": 0.07511640502598389, + "language_loss": 0.90425861, + "learning_rate": 1.4840415088763048e-05, + "loss": 0.91493905, + "num_input_tokens_seen": 397538976, + "router_z_loss_mlp": 0.28320312, + "step": 4806, + "time_per_iteration": 2.5978922843933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106683, + "balance_loss_mlp": 1.03974569, + "epoch": 0.9247787610619469, + "flos": 754697945088.0, + "grad_norm": 0.06258160431282603, + "language_loss": 0.76767433, + "learning_rate": 1.476516966469732e-05, + "loss": 0.7783426, + "num_input_tokens_seen": 397612944, + "router_z_loss_mlp": 0.27124023, + "step": 4807, + "time_per_iteration": 2.956561803817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069239, + "balance_loss_mlp": 1.04010475, + "epoch": 0.9249711427472105, + "flos": 561640628736.0, + "grad_norm": 0.055778501240304965, + "language_loss": 0.84958422, + "learning_rate": 1.4690112625499908e-05, + "loss": 0.86027658, + "num_input_tokens_seen": 397690848, + "router_z_loss_mlp": 0.29125977, + "step": 4808, + "time_per_iteration": 2.7347190380096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066354, + "balance_loss_mlp": 1.03819704, + "epoch": 0.9251635244324741, + "flos": 526430057472.0, + "grad_norm": 0.06318769230203738, + "language_loss": 0.85102391, + "learning_rate": 1.4615244000310501e-05, + "loss": 0.86168742, + "num_input_tokens_seen": 397761008, + "router_z_loss_mlp": 0.28173828, + "step": 4809, + "time_per_iteration": 2.6991071701049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065383, + "balance_loss_mlp": 1.03724957, + "epoch": 0.9253559061177375, + "flos": 610982009856.0, + "grad_norm": 0.10096377008479462, + "language_loss": 0.7862674, + "learning_rate": 1.4540563818195685e-05, + "loss": 0.79692125, + "num_input_tokens_seen": 397840640, + "router_z_loss_mlp": 0.28149414, + "step": 4810, + "time_per_iteration": 2.8498053550720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101132, + "balance_loss_mlp": 1.00006664, + "epoch": 0.9255482878030011, + "flos": 1550461146624.0, + "grad_norm": 0.00391096047645047, + "language_loss": 0.76925391, + "learning_rate": 1.446607210814882e-05, + "loss": 0.77936709, + "num_input_tokens_seen": 398060096, + "router_z_loss_mlp": 0.11230469, + "step": 4811, + "time_per_iteration": 4.764047861099243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106661, + "balance_loss_mlp": 1.03804743, + "epoch": 0.9257406694882647, + "flos": 766008949248.0, + "grad_norm": 0.06895968648743506, + "language_loss": 0.80879593, + "learning_rate": 1.4391768899090219e-05, + "loss": 0.81946206, + "num_input_tokens_seen": 398143680, + "router_z_loss_mlp": 0.28588867, + "step": 4812, + "time_per_iteration": 3.061112880706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063531, + "balance_loss_mlp": 1.03599334, + "epoch": 0.9259330511735283, + "flos": 497748008448.0, + "grad_norm": 0.058143604426202734, + "language_loss": 0.83249688, + "learning_rate": 1.431765421986686e-05, + "loss": 0.8431322, + "num_input_tokens_seen": 398207056, + "router_z_loss_mlp": 0.27563477, + "step": 4813, + "time_per_iteration": 2.643853187561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067291, + "balance_loss_mlp": 1.03941989, + "epoch": 0.9261254328587919, + "flos": 626589637632.0, + "grad_norm": 0.061231728055144215, + "language_loss": 0.79372674, + "learning_rate": 1.424372809925273e-05, + "loss": 0.80439967, + "num_input_tokens_seen": 398277472, + "router_z_loss_mlp": 0.27880859, + "step": 4814, + "time_per_iteration": 2.793656349182129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067503, + "balance_loss_mlp": 1.03970385, + "epoch": 0.9263178145440554, + "flos": 597105865728.0, + "grad_norm": 0.05834470087321101, + "language_loss": 0.85063499, + "learning_rate": 1.416999056594831e-05, + "loss": 0.86131001, + "num_input_tokens_seen": 398346544, + "router_z_loss_mlp": 0.27856445, + "step": 4815, + "time_per_iteration": 2.7759041786193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066213, + "balance_loss_mlp": 1.03779304, + "epoch": 0.926510196229319, + "flos": 388350761472.0, + "grad_norm": 0.07585655013860047, + "language_loss": 0.83582151, + "learning_rate": 1.4096441648581259e-05, + "loss": 0.84648359, + "num_input_tokens_seen": 398409344, + "router_z_loss_mlp": 0.28393555, + "step": 4816, + "time_per_iteration": 2.5199952125549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066914, + "balance_loss_mlp": 1.03937626, + "epoch": 0.9267025779145825, + "flos": 545533996032.0, + "grad_norm": 0.07628608002460177, + "language_loss": 0.8428371, + "learning_rate": 1.4023081375705737e-05, + "loss": 0.85350621, + "num_input_tokens_seen": 398478816, + "router_z_loss_mlp": 0.27563477, + "step": 4817, + "time_per_iteration": 2.6322243213653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069066, + "balance_loss_mlp": 1.04109931, + "epoch": 0.9268949595998461, + "flos": 499540538880.0, + "grad_norm": 0.056009628743491614, + "language_loss": 0.81786913, + "learning_rate": 1.3949909775802682e-05, + "loss": 0.82855976, + "num_input_tokens_seen": 398550384, + "router_z_loss_mlp": 0.27978516, + "step": 4818, + "time_per_iteration": 2.652061700820923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064068, + "balance_loss_mlp": 1.03619719, + "epoch": 0.9270873412851096, + "flos": 432601150464.0, + "grad_norm": 0.060612630955757626, + "language_loss": 0.82984769, + "learning_rate": 1.3876926877279817e-05, + "loss": 0.84048837, + "num_input_tokens_seen": 398620832, + "router_z_loss_mlp": 0.27880859, + "step": 4819, + "time_per_iteration": 2.693363666534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063444, + "balance_loss_mlp": 1.03633547, + "epoch": 0.9272797229703732, + "flos": 466512403968.0, + "grad_norm": 0.05620589668035287, + "language_loss": 0.85918975, + "learning_rate": 1.380413270847164e-05, + "loss": 0.86982417, + "num_input_tokens_seen": 398689776, + "router_z_loss_mlp": 0.27148438, + "step": 4820, + "time_per_iteration": 2.6197123527526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063733, + "balance_loss_mlp": 1.03567159, + "epoch": 0.9274721046556368, + "flos": 704486439936.0, + "grad_norm": 0.05510862949217126, + "language_loss": 0.78793794, + "learning_rate": 1.373152729763938e-05, + "loss": 0.79857528, + "num_input_tokens_seen": 398775072, + "router_z_loss_mlp": 0.28076172, + "step": 4821, + "time_per_iteration": 3.0157546997070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009196, + "balance_loss_mlp": 0.99789476, + "epoch": 0.9276644863409004, + "flos": 1401486893568.0, + "grad_norm": 0.004102053243796757, + "language_loss": 0.82380462, + "learning_rate": 1.3659110672970931e-05, + "loss": 0.83389658, + "num_input_tokens_seen": 399002016, + "router_z_loss_mlp": 0.11279297, + "step": 4822, + "time_per_iteration": 4.890833854675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065915, + "balance_loss_mlp": 1.03830612, + "epoch": 0.927856868026164, + "flos": 741370268160.0, + "grad_norm": 0.06118264568584961, + "language_loss": 0.80120695, + "learning_rate": 1.3586882862580917e-05, + "loss": 0.8118661, + "num_input_tokens_seen": 399085808, + "router_z_loss_mlp": 0.27636719, + "step": 4823, + "time_per_iteration": 3.035834550857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068353, + "balance_loss_mlp": 1.04052949, + "epoch": 0.9280492497114274, + "flos": 412000045056.0, + "grad_norm": 0.06263010480012486, + "language_loss": 0.7379303, + "learning_rate": 1.3514843894510686e-05, + "loss": 0.74861383, + "num_input_tokens_seen": 399146768, + "router_z_loss_mlp": 0.27832031, + "step": 4824, + "time_per_iteration": 2.461954355239868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067965, + "balance_loss_mlp": 1.03964114, + "epoch": 0.928241631396691, + "flos": 646215902208.0, + "grad_norm": 0.0608824957877414, + "language_loss": 0.84066081, + "learning_rate": 1.3442993796728254e-05, + "loss": 0.85134053, + "num_input_tokens_seen": 399220192, + "router_z_loss_mlp": 0.28320312, + "step": 4825, + "time_per_iteration": 2.787639617919922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064738, + "balance_loss_mlp": 1.03674757, + "epoch": 0.9284340130819546, + "flos": 696537916416.0, + "grad_norm": 0.05281576196945734, + "language_loss": 0.80929303, + "learning_rate": 1.3371332597128249e-05, + "loss": 0.81994045, + "num_input_tokens_seen": 399300064, + "router_z_loss_mlp": 0.27978516, + "step": 4826, + "time_per_iteration": 2.9434425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106758, + "balance_loss_mlp": 1.03951824, + "epoch": 0.9286263947672182, + "flos": 758780600832.0, + "grad_norm": 0.05077416980052692, + "language_loss": 0.8357712, + "learning_rate": 1.3299860323532032e-05, + "loss": 0.84644705, + "num_input_tokens_seen": 399383200, + "router_z_loss_mlp": 0.28076172, + "step": 4827, + "time_per_iteration": 3.032938241958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065405, + "balance_loss_mlp": 1.03722429, + "epoch": 0.9288187764524817, + "flos": 672495754752.0, + "grad_norm": 0.05731069309896637, + "language_loss": 0.80015826, + "learning_rate": 1.3228577003687681e-05, + "loss": 0.81081235, + "num_input_tokens_seen": 399466400, + "router_z_loss_mlp": 0.28198242, + "step": 4828, + "time_per_iteration": 2.917309522628784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064578, + "balance_loss_mlp": 1.03715968, + "epoch": 0.9290111581377453, + "flos": 500220016128.0, + "grad_norm": 0.059623319858346985, + "language_loss": 0.83761036, + "learning_rate": 1.3157482665269727e-05, + "loss": 0.84825623, + "num_input_tokens_seen": 399533504, + "router_z_loss_mlp": 0.27490234, + "step": 4829, + "time_per_iteration": 2.6183080673217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007339, + "balance_loss_mlp": 0.99599022, + "epoch": 0.9292035398230089, + "flos": 1562773132800.0, + "grad_norm": 0.004808076693014282, + "language_loss": 0.72122061, + "learning_rate": 1.3086577335879424e-05, + "loss": 0.73129404, + "num_input_tokens_seen": 399769872, + "router_z_loss_mlp": 0.11328125, + "step": 4830, + "time_per_iteration": 4.92920446395874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100734, + "balance_loss_mlp": 0.99599123, + "epoch": 0.9293959215082724, + "flos": 1517828709888.0, + "grad_norm": 0.004800596533171945, + "language_loss": 0.79511833, + "learning_rate": 1.3015861043044753e-05, + "loss": 0.80519176, + "num_input_tokens_seen": 399997760, + "router_z_loss_mlp": 0.11328125, + "step": 4831, + "time_per_iteration": 4.897861003875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068474, + "balance_loss_mlp": 1.04098451, + "epoch": 0.929588303193536, + "flos": 557572529664.0, + "grad_norm": 0.08301907154555206, + "language_loss": 0.84292293, + "learning_rate": 1.2945333814220195e-05, + "loss": 0.85360765, + "num_input_tokens_seen": 400063872, + "router_z_loss_mlp": 0.27514648, + "step": 4832, + "time_per_iteration": 2.660130023956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066483, + "balance_loss_mlp": 1.03892231, + "epoch": 0.9297806848787995, + "flos": 478338531840.0, + "grad_norm": 0.07295774266444127, + "language_loss": 0.79771066, + "learning_rate": 1.2874995676786905e-05, + "loss": 0.80837542, + "num_input_tokens_seen": 400126064, + "router_z_loss_mlp": 0.27612305, + "step": 4833, + "time_per_iteration": 2.603111982345581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064962, + "balance_loss_mlp": 1.03737652, + "epoch": 0.9299730665640631, + "flos": 564259613184.0, + "grad_norm": 0.05026331549954224, + "language_loss": 0.80081427, + "learning_rate": 1.2804846658052372e-05, + "loss": 0.81146395, + "num_input_tokens_seen": 400201776, + "router_z_loss_mlp": 0.27612305, + "step": 4834, + "time_per_iteration": 2.769982099533081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069532, + "balance_loss_mlp": 1.04094601, + "epoch": 0.9301654482493267, + "flos": 559883003904.0, + "grad_norm": 0.06905746243332382, + "language_loss": 0.82341313, + "learning_rate": 1.2734886785251032e-05, + "loss": 0.83410847, + "num_input_tokens_seen": 400279504, + "router_z_loss_mlp": 0.28564453, + "step": 4835, + "time_per_iteration": 2.8149759769439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009283, + "balance_loss_mlp": 0.9979341, + "epoch": 0.9303578299345903, + "flos": 1519251683328.0, + "grad_norm": 0.00409976409094526, + "language_loss": 0.76852441, + "learning_rate": 1.2665116085543715e-05, + "loss": 0.77861726, + "num_input_tokens_seen": 400514800, + "router_z_loss_mlp": 0.11328125, + "step": 4836, + "time_per_iteration": 4.970911979675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063815, + "balance_loss_mlp": 1.03622973, + "epoch": 0.9305502116198537, + "flos": 530589878784.0, + "grad_norm": 0.06739289017975023, + "language_loss": 0.83095217, + "learning_rate": 1.2595534586017698e-05, + "loss": 0.84159029, + "num_input_tokens_seen": 400582640, + "router_z_loss_mlp": 0.27612305, + "step": 4837, + "time_per_iteration": 2.6105446815490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063157, + "balance_loss_mlp": 1.03542876, + "epoch": 0.9307425933051173, + "flos": 474660338688.0, + "grad_norm": 0.08448682706583123, + "language_loss": 0.81358898, + "learning_rate": 1.2526142313686983e-05, + "loss": 0.82422054, + "num_input_tokens_seen": 400646912, + "router_z_loss_mlp": 0.27758789, + "step": 4838, + "time_per_iteration": 2.5410094261169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066167, + "balance_loss_mlp": 1.03817677, + "epoch": 0.9309349749903809, + "flos": 584600260608.0, + "grad_norm": 0.056392712397074544, + "language_loss": 0.8662045, + "learning_rate": 1.245693929549213e-05, + "loss": 0.8768661, + "num_input_tokens_seen": 400722128, + "router_z_loss_mlp": 0.27978516, + "step": 4839, + "time_per_iteration": 2.7685422897338867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068427, + "balance_loss_mlp": 1.04088926, + "epoch": 0.9311273566756445, + "flos": 861298707456.0, + "grad_norm": 0.05211302264948049, + "language_loss": 0.76617467, + "learning_rate": 1.2387925558299984e-05, + "loss": 0.77685893, + "num_input_tokens_seen": 400801440, + "router_z_loss_mlp": 0.27587891, + "step": 4840, + "time_per_iteration": 3.0811336040496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063666, + "balance_loss_mlp": 1.03512692, + "epoch": 0.9313197383609081, + "flos": 547828503552.0, + "grad_norm": 0.05632257898904223, + "language_loss": 0.82257402, + "learning_rate": 1.231910112890411e-05, + "loss": 0.83321071, + "num_input_tokens_seen": 400873008, + "router_z_loss_mlp": 0.28564453, + "step": 4841, + "time_per_iteration": 2.6456518173217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063146, + "balance_loss_mlp": 1.03546572, + "epoch": 0.9315121200461716, + "flos": 468520312320.0, + "grad_norm": 0.06835542650299899, + "language_loss": 0.81116635, + "learning_rate": 1.2250466034024522e-05, + "loss": 0.82179779, + "num_input_tokens_seen": 400935328, + "router_z_loss_mlp": 0.27709961, + "step": 4842, + "time_per_iteration": 2.51662015914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065674, + "balance_loss_mlp": 1.03777909, + "epoch": 0.9317045017314352, + "flos": 417435863040.0, + "grad_norm": 0.08457749327119382, + "language_loss": 0.77490675, + "learning_rate": 1.2182020300307684e-05, + "loss": 0.78556347, + "num_input_tokens_seen": 401000720, + "router_z_loss_mlp": 0.27929688, + "step": 4843, + "time_per_iteration": 2.5226502418518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067033, + "balance_loss_mlp": 1.03863692, + "epoch": 0.9318968834166987, + "flos": 540207277056.0, + "grad_norm": 0.0629878065349501, + "language_loss": 0.77101374, + "learning_rate": 1.2113763954326729e-05, + "loss": 0.78168404, + "num_input_tokens_seen": 401079664, + "router_z_loss_mlp": 0.28393555, + "step": 4844, + "time_per_iteration": 2.7928385734558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066678, + "balance_loss_mlp": 1.03959417, + "epoch": 0.9320892651019623, + "flos": 521077197312.0, + "grad_norm": 0.07893659024382914, + "language_loss": 0.80772531, + "learning_rate": 1.2045697022581015e-05, + "loss": 0.81839204, + "num_input_tokens_seen": 401146160, + "router_z_loss_mlp": 0.27148438, + "step": 4845, + "time_per_iteration": 2.6249451637268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066098, + "balance_loss_mlp": 1.03848934, + "epoch": 0.9322816467872258, + "flos": 581779044864.0, + "grad_norm": 0.073776710078104, + "language_loss": 0.80633116, + "learning_rate": 1.1977819531496348e-05, + "loss": 0.8169921, + "num_input_tokens_seen": 401223264, + "router_z_loss_mlp": 0.27636719, + "step": 4846, + "time_per_iteration": 2.76720929145813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066342, + "balance_loss_mlp": 1.03830385, + "epoch": 0.9324740284724894, + "flos": 484484350464.0, + "grad_norm": 0.0546236987081132, + "language_loss": 0.8169229, + "learning_rate": 1.191013150742537e-05, + "loss": 0.82758635, + "num_input_tokens_seen": 401296368, + "router_z_loss_mlp": 0.28051758, + "step": 4847, + "time_per_iteration": 2.705033540725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064662, + "balance_loss_mlp": 1.03544426, + "epoch": 0.932666410157753, + "flos": 732227143680.0, + "grad_norm": 0.05343238410156727, + "language_loss": 0.82291055, + "learning_rate": 1.1842632976646672e-05, + "loss": 0.83355719, + "num_input_tokens_seen": 401383936, + "router_z_loss_mlp": 0.29162598, + "step": 4848, + "time_per_iteration": 3.050323009490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062876, + "balance_loss_mlp": 1.03505254, + "epoch": 0.9328587918430166, + "flos": 965127716352.0, + "grad_norm": 0.05192892613374428, + "language_loss": 0.78535151, + "learning_rate": 1.1775323965365681e-05, + "loss": 0.79598027, + "num_input_tokens_seen": 401468784, + "router_z_loss_mlp": 0.27832031, + "step": 4849, + "time_per_iteration": 3.2585856914520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065264, + "balance_loss_mlp": 1.03708267, + "epoch": 0.9330511735282802, + "flos": 614270297088.0, + "grad_norm": 0.06344871555323196, + "language_loss": 0.80523133, + "learning_rate": 1.1708204499713936e-05, + "loss": 0.81588399, + "num_input_tokens_seen": 401539712, + "router_z_loss_mlp": 0.28173828, + "step": 4850, + "time_per_iteration": 2.7109756469726562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067141, + "balance_loss_mlp": 1.03857827, + "epoch": 0.9332435552135436, + "flos": 558823795200.0, + "grad_norm": 0.056851249126662895, + "language_loss": 0.85547817, + "learning_rate": 1.1641274605749653e-05, + "loss": 0.86614954, + "num_input_tokens_seen": 401610432, + "router_z_loss_mlp": 0.28540039, + "step": 4851, + "time_per_iteration": 2.687770128250122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063314, + "balance_loss_mlp": 1.0353713, + "epoch": 0.9334359368988072, + "flos": 515281996800.0, + "grad_norm": 0.052446357449260315, + "language_loss": 0.81798899, + "learning_rate": 1.1574534309457208e-05, + "loss": 0.82862216, + "num_input_tokens_seen": 401677344, + "router_z_loss_mlp": 0.27954102, + "step": 4852, + "time_per_iteration": 2.627295970916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064942, + "balance_loss_mlp": 1.03778601, + "epoch": 0.9336283185840708, + "flos": 539527799808.0, + "grad_norm": 0.048937576786060644, + "language_loss": 0.82746959, + "learning_rate": 1.1507983636747488e-05, + "loss": 0.83811903, + "num_input_tokens_seen": 401756864, + "router_z_loss_mlp": 0.2722168, + "step": 4853, + "time_per_iteration": 2.7801096439361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007035, + "balance_loss_mlp": 0.99568605, + "epoch": 0.9338207002693344, + "flos": 1562003495424.0, + "grad_norm": 0.003590229468680035, + "language_loss": 0.78455019, + "learning_rate": 1.1441622613457824e-05, + "loss": 0.79462051, + "num_input_tokens_seen": 401983664, + "router_z_loss_mlp": 0.11328125, + "step": 4854, + "time_per_iteration": 4.905395746231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106343, + "balance_loss_mlp": 1.03570247, + "epoch": 0.9340130819545979, + "flos": 644951490048.0, + "grad_norm": 0.10383389048571988, + "language_loss": 0.81319606, + "learning_rate": 1.1375451265351833e-05, + "loss": 0.82383037, + "num_input_tokens_seen": 402065744, + "router_z_loss_mlp": 0.27758789, + "step": 4855, + "time_per_iteration": 2.9259116649627686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063971, + "balance_loss_mlp": 1.03652906, + "epoch": 0.9342054636398615, + "flos": 503175062016.0, + "grad_norm": 0.058422853939071095, + "language_loss": 0.76883429, + "learning_rate": 1.1309469618119516e-05, + "loss": 0.77947402, + "num_input_tokens_seen": 402137728, + "router_z_loss_mlp": 0.27429199, + "step": 4856, + "time_per_iteration": 2.6962661743164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066201, + "balance_loss_mlp": 1.03816259, + "epoch": 0.934397845325125, + "flos": 592724874240.0, + "grad_norm": 0.04989142835749334, + "language_loss": 0.84225118, + "learning_rate": 1.1243677697377109e-05, + "loss": 0.85291314, + "num_input_tokens_seen": 402220160, + "router_z_loss_mlp": 0.28051758, + "step": 4857, + "time_per_iteration": 2.89798903465271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065399, + "balance_loss_mlp": 1.03750432, + "epoch": 0.9345902270103886, + "flos": 499643845632.0, + "grad_norm": 0.059259414346205984, + "language_loss": 0.80253309, + "learning_rate": 1.1178075528667453e-05, + "loss": 0.81318712, + "num_input_tokens_seen": 402285168, + "router_z_loss_mlp": 0.27905273, + "step": 4858, + "time_per_iteration": 2.6194543838500977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007919, + "balance_loss_mlp": 0.99657035, + "epoch": 0.9347826086956522, + "flos": 1519563165696.0, + "grad_norm": 0.0037344003597183113, + "language_loss": 0.7598772, + "learning_rate": 1.1112663137459566e-05, + "loss": 0.76995641, + "num_input_tokens_seen": 402504912, + "router_z_loss_mlp": 0.11328125, + "step": 4859, + "time_per_iteration": 4.687377452850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063938, + "balance_loss_mlp": 1.0351609, + "epoch": 0.9349749903809157, + "flos": 504273558528.0, + "grad_norm": 0.05342173918778132, + "language_loss": 0.80887705, + "learning_rate": 1.1047440549148636e-05, + "loss": 0.81951642, + "num_input_tokens_seen": 402582032, + "router_z_loss_mlp": 0.28759766, + "step": 4860, + "time_per_iteration": 2.833953857421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065793, + "balance_loss_mlp": 1.03792143, + "epoch": 0.9351673720661793, + "flos": 568636222464.0, + "grad_norm": 0.07222784760329864, + "language_loss": 0.78340459, + "learning_rate": 1.0982407789056514e-05, + "loss": 0.79406255, + "num_input_tokens_seen": 402650144, + "router_z_loss_mlp": 0.27880859, + "step": 4861, + "time_per_iteration": 2.6537110805511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106113, + "balance_loss_mlp": 1.0336163, + "epoch": 0.9353597537514429, + "flos": 544342367232.0, + "grad_norm": 0.06260642991207148, + "language_loss": 0.86519629, + "learning_rate": 1.0917564882430952e-05, + "loss": 0.87580758, + "num_input_tokens_seen": 402720368, + "router_z_loss_mlp": 0.27539062, + "step": 4862, + "time_per_iteration": 2.633547067642212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063681, + "balance_loss_mlp": 1.03521395, + "epoch": 0.9355521354367065, + "flos": 518743401984.0, + "grad_norm": 0.0581520502605348, + "language_loss": 0.84730381, + "learning_rate": 1.0852911854446368e-05, + "loss": 0.85794055, + "num_input_tokens_seen": 402795568, + "router_z_loss_mlp": 0.28442383, + "step": 4863, + "time_per_iteration": 2.7387821674346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064, + "balance_loss_mlp": 1.0354135, + "epoch": 0.93574451712197, + "flos": 446087388672.0, + "grad_norm": 0.06553565178892076, + "language_loss": 0.78680766, + "learning_rate": 1.0788448730203237e-05, + "loss": 0.79744768, + "num_input_tokens_seen": 402858784, + "router_z_loss_mlp": 0.28564453, + "step": 4864, + "time_per_iteration": 2.493662118911743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063373, + "balance_loss_mlp": 1.03504932, + "epoch": 0.9359368988072335, + "flos": 480273656832.0, + "grad_norm": 0.06767871177547606, + "language_loss": 0.7636739, + "learning_rate": 1.072417553472832e-05, + "loss": 0.77430761, + "num_input_tokens_seen": 402924144, + "router_z_loss_mlp": 0.28295898, + "step": 4865, + "time_per_iteration": 2.520573854446411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062064, + "balance_loss_mlp": 1.03369164, + "epoch": 0.9361292804924971, + "flos": 496876474368.0, + "grad_norm": 0.06249909871095247, + "language_loss": 0.84898299, + "learning_rate": 1.0660092292974766e-05, + "loss": 0.85960364, + "num_input_tokens_seen": 402987488, + "router_z_loss_mlp": 0.28417969, + "step": 4866, + "time_per_iteration": 2.6384427547454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059595, + "balance_loss_mlp": 1.03227186, + "epoch": 0.9363216621777607, + "flos": 617830626816.0, + "grad_norm": 0.05655870704984646, + "language_loss": 0.84264755, + "learning_rate": 1.059619902982184e-05, + "loss": 0.85324347, + "num_input_tokens_seen": 403058224, + "router_z_loss_mlp": 0.27368164, + "step": 4867, + "time_per_iteration": 2.7363386154174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005928, + "balance_loss_mlp": 0.99457914, + "epoch": 0.9365140438630243, + "flos": 1415169570816.0, + "grad_norm": 0.004522338300868298, + "language_loss": 0.79203337, + "learning_rate": 1.053249577007509e-05, + "loss": 0.80209267, + "num_input_tokens_seen": 403289072, + "router_z_loss_mlp": 0.11328125, + "step": 4868, + "time_per_iteration": 4.889655828475952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066052, + "balance_loss_mlp": 1.03765643, + "epoch": 0.9367064255482878, + "flos": 590217960960.0, + "grad_norm": 0.06477461898432092, + "language_loss": 0.81238163, + "learning_rate": 1.0468982538466287e-05, + "loss": 0.82304209, + "num_input_tokens_seen": 403361728, + "router_z_loss_mlp": 0.28393555, + "step": 4869, + "time_per_iteration": 2.7326934337615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063402, + "balance_loss_mlp": 1.03545952, + "epoch": 0.9368988072335513, + "flos": 526384977408.0, + "grad_norm": 0.06786641398202575, + "language_loss": 0.82115895, + "learning_rate": 1.0405659359653597e-05, + "loss": 0.83179295, + "num_input_tokens_seen": 403431536, + "router_z_loss_mlp": 0.27978516, + "step": 4870, + "time_per_iteration": 2.7047648429870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064712, + "balance_loss_mlp": 1.03648376, + "epoch": 0.9370911889188149, + "flos": 742880581632.0, + "grad_norm": 0.05856438164101436, + "language_loss": 0.78791976, + "learning_rate": 1.034252625822113e-05, + "loss": 0.79856682, + "num_input_tokens_seen": 403504768, + "router_z_loss_mlp": 0.28222656, + "step": 4871, + "time_per_iteration": 2.939244270324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061374, + "balance_loss_mlp": 1.03388393, + "epoch": 0.9372835706040785, + "flos": 545779897344.0, + "grad_norm": 0.051186143222515454, + "language_loss": 0.78588909, + "learning_rate": 1.0279583258679448e-05, + "loss": 0.79650283, + "num_input_tokens_seen": 403575584, + "router_z_loss_mlp": 0.27539062, + "step": 4872, + "time_per_iteration": 2.620211362838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062731, + "balance_loss_mlp": 1.03459811, + "epoch": 0.9374759522893421, + "flos": 491367873024.0, + "grad_norm": 0.057187231677836646, + "language_loss": 0.81548411, + "learning_rate": 1.0216830385465003e-05, + "loss": 0.82611144, + "num_input_tokens_seen": 403648720, + "router_z_loss_mlp": 0.28149414, + "step": 4873, + "time_per_iteration": 2.6956076622009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064273, + "balance_loss_mlp": 1.03685474, + "epoch": 0.9376683339746056, + "flos": 578144521728.0, + "grad_norm": 0.07694020765815146, + "language_loss": 0.82509339, + "learning_rate": 1.0154267662940809e-05, + "loss": 0.8357361, + "num_input_tokens_seen": 403721392, + "router_z_loss_mlp": 0.2746582, + "step": 4874, + "time_per_iteration": 2.6782383918762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063517, + "balance_loss_mlp": 1.0343821, + "epoch": 0.9378607156598692, + "flos": 506039947776.0, + "grad_norm": 0.06987708910160345, + "language_loss": 0.80854172, + "learning_rate": 1.0091895115395766e-05, + "loss": 0.81917691, + "num_input_tokens_seen": 403792112, + "router_z_loss_mlp": 0.29125977, + "step": 4875, + "time_per_iteration": 2.655191659927368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062623, + "balance_loss_mlp": 1.03413165, + "epoch": 0.9380530973451328, + "flos": 519753148416.0, + "grad_norm": 0.06331858245443533, + "language_loss": 0.77412724, + "learning_rate": 1.0029712767045062e-05, + "loss": 0.7847535, + "num_input_tokens_seen": 403860928, + "router_z_loss_mlp": 0.28466797, + "step": 4876, + "time_per_iteration": 2.658334255218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063725, + "balance_loss_mlp": 1.03606796, + "epoch": 0.9382454790303963, + "flos": 557533241856.0, + "grad_norm": 0.061367616716062376, + "language_loss": 0.8458181, + "learning_rate": 9.967720642029999e-06, + "loss": 0.85645533, + "num_input_tokens_seen": 403928240, + "router_z_loss_mlp": 0.27661133, + "step": 4877, + "time_per_iteration": 2.6817362308502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065641, + "balance_loss_mlp": 1.03722119, + "epoch": 0.9384378607156598, + "flos": 695149848576.0, + "grad_norm": 0.05571055907247939, + "language_loss": 0.8157208, + "learning_rate": 9.905918764418153e-06, + "loss": 0.82637721, + "num_input_tokens_seen": 404004320, + "router_z_loss_mlp": 0.28417969, + "step": 4878, + "time_per_iteration": 2.924126386642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063872, + "balance_loss_mlp": 1.03533387, + "epoch": 0.9386302424009234, + "flos": 554480681472.0, + "grad_norm": 0.06557840572929766, + "language_loss": 0.80646306, + "learning_rate": 9.844307158203058e-06, + "loss": 0.81710184, + "num_input_tokens_seen": 404077040, + "router_z_loss_mlp": 0.28540039, + "step": 4879, + "time_per_iteration": 2.68676495552063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062372, + "balance_loss_mlp": 1.0342859, + "epoch": 0.938822624086187, + "flos": 566711271936.0, + "grad_norm": 0.05994430498236734, + "language_loss": 0.79781514, + "learning_rate": 9.782885847304469e-06, + "loss": 0.80843884, + "num_input_tokens_seen": 404145248, + "router_z_loss_mlp": 0.28100586, + "step": 4880, + "time_per_iteration": 2.6532607078552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063834, + "balance_loss_mlp": 1.03615308, + "epoch": 0.9390150057714506, + "flos": 417367461888.0, + "grad_norm": 0.05571972818867672, + "language_loss": 0.80477625, + "learning_rate": 9.721654855568196e-06, + "loss": 0.81541461, + "num_input_tokens_seen": 404212000, + "router_z_loss_mlp": 0.27709961, + "step": 4881, + "time_per_iteration": 2.5952963829040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060824, + "balance_loss_mlp": 1.03240418, + "epoch": 0.9392073874567142, + "flos": 1553281256448.0, + "grad_norm": 0.057309564525933866, + "language_loss": 0.76139069, + "learning_rate": 9.660614206766394e-06, + "loss": 0.77199888, + "num_input_tokens_seen": 404305408, + "router_z_loss_mlp": 0.28417969, + "step": 4882, + "time_per_iteration": 3.7136471271514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065968, + "balance_loss_mlp": 1.03766727, + "epoch": 0.9393997691419776, + "flos": 652238065152.0, + "grad_norm": 0.05812086572072492, + "language_loss": 0.78156579, + "learning_rate": 9.59976392459705e-06, + "loss": 0.79222548, + "num_input_tokens_seen": 404383248, + "router_z_loss_mlp": 0.28295898, + "step": 4883, + "time_per_iteration": 2.781167984008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005014, + "balance_loss_mlp": 0.99371332, + "epoch": 0.9395921508272412, + "flos": 1552480639488.0, + "grad_norm": 0.004750950893681344, + "language_loss": 0.78170681, + "learning_rate": 9.539104032684209e-06, + "loss": 0.79175687, + "num_input_tokens_seen": 404615264, + "router_z_loss_mlp": 0.11279297, + "step": 4884, + "time_per_iteration": 4.841533899307251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065675, + "balance_loss_mlp": 1.0382328, + "epoch": 0.9397845325125048, + "flos": 497881838592.0, + "grad_norm": 0.05445625931005124, + "language_loss": 0.78697509, + "learning_rate": 9.478634554578314e-06, + "loss": 0.79763186, + "num_input_tokens_seen": 404684656, + "router_z_loss_mlp": 0.27441406, + "step": 4885, + "time_per_iteration": 2.627277135848999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010635, + "balance_loss_mlp": 1.03550982, + "epoch": 0.9399769141977684, + "flos": 498348910080.0, + "grad_norm": 0.08093151667786662, + "language_loss": 0.83377492, + "learning_rate": 9.418355513755638e-06, + "loss": 0.84440994, + "num_input_tokens_seen": 404752096, + "router_z_loss_mlp": 0.2800293, + "step": 4886, + "time_per_iteration": 2.5997188091278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005008, + "balance_loss_mlp": 0.99370664, + "epoch": 0.9401692958830319, + "flos": 1401709473792.0, + "grad_norm": 0.004746445275638401, + "language_loss": 0.79332191, + "learning_rate": 9.358266933618575e-06, + "loss": 0.80337197, + "num_input_tokens_seen": 404980944, + "router_z_loss_mlp": 0.11279297, + "step": 4887, + "time_per_iteration": 4.797895431518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106761, + "balance_loss_mlp": 1.03883338, + "epoch": 0.9403616775682955, + "flos": 539852276736.0, + "grad_norm": 0.04356257563048395, + "language_loss": 0.84935153, + "learning_rate": 9.298368837495575e-06, + "loss": 0.86002755, + "num_input_tokens_seen": 405056688, + "router_z_loss_mlp": 0.28808594, + "step": 4888, + "time_per_iteration": 2.6986100673675537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01004991, + "balance_loss_mlp": 0.99369013, + "epoch": 0.9405540592535591, + "flos": 1321340663808.0, + "grad_norm": 0.004744801636887555, + "language_loss": 0.75169432, + "learning_rate": 9.238661248641089e-06, + "loss": 0.76174426, + "num_input_tokens_seen": 405284656, + "router_z_loss_mlp": 0.11279297, + "step": 4889, + "time_per_iteration": 4.883483648300171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066327, + "balance_loss_mlp": 1.03843164, + "epoch": 0.9407464409388226, + "flos": 572097627648.0, + "grad_norm": 0.09796723375615995, + "language_loss": 0.82906234, + "learning_rate": 9.179144190235799e-06, + "loss": 0.83972561, + "num_input_tokens_seen": 405351584, + "router_z_loss_mlp": 0.27929688, + "step": 4890, + "time_per_iteration": 2.6586780548095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064949, + "balance_loss_mlp": 1.03652978, + "epoch": 0.9409388226240862, + "flos": 510994137600.0, + "grad_norm": 0.050284661991451346, + "language_loss": 0.76816261, + "learning_rate": 9.119817685386112e-06, + "loss": 0.77881205, + "num_input_tokens_seen": 405425712, + "router_z_loss_mlp": 0.28393555, + "step": 4891, + "time_per_iteration": 2.7174863815307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006369, + "balance_loss_mlp": 0.99506766, + "epoch": 0.9411312043093497, + "flos": 1569060135936.0, + "grad_norm": 0.0038787350067584375, + "language_loss": 0.80241883, + "learning_rate": 9.06068175712471e-06, + "loss": 0.81248254, + "num_input_tokens_seen": 405655760, + "router_z_loss_mlp": 0.11279297, + "step": 4892, + "time_per_iteration": 4.926965236663818 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064183, + "balance_loss_mlp": 1.03666997, + "epoch": 0.9413235859946133, + "flos": 569197836288.0, + "grad_norm": 0.07544734388954553, + "language_loss": 0.78440136, + "learning_rate": 9.001736428410234e-06, + "loss": 0.79504317, + "num_input_tokens_seen": 405731664, + "router_z_loss_mlp": 0.27514648, + "step": 4893, + "time_per_iteration": 2.810511827468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069598, + "balance_loss_mlp": 1.04094052, + "epoch": 0.9415159676798769, + "flos": 781567114752.0, + "grad_norm": 0.062357889378770605, + "language_loss": 0.80282962, + "learning_rate": 8.942981722127263e-06, + "loss": 0.81352556, + "num_input_tokens_seen": 405808128, + "router_z_loss_mlp": 0.28662109, + "step": 4894, + "time_per_iteration": 3.0641191005706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068179, + "balance_loss_mlp": 1.0392108, + "epoch": 0.9417083493651405, + "flos": 848960428032.0, + "grad_norm": 0.05312406489604803, + "language_loss": 0.79909003, + "learning_rate": 8.884417661086331e-06, + "loss": 0.80977184, + "num_input_tokens_seen": 405892448, + "router_z_loss_mlp": 0.28955078, + "step": 4895, + "time_per_iteration": 3.1742238998413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064713, + "balance_loss_mlp": 1.0362221, + "epoch": 0.941900731050404, + "flos": 529054834176.0, + "grad_norm": 0.053131997206903085, + "language_loss": 0.85986912, + "learning_rate": 8.826044268024025e-06, + "loss": 0.87051624, + "num_input_tokens_seen": 405966736, + "router_z_loss_mlp": 0.28491211, + "step": 4896, + "time_per_iteration": 2.68365740776062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066337, + "balance_loss_mlp": 1.03803682, + "epoch": 0.9420931127356675, + "flos": 556799920128.0, + "grad_norm": 0.05603051952986068, + "language_loss": 0.8033452, + "learning_rate": 8.767861565602997e-06, + "loss": 0.81400859, + "num_input_tokens_seen": 406043264, + "router_z_loss_mlp": 0.28320312, + "step": 4897, + "time_per_iteration": 2.789910078048706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064812, + "balance_loss_mlp": 1.03653598, + "epoch": 0.9422854944209311, + "flos": 652233682944.0, + "grad_norm": 0.06641212670378875, + "language_loss": 0.86446559, + "learning_rate": 8.709869576411733e-06, + "loss": 0.87511379, + "num_input_tokens_seen": 406119552, + "router_z_loss_mlp": 0.28295898, + "step": 4898, + "time_per_iteration": 2.854471206665039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062929, + "balance_loss_mlp": 1.03436613, + "epoch": 0.9424778761061947, + "flos": 553417090560.0, + "grad_norm": 0.05214304226628579, + "language_loss": 0.84051895, + "learning_rate": 8.65206832296478e-06, + "loss": 0.85114825, + "num_input_tokens_seen": 406192464, + "router_z_loss_mlp": 0.28588867, + "step": 4899, + "time_per_iteration": 2.7259373664855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062683, + "balance_loss_mlp": 1.03505075, + "epoch": 0.9426702577914583, + "flos": 588287218176.0, + "grad_norm": 0.067020017244683, + "language_loss": 0.79881752, + "learning_rate": 8.594457827702406e-06, + "loss": 0.80944431, + "num_input_tokens_seen": 406262640, + "router_z_loss_mlp": 0.27685547, + "step": 4900, + "time_per_iteration": 2.6749682426452637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071741, + "balance_loss_mlp": 1.04353571, + "epoch": 0.9428626394767218, + "flos": 616329077760.0, + "grad_norm": 0.06073739740547212, + "language_loss": 0.7828182, + "learning_rate": 8.537038112991114e-06, + "loss": 0.79353559, + "num_input_tokens_seen": 406341328, + "router_z_loss_mlp": 0.28222656, + "step": 4901, + "time_per_iteration": 2.8038489818573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063214, + "balance_loss_mlp": 1.03515244, + "epoch": 0.9430550211619854, + "flos": 610129414656.0, + "grad_norm": 0.06189608765953851, + "language_loss": 0.81724429, + "learning_rate": 8.479809201123178e-06, + "loss": 0.82787645, + "num_input_tokens_seen": 406418864, + "router_z_loss_mlp": 0.28125, + "step": 4902, + "time_per_iteration": 2.716689109802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066777, + "balance_loss_mlp": 1.03890562, + "epoch": 0.943247402847249, + "flos": 565726256640.0, + "grad_norm": 0.06175079679005683, + "language_loss": 0.78484106, + "learning_rate": 8.422771114316885e-06, + "loss": 0.79550886, + "num_input_tokens_seen": 406492320, + "router_z_loss_mlp": 0.27905273, + "step": 4903, + "time_per_iteration": 2.7039542198181152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062834, + "balance_loss_mlp": 1.03515339, + "epoch": 0.9434397845325125, + "flos": 526779265536.0, + "grad_norm": 0.06498343136748494, + "language_loss": 0.81188715, + "learning_rate": 8.365923874716297e-06, + "loss": 0.82251555, + "num_input_tokens_seen": 406560448, + "router_z_loss_mlp": 0.27734375, + "step": 4904, + "time_per_iteration": 2.6787548065185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064293, + "balance_loss_mlp": 1.03608775, + "epoch": 0.943632166217776, + "flos": 593167214592.0, + "grad_norm": 0.054946869384208306, + "language_loss": 0.82257801, + "learning_rate": 8.309267504391593e-06, + "loss": 0.83322096, + "num_input_tokens_seen": 406631376, + "router_z_loss_mlp": 0.2824707, + "step": 4905, + "time_per_iteration": 2.7595133781433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067293, + "balance_loss_mlp": 1.03875446, + "epoch": 0.9438245479030396, + "flos": 572468594688.0, + "grad_norm": 0.05251720800952187, + "language_loss": 0.85584581, + "learning_rate": 8.252802025338623e-06, + "loss": 0.86651874, + "num_input_tokens_seen": 406713728, + "router_z_loss_mlp": 0.28515625, + "step": 4906, + "time_per_iteration": 2.860182523727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063847, + "balance_loss_mlp": 1.03592801, + "epoch": 0.9440169295883032, + "flos": 488018539008.0, + "grad_norm": 0.06069717631166294, + "language_loss": 0.81664246, + "learning_rate": 8.196527459479242e-06, + "loss": 0.82728094, + "num_input_tokens_seen": 406779168, + "router_z_loss_mlp": 0.27905273, + "step": 4907, + "time_per_iteration": 2.527818202972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065327, + "balance_loss_mlp": 1.03717005, + "epoch": 0.9442093112735668, + "flos": 731399279616.0, + "grad_norm": 0.05466017438310119, + "language_loss": 0.7315473, + "learning_rate": 8.140443828661137e-06, + "loss": 0.74220055, + "num_input_tokens_seen": 406860816, + "router_z_loss_mlp": 0.28173828, + "step": 4908, + "time_per_iteration": 2.998216390609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067308, + "balance_loss_mlp": 1.03867412, + "epoch": 0.9444016929588304, + "flos": 570763404288.0, + "grad_norm": 0.0622325125694981, + "language_loss": 0.82240564, + "learning_rate": 8.084551154658004e-06, + "loss": 0.83307874, + "num_input_tokens_seen": 406929888, + "router_z_loss_mlp": 0.28637695, + "step": 4909, + "time_per_iteration": 2.6756813526153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106833, + "balance_loss_mlp": 1.03957677, + "epoch": 0.9445940746440938, + "flos": 509038663680.0, + "grad_norm": 0.06423421312294773, + "language_loss": 0.85805643, + "learning_rate": 8.028849459169318e-06, + "loss": 0.86873972, + "num_input_tokens_seen": 406998224, + "router_z_loss_mlp": 0.28735352, + "step": 4910, + "time_per_iteration": 2.6203274726867676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065056, + "balance_loss_mlp": 1.03773332, + "epoch": 0.9447864563293574, + "flos": 624247077888.0, + "grad_norm": 0.0551617966572636, + "language_loss": 0.80864727, + "learning_rate": 7.97333876382028e-06, + "loss": 0.81929785, + "num_input_tokens_seen": 407075088, + "router_z_loss_mlp": 0.2734375, + "step": 4911, + "time_per_iteration": 2.824267864227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065097, + "balance_loss_mlp": 1.03674865, + "epoch": 0.944978838014621, + "flos": 505011262464.0, + "grad_norm": 0.05688652138029108, + "language_loss": 0.80638492, + "learning_rate": 7.918019090162098e-06, + "loss": 0.81703591, + "num_input_tokens_seen": 407147792, + "router_z_loss_mlp": 0.28344727, + "step": 4912, + "time_per_iteration": 2.713205575942993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006738, + "balance_loss_mlp": 0.99548489, + "epoch": 0.9451712196998846, + "flos": 1483371809280.0, + "grad_norm": 0.003561934378017574, + "language_loss": 0.78287339, + "learning_rate": 7.862890459671812e-06, + "loss": 0.7929408, + "num_input_tokens_seen": 407387216, + "router_z_loss_mlp": 0.11230469, + "step": 4913, + "time_per_iteration": 4.960860013961792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066622, + "balance_loss_mlp": 1.03829789, + "epoch": 0.9453636013851482, + "flos": 520885140480.0, + "grad_norm": 0.07082362828499891, + "language_loss": 0.90111738, + "learning_rate": 7.80795289375219e-06, + "loss": 0.91178358, + "num_input_tokens_seen": 407457664, + "router_z_loss_mlp": 0.28344727, + "step": 4914, + "time_per_iteration": 2.635700225830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007926, + "balance_loss_mlp": 0.99667293, + "epoch": 0.9455559830704117, + "flos": 1496060706816.0, + "grad_norm": 0.003268488212439821, + "language_loss": 0.8356235, + "learning_rate": 7.75320641373195e-06, + "loss": 0.84570277, + "num_input_tokens_seen": 407700256, + "router_z_loss_mlp": 0.11230469, + "step": 4915, + "time_per_iteration": 4.9683918952941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072161, + "balance_loss_mlp": 1.04393244, + "epoch": 0.9457483647556753, + "flos": 497871664128.0, + "grad_norm": 0.047809814034212056, + "language_loss": 0.81528771, + "learning_rate": 7.698651040865534e-06, + "loss": 0.82600927, + "num_input_tokens_seen": 407770080, + "router_z_loss_mlp": 0.2824707, + "step": 4916, + "time_per_iteration": 2.639611005783081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065204, + "balance_loss_mlp": 1.0374043, + "epoch": 0.9459407464409388, + "flos": 1018979536896.0, + "grad_norm": 0.054134558028750085, + "language_loss": 0.82247525, + "learning_rate": 7.644286796333222e-06, + "loss": 0.83312732, + "num_input_tokens_seen": 407854640, + "router_z_loss_mlp": 0.27807617, + "step": 4917, + "time_per_iteration": 3.4140350818634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065583, + "balance_loss_mlp": 1.03830767, + "epoch": 0.9461331281262024, + "flos": 513332315136.0, + "grad_norm": 0.0625234500918243, + "language_loss": 0.8119607, + "learning_rate": 7.590113701241075e-06, + "loss": 0.82261658, + "num_input_tokens_seen": 407922704, + "router_z_loss_mlp": 0.27319336, + "step": 4918, + "time_per_iteration": 2.5912117958068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065806, + "balance_loss_mlp": 1.03698182, + "epoch": 0.9463255098114659, + "flos": 527768663040.0, + "grad_norm": 0.06365242386186536, + "language_loss": 0.78204429, + "learning_rate": 7.536131776620936e-06, + "loss": 0.79270232, + "num_input_tokens_seen": 407991136, + "router_z_loss_mlp": 0.2878418, + "step": 4919, + "time_per_iteration": 2.6376984119415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067142, + "balance_loss_mlp": 1.0388658, + "epoch": 0.9465178914967295, + "flos": 505798428672.0, + "grad_norm": 0.07305820868603019, + "language_loss": 0.83628333, + "learning_rate": 7.482341043430485e-06, + "loss": 0.8469547, + "num_input_tokens_seen": 408056576, + "router_z_loss_mlp": 0.28271484, + "step": 4920, + "time_per_iteration": 2.559981107711792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060734, + "balance_loss_mlp": 1.03264809, + "epoch": 0.9467102731819931, + "flos": 659934895104.0, + "grad_norm": 0.055619804981278775, + "language_loss": 0.85643375, + "learning_rate": 7.428741522553184e-06, + "loss": 0.86704111, + "num_input_tokens_seen": 408136960, + "router_z_loss_mlp": 0.28100586, + "step": 4921, + "time_per_iteration": 2.871453046798706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064091, + "balance_loss_mlp": 1.0357666, + "epoch": 0.9469026548672567, + "flos": 674854281216.0, + "grad_norm": 0.055827613473534016, + "language_loss": 0.89702082, + "learning_rate": 7.375333234798054e-06, + "loss": 0.90766174, + "num_input_tokens_seen": 408218304, + "router_z_loss_mlp": 0.28295898, + "step": 4922, + "time_per_iteration": 2.930006980895996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064475, + "balance_loss_mlp": 1.03593636, + "epoch": 0.9470950365525203, + "flos": 513701872128.0, + "grad_norm": 0.062350678546594374, + "language_loss": 0.79667199, + "learning_rate": 7.32211620090012e-06, + "loss": 0.80731678, + "num_input_tokens_seen": 408287936, + "router_z_loss_mlp": 0.28540039, + "step": 4923, + "time_per_iteration": 2.703117847442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065466, + "balance_loss_mlp": 1.03783345, + "epoch": 0.9472874182377837, + "flos": 549823265280.0, + "grad_norm": 0.050108759523029664, + "language_loss": 0.81262392, + "learning_rate": 7.269090441520132e-06, + "loss": 0.82327855, + "num_input_tokens_seen": 408365568, + "router_z_loss_mlp": 0.27661133, + "step": 4924, + "time_per_iteration": 2.7624218463897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067551, + "balance_loss_mlp": 1.04025233, + "epoch": 0.9474797999230473, + "flos": 542510548992.0, + "grad_norm": 0.061582025232696735, + "language_loss": 0.79940867, + "learning_rate": 7.216255977244457e-06, + "loss": 0.81008416, + "num_input_tokens_seen": 408431248, + "router_z_loss_mlp": 0.2734375, + "step": 4925, + "time_per_iteration": 2.63815975189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070421, + "balance_loss_mlp": 1.04262114, + "epoch": 0.9476721816083109, + "flos": 844291427328.0, + "grad_norm": 0.064300432731251, + "language_loss": 0.85653675, + "learning_rate": 7.163612828585242e-06, + "loss": 0.86724097, + "num_input_tokens_seen": 408514112, + "router_z_loss_mlp": 0.27807617, + "step": 4926, + "time_per_iteration": 3.1508045196533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063425, + "balance_loss_mlp": 1.03569698, + "epoch": 0.9478645632935745, + "flos": 637717349376.0, + "grad_norm": 0.059256067654064305, + "language_loss": 0.79014599, + "learning_rate": 7.1111610159803605e-06, + "loss": 0.80078024, + "num_input_tokens_seen": 408585968, + "router_z_loss_mlp": 0.27758789, + "step": 4927, + "time_per_iteration": 2.7674243450164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067601, + "balance_loss_mlp": 1.03920519, + "epoch": 0.948056944978838, + "flos": 656531716608.0, + "grad_norm": 0.05522948680571442, + "language_loss": 0.75659686, + "learning_rate": 7.058900559793469e-06, + "loss": 0.76727289, + "num_input_tokens_seen": 408665456, + "router_z_loss_mlp": 0.28417969, + "step": 4928, + "time_per_iteration": 2.807382583618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067187, + "balance_loss_mlp": 1.03938746, + "epoch": 0.9482493266641016, + "flos": 440676301824.0, + "grad_norm": 0.061938965827223864, + "language_loss": 0.83113259, + "learning_rate": 7.00683148031378e-06, + "loss": 0.8418045, + "num_input_tokens_seen": 408730192, + "router_z_loss_mlp": 0.27832031, + "step": 4929, + "time_per_iteration": 2.523789882659912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065424, + "balance_loss_mlp": 1.03771996, + "epoch": 0.9484417083493651, + "flos": 545707113984.0, + "grad_norm": 0.06503778908082132, + "language_loss": 0.77616841, + "learning_rate": 6.9549537977564024e-06, + "loss": 0.78682268, + "num_input_tokens_seen": 408807616, + "router_z_loss_mlp": 0.27709961, + "step": 4930, + "time_per_iteration": 2.8400285243988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067128, + "balance_loss_mlp": 1.03930449, + "epoch": 0.9486340900346287, + "flos": 538325996544.0, + "grad_norm": 0.049505853011934595, + "language_loss": 0.79665405, + "learning_rate": 6.903267532262003e-06, + "loss": 0.80732536, + "num_input_tokens_seen": 408883552, + "router_z_loss_mlp": 0.27856445, + "step": 4931, + "time_per_iteration": 2.6870524883270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068871, + "balance_loss_mlp": 1.03999853, + "epoch": 0.9488264717198923, + "flos": 681362454528.0, + "grad_norm": 0.054233592359025716, + "language_loss": 0.85670519, + "learning_rate": 6.851772703896975e-06, + "loss": 0.86739385, + "num_input_tokens_seen": 408956400, + "router_z_loss_mlp": 0.28881836, + "step": 4932, + "time_per_iteration": 2.857355833053589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069295, + "balance_loss_mlp": 1.04092288, + "epoch": 0.9490188534051558, + "flos": 462365729280.0, + "grad_norm": 0.064073251907137, + "language_loss": 0.87887645, + "learning_rate": 6.8004693326533805e-06, + "loss": 0.8895694, + "num_input_tokens_seen": 409019904, + "router_z_loss_mlp": 0.28369141, + "step": 4933, + "time_per_iteration": 2.540163993835449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068966, + "balance_loss_mlp": 1.04067755, + "epoch": 0.9492112350904194, + "flos": 542865549312.0, + "grad_norm": 0.05372716069283064, + "language_loss": 0.82574224, + "learning_rate": 6.7493574384489e-06, + "loss": 0.83643186, + "num_input_tokens_seen": 409094288, + "router_z_loss_mlp": 0.28259277, + "step": 4934, + "time_per_iteration": 2.6855287551879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063867, + "balance_loss_mlp": 1.0359726, + "epoch": 0.949403616775683, + "flos": 550040053248.0, + "grad_norm": 0.052069086168931376, + "language_loss": 0.8394295, + "learning_rate": 6.698437041126992e-06, + "loss": 0.85006821, + "num_input_tokens_seen": 409169120, + "router_z_loss_mlp": 0.27929688, + "step": 4935, + "time_per_iteration": 2.7103271484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106989, + "balance_loss_mlp": 1.0421617, + "epoch": 0.9495959984609466, + "flos": 598105437696.0, + "grad_norm": 0.05030032999954777, + "language_loss": 0.82814801, + "learning_rate": 6.647708160456678e-06, + "loss": 0.83884692, + "num_input_tokens_seen": 409243200, + "router_z_loss_mlp": 0.27758789, + "step": 4936, + "time_per_iteration": 2.770634889602661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065498, + "balance_loss_mlp": 1.0376029, + "epoch": 0.94978838014621, + "flos": 608130270720.0, + "grad_norm": 0.06038472870984303, + "language_loss": 0.82238394, + "learning_rate": 6.597170816132702e-06, + "loss": 0.83303893, + "num_input_tokens_seen": 409319264, + "router_z_loss_mlp": 0.27929688, + "step": 4937, + "time_per_iteration": 2.81235408782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068864, + "balance_loss_mlp": 1.04106474, + "epoch": 0.9499807618314736, + "flos": 540575424000.0, + "grad_norm": 0.07012929733727388, + "language_loss": 0.86437929, + "learning_rate": 6.546825027775427e-06, + "loss": 0.87506789, + "num_input_tokens_seen": 409389840, + "router_z_loss_mlp": 0.27832031, + "step": 4938, + "time_per_iteration": 2.6199066638946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066003, + "balance_loss_mlp": 1.03789318, + "epoch": 0.9501731435167372, + "flos": 594323937792.0, + "grad_norm": 0.0557301660975644, + "language_loss": 0.82896394, + "learning_rate": 6.496670814930717e-06, + "loss": 0.83962405, + "num_input_tokens_seen": 409458752, + "router_z_loss_mlp": 0.28125, + "step": 4939, + "time_per_iteration": 2.7088263034820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064245, + "balance_loss_mlp": 1.0366118, + "epoch": 0.9503655252020008, + "flos": 453906464256.0, + "grad_norm": 0.057760924764302495, + "language_loss": 0.80044109, + "learning_rate": 6.446708197070161e-06, + "loss": 0.81108356, + "num_input_tokens_seen": 409525008, + "router_z_loss_mlp": 0.27685547, + "step": 4940, + "time_per_iteration": 2.6036903858184814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066709, + "balance_loss_mlp": 1.03814626, + "epoch": 0.9505579068872644, + "flos": 667649253888.0, + "grad_norm": 0.06216222313569856, + "language_loss": 0.84629482, + "learning_rate": 6.396937193591079e-06, + "loss": 0.85696185, + "num_input_tokens_seen": 409603376, + "router_z_loss_mlp": 0.28540039, + "step": 4941, + "time_per_iteration": 2.8155128955841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070131, + "balance_loss_mlp": 1.04249859, + "epoch": 0.9507502885725279, + "flos": 401989768704.0, + "grad_norm": 0.07429194359954051, + "language_loss": 0.81656432, + "learning_rate": 6.347357823816235e-06, + "loss": 0.82726562, + "num_input_tokens_seen": 409667168, + "router_z_loss_mlp": 0.27661133, + "step": 4942, + "time_per_iteration": 2.4733738899230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064756, + "balance_loss_mlp": 1.03709936, + "epoch": 0.9509426702577914, + "flos": 700015288320.0, + "grad_norm": 0.05479314794150921, + "language_loss": 0.7956689, + "learning_rate": 6.297970106994011e-06, + "loss": 0.80631644, + "num_input_tokens_seen": 409746832, + "router_z_loss_mlp": 0.27685547, + "step": 4943, + "time_per_iteration": 2.9666907787323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066084, + "balance_loss_mlp": 1.03816581, + "epoch": 0.951135051943055, + "flos": 501170125824.0, + "grad_norm": 0.05425923566819056, + "language_loss": 0.82572865, + "learning_rate": 6.2487740622985126e-06, + "loss": 0.83638954, + "num_input_tokens_seen": 409813792, + "router_z_loss_mlp": 0.27954102, + "step": 4944, + "time_per_iteration": 2.5886473655700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068167, + "balance_loss_mlp": 1.03998554, + "epoch": 0.9513274336283186, + "flos": 614310994944.0, + "grad_norm": 0.049572920738515824, + "language_loss": 0.81490457, + "learning_rate": 6.1997697088292395e-06, + "loss": 0.8255862, + "num_input_tokens_seen": 409898848, + "router_z_loss_mlp": 0.28149414, + "step": 4945, + "time_per_iteration": 2.938873767852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068372, + "balance_loss_mlp": 1.04088187, + "epoch": 0.9515198153135821, + "flos": 519334129152.0, + "grad_norm": 0.07213408654984042, + "language_loss": 0.81845057, + "learning_rate": 6.150957065611363e-06, + "loss": 0.82913423, + "num_input_tokens_seen": 409966368, + "router_z_loss_mlp": 0.27490234, + "step": 4946, + "time_per_iteration": 2.5683889389038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067353, + "balance_loss_mlp": 1.03955328, + "epoch": 0.9517121969988457, + "flos": 664622834688.0, + "grad_norm": 0.05349359226162988, + "language_loss": 0.76608801, + "learning_rate": 6.102336151595667e-06, + "loss": 0.77676153, + "num_input_tokens_seen": 410048496, + "router_z_loss_mlp": 0.27807617, + "step": 4947, + "time_per_iteration": 2.931286573410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067386, + "balance_loss_mlp": 1.0390383, + "epoch": 0.9519045786841093, + "flos": 676108518912.0, + "grad_norm": 0.0631556824358652, + "language_loss": 0.75756991, + "learning_rate": 6.053906985658553e-06, + "loss": 0.76824379, + "num_input_tokens_seen": 410121840, + "router_z_loss_mlp": 0.28344727, + "step": 4948, + "time_per_iteration": 2.8119544982910156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065174, + "balance_loss_mlp": 1.03739834, + "epoch": 0.9520969603693729, + "flos": 652593065472.0, + "grad_norm": 0.05176605196525789, + "language_loss": 0.80436432, + "learning_rate": 6.005669586601814e-06, + "loss": 0.81501603, + "num_input_tokens_seen": 410199152, + "router_z_loss_mlp": 0.27832031, + "step": 4949, + "time_per_iteration": 2.8334755897521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068678, + "balance_loss_mlp": 1.04095006, + "epoch": 0.9522893420546364, + "flos": 742935836160.0, + "grad_norm": 0.04702530547499014, + "language_loss": 0.83160955, + "learning_rate": 5.957623973152748e-06, + "loss": 0.84229636, + "num_input_tokens_seen": 410285392, + "router_z_loss_mlp": 0.27783203, + "step": 4950, + "time_per_iteration": 3.024099111557007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066354, + "balance_loss_mlp": 1.0383395, + "epoch": 0.9524817237398999, + "flos": 761364679680.0, + "grad_norm": 0.055590433220462955, + "language_loss": 0.80557394, + "learning_rate": 5.909770163964545e-06, + "loss": 0.81623745, + "num_input_tokens_seen": 410359872, + "router_z_loss_mlp": 0.28027344, + "step": 4951, + "time_per_iteration": 2.9261345863342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063968, + "balance_loss_mlp": 1.03624022, + "epoch": 0.9526741054251635, + "flos": 528871541760.0, + "grad_norm": 0.06028024445787797, + "language_loss": 0.81832278, + "learning_rate": 5.8621081776155105e-06, + "loss": 0.82896245, + "num_input_tokens_seen": 410425728, + "router_z_loss_mlp": 0.27783203, + "step": 4952, + "time_per_iteration": 2.601012706756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067093, + "balance_loss_mlp": 1.03907871, + "epoch": 0.9528664871104271, + "flos": 488196039168.0, + "grad_norm": 0.08007516789791078, + "language_loss": 0.80964506, + "learning_rate": 5.814638032609787e-06, + "loss": 0.82031596, + "num_input_tokens_seen": 410496080, + "router_z_loss_mlp": 0.28051758, + "step": 4953, + "time_per_iteration": 2.6237566471099854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010662, + "balance_loss_mlp": 1.03804255, + "epoch": 0.9530588687956907, + "flos": 517464433152.0, + "grad_norm": 0.06660623394003432, + "language_loss": 0.85304189, + "learning_rate": 5.76735974737691e-06, + "loss": 0.86370385, + "num_input_tokens_seen": 410576448, + "router_z_loss_mlp": 0.28198242, + "step": 4954, + "time_per_iteration": 2.7781050205230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066312, + "balance_loss_mlp": 1.03772521, + "epoch": 0.9532512504809542, + "flos": 674833932288.0, + "grad_norm": 0.056693610090972645, + "language_loss": 0.8034358, + "learning_rate": 5.720273340271864e-06, + "loss": 0.81409889, + "num_input_tokens_seen": 410655792, + "router_z_loss_mlp": 0.28588867, + "step": 4955, + "time_per_iteration": 2.8433279991149902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067264, + "balance_loss_mlp": 1.03934515, + "epoch": 0.9534436321662177, + "flos": 489269804544.0, + "grad_norm": 0.05291619762333268, + "language_loss": 0.83936781, + "learning_rate": 5.673378829575249e-06, + "loss": 0.85004044, + "num_input_tokens_seen": 410725440, + "router_z_loss_mlp": 0.27905273, + "step": 4956, + "time_per_iteration": 2.5828912258148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066496, + "balance_loss_mlp": 1.03788543, + "epoch": 0.9536360138514813, + "flos": 496335209472.0, + "grad_norm": 0.06235854492095354, + "language_loss": 0.81562638, + "learning_rate": 5.626676233493167e-06, + "loss": 0.82629132, + "num_input_tokens_seen": 410797552, + "router_z_loss_mlp": 0.28613281, + "step": 4957, + "time_per_iteration": 2.669546127319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066638, + "balance_loss_mlp": 1.03895712, + "epoch": 0.9538283955367449, + "flos": 801114803712.0, + "grad_norm": 0.053809767335559436, + "language_loss": 0.84141076, + "learning_rate": 5.580165570157114e-06, + "loss": 0.85207713, + "num_input_tokens_seen": 410876736, + "router_z_loss_mlp": 0.27685547, + "step": 4958, + "time_per_iteration": 3.0569889545440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064573, + "balance_loss_mlp": 1.0356288, + "epoch": 0.9540207772220085, + "flos": 556386693120.0, + "grad_norm": 0.04933735095263698, + "language_loss": 0.79818612, + "learning_rate": 5.533846857624203e-06, + "loss": 0.80883187, + "num_input_tokens_seen": 410955632, + "router_z_loss_mlp": 0.2890625, + "step": 4959, + "time_per_iteration": 2.7846500873565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066182, + "balance_loss_mlp": 1.03797746, + "epoch": 0.954213158907272, + "flos": 684193844736.0, + "grad_norm": 0.0573976228182319, + "language_loss": 0.81808335, + "learning_rate": 5.487720113876882e-06, + "loss": 0.82874513, + "num_input_tokens_seen": 411038480, + "router_z_loss_mlp": 0.28198242, + "step": 4960, + "time_per_iteration": 2.916274070739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067802, + "balance_loss_mlp": 1.03940582, + "epoch": 0.9544055405925356, + "flos": 535480049664.0, + "grad_norm": 0.12489923707729335, + "language_loss": 0.82927817, + "learning_rate": 5.441785356823214e-06, + "loss": 0.83995616, + "num_input_tokens_seen": 411109744, + "router_z_loss_mlp": 0.28393555, + "step": 4961, + "time_per_iteration": 2.715878486633301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068036, + "balance_loss_mlp": 1.03980756, + "epoch": 0.9545979222777992, + "flos": 825025955328.0, + "grad_norm": 0.06585354044225371, + "language_loss": 0.80001307, + "learning_rate": 5.3960426042965476e-06, + "loss": 0.81069338, + "num_input_tokens_seen": 411202192, + "router_z_loss_mlp": 0.2824707, + "step": 4962, + "time_per_iteration": 3.1188926696777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070168, + "balance_loss_mlp": 1.04198718, + "epoch": 0.9547903039630627, + "flos": 761326801920.0, + "grad_norm": 0.06050362430741012, + "language_loss": 0.76945174, + "learning_rate": 5.3504918740558405e-06, + "loss": 0.78015339, + "num_input_tokens_seen": 411289248, + "router_z_loss_mlp": 0.28198242, + "step": 4963, + "time_per_iteration": 3.081576347351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069409, + "balance_loss_mlp": 1.041991, + "epoch": 0.9549826856483262, + "flos": 515050652160.0, + "grad_norm": 0.06425554688456968, + "language_loss": 0.82589138, + "learning_rate": 5.3051331837855045e-06, + "loss": 0.83658552, + "num_input_tokens_seen": 411355232, + "router_z_loss_mlp": 0.2746582, + "step": 4964, + "time_per_iteration": 2.5883357524871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068863, + "balance_loss_mlp": 1.04189777, + "epoch": 0.9551750673335898, + "flos": 642818515968.0, + "grad_norm": 0.061836123206944746, + "language_loss": 0.82252514, + "learning_rate": 5.259966551095341e-06, + "loss": 0.83321381, + "num_input_tokens_seen": 411432288, + "router_z_loss_mlp": 0.27001953, + "step": 4965, + "time_per_iteration": 2.807131767272949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068448, + "balance_loss_mlp": 1.03993297, + "epoch": 0.9553674490188534, + "flos": 471967160832.0, + "grad_norm": 0.05936300763457571, + "language_loss": 0.82923341, + "learning_rate": 5.214991993520546e-06, + "loss": 0.8399179, + "num_input_tokens_seen": 411499376, + "router_z_loss_mlp": 0.28491211, + "step": 4966, + "time_per_iteration": 2.5980896949768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068636, + "balance_loss_mlp": 1.04150367, + "epoch": 0.955559830704117, + "flos": 528064026624.0, + "grad_norm": 0.08134141951074082, + "language_loss": 0.81711161, + "learning_rate": 5.170209528521763e-06, + "loss": 0.82779801, + "num_input_tokens_seen": 411564976, + "router_z_loss_mlp": 0.27197266, + "step": 4967, + "time_per_iteration": 2.6179404258728027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064015, + "balance_loss_mlp": 1.0356431, + "epoch": 0.9557522123893806, + "flos": 547907079168.0, + "grad_norm": 0.06225562192484809, + "language_loss": 0.84138858, + "learning_rate": 5.125619173485196e-06, + "loss": 0.85202879, + "num_input_tokens_seen": 411636464, + "router_z_loss_mlp": 0.28393555, + "step": 4968, + "time_per_iteration": 2.667945384979248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066382, + "balance_loss_mlp": 1.0384872, + "epoch": 0.955944594074644, + "flos": 509201607168.0, + "grad_norm": 0.05278263186963013, + "language_loss": 0.81962323, + "learning_rate": 5.08122094572222e-06, + "loss": 0.8302871, + "num_input_tokens_seen": 411710672, + "router_z_loss_mlp": 0.27905273, + "step": 4969, + "time_per_iteration": 2.6727488040924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067, + "balance_loss_mlp": 1.03881836, + "epoch": 0.9561369757599076, + "flos": 527297209344.0, + "grad_norm": 0.052104090263610174, + "language_loss": 0.79543424, + "learning_rate": 5.037014862469824e-06, + "loss": 0.8061043, + "num_input_tokens_seen": 411785616, + "router_z_loss_mlp": 0.28198242, + "step": 4970, + "time_per_iteration": 2.760735511779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063905, + "balance_loss_mlp": 1.03610492, + "epoch": 0.9563293574451712, + "flos": 497950239744.0, + "grad_norm": 0.0557276302945241, + "language_loss": 0.80518448, + "learning_rate": 4.993000940890391e-06, + "loss": 0.81582344, + "num_input_tokens_seen": 411854832, + "router_z_loss_mlp": 0.27807617, + "step": 4971, + "time_per_iteration": 2.605243444442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008683, + "balance_loss_mlp": 0.99752527, + "epoch": 0.9565217391304348, + "flos": 1408160982528.0, + "grad_norm": 0.004764129085001868, + "language_loss": 0.81773561, + "learning_rate": 4.949179198071585e-06, + "loss": 0.82782245, + "num_input_tokens_seen": 412081856, + "router_z_loss_mlp": 0.11181641, + "step": 4972, + "time_per_iteration": 4.912391901016235 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064245, + "balance_loss_mlp": 1.03580165, + "epoch": 0.9567141208156984, + "flos": 503588289024.0, + "grad_norm": 0.05341140785738964, + "language_loss": 0.78160602, + "learning_rate": 4.905549651026464e-06, + "loss": 0.79224843, + "num_input_tokens_seen": 412155600, + "router_z_loss_mlp": 0.28442383, + "step": 4973, + "time_per_iteration": 2.7303390502929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065708, + "balance_loss_mlp": 1.03771734, + "epoch": 0.9569065025009619, + "flos": 432985264128.0, + "grad_norm": 0.07164961386667579, + "language_loss": 0.79847026, + "learning_rate": 4.86211231669359e-06, + "loss": 0.80912733, + "num_input_tokens_seen": 412219584, + "router_z_loss_mlp": 0.2800293, + "step": 4974, + "time_per_iteration": 2.531446933746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067013, + "balance_loss_mlp": 1.03947544, + "epoch": 0.9570988841862255, + "flos": 589662139392.0, + "grad_norm": 0.06516120913599614, + "language_loss": 0.78293043, + "learning_rate": 4.818867211936806e-06, + "loss": 0.79360056, + "num_input_tokens_seen": 412295088, + "router_z_loss_mlp": 0.27563477, + "step": 4975, + "time_per_iteration": 2.7902753353118896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106743, + "balance_loss_mlp": 1.0391773, + "epoch": 0.957291265871489, + "flos": 766938710016.0, + "grad_norm": 0.06514295533680022, + "language_loss": 0.78948712, + "learning_rate": 4.7758143535454045e-06, + "loss": 0.80016142, + "num_input_tokens_seen": 412376992, + "router_z_loss_mlp": 0.2824707, + "step": 4976, + "time_per_iteration": 3.0192434787750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067896, + "balance_loss_mlp": 1.03983414, + "epoch": 0.9574836475567526, + "flos": 638820228096.0, + "grad_norm": 0.06668158886140403, + "language_loss": 0.844226, + "learning_rate": 4.732953758233849e-06, + "loss": 0.85490495, + "num_input_tokens_seen": 412450064, + "router_z_loss_mlp": 0.28051758, + "step": 4977, + "time_per_iteration": 2.796856641769409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010322, + "balance_loss_mlp": 0.99916387, + "epoch": 0.9576760292420161, + "flos": 1575077916672.0, + "grad_norm": 0.005308637901779806, + "language_loss": 0.78607261, + "learning_rate": 4.690285442642272e-06, + "loss": 0.79617584, + "num_input_tokens_seen": 412676896, + "router_z_loss_mlp": 0.11181641, + "step": 4978, + "time_per_iteration": 4.921823978424072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067898, + "balance_loss_mlp": 1.03912115, + "epoch": 0.9578684109272797, + "flos": 496089308160.0, + "grad_norm": 0.05441807345174081, + "language_loss": 0.87236488, + "learning_rate": 4.6478094233358695e-06, + "loss": 0.88304389, + "num_input_tokens_seen": 412746848, + "router_z_loss_mlp": 0.28759766, + "step": 4979, + "time_per_iteration": 2.70119047164917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071898, + "balance_loss_mlp": 1.04264426, + "epoch": 0.9580607926125433, + "flos": 429730472448.0, + "grad_norm": 0.06759599092224589, + "language_loss": 0.85242122, + "learning_rate": 4.605525716805337e-06, + "loss": 0.86314023, + "num_input_tokens_seen": 412810144, + "router_z_loss_mlp": 0.29223633, + "step": 4980, + "time_per_iteration": 2.492082357406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064793, + "balance_loss_mlp": 1.03682661, + "epoch": 0.9582531742978069, + "flos": 1126796659200.0, + "grad_norm": 0.056689820580710266, + "language_loss": 0.79991627, + "learning_rate": 4.563434339466599e-06, + "loss": 0.81056416, + "num_input_tokens_seen": 412904768, + "router_z_loss_mlp": 0.27978516, + "step": 4981, + "time_per_iteration": 3.57839298248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065826, + "balance_loss_mlp": 1.0384798, + "epoch": 0.9584455559830705, + "flos": 524185012224.0, + "grad_norm": 0.0491084118280761, + "language_loss": 0.79095042, + "learning_rate": 4.521535307661085e-06, + "loss": 0.80160868, + "num_input_tokens_seen": 412974592, + "router_z_loss_mlp": 0.27392578, + "step": 4982, + "time_per_iteration": 2.6562139987945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067885, + "balance_loss_mlp": 1.04049063, + "epoch": 0.9586379376683339, + "flos": 633873240576.0, + "grad_norm": 0.05909810114288763, + "language_loss": 0.80548841, + "learning_rate": 4.479828637655392e-06, + "loss": 0.81616724, + "num_input_tokens_seen": 413052848, + "router_z_loss_mlp": 0.27416992, + "step": 4983, + "time_per_iteration": 2.884284019470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064802, + "balance_loss_mlp": 1.03683555, + "epoch": 0.9588303193535975, + "flos": 415831007232.0, + "grad_norm": 0.06012496552815453, + "language_loss": 0.83002317, + "learning_rate": 4.438314345641459e-06, + "loss": 0.84067118, + "num_input_tokens_seen": 413118000, + "router_z_loss_mlp": 0.2800293, + "step": 4984, + "time_per_iteration": 2.531792640686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066004, + "balance_loss_mlp": 1.03756058, + "epoch": 0.9590227010388611, + "flos": 481440554496.0, + "grad_norm": 0.059119169486773586, + "language_loss": 0.77985901, + "learning_rate": 4.3969924477365585e-06, + "loss": 0.790519, + "num_input_tokens_seen": 413185616, + "router_z_loss_mlp": 0.28442383, + "step": 4985, + "time_per_iteration": 2.565157651901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066791, + "balance_loss_mlp": 1.03853846, + "epoch": 0.9592150827241247, + "flos": 684214193664.0, + "grad_norm": 0.05981675805708547, + "language_loss": 0.80249083, + "learning_rate": 4.355862959983359e-06, + "loss": 0.81315875, + "num_input_tokens_seen": 413265616, + "router_z_loss_mlp": 0.28271484, + "step": 4986, + "time_per_iteration": 2.948621988296509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063768, + "balance_loss_mlp": 1.03556311, + "epoch": 0.9594074644093882, + "flos": 574205870592.0, + "grad_norm": 0.053606231340170674, + "language_loss": 0.71040821, + "learning_rate": 4.314925898349642e-06, + "loss": 0.72104591, + "num_input_tokens_seen": 413341248, + "router_z_loss_mlp": 0.28222656, + "step": 4987, + "time_per_iteration": 2.713947296142578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067964, + "balance_loss_mlp": 1.03992605, + "epoch": 0.9595998460946518, + "flos": 546593204736.0, + "grad_norm": 0.06105815634499886, + "language_loss": 0.78293216, + "learning_rate": 4.2741812787286395e-06, + "loss": 0.79361176, + "num_input_tokens_seen": 413416080, + "router_z_loss_mlp": 0.28051758, + "step": 4988, + "time_per_iteration": 2.7715773582458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064589, + "balance_loss_mlp": 1.03650284, + "epoch": 0.9597922277799154, + "flos": 473798979072.0, + "grad_norm": 0.08864611353116542, + "language_loss": 0.78130996, + "learning_rate": 4.233629116938809e-06, + "loss": 0.79195589, + "num_input_tokens_seen": 413482336, + "router_z_loss_mlp": 0.28100586, + "step": 4989, + "time_per_iteration": 2.594235897064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061985, + "balance_loss_mlp": 1.03401875, + "epoch": 0.9599846094651789, + "flos": 514435193856.0, + "grad_norm": 0.05622217854933262, + "language_loss": 0.8567155, + "learning_rate": 4.193269428723889e-06, + "loss": 0.86733532, + "num_input_tokens_seen": 413553248, + "router_z_loss_mlp": 0.28027344, + "step": 4990, + "time_per_iteration": 2.6104650497436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063049, + "balance_loss_mlp": 1.03498709, + "epoch": 0.9601769911504425, + "flos": 594689112576.0, + "grad_norm": 0.08881428945062002, + "language_loss": 0.78393328, + "learning_rate": 4.1531022297529035e-06, + "loss": 0.79456377, + "num_input_tokens_seen": 413625776, + "router_z_loss_mlp": 0.28076172, + "step": 4991, + "time_per_iteration": 2.7895936965942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066773, + "balance_loss_mlp": 1.03847301, + "epoch": 0.960369372835706, + "flos": 492755940864.0, + "grad_norm": 0.04867717103170429, + "language_loss": 0.79372609, + "learning_rate": 4.1131275356201536e-06, + "loss": 0.80439377, + "num_input_tokens_seen": 413693056, + "router_z_loss_mlp": 0.28320312, + "step": 4992, + "time_per_iteration": 2.6212775707244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066933, + "balance_loss_mlp": 1.03891885, + "epoch": 0.9605617545209696, + "flos": 579016055808.0, + "grad_norm": 0.05457191695661726, + "language_loss": 0.82460308, + "learning_rate": 4.073345361845171e-06, + "loss": 0.83527243, + "num_input_tokens_seen": 413765616, + "router_z_loss_mlp": 0.28051758, + "step": 4993, + "time_per_iteration": 2.759636640548706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106556, + "balance_loss_mlp": 1.03795075, + "epoch": 0.9607541362062332, + "flos": 927312717312.0, + "grad_norm": 0.048885736648258904, + "language_loss": 0.86471546, + "learning_rate": 4.033755723872767e-06, + "loss": 0.87537098, + "num_input_tokens_seen": 413850976, + "router_z_loss_mlp": 0.27661133, + "step": 4994, + "time_per_iteration": 3.2854697704315186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070175, + "balance_loss_mlp": 1.04208946, + "epoch": 0.9609465178914968, + "flos": 572832359424.0, + "grad_norm": 0.05525276534284053, + "language_loss": 0.75332189, + "learning_rate": 3.994358637073036e-06, + "loss": 0.7640236, + "num_input_tokens_seen": 413931648, + "router_z_loss_mlp": 0.28100586, + "step": 4995, + "time_per_iteration": 2.8103957176208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067578, + "balance_loss_mlp": 1.03918266, + "epoch": 0.9611388995767602, + "flos": 530585496576.0, + "grad_norm": 0.055668397628729924, + "language_loss": 0.85367101, + "learning_rate": 3.955154116741244e-06, + "loss": 0.86434674, + "num_input_tokens_seen": 414003216, + "router_z_loss_mlp": 0.28393555, + "step": 4996, + "time_per_iteration": 2.683131217956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058352, + "balance_loss_mlp": 1.0309813, + "epoch": 0.9613312812620238, + "flos": 645959826432.0, + "grad_norm": 0.06526424456428359, + "language_loss": 0.82228351, + "learning_rate": 3.916142178097881e-06, + "loss": 0.83286703, + "num_input_tokens_seen": 414077072, + "router_z_loss_mlp": 0.27416992, + "step": 4997, + "time_per_iteration": 2.7618255615234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066452, + "balance_loss_mlp": 1.03891504, + "epoch": 0.9615236629472874, + "flos": 495897251328.0, + "grad_norm": 0.05310297597854665, + "language_loss": 0.77744323, + "learning_rate": 3.877322836288888e-06, + "loss": 0.78810775, + "num_input_tokens_seen": 414157600, + "router_z_loss_mlp": 0.27563477, + "step": 4998, + "time_per_iteration": 2.863664388656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106706, + "balance_loss_mlp": 1.03799713, + "epoch": 0.961716044632551, + "flos": 512716856832.0, + "grad_norm": 0.0744319505918789, + "language_loss": 0.75606596, + "learning_rate": 3.838696106385153e-06, + "loss": 0.76673657, + "num_input_tokens_seen": 414224880, + "router_z_loss_mlp": 0.29052734, + "step": 4999, + "time_per_iteration": 2.659785032272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072002, + "balance_loss_mlp": 1.04420233, + "epoch": 0.9619084263178146, + "flos": 500835474432.0, + "grad_norm": 0.06374446062108947, + "language_loss": 0.8034153, + "learning_rate": 3.800262003382904e-06, + "loss": 0.81413531, + "num_input_tokens_seen": 414291728, + "router_z_loss_mlp": 0.27832031, + "step": 5000, + "time_per_iteration": 2.5630085468292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063755, + "balance_loss_mlp": 1.03512073, + "epoch": 0.9621008080030781, + "flos": 595343858688.0, + "grad_norm": 0.08471732085322128, + "language_loss": 0.7496736, + "learning_rate": 3.7620205422035923e-06, + "loss": 0.76031113, + "num_input_tokens_seen": 414369568, + "router_z_loss_mlp": 0.28613281, + "step": 5001, + "time_per_iteration": 2.7929296493530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066628, + "balance_loss_mlp": 1.03811276, + "epoch": 0.9622931896883417, + "flos": 502002372096.0, + "grad_norm": 0.0587872194005596, + "language_loss": 0.82325351, + "learning_rate": 3.723971737693899e-06, + "loss": 0.83391976, + "num_input_tokens_seen": 414441424, + "router_z_loss_mlp": 0.28491211, + "step": 5002, + "time_per_iteration": 2.629521131515503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064966, + "balance_loss_mlp": 1.0366652, + "epoch": 0.9624855713736052, + "flos": 606998278656.0, + "grad_norm": 0.06325172707319822, + "language_loss": 0.80725789, + "learning_rate": 3.6861156046256728e-06, + "loss": 0.81790757, + "num_input_tokens_seen": 414512960, + "router_z_loss_mlp": 0.28320312, + "step": 5003, + "time_per_iteration": 2.839571952819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065227, + "balance_loss_mlp": 1.03747535, + "epoch": 0.9626779530588688, + "flos": 510461637120.0, + "grad_norm": 0.06727283899575592, + "language_loss": 0.84707081, + "learning_rate": 3.648452157695936e-06, + "loss": 0.85772312, + "num_input_tokens_seen": 414577392, + "router_z_loss_mlp": 0.27758789, + "step": 5004, + "time_per_iteration": 2.6041605472564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010647, + "balance_loss_mlp": 1.03730631, + "epoch": 0.9628703347441323, + "flos": 626994100224.0, + "grad_norm": 0.055831199103682276, + "language_loss": 0.8231709, + "learning_rate": 3.610981411526937e-06, + "loss": 0.83381784, + "num_input_tokens_seen": 414655152, + "router_z_loss_mlp": 0.27441406, + "step": 5005, + "time_per_iteration": 2.8136613368988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066818, + "balance_loss_mlp": 1.03806448, + "epoch": 0.9630627164293959, + "flos": 630474444288.0, + "grad_norm": 0.05495272478085719, + "language_loss": 0.774104, + "learning_rate": 3.573703380666149e-06, + "loss": 0.78477216, + "num_input_tokens_seen": 414730432, + "router_z_loss_mlp": 0.28735352, + "step": 5006, + "time_per_iteration": 2.769972324371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067484, + "balance_loss_mlp": 1.03994679, + "epoch": 0.9632550981146595, + "flos": 570267219456.0, + "grad_norm": 0.05396101102886816, + "language_loss": 0.78515279, + "learning_rate": 3.5366180795861622e-06, + "loss": 0.79582763, + "num_input_tokens_seen": 414810688, + "router_z_loss_mlp": 0.27587891, + "step": 5007, + "time_per_iteration": 2.833217144012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062742, + "balance_loss_mlp": 1.03394079, + "epoch": 0.9634474797999231, + "flos": 465857657856.0, + "grad_norm": 0.05608554449489955, + "language_loss": 0.80852854, + "learning_rate": 3.4997255226847937e-06, + "loss": 0.81915593, + "num_input_tokens_seen": 414880544, + "router_z_loss_mlp": 0.28808594, + "step": 5008, + "time_per_iteration": 2.6398768424987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064534, + "balance_loss_mlp": 1.03694844, + "epoch": 0.9636398614851867, + "flos": 526345689600.0, + "grad_norm": 0.05947227512115279, + "language_loss": 0.85232651, + "learning_rate": 3.463025724284974e-06, + "loss": 0.8629719, + "num_input_tokens_seen": 414949920, + "router_z_loss_mlp": 0.27612305, + "step": 5009, + "time_per_iteration": 2.6193339824676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064175, + "balance_loss_mlp": 1.03592229, + "epoch": 0.9638322431704501, + "flos": 564554976768.0, + "grad_norm": 0.057419474894705454, + "language_loss": 0.75136191, + "learning_rate": 3.4265186986348618e-06, + "loss": 0.76200366, + "num_input_tokens_seen": 415024288, + "router_z_loss_mlp": 0.28271484, + "step": 5010, + "time_per_iteration": 2.8186190128326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066701, + "balance_loss_mlp": 1.03835249, + "epoch": 0.9640246248557137, + "flos": 477531016704.0, + "grad_norm": 0.11381050052221461, + "language_loss": 0.84410369, + "learning_rate": 3.3902044599076754e-06, + "loss": 0.85477066, + "num_input_tokens_seen": 415092032, + "router_z_loss_mlp": 0.28320312, + "step": 5011, + "time_per_iteration": 2.607623338699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065872, + "balance_loss_mlp": 1.03831065, + "epoch": 0.9642170065409773, + "flos": 539063700480.0, + "grad_norm": 0.057233359656352366, + "language_loss": 0.88353223, + "learning_rate": 3.354083022201859e-06, + "loss": 0.89419091, + "num_input_tokens_seen": 415158544, + "router_z_loss_mlp": 0.27563477, + "step": 5012, + "time_per_iteration": 2.6468701362609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063888, + "balance_loss_mlp": 1.03625488, + "epoch": 0.9644093882262409, + "flos": 523499742720.0, + "grad_norm": 0.056318288112839024, + "language_loss": 0.83765054, + "learning_rate": 3.3181543995410843e-06, + "loss": 0.84828949, + "num_input_tokens_seen": 415225088, + "router_z_loss_mlp": 0.27636719, + "step": 5013, + "time_per_iteration": 2.6283926963806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067391, + "balance_loss_mlp": 1.03994918, + "epoch": 0.9646017699115044, + "flos": 574018195968.0, + "grad_norm": 0.06680838012319379, + "language_loss": 0.78578639, + "learning_rate": 3.2824186058740268e-06, + "loss": 0.79646027, + "num_input_tokens_seen": 415300224, + "router_z_loss_mlp": 0.2746582, + "step": 5014, + "time_per_iteration": 2.757387638092041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065697, + "balance_loss_mlp": 1.03832626, + "epoch": 0.964794151596768, + "flos": 636511163904.0, + "grad_norm": 0.06470310275941542, + "language_loss": 0.8431797, + "learning_rate": 3.246875655074588e-06, + "loss": 0.85383666, + "num_input_tokens_seen": 415368784, + "router_z_loss_mlp": 0.27416992, + "step": 5015, + "time_per_iteration": 2.7526612281799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066344, + "balance_loss_mlp": 1.03885484, + "epoch": 0.9649865332820315, + "flos": 617155531776.0, + "grad_norm": 0.07525409199590156, + "language_loss": 0.86100334, + "learning_rate": 3.211525560941675e-06, + "loss": 0.87166679, + "num_input_tokens_seen": 415440752, + "router_z_loss_mlp": 0.27539062, + "step": 5016, + "time_per_iteration": 2.711585283279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069407, + "balance_loss_mlp": 1.04113102, + "epoch": 0.9651789149672951, + "flos": 515898865152.0, + "grad_norm": 0.05644315482934111, + "language_loss": 0.8094486, + "learning_rate": 3.1763683371994754e-06, + "loss": 0.82014269, + "num_input_tokens_seen": 415516128, + "router_z_loss_mlp": 0.28295898, + "step": 5017, + "time_per_iteration": 2.783452033996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106516, + "balance_loss_mlp": 1.03750336, + "epoch": 0.9653712966525587, + "flos": 492696304128.0, + "grad_norm": 0.05618640768914361, + "language_loss": 0.79814726, + "learning_rate": 3.1414039974972385e-06, + "loss": 0.80879885, + "num_input_tokens_seen": 415583744, + "router_z_loss_mlp": 0.27661133, + "step": 5018, + "time_per_iteration": 2.5714142322540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066299, + "balance_loss_mlp": 1.03776038, + "epoch": 0.9655636783378222, + "flos": 536287564800.0, + "grad_norm": 0.3262600560454796, + "language_loss": 0.821886, + "learning_rate": 3.106632555409328e-06, + "loss": 0.83254898, + "num_input_tokens_seen": 415659856, + "router_z_loss_mlp": 0.28540039, + "step": 5019, + "time_per_iteration": 2.7656137943267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065674, + "balance_loss_mlp": 1.03787422, + "epoch": 0.9657560600230858, + "flos": 458790842880.0, + "grad_norm": 0.07131607326554101, + "language_loss": 0.81939691, + "learning_rate": 3.072054024435167e-06, + "loss": 0.83005363, + "num_input_tokens_seen": 415731792, + "router_z_loss_mlp": 0.27832031, + "step": 5020, + "time_per_iteration": 2.7424540519714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064491, + "balance_loss_mlp": 1.03724039, + "epoch": 0.9659484417083494, + "flos": 685877276160.0, + "grad_norm": 0.06616301345736442, + "language_loss": 0.8344838, + "learning_rate": 3.0376684179994064e-06, + "loss": 0.84512877, + "num_input_tokens_seen": 415809536, + "router_z_loss_mlp": 0.27246094, + "step": 5021, + "time_per_iteration": 2.813933849334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009634, + "balance_loss_mlp": 0.99847621, + "epoch": 0.966140823393613, + "flos": 1501503879168.0, + "grad_norm": 0.004878091827342763, + "language_loss": 0.80694246, + "learning_rate": 3.0034757494516453e-06, + "loss": 0.81703877, + "num_input_tokens_seen": 416027600, + "router_z_loss_mlp": 0.11181641, + "step": 5022, + "time_per_iteration": 4.681534767150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066627, + "balance_loss_mlp": 1.03932786, + "epoch": 0.9663332050788765, + "flos": 464660236800.0, + "grad_norm": 0.0669391834816262, + "language_loss": 0.81136465, + "learning_rate": 2.9694760320667093e-06, + "loss": 0.8220309, + "num_input_tokens_seen": 416096128, + "router_z_loss_mlp": 0.27319336, + "step": 5023, + "time_per_iteration": 2.6037216186523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071183, + "balance_loss_mlp": 1.04376459, + "epoch": 0.96652558676414, + "flos": 500575016448.0, + "grad_norm": 0.05770087966529414, + "language_loss": 0.85576534, + "learning_rate": 2.9356692790444283e-06, + "loss": 0.86647713, + "num_input_tokens_seen": 416164256, + "router_z_loss_mlp": 0.2746582, + "step": 5024, + "time_per_iteration": 2.648139715194702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062992, + "balance_loss_mlp": 1.03476286, + "epoch": 0.9667179684494036, + "flos": 424614749184.0, + "grad_norm": 0.07347131630745982, + "language_loss": 0.82613868, + "learning_rate": 2.9020555035097484e-06, + "loss": 0.83676857, + "num_input_tokens_seen": 416227296, + "router_z_loss_mlp": 0.28222656, + "step": 5025, + "time_per_iteration": 2.4518802165985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067612, + "balance_loss_mlp": 1.03919196, + "epoch": 0.9669103501346672, + "flos": 516744258048.0, + "grad_norm": 0.056054793456989736, + "language_loss": 0.85796893, + "learning_rate": 2.8686347185127305e-06, + "loss": 0.86864507, + "num_input_tokens_seen": 416297184, + "router_z_loss_mlp": 0.28417969, + "step": 5026, + "time_per_iteration": 2.6519358158111572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064191, + "balance_loss_mlp": 1.03581882, + "epoch": 0.9671027318199308, + "flos": 456008914944.0, + "grad_norm": 0.07661244422718277, + "language_loss": 0.75568247, + "learning_rate": 2.8354069370284396e-06, + "loss": 0.7663244, + "num_input_tokens_seen": 416363056, + "router_z_loss_mlp": 0.28393555, + "step": 5027, + "time_per_iteration": 2.6021740436553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063493, + "balance_loss_mlp": 1.03574109, + "epoch": 0.9672951135051943, + "flos": 524809234944.0, + "grad_norm": 0.05603975558530982, + "language_loss": 0.79859215, + "learning_rate": 2.802372171957057e-06, + "loss": 0.80922711, + "num_input_tokens_seen": 416430688, + "router_z_loss_mlp": 0.27783203, + "step": 5028, + "time_per_iteration": 2.653294086456299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062661, + "balance_loss_mlp": 1.03440905, + "epoch": 0.9674874951904578, + "flos": 573708275712.0, + "grad_norm": 0.05632883535344154, + "language_loss": 0.79708344, + "learning_rate": 2.7695304361237682e-06, + "loss": 0.80771005, + "num_input_tokens_seen": 416505248, + "router_z_loss_mlp": 0.2824707, + "step": 5029, + "time_per_iteration": 2.8485989570617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064168, + "balance_loss_mlp": 1.03603494, + "epoch": 0.9676798768757214, + "flos": 628875380736.0, + "grad_norm": 0.05249570789540728, + "language_loss": 0.79920137, + "learning_rate": 2.7368817422789848e-06, + "loss": 0.80984306, + "num_input_tokens_seen": 416592640, + "router_z_loss_mlp": 0.28125, + "step": 5030, + "time_per_iteration": 2.9783546924591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008577, + "balance_loss_mlp": 0.99732375, + "epoch": 0.967872258560985, + "flos": 1463074831872.0, + "grad_norm": 0.0047902064985316075, + "language_loss": 0.75563359, + "learning_rate": 2.7044261030979566e-06, + "loss": 0.76571935, + "num_input_tokens_seen": 416808560, + "router_z_loss_mlp": 0.11230469, + "step": 5031, + "time_per_iteration": 4.6512672901153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068689, + "balance_loss_mlp": 1.04088974, + "epoch": 0.9680646402462486, + "flos": 565238836224.0, + "grad_norm": 0.06439989216377716, + "language_loss": 0.78775156, + "learning_rate": 2.672163531181049e-06, + "loss": 0.79843849, + "num_input_tokens_seen": 416878208, + "router_z_loss_mlp": 0.27832031, + "step": 5032, + "time_per_iteration": 2.711900234222412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008099, + "balance_loss_mlp": 0.9968459, + "epoch": 0.9682570219315121, + "flos": 1433669635584.0, + "grad_norm": 0.004495052904339459, + "language_loss": 0.78074801, + "learning_rate": 2.6400940390537976e-06, + "loss": 0.79082906, + "num_input_tokens_seen": 417105968, + "router_z_loss_mlp": 0.11230469, + "step": 5033, + "time_per_iteration": 4.819545030593872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064787, + "balance_loss_mlp": 1.03727293, + "epoch": 0.9684494036167757, + "flos": 584338392576.0, + "grad_norm": 0.0648336777486898, + "language_loss": 0.81837499, + "learning_rate": 2.608217639166688e-06, + "loss": 0.82902288, + "num_input_tokens_seen": 417175168, + "router_z_loss_mlp": 0.27539062, + "step": 5034, + "time_per_iteration": 2.6948647499084473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066131, + "balance_loss_mlp": 1.03728223, + "epoch": 0.9686417853020393, + "flos": 558784507392.0, + "grad_norm": 0.09835762498909857, + "language_loss": 0.84009242, + "learning_rate": 2.5765343438950982e-06, + "loss": 0.85075378, + "num_input_tokens_seen": 417247760, + "router_z_loss_mlp": 0.28833008, + "step": 5035, + "time_per_iteration": 2.681332588195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064729, + "balance_loss_mlp": 1.03638136, + "epoch": 0.9688341669873028, + "flos": 784594944000.0, + "grad_norm": 0.07041823637158158, + "language_loss": 0.83102357, + "learning_rate": 2.545044165539745e-06, + "loss": 0.84167081, + "num_input_tokens_seen": 417324080, + "router_z_loss_mlp": 0.28369141, + "step": 5036, + "time_per_iteration": 2.968900203704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067686, + "balance_loss_mlp": 1.03979087, + "epoch": 0.9690265486725663, + "flos": 395682416640.0, + "grad_norm": 0.058095180811742086, + "language_loss": 0.79474586, + "learning_rate": 2.513747116326126e-06, + "loss": 0.80542266, + "num_input_tokens_seen": 417386416, + "router_z_loss_mlp": 0.27929688, + "step": 5037, + "time_per_iteration": 2.4735050201416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070976, + "balance_loss_mlp": 1.0435822, + "epoch": 0.9692189303578299, + "flos": 476113835520.0, + "grad_norm": 0.06775264722732154, + "language_loss": 0.77614433, + "learning_rate": 2.4826432084048002e-06, + "loss": 0.78685409, + "num_input_tokens_seen": 417459648, + "router_z_loss_mlp": 0.27416992, + "step": 5038, + "time_per_iteration": 2.728487253189087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063713, + "balance_loss_mlp": 1.0362711, + "epoch": 0.9694113120430935, + "flos": 597297922560.0, + "grad_norm": 0.06545146976604356, + "language_loss": 0.78883851, + "learning_rate": 2.451732453851385e-06, + "loss": 0.79947555, + "num_input_tokens_seen": 417530512, + "router_z_loss_mlp": 0.27490234, + "step": 5039, + "time_per_iteration": 2.747343063354492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061735, + "balance_loss_mlp": 1.03448391, + "epoch": 0.9696036937283571, + "flos": 500628860928.0, + "grad_norm": 0.05842492714952315, + "language_loss": 0.82463217, + "learning_rate": 2.4210148646665598e-06, + "loss": 0.83524954, + "num_input_tokens_seen": 417597600, + "router_z_loss_mlp": 0.27294922, + "step": 5040, + "time_per_iteration": 2.5741090774536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065285, + "balance_loss_mlp": 1.036937, + "epoch": 0.9697960754136207, + "flos": 432049711104.0, + "grad_norm": 0.06899414406463689, + "language_loss": 0.87255681, + "learning_rate": 2.3904904527758952e-06, + "loss": 0.88320959, + "num_input_tokens_seen": 417659616, + "router_z_loss_mlp": 0.28369141, + "step": 5041, + "time_per_iteration": 2.4628825187683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069848, + "balance_loss_mlp": 1.04042697, + "epoch": 0.9699884570988841, + "flos": 568257901056.0, + "grad_norm": 0.050873067640045296, + "language_loss": 0.85379595, + "learning_rate": 2.3601592300300235e-06, + "loss": 0.86449444, + "num_input_tokens_seen": 417730896, + "router_z_loss_mlp": 0.29418945, + "step": 5042, + "time_per_iteration": 2.7318944931030273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069353, + "balance_loss_mlp": 1.04193473, + "epoch": 0.9701808387841477, + "flos": 515961474048.0, + "grad_norm": 0.06956938188967421, + "language_loss": 0.81409943, + "learning_rate": 2.33002120820458e-06, + "loss": 0.82479298, + "num_input_tokens_seen": 417803296, + "router_z_loss_mlp": 0.27441406, + "step": 5043, + "time_per_iteration": 2.65865421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065727, + "balance_loss_mlp": 1.0380702, + "epoch": 0.9703732204694113, + "flos": 491273330688.0, + "grad_norm": 0.07297009392614884, + "language_loss": 0.75900912, + "learning_rate": 2.300076399000206e-06, + "loss": 0.76966637, + "num_input_tokens_seen": 417870208, + "router_z_loss_mlp": 0.27661133, + "step": 5044, + "time_per_iteration": 2.5922508239746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064819, + "balance_loss_mlp": 1.03687608, + "epoch": 0.9705656021546749, + "flos": 625831584768.0, + "grad_norm": 0.058526336154578064, + "language_loss": 0.79872143, + "learning_rate": 2.2703248140424348e-06, + "loss": 0.80936968, + "num_input_tokens_seen": 417944464, + "router_z_loss_mlp": 0.27978516, + "step": 5045, + "time_per_iteration": 2.785860061645508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066292, + "balance_loss_mlp": 1.03796744, + "epoch": 0.9707579838399384, + "flos": 471198933504.0, + "grad_norm": 0.05613638274130696, + "language_loss": 0.82710165, + "learning_rate": 2.2407664648819715e-06, + "loss": 0.83776456, + "num_input_tokens_seen": 418010480, + "router_z_loss_mlp": 0.28320312, + "step": 5046, + "time_per_iteration": 2.6305992603302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063938, + "balance_loss_mlp": 1.0362339, + "epoch": 0.970950365525202, + "flos": 491845118976.0, + "grad_norm": 0.08794096275346511, + "language_loss": 0.80495691, + "learning_rate": 2.2114013629942475e-06, + "loss": 0.81559622, + "num_input_tokens_seen": 418083952, + "router_z_loss_mlp": 0.27709961, + "step": 5047, + "time_per_iteration": 2.671323299407959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064085, + "balance_loss_mlp": 1.03623736, + "epoch": 0.9711427472104656, + "flos": 557060378112.0, + "grad_norm": 0.060777831648666195, + "language_loss": 0.80575037, + "learning_rate": 2.1822295197799213e-06, + "loss": 0.81639123, + "num_input_tokens_seen": 418156672, + "router_z_loss_mlp": 0.27880859, + "step": 5048, + "time_per_iteration": 2.6912620067596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067773, + "balance_loss_mlp": 1.04013991, + "epoch": 0.9713351288957291, + "flos": 625527456768.0, + "grad_norm": 0.05303633777946519, + "language_loss": 0.8379271, + "learning_rate": 2.153250946564489e-06, + "loss": 0.84860486, + "num_input_tokens_seen": 418242160, + "router_z_loss_mlp": 0.27661133, + "step": 5049, + "time_per_iteration": 2.930760622024536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067092, + "balance_loss_mlp": 1.04017484, + "epoch": 0.9715275105809927, + "flos": 498821773824.0, + "grad_norm": 0.05437490593151225, + "language_loss": 0.80818999, + "learning_rate": 2.1244656545983397e-06, + "loss": 0.81886101, + "num_input_tokens_seen": 418316960, + "router_z_loss_mlp": 0.26977539, + "step": 5050, + "time_per_iteration": 2.7364494800567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064775, + "balance_loss_mlp": 1.03714252, + "epoch": 0.9717198922662562, + "flos": 477274940928.0, + "grad_norm": 0.06615210996370888, + "language_loss": 0.77408063, + "learning_rate": 2.0958736550570345e-06, + "loss": 0.78472841, + "num_input_tokens_seen": 418383888, + "router_z_loss_mlp": 0.27685547, + "step": 5051, + "time_per_iteration": 2.6002469062805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064465, + "balance_loss_mlp": 1.03578305, + "epoch": 0.9719122739515198, + "flos": 553171189248.0, + "grad_norm": 0.05273950962157412, + "language_loss": 0.78674865, + "learning_rate": 2.067474959040916e-06, + "loss": 0.79739332, + "num_input_tokens_seen": 418453776, + "router_z_loss_mlp": 0.28710938, + "step": 5052, + "time_per_iteration": 2.708221197128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062301, + "balance_loss_mlp": 1.03540766, + "epoch": 0.9721046556367834, + "flos": 565583662080.0, + "grad_norm": 0.2131840589845717, + "language_loss": 0.79749233, + "learning_rate": 2.0392695775753312e-06, + "loss": 0.80811536, + "num_input_tokens_seen": 418521984, + "router_z_loss_mlp": 0.26940918, + "step": 5053, + "time_per_iteration": 2.6769378185272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067739, + "balance_loss_mlp": 1.03965354, + "epoch": 0.972297037322047, + "flos": 560044537344.0, + "grad_norm": 0.06115965069395946, + "language_loss": 0.7824676, + "learning_rate": 2.0112575216105766e-06, + "loss": 0.79314494, + "num_input_tokens_seen": 418598768, + "router_z_loss_mlp": 0.28051758, + "step": 5054, + "time_per_iteration": 2.780709981918335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064914, + "balance_loss_mlp": 1.03682876, + "epoch": 0.9724894190073105, + "flos": 512175591936.0, + "grad_norm": 0.06000493777868893, + "language_loss": 0.79179239, + "learning_rate": 1.9834388020218974e-06, + "loss": 0.8024416, + "num_input_tokens_seen": 418670064, + "router_z_loss_mlp": 0.28100586, + "step": 5055, + "time_per_iteration": 2.679389238357544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065232, + "balance_loss_mlp": 1.03743291, + "epoch": 0.972681800692574, + "flos": 613532593152.0, + "grad_norm": 0.09595549516744886, + "language_loss": 0.80428839, + "learning_rate": 1.9558134296094875e-06, + "loss": 0.81494069, + "num_input_tokens_seen": 418745216, + "router_z_loss_mlp": 0.27832031, + "step": 5056, + "time_per_iteration": 2.790769338607788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067435, + "balance_loss_mlp": 1.03927755, + "epoch": 0.9728741823778376, + "flos": 833562385920.0, + "grad_norm": 0.05624518415206626, + "language_loss": 0.83850849, + "learning_rate": 1.92838141509849e-06, + "loss": 0.84918284, + "num_input_tokens_seen": 418824224, + "router_z_loss_mlp": 0.28149414, + "step": 5057, + "time_per_iteration": 3.0661802291870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062382, + "balance_loss_mlp": 1.03324711, + "epoch": 0.9730665640631012, + "flos": 571167866880.0, + "grad_norm": 0.061543355235248995, + "language_loss": 0.84603822, + "learning_rate": 1.9011427691389415e-06, + "loss": 0.85666203, + "num_input_tokens_seen": 418899712, + "router_z_loss_mlp": 0.29077148, + "step": 5058, + "time_per_iteration": 2.7378501892089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063259, + "balance_loss_mlp": 1.03510189, + "epoch": 0.9732589457483648, + "flos": 506271292416.0, + "grad_norm": 0.05909256512343959, + "language_loss": 0.7731396, + "learning_rate": 1.8740975023057715e-06, + "loss": 0.78377223, + "num_input_tokens_seen": 418964912, + "router_z_loss_mlp": 0.28173828, + "step": 5059, + "time_per_iteration": 2.597114324569702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067045, + "balance_loss_mlp": 1.039222, + "epoch": 0.9734513274336283, + "flos": 926602716672.0, + "grad_norm": 0.05633726130728, + "language_loss": 0.80202436, + "learning_rate": 1.84724562509897e-06, + "loss": 0.81269479, + "num_input_tokens_seen": 419040032, + "router_z_loss_mlp": 0.27856445, + "step": 5060, + "time_per_iteration": 3.1069252490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066984, + "balance_loss_mlp": 1.0390172, + "epoch": 0.9736437091188919, + "flos": 491682175488.0, + "grad_norm": 0.04955029488996079, + "language_loss": 0.78345102, + "learning_rate": 1.8205871479433089e-06, + "loss": 0.79412079, + "num_input_tokens_seen": 419112672, + "router_z_loss_mlp": 0.2800293, + "step": 5061, + "time_per_iteration": 2.7237606048583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068144, + "balance_loss_mlp": 1.04027295, + "epoch": 0.9738360908041555, + "flos": 613039380480.0, + "grad_norm": 0.0670398565669916, + "language_loss": 0.83701253, + "learning_rate": 1.7941220811885096e-06, + "loss": 0.84769392, + "num_input_tokens_seen": 419183408, + "router_z_loss_mlp": 0.27856445, + "step": 5062, + "time_per_iteration": 2.705859422683716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008012, + "balance_loss_mlp": 0.99685371, + "epoch": 0.974028472489419, + "flos": 1548771922944.0, + "grad_norm": 0.004487982197495449, + "language_loss": 0.75992095, + "learning_rate": 1.7678504351092972e-06, + "loss": 0.77000105, + "num_input_tokens_seen": 419415472, + "router_z_loss_mlp": 0.11181641, + "step": 5063, + "time_per_iteration": 4.964916229248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008017, + "balance_loss_mlp": 0.99685866, + "epoch": 0.9742208541746825, + "flos": 1410403055616.0, + "grad_norm": 0.004488989410680284, + "language_loss": 0.79677713, + "learning_rate": 1.7417722199051245e-06, + "loss": 0.80685735, + "num_input_tokens_seen": 419651840, + "router_z_loss_mlp": 0.11181641, + "step": 5064, + "time_per_iteration": 4.9454896450042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065951, + "balance_loss_mlp": 1.03760338, + "epoch": 0.9744132358599461, + "flos": 674582238720.0, + "grad_norm": 0.047234166285075166, + "language_loss": 0.76724768, + "learning_rate": 1.7158874457005592e-06, + "loss": 0.77790713, + "num_input_tokens_seen": 419729424, + "router_z_loss_mlp": 0.28344727, + "step": 5065, + "time_per_iteration": 2.85241961479187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063551, + "balance_loss_mlp": 1.03520298, + "epoch": 0.9746056175452097, + "flos": 598111229952.0, + "grad_norm": 0.05284767537793641, + "language_loss": 0.77460915, + "learning_rate": 1.690196122544896e-06, + "loss": 0.7852447, + "num_input_tokens_seen": 419803616, + "router_z_loss_mlp": 0.28344727, + "step": 5066, + "time_per_iteration": 2.778743028640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106678, + "balance_loss_mlp": 1.03852713, + "epoch": 0.9747979992304733, + "flos": 731837237760.0, + "grad_norm": 0.056939987463815324, + "language_loss": 0.82497215, + "learning_rate": 1.6646982604123784e-06, + "loss": 0.83563995, + "num_input_tokens_seen": 419883536, + "router_z_loss_mlp": 0.2824707, + "step": 5067, + "time_per_iteration": 2.985030174255371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069279, + "balance_loss_mlp": 1.04045403, + "epoch": 0.9749903809157369, + "flos": 616219978752.0, + "grad_norm": 0.06983302671327438, + "language_loss": 0.76487023, + "learning_rate": 1.6393938692022548e-06, + "loss": 0.775563, + "num_input_tokens_seen": 419956816, + "router_z_loss_mlp": 0.28808594, + "step": 5068, + "time_per_iteration": 2.6938107013702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106283, + "balance_loss_mlp": 1.03483963, + "epoch": 0.9751827626010003, + "flos": 468160929792.0, + "grad_norm": 0.05087339231856929, + "language_loss": 0.83533263, + "learning_rate": 1.6142829587384443e-06, + "loss": 0.84596097, + "num_input_tokens_seen": 420022096, + "router_z_loss_mlp": 0.2800293, + "step": 5069, + "time_per_iteration": 2.602464437484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106988, + "balance_loss_mlp": 1.04174674, + "epoch": 0.9753751442862639, + "flos": 598918745088.0, + "grad_norm": 0.06914796858468633, + "language_loss": 0.85062265, + "learning_rate": 1.5893655387698713e-06, + "loss": 0.86132151, + "num_input_tokens_seen": 420097008, + "router_z_loss_mlp": 0.28149414, + "step": 5070, + "time_per_iteration": 2.7954771518707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066097, + "balance_loss_mlp": 1.03777337, + "epoch": 0.9755675259715275, + "flos": 650486232576.0, + "grad_norm": 0.051994743985587635, + "language_loss": 0.82142699, + "learning_rate": 1.5646416189704637e-06, + "loss": 0.83208799, + "num_input_tokens_seen": 420174960, + "router_z_loss_mlp": 0.28344727, + "step": 5071, + "time_per_iteration": 2.8875765800476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063694, + "balance_loss_mlp": 1.03646636, + "epoch": 0.9757599076567911, + "flos": 563392461312.0, + "grad_norm": 0.10792269115759393, + "language_loss": 0.79117143, + "learning_rate": 1.5401112089387659e-06, + "loss": 0.80180836, + "num_input_tokens_seen": 420245248, + "router_z_loss_mlp": 0.27294922, + "step": 5072, + "time_per_iteration": 2.6715874671936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066623, + "balance_loss_mlp": 1.03808391, + "epoch": 0.9759522893420547, + "flos": 504385629696.0, + "grad_norm": 0.061963410624624696, + "language_loss": 0.80203068, + "learning_rate": 1.5157743181983819e-06, + "loss": 0.81269693, + "num_input_tokens_seen": 420310688, + "router_z_loss_mlp": 0.28540039, + "step": 5073, + "time_per_iteration": 2.589348316192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062771, + "balance_loss_mlp": 1.03513861, + "epoch": 0.9761446710273182, + "flos": 583452301824.0, + "grad_norm": 0.07055046629147509, + "language_loss": 0.81962037, + "learning_rate": 1.4916309561976982e-06, + "loss": 0.83024812, + "num_input_tokens_seen": 420379008, + "router_z_loss_mlp": 0.27636719, + "step": 5074, + "time_per_iteration": 2.688120126724243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060628, + "balance_loss_mlp": 1.03256583, + "epoch": 0.9763370527125818, + "flos": 481967262720.0, + "grad_norm": 0.0708195540805075, + "language_loss": 0.82147515, + "learning_rate": 1.4676811323099947e-06, + "loss": 0.83208144, + "num_input_tokens_seen": 420445504, + "router_z_loss_mlp": 0.28051758, + "step": 5075, + "time_per_iteration": 2.5922911167144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065842, + "balance_loss_mlp": 1.03761315, + "epoch": 0.9765294343978453, + "flos": 618706543104.0, + "grad_norm": 0.050628843291049455, + "language_loss": 0.78722847, + "learning_rate": 1.4439248558335561e-06, + "loss": 0.79788685, + "num_input_tokens_seen": 420520528, + "router_z_loss_mlp": 0.28222656, + "step": 5076, + "time_per_iteration": 2.792860746383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106741, + "balance_loss_mlp": 1.03901386, + "epoch": 0.9767218160831089, + "flos": 526320958464.0, + "grad_norm": 0.055930544938087315, + "language_loss": 0.84628701, + "learning_rate": 1.4203621359911712e-06, + "loss": 0.85696107, + "num_input_tokens_seen": 420586224, + "router_z_loss_mlp": 0.28417969, + "step": 5077, + "time_per_iteration": 2.5855977535247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061522, + "balance_loss_mlp": 1.03350818, + "epoch": 0.9769141977683724, + "flos": 524932890624.0, + "grad_norm": 0.06238733542479722, + "language_loss": 0.83731985, + "learning_rate": 1.3969929819308557e-06, + "loss": 0.84793508, + "num_input_tokens_seen": 420655456, + "router_z_loss_mlp": 0.28027344, + "step": 5078, + "time_per_iteration": 2.65868878364563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068059, + "balance_loss_mlp": 1.03925812, + "epoch": 0.977106579453636, + "flos": 457359105024.0, + "grad_norm": 0.06822539995554136, + "language_loss": 0.80723315, + "learning_rate": 1.3738174027252416e-06, + "loss": 0.81791377, + "num_input_tokens_seen": 420733216, + "router_z_loss_mlp": 0.28759766, + "step": 5079, + "time_per_iteration": 2.923542022705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062951, + "balance_loss_mlp": 1.03467441, + "epoch": 0.9772989611388996, + "flos": 531830969856.0, + "grad_norm": 0.06212480282272471, + "language_loss": 0.8100605, + "learning_rate": 1.3508354073719642e-06, + "loss": 0.82069004, + "num_input_tokens_seen": 420803376, + "router_z_loss_mlp": 0.28271484, + "step": 5080, + "time_per_iteration": 2.5942180156707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064744, + "balance_loss_mlp": 1.03665841, + "epoch": 0.9774913428241632, + "flos": 754999100928.0, + "grad_norm": 0.06018369502974455, + "language_loss": 0.85829055, + "learning_rate": 1.3280470047933313e-06, + "loss": 0.86893803, + "num_input_tokens_seen": 420886256, + "router_z_loss_mlp": 0.28076172, + "step": 5081, + "time_per_iteration": 3.0458836555480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007999, + "balance_loss_mlp": 0.99684066, + "epoch": 0.9776837245094268, + "flos": 1553486003712.0, + "grad_norm": 0.004489173292933679, + "language_loss": 0.78895497, + "learning_rate": 1.3054522038366544e-06, + "loss": 0.79903495, + "num_input_tokens_seen": 421123728, + "router_z_loss_mlp": 0.11181641, + "step": 5082, + "time_per_iteration": 4.958382844924927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067914, + "balance_loss_mlp": 1.0406152, + "epoch": 0.9778761061946902, + "flos": 592260774912.0, + "grad_norm": 0.07104268119711042, + "language_loss": 0.83867311, + "learning_rate": 1.2830510132739725e-06, + "loss": 0.84935224, + "num_input_tokens_seen": 421192576, + "router_z_loss_mlp": 0.2734375, + "step": 5083, + "time_per_iteration": 2.6874279975891113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064965, + "balance_loss_mlp": 1.03678381, + "epoch": 0.9780684878799538, + "flos": 414732510720.0, + "grad_norm": 0.05557596006136318, + "language_loss": 0.81895953, + "learning_rate": 1.2608434418022175e-06, + "loss": 0.82960916, + "num_input_tokens_seen": 421256272, + "router_z_loss_mlp": 0.28173828, + "step": 5084, + "time_per_iteration": 2.479989767074585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064918, + "balance_loss_mlp": 1.03676116, + "epoch": 0.9782608695652174, + "flos": 568129863168.0, + "grad_norm": 0.06367156527837714, + "language_loss": 0.84807253, + "learning_rate": 1.2388294980431036e-06, + "loss": 0.85872167, + "num_input_tokens_seen": 421332880, + "router_z_loss_mlp": 0.28173828, + "step": 5085, + "time_per_iteration": 2.7060656547546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070809, + "balance_loss_mlp": 1.04272389, + "epoch": 0.978453251250481, + "flos": 690151988736.0, + "grad_norm": 0.05907609333285634, + "language_loss": 0.82935727, + "learning_rate": 1.217009190543239e-06, + "loss": 0.84006536, + "num_input_tokens_seen": 421406160, + "router_z_loss_mlp": 0.28076172, + "step": 5086, + "time_per_iteration": 2.8707125186920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063057, + "balance_loss_mlp": 1.03611541, + "epoch": 0.9786456329357445, + "flos": 502239508992.0, + "grad_norm": 0.05371323925728747, + "language_loss": 0.77593768, + "learning_rate": 1.1953825277740694e-06, + "loss": 0.78656816, + "num_input_tokens_seen": 421476208, + "router_z_loss_mlp": 0.26977539, + "step": 5087, + "time_per_iteration": 2.6420705318450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063721, + "balance_loss_mlp": 1.03577852, + "epoch": 0.9788380146210081, + "flos": 862829369856.0, + "grad_norm": 0.06417447661678208, + "language_loss": 0.8063373, + "learning_rate": 1.1739495181317117e-06, + "loss": 0.81697452, + "num_input_tokens_seen": 421549232, + "router_z_loss_mlp": 0.27978516, + "step": 5088, + "time_per_iteration": 3.069293737411499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106397, + "balance_loss_mlp": 1.03669453, + "epoch": 0.9790303963062716, + "flos": 512460781056.0, + "grad_norm": 0.06576556043594046, + "language_loss": 0.8408463, + "learning_rate": 1.1527101699371767e-06, + "loss": 0.85148597, + "num_input_tokens_seen": 421617056, + "router_z_loss_mlp": 0.27319336, + "step": 5089, + "time_per_iteration": 2.5954997539520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064223, + "balance_loss_mlp": 1.03673351, + "epoch": 0.9792227779915352, + "flos": 494183296512.0, + "grad_norm": 0.06743703328576649, + "language_loss": 0.86218363, + "learning_rate": 1.1316644914364237e-06, + "loss": 0.87282586, + "num_input_tokens_seen": 421683424, + "router_z_loss_mlp": 0.27539062, + "step": 5090, + "time_per_iteration": 2.5840415954589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064681, + "balance_loss_mlp": 1.03685737, + "epoch": 0.9794151596767988, + "flos": 608037138432.0, + "grad_norm": 0.06524237500562691, + "language_loss": 0.81397247, + "learning_rate": 1.1108124908000838e-06, + "loss": 0.82461935, + "num_input_tokens_seen": 421761200, + "router_z_loss_mlp": 0.27856445, + "step": 5091, + "time_per_iteration": 2.840353012084961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064016, + "balance_loss_mlp": 1.0356915, + "epoch": 0.9796075413620623, + "flos": 477979149312.0, + "grad_norm": 0.062315389345704714, + "language_loss": 0.86601949, + "learning_rate": 1.09015417612357e-06, + "loss": 0.87665963, + "num_input_tokens_seen": 421829600, + "router_z_loss_mlp": 0.28320312, + "step": 5092, + "time_per_iteration": 2.6200978755950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066682, + "balance_loss_mlp": 1.03800082, + "epoch": 0.9797999230473259, + "flos": 591936297984.0, + "grad_norm": 0.06641731956014876, + "language_loss": 0.84266961, + "learning_rate": 1.0696895554271335e-06, + "loss": 0.85333645, + "num_input_tokens_seen": 421904928, + "router_z_loss_mlp": 0.28686523, + "step": 5093, + "time_per_iteration": 2.746304750442505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064074, + "balance_loss_mlp": 1.03651321, + "epoch": 0.9799923047325895, + "flos": 556086947328.0, + "grad_norm": 0.05417277505387902, + "language_loss": 0.81640154, + "learning_rate": 1.049418636655919e-06, + "loss": 0.82704222, + "num_input_tokens_seen": 421989616, + "router_z_loss_mlp": 0.27612305, + "step": 5094, + "time_per_iteration": 2.923612356185913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066797, + "balance_loss_mlp": 1.03766191, + "epoch": 0.9801846864178531, + "flos": 579164442624.0, + "grad_norm": 0.051233862308683015, + "language_loss": 0.84678006, + "learning_rate": 1.0293414276797974e-06, + "loss": 0.85744798, + "num_input_tokens_seen": 422067088, + "router_z_loss_mlp": 0.29101562, + "step": 5095, + "time_per_iteration": 2.808309316635132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066717, + "balance_loss_mlp": 1.03941762, + "epoch": 0.9803770681031165, + "flos": 514825099776.0, + "grad_norm": 0.07168178318211277, + "language_loss": 0.79702234, + "learning_rate": 1.0094579362933677e-06, + "loss": 0.80768943, + "num_input_tokens_seen": 422141136, + "router_z_loss_mlp": 0.2734375, + "step": 5096, + "time_per_iteration": 2.6654510498046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064751, + "balance_loss_mlp": 1.03690398, + "epoch": 0.9805694497883801, + "flos": 566706889728.0, + "grad_norm": 0.0522945877997543, + "language_loss": 0.78104866, + "learning_rate": 9.897681702160654e-07, + "loss": 0.79169619, + "num_input_tokens_seen": 422216400, + "router_z_loss_mlp": 0.27880859, + "step": 5097, + "time_per_iteration": 2.7318952083587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106322, + "balance_loss_mlp": 1.03561127, + "epoch": 0.9807618314736437, + "flos": 479106759168.0, + "grad_norm": 0.05974567880983708, + "language_loss": 0.73509181, + "learning_rate": 9.702721370922208e-07, + "loss": 0.74572396, + "num_input_tokens_seen": 422287664, + "router_z_loss_mlp": 0.27636719, + "step": 5098, + "time_per_iteration": 2.634428024291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066119, + "balance_loss_mlp": 1.03765178, + "epoch": 0.9809542131589073, + "flos": 545021844480.0, + "grad_norm": 0.0637255746549638, + "language_loss": 0.80092281, + "learning_rate": 9.509698444908344e-07, + "loss": 0.811584, + "num_input_tokens_seen": 422357552, + "router_z_loss_mlp": 0.28466797, + "step": 5099, + "time_per_iteration": 2.6950488090515137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106464, + "balance_loss_mlp": 1.0366019, + "epoch": 0.9811465948441709, + "flos": 520589776896.0, + "grad_norm": 0.07183511235342367, + "language_loss": 0.79666537, + "learning_rate": 9.318612999057452e-07, + "loss": 0.80731177, + "num_input_tokens_seen": 422425872, + "router_z_loss_mlp": 0.28076172, + "step": 5100, + "time_per_iteration": 2.612643003463745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062328, + "balance_loss_mlp": 1.03431392, + "epoch": 0.9813389765294344, + "flos": 541023556608.0, + "grad_norm": 0.05365704847096246, + "language_loss": 0.79934072, + "learning_rate": 9.129465107554635e-07, + "loss": 0.80996406, + "num_input_tokens_seen": 422495760, + "router_z_loss_mlp": 0.28051758, + "step": 5101, + "time_per_iteration": 2.675701856613159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063332, + "balance_loss_mlp": 1.03608108, + "epoch": 0.981531358214698, + "flos": 567080828928.0, + "grad_norm": 0.053968731352124745, + "language_loss": 0.84537339, + "learning_rate": 8.942254843834485e-07, + "loss": 0.85600674, + "num_input_tokens_seen": 422568112, + "router_z_loss_mlp": 0.27294922, + "step": 5102, + "time_per_iteration": 2.696805000305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068977, + "balance_loss_mlp": 1.04117751, + "epoch": 0.9817237398999615, + "flos": 576987798528.0, + "grad_norm": 0.04658045278323515, + "language_loss": 0.81048197, + "learning_rate": 8.756982280578307e-07, + "loss": 0.82117176, + "num_input_tokens_seen": 422641280, + "router_z_loss_mlp": 0.27832031, + "step": 5103, + "time_per_iteration": 2.717839241027832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063782, + "balance_loss_mlp": 1.03536224, + "epoch": 0.9819161215852251, + "flos": 701172011520.0, + "grad_norm": 0.05020668582678838, + "language_loss": 0.81720734, + "learning_rate": 8.573647489714676e-07, + "loss": 0.82784516, + "num_input_tokens_seen": 422720416, + "router_z_loss_mlp": 0.28417969, + "step": 5104, + "time_per_iteration": 2.9835586547851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068728, + "balance_loss_mlp": 1.04111898, + "epoch": 0.9821085032704886, + "flos": 623873138688.0, + "grad_norm": 0.056311692471421905, + "language_loss": 0.84119457, + "learning_rate": 8.392250542421653e-07, + "loss": 0.85188186, + "num_input_tokens_seen": 422800384, + "router_z_loss_mlp": 0.27636719, + "step": 5105, + "time_per_iteration": 2.865739345550537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066478, + "balance_loss_mlp": 1.03872645, + "epoch": 0.9823008849557522, + "flos": 499259731968.0, + "grad_norm": 0.0633138190007986, + "language_loss": 0.81195086, + "learning_rate": 8.212791509122353e-07, + "loss": 0.82261562, + "num_input_tokens_seen": 422870768, + "router_z_loss_mlp": 0.27807617, + "step": 5106, + "time_per_iteration": 2.659518241882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066087, + "balance_loss_mlp": 1.03788257, + "epoch": 0.9824932666410158, + "flos": 523544822784.0, + "grad_norm": 0.07527269875681585, + "language_loss": 0.72561419, + "learning_rate": 8.035270459489929e-07, + "loss": 0.73627502, + "num_input_tokens_seen": 422942864, + "router_z_loss_mlp": 0.28173828, + "step": 5107, + "time_per_iteration": 2.6718273162841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064766, + "balance_loss_mlp": 1.03632259, + "epoch": 0.9826856483262794, + "flos": 502411216896.0, + "grad_norm": 0.05421046095674237, + "language_loss": 0.8271212, + "learning_rate": 7.859687462443698e-07, + "loss": 0.83776885, + "num_input_tokens_seen": 423013600, + "router_z_loss_mlp": 0.28442383, + "step": 5108, + "time_per_iteration": 2.67730712890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068766, + "balance_loss_mlp": 1.04068017, + "epoch": 0.982878030011543, + "flos": 561768666624.0, + "grad_norm": 0.05263815336663701, + "language_loss": 0.84345829, + "learning_rate": 7.686042586151354e-07, + "loss": 0.854146, + "num_input_tokens_seen": 423093680, + "router_z_loss_mlp": 0.28125, + "step": 5109, + "time_per_iteration": 2.8377928733825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064493, + "balance_loss_mlp": 1.03690767, + "epoch": 0.9830704116968064, + "flos": 536824447488.0, + "grad_norm": 0.05247784776124124, + "language_loss": 0.827075, + "learning_rate": 7.514335898027857e-07, + "loss": 0.83771992, + "num_input_tokens_seen": 423168608, + "router_z_loss_mlp": 0.27612305, + "step": 5110, + "time_per_iteration": 2.7975401878356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065466, + "balance_loss_mlp": 1.03649783, + "epoch": 0.98326279338207, + "flos": 458712267264.0, + "grad_norm": 0.09015714109116883, + "language_loss": 0.83963883, + "learning_rate": 7.344567464735441e-07, + "loss": 0.85029346, + "num_input_tokens_seen": 423233552, + "router_z_loss_mlp": 0.28955078, + "step": 5111, + "time_per_iteration": 2.629821538925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064126, + "balance_loss_mlp": 1.03639817, + "epoch": 0.9834551750673336, + "flos": 640672395264.0, + "grad_norm": 0.0626478349182541, + "language_loss": 0.79414022, + "learning_rate": 7.17673735218416e-07, + "loss": 0.8047815, + "num_input_tokens_seen": 423307440, + "router_z_loss_mlp": 0.27758789, + "step": 5112, + "time_per_iteration": 2.8147006034851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066315, + "balance_loss_mlp": 1.03918338, + "epoch": 0.9836475567525972, + "flos": 1071373478400.0, + "grad_norm": 0.05679930986703107, + "language_loss": 0.79416007, + "learning_rate": 7.010845625530782e-07, + "loss": 0.80482322, + "num_input_tokens_seen": 423394880, + "router_z_loss_mlp": 0.27172852, + "step": 5113, + "time_per_iteration": 3.4686007499694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066539, + "balance_loss_mlp": 1.03871512, + "epoch": 0.9838399384378607, + "flos": 564943472640.0, + "grad_norm": 0.0695922230285022, + "language_loss": 0.76262808, + "learning_rate": 6.846892349181566e-07, + "loss": 0.7732935, + "num_input_tokens_seen": 423461792, + "router_z_loss_mlp": 0.27832031, + "step": 5114, + "time_per_iteration": 2.670605421066284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067777, + "balance_loss_mlp": 1.03995383, + "epoch": 0.9840323201231242, + "flos": 772463278080.0, + "grad_norm": 0.060418718595467394, + "language_loss": 0.79443765, + "learning_rate": 6.684877586787819e-07, + "loss": 0.80511546, + "num_input_tokens_seen": 423539952, + "router_z_loss_mlp": 0.27880859, + "step": 5115, + "time_per_iteration": 2.9948134422302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065258, + "balance_loss_mlp": 1.03714883, + "epoch": 0.9842247018083878, + "flos": 472016623104.0, + "grad_norm": 0.06012665719169644, + "language_loss": 0.85382408, + "learning_rate": 6.524801401249225e-07, + "loss": 0.86447668, + "num_input_tokens_seen": 423607184, + "router_z_loss_mlp": 0.28125, + "step": 5116, + "time_per_iteration": 2.572911262512207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068184, + "balance_loss_mlp": 1.03983617, + "epoch": 0.9844170834936514, + "flos": 524996909568.0, + "grad_norm": 0.05269181316920123, + "language_loss": 0.8446027, + "learning_rate": 6.366663854713295e-07, + "loss": 0.85528451, + "num_input_tokens_seen": 423676528, + "router_z_loss_mlp": 0.28369141, + "step": 5117, + "time_per_iteration": 2.621640682220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007971, + "balance_loss_mlp": 0.99681312, + "epoch": 0.984609465178915, + "flos": 1566406245888.0, + "grad_norm": 0.00448658618358924, + "language_loss": 0.77162516, + "learning_rate": 6.210465008574251e-07, + "loss": 0.7817049, + "num_input_tokens_seen": 423905856, + "router_z_loss_mlp": 0.11181641, + "step": 5118, + "time_per_iteration": 4.9339916706085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068461, + "balance_loss_mlp": 1.04085207, + "epoch": 0.9848018468641785, + "flos": 519294841344.0, + "grad_norm": 0.07090239298528411, + "language_loss": 0.81994283, + "learning_rate": 6.056204923473584e-07, + "loss": 0.83062744, + "num_input_tokens_seen": 423972496, + "router_z_loss_mlp": 0.27661133, + "step": 5119, + "time_per_iteration": 2.609553337097168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066341, + "balance_loss_mlp": 1.0383265, + "epoch": 0.9849942285494421, + "flos": 492760323072.0, + "grad_norm": 0.06400427825607695, + "language_loss": 0.83007431, + "learning_rate": 5.903883659301167e-07, + "loss": 0.8407377, + "num_input_tokens_seen": 424039968, + "router_z_loss_mlp": 0.28027344, + "step": 5120, + "time_per_iteration": 2.5743188858032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062999, + "balance_loss_mlp": 1.03477073, + "epoch": 0.9851866102347057, + "flos": 545740609536.0, + "grad_norm": 0.07031157312266538, + "language_loss": 0.80597335, + "learning_rate": 5.753501275193029e-07, + "loss": 0.81660336, + "num_input_tokens_seen": 424108096, + "router_z_loss_mlp": 0.2824707, + "step": 5121, + "time_per_iteration": 2.6467745304107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064653, + "balance_loss_mlp": 1.03656745, + "epoch": 0.9853789919199692, + "flos": 476019293184.0, + "grad_norm": 0.08630519258977648, + "language_loss": 0.80286318, + "learning_rate": 5.605057829531912e-07, + "loss": 0.8135097, + "num_input_tokens_seen": 424172256, + "router_z_loss_mlp": 0.28100586, + "step": 5122, + "time_per_iteration": 2.5414161682128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064091, + "balance_loss_mlp": 1.03586268, + "epoch": 0.9855713736052328, + "flos": 1032199524864.0, + "grad_norm": 0.07104979661639406, + "language_loss": 0.75887775, + "learning_rate": 5.458553379950049e-07, + "loss": 0.76951861, + "num_input_tokens_seen": 424261088, + "router_z_loss_mlp": 0.2824707, + "step": 5123, + "time_per_iteration": 3.3723208904266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068718, + "balance_loss_mlp": 1.04125214, + "epoch": 0.9857637552904963, + "flos": 494794372608.0, + "grad_norm": 0.05613599300487702, + "language_loss": 0.82546532, + "learning_rate": 5.31398798332472e-07, + "loss": 0.83615249, + "num_input_tokens_seen": 424329168, + "router_z_loss_mlp": 0.27490234, + "step": 5124, + "time_per_iteration": 2.6383025646209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067982, + "balance_loss_mlp": 1.03984904, + "epoch": 0.9859561369757599, + "flos": 591990142464.0, + "grad_norm": 0.06759103008670121, + "language_loss": 0.83886242, + "learning_rate": 5.17136169578103e-07, + "loss": 0.84954226, + "num_input_tokens_seen": 424399392, + "router_z_loss_mlp": 0.28149414, + "step": 5125, + "time_per_iteration": 2.72212553024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064404, + "balance_loss_mlp": 1.0372963, + "epoch": 0.9861485186610235, + "flos": 486719221248.0, + "grad_norm": 0.06666021262428576, + "language_loss": 0.78677505, + "learning_rate": 5.030674572691907e-07, + "loss": 0.79741907, + "num_input_tokens_seen": 424470080, + "router_z_loss_mlp": 0.27148438, + "step": 5126, + "time_per_iteration": 2.6846718788146973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066171, + "balance_loss_mlp": 1.03858638, + "epoch": 0.9863409003462871, + "flos": 518536788480.0, + "grad_norm": 0.058638408725860694, + "language_loss": 0.82465839, + "learning_rate": 4.891926668676994e-07, + "loss": 0.83532012, + "num_input_tokens_seen": 424541824, + "router_z_loss_mlp": 0.27661133, + "step": 5127, + "time_per_iteration": 2.7298150062561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008067, + "balance_loss_mlp": 0.99681342, + "epoch": 0.9865332820315506, + "flos": 1485212391936.0, + "grad_norm": 0.004487140552061121, + "language_loss": 0.79182732, + "learning_rate": 4.755118037602646e-07, + "loss": 0.80190802, + "num_input_tokens_seen": 424773408, + "router_z_loss_mlp": 0.11230469, + "step": 5128, + "time_per_iteration": 4.89987587928772 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066434, + "balance_loss_mlp": 1.03872991, + "epoch": 0.9867256637168141, + "flos": 581837271552.0, + "grad_norm": 0.05732406131486392, + "language_loss": 0.78990746, + "learning_rate": 4.620248732582488e-07, + "loss": 0.8005718, + "num_input_tokens_seen": 424840608, + "router_z_loss_mlp": 0.27734375, + "step": 5129, + "time_per_iteration": 2.705324649810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066403, + "balance_loss_mlp": 1.03869843, + "epoch": 0.9869180454020777, + "flos": 958898939904.0, + "grad_norm": 0.05618301860540118, + "language_loss": 0.86019075, + "learning_rate": 4.487318805977969e-07, + "loss": 0.87085474, + "num_input_tokens_seen": 424926128, + "router_z_loss_mlp": 0.27758789, + "step": 5130, + "time_per_iteration": 3.2497148513793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068506, + "balance_loss_mlp": 1.04037285, + "epoch": 0.9871104270873413, + "flos": 770385558528.0, + "grad_norm": 0.06199778445898079, + "language_loss": 0.82707268, + "learning_rate": 4.3563283093966954e-07, + "loss": 0.83775777, + "num_input_tokens_seen": 425005744, + "router_z_loss_mlp": 0.28173828, + "step": 5131, + "time_per_iteration": 2.9684877395629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061826, + "balance_loss_mlp": 1.03326333, + "epoch": 0.9873028087726049, + "flos": 446215426560.0, + "grad_norm": 0.08982074856332944, + "language_loss": 0.77832627, + "learning_rate": 4.2272772936940986e-07, + "loss": 0.78894454, + "num_input_tokens_seen": 425068112, + "router_z_loss_mlp": 0.28564453, + "step": 5132, + "time_per_iteration": 2.482541084289551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064048, + "balance_loss_mlp": 1.03629649, + "epoch": 0.9874951904578684, + "flos": 507359614464.0, + "grad_norm": 0.06533508581456446, + "language_loss": 0.86547804, + "learning_rate": 4.1001658089717676e-07, + "loss": 0.87611854, + "num_input_tokens_seen": 425137408, + "router_z_loss_mlp": 0.27758789, + "step": 5133, + "time_per_iteration": 2.606316089630127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106598, + "balance_loss_mlp": 1.03815663, + "epoch": 0.987687572143132, + "flos": 716420256768.0, + "grad_norm": 0.05462139219845756, + "language_loss": 0.82088351, + "learning_rate": 3.9749939045791164e-07, + "loss": 0.83154333, + "num_input_tokens_seen": 425213504, + "router_z_loss_mlp": 0.27832031, + "step": 5134, + "time_per_iteration": 2.9544055461883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007967, + "balance_loss_mlp": 0.99680901, + "epoch": 0.9878799538283956, + "flos": 1537823121408.0, + "grad_norm": 0.004485925131120654, + "language_loss": 0.79817951, + "learning_rate": 3.851761629111716e-07, + "loss": 0.80825919, + "num_input_tokens_seen": 425451296, + "router_z_loss_mlp": 0.11181641, + "step": 5135, + "time_per_iteration": 4.916072368621826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071371, + "balance_loss_mlp": 1.04330945, + "epoch": 0.9880723355136591, + "flos": 721098021888.0, + "grad_norm": 0.05079199609455142, + "language_loss": 0.81718385, + "learning_rate": 3.730469030412964e-07, + "loss": 0.82789761, + "num_input_tokens_seen": 425527536, + "router_z_loss_mlp": 0.28076172, + "step": 5136, + "time_per_iteration": 2.918941020965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064402, + "balance_loss_mlp": 1.03743672, + "epoch": 0.9882647171989226, + "flos": 557085109248.0, + "grad_norm": 0.04769574235406856, + "language_loss": 0.84221953, + "learning_rate": 3.611116155572969e-07, + "loss": 0.85286361, + "num_input_tokens_seen": 425596608, + "router_z_loss_mlp": 0.27001953, + "step": 5137, + "time_per_iteration": 2.659917116165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069145, + "balance_loss_mlp": 1.04108286, + "epoch": 0.9884570988841862, + "flos": 562541276160.0, + "grad_norm": 0.06700563698780587, + "language_loss": 0.80401492, + "learning_rate": 3.493703050927999e-07, + "loss": 0.81470633, + "num_input_tokens_seen": 425667280, + "router_z_loss_mlp": 0.28076172, + "step": 5138, + "time_per_iteration": 2.7219605445861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065757, + "balance_loss_mlp": 1.03707492, + "epoch": 0.9886494805694498, + "flos": 431537559552.0, + "grad_norm": 0.05937359119861329, + "language_loss": 0.861534, + "learning_rate": 3.378229762062146e-07, + "loss": 0.87219155, + "num_input_tokens_seen": 425730736, + "router_z_loss_mlp": 0.28662109, + "step": 5139, + "time_per_iteration": 2.475071907043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066242, + "balance_loss_mlp": 1.03777456, + "epoch": 0.9888418622547134, + "flos": 591793703424.0, + "grad_norm": 0.06601069931668228, + "language_loss": 0.90451717, + "learning_rate": 3.264696333806771e-07, + "loss": 0.91517955, + "num_input_tokens_seen": 425807616, + "router_z_loss_mlp": 0.28442383, + "step": 5140, + "time_per_iteration": 2.7885544300079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068428, + "balance_loss_mlp": 1.04008031, + "epoch": 0.989034243939977, + "flos": 1134526984704.0, + "grad_norm": 0.05211766509967625, + "language_loss": 0.79793286, + "learning_rate": 3.1531028102388394e-07, + "loss": 0.80861717, + "num_input_tokens_seen": 425900880, + "router_z_loss_mlp": 0.28369141, + "step": 5141, + "time_per_iteration": 3.5274829864501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068155, + "balance_loss_mlp": 1.03997421, + "epoch": 0.9892266256252404, + "flos": 566405733888.0, + "grad_norm": 0.06930719912471439, + "language_loss": 0.82036865, + "learning_rate": 3.0434492346825824e-07, + "loss": 0.83105016, + "num_input_tokens_seen": 425973632, + "router_z_loss_mlp": 0.28173828, + "step": 5142, + "time_per_iteration": 2.703993320465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066028, + "balance_loss_mlp": 1.03787088, + "epoch": 0.989419007310504, + "flos": 640254786048.0, + "grad_norm": 0.055317623782820756, + "language_loss": 0.83511734, + "learning_rate": 2.9357356497095033e-07, + "loss": 0.84577763, + "num_input_tokens_seen": 426057088, + "router_z_loss_mlp": 0.28198242, + "step": 5143, + "time_per_iteration": 2.893228530883789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066108, + "balance_loss_mlp": 1.03823721, + "epoch": 0.9896113889957676, + "flos": 455236305408.0, + "grad_norm": 0.08393848861396483, + "language_loss": 0.81569672, + "learning_rate": 2.829962097138372e-07, + "loss": 0.82635784, + "num_input_tokens_seen": 426124336, + "router_z_loss_mlp": 0.27929688, + "step": 5144, + "time_per_iteration": 2.6225786209106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062804, + "balance_loss_mlp": 1.03600597, + "epoch": 0.9898037706810312, + "flos": 567070654464.0, + "grad_norm": 0.06173682560666289, + "language_loss": 0.80544829, + "learning_rate": 2.726128618033008e-07, + "loss": 0.81607634, + "num_input_tokens_seen": 426191888, + "router_z_loss_mlp": 0.26843262, + "step": 5145, + "time_per_iteration": 2.654784917831421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007962, + "balance_loss_mlp": 0.99680388, + "epoch": 0.9899961523662947, + "flos": 1549476131328.0, + "grad_norm": 0.004486567151540307, + "language_loss": 0.78146422, + "learning_rate": 2.624235252706164e-07, + "loss": 0.79154384, + "num_input_tokens_seen": 426425840, + "router_z_loss_mlp": 0.11181641, + "step": 5146, + "time_per_iteration": 4.91846489906311 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069812, + "balance_loss_mlp": 1.04148769, + "epoch": 0.9901885340515583, + "flos": 610401457152.0, + "grad_norm": 0.05716126832378814, + "language_loss": 0.85056078, + "learning_rate": 2.524282040715642e-07, + "loss": 0.86125898, + "num_input_tokens_seen": 426506080, + "router_z_loss_mlp": 0.28344727, + "step": 5147, + "time_per_iteration": 2.931447982788086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065851, + "balance_loss_mlp": 1.03790796, + "epoch": 0.9903809157368219, + "flos": 517231678464.0, + "grad_norm": 0.0532065355074075, + "language_loss": 0.83003807, + "learning_rate": 2.426269020866512e-07, + "loss": 0.84069657, + "num_input_tokens_seen": 426573936, + "router_z_loss_mlp": 0.27978516, + "step": 5148, + "time_per_iteration": 2.582853317260742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067847, + "balance_loss_mlp": 1.04069078, + "epoch": 0.9905732974220854, + "flos": 1099985716224.0, + "grad_norm": 0.061881592272325446, + "language_loss": 0.8030684, + "learning_rate": 2.3301962312122226e-07, + "loss": 0.81374693, + "num_input_tokens_seen": 426657472, + "router_z_loss_mlp": 0.27197266, + "step": 5149, + "time_per_iteration": 3.4220290184020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106185, + "balance_loss_mlp": 1.03354931, + "epoch": 0.990765679107349, + "flos": 857630688768.0, + "grad_norm": 0.06728374369522626, + "language_loss": 0.84112859, + "learning_rate": 2.2360637090496073e-07, + "loss": 0.8517471, + "num_input_tokens_seen": 426740560, + "router_z_loss_mlp": 0.28271484, + "step": 5150, + "time_per_iteration": 3.148772716522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066189, + "balance_loss_mlp": 1.03838968, + "epoch": 0.9909580607926125, + "flos": 491041986048.0, + "grad_norm": 0.07292631958649022, + "language_loss": 0.79760653, + "learning_rate": 2.143871490925542e-07, + "loss": 0.80826843, + "num_input_tokens_seen": 426809296, + "router_z_loss_mlp": 0.27856445, + "step": 5151, + "time_per_iteration": 2.616525888442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062289, + "balance_loss_mlp": 1.03401303, + "epoch": 0.9911504424778761, + "flos": 584786525184.0, + "grad_norm": 0.054098688428558285, + "language_loss": 0.79339308, + "learning_rate": 2.0536196126319519e-07, + "loss": 0.80401593, + "num_input_tokens_seen": 426881056, + "router_z_loss_mlp": 0.28271484, + "step": 5152, + "time_per_iteration": 2.697601318359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063445, + "balance_loss_mlp": 1.03431082, + "epoch": 0.9913428241631397, + "flos": 569763832320.0, + "grad_norm": 0.05798999078782896, + "language_loss": 0.81267428, + "learning_rate": 1.9653081092074753e-07, + "loss": 0.82330877, + "num_input_tokens_seen": 426949664, + "router_z_loss_mlp": 0.29125977, + "step": 5153, + "time_per_iteration": 2.695401430130005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069116, + "balance_loss_mlp": 1.04124546, + "epoch": 0.9915352058484033, + "flos": 489505531392.0, + "grad_norm": 0.05056334219694486, + "language_loss": 0.86464977, + "learning_rate": 1.8789370149374652e-07, + "loss": 0.87534094, + "num_input_tokens_seen": 427018816, + "router_z_loss_mlp": 0.27880859, + "step": 5154, + "time_per_iteration": 2.605447292327881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066078, + "balance_loss_mlp": 1.03827786, + "epoch": 0.9917275875336667, + "flos": 743708445696.0, + "grad_norm": 0.05862021077509536, + "language_loss": 0.82721972, + "learning_rate": 1.7945063633545423e-07, + "loss": 0.83788049, + "num_input_tokens_seen": 427097984, + "router_z_loss_mlp": 0.27832031, + "step": 5155, + "time_per_iteration": 2.9818990230560303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063222, + "balance_loss_mlp": 1.03580356, + "epoch": 0.9919199692189303, + "flos": 508009978368.0, + "grad_norm": 0.06232653880915019, + "language_loss": 0.80101055, + "learning_rate": 1.7120161872380412e-07, + "loss": 0.81164277, + "num_input_tokens_seen": 427169280, + "router_z_loss_mlp": 0.27441406, + "step": 5156, + "time_per_iteration": 2.7161529064178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065636, + "balance_loss_mlp": 1.03790784, + "epoch": 0.9921123509041939, + "flos": 543702177792.0, + "grad_norm": 0.05760646667889777, + "language_loss": 0.83967817, + "learning_rate": 1.6314665186123457e-07, + "loss": 0.85033458, + "num_input_tokens_seen": 427237312, + "router_z_loss_mlp": 0.27758789, + "step": 5157, + "time_per_iteration": 2.682803153991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067348, + "balance_loss_mlp": 1.03947723, + "epoch": 0.9923047325894575, + "flos": 671263428096.0, + "grad_norm": 0.06367529404568006, + "language_loss": 0.77228302, + "learning_rate": 1.5528573887507724e-07, + "loss": 0.78295648, + "num_input_tokens_seen": 427305008, + "router_z_loss_mlp": 0.27905273, + "step": 5158, + "time_per_iteration": 4.246589660644531 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065306, + "balance_loss_mlp": 1.03667164, + "epoch": 0.9924971142747211, + "flos": 466291233792.0, + "grad_norm": 0.05221936128324597, + "language_loss": 0.80749053, + "learning_rate": 1.4761888281711322e-07, + "loss": 0.81814361, + "num_input_tokens_seen": 427377008, + "router_z_loss_mlp": 0.28637695, + "step": 5159, + "time_per_iteration": 2.6864054203033447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067543, + "balance_loss_mlp": 1.03955257, + "epoch": 0.9926894959599846, + "flos": 491337349632.0, + "grad_norm": 0.05537471302604842, + "language_loss": 0.82565844, + "learning_rate": 1.4014608666390594e-07, + "loss": 0.83633387, + "num_input_tokens_seen": 427444528, + "router_z_loss_mlp": 0.2800293, + "step": 5160, + "time_per_iteration": 2.582225799560547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067776, + "balance_loss_mlp": 1.04052472, + "epoch": 0.9928818776452482, + "flos": 492144864768.0, + "grad_norm": 0.060411904501977205, + "language_loss": 0.81547213, + "learning_rate": 1.328673533166902e-07, + "loss": 0.82614988, + "num_input_tokens_seen": 427509808, + "router_z_loss_mlp": 0.27294922, + "step": 5161, + "time_per_iteration": 2.583430290222168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066178, + "balance_loss_mlp": 1.03759193, + "epoch": 0.9930742593305117, + "flos": 546081053184.0, + "grad_norm": 0.05578659951307412, + "language_loss": 0.84225255, + "learning_rate": 1.2578268560131666e-07, + "loss": 0.85291433, + "num_input_tokens_seen": 427587936, + "router_z_loss_mlp": 0.28588867, + "step": 5162, + "time_per_iteration": 2.784080743789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065962, + "balance_loss_mlp": 1.0385437, + "epoch": 0.9932666410157753, + "flos": 585234657792.0, + "grad_norm": 0.06485983604948299, + "language_loss": 0.85919869, + "learning_rate": 1.1889208626825188e-07, + "loss": 0.86985826, + "num_input_tokens_seen": 427662224, + "router_z_loss_mlp": 0.27441406, + "step": 5163, + "time_per_iteration": 2.8364484310150146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068157, + "balance_loss_mlp": 1.04088211, + "epoch": 0.9934590227010388, + "flos": 536833211904.0, + "grad_norm": 0.05493620654926138, + "language_loss": 0.83572662, + "learning_rate": 1.1219555799268921e-07, + "loss": 0.84640813, + "num_input_tokens_seen": 427730544, + "router_z_loss_mlp": 0.27319336, + "step": 5164, + "time_per_iteration": 2.6437575817108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067967, + "balance_loss_mlp": 1.03973818, + "epoch": 0.9936514043863024, + "flos": 517754004480.0, + "grad_norm": 0.056425868204336455, + "language_loss": 0.86519146, + "learning_rate": 1.0569310337443794e-07, + "loss": 0.87587112, + "num_input_tokens_seen": 427799760, + "router_z_loss_mlp": 0.2824707, + "step": 5165, + "time_per_iteration": 2.6676025390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065235, + "balance_loss_mlp": 1.0375309, + "epoch": 0.993843786071566, + "flos": 744284616192.0, + "grad_norm": 0.0490013156900056, + "language_loss": 0.80073357, + "learning_rate": 9.938472493803419e-08, + "loss": 0.81138593, + "num_input_tokens_seen": 427881936, + "router_z_loss_mlp": 0.27734375, + "step": 5166, + "time_per_iteration": 3.0390608310699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106152, + "balance_loss_mlp": 1.0341022, + "epoch": 0.9940361677568296, + "flos": 525647273472.0, + "grad_norm": 0.06372482687070874, + "language_loss": 0.81947267, + "learning_rate": 9.327042513251893e-08, + "loss": 0.83008784, + "num_input_tokens_seen": 427951648, + "router_z_loss_mlp": 0.27441406, + "step": 5167, + "time_per_iteration": 2.7436130046844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106557, + "balance_loss_mlp": 1.03769922, + "epoch": 0.9942285494420932, + "flos": 555376946688.0, + "grad_norm": 0.058129004246180074, + "language_loss": 0.79914057, + "learning_rate": 8.735020633177104e-08, + "loss": 0.80979621, + "num_input_tokens_seen": 428031184, + "router_z_loss_mlp": 0.27880859, + "step": 5168, + "time_per_iteration": 2.7534189224243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061589, + "balance_loss_mlp": 1.03426659, + "epoch": 0.9944209311273566, + "flos": 585722078208.0, + "grad_norm": 0.055875518940578246, + "language_loss": 0.82051367, + "learning_rate": 8.162407083411872e-08, + "loss": 0.83112955, + "num_input_tokens_seen": 428107296, + "router_z_loss_mlp": 0.2734375, + "step": 5169, + "time_per_iteration": 2.6998350620269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066913, + "balance_loss_mlp": 1.03873193, + "epoch": 0.9946133128126202, + "flos": 735185161728.0, + "grad_norm": 0.0553680742338806, + "language_loss": 0.81735945, + "learning_rate": 7.609202086272804e-08, + "loss": 0.82802856, + "num_input_tokens_seen": 428187904, + "router_z_loss_mlp": 0.28173828, + "step": 5170, + "time_per_iteration": 2.974087953567505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067505, + "balance_loss_mlp": 1.04006255, + "epoch": 0.9948056944978838, + "flos": 645728481792.0, + "grad_norm": 0.05803029199457052, + "language_loss": 0.82077813, + "learning_rate": 7.075405856526995e-08, + "loss": 0.8314532, + "num_input_tokens_seen": 428255856, + "router_z_loss_mlp": 0.2746582, + "step": 5171, + "time_per_iteration": 2.802490711212158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063573, + "balance_loss_mlp": 1.03567767, + "epoch": 0.9949980761831474, + "flos": 445610142720.0, + "grad_norm": 0.051666066315458954, + "language_loss": 0.8596555, + "learning_rate": 6.561018601414226e-08, + "loss": 0.87029123, + "num_input_tokens_seen": 428321872, + "router_z_loss_mlp": 0.27929688, + "step": 5172, + "time_per_iteration": 2.5076162815093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065596, + "balance_loss_mlp": 1.03736687, + "epoch": 0.995190457868411, + "flos": 435407809536.0, + "grad_norm": 0.05785345995526832, + "language_loss": 0.85552263, + "learning_rate": 6.066040520641414e-08, + "loss": 0.86617857, + "num_input_tokens_seen": 428389232, + "router_z_loss_mlp": 0.28198242, + "step": 5173, + "time_per_iteration": 2.560850143432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065028, + "balance_loss_mlp": 1.03730011, + "epoch": 0.9953828395536745, + "flos": 513937598976.0, + "grad_norm": 0.06879105592605711, + "language_loss": 0.81177318, + "learning_rate": 5.590471806377062e-08, + "loss": 0.82242346, + "num_input_tokens_seen": 428456128, + "router_z_loss_mlp": 0.27783203, + "step": 5174, + "time_per_iteration": 2.5707099437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068403, + "balance_loss_mlp": 1.04010248, + "epoch": 0.995575221238938, + "flos": 479608736256.0, + "grad_norm": 0.05793677097308627, + "language_loss": 0.81645823, + "learning_rate": 5.134312643245709e-08, + "loss": 0.82714224, + "num_input_tokens_seen": 428523504, + "router_z_loss_mlp": 0.28295898, + "step": 5175, + "time_per_iteration": 2.5351579189300537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066274, + "balance_loss_mlp": 1.03816414, + "epoch": 0.9957676029242016, + "flos": 587500051968.0, + "grad_norm": 0.06414761636339157, + "language_loss": 0.76212519, + "learning_rate": 4.6975632083445793e-08, + "loss": 0.77278793, + "num_input_tokens_seen": 428596880, + "router_z_loss_mlp": 0.28125, + "step": 5176, + "time_per_iteration": 2.732705593109131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067516, + "balance_loss_mlp": 1.03833389, + "epoch": 0.9959599846094652, + "flos": 426244336128.0, + "grad_norm": 0.06423852980167365, + "language_loss": 0.80285561, + "learning_rate": 4.280223671243588e-08, + "loss": 0.8135308, + "num_input_tokens_seen": 428659472, + "router_z_loss_mlp": 0.29150391, + "step": 5177, + "time_per_iteration": 2.4773876667022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066877, + "balance_loss_mlp": 1.03843391, + "epoch": 0.9961523662947287, + "flos": 611312279040.0, + "grad_norm": 0.06073091502053508, + "language_loss": 0.80718446, + "learning_rate": 3.8822941939575804e-08, + "loss": 0.81785315, + "num_input_tokens_seen": 428736704, + "router_z_loss_mlp": 0.28442383, + "step": 5178, + "time_per_iteration": 2.8103115558624268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106861, + "balance_loss_mlp": 1.04052424, + "epoch": 0.9963447479799923, + "flos": 550521681408.0, + "grad_norm": 0.06916799430968669, + "language_loss": 0.73766887, + "learning_rate": 3.5037749309851927e-08, + "loss": 0.74835497, + "num_input_tokens_seen": 428808560, + "router_z_loss_mlp": 0.28076172, + "step": 5179, + "time_per_iteration": 2.703822135925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068044, + "balance_loss_mlp": 1.03933787, + "epoch": 0.9965371296652559, + "flos": 625590065664.0, + "grad_norm": 0.06211677936352167, + "language_loss": 0.8883329, + "learning_rate": 3.1446660292755446e-08, + "loss": 0.89901328, + "num_input_tokens_seen": 428880688, + "router_z_loss_mlp": 0.28710938, + "step": 5180, + "time_per_iteration": 2.718750238418579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067675, + "balance_loss_mlp": 1.03987575, + "epoch": 0.9967295113505195, + "flos": 639205751808.0, + "grad_norm": 0.0597666169713144, + "language_loss": 0.81612909, + "learning_rate": 2.8049676282504433e-08, + "loss": 0.82680583, + "num_input_tokens_seen": 428960096, + "router_z_loss_mlp": 0.27807617, + "step": 5181, + "time_per_iteration": 2.8592679500579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067652, + "balance_loss_mlp": 1.03949463, + "epoch": 0.996921893035783, + "flos": 607101585408.0, + "grad_norm": 0.06266480255691725, + "language_loss": 0.76737624, + "learning_rate": 2.484679859793282e-08, + "loss": 0.77805281, + "num_input_tokens_seen": 429031296, + "router_z_loss_mlp": 0.28149414, + "step": 5182, + "time_per_iteration": 2.7209362983703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066053, + "balance_loss_mlp": 1.03768075, + "epoch": 0.9971142747210465, + "flos": 643867550208.0, + "grad_norm": 0.06216190234164215, + "language_loss": 0.81831425, + "learning_rate": 2.183802848243488e-08, + "loss": 0.82897472, + "num_input_tokens_seen": 429103312, + "router_z_loss_mlp": 0.28393555, + "step": 5183, + "time_per_iteration": 2.774146795272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064034, + "balance_loss_mlp": 1.03599548, + "epoch": 0.9973066564063101, + "flos": 1040353251840.0, + "grad_norm": 0.06556543117014918, + "language_loss": 0.81132638, + "learning_rate": 1.9023367104187285e-08, + "loss": 0.82196677, + "num_input_tokens_seen": 429194896, + "router_z_loss_mlp": 0.28076172, + "step": 5184, + "time_per_iteration": 3.3546173572540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068636, + "balance_loss_mlp": 1.04109859, + "epoch": 0.9974990380915737, + "flos": 664784368128.0, + "grad_norm": 0.05954721572785247, + "language_loss": 0.82821018, + "learning_rate": 1.640281555587153e-08, + "loss": 0.83889651, + "num_input_tokens_seen": 429267664, + "router_z_loss_mlp": 0.27539062, + "step": 5185, + "time_per_iteration": 2.8433001041412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106356, + "balance_loss_mlp": 1.03559387, + "epoch": 0.9976914197768373, + "flos": 717808324608.0, + "grad_norm": 0.06389741277259232, + "language_loss": 0.77631515, + "learning_rate": 1.3976374855007024e-08, + "loss": 0.78695071, + "num_input_tokens_seen": 429343472, + "router_z_loss_mlp": 0.2800293, + "step": 5186, + "time_per_iteration": 2.853642225265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106937, + "balance_loss_mlp": 1.04040217, + "epoch": 0.9978838014621008, + "flos": 518078481408.0, + "grad_norm": 0.06367119653517782, + "language_loss": 0.78993869, + "learning_rate": 1.1744045943451464e-08, + "loss": 0.80063242, + "num_input_tokens_seen": 429411472, + "router_z_loss_mlp": 0.28979492, + "step": 5187, + "time_per_iteration": 2.597912073135376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063481, + "balance_loss_mlp": 1.03594351, + "epoch": 0.9980761831473643, + "flos": 603138203136.0, + "grad_norm": 0.05419942237508798, + "language_loss": 0.84304327, + "learning_rate": 9.70582968801148e-09, + "loss": 0.85367805, + "num_input_tokens_seen": 429486704, + "router_z_loss_mlp": 0.27587891, + "step": 5188, + "time_per_iteration": 2.840768337249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065682, + "balance_loss_mlp": 1.03759623, + "epoch": 0.9982685648326279, + "flos": 453291005952.0, + "grad_norm": 0.06002889752243125, + "language_loss": 0.89368796, + "learning_rate": 7.861726879943021e-09, + "loss": 0.9043448, + "num_input_tokens_seen": 429554736, + "router_z_loss_mlp": 0.28125, + "step": 5189, + "time_per_iteration": 2.552727222442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062986, + "balance_loss_mlp": 1.03518605, + "epoch": 0.9984609465178915, + "flos": 481165539840.0, + "grad_norm": 0.06316155939206874, + "language_loss": 0.7862134, + "learning_rate": 6.211738235173403e-09, + "loss": 0.79684329, + "num_input_tokens_seen": 429623216, + "router_z_loss_mlp": 0.2779541, + "step": 5190, + "time_per_iteration": 2.6379244327545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066899, + "balance_loss_mlp": 1.03917098, + "epoch": 0.9986533282031551, + "flos": 476675449344.0, + "grad_norm": 0.05665801971866078, + "language_loss": 0.83808017, + "learning_rate": 4.755864394301312e-09, + "loss": 0.84874916, + "num_input_tokens_seen": 429695808, + "router_z_loss_mlp": 0.27734375, + "step": 5191, + "time_per_iteration": 2.6630475521087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069637, + "balance_loss_mlp": 1.04109824, + "epoch": 0.9988457098884186, + "flos": 641647236096.0, + "grad_norm": 0.060865154735589906, + "language_loss": 0.86545348, + "learning_rate": 3.494105922541291e-09, + "loss": 0.87614989, + "num_input_tokens_seen": 429774464, + "router_z_loss_mlp": 0.28540039, + "step": 5192, + "time_per_iteration": 2.8011183738708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064989, + "balance_loss_mlp": 1.03661728, + "epoch": 0.9990380915736822, + "flos": 396105818112.0, + "grad_norm": 0.06899373764772466, + "language_loss": 0.88023686, + "learning_rate": 2.4264633097237365e-09, + "loss": 0.89088672, + "num_input_tokens_seen": 429835872, + "router_z_loss_mlp": 0.28417969, + "step": 5193, + "time_per_iteration": 2.444704532623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063986, + "balance_loss_mlp": 1.0359, + "epoch": 0.9992304732589458, + "flos": 575831075328.0, + "grad_norm": 0.05722056537854718, + "language_loss": 0.84702891, + "learning_rate": 1.552936970405927e-09, + "loss": 0.85766876, + "num_input_tokens_seen": 429911440, + "router_z_loss_mlp": 0.28100586, + "step": 5194, + "time_per_iteration": 2.7448079586029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064391, + "balance_loss_mlp": 1.03680563, + "epoch": 0.9994228549442093, + "flos": 544017890304.0, + "grad_norm": 0.06097238370641317, + "language_loss": 0.75467938, + "learning_rate": 8.735272437054853e-10, + "loss": 0.76532328, + "num_input_tokens_seen": 429982512, + "router_z_loss_mlp": 0.27636719, + "step": 5195, + "time_per_iteration": 2.657932996749878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065274, + "balance_loss_mlp": 1.03723598, + "epoch": 0.9996152366294728, + "flos": 1470777910272.0, + "grad_norm": 0.07343124053049398, + "language_loss": 0.80373323, + "learning_rate": 3.882343933003796e-10, + "loss": 0.81438601, + "num_input_tokens_seen": 430070944, + "router_z_loss_mlp": 0.28051758, + "step": 5196, + "time_per_iteration": 3.7275376319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052349, + "balance_loss_mlp": 1.02769601, + "epoch": 0.9998076183147364, + "flos": 618667255296.0, + "grad_norm": 0.1051352637453268, + "language_loss": 0.69885886, + "learning_rate": 9.70586077619906e-11, + "loss": 0.70938236, + "num_input_tokens_seen": 430164864, + "router_z_loss_mlp": 0.24664307, + "step": 5197, + "time_per_iteration": 4.0517966747283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028654, + "balance_loss_mlp": 1.01130903, + "epoch": 1.0, + "flos": 1289959492608.0, + "grad_norm": 0.02771641462610759, + "language_loss": 0.84174764, + "learning_rate": 0.0, + "loss": 0.85203409, + "num_input_tokens_seen": 430340944, + "router_z_loss_mlp": 0.17364502, + "step": 5198, + "time_per_iteration": 5.574992895126343 + }, + { + "epoch": 1.0, + "num_input_tokens_seen": 430340944, + "step": 5198, + "total_flos": 1.1713320035811328e+16, + "train_loss": 0.8582443886419681, + "train_runtime": 15504.5984, + "train_samples_per_second": 42.91, + "train_steps_per_second": 0.335 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 430340944, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1713320035811328e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe_share/training_args.bin b/sft_pretrain/Full_smoe_share/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2c6286920da78be894d16b2c1ec77f899cd590e0 --- /dev/null +++ b/sft_pretrain/Full_smoe_share/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b25bd416aaf59aaeb5c9268446dadaf85f4d00dfc3ac3dfec454141b47f814d1 +size 7992